teradataml 20.0.0.5__py3-none-any.whl → 20.0.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of teradataml might be problematic.

Files changed (119)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/README.md +306 -0
  3. teradataml/__init__.py +1 -1
  4. teradataml/_version.py +1 -1
  5. teradataml/analytics/analytic_function_executor.py +162 -76
  6. teradataml/analytics/byom/__init__.py +1 -1
  7. teradataml/analytics/json_parser/__init__.py +2 -0
  8. teradataml/analytics/json_parser/analytic_functions_argument.py +95 -2
  9. teradataml/analytics/json_parser/metadata.py +22 -4
  10. teradataml/analytics/sqle/DecisionTreePredict.py +3 -2
  11. teradataml/analytics/sqle/NaiveBayesPredict.py +3 -2
  12. teradataml/analytics/sqle/__init__.py +3 -0
  13. teradataml/analytics/utils.py +59 -11
  14. teradataml/automl/__init__.py +2369 -464
  15. teradataml/automl/autodataprep/__init__.py +15 -0
  16. teradataml/automl/custom_json_utils.py +184 -112
  17. teradataml/automl/data_preparation.py +113 -58
  18. teradataml/automl/data_transformation.py +154 -53
  19. teradataml/automl/feature_engineering.py +113 -53
  20. teradataml/automl/feature_exploration.py +548 -25
  21. teradataml/automl/model_evaluation.py +260 -32
  22. teradataml/automl/model_training.py +399 -206
  23. teradataml/clients/auth_client.py +10 -6
  24. teradataml/clients/keycloak_client.py +165 -0
  25. teradataml/common/aed_utils.py +11 -2
  26. teradataml/common/bulk_exposed_utils.py +4 -2
  27. teradataml/common/constants.py +72 -2
  28. teradataml/common/exceptions.py +32 -0
  29. teradataml/common/garbagecollector.py +50 -21
  30. teradataml/common/messagecodes.py +73 -1
  31. teradataml/common/messages.py +27 -1
  32. teradataml/common/sqlbundle.py +25 -7
  33. teradataml/common/utils.py +210 -22
  34. teradataml/context/aed_context.py +16 -10
  35. teradataml/context/context.py +37 -9
  36. teradataml/data/Employee.csv +5 -0
  37. teradataml/data/Employee_Address.csv +4 -0
  38. teradataml/data/Employee_roles.csv +5 -0
  39. teradataml/data/JulesBelvezeDummyData.csv +100 -0
  40. teradataml/data/byom_example.json +5 -0
  41. teradataml/data/creditcard_data.csv +284618 -0
  42. teradataml/data/docs/byom/docs/ONNXSeq2Seq.py +255 -0
  43. teradataml/data/docs/sqle/docs_17_10/NGramSplitter.py +1 -1
  44. teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +1 -1
  45. teradataml/data/docs/sqle/docs_17_20/TextParser.py +1 -1
  46. teradataml/data/jsons/byom/ONNXSeq2Seq.json +287 -0
  47. teradataml/data/jsons/byom/onnxembeddings.json +1 -0
  48. teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +3 -7
  49. teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +3 -7
  50. teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +3 -7
  51. teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +3 -7
  52. teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +3 -7
  53. teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +3 -7
  54. teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +3 -7
  55. teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +3 -7
  56. teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +3 -7
  57. teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +3 -7
  58. teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +3 -7
  59. teradataml/data/jsons/sqle/20.00/TD_API_AzureML.json +151 -0
  60. teradataml/data/jsons/sqle/20.00/TD_API_Sagemaker.json +182 -0
  61. teradataml/data/jsons/sqle/20.00/TD_API_VertexAI.json +183 -0
  62. teradataml/data/load_example_data.py +29 -11
  63. teradataml/data/pattern_matching_data.csv +11 -0
  64. teradataml/data/payment_fraud_dataset.csv +10001 -0
  65. teradataml/data/sdk/modelops/modelops_spec.json +101737 -0
  66. teradataml/data/teradataml_example.json +75 -1
  67. teradataml/data/url_data.csv +10 -9
  68. teradataml/dataframe/copy_to.py +715 -55
  69. teradataml/dataframe/dataframe.py +2115 -97
  70. teradataml/dataframe/dataframe_utils.py +66 -28
  71. teradataml/dataframe/functions.py +1130 -2
  72. teradataml/dataframe/setop.py +4 -1
  73. teradataml/dataframe/sql.py +710 -1039
  74. teradataml/dbutils/dbutils.py +470 -35
  75. teradataml/dbutils/filemgr.py +1 -1
  76. teradataml/hyperparameter_tuner/optimizer.py +456 -142
  77. teradataml/hyperparameter_tuner/utils.py +4 -2
  78. teradataml/lib/aed_0_1.dll +0 -0
  79. teradataml/lib/libaed_0_1.dylib +0 -0
  80. teradataml/lib/libaed_0_1.so +0 -0
  81. teradataml/lib/libaed_0_1_aarch64.so +0 -0
  82. teradataml/opensource/_base.py +7 -1
  83. teradataml/options/configure.py +20 -4
  84. teradataml/scriptmgmt/UserEnv.py +247 -36
  85. teradataml/scriptmgmt/lls_utils.py +140 -39
  86. teradataml/sdk/README.md +79 -0
  87. teradataml/sdk/__init__.py +4 -0
  88. teradataml/sdk/_auth_modes.py +422 -0
  89. teradataml/sdk/_func_params.py +487 -0
  90. teradataml/sdk/_json_parser.py +453 -0
  91. teradataml/sdk/_openapi_spec_constants.py +249 -0
  92. teradataml/sdk/_utils.py +236 -0
  93. teradataml/sdk/api_client.py +900 -0
  94. teradataml/sdk/constants.py +62 -0
  95. teradataml/sdk/modelops/__init__.py +98 -0
  96. teradataml/sdk/modelops/_client.py +409 -0
  97. teradataml/sdk/modelops/_constants.py +304 -0
  98. teradataml/sdk/modelops/models.py +2308 -0
  99. teradataml/sdk/spinner.py +107 -0
  100. teradataml/series/series.py +12 -7
  101. teradataml/store/feature_store/constants.py +601 -234
  102. teradataml/store/feature_store/feature_store.py +2886 -616
  103. teradataml/store/feature_store/mind_map.py +639 -0
  104. teradataml/store/feature_store/models.py +5831 -214
  105. teradataml/store/feature_store/utils.py +390 -0
  106. teradataml/table_operators/query_generator.py +4 -21
  107. teradataml/table_operators/table_operator_util.py +1 -1
  108. teradataml/table_operators/templates/dataframe_register.template +6 -2
  109. teradataml/table_operators/templates/dataframe_udf.template +6 -2
  110. teradataml/utils/docstring.py +527 -0
  111. teradataml/utils/dtypes.py +95 -1
  112. teradataml/utils/internal_buffer.py +2 -2
  113. teradataml/utils/utils.py +41 -3
  114. teradataml/utils/validators.py +699 -18
  115. {teradataml-20.0.0.5.dist-info → teradataml-20.0.0.7.dist-info}/METADATA +312 -2
  116. {teradataml-20.0.0.5.dist-info → teradataml-20.0.0.7.dist-info}/RECORD +119 -87
  117. {teradataml-20.0.0.5.dist-info → teradataml-20.0.0.7.dist-info}/WHEEL +0 -0
  118. {teradataml-20.0.0.5.dist-info → teradataml-20.0.0.7.dist-info}/top_level.txt +0 -0
  119. {teradataml-20.0.0.5.dist-info → teradataml-20.0.0.7.dist-info}/zip-safe +0 -0
@@ -12,63 +12,74 @@ This file implements the teradataml dataframe.
  A teradataml dataframe maps virtually to teradata tables and views.
  """
  import decimal
- import inspect, itertools
+ import inspect
+ import itertools
  import json
  import numbers
- import pandas as pd
  import re
- import sqlalchemy
  import sys
  import urllib.parse
+ from collections import OrderedDict
+ from collections.abc import Iterator

+ import numpy as np
+ import pandas as pd
+ import sqlalchemy
  from sqlalchemy import Column
+ from sqlalchemy.exc import NoSuchColumnError
+ from datetime import datetime, date
+ from sqlalchemy.sql import ClauseElement
+ from teradatasql import OperationalError
+ from teradatasqlalchemy import types as tdtypes
+ from teradatasqlalchemy.dialect import dialect as td_dialect
+ from teradatasqlalchemy.dialect import preparer
+ from teradatasqlalchemy.types import (BIGINT, BYTEINT, DECIMAL, FLOAT, INTEGER,
+                                       PERIOD_TIMESTAMP, SMALLINT, _TDType)

  import teradataml.context.context as tdmlctx
-
- from collections import OrderedDict, namedtuple
- from sqlalchemy.sql import ClauseElement
- from teradataml import execute_sql
- from teradataml import GarbageCollector
- from teradataml.dataframe.sql import _MetaExpression
- from teradataml.dataframe.sql_interfaces import ColumnExpression
- from teradataml.dataframe.sql_functions import case
- from teradataml.series.series import Series
- from teradatasqlalchemy.types import _TDType, BIGINT, INTEGER, PERIOD_TIMESTAMP, SMALLINT, BYTEINT, FLOAT, DECIMAL
- from teradataml.common.deprecations import argument_deprecation
- from teradataml.common.utils import UtilFuncs
+ from teradataml import GarbageCollector, execute_sql
+ from teradataml.common.bulk_exposed_utils import \
+     _validate_unimplemented_function
+ from teradataml.common.constants import (AEDConstants, DataFrameTypes, OutputStyle,
+                                          PTITableConstants, PythonTypes,
+                                          SourceType, SQLConstants,
+                                          SQLFunctionConstants,
+                                          TableOperatorConstants,
+                                          TeradataConstants, TeradataTypes)
  from teradataml.common.exceptions import TeradataMlException
- from teradataml.common.messages import Messages
  from teradataml.common.messagecodes import MessageCodes
- from teradataml.common.constants import AEDConstants
- from teradataml.common.constants import SourceType, PythonTypes, TeradataConstants, \
-     TeradataTypes, PTITableConstants, TableOperatorConstants, SQLFunctionConstants
- from teradataml.dataframe.dataframe_utils import DataFrameUtils as df_utils, DataFrameUtils
- from teradataml.dataframe.indexer import _LocationIndexer
- from teradataml.common.aed_utils import AedUtils
- from teradataml.options.display import display
- from teradataml.options.configure import configure
+ from teradataml.common.messages import Messages
+ from teradataml.common.sqlbundle import SQLBundle
+ from teradataml.common.utils import UtilFuncs
  from teradataml.dataframe.copy_to import copy_to_sql
+ from teradataml.dataframe.data_transfer import _DataTransferUtils
+ from teradataml.dataframe.dataframe_utils import DataFrameUtils
+ from teradataml.dataframe.dataframe_utils import DataFrameUtils as df_utils
+ from teradataml.dataframe.indexer import _LocationIndexer
  from teradataml.dataframe.row import _Row
  from teradataml.dataframe.setop import concat
+ from teradataml.dataframe.sql import _MetaExpression
+ from teradataml.dataframe.sql_functions import case
+ from teradataml.dataframe.sql_interfaces import ColumnExpression
+ from teradataml.dataframe.window import Window
  from teradataml.dbutils.dbutils import list_td_reserved_keywords
+ from teradataml.options.configure import configure
+ from teradataml.options.display import display
  from teradataml.plot.plot import _Plot
  from teradataml.scriptmgmt.UserEnv import UserEnv
- from teradataml.utils.dtypes import _Dtypes, _ListOf, _TupleOf
- from teradataml.utils.validators import _Validators
+ from teradataml.series.series import Series
  from teradataml.table_operators.table_operator_util import _TableOperatorUtils
- from teradatasqlalchemy.dialect import preparer, dialect as td_dialect
- from teradatasql import OperationalError
- from teradataml.dataframe.window import Window
- from teradataml.dataframe.data_transfer import _DataTransferUtils
- from teradataml.common.bulk_exposed_utils import _validate_unimplemented_function
  from teradataml.telemetry_utils.queryband import collect_queryband
- from teradataml.options.configure import configure
- from teradataml.utils.internal_buffer import _InternalBuffer
- from teradataml.common.constants import OutputStyle
+ from teradataml.utils.dtypes import _Dtypes, _ListOf, _TupleOf
+ from teradataml.utils.validators import _Validators
+
+ # Adding imports at the end to avoid circular imports.
+ from teradataml.common.aed_utils import AedUtils

  # TODO use logger when available on master branch
  # logger = teradatapylog.getLogger()

+
  class in_schema:
      """
      Class takes a schema name, a table name and datalake name attributes
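
The deferred import of AedUtils at the bottom of this hunk follows the usual pattern for breaking an import cycle: the module that completes the cycle is imported only after everything it needs from this module has been defined. A minimal sketch with hypothetical module names (not taken from the diff):

    # a.py
    def do_work():
        # 'helper' is looked up in this module's globals at call time,
        # so it is fine that the import below runs after this definition.
        return helper() + 1

    # Deferred to the bottom: by the time b.py executes (and runs its own
    # "from a import do_work"), do_work above already exists.
    from b import helper

    # b.py
    from a import do_work

    def helper():
        return 41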
@@ -149,26 +160,37 @@ class DataFrame():
  on tables, views, and queries on Teradata Vantage.
  """

- def __init__(self, table_name=None, index=True, index_label=None, query=None, materialize=False):
+ def __init__(self, data=None, index=True, index_label=None, query=None, materialize=False, **kwargs):
      """
      Constructor for teradataml DataFrame.

      PARAMETERS:
-         table_name:
+         data:
              Optional Argument.
-             The table name or view name in Teradata Vantage referenced by this DataFrame.
-             Types: str
+             Specifies the input data to create a teradataml DataFrame.
+             Notes:
+                 If a dictionary is provided, it must follow the below requirements:
+                     * Keys must be strings (column names).
+                     * Values must be lists of equal length (column data).
+                     * Nested dictionaries are not supported.
+             Types: str OR pandas DataFrame OR in_schema OR numpy array OR list OR dictionary

          index:
              Optional Argument.
-             True if using index column for sorting, otherwise False.
+             If "data" is a string, then the argument specifies whether to use the index column
+             for sorting or not.
+             If "data" is a pandas DataFrame, then this argument specifies whether to
+             save Pandas DataFrame index as a column or not.
              Default Value: True
              Types: bool

          index_label:
              Optional Argument.
-             Column/s used for sorting.
-             Types: str OR list of Strings (str)
+             If "data" is a string, then the argument specifies column(s) used for sorting.
+             If "data" is a pandas DataFrame, then the default behavior is applied.
+             Note:
+                 * Refer to the "index_label" parameter of copy_to_sql() for details on the default behaviour.
+             Types: str OR list of str

          query:
              Optional Argument.
@@ -187,29 +209,136 @@ class DataFrame():
187
209
  Default Value: False (No materialization)
188
210
  Types: bool
189
211
 
212
+ kwargs:
213
+ table_name:
214
+ Optional Argument.
215
+ The table name or view name in Teradata Vantage referenced by this DataFrame.
216
+ Note:
217
+ * If "data" and "table_name" are both specified, then the "table_name" argument is ignored.
218
+ Types: str or in_schema
219
+
220
+ primary_index:
221
+ Optional Argument.
222
+ Specifies which column(s) to use as primary index for the teradataml DataFrame.
223
+ Note:
224
+ * This argument is only applicable when creating a DataFrame from a pandas DataFrame.
225
+ Types: str OR list of str
226
+
227
+ types:
228
+ Optional Argument.
229
+ Specifies required data types for requested columns to be saved in Teradata Vantage.
230
+ Notes:
231
+ * This argument is not applicable when "data" argument is of type str or in_schema.
232
+ * Refer to the "types" parameter of copy_to_sql() for more details.
233
+ Types: dict
234
+
235
+ columns:
236
+ Optional Argument.
237
+ Specifies the names of the columns to be used in the DataFrame.
238
+ Notes:
239
+ * This argument is not applicable when "data" argument is of type str or in_schema.
240
+ * If "data" is a dictionary and this argument is specified, only the specified columns will be
241
+ included in the DataFrame if the dictionary contains those keys. If the dictionary does not
242
+ contain the specified keys, those columns will be added with NaN values.
243
+ Types: str OR list of str
244
+
245
+ persist:
246
+ Optional Argument.
247
+ Specifies whether to persist the DataFrame.
248
+ Note:
249
+ * This argument is only applicable when the "data" argument is of type dict, list or
250
+ pandas DataFrame.
251
+ Default Value: False
252
+ Types: bool
253
+
190
254
  EXAMPLES:
191
- from teradataml.dataframe.dataframe import DataFrame
255
+ >>> from teradataml.dataframe.dataframe import DataFrame
256
+ >>> import pandas as pd
192
257
 
193
- # Example 1: The following example creates a DataFrame from the 'table_name'
194
- # or 'view_name'.
195
- # Created DataFrame using table name.
196
- df = DataFrame("mytab")
258
+ # Example 1: Create a teradataml DataFrame from table name.
259
+ >>> df = DataFrame("mytab")
197
260
 
198
- # Created DataFrame using view name.
199
- df = DataFrame("myview")
261
+ # Example 2: Create a teradataml DataFrame from view name.
262
+ >>> df = DataFrame("myview")
200
263
 
201
- # Created DataFrame using view name without using index column for sorting.
202
- df = DataFrame("myview", False)
264
+ # Example 3: Create a teradataml DataFrame using view name without using index column for sorting.
265
+ >>> df = DataFrame("myview", False)
203
266
 
204
- # Created DataFrame using table name and sorted using Col1 and Col2
205
- df = DataFrame("mytab", True, "Col1, Col2")
267
+ # Example 4: Create a teradataml DataFrame using table name and consider columns Col1 and Col2
268
+ # while running DataFrame.head() or DataFrame.tail() methods.
269
+ >>> df = DataFrame("mytab", True, ["Col1", "Col2"])
206
270
 
271
+ # Example 5: Create a teradataml DataFrame from the existing Vantage table "dbcinfo"
272
+ # in the non-default database "dbc" using the in_schema() object.
273
+ >>> from teradataml.dataframe.dataframe import in_schema
274
+ >>> df = DataFrame(in_schema("dbc", "dbcinfo"))
207
275
 
208
- # Example 2: The following example creates a DataFrame from the existing Vantage
209
- # table "dbcinfo" in the non-default database "dbc" using the
210
- # in_schema() function.
211
- from teradataml.dataframe.dataframe import in_schema
212
- df = DataFrame(in_schema("dbc", "dbcinfo"))
276
+ # Example 6: Create a teradataml DataFrame from a pandas DataFrame.
277
+ >>> pdf = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
278
+ >>> df = DataFrame(pdf)
279
+ >>> df
280
+ col1 col2 index_label
281
+ 0 3 6 2
282
+ 1 2 5 1
283
+ 2 1 4 0
284
+
285
+ # Example 7: Create a teradataml DataFrame from a pandas DataFrame without index column.
286
+ >>> pdf = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
287
+ >>> df = DataFrame(data=pdf, index=False)
288
+ >>> df
289
+ col1 col2
290
+ 0 3 6
291
+ 1 2 5
292
+ 2 1 4
293
+
294
+ # Example 8: Create a teradataml DataFrame from a pandas DataFrame with
295
+ # index label and primary index as 'id'.
296
+ >>> pdf = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
297
+ >>> df = DataFrame(pdf, index=True, index_label='id', primary_index='id')
298
+ >>> df
299
+ col1 col2
300
+ id
301
+ 2 3 6
302
+ 1 2 5
303
+ 0 1 4
304
+
305
+ # Example 9: Create a teradataml DataFrame from list of lists.
306
+ >>> df = DataFrame([[1, 2], [3, 4]])
307
+ >>> df
308
+ col_0 col_1 index_label
309
+ 0 3 4 1
310
+ 1 1 2 0
311
+
312
+ # Example 10: Create a teradataml DataFrame from numpy array.
313
+ >>> import numpy as np
314
+ >>> df = DataFrame(np.array([[1, 2], [3, 4]]), index=True, index_label="id")
315
+ >>> df
316
+ col_0 col_1
317
+ id
318
+ 1 3 4
319
+ 0 1 2
320
+
321
+ # Example 11: Create a teradataml DataFrame from a dictionary.
322
+ >>> df = DataFrame({"col1": [1, 2], "col2": [3, 4]}, index=True, index_label="id")
323
+ >>> df
324
+ col1 col2
325
+ id
326
+ 1 2 4
327
+ 0 1 3
328
+
329
+ # Example 12: Create a teradataml DataFrame from list of dictionaries.
330
+ >>> df = DataFrame([{"col1": 1, "col2": 2}, {"col1": 3, "col2": 4}], index=False)
331
+ >>> df
332
+ col1 col2
333
+ 0 3 4
334
+ 1 1 2
335
+
336
+ # Example 13: Create a teradataml DataFrame from list of tuples.
337
+ >>> df = DataFrame([("Alice", 1), ("Bob", 2)])
338
+ >>> df
339
+ col_0 col_1 index_label
340
+ 0 Alice 1 1
341
+ 1 Bob 2 0
213
342
 
214
343
  RAISES:
215
344
  TeradataMlException - TDMLDF_CREATE_FAIL
@@ -248,17 +377,39 @@ class DataFrame():
248
377
  # This attribute stores the root DataFrame columns.
249
378
  self._root_columns = None
250
379
 
380
+ # Internal argument, when this attribute is set to True, the teradataml DataFrame locks
381
+ # the corresponding row(s) in the underlying table(s) while accessing the data.
382
+ _lock_rows = kwargs.get("_lock_rows", False)
383
+
251
384
  self._datalake = None
252
385
  self._database = None
253
386
  self._table = None
254
387
  self._otf = False
255
-
256
- if isinstance(table_name, in_schema):
257
- self._table = table_name.table_name
258
- self._datalake = table_name.datalake_name
259
- self._database = table_name.schema_name
388
+ self._df_type = None
389
+ self._valid_time_column = None
390
+ self._transaction_time_column = None
391
+
392
+
393
+ table_name = kwargs.get("table_name", None)
394
+ primary_index = kwargs.get("primary_index", None)
395
+ columns = kwargs.get("columns", None)
396
+ types = kwargs.get("types", None)
397
+ persist = kwargs.get("persist", False)
398
+
399
+ # Check if the data is an instance of in_schema or if the data is None
400
+ # and table_name is an instance of in_schema, then assign the table_name,
401
+ # datalake_name and schema_name to the DataFrame object.
402
+ schema_obj = data if isinstance(data, in_schema) else (
403
+ table_name if data is None and isinstance(table_name, in_schema) else None)
404
+
405
+ if schema_obj:
406
+ self._table = schema_obj.table_name
407
+ self._datalake = schema_obj.datalake_name
408
+ self._database = schema_obj.schema_name
260
409
  self._otf = True if self._datalake else False
261
410
 
411
+ # Convert schema objects to strings.
412
+ data = str(data) if isinstance(data, in_schema) else data
262
413
  table_name = str(table_name) if isinstance(table_name, in_schema) else table_name
263
414
 
264
415
  # Below matrix is list of list, where in each row contains following elements:
@@ -277,18 +428,51 @@ class DataFrame():
277
428
  # 6. element6 --> A list of permitted values, an argument can accept.
278
429
  # If not specified, it is as good as passing None. If a list is passed, validation will be
279
430
  # performed for permitted values.
431
+
280
432
  awu_matrix = []
281
- awu_matrix.append(["table_name", table_name, True, (str), True])
433
+ dtypes = (list, tuple, dict)
434
+ awu_matrix.append(["data", data, True, (str, pd.DataFrame, np.ndarray, dict, _ListOf(dtypes)), True])
282
435
  awu_matrix.append(["index", index, True, (bool)])
283
436
  awu_matrix.append(["index_label", index_label, True, (str, list)])
284
437
  awu_matrix.append(["query", query, True, (str), True])
285
438
  awu_matrix.append(["materialize", materialize, True, (bool)])
439
+ awu_matrix.append(["table_name", table_name, True, (str), True])
440
+ awu_matrix.append(["primary_index", primary_index, True, (str, list)])
441
+ awu_matrix.append(["types", types, True, (dict)])
442
+ awu_matrix.append(["columns", columns, True, (str, list), True])
286
443
 
287
444
  # Validate argument types
288
445
  _Validators._validate_function_arguments(awu_matrix)
289
446
 
447
+ # Convert columns to list if it is a string.
448
+ if isinstance(columns, str):
449
+ columns = [columns]
450
+
290
451
  try:
291
- if table_name is not None:
452
+ if table_name is not None or data is not None:
453
+
454
+ # If data is list or numpy array or dictionary, then convert it to a pandas DataFrame.
455
+ if isinstance(data, (list, np.ndarray, dict)):
456
+ data = pd.DataFrame(data, columns=columns)
457
+ # If the data is a pandas DataFrame, then store the data in a temporary table in Vantage.
458
+ if isinstance(data, pd.DataFrame):
459
+ # Create a copy of the pandas DataFrame to avoid modifying the original,
460
+ # because column names will be changed if they are integers.
461
+ pd_data = data.copy()
462
+ # If the columns are not of type string, then convert them to string.
463
+ pd_data.columns = [f"col_{i}" if isinstance(i, int) else i for i in pd_data.columns]
464
+
465
+ # Set the table_name to the name of the table created in the database.
466
+ table_name = UtilFuncs._generate_temp_table_name(prefix="from_pandas",
467
+ table_type=TeradataConstants.TERADATA_TABLE,
468
+ gc_on_quit=not(persist))
469
+
470
+ copy_to_sql(pd_data, table_name, index=index, index_label=index_label, primary_index=primary_index,
471
+ types=types)
472
+ # If the data is a string, then set the table_name to the data.
473
+ elif isinstance(data, str):
474
+ table_name = data
475
+
292
476
  self._table_name = UtilFuncs._quote_table_names(table_name)
293
477
  self._source_type = SourceType.TABLE.value
294
478
  self._nodeid = self._aed_utils._aed_table(self._table_name)
@@ -329,6 +513,8 @@ class DataFrame():
329
513
 
330
514
  if configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE:
331
515
  __execute_params = (self._table_name, self._query, True)
516
+ elif configure.temp_object_type == TeradataConstants.TERADATA_VIEW:
517
+ __execute_params = (self._table_name, self._query, _lock_rows)
332
518
 
333
519
  try:
334
520
  __execute(*__execute_params)
@@ -342,6 +528,12 @@ class DataFrame():
342
528
  elif "[Error 3706] Syntax error" in str(oe):
343
529
  raise ValueError(Messages.get_message(
344
530
  MessageCodes.FROM_QUERY_SELECT_SUPPORTED).format("Check the syntax."))
531
+ elif "[Error 7825]" in str(oe):
532
+ # The UDF/XSP/UDM routine has thrown an SQLException
533
+ # with an SQL state in the range of 38001-38999 which
534
+ # is not a syntax error. Hence not a ValueError wrt query string.
535
+ # Expected when OTF snapshot related query is executed.
536
+ raise
345
537
  raise ValueError(Messages.get_message(
346
538
  MessageCodes.FROM_QUERY_SELECT_SUPPORTED))
347
539
 
@@ -351,6 +543,7 @@ class DataFrame():
351
543
  raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_CREATE_FAIL),
352
544
  MessageCodes.TDMLDF_CREATE_FAIL)
353
545
 
546
+ # _get_metaexpr() can be only used if self._table_name is set.
354
547
  if table_name or query:
355
548
  self._metaexpr = self._get_metaexpr()
356
549
  self._get_metadata_from_metaexpr(self._metaexpr)
@@ -503,7 +696,7 @@ class DataFrame():
503
696
  Types: str
504
697
 
505
698
  EXAMPLES:
506
- >>> from teradataml.dataframe.dataframe import DataFrame
699
+ >>> from teradataml import DataFrame
507
700
 
508
701
  # Example 1: The following example creates a DataFrame from a table or
509
702
  a view.
@@ -543,13 +736,13 @@ class DataFrame():
543
736
 
544
737
  """
545
738
  if schema_name:
546
- return cls(in_schema(schema_name, table_name, datalake_name))
547
-
548
- return cls(table_name, index, index_label)
739
+ return cls(table_name=in_schema(schema_name, table_name, datalake_name),
740
+ index=index, index_label=index_label)
741
+ return cls(table_name=table_name, index=index, index_label=index_label)
549
742
 
550
743
  @classmethod
551
744
  @collect_queryband(queryband="DF_fromQuery")
552
- def from_query(cls, query, index=True, index_label=None, materialize=False):
745
+ def from_query(cls, query, index=True, index_label=None, materialize=False, **kwargs):
553
746
  """
554
747
  Class method for creating a DataFrame from a query.
555
748
 
@@ -647,6 +840,7 @@ class DataFrame():
647
840
  df._nodeid = nodeid
648
841
  df._source_type = SourceType.TABLE.value
649
842
 
843
+
650
844
  if not reuse_metaexpr:
651
845
  # Create new _MetaExpression object using reference metaExpression
652
846
  # for newly created DataFrame.
@@ -692,6 +886,322 @@ class DataFrame():
692
886
  df.__setattr__(arg, arg_value)
693
887
  return df
694
888
 
889
+ @classmethod
890
+ @collect_queryband(queryband="DF_fromPandas")
891
+ def from_pandas(cls, pandas_df, index=True, index_label=None, primary_index=None, persist=False):
892
+ """
893
+ DESCRIPTION:
894
+ Creates a teradataml DataFrame from a pandas DataFrame.
895
+
896
+ PARAMETERS:
897
+ pandas_df:
898
+ Required Argument.
899
+ Specifies the pandas DataFrame to be converted to teradataml DataFrame.
900
+ Types: pandas DataFrame
901
+
902
+ index:
903
+ Optional Argument.
904
+ Specifies whether to save Pandas DataFrame index as a column or not.
905
+ Default Value: True
906
+ Types: bool
907
+
908
+ index_label:
909
+ Optional Argument.
910
+ Specifies the column label(s) for Pandas DataFrame index column(s).
911
+ Note:
912
+ * Refer to the "index_label" parameter of copy_to_sql() for more details.
913
+ Default Value: None
914
+ Types: str OR list of str
915
+
916
+ primary_index:
917
+ Optional Argument.
918
+ Specifies which column(s) to use as primary index for the teradataml DataFrame.
919
+ Types: str OR list of str
920
+
921
+ persist:
922
+ Optional Argument.
923
+ Specifies whether to persist the DataFrame.
924
+ Default Value: False
925
+ Types: bool
926
+
927
+ RETURNS:
928
+ teradataml DataFrame
929
+
930
+ RAISES:
931
+ TeradataMlException
932
+
933
+ EXAMPLES:
934
+ >>> import pandas as pd
935
+ >>> from teradataml import DataFrame
936
+ >>> pdf = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
937
+ >>> pdf1 = pd.DataFrame([[1, 2], [3, 4]])
938
+
939
+ # Example 1: Create a teradataml DataFrame from a pandas DataFrame.
940
+ >>> df = DataFrame.from_pandas(pdf)
941
+ >>> df
942
+ col1 col2 index_label
943
+ 0 3 6 2
944
+ 1 2 5 1
945
+ 2 1 4 0
946
+
947
+ # Example 2: Create a teradataml DataFrame from a pandas DataFrame
948
+ # and do not save the index as a column.
949
+ >>> df = DataFrame.from_pandas(pdf, index=False)
950
+ >>> df
951
+ col1 col2
952
+ 0 3 6
953
+ 1 2 5
954
+ 2 1 4
955
+
956
+ # Example 3: Create a teradataml DataFrame from a pandas DataFrame
957
+ # with index label as 'id' and set it as primary index.
958
+ >>> df = DataFrame.from_pandas(pdf, index=True, index_label='id', primary_index='id')
959
+ >>> df
960
+ col1 col2
961
+ id
962
+ 2 3 6
963
+ 1 2 5
964
+ 0 1 4
965
+
966
+ # Example 4: Create a teradataml DataFrame from a pandas DataFrame where
967
+ # columns are not explicitly defined in the pandas DataFrame.
968
+ >>> df = DataFrame.from_pandas(pdf1)
969
+ >>> df
970
+ col_0 col_1 index_label
971
+ 0 3 4 1
972
+ 1 1 2 0
973
+ """
974
+ # Validate 'pandas_df' argument, other arguments, will be validated as part of DataFrame().
975
+ arg_type_matrix = []
976
+ arg_type_matrix.append(["pandas_df", pandas_df, False, (pd.DataFrame,), True])
977
+ arg_type_matrix.append(["persist", persist, True, (bool), True])
978
+
979
+ _Validators._validate_function_arguments(arg_type_matrix)
980
+
981
+ return cls(pandas_df, index, index_label, primary_index=primary_index, persist=persist)
982
+
983
+ @classmethod
984
+ @collect_queryband(queryband="DF_fromDict")
985
+ def from_dict(cls, data, columns=None, persist=False):
986
+ """
987
+ DESCRIPTION:
988
+ Creates a DataFrame from a dictionary containing values as lists or numpy arrays.
989
+
990
+ PARAMETERS:
991
+ data:
992
+ Required Argument.
993
+ Specifies the Python dictionary to create a teradataml DataFrame.
994
+ Notes:
995
+ * Keys of the dictionary are used as column names.
996
+ * Values of the dictionary should be lists or numpy arrays.
997
+ * Nested dictionaries are not supported.
998
+ Types: dict
999
+
1000
+ columns:
1001
+ Optional Argument.
1002
+ Specifies the column names for the DataFrame.
1003
+ Types: str OR list of str
1004
+
1005
+ persist:
1006
+ Optional Argument.
1007
+ Specifies whether to persist the DataFrame.
1008
+ Default Value: False
1009
+ Types: bool
1010
+
1011
+ RETURNS:
1012
+ teradataml DataFrame
1013
+
1014
+ RAISES:
1015
+ TeradataMlException
1016
+
1017
+ EXAMPLES:
1018
+ >>> from teradataml import DataFrame
1019
+ >>> data_dict = {"name": ["Alice", "Bob", "Charlie"], "age": [25, 30, 28]}
1020
+
1021
+ # Example 1: Create a teradataml DataFrame from a dictionary where
1022
+ # keys are column names and values are lists of column data.
1023
+ >>> df = DataFrame.from_dict(data_dict)
1024
+ >>> df
1025
+ name age
1026
+ 0 Charlie 28
1027
+ 1 Bob 30
1028
+ 2 Alice 25
1029
+
1030
+ # Example 2: Create a teradataml DataFrame from a dictionary where
1031
+ # keys are column names and values are numpy arrays.
1032
+ >>> import numpy as np
1033
+ >>> data_dict = {"col1": np.array([1, 2, 3]), "col2": np.array([4, 5, 6])}
1034
+ >>> df = DataFrame.from_dict(data_dict)
1035
+ >>> df
1036
+ col1 col2
1037
+ 0 3 6
1038
+ 1 2 5
1039
+ 2 1 4
1040
+ """
1041
+ arg_type_matrix = []
1042
+ arg_type_matrix.append(["data", data, False, (dict), True])
1043
+ arg_type_matrix.append(["columns", columns, True, (str, list), True])
1044
+ arg_type_matrix.append(["persist", persist, True, (bool), True])
1045
+
1046
+ _Validators._validate_function_arguments(arg_type_matrix)
1047
+
1048
+ return cls(data, columns=columns, index=False, persist=persist)
1049
+
1050
+ @classmethod
1051
+ @collect_queryband(queryband="DF_fromRecords")
1052
+ def from_records(cls, data, columns=None, **kwargs):
1053
+ """
1054
+ DESCRIPTION:
1055
+ Create a DataFrame from a list of lists/tuples/dictionaries/numpy arrays.
1056
+
1057
+ PARAMETERS:
1058
+ data:
1059
+ Required Argument.
1060
+ Specifies the iterator of data or the list of lists/tuples/dictionaries/numpy arrays to
1061
+ be converted to teradataml DataFrame.
1062
+ Note:
1063
+ * Nested lists or tuples or dictionaries are not supported.
1064
+ Types: Iterator, list
1065
+
1066
+ columns:
1067
+ Optional Argument.
1068
+ Specifies the column names for the DataFrame.
1069
+ Note:
1070
+ * If the data is a list of lists/tuples/numpy arrays and this argument
1071
+ is not specified, column names will be auto-generated as 'col_0', 'col_1', etc.
1072
+ Types: str OR list of str
1073
+
1074
+ kwargs:
1075
+ exclude:
1076
+ Optional Argument.
1077
+ Specifies the columns to be excluded from the DataFrame.
1078
+ Types: list OR tuple
1079
+
1080
+ coerce_float:
1081
+ Optional Argument.
1082
+ Specifies whether to convert values of non-string, non-numeric objects (like decimal.Decimal)
1083
+ to floating point, useful for SQL result sets.
1084
+ Default Value: True
1085
+ Types: bool
1086
+
1087
+ nrows:
1088
+ Optional Argument.
1089
+ Specifies the number of rows to be read from the data if the data is iterator.
1090
+ Types: int
1091
+
1092
+ persist:
1093
+ Optional Argument.
1094
+ Specifies whether to persist the DataFrame.
1095
+ Default Value: False
1096
+ Types: bool
1097
+
1098
+ RETURNS:
1099
+ teradataml DataFrame
1100
+
1101
+ RAISES:
1102
+ TeradataMlException
1103
+
1104
+ EXAMPLES:
1105
+ >>> from teradataml import DataFrame
1106
+
1107
+ # Example 1: Create a teradataml DataFrame from a list of lists.
1108
+ >>> df = DataFrame.from_records([['Alice', 1], ['Bob', 2]], columns=['name', 'age'])
1109
+ >>> df
1110
+ name age
1111
+ 0 Bob 2
1112
+ 1 Alice 1
1113
+
1114
+ # Example 2: Create a teradataml DataFrame from a list of tuples.
1115
+ >>> df = DataFrame.from_records([('Alice', 1), ('Bob', 3)], columns=['name', 'age'])
1116
+ >>> df
1117
+ name age
1118
+ 0 Bob 3
1119
+ 1 Alice 1
1120
+
1121
+ # Example 3: Create a teradataml DataFrame from a list of dictionaries.
1122
+ >>> df = DataFrame.from_records([{'name': 'Alice', 'age': 4}, {'name': 'Bob', 'age': 2}])
1123
+ >>> df
1124
+ name age
1125
+ 0 Bob 2
1126
+ 1 Alice 4
1127
+
1128
+ # Example 4: Create a teradataml DataFrame from a list where columns
1129
+ # are not explicitly defined.
1130
+ >>> df = DataFrame.from_records([['Alice', 1], ['Bob', 2]])
1131
+ >>> df
1132
+ col_0 col_1
1133
+ 0 Bob 2
1134
+ 1 Alice 1
1135
+
1136
+ # Example 5: Create a teradataml DataFrame from a list by excluding 'grade' column.
1137
+ >>> df = DataFrame.from_records([['Alice', 1, 'A'], ['Bob', 2, 'B']],
1138
+ ... columns=['name', 'age', 'grade'],
1139
+ ... exclude=['grade'])
1140
+ >>> df
1141
+ name age
1142
+ 0 Bob 2
1143
+ 1 Alice 1
1144
+
1145
+ # Example 6: Create a teradataml DataFrame from a list of lists
1146
+ # with "coerce_float" set to False.
1147
+ >>> df = DataFrame.from_records([[1, Decimal('2.5')], [3, Decimal('4.0')]],
1148
+ ... columns=['col1', 'col2'], coerce_float=False)
1149
+ >>> df
1150
+ col1 col2
1151
+ 0 3 4.0
1152
+ 1 1 2.5
1153
+ >>> df.tdtypes
1154
+ col1 BIGINT()
1155
+ col2 VARCHAR(length=1024, charset='UNICODE')
1156
+
1157
+ # Example 7: Create a teradataml DataFrame from a list of lists
1158
+ # with "coerce_float" set to True.
1159
+ >>> from decimal import Decimal
1160
+ >>> df = DataFrame.from_records([[1, Decimal('2.5')], [3, Decimal('4.0')]],
1161
+ ... columns=['col1', 'col2'], coerce_float=True)
1162
+ >>> df
1163
+ col1 col2
1164
+ 0 3 4.0
1165
+ 1 1 2.5
1166
+ >>> df.tdtypes
1167
+ col1 BIGINT()
1168
+ col2 FLOAT()
1169
+
1170
+ # Example 8: Create a teradataml DataFrame from an iterator with "nrows" set to 2.
1171
+ >>> def data_gen():
1172
+ ... yield ['Alice', 1]
1173
+ ... yield ['Bob', 2]
1174
+ ... yield ['Charlie', 3]
1175
+ >>> df = DataFrame.from_records(data_gen(), columns=['name', 'age'], nrows=2)
1176
+ >>> df
1177
+ name age
1178
+ 0 Bob 2
1179
+ 1 Alice 1
1180
+ """
1181
+
1182
+ exclude = kwargs.get("exclude", None)
1183
+ coerce_float = kwargs.get("coerce_float", True)
1184
+ nrows = kwargs.get("nrows", None)
1185
+ persist = kwargs.get("persist", False)
1186
+
1187
+ arg_type_matrix = []
1188
+ dtypes = (list, tuple, dict)
1189
+ arg_type_matrix.append(["data", data, False, (Iterator, _ListOf(dtypes)), True])
1190
+ arg_type_matrix.append(["columns", columns, True, (str, _ListOf(str)), True])
1191
+ arg_type_matrix.append(["exclude", exclude, True, (_ListOf(str),), True])
1192
+ arg_type_matrix.append(["coerce_float", coerce_float, True, (bool, ), True])
1193
+ arg_type_matrix.append(["nrows", nrows, True, (int,), True])
1194
+ arg_type_matrix.append(["persist", persist, True, (bool,), True])
1195
+
1196
+ _Validators._validate_function_arguments(arg_type_matrix)
1197
+
1198
+ if isinstance(columns, str):
1199
+ columns = [columns]
1200
+
1201
+ df = pd.DataFrame.from_records(data, columns=columns, exclude=exclude,
1202
+ coerce_float=coerce_float, nrows=nrows)
1203
+ return cls(df, index=False, persist=persist)
1204
+
695
1205
  def create_temp_view(self, name):
696
1206
  """
697
1207
  DESCRIPTION:
@@ -1084,6 +1594,57 @@ class DataFrame():
1084
1594
  self._is_art = res[0][0] == 1
1085
1595
  return self._is_art
1086
1596
 
1597
+
1598
+ def _process_columns_metadata(self):
1599
+ """
1600
+ DESCRIPTION:
1601
+ Processes the metadata of columns to determine their time dimension properties
1602
+ and to check whether the database object is a view, a volatile table, or an ART table.
1603
+
1604
+ PARAMETERS:
1605
+ None
1606
+
1607
+ RAISES:
1608
+ None
1609
+
1610
+ RETURNS:
1611
+ Tuple containing five boolean values:
1612
+ - is_view: True if the database object is a view, False otherwise.
1613
+ - is_volatile: True if the database object is a volatile table, False otherwise.
1614
+ - is_art_table: True if the database object is an ART table, False otherwise.
1615
+ - has_valid_time: True if any column has a valid time dimension, False otherwise.
1616
+ - has_transaction_time: True if any column has a transaction time dimension, False otherwise.
1617
+ EXAMPLES:
1618
+ >>> load_example_data("teradataml", "Employee")
1619
+ >>> df = DataFrame.from_table("Employee")
1620
+ >>> is_view, is_volatile, is_art_table, valid_time, transaction_time = (
1621
+ df._process_columns_metadata()
1622
+ )
1623
+ >>> is_view, is_volatile, is_art_table, valid_time, transaction_time
1624
+ (False, False, False, True, True)
1625
+
1626
+ """
1627
+
1628
+ is_view = is_volatile = is_art_table = False
1629
+
1630
+ for col in self._metaexpr.c:
1631
+ metadata = col.expression.info
1632
+ time_dimension = metadata.get('time_dimension')
1633
+ is_view = metadata.get('is_view', is_view)
1634
+ is_volatile = metadata.get('is_volatile', is_volatile)
1635
+ is_art_table = metadata.get('is_art_table', is_art_table)
1636
+
1637
+ if time_dimension == "V":
1638
+ self._valid_time_column = col
1639
+
1640
+ if time_dimension == "T":
1641
+ self._transaction_time_column = col
1642
+
1643
+ has_valid_time = self._valid_time_column is not None
1644
+ has_transaction_time = self._transaction_time_column is not None
1645
+
1646
+ return is_view, is_volatile, is_art_table, has_valid_time, has_transaction_time
1647
+
1087
1648
  def _get_metadata_from_metaexpr(self, metaexpr):
1088
1649
  """
1089
1650
  Private method for setting _metaexpr and retrieving column names and types.
@@ -1136,6 +1697,7 @@ class DataFrame():
1136
1697
  meta = sqlalchemy.MetaData()
1137
1698
  db_schema = UtilFuncs._extract_db_name(self._table_name)
1138
1699
  db_table_name = UtilFuncs._extract_table_name(self._table_name)
1700
+
1139
1701
  if not self._datalake:
1140
1702
  t = sqlalchemy.Table(db_table_name, meta, schema=db_schema, autoload_with=eng)
1141
1703
  return _MetaExpression(t)
@@ -1149,12 +1711,22 @@ class DataFrame():
1149
1711
  datalake=self._datalake)
1150
1712
 
1151
1713
  # Extract column names and corresponding teradatasqlalchemy types.
1152
- col_names, col_types = df_utils._get_datalake_table_columns_info(self._database,
1153
- self._table,
1154
- self._datalake)
1714
+ try:
1715
+ # For latest OTF help table query results.
1716
+ col_names, col_types = df_utils._get_datalake_table_columns_info(self._database,
1717
+ self._table,
1718
+ self._datalake,
1719
+ use_dialect=True)
1720
+ except NoSuchColumnError:
1721
+ # For older OTF help table query result.
1722
+ col_names, col_types = df_utils._get_datalake_table_columns_info(self._database,
1723
+ self._table,
1724
+ self._datalake)
1725
+
1726
+ # Create a SQLAlchemy table object representing datalake table.
1155
1727
  t = sqlalchemy.Table(self._table, meta, schema=self._database,
1156
1728
  *(Column(col_name, col_type) for col_name, col_type in zip(col_names, col_types)))
1157
- return _MetaExpression(t)
1729
+ return _MetaExpression(t, datalake=self._datalake)
1158
1730
 
1159
1731
  def __getattr__(self, name):
1160
1732
  """
@@ -1693,8 +2265,150 @@ class DataFrame():
1693
2265
  td_metadata = [(column.name, repr(column.type)) for column in self._metaexpr.c]
1694
2266
  return MetaData(td_metadata)
1695
2267
 
1696
- @collect_queryband(queryband="DF_info")
1697
- def info(self, verbose=True, buf=None, max_cols=None, null_counts=False):
2268
+ @property
2269
+ def df_type(self):
2270
+ """
2271
+ DESCRIPTION:
2272
+ Returns the type of the DataFrame based on the underlying database object.
2273
+ Possible teradataml DataFrame types are:
2274
+ - VALID_TIME_VIEW: DataFrame is created on Valid-Time dimension view.
2275
+ - TRANSACTION_TIME_VIEW: DataFrame is created on Transaction-Time dimension view.
2276
+ - BI_TEMPORAL_VIEW: DataFrame is created on Bi-temporal view.
2277
+ - VALID_TIME: DataFrame is created on Valid-Time dimension table.
2278
+ - TRANSACTION_TIME: DataFrame is created on Transaction-Time dimension table.
2279
+ - BI_TEMPORAL: DataFrame is created on Bi-temporal dimension table.
2280
+ - VIEW: DataFrame is created on a view.
2281
+ - TABLE: DataFrame is created on a table.
2282
+ - OTF: DataFrame is created on an OTF table.
2283
+ - ART: DataFrame is created on an ART table.
2284
+ - VOLATILE_TABLE: DataFrame is created on a volatile table.
2285
+ - BI_TEMPORAL_VOLATILE_TABLE: DataFrame is created on a Bi-temporal dimension volatile table.
2286
+ - VALID_TIME_VOLATILE_TABLE: DataFrame is created on a Valid-Time dimension volatile table.
2287
+ - TRANSACTION_TIME_VOLATILE_TABLE: DataFrame is created on a Transaction-Time dimension volatile table.
2288
+
2289
+ RETURNS:
2290
+ str
2291
+
2292
+ RAISES:
2293
+ None
2294
+
2295
+ EXAMPLES:
2296
+ # Load the data to run the example.
2297
+ >>> load_example_data("teradataml", "Employee_roles") # load valid time data.
2298
+ >>> load_example_data("teradataml", "Employee_Address") # load transaction time data.
2299
+ >>> load_example_data("teradataml", "Employee") # load bitemporal data.
2300
+ >>> load_example_data("uaf", ["ocean_buoys2"]) # load data to create art table.
2301
+ >>> load_example_data('dataframe', ['admissions_train']) # load data to create a regular table.
2302
+
2303
+ # Example 1: DataFrame created on a Valid-Time dimension table.
2304
+ >>> df = DataFrame.from_table('Employee_roles')
2305
+ >>> df.df_type
2306
+ 'VALID_TIME'
2307
+
2308
+ # Example 2: DataFrame created on a Transaction-Time dimension table.
2309
+ >>> df = DataFrame.from_table('Employee_Address')
2310
+ >>> df.df_type
2311
+ 'TRANSACTION_TIME'
2312
+
2313
+ # Example 3: DataFrame created on a Bi-temporal dimension table.
2314
+ >>> df = DataFrame.from_table('Employee')
2315
+ >>> df.df_type
2316
+ 'BI_TEMPORAL'
2317
+
2318
+ # Example 4: DataFrame created on an ART table.
2319
+ >>> data = DataFrame.from_table('ocean_buoys2')
2320
+ >>> from teradataml import TDSeries,SInfo
2321
+ >>> data_series_df = TDSeries(data=data,
2322
+ ... id=["ocean_name","buoyid"],
2323
+ ... row_index="TD_TIMECODE",
2324
+ ... row_index_style="TIMECODE",
2325
+ ... payload_field="jsoncol.Measure.salinity",
2326
+ ... payload_content="REAL")
2327
+ >>> uaf_out = SInfo(data=data_series_df, output_table_name='TSINFO_RESULTS')
2328
+ >>> df = DataFrame.from_table('TSINFO_RESULTS')
2329
+ >>> df.df_type
2330
+ 'ART'
2331
+
2332
+ # Example 5: DataFrame created on a regular table.
2333
+ >>> df = DataFrame.from_table('admissions_train')
2334
+ >>> df.df_type
2335
+ 'REGULAR_TABLE'
2336
+
2337
+ # Example 6: DataFrame created on a volatile table.
2338
+ >>> df = DataFrame.from_table('admissions_train')
2339
+ >>> df.to_sql(table_name='admissions_train_volatile', temporary=True)
2340
+ >>> df = DataFrame.from_table('admissions_train_volatile')
2341
+ >>> df.df_type
2342
+ 'VOLATILE_TABLE'
2343
+
2344
+ # Example 7: DataFrame created on a Bi-temporal dimension view.
2345
+ >>> execute_sql('create view Employee_view AS SEQUENCED VALIDTIME AND SEQUENCED TRANSACTIONTIME select * from Employee')
2346
+ >>> df = DataFrame.from_table('Employee_view')
2347
+ >>> df.df_type
2348
+ 'BI_TEMPORAL_VIEW'
2349
+
2350
+ """
2351
+
2352
+ if self._df_type is not None:
2353
+ return self._df_type
2354
+
2355
+ is_view, is_volatile, is_art_table, valid_time, transaction_time = (
2356
+ self._process_columns_metadata()
2357
+ )
2358
+
2359
+ # Check if the DataFrame is created from an OTF table
2360
+ if self._otf:
2361
+ self._df_type = DataFrameTypes.OTF_TABLE.value
2362
+ return self._df_type
2363
+
2364
+ # Check if the DataFrame is created from an ART table
2365
+ if is_art_table:
2366
+ self._df_type = DataFrameTypes.ART_TABLE.value
2367
+ return self._df_type
2368
+
2369
+ # Determine the type based on valid-time, transaction-time columns, and volatility
2370
+ if valid_time and transaction_time:
2371
+ if is_volatile:
2372
+ self._df_type = DataFrameTypes.BI_TEMPORAL_VOLATILE_TABLE.value
2373
+ else:
2374
+ self._df_type = (
2375
+ DataFrameTypes.BI_TEMPORAL_VIEW.value
2376
+ if is_view
2377
+ else DataFrameTypes.BI_TEMPORAL.value
2378
+ )
2379
+ elif valid_time:
2380
+ if is_volatile:
2381
+ self._df_type = DataFrameTypes.VALID_TIME_VOLATILE_TABLE.value
2382
+ else:
2383
+ self._df_type = (
2384
+ DataFrameTypes.VALID_TIME_VIEW.value
2385
+ if is_view
2386
+ else DataFrameTypes.VALID_TIME.value
2387
+ )
2388
+ elif transaction_time:
2389
+ if is_volatile:
2390
+ self._df_type = DataFrameTypes.TRANSACTION_TIME_VOLATILE_TABLE.value
2391
+ else:
2392
+ self._df_type = (
2393
+ DataFrameTypes.TRANSACTION_TIME_VIEW.value
2394
+ if is_view
2395
+ else DataFrameTypes.TRANSACTION_TIME.value
2396
+ )
2397
+ else:
2398
+ self._df_type = (
2399
+ DataFrameTypes.VOLATILE_TABLE.value
2400
+ if is_volatile
2401
+ else (
2402
+ DataFrameTypes.VIEW.value
2403
+ if is_view
2404
+ else DataFrameTypes.REGULAR_TABLE.value
2405
+ )
2406
+ )
2407
+
2408
+ return self._df_type
2409
+
2410
+ @collect_queryband(queryband="DF_info")
2411
+ def info(self, verbose=True, buf=None, max_cols=None, null_counts=False):
1698
2412
  """
1699
2413
  DESCRIPTION:
1700
2414
  Print a summary of the DataFrame.
@@ -5888,8 +6602,11 @@ class DataFrame():
5888
6602
  groupby_col_names.append(col)
5889
6603
  groupby_col_types.append(self[col].type)
5890
6604
 
5891
- if col in col_names:
5892
- # If group by column is not specified in the columns argument,
6605
+ include_grouping_columns = True if isinstance(self, DataFrameGroupBy) and \
6606
+ self._include_grouping_columns else False
6607
+ if not include_grouping_columns and col in col_names:
6608
+ # If 'include_grouping_columns' argument is set to True, or the
+ # group by column is not specified in the columns argument,
5893
6610
  # then, we should ignore this processing, otherwise we
5894
6611
  # should process it in the same way to remove the reference
5895
6612
  # for grouping column from aggregation list.
@@ -5933,7 +6650,8 @@ class DataFrame():
5933
6650
 
5934
6651
  new_metaexpr = UtilFuncs._get_metaexpr_using_columns(aggregate_node_id,
5935
6652
  zip(new_column_names,
5936
- new_column_types))
6653
+ new_column_types),
6654
+ datalake=self._metaexpr.datalake)
5937
6655
  agg_df = self._create_dataframe_from_node \
5938
6656
  (aggregate_node_id, new_metaexpr, self._index_label)
5939
6657
 
@@ -6352,7 +7070,8 @@ class DataFrame():
6352
7070
  sel_nodeid = self._aed_utils._aed_select(self._nodeid, column_expression)
6353
7071
 
6354
7072
  # Constructing new Metadata (_metaexpr) without DB; using dummy select_nodeid and underlying table name.
6355
- new_metaexpr = UtilFuncs._get_metaexpr_using_columns(sel_nodeid, col_names_types.items())
7073
+ new_metaexpr = UtilFuncs._get_metaexpr_using_columns(sel_nodeid, col_names_types.items(),
7074
+ datalake=self._metaexpr.datalake)
6356
7075
  return self._create_dataframe_from_node(sel_nodeid, new_metaexpr, self._index_label)
6357
7076
 
6358
7077
  except TeradataMlException:
@@ -7302,7 +8021,8 @@ class DataFrame():
7302
8021
 
7303
8022
  # Step 4: Constructing new Metadata (_metaexpr) without DB; using dummy select_nodeid
7304
8023
  # and underlying table name.
7305
- new_metaexpr = UtilFuncs._get_metaexpr_using_columns(join_node_id, new_metaexpr_columns_types.items())
8024
+ new_metaexpr = UtilFuncs._get_metaexpr_using_columns(join_node_id, new_metaexpr_columns_types.items(),
8025
+ datalake=self._metaexpr.datalake)
7306
8026
 
7307
8027
  # Return a new joined dataframe.
7308
8028
  return self._create_dataframe_from_node(join_node_id, new_metaexpr, self._index_label)
@@ -8675,7 +9395,6 @@ class DataFrame():
8675
9395
 
8676
9396
  return df
8677
9397
 
8678
-
8679
9398
  @collect_queryband(queryband="DF_get")
8680
9399
  def get(self, key):
8681
9400
  """
@@ -8785,7 +9504,7 @@ class DataFrame():
8785
9504
  append:
8786
9505
  Optional Argument.
8787
9506
  Specifies whether or not to append requested columns to the existing index.
8788
- ` When append is False, replaces existing index.
9507
+ When append is False, replaces existing index.
8789
9508
  When append is True, retains both existing & currently appended index.
8790
9509
  Default Value: False
8791
9510
  Types: bool
@@ -8998,6 +9717,15 @@ class DataFrame():
8998
9717
  Permitted Values: "CUBE", "ROLLUP", None
8999
9718
  Types: str or NoneType
9000
9719
 
9720
+ include_grouping_columns:
9721
+ Optional Argument.
9722
+ Specifies whether to include aggregations on the grouping column(s) or not.
9723
+ When set to True, the resultant DataFrame will have the aggregations on the
9724
+ columns mentioned in "columns_expr". Otherwise, resultant DataFrame will not have
9725
+ aggregations on the columns mentioned in "columns_expr".
9726
+ Default Value: False
9727
+ Types: bool
9728
+
9001
9729
  NOTES:
9002
9730
  1. Users can still apply teradataml DataFrame methods (filters/sort/etc) on top of the result.
9003
9731
  2. Consecutive operations of grouping, i.e., groupby_time(), resample() and groupby() are not permitted.
@@ -9014,14 +9742,54 @@ class DataFrame():
9014
9742
  TeradataMlException
9015
9743
 
9016
9744
  EXAMPLES:
9745
+ # Load the data to run the example.
9017
9746
  >>> load_example_data("dataframe","admissions_train")
9747
+
9748
+ # Create a DataFrame on 'admissions_train' table.
9018
9749
  >>> df = DataFrame("admissions_train")
9750
+ >>> df
9751
+ masters gpa stats programming admitted
9752
+ id
9753
+ 15 yes 4.00 Advanced Advanced 1
9754
+ 34 yes 3.85 Advanced Beginner 0
9755
+ 13 no 4.00 Advanced Novice 1
9756
+ 38 yes 2.65 Advanced Beginner 1
9757
+ 5 no 3.44 Novice Novice 0
9758
+ 40 yes 3.95 Novice Beginner 0
9759
+ 7 yes 2.33 Novice Novice 1
9760
+ 22 yes 3.46 Novice Beginner 0
9761
+ 26 yes 3.57 Advanced Advanced 1
9762
+ 17 no 3.83 Advanced Advanced 1
9763
+
9764
+ # Example 1: Find the minimum value of all valid columns by
9765
+ # grouping the DataFrame with column 'masters'.
9019
9766
  >>> df1 = df.groupby(["masters"])
9020
9767
  >>> df1.min()
9021
9768
  masters min_id min_gpa min_stats min_programming min_admitted
9022
9769
  0 no 3 1.87 Advanced Advanced 0
9023
9770
  1 yes 1 1.98 Advanced Advanced 0
9024
9771
 
9772
+ # Example 2: Find the sum of all valid columns by grouping the DataFrame
9773
+ # with columns 'masters' and 'admitted'. Include grouping columns
9774
+ # in aggregate function 'sum'.
9775
+ >>> df1 = df.groupby(["masters", "admitted"], include_grouping_columns=True)
9776
+ >>> df1.sum()
9777
+ masters admitted sum_id sum_gpa sum_admitted
9778
+ 0 yes 1 188 34.35 10
9779
+ 1 yes 0 289 43.36 0
9780
+ 2 no 0 41 6.44 0
9781
+ 3 no 1 302 57.52 16
9782
+
9783
+ # Example 3: Find the sum of all valid columns by grouping the DataFrame with
9784
+ # columns 'masters' and 'admitted'. Do not include grouping columns
9785
+ # in aggregate function 'sum'.
9786
+ >>> df1 = df.groupby(["masters", "admitted"], include_grouping_columns=False)
9787
+ >>> df1.sum()
9788
+ masters admitted sum_id sum_gpa
9789
+ 0 yes 0 289 43.36
9790
+ 1 no 0 41 6.44
9791
+ 2 no 1 302 57.52
9792
+ 3 yes 1 188 34.35
9025
9793
  """
9026
9794
  # Argument validations
9027
9795
  arg_info_matrix = []
@@ -9029,6 +9797,8 @@ class DataFrame():
9029
9797
  option = kwargs.get("option", None)
9030
9798
  arg_info_matrix.append(["option", option, True, (str, type(None)), True,
9031
9799
  ["CUBE", "ROLLUP", None]])
9800
+ include_grouping_columns = kwargs.get("include_grouping_columns", False)
9801
+ arg_info_matrix.append(["include_grouping_columns", include_grouping_columns, True, (bool)])
9032
9802
 
9033
9803
  # Validate argument types
9034
9804
  _Validators._validate_function_arguments(arg_info_matrix)
@@ -9073,7 +9843,8 @@ class DataFrame():
9073
9843
 
9074
9844
  groupbyexpr = ', '.join(UtilFuncs._teradata_quote_arg(col, "\"", False) for col in column_list)
9075
9845
  groupbyObj = DataFrameGroupBy(self._nodeid, self._metaexpr, self._column_names_and_types, self.columns,
9076
- groupbyexpr, column_list, option)
9846
+ groupbyexpr, column_list, option, include_grouping_columns)
9847
+
9077
9848
  return groupbyObj
9078
9849
  except TeradataMlException:
9079
9850
  raise
@@ -11437,7 +12208,8 @@ class DataFrame():
11437
12208
  column_info = ((col_name, col_type) for col_name, col_type in
11438
12209
  new_metaexpr_columns_types.items())
11439
12210
  # Get new metaexpr for sample_node_id
11440
- new_metaexpr = UtilFuncs._get_metaexpr_using_columns(sample_node_id, column_info, is_persist=True)
12211
+ new_metaexpr = UtilFuncs._get_metaexpr_using_columns(sample_node_id, column_info, is_persist=True,
12212
+ datalake=self._metaexpr.datalake)
11441
12213
 
11442
12214
  # Make this non-lazy. Added this in order to fix https://teradata-pe.atlassian.net/browse/ELE-6368
11443
12215
  # Cannot use __execute_node_and_set_table_name because self points to original df.
@@ -12872,9 +13644,9 @@ class DataFrame():
12872
13644
  3. When ColumnExpression(s) is(are) passed to "order_columns", then the
12873
13645
  corresponding expression takes precedence over arguments
12874
13646
  "sort_ascending" and "nulls_first". Say, ColumnExpression is col1, then
12875
- 1. col1.asc() or col.desc() is effective irrespective of "sort_ascending".
12876
- 2. col1.nulls_first() or col.nulls_last() is effective irrespective of "nulls_first".
12877
- 3. Any combination of above two take precedence over "sort_ascending" and "nulls_first".
13647
+ 1. col1.asc() or col.desc() is effective irrespective of "sort_ascending".
13648
+ 2. col1.nulls_first() or col.nulls_last() is effective irrespective of "nulls_first".
13649
+ 3. Any combination of above two take precedence over "sort_ascending" and "nulls_first".
12878
13650
  Types: str OR list of Strings (str) OR ColumnExpression OR list of ColumnExpressions
12879
13651
 
12880
13652
  sort_ascending:
@@ -13150,12 +13922,14 @@ class DataFrame():
13150
13922
  False)
13151
13923
  column_names = list(dict.fromkeys(column_names))
13152
13924
 
13153
- if list_td_reserved_keywords(column_names) or UtilFuncs._is_ascii(column_names):
13925
+ if list_td_reserved_keywords(column_names) or UtilFuncs._is_non_ascii(column_names):
13154
13926
  column_names = UtilFuncs._teradata_quote_arg(column_names, "\"", False)
13155
13927
 
13156
13928
  col_names_types = df_utils._get_required_columns_types_from_metaexpr(self._metaexpr, column_names)
13157
13929
  sel_nodeid = self._aed_utils._aed_select(self._nodeid, ','.join(column_names), True)
13158
- new_metaexpr = UtilFuncs._get_metaexpr_using_columns(sel_nodeid, col_names_types.items())
13930
+ new_metaexpr = UtilFuncs._get_metaexpr_using_columns(sel_nodeid, col_names_types.items(),
13931
+ datalake=self._metaexpr.datalake)
13932
+
13159
13933
  return self._create_dataframe_from_node(sel_nodeid, new_metaexpr, self._index_label)
13160
13934
 
13161
13935
  @collect_queryband(queryband="DF_toCsv")
@@ -15336,7 +16110,7 @@ class DataFrame():
15336
16110
  return self.assign(**new_columns, drop_columns=True).select(self.columns)
15337
16111
 
15338
16112
  @collect_queryband(queryband="DF_cube")
15339
- def cube(self, columns):
16113
+ def cube(self, columns, include_grouping_columns=False):
15340
16114
  """
15341
16115
  DESCRIPTION:
15342
16116
  cube() function creates a multi-dimensional cube for the DataFrame
@@ -15350,6 +16124,15 @@ class DataFrame():
15350
16124
  Specifies the name(s) of input teradataml DataFrame column(s).
15351
16125
  Types: str OR list of str(s)
15352
16126
 
16127
+ include_grouping_columns:
16128
+ Optional Argument.
16129
+ Specifies whether to include aggregations on the grouping column(s) or not.
16130
+ When set to True, the resultant DataFrame will have the aggregations on the
16131
+ columns mentioned in "columns". Otherwise, resultant DataFrame will not have
16132
+ aggregations on the columns mentioned in "columns".
16133
+ Default Value: False
16134
+ Types: bool
16135
+
15353
16136
  RETURNS:
15354
16137
  teradataml DataFrameGroupBy
15355
16138
 
@@ -15357,9 +16140,27 @@ class DataFrame():
15357
16140
  TeradataMlException
15358
16141
 
15359
16142
  EXAMPLES :
15360
- # Example 1: Analyzes the data by grouping into masters and stats dimensions.
16143
+ # Load the data to run the example.
15361
16144
  >>> load_example_data("dataframe","admissions_train")
16145
+
16146
+ # Create a DataFrame on 'admissions_train' table.
15362
16147
  >>> df = DataFrame("admissions_train")
16148
+ >>> df
16149
+ masters gpa stats programming admitted
16150
+ id
16151
+ 15 yes 4.00 Advanced Advanced 1
16152
+ 34 yes 3.85 Advanced Beginner 0
16153
+ 13 no 4.00 Advanced Novice 1
16154
+ 38 yes 2.65 Advanced Beginner 1
16155
+ 5 no 3.44 Novice Novice 0
16156
+ 40 yes 3.95 Novice Beginner 0
16157
+ 7 yes 2.33 Novice Novice 1
16158
+ 22 yes 3.46 Novice Beginner 0
16159
+ 26 yes 3.57 Advanced Advanced 1
16160
+ 17 no 3.83 Advanced Advanced 1
16161
+
16162
+ # Example 1: Find the sum of all valid columns by grouping the
16163
+ # DataFrame with columns 'masters' and 'stats'.
15363
16164
  >>> df1 = df.cube(["masters", "stats"]).sum()
15364
16165
  >>> df1
15365
16166
  masters stats sum_id sum_gpa sum_admitted
@@ -15374,10 +16175,42 @@ class DataFrame():
15374
16175
  8 no Advanced 189 34.95 9
15375
16176
  9 yes Novice 98 13.74 1
15376
16177
 
16178
+ # Example 2: Find the avg of all valid columns by grouping the DataFrame
16179
+ # with columns 'masters' and 'admitted'. Include grouping columns
16180
+ # in aggregate function 'avg'.
16181
+ >>> df1 = df.cube(["masters", "admitted"], include_grouping_columns=True).avg()
16182
+ >>> df1
16183
+ masters admitted avg_id avg_gpa avg_admitted
16184
+ 0 yes NaN 21.681818 3.532273 0.454545
16185
+ 1 None 1.0 18.846154 3.533462 1.000000
16186
+ 2 no NaN 19.055556 3.553333 0.888889
16187
+ 3 yes 0.0 24.083333 3.613333 0.000000
16188
+ 4 None NaN 20.500000 3.541750 0.650000
16189
+ 5 None 0.0 23.571429 3.557143 0.000000
16190
+ 6 yes 1.0 18.800000 3.435000 1.000000
16191
+ 7 no 1.0 18.875000 3.595000 1.000000
16192
+ 8 no 0.0 20.500000 3.220000 0.000000
16193
+
16194
+ # Example 3: Find the avg of all valid columns by grouping the DataFrame with
16195
+ # columns 'masters' and 'admitted'. Do not include grouping columns
16196
+ # in aggregate function 'avg'.
16197
+ >>> df1 = df.cube(["masters", "admitted"], include_grouping_columns=False).avg()
16198
+ >>> df1
16199
+ masters admitted avg_id avg_gpa
16200
+ 0 no 0.0 20.500000 3.220000
16201
+ 1 None 1.0 18.846154 3.533462
16202
+ 2 no NaN 19.055556 3.553333
16203
+ 3 yes 0.0 24.083333 3.613333
16204
+ 4 None NaN 20.500000 3.541750
16205
+ 5 None 0.0 23.571429 3.557143
16206
+ 6 yes 1.0 18.800000 3.435000
16207
+ 7 yes NaN 21.681818 3.532273
16208
+ 8 no 1.0 18.875000 3.595000
15377
16209
  """
15378
16210
  # Validate columns argument.
15379
16211
  arg_info_matrix = []
15380
16212
  arg_info_matrix.append(["columns", columns, False, (str, list), True])
16213
+ arg_info_matrix.append(["include_grouping_columns", include_grouping_columns, False, bool])
15381
16214
 
15382
16215
  # Validate argument types
15383
16216
  _Validators._validate_function_arguments(arg_info_matrix)
@@ -15387,10 +16220,10 @@ class DataFrame():
15387
16220
 
15388
16221
  # Query generation of cube API is same as the group by.
15389
16222
  # Only 'cube' is concatenated with 'group by' clause.
15390
- return self.groupby(columns, option="cube")
16223
+ return self.groupby(columns, option="cube", include_grouping_columns=include_grouping_columns)
15391
16224
 
15392
16225
  @collect_queryband(queryband="DF_rollup")
15393
- def rollup(self, columns):
16226
+ def rollup(self, columns, include_grouping_columns=False):
15394
16227
  """
15395
16228
  DESCRIPTION:
15396
16229
  rollup() function creates a multi-dimensional rollup for the DataFrame
@@ -15404,6 +16237,15 @@ class DataFrame():
15404
16237
  Specifies the name(s) of input teradataml DataFrame column(s).
15405
16238
  Types: str OR list of str(s)
15406
16239
 
16240
+ include_grouping_columns:
16241
+ Optional Argument.
16242
+ Specifies whether to include aggregations on the grouping column(s) or not.
16243
+ When set to True, the resultant DataFrame will have the aggregations on the
16244
+ columns mentioned in "columns". Otherwise, resultant DataFrame will not have
16245
+ aggregations on the columns mentioned in "columns".
16246
+ Default Value: False
16247
+ Types: bool
16248
+
15407
16249
  RETURNS:
15408
16250
  teradataml DataFrameGroupBy
15409
16251
 
@@ -15411,9 +16253,27 @@ class DataFrame():
15411
16253
  TeradataMlException
15412
16254
 
15413
16255
  EXAMPLES :
15414
- # Example 1: Analyzes the data by grouping into masters and stats dimensions.
16256
+ # Load the data to run the example.
15415
16257
  >>> load_example_data("dataframe","admissions_train")
16258
+
16259
+ # Create a DataFrame on 'admissions_train' table.
15416
16260
  >>> df = DataFrame("admissions_train")
16261
+ >>> df
16262
+ masters gpa stats programming admitted
16263
+ id
16264
+ 15 yes 4.00 Advanced Advanced 1
16265
+ 34 yes 3.85 Advanced Beginner 0
16266
+ 13 no 4.00 Advanced Novice 1
16267
+ 38 yes 2.65 Advanced Beginner 1
16268
+ 5 no 3.44 Novice Novice 0
16269
+ 40 yes 3.95 Novice Beginner 0
16270
+ 7 yes 2.33 Novice Novice 1
16271
+ 22 yes 3.46 Novice Beginner 0
16272
+ 26 yes 3.57 Advanced Advanced 1
16273
+ 17 no 3.83 Advanced Advanced 1
16274
+
16275
+ # Example 1: Find the sum of all valid columns by grouping the
16276
+ # DataFrame with columns 'masters' and 'stats'.
15417
16277
  >>> df1 = df.rollup(["masters", "stats"]).sum()
15418
16278
  >>> df1
15419
16279
  masters stats sum_id sum_gpa sum_admitted
@@ -15427,10 +16287,38 @@ class DataFrame():
15427
16287
  7 yes Advanced 366 49.26 7
15428
16288
  8 no Advanced 189 34.95 9
15429
16289
 
16290
+ # Example 2: Find the avg of all valid columns by grouping the DataFrame
16291
+ # with columns 'masters' and 'admitted'. Include grouping columns
16292
+ # in aggregate function 'avg'.
16293
+ >>> df1 = df.rollup(["masters", "admitted"], include_grouping_columns=True).avg()
16294
+ >>> df1
16295
+ masters admitted avg_id avg_gpa avg_admitted
16296
+ 0 no NaN 19.055556 3.553333 0.888889
16297
+ 1 yes NaN 21.681818 3.532273 0.454545
16298
+ 2 None NaN 20.500000 3.541750 0.650000
16299
+ 3 yes 0.0 24.083333 3.613333 0.000000
16300
+ 4 no 1.0 18.875000 3.595000 1.000000
16301
+ 5 yes 1.0 18.800000 3.435000 1.000000
16302
+ 6 no 0.0 20.500000 3.220000 0.000000
16303
+
16304
+ # Example 3: Find the avg of all valid columns by grouping the DataFrame with
16305
+ # columns 'masters' and 'admitted'. Do not include grouping columns
16306
+ # in aggregate function 'avg'.
16307
+ >>> df1 = df.rollup(["masters", "admitted"], include_grouping_columns=False).avg()
16308
+ >>> df1
16309
+ masters admitted avg_id avg_gpa
16310
+ 0 no NaN 19.055556 3.553333
16311
+ 1 yes NaN 21.681818 3.532273
16312
+ 2 no 0.0 20.500000 3.220000
16313
+ 3 yes 0.0 24.083333 3.613333
16314
+ 4 no 1.0 18.875000 3.595000
16315
+ 5 yes 1.0 18.800000 3.435000
16316
+ 6 None NaN 20.500000 3.541750
15430
16317
  """
15431
16318
  # Validate columns argument.
15432
16319
  arg_info_matrix = []
15433
16320
  arg_info_matrix.append(["columns", columns, False, (str, list), True])
16321
+ arg_info_matrix.append(["include_grouping_columns", include_grouping_columns, False, bool])
15434
16322
 
15435
16323
  # Validate argument types
15436
16324
  _Validators._validate_function_arguments(arg_info_matrix)
@@ -15440,7 +16328,1126 @@ class DataFrame():
15440
16328
 
15441
16329
  # Query generation of cube API is same as the group by.
15442
16330
  # Only 'rollup' is concatenated with 'group by' clause.
15443
- return self.groupby(columns, option="rollup")
16331
+ return self.groupby(columns, option="rollup", include_grouping_columns=include_grouping_columns)
16332
+
16333
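For readers comparing the cube() and rollup() example outputs above, the difference is simply which grouping sets the SQL engine expands the grouping columns into. The short standalone sketch below (plain Python, not part of teradataml; the names cols, cube_sets and rollup_sets are illustrative) enumerates those sets for two grouping columns, which is why cube() returns subtotal rows for every column combination while rollup() only returns them for leading prefixes.

    from itertools import combinations

    cols = ["masters", "admitted"]

    # CUBE: every subset of the grouping columns, including the grand total (empty set).
    cube_sets = [set(c) for r in range(len(cols), -1, -1) for c in combinations(cols, r)]
    # [{'masters', 'admitted'}, {'masters'}, {'admitted'}, set()]

    # ROLLUP: only the leading prefixes of the grouping columns.
    rollup_sets = [set(cols[:i]) for i in range(len(cols), -1, -1)]
    # [{'masters', 'admitted'}, {'masters'}, set()]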
+ # Metadata functions for DataFrame created on datalake/OTF table.
16334
+ @property
16335
+ @collect_queryband(queryband="DF_snpsht")
16336
+ @df_utils.check_otf_dataframe()
16337
+ def snapshots(self):
16338
+ """
16339
+ DESCRIPTION:
16340
+ Gets snapshot information for a DataLake table.
16341
+
16342
+ PARAMETERS:
16343
+ None
16344
+
16345
+ RETURNS:
16346
+ teradataml DataFrame.
16347
+
16348
+ RAISES:
16349
+ TeradataMLException.
16350
+
16351
+ EXAMPLES :
16352
+ # Example 1: Get the snapshot information for datalake table.
16353
+ >>> from teradataml.dataframe.dataframe import in_schema
16354
+ >>> in_schema_tbl = in_schema(schema_name="datalake_db",
16355
+ ... table_name="datalake_table",
16356
+ ... datalake_name="datalake")
16357
+ >>> datalake_df = DataFrame(in_schema_tbl)
16358
+ >>> datalake_df.snapshots
16359
+ snapshotId snapshotTimestamp timestampMSecs manifestList summary
16360
+ 0 6373759902296319074 2023-06-15 00:07:47 1686787667420 s3://vim-iceberg-v1/glue/metadata/snap-6373759... {"added-data-files":"1","added-records":"5","a...}
16361
+ 1 4768076782814510171 2023-06-15 00:09:01 1686787741964 s3://vim-iceberg-v1/glue/metadata/snap-4768076... {"added-data-files":"1","added-records":"2","a...}
16362
+ 2 7771482207931850214 2024-05-29 04:59:09 1716958749946 s3://vim-iceberg-v1/glue/metadata/snap-7771482... {"deleted-data-files":"2","deleted-records":"7...}
16363
+ 3 1545363077953282623 2024-05-29 05:13:39 1716959619455 s3://vim-iceberg-v1/glue/metadata/snap-1545363... {"changed-partition-count":"0","total-records"...}
16364
+ 4 2166707884289108360 2024-05-29 05:17:49 1716959869075 s3://vim-iceberg-v1/glue/metadata/snap-2166707... {"changed-partition-count":"0","total-records"...}
16365
+ 5 8934190131471882700 2024-05-29 05:21:32 1716960092422 s3://vim-iceberg-v1/glue/metadata/snap-8934190... {"changed-partition-count":"0","total-records"...}
16366
+ 6 3086605171258231948 2024-05-29 05:34:43 1716960883786 s3://vim-iceberg-v1/glue/metadata/snap-3086605... {"changed-partition-count":"0","total-records"...}
16367
+ 7 7592503716012384122 2024-05-29 06:04:48 1716962688047 s3://vim-iceberg-v1/glue/metadata/snap-7592503... {"changed-partition-count":"0","total-records"...}
16368
+ 8 2831061717890032890 2024-06-04 17:21:01 1717521661689 s3://vim-iceberg-v1/glue/metadata/snap-2831061... {"added-data-files":"2","added-records":"7","a...}
16369
+ 9 8810491341502972715 2024-10-22 23:47:22 1729640842067 s3://vim-iceberg-v1/glue/metadata/snap-8810491... {"added-data-files":"1","added-records":"1","a...}
16370
+ 10 3953136136558551163 2024-12-03 04:40:48 1733200848733 s3://vim-iceberg-v1/glue/metadata/snap-3953136... {"added-data-files":"1","added-records":"4","a...}
16371
+ 11 6034775168901969481 2024-12-03 04:40:49 1733200849966 s3://vim-iceberg-v1/glue/metadata/snap-6034775... {"deleted-data-files":"1","deleted-records":"5...}
16372
+ """
16373
+ return self._execute_metadata_query_and_generate_dataframe("TD_SNAPSHOTS")
16374
+
16375
+ @property
16376
+ @collect_queryband(queryband="DF_prttns")
16377
+ @df_utils.check_otf_dataframe()
16378
+ def partitions(self):
16379
+ """
16380
+ DESCRIPTION:
16381
+ Gets partition information for a DataLake table.
16382
+
16383
+ PARAMETERS:
16384
+ None
16385
+
16386
+ RETURNS:
16387
+ teradataml DataFrame.
16388
+
16389
+ RAISES:
16390
+ TeradataMLException.
16391
+
16392
+ EXAMPLES :
16393
+ # Example 1: Get the partition information for datalake table.
16394
+ >>> from teradataml.dataframe.dataframe import in_schema
16395
+ >>> in_schema_tbl = in_schema(schema_name="datalake_db",
16396
+ ... table_name="datalake_table",
16397
+ ... datalake_name="datalake")
16398
+ >>> datalake_df = DataFrame(in_schema_tbl)
16399
+ >>> datalake_df.partitions
16400
+ id name
16401
+ 0 1000 c2
16402
+ 1 1001 c3
16403
+
16404
+
16405
+ """
16406
+ return self._execute_metadata_query_and_generate_dataframe("TD_PARTITIONS")
16407
+
16408
+ @property
16409
+ @collect_queryband(queryband="DF_mnfsts")
16410
+ @df_utils.check_otf_dataframe()
16411
+ def manifests(self):
16412
+ """
16413
+ DESCRIPTION:
16414
+ Gets manifest information for a DataLake table.
16415
+
16416
+ PARAMETERS:
16417
+ None
16418
+
16419
+ RETURNS:
16420
+ teradataml DataFrame.
16421
+
16422
+ RAISES:
16423
+ TeradataMLException.
16424
+
16425
+ EXAMPLES :
16426
+ # Example 1: Get the manifest information for datalake table.
16427
+ >>> from teradataml.dataframe.dataframe import in_schema
16428
+ >>> in_schema_tbl = in_schema(schema_name="datalake_db",
16429
+ ... table_name="datalake_table",
16430
+ ... datalake_name="datalake")
16431
+ >>> datalake_df = DataFrame(in_schema_tbl)
16432
+ >>> datalake_df.manifests
16433
+ snapshotId snapshotTimestamp manifestList manifestFile manifestFileLength datafilecount totalrowcount
16434
+ 0 8068130797628952520 2025-05-02 11:45:26 s3://vim-iceberg-v1/otftestdb/nt_sales/... s3://vim-iceberg-v1/otftestdb/nt_sales/... 7158 6 6
16435
+ """
16436
+ return self._execute_metadata_query_and_generate_dataframe("TD_MANIFESTS")
16437
+
16438
+ @property
16439
+ @collect_queryband(queryband="DF_hstry")
16440
+ @df_utils.check_otf_dataframe()
16441
+ def history(self):
16442
+ """
16443
+ DESCRIPTION:
16444
+ Gets the snapshot history related to a DataLake table.
16445
+
16446
+ PARAMETERS:
16447
+ None
16448
+
16449
+ RETURNS:
16450
+ teradataml DataFrame.
16451
+
16452
+ RAISES:
16453
+ TeradataMLException.
16454
+
16455
+ EXAMPLES :
16456
+ # Example 1: Get the snapshot history for datalake table.
16457
+ >>> from teradataml.dataframe.dataframe import in_schema
16458
+ >>> in_schema_tbl = in_schema(schema_name="datalake_db",
16459
+ ... table_name="datalake_table",
16460
+ ... datalake_name="datalake")
16461
+ >>> datalake_df = DataFrame(in_schema_tbl)
16462
+ >>> datalake_df.history
16463
+ id timestamp
16464
+ 0 8068130797628952520 2025-05-02 11:45:26
16465
+ """
16466
+ return self._execute_metadata_query_and_generate_dataframe("TD_HISTORY")
16467
+
16468
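A minimal usage sketch tying the four metadata properties together (hedged: it assumes an OTF/DataLake DataFrame created as in the examples above, and the variable names are illustrative). Each property returns an ordinary teradataml DataFrame produced by the shared helper defined below, so the results can be filtered and joined like any other DataFrame.

    from teradataml.dataframe.dataframe import DataFrame, in_schema

    datalake_df = DataFrame(in_schema(schema_name="datalake_db",
                                      table_name="datalake_table",
                                      datalake_name="datalake"))
    snapshots_df = datalake_df.snapshots    # snapshot ids, timestamps and summaries
    partitions_df = datalake_df.partitions  # partition field ids and names
    manifests_df = datalake_df.manifests    # manifest files per snapshot
    history_df = datalake_df.history        # snapshot id / timestamp history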
+ def _execute_metadata_query_and_generate_dataframe(self, func_name):
16469
+ """Function executes OTF metadata query and return result in DataFrame format"""
16470
+ query = SQLBundle()._get_sql_query(SQLConstants.SQL_TD_OTF_METADATA).format(func_name,
16471
+ self._table_name)
16472
+ return DataFrame.from_query(query)
16473
+
16474
+ @collect_queryband(queryband="DF_gt_snpsht")
16475
+ @df_utils.check_otf_dataframe()
16476
+ def get_snapshot(self, as_of):
16477
+ """
16478
+ DESCRIPTION:
16479
+ Gets the data from a DataLake table for the given snapshot id or timestamp string.
16480
+ Notes:
16481
+ * The snapshot id can be obtained from the 'snapshots' property of the DataFrame.
16482
+ * The time travel value represented by 'as_of' should be in the format "YYYY-MM-DD HH:MM:SS.FFFFFFF"
16483
+ for TIMESTAMP string or "YYYY-MM-DD" for DATE string.
16484
+
16485
+ PARAMETERS:
16486
+ as_of:
16487
+ Required Argument.
16488
+ Specifies the snapshot id or timestamp information for which the snapshot is to be fetched.
16489
+ Types: str or int
16490
+
16491
+ RETURNS:
16492
+ teradataml DataFrame.
16493
+
16494
+ RAISES:
16495
+ TeradataMLException.
16496
+
16497
+ EXAMPLES:
16498
+ # DataFrame creation on OTF table.
16499
+ >>> from teradataml.dataframe.dataframe import in_schema
16500
+ >>> in_schema_tbl = in_schema(schema_name="datalake_db",
16501
+ ... table_name="datalake_table",
16502
+ ... datalake_name="datalake")
16503
+ >>> datalake_df = DataFrame(in_schema_tbl)
16504
+
16505
+ # List snapshots first.
16506
+ >>> datalake_df.snapshots
16507
+ snapshotId snapshotTimestamp timestampMSecs manifestList summary
16508
+ 2046682612111137809 2025-06-03 13:26:15 1748957175692 s3://vim-iceberg-v1/datalake_db/datalake_table/metadata/snap-204... {"added-data-files":"Red Inc","added-records"...}
16509
+ 282293708812257203 2025-06-03 05:53:19 1748929999245 s3://vim-iceberg-v1/datalake_db/datalake_table/metadata/snap-282... {"added-data-files":"Blue Inc","added-records"...}
16510
+
16511
+ # Example 1: Get the snapshot using snapshot id.
16512
+ >>> datalake_df.get_snapshot(2046682612111137809)
16513
+ Feb Jan Mar Apr datetime
16514
+ accounts
16515
+ Blue Inc 90.0 50.0 95.0 101.0 04/01/2017
16516
+ Alpha Co 210.0 200.0 215.0 250.0 04/01/2017
16517
+ Jones LLC 200.0 150.0 140.0 180.0 04/01/2017
16518
+ Yellow Inc 90.0 NaN NaN NaN 04/01/2017
16519
+ Orange Inc 210.0 NaN NaN 250.0 04/01/2017
16520
+ Red Inc 200.0 150.0 140.0 NaN 04/01/2017
16521
+
16522
+ # Example 2: Get the snapshot using snapshot id in string format.
16523
+ >>> datalake_df.get_snapshot("2046682612111137809")
16524
+ Feb Jan Mar Apr datetime
16525
+ accounts
16526
+ Blue Inc 90.0 50.0 95.0 101.0 04/01/2017
16527
+ Alpha Co 210.0 200.0 215.0 250.0 04/01/2017
16528
+ Jones LLC 200.0 150.0 140.0 180.0 04/01/2017
16529
+ Yellow Inc 90.0 NaN NaN NaN 04/01/2017
16530
+ Orange Inc 210.0 NaN NaN 250.0 04/01/2017
16531
+ Red Inc 200.0 150.0 140.0 NaN 04/01/2017
16532
+
16533
+ # Example 3: Get the snapshot using timestamp string.
16534
+ >>> datalake_df.get_snapshot("2025-06-03 13:26:16")
16535
+ Feb Jan Mar Apr datetime
16536
+ accounts
16537
+ Blue Inc 90.0 50.0 95.0 101.0 04/01/2017
16538
+ Alpha Co 210.0 200.0 215.0 250.0 04/01/2017
16539
+ Jones LLC 200.0 150.0 140.0 180.0 04/01/2017
16540
+ Yellow Inc 90.0 NaN NaN NaN 04/01/2017
16541
+ Orange Inc 210.0 NaN NaN 250.0 04/01/2017
16542
+ Red Inc 200.0 150.0 140.0 NaN 04/01/2017
16543
+
16544
+ # Example 4: Get the snapshot using date string.
16545
+ >>> datalake_df.get_snapshot("2025-06-04")
16546
+ Feb Jan Mar Apr datetime
16547
+ accounts
16548
+ Blue Inc 90.0 50.0 95.0 101.0 04/01/2017
16549
+ Alpha Co 210.0 200.0 215.0 250.0 04/01/2017
16550
+ Jones LLC 200.0 150.0 140.0 180.0 04/01/2017
16551
+ Yellow Inc 90.0 NaN NaN NaN 04/01/2017
16552
+ Orange Inc 210.0 NaN NaN 250.0 04/01/2017
16553
+ Red Inc 200.0 150.0 140.0 NaN 04/01/2017
16554
+
16555
+ """
16556
+ _Validators._validate_function_arguments([["as_of", as_of, False, (int, str)]])
16557
+
16558
+ # If "as_of" is an int or a digit-only string, treat it as a snapshot id and quote it.
16559
+ if isinstance(as_of, int) or (isinstance(as_of, str) and as_of.isdigit()):
16560
+ snapshot_on = "'{}'".format(as_of)
16561
+ else:
16562
+ try:
16563
+ snapshot_on = UtilFuncs._get_time_formatted_string(as_of)
16564
+ except ValueError as e:
16565
+ raise TeradataMlException(Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
16566
+ "get_snapshot", "Invalid value for 'as_of' argument: {}. "
16567
+ "Use valid format [\"YYYY-MM-DD HH:MM:SS.FFFFFFF\", \"YYYY-MM-DD HH:MM:SS\","
16568
+ "\"YYYY-MM-DD\"]".format(as_of)),
16569
+ MessageCodes.FUNC_EXECUTION_FAILED)
16570
+
16571
+ query = SQLBundle()._get_sql_query(SQLConstants.SQL_TD_OTF_SNAPSHOT).format(self._table_name, snapshot_on)
16572
+
16573
+ try:
16574
+ return DataFrame.from_query(query)
16575
+ except TeradataMlException as e:
16576
+ raise TeradataMlException(Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
16577
+ "get_snapshot()", "Invalid value for 'as_of' argument: {}. "
16578
+ "Use valid timestamp or correct snapshot id listed using 'snapshots' property.".format(as_of)),
16579
+ MessageCodes.FUNC_EXECUTION_FAILED)
16580
+
16581
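A standalone sketch (plain Python, not library code; classify_as_of is an illustrative name) of how get_snapshot() interprets the "as_of" value in the branch above: integers and digit-only strings are treated as snapshot ids, everything else is passed through as a DATE/TIMESTAMP string.

    def classify_as_of(as_of):
        # Snapshot ids are integers (or their string form); anything else is a time string.
        if isinstance(as_of, int) or (isinstance(as_of, str) and as_of.isdigit()):
            return "snapshot id", "'{}'".format(as_of)
        return "timestamp/date string", as_of

    print(classify_as_of(2046682612111137809))    # ('snapshot id', "'2046682612111137809'")
    print(classify_as_of("2025-06-03 13:26:16"))  # ('timestamp/date string', '2025-06-03 13:26:16')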
+ def as_of(self, **kwargs):
16582
+ """
16583
+ DESCRIPTION:
16584
+ Function to get the DataFrame at a specific time from a temporal table.
16585
+ Note:
16586
+ Function is supported only on temporal tables or temporal views.
16587
+
16588
+ PARAMETERS:
16589
+ kwargs:
16590
+ Specifies keyword arguments.
16591
+
16592
+ valid_time:
16593
+ Optional Argument.
16594
+ Specifies the valid time to retrieve data from DataFrame created on either ValidTime
16595
+ or BiTemporal table/view.
16596
+ Notes:
16597
+ * Either "valid_time" or "transaction_time" must be provided.
16598
+ * Argument accepts below values:
16599
+ * "current" - to get the current valid time data.
16600
+ * any string other than "current" is considered as date and data will be retrieved at that of time.
16601
+ * date object - to get the data valid on that date.
16602
+ * datetime object - to get the data valid at that point of time.
16603
+ * tuple - to get the data which is valid between the two valid times.
16604
+ * tuple should have only two elements. First element considered as starting time
16605
+ and second element considered as end time for a period of time.
16606
+ Records will be retrieved which are valid between the two valid times.
16607
+ * Both elements can be of date or datetime or string type. If you are using
16608
+ string, make sure the string represents a valid date.
16609
+ * Any element can be None.
16610
+ * If first element is None and valid time dimension column is PERIOD_DATE type,
16611
+ then it is considered as '0001-01-01'.
16612
+ * If first element is None and valid time dimension column is PERIOD_TIMESTAMP type,
16613
+ then it is considered as '0001-01-01 00:00:00.000000+00:00'.
16614
+ * If second element is None and valid time dimension column is PERIOD_DATE type,
16615
+ then it is considered as '9999-12-31'.
16616
+ * If second element is None and valid time dimension column is PERIOD_TIMESTAMP type,
16617
+ then it is considered as '9999-12-31 23:59:59.999999+00:00'.
16618
+ * None - to consider the DataFrame as regular DataFrame and retrieve all the records from
16619
+ valid time dimension.
16620
+ Types: date or datetime or str or tuple or NoneType
16621
+
16622
+ include_valid_time_column:
16623
+ Optional Argument.
16624
+ Specifies whether to include the valid time dimension column in the resultant DataFrame.
16625
+ When set to True, valid time dimension column is included in resultant DataFrame.
16626
+ Otherwise, valid time dimension column is not included in resultant DataFrame.
16627
+ Note:
16628
+ Ignored when "valid_time" is either tuple or None.
16629
+ Default Value: False
16630
+ Types: bool
16631
+
16632
+ transaction_time:
16633
+ Optional Argument.
16634
+ Specifies the transaction time to retrieve data from DataFrame created on either
16635
+ TransactionTime or BiTemporal table/view.
16636
+ Notes:
16637
+ * Either "valid_time" or "transaction_time" must be provided.
16638
+ * Argument accepts below values.
16639
+ * "current" - to get the records which are valid at current time.
16640
+ * any string other than "current" is considered as timestamp and records which are
16641
+ valid at that point of time are retrieved.
16642
+ * datetime object - to get the records which are valid at that point of time.
16643
+ * None - to consider the DataFrame as regular DataFrame and retrieve all the records
16644
+ from transaction time dimension.
16645
+ Types: datetime or str or NoneType
16646
+
16647
+ include_transaction_time_column:
16648
+ Optional Argument.
16649
+ Specifies whether to include the transaction time dimension column in the resultant DataFrame.
16650
+ When set to True, transaction time dimension column is included in resultant DataFrame.
16651
+ Otherwise, transaction time dimension column is not included in resultant DataFrame.
16652
+ Default Value: False
16653
+ Types: bool
16654
+
16655
+ additional_period:
16656
+ Optional Argument.
16657
+ Specifies the additional period to be kept in resultant DataFrame.
16658
+ Note:
16659
+ This is applicable only when "valid_time" is None.
16660
+ Types: tuple of date or str
16661
+
16662
+ RETURNS:
16663
+ teradataml DataFrame
16664
+
16665
+ RAISES:
16666
+ TeradataMlException.
16667
+
16668
+ EXAMPLES:
16669
+ # Load the data to run the example.
16670
+ >>> load_example_data("teradataml", "Employee_roles") # load valid time data.
16671
+ >>> load_example_data("teradataml", "Employee_Address") # load transaction time data.
16672
+ >>> load_example_data("teradataml", "Employee") # load bitemporal data.
16673
+
16674
+ >>> df1 = DataFrame("Employee_roles")
16675
+ EmployeeName Department Salary role_validity_period
16676
+ EmployeeID
16677
+ 1 John Doe IT 100.0 ('20/01/01', '24/12/31')
16678
+ 2 Jane Smith DA 200.0 ('20/01/01', '99/12/31')
16679
+ 3 Bob Marketing 330.0 ('25/01/01', '99/12/31')
16680
+ 3 Bob Sales 300.0 ('24/01/01', '24/12/31')
16681
+
16682
+ # Example 1: Get the employee roles from DataFrame df1 which are valid at current time.
16683
+ >>> df1.as_of(valid_time="current")
16684
+ EmployeeName Department Salary
16685
+ EmployeeID
16686
+ 2 Jane Smith DA 200.0
16687
+ 3 Bob Marketing 330.0
16688
+
16689
+ # Example 2: Get the employee roles from DataFrame df1 which are valid at current time.
16690
+ # Also include valid time dimension column.
16691
+ >>> df1.as_of(valid_time="current", include_valid_time_column=True)
16692
+ EmployeeName Department Salary role_validity_period
16693
+ EmployeeID
16694
+ 2 Jane Smith DA 200.0 ('20/01/01', '99/12/31')
16695
+ 3 Bob Marketing 330.0 ('25/01/01', '99/12/31')
16696
+
16697
+ # Example 3: Get the employee roles from DataFrame df1 which are valid at 31st Dec 2026.
16698
+ # Include valid time dimension column.
16699
+ >>> df1.as_of(valid_time="2026-12-31", include_valid_time_column=True)
16700
+ EmployeeName Department Salary role_validity_period
16701
+ EmployeeID
16702
+ 2 Jane Smith DA 200.0 ('20/01/01', '99/12/31')
16703
+ 3 Bob Marketing 330.0 ('25/01/01', '99/12/31')
16704
+
16705
+ # Example 4: Get the employee roles from DataFrame df1 which are valid at 31st Dec 2026.
16706
+ # Also include valid time dimension column. Use date object instead of string
16707
+ # to specify the date.
16708
+ >>> from datetime import date
16709
+ >>> d = date(2026, 12, 31)
16710
+ >>> df1.as_of(valid_time=d, include_valid_time_column=True)
16711
+ EmployeeName Department Salary role_validity_period
16712
+ EmployeeID
16713
+ 2 Jane Smith DA 200.0 ('20/01/01', '99/12/31')
16714
+ 3 Bob Marketing 330.0 ('25/01/01', '99/12/31')
16715
+
16716
+ # Example 5: Get the employee roles which are valid between 20th Jan 2018 and 5th March 2024.
16717
+ # Include valid time dimension column.
16718
+ >>> df1.as_of(valid_time=("2018-01-20", "2024-03-05"), include_valid_time_column=True)
16719
+ EmployeeName Department Salary VALIDTIME
16720
+ EmployeeID
16721
+ 2 Jane Smith DA 200.0 ('20/01/01', '24/03/05')
16722
+ 1 John Doe IT 100.0 ('20/01/01', '24/03/05')
16723
+ 3 Bob Sales 300.0 ('24/01/01', '24/03/05')
16724
+
16725
+ # Example 6: Get the employee roles which are valid between 20th Jan 2018 and 5th March 2024.
16726
+ # Then again get the records which are valid at 1st Jan 2023. Do not include
16727
+ # valid time dimension column since selecting valid time dimension column is ignored
16728
+ # when "valid_time" is a tuple.
16729
+ >>> df1.as_of(valid_time=(date(2018, 1, 20), "2024-03-05")).as_of(valid_time=date(2023, 1, 1))
16730
+ EmployeeName Department Salary
16731
+ EmployeeID
16732
+ 2 Jane Smith DA 200.0
16733
+ 1 John Doe IT 100.0
16734
+
16735
+ # Example 7: Get the employee roles which are valid between 1st Jan 0001 and 5th Mar 2024.
16736
+ >>> df1.as_of(valid_time=(None, date(2024, 3, 5)))
16737
+ EmployeeName Department Salary VALIDTIME
16738
+ EmployeeID
16739
+ 2 Jane Smith DA 200.0 ('20/01/01', '24/03/05')
16740
+ 1 John Doe IT 100.0 ('20/01/01', '24/03/05')
16741
+ 3 Bob Sales 300.0 ('24/01/01', '24/03/05')
16742
+
16743
+ # Example 8: Get the employee roles which are valid between 1st Jun 2024 and 31st Dec 9999.
16744
+ >>> df1.as_of(valid_time=("2024-06-01", None))
16745
+ EmployeeName Department Salary VALIDTIME
16746
+ EmployeeID
16747
+ 1 John Doe IT 100.0 ('24/06/01', '24/12/31')
16748
+ 2 Jane Smith DA 200.0 ('24/06/01', '99/12/31')
16749
+ 3 Bob Marketing 330.0 ('25/01/01', '99/12/31')
16750
+ 3 Bob Sales 300.0 ('24/06/01', '24/12/31')
16751
+
16752
+ # Example 9: Consider df1 as a regular DataFrame and retrieve all the records irrespective
16753
+ # of whether the records are valid or not.
16754
+ >>> df1.as_of(valid_time=None)
16755
+ EmployeeName Department Salary
16756
+ EmployeeID
16757
+ 1 John Doe IT 100.0
16758
+ 2 Jane Smith DA 200.0
16759
+ 3 Bob Marketing 330.0
16760
+ 3 Bob Sales 300.0
16761
+
16762
+ # Example 10: Consider df1 as a regular DataFrame and retrieve all the records irrespective
16763
+ # of whether the records are valid or not. Also include additional period and valid time
16764
+ # dimension column.
16765
+ >>> df1.as_of(valid_time=None, additional_period=("2024-01-01", "2024-03-05"), include_valid_time_column=True)
16766
+ EmployeeName Department Salary role_validity_period VALIDTIME
16767
+ EmployeeID
16768
+ 1 John Doe IT 100.0 ('20/01/01', '24/12/31') ('24/01/01', '24/03/05')
16769
+ 2 Jane Smith DA 200.0 ('20/01/01', '99/12/31') ('24/01/01', '24/03/05')
16770
+ 3 Bob Marketing 330.0 ('25/01/01', '99/12/31') ('24/01/01', '24/03/05')
16771
+ 3 Bob Sales 300.0 ('24/01/01', '24/12/31') ('24/01/01', '24/03/05')
16772
+
16773
+ >>> df2 = DataFrame("Employee_Address")
16774
+ EmployeeName address validity_period
16775
+ EmployeeID
16776
+ 2 Jane Smith 456 Elm St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
16777
+ 1 John Doe 123 Main St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
16778
+ 3 Bob Johnson 789 Oak St ('2025-03-04 15:41:44.610001+00:00', '9999-12-31 23:59:59.999999+00:00')
16779
+
16780
+ # Example 11: Consider df2 as regular DataFrame and retrieve all the records including historic
16781
+ # records. Also include transaction time dimension column.
16782
+ >>> df2.as_of(transaction_time=None, include_transaction_time_column=True)
16783
+ EmployeeName address validity_period
16784
+ EmployeeID
16785
+ 1 John Doe 123 Main St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
16786
+ 2 Jane Smith 456 Elm St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
16787
+ 3 Bob Johnson 789 Oak Street ('2025-03-04 15:41:44.610000+00:00', '2025-03-04 15:41:44.610001+00:00')
16788
+ 3 Bob Johnson 789 Oak St ('2025-03-04 15:41:44.610001+00:00', '9999-12-31 23:59:59.999999+00:00')
16789
+
16790
+ # Example 12: Get the employee address which are valid at current time from DataFrame df2.
16791
+ # Also include transaction time dimension column.
16792
+ >>> df2.as_of(transaction_time="current", include_transaction_time_column=True)
16793
+ EmployeeName address validity_period
16794
+ EmployeeID
16795
+ 2 Jane Smith 456 Elm St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
16796
+ 1 John Doe 123 Main St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
16797
+ 3 Bob Johnson 789 Oak St ('2025-03-04 15:41:44.610001+00:00', '9999-12-31 23:59:59.999999+00:00')
16798
+
16799
+ # Example 13: Get the employee address which are valid at current time from DataFrame df2.
16800
+ # Do not include transaction time dimension column.
16801
+ >>> df2.as_of(transaction_time="current", include_transaction_time_column=False)
16802
+ EmployeeName address
16803
+ EmployeeID
16804
+ 2 Jane Smith 456 Elm St
16805
+ 1 John Doe 123 Main St
16806
+ 3 Bob Johnson 789 Oak St
16807
+
16808
+ # Example 14: Get the employee address which are valid at 2025-03-04 15:41:44.610000+00:00 from DataFrame df2.
16809
+ # Include transaction time dimension column.
16810
+ >>> df2.as_of(transaction_time="2025-03-04 15:41:44.610000+00:00", include_transaction_time_column=True)
16811
+ EmployeeName address validity_period
16812
+ EmployeeID
16813
+ 2 Jane Smith 456 Elm St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
16814
+ 1 John Doe 123 Main St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
16815
+ 3 Bob Johnson 789 Oak Street ('2025-03-04 15:41:44.610000+00:00', '2025-03-04 15:41:44.610001+00:00')
16816
+
16817
+ # Example 15: Get the employee address which are valid at 2025-03-04 15:41:44.610001+00:00 from DataFrame df2.
16818
+ # Include transaction time dimension column.
16819
+ >>> from datetime import datetime, timezone, timedelta
16820
+ >>> dt = datetime(2025, 3, 4, 15, 41, 44, 610001)
16821
+ >>> dt_with_tz = dt.replace(tzinfo=timezone(timedelta(hours=0)))
16822
+ >>> df2.as_of(transaction_time=dt_with_tz, include_transaction_time_column=True)
16823
+ EmployeeName address validity_period
16824
+ EmployeeID
16825
+ 2 Jane Smith 456 Elm St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
16826
+ 1 John Doe 123 Main St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
16827
+ 3 Bob Johnson 789 Oak St ('2025-03-04 15:41:44.610001+00:00', '9999-12-31 23:59:59.999999+00:00')
16828
+
16829
+ >>> df3 = DataFrame("Employee")
16830
+ EmployeeName address Department Salary role_validity validity_period
16831
+ EmployeeID
16832
+ 1 John Doe 123 Main St IT 100.0 ('20/01/01', '24/12/31') ('2025-03-04 18:08:58.720000+00:00', '9999-12-31 23:59:59.999999+00:00')
16833
+ 2 Jane Smith 456 Elm St DA 200.0 ('20/01/01', '99/12/31') ('2025-03-04 18:08:58.720000+00:00', '9999-12-31 23:59:59.999999+00:00')
16834
+ 3 Bob 789 OAK St Marketing 330.0 ('25/01/01', '99/12/31') ('2025-05-06 11:39:25.580000+00:00', '9999-12-31 23:59:59.999999+00:00')
16835
+ 3 Bob 789 Oak St Sales 300.0 ('24/01/01', '24/12/31') ('2025-03-04 18:09:08.830000+00:00', '9999-12-31 23:59:59.999999+00:00')
16836
+
16837
+ # Example 16: Get all the records from DataFrame df3 by considering the DataFrame as
16838
+ # regular DataFrame. Include both valid time and transaction time dimension columns.
16839
+ >>> df3.as_of(valid_time=None,
16840
+ ... transaction_time=None,
16841
+ ... include_valid_time_column=True,
16842
+ ... include_transaction_time_column=True
16843
+ ... )
16844
+ EmployeeName address Department Salary role_validity validity_period
16845
+ EmployeeID
16846
+ 3 Bob 789 Oak Street Sales 300.0 ('24/01/01', '24/12/31') ('2025-03-04 18:08:58.720000+00:00', '2025-03-04 18:09:08.830000+00:00')
16847
+ 3 Bob 789 Oak St Marketing 330.0 ('25/01/01', '99/12/31') ('2025-03-04 18:09:08.830000+00:00', '2025-05-06 11:39:25.580000+00:00')
16848
+ 1 John Doe 123 Main St IT 100.0 ('20/01/01', '24/12/31') ('2025-03-04 18:08:58.720000+00:00', '9999-12-31 23:59:59.999999+00:00')
16849
+ 2 Jane Smith 456 Elm St DA 200.0 ('20/01/01', '99/12/31') ('2025-03-04 18:08:58.720000+00:00', '9999-12-31 23:59:59.999999+00:00')
16850
+ 3 Bob 789 Oak Street Marketing 330.0 ('25/01/01', '99/12/31') ('2025-03-04 18:08:58.720000+00:00', '2025-03-04 18:09:08.830000+00:00')
16851
+ 3 Bob 789 OAK St Marketing 330.0 ('25/01/01', '99/12/31') ('2025-05-06 11:39:25.580000+00:00', '9999-12-31 23:59:59.999999+00:00')
16852
+ 3 Bob 789 Oak St Sales 300.0 ('24/01/01', '24/12/31') ('2025-03-04 18:09:08.830000+00:00', '9999-12-31 23:59:59.999999+00:00')
16853
+
16854
+ # Example 17: Get the employee address from DataFrame df3 which are valid at 1st Jun 2024 from
16855
+ # valid time dimension and valid at '2025-03-04 18:09:08.720001+00:00' from transaction
16856
+ # time dimension. Include both valid time and transaction time dimension columns.
16857
+ >>> df3.as_of(valid_time="2024-06-01",
16858
+ ... transaction_time="2025-03-04 18:09:08.720001+00:00",
16859
+ ... include_valid_time_column=True,
16860
+ ... include_transaction_time_column=True
16861
+ ... )
16862
+ EmployeeName address Department Salary role_validity validity_period
16863
+ EmployeeID
16864
+ 2 Jane Smith 456 Elm St DA 200.0 ('20/01/01', '99/12/31') ('2025-03-04 18:08:58.720000+00:00', '9999-12-31 23:59:59.999999+00:00')
16865
+ 1 John Doe 123 Main St IT 100.0 ('20/01/01', '24/12/31') ('2025-03-04 18:08:58.720000+00:00', '9999-12-31 23:59:59.999999+00:00')
16866
+ 3 Bob 789 Oak Street Sales 300.0 ('24/01/01', '24/12/31') ('2025-03-04 18:08:58.720000+00:00', '2025-03-04 18:09:08.830000+00:00')
16867
+
16868
+ # Example 18: Get the employee address from DataFrame df3 which are valid at 25th Jan 2024
16869
+ # from valid time dimension and valid at current time from transaction time dimension.
16870
+ # Include only transaction time dimension column.
16871
+ >>> df3.as_of(valid_time=date(2024, 1, 25),
16872
+ ... transaction_time="current",
16873
+ ... include_transaction_time_column=True)
16874
+ EmployeeName address Department Salary validity_period
16875
+ EmployeeID
16876
+ 2 Jane Smith 456 Elm St DA 200.0 ('2025-03-04 18:08:58.720000+00:00', '9999-12-31 23:59:59.999999+00:00')
16877
+ 1 John Doe 123 Main St IT 100.0 ('2025-03-04 18:08:58.720000+00:00', '9999-12-31 23:59:59.999999+00:00')
16878
+ 3 Bob 789 Oak St Sales 300.0 ('2025-03-04 18:09:08.830000+00:00', '9999-12-31 23:59:59.999999+00:00')
16879
+
16880
+ # Example 19: Get the employee address from DataFrame df3 which are valid between 1st Jan 2025
16881
+ # and 30th June 2025 from valid time dimension and valid at
16882
+ # '2025-03-04 18:08:59.720000+00:00' from transaction time dimension.
16883
+ # Include both valid time and transaction time dimension columns.
16884
+ >>> from datetime import datetime, timezone
16885
+ >>>df3.as_of(valid_time=("2025-01-01", date(2025, 6, 30)),
16886
+ ... transaction_time=datetime(2025, 3, 4, 18, 8, 59, 720000).astimezone(timezone.utc),
16887
+ ... include_transaction_time_column=True)
16888
+ EmployeeName address Department Salary validity_period VALIDTIME
16889
+ EmployeeID
16890
+ 2 Jane Smith 456 Elm St DA 200.0 ('2025-03-04 18:08:58.720000+00:00', '9999-12-31 23:59:59.999999+00:00') ('25/01/01', '25/06/30')
16891
+ 3 Bob 789 Oak St Marketing 330.0 ('2025-03-04 18:09:08.830000+00:00', '2025-05-06 11:39:25.580000+00:00') ('25/01/01', '25/06/30')
16892
+
16893
+ # Example 20: Get the employee address from DataFrame df3 by considering the DataFrame as regular
16894
+ # DataFrame from valid time dimension and valid at current time from transaction time dimension.
16895
+ # Add additional period and include both valid time and transaction time dimension columns.
16896
+ >>> df3.as_of(valid_time=None,
16897
+ ... transaction_time="current",
16898
+ ... additional_period=("2024-01-01", "2024-03-05"),
16899
+ ... include_valid_time_column=True,
16900
+ ... include_transaction_time_column=True
16901
+ ... )
16902
+ EmployeeName address Department Salary role_validity validity_period VALIDTIME
16903
+ EmployeeID
16904
+ 1 John Doe 123 Main St IT 100.0 ('20/01/01', '24/12/31') ('2025-03-04 18:08:58.720000+00:00', '9999-12-31 23:59:59.999999+00:00') ('24/01/01', '24/03/05')
16905
+ 2 Jane Smith 456 Elm St DA 200.0 ('20/01/01', '99/12/31') ('2025-03-04 18:08:58.720000+00:00', '9999-12-31 23:59:59.999999+00:00') ('24/01/01', '24/03/05')
16906
+ 3 Bob 789 OAK St Marketing 330.0 ('25/01/01', '99/12/31') ('2025-05-06 11:39:25.580000+00:00', '9999-12-31 23:59:59.999999+00:00') ('24/01/01', '24/03/05')
16907
+ 3 Bob 789 Oak St Sales 300.0 ('24/01/01', '24/12/31') ('2025-03-04 18:09:08.830000+00:00', '9999-12-31 23:59:59.999999+00:00') ('24/01/01', '24/03/05')
16908
+ """
16909
+
16910
+ if "valid_time" not in kwargs and "transaction_time" not in kwargs:
16911
+ _Validators._validate_mutually_exclusive_arguments(
16912
+ None, "valid_time", None, "transaction_time")
16913
+
16914
+ # Validate argument types.
16915
+ _validation = []
16916
+ _validation.append(["valid_time", kwargs.get("valid_time"), True, (date, datetime, str, tuple, type(None))])
16917
+ _validation.append(["transaction_time", kwargs.get("transaction_time"), True, (datetime, str, type(None))])
16918
+ _validation.append(["additional_period", kwargs.get("additional_period"), True, (tuple, type(None))])
16919
+ _validation.append(["include_valid_time_column", kwargs.get("include_valid_time_column"), True, bool])
16920
+ _validation.append(["include_transaction_time_column", kwargs.get("include_transaction_time_column"), True, bool])
16921
+
16922
+ # Validate argument types
16923
+ _Validators._validate_function_arguments(_validation)
16924
+
16925
+ # Validate temporal table type.
16926
+ _Validators._validate_temporal_table_type(self.df_type)
16927
+
16928
+ # Extract valid_time and transaction_time from kwargs.
16929
+ valid_time = kwargs.get("valid_time")
16930
+ transaction_time = kwargs.get("transaction_time")
16931
+ additional_period = kwargs.get("additional_period")
16932
+ include_valid_time_column = kwargs.get("include_valid_time_column")
16933
+ include_transaction_time_column = kwargs.get("include_transaction_time_column")
16934
+
16935
+ # Validate if user specifies valid_time for a transaction time table.
16936
+ if "valid_time" in kwargs:
16937
+ _Validators._validate_as_of_arguments(df_type=self.df_type)
16938
+
16939
+ # Validate if user specifies transaction_time for a valid time table.
16940
+ if "transaction_time" in kwargs:
16941
+ _Validators._validate_as_of_arguments(df_type=self.df_type, argument_name='transaction_time')
16942
+
16943
+ add_vt_period = False
16944
+
16945
+ # Generate the time qualifier clause.
16946
+ if "valid_time" in kwargs and "transaction_time" not in kwargs:
16947
+ clause = self.__get_valid_time_clause(valid_time, additional_period)
16948
+ elif "transaction_time" in kwargs and "valid_time" not in kwargs:
16949
+ clause = self.__get_transaction_time_clause(transaction_time)
16950
+ else:
16951
+ # Generate both clauses.
16952
+ clause = "{} AND {}".format(self.__get_valid_time_clause(valid_time, additional_period),
16953
+ self.__get_transaction_time_clause(transaction_time)
16954
+ )
16955
+
16956
+ # Exclude the time dimension columns if user is not willing to see it in output DF.
16957
+ columns_to_exclude = []
16958
+ if not include_valid_time_column and self._valid_time_column:
16959
+ columns_to_exclude.append(self._valid_time_column.name)
16960
+
16961
+ if not include_transaction_time_column and self._transaction_time_column:
16962
+ columns_to_exclude.append(self._transaction_time_column.name)
16963
+
16964
+ columns = [col for col in self.columns if col not in columns_to_exclude]
16965
+ col_names_types = df_utils._get_required_columns_types_from_metaexpr(self._metaexpr, columns)
16966
+
16967
+ # Notes:
16968
+ # * If valid_time is tuple, i.e., for valid time qualifier SEQUENCED VALIDTIME,
16969
+ # add additional column VALIDTIME. This column should not be present in SELECT statement.
16970
+ # Also, ValidTime dimension column should not be present in SELECT statement. VALIDTIME column
16971
+ # acts as validTime dimension column here.
16972
+ # * Time qualifier NONSEQUENCED VALIDTIME PERIOD clause also produces additional column VALIDTIME.
16973
+ # Hence, add additional column VALIDTIME also returned in the output DataFrame. However, valid time
16974
+ # column can exist in SELECT statement.
16975
+ if isinstance(valid_time, tuple):
16976
+ add_vt_period = True
16977
+ columns = [col for col in columns if col != self._valid_time_column.name]
16978
+ col_names_types = df_utils._get_required_columns_types_from_metaexpr(self._metaexpr, columns)
16979
+ col_names_types["VALIDTIME"] = self._valid_time_column.type
16980
+ elif (isinstance(valid_time, type(None)) and additional_period is not None):
16981
+ col_names_types = df_utils._get_required_columns_types_from_metaexpr(self._metaexpr, columns)
16982
+ col_names_types["VALIDTIME"] = self._valid_time_column.type
16983
+
16984
+ # SELECT Node.
16985
+ column_expression = ", ".join(columns)
16986
+ sel_nodeid = self._aed_utils._aed_select(self._nodeid, column_expression, timestamp_expr=clause)
16987
+
16988
+ # Constructing new Metadata (_metaexpr) without DB; using dummy select_nodeid and underlying table name.
16989
+ new_metaexpr = UtilFuncs._get_metaexpr_using_columns(sel_nodeid, col_names_types.items())
16990
+ df = self._create_dataframe_from_node(sel_nodeid, new_metaexpr, self._index_label)
16991
+
16992
+ # If time qualifier is SEQUENCED PERIOD, then add VALIDTIME column to DataFrame
16993
+ # since it produces temporal dataset.
16994
+ if add_vt_period:
16995
+ df._valid_time_column = df['VALIDTIME']
16996
+
16997
+ return df
16998
+
16999
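A minimal sketch (plain Python, not the library implementation; result_columns is an illustrative name) of the column bookkeeping described in the notes inside as_of(): the time-dimension columns are dropped unless the matching include_* flag is set, and a tuple "valid_time" (SEQUENCED VALIDTIME) replaces the valid-time column with a derived VALIDTIME column in the resulting DataFrame.

    def result_columns(all_columns, vt_col, tt_col, valid_time,
                       include_valid_time_column=False,
                       include_transaction_time_column=False):
        cols = list(all_columns)
        if not include_valid_time_column and vt_col in cols:
            cols.remove(vt_col)
        if not include_transaction_time_column and tt_col in cols:
            cols.remove(tt_col)
        if isinstance(valid_time, tuple):
            # SEQUENCED VALIDTIME: a derived VALIDTIME column stands in for the valid-time column.
            cols = [c for c in cols if c != vt_col] + ["VALIDTIME"]
        return cols

    # e.g. a bi-temporal table whose dimension columns are "role_validity" and "validity_period":
    print(result_columns(["EmployeeID", "EmployeeName", "role_validity", "validity_period"],
                         "role_validity", "validity_period",
                         valid_time=("2018-01-20", "2024-03-05")))
    # ['EmployeeID', 'EmployeeName', 'VALIDTIME']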
+ def __get_valid_time_clause(self, valid_time, additional_period=None):
17000
+ """
17001
+ DESCRIPTION:
17002
+ Function to get valid time clause for temporal table.
17003
+
17004
+ PARAMETERS:
17005
+ valid_time:
17006
+ Required Argument.
17007
+ Specifies the valid time dimension to represent temporal data when creating the DataFrame.
17008
+ Types: date or datetime or str or tuple or NoneType
17009
+
17010
+ additional_period:
17011
+ Optional Argument.
17012
+ Specifies the additional period to be kept in DataFrame.
17013
+ Note:
17014
+ This is applicable only when "valid_time" is None.
17015
+ Types: tuple of date or str
17016
+
17017
+ RETURNS:
17018
+ str
17019
+
17020
+ RAISES:
17021
+ None.
17022
+ """
17023
+ is_vt_dt_type = isinstance(self._valid_time_column.type, tdtypes.PERIOD_DATE)
17024
+ if valid_time == "current":
17025
+ return "CURRENT VALIDTIME"
17026
+
17027
+ if isinstance(valid_time, (str, date, datetime)):
17028
+ # If valid_time is a string, then check what is the type of temporal column.
17029
+ # ValidTime dimension allows both DATE and TIMESTAMP type for ValidTime dimension
17030
+ # columns.
17031
+ if is_vt_dt_type:
17032
+ return "VALIDTIME AS OF DATE '{}'".format(valid_time)
17033
+ return "VALIDTIME AS OF TIMESTAMP '{}'".format(valid_time)
17034
+
17035
+ # If valid_time is a tuple, then it is a period.
17036
+ # User can specify start and/or end time. Derive missing value.
17037
+ if isinstance(valid_time, tuple):
17038
+ start = valid_time[0]
17039
+ end = valid_time[1]
17040
+ start = ("0001-01-01" if is_vt_dt_type else '0001-01-01 00:00:00.000000+00:00') if start is None else str(
17041
+ start)
17042
+ end = ("9999-12-31" if is_vt_dt_type else '9999-12-31 23:59:59.999999+00:00') if end is None else str(end)
17043
+ return "SEQUENCED VALIDTIME PERIOD '({}, {})'".format(start, end)
17044
+
17045
+ if isinstance(valid_time, type(None)) and additional_period is not None:
17046
+ return "NONSEQUENCED VALIDTIME PERIOD '({}, {})'".format(additional_period[0], additional_period[1])
17047
+
17048
+ return "NONSEQUENCED VALIDTIME"
17049
+
17050
+ def __get_transaction_time_clause(self, transaction_time):
17051
+ """
17052
+ DESCRIPTION:
17053
+ Function to get transaction time clause for temporal table.
17054
+
17055
+ PARAMETERS:
17056
+ transaction_time:
17057
+ Required Argument.
17058
+ Specifies the transaction time dimension to represent temporal data when creating the DataFrame.
17059
+ Types: datetime or str or NoneType
17060
+
17061
+ RETURNS:
17062
+ str
17063
+
17064
+ RAISES:
17065
+ None.
17066
+ """
17067
+ if transaction_time == "current":
17068
+ return "CURRENT TRANSACTIONTIME"
17069
+
17070
+ if isinstance(transaction_time, type(None)):
17071
+ return "NONSEQUENCED TRANSACTIONTIME"
17072
+
17073
+ return "TRANSACTIONTIME as of timestamp '{}'".format(transaction_time)
17074
+
17075
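A simplified standalone sketch (plain Python, not the library implementation; the function names are illustrative and PERIOD(DATE) vs PERIOD(TIMESTAMP) handling is reduced to a flag) mirroring the two private helpers above, to show which temporal qualifier each kind of input produces.

    from datetime import date

    def valid_time_clause(valid_time, is_date_period=True, additional_period=None):
        if valid_time == "current":
            return "CURRENT VALIDTIME"
        if isinstance(valid_time, (str, date)):
            kind = "DATE" if is_date_period else "TIMESTAMP"
            return "VALIDTIME AS OF {} '{}'".format(kind, valid_time)
        if isinstance(valid_time, tuple):
            start = valid_time[0] or ("0001-01-01" if is_date_period
                                      else "0001-01-01 00:00:00.000000+00:00")
            end = valid_time[1] or ("9999-12-31" if is_date_period
                                    else "9999-12-31 23:59:59.999999+00:00")
            return "SEQUENCED VALIDTIME PERIOD '({}, {})'".format(start, end)
        if valid_time is None and additional_period is not None:
            return "NONSEQUENCED VALIDTIME PERIOD '({}, {})'".format(*additional_period)
        return "NONSEQUENCED VALIDTIME"

    def transaction_time_clause(transaction_time):
        if transaction_time == "current":
            return "CURRENT TRANSACTIONTIME"
        if transaction_time is None:
            return "NONSEQUENCED TRANSACTIONTIME"
        return "TRANSACTIONTIME AS OF TIMESTAMP '{}'".format(transaction_time)

    print(valid_time_clause("current"))             # CURRENT VALIDTIME
    print(valid_time_clause(("2018-01-20", None)))  # SEQUENCED VALIDTIME PERIOD '(2018-01-20, 9999-12-31)'
    print(transaction_time_clause("2025-03-04 15:41:44.610000+00:00"))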
+ def _generate_temporal_dataframe(self, timestamp_expr, time_column):
17076
+ """
17077
+ DESCRIPTION:
17078
+ Helper method to generate a temporal DataFrame based on the given timestamp expression.
17079
+
17080
+ PARAMETERS:
17081
+ timestamp_expr:
17082
+ Required Argument.
17083
+ Specifies the timestamp expression to filter the temporal data.
17084
+ Types: str
17085
+
17086
+ time_column:
17087
+ Required Argument.
17088
+ Specifies the temporal column (valid-time or transaction-time) to process.
17089
+ Types: ColumnExpression
17090
+
17091
+ RAISES:
17092
+ None.
17093
+
17094
+ RETURNS:
17095
+ teradataml DataFrame
17096
+ """
17097
+ col_expr = "{} as {}".format(time_column.cast(time_column.type).compile(), time_column.name)
17098
+ cols = [col.name if col.name != time_column.name else col_expr for col in self._metaexpr.c]
17099
+ column_expression = ", ".join(cols)
17100
+ sel_node_id = self._aed_utils._aed_select(self._nodeid, column_expression, timestamp_expr=timestamp_expr)
17101
+ return self._create_dataframe_from_node(sel_node_id, self._metaexpr, self._index_label)
17102
+
17103
+ def historic_rows(self):
17104
+ """
17105
+ DESCRIPTION:
17106
+ Retrieves historical rows from a DataFrame created on a valid-time
17107
+ or bi-temporal table/view. Historical rows are defined as those where the
17108
+ end of the valid-time period precedes the current time.
17109
+
17110
+ PARAMETERS:
17111
+ None.
17112
+
17113
+ RETURNS:
17114
+ teradataml DataFrame.
17115
+
17116
+ RAISES:
17117
+ TeradataMLException.
17118
+
17119
+ EXAMPLES:
17120
+ # Load the data to run the example.
17121
+ >>> load_example_data("teradataml", "Employee_roles")
17122
+
17123
+ # Create a DataFrame on 'Employee_roles' table.
17124
+ >>> df = DataFrame("Employee_roles")
17125
+
17126
+ # Retrieve historic rows from the DataFrame.
17127
+ >>> df.historic_rows()
17128
+ EmployeeID EmployeeName Department Salary role_validity_period
17129
+ 1 John Doe IT 100.0 ('20/01/01', '24/12/31')
17130
+ 3 Bob Sales 300.0 ('24/01/01', '24/12/31')
17131
+ """
17132
+
17133
+ from teradataml.dataframe.functions import current_date, current_timestamp
17134
+ # Validate temporal table type.
17135
+ _Validators._validate_temporal_table_type(self.df_type)
17136
+ valid_time_col = self._valid_time_column
17137
+ df = self._generate_temporal_dataframe("NONSEQUENCED VALIDTIME", valid_time_col)
17138
+ # Check the type of the ValidTime dimension column
17139
+ if isinstance(valid_time_col.type, tdtypes.PERIOD_DATE):
17140
+ # Filter records where the end of the ValidTime period is less than the current date
17141
+ return df[valid_time_col.end() < current_date()]
17142
+ return df[valid_time_col.end() < current_timestamp()]
17143
+
17144
+ def future_rows(self):
17145
+ """
17146
+ DESCRIPTION:
17147
+ Retrieves future rows from a DataFrame created on a valid-
17148
+ time or bi-temporal table/view. Future rows are defined as those where the
17149
+ start of the valid-time period is greater than the current time.
17150
+
17151
+ PARAMETERS:
17152
+ None.
17153
+
17154
+ RETURNS:
17155
+ teradataml DataFrame.
17156
+
17157
+ RAISES:
17158
+ TeradataMLException.
17159
+
17160
+ EXAMPLES:
17161
+ # Load the data to run the example.
17162
+ >>> load_example_data("teradataml", "Employee_roles")
17163
+
17164
+ # Create a DataFrame on 'Employee_roles' table.
17165
+ >>> df = DataFrame("Employee_roles")
17166
+
17167
+ # Retrieve future rows from the DataFrame.
17168
+ >>> df.future_rows()
17169
+ EmployeeID EmployeeName Department Salary role_validity_period
17170
+ 3 Bob Marketing 330.0 ('29/01/01', '99/12/31')
17171
+ """
17172
+ from teradataml.dataframe.functions import current_date, current_timestamp
17173
+ # Validate temporal table type.
17174
+ _Validators._validate_temporal_table_type(self.df_type)
17175
+ valid_time_col = self._valid_time_column
17176
+ df = self._generate_temporal_dataframe("NONSEQUENCED VALIDTIME", valid_time_col)
17177
+ # Check the type of the ValidTime dimension column
17178
+ if isinstance(valid_time_col.type, tdtypes.PERIOD_DATE):
17179
+ # Filter records where the start of the ValidTime period is greater than the current date
17180
+ return df[valid_time_col.begin() > current_date()]
17181
+ return df[valid_time_col.begin() > current_timestamp()]
17182
+
17183
+ def open_rows(self):
17184
+ """
17185
+ DESCRIPTION:
17186
+ Retrieves open rows from a DataFrame created on a transaction-time
17187
+ or bi-temporal table/view. Open rows are defined as those where the
17188
+ end of the transaction-time period is greater than or equal to the current time.
17189
+
17190
+ PARAMETERS:
17191
+ None.
17192
+
17193
+ RETURNS:
17194
+ teradataml DataFrame.
17195
+
17196
+ RAISES:
17197
+ TeradataMLException.
17198
+
17199
+ EXAMPLES:
17200
+ # Load the data to run the example.
17201
+ >>> load_example_data("teradataml", "Employee_address")
17202
+
17203
+ # Create a DataFrame on 'Employee_address' table.
17204
+ >>> df = DataFrame("Employee_address")
17205
+
17206
+ # Retrieve open rows from the DataFrame.
17207
+ >>> df.open_rows()
17208
+ EmployeeID EmployeeName address validity_period
17209
+ 1 John Doe 123 Main St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
17210
+ 2 Jane Smith 456 Elm St ('2025-03-04 15:41:44.610000+00:00', '9999-12-31 23:59:59.999999+00:00')
17211
+ """
17212
+ from teradataml.dataframe.functions import current_timestamp
17213
+ # Validate temporal table type.
17214
+ _Validators._validate_temporal_table_type(self.df_type)
17215
+ transaction_time_col = self._transaction_time_column
17216
+ df = self._generate_temporal_dataframe("NONSEQUENCED TRANSACTIONTIME", transaction_time_col)
17217
+ return df[transaction_time_col.end() >= current_timestamp()]
17218
+
17219
+ def closed_rows(self):
17220
+ """
17221
+ DESCRIPTION:
17222
+ Retrieves closed rows from a DataFrame created on a transaction-time
17223
+ or bi-temporal table/view. Closed rows are defined as those where the
17224
+ end of the transaction-time period is less than the current time.
17225
+
17226
+ PARAMETERS:
17227
+ None.
17228
+
17229
+ RETURNS:
17230
+ teradataml DataFrame.
17231
+
17232
+ RAISES:
17233
+ TeradataMLException.
17234
+
17235
+ EXAMPLES:
17236
+ # Load the data to run the example.
17237
+ >>> load_example_data("teradataml", "Employee_address")
17238
+
17239
+ # Create a DataFrame on 'Employee_address' table.
17240
+ >>> df = DataFrame("Employee_address")
17241
+
17242
+ # Retrieve closed rows from the DataFrame.
17243
+ >>> df.closed_rows()
17244
+ EmployeeID EmployeeName address validity_period
17245
+ 1 John Doe 123 Main St ('2025-03-04 15:41:44.610000+00:00', '2025-04-01 23:59:59.999999+00:00')
17246
+ """
17247
+ from teradataml.dataframe.functions import current_timestamp
17248
+ # Validate temporal table type.
17249
+ _Validators._validate_temporal_table_type(self.df_type)
17250
+ transaction_time_col = self._transaction_time_column
17251
+ df = self._generate_temporal_dataframe("NONSEQUENCED TRANSACTIONTIME", transaction_time_col)
17252
+ return df[transaction_time_col.end() < current_timestamp()]
17253
+
17254
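Taken together, the four row-filter helpers above reduce to simple comparisons between a period bound and the current time. The following standalone sketch (plain Python dates, not library code; the function names are illustrative) restates those predicates for a single row, which is essentially what the generated SQL filter evaluates per record.

    from datetime import date, datetime, timezone

    def is_historic(valid_period, now=None):
        # End of the valid-time period precedes the current time.
        now = now or date.today()
        return valid_period[1] < now

    def is_future(valid_period, now=None):
        # Start of the valid-time period is after the current time.
        now = now or date.today()
        return valid_period[0] > now

    def is_open(transaction_period, now=None):
        # End of the transaction-time period is at or after the current time.
        now = now or datetime.now(timezone.utc)
        return transaction_period[1] >= now

    def is_closed(transaction_period, now=None):
        # End of the transaction-time period precedes the current time.
        now = now or datetime.now(timezone.utc)
        return transaction_period[1] < now

    # e.g. the ('20/01/01', '24/12/31') role from the Employee_roles examples:
    print(is_historic((date(2020, 1, 1), date(2024, 12, 31)), now=date(2025, 6, 1)))  # True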
+ @collect_queryband(queryband="DF_create_view")
17255
+ def create_view(self, view_name, schema_name=None):
17256
+ """
17257
+ Creates a view from the DataFrame object in the specified schema.
17258
+ As teradataml creates views, internally for operations, which will be garbage
17259
+ collected during remove_context(), this function helps the user to persist the
17260
+ DataFrame as a view.
17261
+ Note:
17262
+ The persisted view can be used across sessions and can be accessed
17263
+ using the view_name and schema_name.
17264
+
17265
+ PARAMETERS:
17266
+ view_name:
17267
+ Required Argument.
17268
+ Specifies the name of the view to be persisted.
17269
+ Types: str
17270
+
17271
+ schema_name:
17272
+ Optional Argument.
17273
+ Specifies the schema name where the view is to be persisted.
17274
+ Note:
17275
+ If the schema_name is not provided, the current database will be used.
17276
+ Types: str
17277
+
17278
+ RETURNS:
17279
+ Persisted teradataml DataFrame.
17280
+
17281
+ RAISES:
17282
+ TeradataMlException
17283
+
17284
+ EXAMPLES:
+ # Load the data to run the example.
+ >>> load_example_data("antiselect", ["antiselect_input"])
+ >>> antiselect_input = DataFrame.from_table("antiselect_input")
+ >>> antiselect_input
+ orderid orderdate priority quantity sales discount shipmode custname province region custsegment prodcat
+ rowids
+ 49 293 12/10/01 high 49 10123.0200 0.07 delivery truck barry french nunavut nunavut consumer office supplies
+ 97 613 11/06/17 high 12 93.5400 0.03 regular air carl jackson nunavut nunavut corporate office supplies
+ 85 515 10/08/28 not specified 19 394.2700 0.08 regular air carlos soltero nunavut nunavut consumer office supplies
+ 86 515 10/08/28 not specified 21 146.6900 0.05 regular air carlos soltero nunavut nunavut consumer furniture
+ 1 3 10/10/13 low 6 261.5400 0.04 regular air muhammed macintyre nunavut nunavut small business office supplies
+ 50 293 12/10/01 high 27 244.5700 0.01 regular air barry french nunavut nunavut consumer office supplies
+ 80 483 11/07/10 high 30 4965.7595 0.08 regular air clay rozendal nunavut nunavut corporate technology
+
+ # Filter the data based on quantity.
+ >>> anti_df = antiselect_input[antiselect_input.quantity < 30]
+ >>> anti_df
+ orderid orderdate priority quantity sales discount shipmode custname province region custsegment prodcat
+ rowids
+ 97 613 11/06/17 high 12 93.54 0.03 regular air carl jackson nunavut nunavut corporate office supplies
+ 86 515 10/08/28 not specified 21 146.69 0.05 regular air carlos soltero nunavut nunavut consumer furniture
+ 85 515 10/08/28 not specified 19 394.27 0.08 regular air carlos soltero nunavut nunavut consumer office supplies
+ 1 3 10/10/13 low 6 261.54 0.04 regular air muhammed macintyre nunavut nunavut small business office supplies
+ 50 293 12/10/01 high 27 244.57 0.01 regular air barry french nunavut nunavut consumer office supplies
+
+ # Run Antiselect on the filtered data. This will create a temporary view, which will be garbage collected.
+ >>> obj = Antiselect(data=anti_df, exclude=['rowids', 'orderdate', 'discount', 'province', 'custsegment'])
+
+ # Get the view name that is internally created by teradataml to store the result of Antiselect.
+ >>> obj.result.db_object_name
+ '"<schema_name>"."ml__td_sqlmr_out__1752582812690000"'
+
+ # Check the output of Antiselect.
+ >>> obj.result
+ orderid priority quantity sales shipmode custname region prodcat
+ 0 613 high 12 93.54 regular air carl jackson nunavut office supplies
+ 1 515 not specified 21 146.69 regular air carlos soltero nunavut furniture
+ 2 515 not specified 19 394.27 regular air carlos soltero nunavut office supplies
+ 3 293 high 27 244.57 regular air barry french nunavut office supplies
+ 4 3 low 6 261.54 regular air muhammed macintyre nunavut office supplies
+
+ # Describe the resultant DataFrame.
+ >>> df = obj.result.describe() # This will create a temporary view.
+
+ # Get the view name.
+ >>> df.db_object_name
+ '"<schema_name>"."ml__td_sqlmr_out__1752585435339977"'
+
+ # Check the output of describe.
+ >>> df
+ ATTRIBUTE StatName StatValue
+ 0 orderid MAXIMUM 613.000000
+ 1 orderid STANDARD DEVIATION 245.016734
+ 2 orderid PERCENTILES(25) 293.000000
+ 3 orderid PERCENTILES(50) 515.000000
+ 4 quantity COUNT 5.000000
+ 5 quantity MINIMUM 6.000000
+ 6 quantity MAXIMUM 27.000000
+ 7 quantity MEAN 17.000000
+ 8 quantity STANDARD DEVIATION 8.154753
+ 9 quantity PERCENTILES(25) 12.000000
+
+ # Example 1: Persist the view which can be accessed across sessions.
+ >>> df_new = df.create_view(view_name="antiselect_describe_view")
+ >>> df_new
+ ATTRIBUTE StatName StatValue
+ 0 quantity MAXIMUM 27.000000
+ 1 quantity STANDARD DEVIATION 8.154753
+ 2 quantity PERCENTILES(25) 12.000000
+ 3 quantity PERCENTILES(50) 19.000000
+ 4 sales COUNT 5.000000
+ 5 sales MINIMUM 93.540000
+ 6 orderid COUNT 5.000000
+ 7 orderid MINIMUM 3.000000
+ 8 orderid MAXIMUM 613.000000
+ 9 orderid MEAN 387.800000
+
+ # Get the view name.
+ >>> df_new.db_object_name # "<schema_name>" is the user's connected database.
+ '"<schema_name>"."antiselect_describe_view"'
+
+ """
+ # Argument validation.
+ arg_info_matrix = []
+ arg_info_matrix.append(["view_name", view_name, False, (str,), True])
+ arg_info_matrix.append(["schema_name", schema_name, True, (str,), True])
+ _Validators._validate_missing_required_arguments(arg_info_matrix)
+ _Validators._validate_function_arguments(arg_info_matrix)
+
+ # TODO: Investigate and identify issues when volatile tables replace views in the future.
+
+ visited = set()
+ to_persist = []
+ is_teradataml_temp_table = lambda x: x.startswith("ml__") or x.startswith("tdml_")
+ sql_bundle = SQLBundle()
+
+ def trace_views(table_name):
+ if table_name in visited:
+ return
+ visited.add(table_name)
+ base_name = UtilFuncs._extract_table_name(full_qualified_name=table_name)
+ if is_teradataml_temp_table(base_name):
+ to_persist.append(table_name)
+ # Try to get the SQL for the view.
+ show_view_sql = sql_bundle._get_sql_query(SQLConstants.SQL_SHOW_VIEW).\
+ format(table_name)
+ try:
+ result = execute_sql(show_view_sql).fetchall()
+ if result:
+ view_sql = result[0][0].replace("\r", "").replace("\n", " ")\
+ .replace("\t", " ").strip()
+
+ # Extract all table names from the view SQL.
+ for tname in UtilFuncs.extract_table_names_from_query(view_sql):
+ trace_views(tname)
+ except Exception as e:
+ # Check if the error is like 'not a view', then try SHOW TABLE.
+ err_msg = str(e).lower()
+ if 'not a view' in err_msg:
+ show_table_sql = sql_bundle._get_sql_query(SQLConstants.SQL_SHOW_TABLE).\
+ format(table_name)
+ try:
+ result = execute_sql(show_table_sql).fetchall()
+ if result:
+ # Table found, nothing to trace further.
+ # This table is persisted.
+ return
+ except Exception as e2:
+ # If SHOW TABLE also fails, raise the exception.
+ raise e2
+ else:
+ # If the error is not about 'not a view', re-raise.
+ raise e
+
+ # 1. Get the query for this DataFrame.
+ query = self.show_query()
+ # 2. Extract all table names from the query.
+ for tname in UtilFuncs.extract_table_names_from_query(query):
+ trace_views(tname)
+
+ # 3. Persist the current DataFrame as a permanent object.
+ # This issues a CREATE VIEW <target_name> AS <select_expression> statement.
+ # Use object_name, schema_name as needed.
+ from teradataml.dbutils.dbutils import _get_quoted_object_name
+ target_name = _get_quoted_object_name(schema_name=schema_name, object_name=view_name)
+
+ create_sql = sql_bundle._build_create_view(view_name=target_name,
+ select_expression=query)
+
+ # No try-except here, as we want to raise any error that occurs during execution.
+ execute_sql(create_sql)
+
+ # TODO: Add a logger message that these views/tables were persisted.
+ # if to_persist:
+ # logger.info("to_persist: ", to_persist)
+
+ # Remove the tables/views from the GC file as we need to persist them. Remove them only
+ # after the required object is created.
+ GarbageCollector._delete_object_entry(objects_to_delete=to_persist,
+ object_type=None,
+ remove_entry_from_gc_list=True)
+
+ # Return the teradataml DataFrame for the persisted object.
+ if schema_name is None:
+ schema_name = tdmlctx._get_current_databasename()
+ return DataFrame(in_schema(schema_name=schema_name, table_name=view_name))
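The body of create_view() does two things: it recursively walks the chain of views teradataml created internally (names prefixed "ml__" or "tdml_") so that every temporary object the new view depends on can be dropped from the garbage-collection list, and it then issues the CREATE VIEW for the DataFrame's current query. The sketch below shows only the recursive walk, assuming a toy in-memory mapping of object name to referenced names in place of the SHOW VIEW / SHOW TABLE round trips and SQL parsing the real method performs.

    # Toy mapping: object name -> names referenced in its defining SQL.
    definitions = {
        "ml__td_sqlmr_out__2": ["ml__td_sqlmr_out__1"],
        "ml__td_sqlmr_out__1": ["antiselect_input"],  # permanent base table: walk stops here
    }

    def is_temp(name):
        # teradataml's internal objects are prefixed "ml__" or "tdml_".
        return name.startswith("ml__") or name.startswith("tdml_")

    def trace(name, visited, to_persist):
        if name in visited:
            return
        visited.add(name)
        if is_temp(name):
            to_persist.append(name)            # must survive garbage collection
            for dep in definitions.get(name, []):
                trace(dep, visited, to_persist)

    visited, to_persist = set(), []
    # The DataFrame's own query references the outermost temporary view.
    for name in ["ml__td_sqlmr_out__2"]:
        trace(name, visited, to_persist)

    print(to_persist)  # ['ml__td_sqlmr_out__2', 'ml__td_sqlmr_out__1']

Deleting the garbage-collection entries only after the CREATE VIEW succeeds keeps the temporary objects eligible for cleanup if persistence fails, which is why the method executes the DDL before calling GarbageCollector._delete_object_entry().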
 
 
 class DataFrameGroupBy(DataFrame):
@@ -15450,7 +17457,7 @@ class DataFrameGroupBy(DataFrame):
 
 """
 
- def __init__(self, nodeid, metaexpr, column_names_and_types, columns, groupbyexpr, column_list, option=None):
+ def __init__(self, nodeid, metaexpr, column_names_and_types, columns, groupbyexpr, column_list, option=None, include_grouping_columns=False):
 """
 init() method for DataFrameGroupBy.
 
@@ -15491,6 +17498,15 @@ class DataFrameGroupBy(DataFrame):
 Permitted Values: "CUBE", "ROLLUP", None
 Types: str or NoneType
 
+ include_grouping_columns:
+ Optional Argument.
+ Specifies whether to include aggregations on the grouping column(s) or not.
+ When set to True, the resultant DataFrame includes aggregations on the
+ columns mentioned in "columns". Otherwise, the resultant DataFrame does not
+ include aggregations on the columns mentioned in "columns".
+ Default Value: False
+ Types: bool
+
 RETURNS:
 teradataml DataFrameGroupBy instance
 """
@@ -15500,6 +17516,7 @@ class DataFrameGroupBy(DataFrame):
 self._column_names_and_types = column_names_and_types
 self._columns = columns
 self.groupby_column_list = column_list
+ self._include_grouping_columns = include_grouping_columns
 
 def _get_assign_allowed_types(self):
 """
@@ -15585,7 +17602,8 @@ class DataFrameGroupBy(DataFrame):
 
 new_meta = UtilFuncs._get_metaexpr_using_columns(new_nodeid,
 zip(new_column_names,
- new_column_types))
+ new_column_types),
+ datalake=self._metaexpr.datalake)
 
 