teradataml 20.0.0.5__py3-none-any.whl → 20.0.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/README.md +96 -0
  3. teradataml/_version.py +1 -1
  4. teradataml/analytics/analytic_function_executor.py +1 -1
  5. teradataml/analytics/utils.py +56 -11
  6. teradataml/clients/auth_client.py +10 -6
  7. teradataml/clients/keycloak_client.py +165 -0
  8. teradataml/common/constants.py +10 -0
  9. teradataml/common/exceptions.py +32 -0
  10. teradataml/common/messagecodes.py +27 -0
  11. teradataml/common/messages.py +9 -1
  12. teradataml/common/sqlbundle.py +3 -2
  13. teradataml/common/utils.py +94 -12
  14. teradataml/context/context.py +37 -9
  15. teradataml/data/jsons/byom/onnxembeddings.json +1 -0
  16. teradataml/data/pattern_matching_data.csv +11 -0
  17. teradataml/data/sdk/modelops/modelops_spec.json +101737 -0
  18. teradataml/data/teradataml_example.json +8 -1
  19. teradataml/data/url_data.csv +10 -9
  20. teradataml/dataframe/copy_to.py +1 -1
  21. teradataml/dataframe/dataframe.py +980 -82
  22. teradataml/dataframe/dataframe_utils.py +58 -25
  23. teradataml/dataframe/functions.py +962 -1
  24. teradataml/dataframe/sql.py +570 -1031
  25. teradataml/hyperparameter_tuner/utils.py +4 -2
  26. teradataml/lib/aed_0_1.dll +0 -0
  27. teradataml/opensource/_base.py +7 -1
  28. teradataml/options/configure.py +20 -4
  29. teradataml/scriptmgmt/UserEnv.py +13 -2
  30. teradataml/scriptmgmt/lls_utils.py +99 -24
  31. teradataml/sdk/README.md +79 -0
  32. teradataml/sdk/__init__.py +4 -0
  33. teradataml/sdk/_auth_modes.py +422 -0
  34. teradataml/sdk/_func_params.py +487 -0
  35. teradataml/sdk/_json_parser.py +453 -0
  36. teradataml/sdk/_openapi_spec_constants.py +249 -0
  37. teradataml/sdk/_utils.py +236 -0
  38. teradataml/sdk/api_client.py +897 -0
  39. teradataml/sdk/constants.py +62 -0
  40. teradataml/sdk/modelops/__init__.py +98 -0
  41. teradataml/sdk/modelops/_client.py +406 -0
  42. teradataml/sdk/modelops/_constants.py +304 -0
  43. teradataml/sdk/modelops/models.py +2308 -0
  44. teradataml/sdk/spinner.py +107 -0
  45. teradataml/table_operators/query_generator.py +4 -21
  46. teradataml/utils/dtypes.py +2 -1
  47. teradataml/utils/utils.py +0 -1
  48. teradataml/utils/validators.py +5 -1
  49. {teradataml-20.0.0.5.dist-info → teradataml-20.0.0.6.dist-info}/METADATA +101 -2
  50. {teradataml-20.0.0.5.dist-info → teradataml-20.0.0.6.dist-info}/RECORD +53 -36
  51. {teradataml-20.0.0.5.dist-info → teradataml-20.0.0.6.dist-info}/WHEEL +0 -0
  52. {teradataml-20.0.0.5.dist-info → teradataml-20.0.0.6.dist-info}/top_level.txt +0 -0
  53. {teradataml-20.0.0.5.dist-info → teradataml-20.0.0.6.dist-info}/zip-safe +0 -0
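
Most of this release's churn is in teradataml/dataframe/dataframe.py: the DataFrame constructor is generalized from a table/view name to arbitrary client-side data, new classmethods from_pandas(), from_dict() and from_records() wrap that path, groupby()/cube()/rollup() gain an include_grouping_columns option, and DataFrames on datalake (OTF) tables gain snapshot/partition/manifest/history metadata accessors, alongside a new teradataml.sdk package with a ModelOps client. A minimal sketch of the new constructor path, assuming an active Vantage connection created via create_context():

    >>> import pandas as pd
    >>> from teradataml import DataFrame
    >>> # 20.0.0.6: the first positional argument is now "data" and accepts a
    >>> # pandas DataFrame, dict, list or numpy array in addition to a table/view name.
    >>> df = DataFrame(pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]}), index=False)
    >>> df = DataFrame({"col1": [1, 2], "col2": [3, 4]})  # dict of equal-length lists
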
@@ -12,63 +12,72 @@ This file implements the teradataml dataframe.
  A teradataml dataframe maps virtually to teradata tables and views.
  """
  import decimal
- import inspect, itertools
+ import inspect
+ import itertools
  import json
  import numbers
- import pandas as pd
  import re
- import sqlalchemy
  import sys
  import urllib.parse
+ from collections import OrderedDict
+ from collections.abc import Iterator

+ import numpy as np
+ import pandas as pd
+ import sqlalchemy
  from sqlalchemy import Column
+ from sqlalchemy.exc import NoSuchColumnError
+ from sqlalchemy.sql import ClauseElement
+ from teradatasql import OperationalError
+ from teradatasqlalchemy.dialect import dialect as td_dialect
+ from teradatasqlalchemy.dialect import preparer
+ from teradatasqlalchemy.types import (BIGINT, BYTEINT, DECIMAL, FLOAT, INTEGER,
+                                       PERIOD_TIMESTAMP, SMALLINT, _TDType)

  import teradataml.context.context as tdmlctx
-
- from collections import OrderedDict, namedtuple
- from sqlalchemy.sql import ClauseElement
- from teradataml import execute_sql
- from teradataml import GarbageCollector
- from teradataml.dataframe.sql import _MetaExpression
- from teradataml.dataframe.sql_interfaces import ColumnExpression
- from teradataml.dataframe.sql_functions import case
- from teradataml.series.series import Series
- from teradatasqlalchemy.types import _TDType, BIGINT, INTEGER, PERIOD_TIMESTAMP, SMALLINT, BYTEINT, FLOAT, DECIMAL
- from teradataml.common.deprecations import argument_deprecation
- from teradataml.common.utils import UtilFuncs
+ from teradataml import GarbageCollector, execute_sql
+ from teradataml.common.bulk_exposed_utils import \
+     _validate_unimplemented_function
+ from teradataml.common.constants import (AEDConstants, OutputStyle,
+                                          PTITableConstants, PythonTypes,
+                                          SourceType, SQLConstants,
+                                          SQLFunctionConstants,
+                                          TableOperatorConstants,
+                                          TeradataConstants, TeradataTypes)
  from teradataml.common.exceptions import TeradataMlException
- from teradataml.common.messages import Messages
  from teradataml.common.messagecodes import MessageCodes
- from teradataml.common.constants import AEDConstants
- from teradataml.common.constants import SourceType, PythonTypes, TeradataConstants, \
-     TeradataTypes, PTITableConstants, TableOperatorConstants, SQLFunctionConstants
- from teradataml.dataframe.dataframe_utils import DataFrameUtils as df_utils, DataFrameUtils
- from teradataml.dataframe.indexer import _LocationIndexer
- from teradataml.common.aed_utils import AedUtils
- from teradataml.options.display import display
- from teradataml.options.configure import configure
+ from teradataml.common.messages import Messages
+ from teradataml.common.sqlbundle import SQLBundle
+ from teradataml.common.utils import UtilFuncs
  from teradataml.dataframe.copy_to import copy_to_sql
+ from teradataml.dataframe.data_transfer import _DataTransferUtils
+ from teradataml.dataframe.dataframe_utils import DataFrameUtils
+ from teradataml.dataframe.dataframe_utils import DataFrameUtils as df_utils
+ from teradataml.dataframe.indexer import _LocationIndexer
  from teradataml.dataframe.row import _Row
  from teradataml.dataframe.setop import concat
+ from teradataml.dataframe.sql import _MetaExpression
+ from teradataml.dataframe.sql_functions import case
+ from teradataml.dataframe.sql_interfaces import ColumnExpression
+ from teradataml.dataframe.window import Window
  from teradataml.dbutils.dbutils import list_td_reserved_keywords
+ from teradataml.options.configure import configure
+ from teradataml.options.display import display
  from teradataml.plot.plot import _Plot
  from teradataml.scriptmgmt.UserEnv import UserEnv
- from teradataml.utils.dtypes import _Dtypes, _ListOf, _TupleOf
- from teradataml.utils.validators import _Validators
+ from teradataml.series.series import Series
  from teradataml.table_operators.table_operator_util import _TableOperatorUtils
- from teradatasqlalchemy.dialect import preparer, dialect as td_dialect
- from teradatasql import OperationalError
- from teradataml.dataframe.window import Window
- from teradataml.dataframe.data_transfer import _DataTransferUtils
- from teradataml.common.bulk_exposed_utils import _validate_unimplemented_function
  from teradataml.telemetry_utils.queryband import collect_queryband
- from teradataml.options.configure import configure
- from teradataml.utils.internal_buffer import _InternalBuffer
- from teradataml.common.constants import OutputStyle
+ from teradataml.utils.dtypes import _Dtypes, _ListOf, _TupleOf
+ from teradataml.utils.validators import _Validators
+
+ # Adding imports at the end to avoid circular imports.
+ from teradataml.common.aed_utils import AedUtils

  # TODO use logger when available on master branch
  # logger = teradatapylog.getLogger()

+
  class in_schema:
      """
      Class takes a schema name, a table name and datalake name attributes
@@ -149,26 +158,37 @@ class DataFrame():
      on tables, views, and queries on Teradata Vantage.
      """

-     def __init__(self, table_name=None, index=True, index_label=None, query=None, materialize=False):
+     def __init__(self, data=None, index=True, index_label=None, query=None, materialize=False, **kwargs):
          """
          Constructor for teradataml DataFrame.

          PARAMETERS:
-             table_name:
+             data:
                  Optional Argument.
-                 The table name or view name in Teradata Vantage referenced by this DataFrame.
-                 Types: str
+                 Specifies the input data to create a teradataml DataFrame.
+                 Notes:
+                     If a dictionary is provided, it must follow the below requirements:
+                         * Keys must be strings (column names).
+                         * Values must be lists of equal length (column data).
+                         * Nested dictionaries are not supported.
+                 Types: str OR pandas DataFrame OR in_schema OR numpy array OR list OR dictionary

              index:
                  Optional Argument.
-                 True if using index column for sorting, otherwise False.
+                 If "data" is a string, then the argument specifies whether to use the index column
+                 for sorting or not.
+                 If "data" is a pandas DataFrame, then this argument specifies whether to
+                 save Pandas DataFrame index as a column or not.
                  Default Value: True
                  Types: bool

              index_label:
                  Optional Argument.
-                 Column/s used for sorting.
-                 Types: str OR list of Strings (str)
+                 If "data" is a string, then the argument specifies column(s) used for sorting.
+                 If "data" is a pandas DataFrame, then the default behavior is applied.
+                 Note:
+                     * Refer to the "index_label" parameter of copy_to_sql() for details on the default behavior.
+                 Types: str OR list of str

              query:
                  Optional Argument.
@@ -187,29 +207,127 @@ class DataFrame():
                  Default Value: False (No materialization)
                  Types: bool

+             kwargs:
+                 table_name:
+                     Optional Argument.
+                     The table name or view name in Teradata Vantage referenced by this DataFrame.
+                     Note:
+                         * If "data" and "table_name" are both specified, then the "table_name" argument is ignored.
+                     Types: str or in_schema
+
+                 primary_index:
+                     Optional Argument.
+                     Specifies which column(s) to use as primary index for the teradataml DataFrame.
+                     Note:
+                         * This argument is only applicable when creating a DataFrame from a pandas DataFrame.
+                     Types: str OR list of str
+
+                 types:
+                     Optional Argument.
+                     Specifies required data types for requested columns to be saved in Teradata Vantage.
+                     Notes:
+                         * This argument is not applicable when "data" argument is of type str or in_schema.
+                         * Refer to the "types" parameter of copy_to_sql() for more details.
+                     Types: dict
+
+                 columns:
+                     Optional Argument.
+                     Specifies the names of the columns to be used in the DataFrame.
+                     Notes:
+                         * This argument is not applicable when "data" argument is of type str or in_schema.
+                         * If "data" is a dictionary and this argument is specified, only the specified columns will be
+                           included in the DataFrame if the dictionary contains those keys. If the dictionary does not
+                           contain the specified keys, those columns will be added with NaN values.
+                     Types: str OR list of str
+
          EXAMPLES:
-             from teradataml.dataframe.dataframe import DataFrame
+             >>> from teradataml.dataframe.dataframe import DataFrame
+             >>> import pandas as pd
+
+             # Example 1: Create a teradataml DataFrame from table name.
+             >>> df = DataFrame("mytab")

-             # Example 1: The following example creates a DataFrame from the 'table_name'
-             # or 'view_name'.
-             # Created DataFrame using table name.
-             df = DataFrame("mytab")
+             # Example 2: Create a teradataml DataFrame from view name.
+             >>> df = DataFrame("myview")

-             # Created DataFrame using view name.
-             df = DataFrame("myview")
+             # Example 3: Create a teradataml DataFrame using view name without using index column for sorting.
+             >>> df = DataFrame("myview", False)

-             # Created DataFrame using view name without using index column for sorting.
-             df = DataFrame("myview", False)
+             # Example 4: Create a teradataml DataFrame using table name and consider columns Col1 and Col2
+             #            while running DataFrame.head() or DataFrame.tail() methods.
+             >>> df = DataFrame("mytab", True, ["Col1", "Col2"])

-             # Created DataFrame using table name and sorted using Col1 and Col2
-             df = DataFrame("mytab", True, "Col1, Col2")
+             # Example 5: Create a teradataml DataFrame from the existing Vantage table "dbcinfo"
+             #            in the non-default database "dbc" using the in_schema() object.
+             >>> from teradataml.dataframe.dataframe import in_schema
+             >>> df = DataFrame(in_schema("dbc", "dbcinfo"))

+             # Example 6: Create a teradataml DataFrame from a pandas DataFrame.
+             >>> pdf = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
+             >>> df = DataFrame(pdf)
+             >>> df
+                col1  col2  index_label
+             0     3     6            2
+             1     2     5            1
+             2     1     4            0
+
+             # Example 7: Create a teradataml DataFrame from a pandas DataFrame without index column.
+             >>> pdf = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
+             >>> df = DataFrame(data=pdf, index=False)
+             >>> df
+                col1  col2
+             0     3     6
+             1     2     5
+             2     1     4
+
+             # Example 8: Create a teradataml DataFrame from a pandas DataFrame with
+             #            index label and primary index as 'id'.
+             >>> pdf = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
+             >>> df = DataFrame(pdf, index=True, index_label='id', primary_index='id')
+             >>> df
+                 col1  col2
+             id
+             2      3     6
+             1      2     5
+             0      1     4

-             # Example 2: The following example creates a DataFrame from the existing Vantage
-             # table "dbcinfo" in the non-default database "dbc" using the
-             # in_schema() function.
-             from teradataml.dataframe.dataframe import in_schema
-             df = DataFrame(in_schema("dbc", "dbcinfo"))
+             # Example 9: Create a teradataml DataFrame from list of lists.
+             >>> df = DataFrame([[1, 2], [3, 4]])
+             >>> df
+                col_0  col_1  index_label
+             0      3      4            1
+             1      1      2            0
+
+             # Example 10: Create a teradataml DataFrame from numpy array.
+             >>> import numpy as np
+             >>> df = DataFrame(np.array([[1, 2], [3, 4]]), index=True, index_label="id")
+             >>> df
+                 col_0  col_1
+             id
+             1       3      4
+             0       1      2
+
+             # Example 11: Create a teradataml DataFrame from a dictionary.
+             >>> df = DataFrame({"col1": [1, 2], "col2": [3, 4]}, index=True, index_label="id")
+             >>> df
+                 col1  col2
+             id
+             1      2     4
+             0      1     3
+
+             # Example 12: Create a teradataml DataFrame from list of dictionaries.
+             >>> df = DataFrame([{"col1": 1, "col2": 2}, {"col1": 3, "col2": 4}], index=False)
+             >>> df
+                col1  col2
+             0     3     4
+             1     1     2
+
+             # Example 13: Create a teradataml DataFrame from list of tuples.
+             >>> df = DataFrame([("Alice", 1), ("Bob", 2)])
+             >>> df
+                col_0  col_1  index_label
+             0  Alice      1            1
+             1    Bob      2            0

          RAISES:
              TeradataMlException - TDMLDF_CREATE_FAIL
@@ -253,12 +371,25 @@ class DataFrame():
          self._table = None
          self._otf = False

-         if isinstance(table_name, in_schema):
-             self._table = table_name.table_name
-             self._datalake = table_name.datalake_name
-             self._database = table_name.schema_name
+         table_name = kwargs.get("table_name", None)
+         primary_index = kwargs.get("primary_index", None)
+         columns = kwargs.get("columns", None)
+         types = kwargs.get("types", None)
+
+         # If data is an instance of in_schema, or data is None and table_name
+         # is an instance of in_schema, then assign the table_name, datalake_name
+         # and schema_name to the DataFrame object.
+         schema_obj = data if isinstance(data, in_schema) else (
+             table_name if data is None and isinstance(table_name, in_schema) else None)
+
+         if schema_obj:
+             self._table = schema_obj.table_name
+             self._datalake = schema_obj.datalake_name
+             self._database = schema_obj.schema_name
              self._otf = True if self._datalake else False

+         # Convert schema objects to strings.
+         data = str(data) if isinstance(data, in_schema) else data
          table_name = str(table_name) if isinstance(table_name, in_schema) else table_name

          # Below matrix is list of list, where in each row contains following elements:
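
Note the resolution order above: an in_schema object can arrive either positionally as "data" or through the new "table_name" keyword, and "data" wins when both are given. A short sketch of the two equivalent spellings, assuming an active connection:

    >>> from teradataml.dataframe.dataframe import DataFrame, in_schema
    >>> df = DataFrame(in_schema("dbc", "dbcinfo"))             # positional, as in 20.0.0.5
    >>> df = DataFrame(table_name=in_schema("dbc", "dbcinfo"))  # keyword form now used by from_table()
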
@@ -277,18 +408,49 @@ class DataFrame():
          # 6. element6 --> A list of permitted values, an argument can accept.
          #    If not specified, it is as good as passing None. If a list is passed, validation will be
          #    performed for permitted values.
+
          awu_matrix = []
-         awu_matrix.append(["table_name", table_name, True, (str), True])
+         dtypes = (list, tuple, dict)
+         awu_matrix.append(["data", data, True, (str, pd.DataFrame, np.ndarray, dict, _ListOf(dtypes)), True])
          awu_matrix.append(["index", index, True, (bool)])
          awu_matrix.append(["index_label", index_label, True, (str, list)])
          awu_matrix.append(["query", query, True, (str), True])
          awu_matrix.append(["materialize", materialize, True, (bool)])
+         awu_matrix.append(["table_name", table_name, True, (str), True])
+         awu_matrix.append(["primary_index", primary_index, True, (str, list)])
+         awu_matrix.append(["types", types, True, (dict)])
+         awu_matrix.append(["columns", columns, True, (str, list), True])

          # Validate argument types
          _Validators._validate_function_arguments(awu_matrix)

+         # Convert columns to a list if it is a string.
+         if isinstance(columns, str):
+             columns = [columns]
+
          try:
-             if table_name is not None:
+             if table_name is not None or data is not None:
+
+                 # If data is a list, numpy array or dictionary, then convert it to a pandas DataFrame.
+                 if isinstance(data, (list, np.ndarray, dict)):
+                     data = pd.DataFrame(data, columns=columns)
+                 # If the data is a pandas DataFrame, then store the data in a temporary table in Vantage.
+                 if isinstance(data, pd.DataFrame):
+                     # Create a copy of the pandas DataFrame to avoid modifying the original,
+                     # because column names will be changed if they are integers.
+                     pd_data = data.copy()
+                     # If the columns are not of type string, then convert them to string.
+                     pd_data.columns = [f"col_{i}" if isinstance(i, int) else i for i in pd_data.columns]
+                     # Set the table_name to the name of the table created in the database.
+                     table_name = UtilFuncs._generate_temp_table_name(prefix="from_pandas",
+                                                                      table_type=TeradataConstants.TERADATA_TABLE)
+
+                     copy_to_sql(pd_data, table_name, index=index, index_label=index_label, primary_index=primary_index,
+                                 types=types)
+                 # If the data is a string, then set the table_name to the data.
+                 elif isinstance(data, str):
+                     table_name = data
+
                  self._table_name = UtilFuncs._quote_table_names(table_name)
                  self._source_type = SourceType.TABLE.value
                  self._nodeid = self._aed_utils._aed_table(self._table_name)
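
As the block above shows, every non-name input is funneled through pandas and copy_to_sql(): lists, dicts and numpy arrays become a pandas DataFrame, integer column labels are renamed to col_<i>, and the frame is materialized into a temporary table whose name is generated with the "from_pandas" prefix. A rough equivalent of what DataFrame(pdf) now does under the hood, assuming an active connection (the table name here is illustrative):

    >>> from teradataml import DataFrame, copy_to_sql
    >>> copy_to_sql(pdf, "from_pandas_tmp1", index=True)  # hypothetical temp table name
    >>> df = DataFrame("from_pandas_tmp1")
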
@@ -342,6 +504,12 @@ class DataFrame():
              elif "[Error 3706] Syntax error" in str(oe):
                  raise ValueError(Messages.get_message(
                      MessageCodes.FROM_QUERY_SELECT_SUPPORTED).format("Check the syntax."))
+             elif "[Error 7825]" in str(oe):
+                 # The UDF/XSP/UDM routine has thrown an SQLException
+                 # with an SQL state in the range of 38001-38999, which
+                 # is not a syntax error, hence not a ValueError w.r.t. the query string.
+                 # Expected when an OTF snapshot related query is executed.
+                 raise
              raise ValueError(Messages.get_message(
                  MessageCodes.FROM_QUERY_SELECT_SUPPORTED))

@@ -503,7 +671,7 @@ class DataFrame():
                  Types: str

          EXAMPLES:
-             >>> from teradataml.dataframe.dataframe import DataFrame
+             >>> from teradataml import DataFrame

              # Example 1: The following example creates a DataFrame from a table or
              #            a view.
@@ -543,9 +711,9 @@ class DataFrame():

          """
          if schema_name:
-             return cls(in_schema(schema_name, table_name, datalake_name))
-
-             return cls(table_name, index, index_label)
+             return cls(table_name=in_schema(schema_name, table_name, datalake_name),
+                        index=index, index_label=index_label)
+         return cls(table_name=table_name, index=index, index_label=index_label)

      @classmethod
      @collect_queryband(queryband="DF_fromQuery")
@@ -692,6 +860,300 @@ class DataFrame():
              df.__setattr__(arg, arg_value)
          return df

+     @classmethod
+     @collect_queryband(queryband="DF_fromPandas")
+     def from_pandas(cls, pandas_df, index=True, index_label=None, primary_index=None):
+         """
+         DESCRIPTION:
+             Creates a teradataml DataFrame from a pandas DataFrame.
+
+         PARAMETERS:
+             pandas_df:
+                 Required Argument.
+                 Specifies the pandas DataFrame to be converted to teradataml DataFrame.
+                 Types: pandas DataFrame
+
+             index:
+                 Optional Argument.
+                 Specifies whether to save Pandas DataFrame index as a column or not.
+                 Default Value: True
+                 Types: bool
+
+             index_label:
+                 Optional Argument.
+                 Specifies the column label(s) for Pandas DataFrame index column(s).
+                 Note:
+                     * Refer to the "index_label" parameter of copy_to_sql() for more details.
+                 Default Value: None
+                 Types: str OR list of str
+
+             primary_index:
+                 Optional Argument.
+                 Specifies which column(s) to use as primary index for the teradataml DataFrame.
+                 Types: str OR list of str
+
+         RETURNS:
+             teradataml DataFrame
+
+         RAISES:
+             TeradataMlException
+
+         EXAMPLES:
+             >>> import pandas as pd
+             >>> from teradataml import DataFrame
+             >>> pdf = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
+             >>> pdf1 = pd.DataFrame([[1, 2], [3, 4]])
+
+             # Example 1: Create a teradataml DataFrame from a pandas DataFrame.
+             >>> df = DataFrame.from_pandas(pdf)
+             >>> df
+                col1  col2  index_label
+             0     3     6            2
+             1     2     5            1
+             2     1     4            0
+
+             # Example 2: Create a teradataml DataFrame from a pandas DataFrame
+             #            and do not save the index as a column.
+             >>> df = DataFrame.from_pandas(pdf, index=False)
+             >>> df
+                col1  col2
+             0     3     6
+             1     2     5
+             2     1     4
+
+             # Example 3: Create a teradataml DataFrame from a pandas DataFrame
+             #            with index label as 'id' and set it as primary index.
+             >>> df = DataFrame.from_pandas(pdf, index=True, index_label='id', primary_index='id')
+             >>> df
+                 col1  col2
+             id
+             2      3     6
+             1      2     5
+             0      1     4
+
+             # Example 4: Create a teradataml DataFrame from a pandas DataFrame where
+             #            columns are not explicitly defined in the pandas DataFrame.
+             >>> df = DataFrame.from_pandas(pdf1)
+             >>> df
+                col_0  col_1  index_label
+             0      3      4            1
+             1      1      2            0
+         """
+         # Validate the 'pandas_df' argument; other arguments are validated as part of DataFrame().
+         arg_type_matrix = []
+         arg_type_matrix.append(["pandas_df", pandas_df, False, (pd.DataFrame,), True])
+
+         _Validators._validate_function_arguments(arg_type_matrix)
+
+         return cls(pandas_df, index, index_label, primary_index=primary_index)
+
+     @classmethod
+     @collect_queryband(queryband="DF_fromDict")
+     def from_dict(cls, data, columns=None):
+         """
+         DESCRIPTION:
+             Creates a DataFrame from a dictionary containing values as lists or numpy arrays.
+
+         PARAMETERS:
+             data:
+                 Required Argument.
+                 Specifies the Python dictionary to create a teradataml DataFrame.
+                 Notes:
+                     * Keys of the dictionary are used as column names.
+                     * Values of the dictionary should be lists or numpy arrays.
+                     * Nested dictionaries are not supported.
+                 Types: dict
+
+             columns:
+                 Optional Argument.
+                 Specifies the column names for the DataFrame.
+                 Types: str OR list of str
+
+         RETURNS:
+             teradataml DataFrame
+
+         RAISES:
+             TeradataMlException
+
+         EXAMPLES:
+             >>> from teradataml import DataFrame
+             >>> data_dict = {"name": ["Alice", "Bob", "Charlie"], "age": [25, 30, 28]}
+
+             # Example 1: Create a teradataml DataFrame from a dictionary where
+             #            keys are column names and values are lists of column data.
+             >>> df = DataFrame.from_dict(data_dict)
+             >>> df
+                   name  age
+             0  Charlie   28
+             1      Bob   30
+             2    Alice   25
+
+             # Example 2: Create a teradataml DataFrame from a dictionary where
+             #            keys are column names and values are numpy arrays.
+             >>> import numpy as np
+             >>> data_dict = {"col1": np.array([1, 2, 3]), "col2": np.array([4, 5, 6])}
+             >>> df = DataFrame.from_dict(data_dict)
+             >>> df
+                col1  col2
+             0     3     6
+             1     2     5
+             2     1     4
+         """
+         arg_type_matrix = []
+         arg_type_matrix.append(["data", data, False, (dict), True])
+         arg_type_matrix.append(["columns", columns, True, (str, list), True])
+
+         _Validators._validate_function_arguments(arg_type_matrix)
+
+         return cls(data, columns=columns, index=False)
+
+     @classmethod
+     @collect_queryband(queryband="DF_fromRecords")
+     def from_records(cls, data, columns=None, **kwargs):
+         """
+         DESCRIPTION:
+             Creates a DataFrame from a list of lists/tuples/dictionaries/numpy arrays.
+
+         PARAMETERS:
+             data:
+                 Required Argument.
+                 Specifies the iterator of data or the list of lists/tuples/dictionaries/numpy arrays to
+                 be converted to teradataml DataFrame.
+                 Note:
+                     * Nested lists, tuples or dictionaries are not supported.
+                 Types: Iterator, list
+
+             columns:
+                 Optional Argument.
+                 Specifies the column names for the DataFrame.
+                 Note:
+                     * If the data is a list of lists/tuples/numpy arrays and this argument
+                       is not specified, column names will be auto-generated as 'col_0', 'col_1', etc.
+                 Types: str OR list of str
+
+             kwargs:
+                 exclude:
+                     Optional Argument.
+                     Specifies the columns to be excluded from the DataFrame.
+                     Types: list OR tuple
+
+                 coerce_float:
+                     Optional Argument.
+                     Specifies whether to convert values of non-string, non-numeric objects (like decimal.Decimal)
+                     to floating point, useful for SQL result sets.
+                     Default Value: True
+                     Types: bool
+
+                 nrows:
+                     Optional Argument.
+                     Specifies the number of rows to be read from the data if the data is an iterator.
+                     Types: int
+
+         RETURNS:
+             teradataml DataFrame
+
+         RAISES:
+             TeradataMlException
+
+         EXAMPLES:
+             >>> from teradataml import DataFrame
+
+             # Example 1: Create a teradataml DataFrame from a list of lists.
+             >>> df = DataFrame.from_records([['Alice', 1], ['Bob', 2]], columns=['name', 'age'])
+             >>> df
+                 name  age
+             0    Bob    2
+             1  Alice    1
+
+             # Example 2: Create a teradataml DataFrame from a list of tuples.
+             >>> df = DataFrame.from_records([('Alice', 1), ('Bob', 3)], columns=['name', 'age'])
+             >>> df
+                 name  age
+             0    Bob    3
+             1  Alice    1
+
+             # Example 3: Create a teradataml DataFrame from a list of dictionaries.
+             >>> df = DataFrame.from_records([{'name': 'Alice', 'age': 4}, {'name': 'Bob', 'age': 2}])
+             >>> df
+                 name  age
+             0    Bob    2
+             1  Alice    4
+
+             # Example 4: Create a teradataml DataFrame from a list where columns
+             #            are not explicitly defined.
+             >>> df = DataFrame.from_records([['Alice', 1], ['Bob', 2]])
+             >>> df
+                 col_0  col_1
+             0     Bob      2
+             1   Alice      1
+
+             # Example 5: Create a teradataml DataFrame from a list by excluding 'grade' column.
+             >>> df = DataFrame.from_records([['Alice', 1, 'A'], ['Bob', 2, 'B']],
+             ...                             columns=['name', 'age', 'grade'],
+             ...                             exclude=['grade'])
+             >>> df
+                 name  age
+             0    Bob    2
+             1  Alice    1
+
+             # Example 6: Create a teradataml DataFrame from a list of lists
+             #            with "coerce_float" set to False.
+             >>> from decimal import Decimal
+             >>> df = DataFrame.from_records([[1, Decimal('2.5')], [3, Decimal('4.0')]],
+             ...                             columns=['col1', 'col2'], coerce_float=False)
+             >>> df
+                col1 col2
+             0     3  4.0
+             1     1  2.5
+             >>> df.tdtypes
+             col1                                    BIGINT()
+             col2    VARCHAR(length=1024, charset='UNICODE')
+
+             # Example 7: Create a teradataml DataFrame from a list of lists
+             #            with "coerce_float" set to True.
+             >>> df = DataFrame.from_records([[1, Decimal('2.5')], [3, Decimal('4.0')]],
+             ...                             columns=['col1', 'col2'], coerce_float=True)
+             >>> df
+                col1  col2
+             0     3   4.0
+             1     1   2.5
+             >>> df.tdtypes
+             col1    BIGINT()
+             col2     FLOAT()
+
+             # Example 8: Create a teradataml DataFrame from an iterator with "nrows" set to 2.
+             >>> def data_gen():
+             ...     yield ['Alice', 1]
+             ...     yield ['Bob', 2]
+             ...     yield ['Charlie', 3]
+             >>> df = DataFrame.from_records(data_gen(), columns=['name', 'age'], nrows=2)
+             >>> df
+                 name  age
+             0    Bob    2
+             1  Alice    1
+         """
+
+         exclude = kwargs.get("exclude", None)
+         coerce_float = kwargs.get("coerce_float", True)
+         nrows = kwargs.get("nrows", None)
+
+         arg_type_matrix = []
+         dtypes = (list, tuple, dict)
+         arg_type_matrix.append(["data", data, False, (Iterator, _ListOf(dtypes)), True])
+         arg_type_matrix.append(["columns", columns, True, (str, _ListOf(str)), True])
+         arg_type_matrix.append(["exclude", exclude, True, (_ListOf(str),), True])
+         arg_type_matrix.append(["coerce_float", coerce_float, True, (bool,), True])
+         arg_type_matrix.append(["nrows", nrows, True, (int,), True])
+
+         _Validators._validate_function_arguments(arg_type_matrix)
+
+         if isinstance(columns, str):
+             columns = [columns]
+
+         df = pd.DataFrame.from_records(data, columns=columns, exclude=exclude,
+                                        coerce_float=coerce_float, nrows=nrows)
+         return cls(df, index=False)
+
      def create_temp_view(self, name):
          """
          DESCRIPTION:
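
All three classmethods are thin wrappers: from_pandas() and from_dict() forward to the constructor directly, while from_records() first delegates to pandas.DataFrame.from_records() (hence the exclude/coerce_float/nrows passthroughs) and then hands the result to the constructor with index=False. A small sketch combining those passthroughs, assuming an active connection:

    >>> from teradataml import DataFrame
    >>> rows = iter([("Alice", 1, "A"), ("Bob", 2, "B"), ("Carol", 3, "C")])
    >>> # "nrows" caps the rows consumed from the iterator; "exclude" drops a column.
    >>> df = DataFrame.from_records(rows, columns=["name", "age", "grade"],
    ...                             exclude=["grade"], nrows=2)
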
@@ -1149,9 +1611,19 @@ class DataFrame():
                                                   datalake=self._datalake)

          # Extract column names and corresponding teradatasqlalchemy types.
-         col_names, col_types = df_utils._get_datalake_table_columns_info(self._database,
-                                                                          self._table,
-                                                                          self._datalake)
+         try:
+             # For latest OTF help table query results.
+             col_names, col_types = df_utils._get_datalake_table_columns_info(self._database,
+                                                                              self._table,
+                                                                              self._datalake,
+                                                                              use_dialect=True)
+         except NoSuchColumnError:
+             # For older OTF help table query result.
+             col_names, col_types = df_utils._get_datalake_table_columns_info(self._database,
+                                                                              self._table,
+                                                                              self._datalake)
+
+         # Create a SQLAlchemy table object representing the datalake table.
          t = sqlalchemy.Table(self._table, meta, schema=self._database,
                               *(Column(col_name, col_type) for col_name, col_type in zip(col_names, col_types)))
          return _MetaExpression(t)
@@ -5888,8 +6360,11 @@ class DataFrame():
                  groupby_col_names.append(col)
                  groupby_col_types.append(self[col].type)

-                 if col in col_names:
-                     # If group by column is not specified in the columns argument,
+                 include_grouping_columns = True if isinstance(self, DataFrameGroupBy) and \
+                     self._include_grouping_columns else False
+                 if not include_grouping_columns and col in col_names:
+                     # If the 'include_grouping_columns' argument is set to True or the
+                     # group by column is not specified in the columns argument,
                      # then, we should ignore this processing, otherwise we
                      # should process it in the same way to remove the reference
                      # for grouping column from aggregation list.
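
In effect the new flag only changes which columns survive into the aggregation list: with the default False, a grouping column that also appears in the aggregation's column list is stripped out; with True it is kept, so the result carries both the grouping value and its aggregate (for example sum_admitted alongside admitted). A sketch against the admissions_train DataFrame used in the docstrings below:

    >>> df.groupby(["masters", "admitted"], include_grouping_columns=True).sum()   # keeps sum_admitted
    >>> df.groupby(["masters", "admitted"]).sum()                                  # drops it
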
@@ -8998,6 +9473,15 @@ class DataFrame():
                  Permitted Values: "CUBE", "ROLLUP", None
                  Types: str or NoneType

+             include_grouping_columns:
+                 Optional Argument.
+                 Specifies whether to include aggregations on the grouping column(s) or not.
+                 When set to True, the resultant DataFrame will have the aggregations on the
+                 columns mentioned in "columns_expr". Otherwise, the resultant DataFrame will
+                 not have aggregations on the columns mentioned in "columns_expr".
+                 Default Value: False
+                 Types: bool
+
          NOTES:
              1. Users can still apply teradataml DataFrame methods (filters/sort/etc) on top of the result.
              2. Consecutive operations of grouping, i.e., groupby_time(), resample() and groupby() are not permitted.
@@ -9014,14 +9498,54 @@ class DataFrame():
              TeradataMlException

          EXAMPLES:
+             # Load the data to run the example.
              >>> load_example_data("dataframe","admissions_train")
+
+             # Create a DataFrame on 'admissions_train' table.
              >>> df = DataFrame("admissions_train")
+             >>> df
+                masters   gpa     stats programming  admitted
+             id
+             15     yes  4.00  Advanced    Advanced         1
+             34     yes  3.85  Advanced    Beginner         0
+             13      no  4.00  Advanced      Novice         1
+             38     yes  2.65  Advanced    Beginner         1
+             5       no  3.44    Novice      Novice         0
+             40     yes  3.95    Novice    Beginner         0
+             7      yes  2.33    Novice      Novice         1
+             22     yes  3.46    Novice    Beginner         0
+             26     yes  3.57  Advanced    Advanced         1
+             17      no  3.83  Advanced    Advanced         1
+
+             # Example 1: Find the minimum value of all valid columns by
+             #            grouping the DataFrame with column 'masters'.
              >>> df1 = df.groupby(["masters"])
              >>> df1.min()
               masters  min_id  min_gpa min_stats min_programming  min_admitted
             0      no       3     1.87  Advanced        Advanced             0
             1     yes       1     1.98  Advanced        Advanced             0

+             # Example 2: Find the sum of all valid columns by grouping the DataFrame
+             #            with columns 'masters' and 'admitted'. Include grouping columns
+             #            in aggregate function 'sum'.
+             >>> df1 = df.groupby(["masters", "admitted"], include_grouping_columns=True)
+             >>> df1.sum()
+               masters  admitted  sum_id  sum_gpa  sum_admitted
+             0     yes         1     188    34.35            10
+             1     yes         0     289    43.36             0
+             2      no         0      41     6.44             0
+             3      no         1     302    57.52            16
+
+             # Example 3: Find the sum of all valid columns by grouping the DataFrame with
+             #            columns 'masters' and 'admitted'. Do not include grouping columns
+             #            in aggregate function 'sum'.
+             >>> df1 = df.groupby(["masters", "admitted"], include_grouping_columns=False)
+             >>> df1.sum()
+               masters  admitted  sum_id  sum_gpa
+             0     yes         0     289    43.36
+             1      no         0      41     6.44
+             2      no         1     302    57.52
+             3     yes         1     188    34.35
          """
          # Argument validations
          arg_info_matrix = []
@@ -9029,6 +9553,8 @@ class DataFrame():
          option = kwargs.get("option", None)
          arg_info_matrix.append(["option", option, True, (str, type(None)), True,
                                  ["CUBE", "ROLLUP", None]])
+         include_grouping_columns = kwargs.get("include_grouping_columns", False)
+         arg_info_matrix.append(["include_grouping_columns", include_grouping_columns, True, (bool)])

          # Validate argument types
          _Validators._validate_function_arguments(arg_info_matrix)
@@ -9073,7 +9599,8 @@ class DataFrame():

              groupbyexpr = ', '.join(UtilFuncs._teradata_quote_arg(col, "\"", False) for col in column_list)
              groupbyObj = DataFrameGroupBy(self._nodeid, self._metaexpr, self._column_names_and_types, self.columns,
-                                           groupbyexpr, column_list, option)
+                                           groupbyexpr, column_list, option, include_grouping_columns)
+
              return groupbyObj
          except TeradataMlException:
              raise
@@ -13150,7 +13677,7 @@ class DataFrame():
                                                False)
          column_names = list(dict.fromkeys(column_names))

-         if list_td_reserved_keywords(column_names) or UtilFuncs._is_ascii(column_names):
+         if list_td_reserved_keywords(column_names) or UtilFuncs._is_non_ascii(column_names):
              column_names = UtilFuncs._teradata_quote_arg(column_names, "\"", False)

          col_names_types = df_utils._get_required_columns_types_from_metaexpr(self._metaexpr, column_names)
@@ -15336,7 +15863,7 @@ class DataFrame():
          return self.assign(**new_columns, drop_columns=True).select(self.columns)

      @collect_queryband(queryband="DF_cube")
-     def cube(self, columns):
+     def cube(self, columns, include_grouping_columns=False):
          """
          DESCRIPTION:
              cube() function creates a multi-dimensional cube for the DataFrame
@@ -15350,6 +15877,15 @@ class DataFrame():
                  Specifies the name(s) of input teradataml DataFrame column(s).
                  Types: str OR list of str(s)

+             include_grouping_columns:
+                 Optional Argument.
+                 Specifies whether to include aggregations on the grouping column(s) or not.
+                 When set to True, the resultant DataFrame will have the aggregations on the
+                 columns mentioned in "columns". Otherwise, the resultant DataFrame will not
+                 have aggregations on the columns mentioned in "columns".
+                 Default Value: False
+                 Types: bool
+
          RETURNS:
              teradataml DataFrameGroupBy
@@ -15357,9 +15893,27 @@ class DataFrame():
              TeradataMlException

          EXAMPLES :
-             # Example 1: Analyzes the data by grouping into masters and stats dimensions.
+             # Load the data to run the example.
              >>> load_example_data("dataframe","admissions_train")
+
+             # Create a DataFrame on 'admissions_train' table.
              >>> df = DataFrame("admissions_train")
+             >>> df
+                masters   gpa     stats programming  admitted
+             id
+             15     yes  4.00  Advanced    Advanced         1
+             34     yes  3.85  Advanced    Beginner         0
+             13      no  4.00  Advanced      Novice         1
+             38     yes  2.65  Advanced    Beginner         1
+             5       no  3.44    Novice      Novice         0
+             40     yes  3.95    Novice    Beginner         0
+             7      yes  2.33    Novice      Novice         1
+             22     yes  3.46    Novice    Beginner         0
+             26     yes  3.57  Advanced    Advanced         1
+             17      no  3.83  Advanced    Advanced         1
+
+             # Example 1: Find the sum of all valid columns by grouping
+             #            the DataFrame with columns 'masters' and 'stats'.
              >>> df1 = df.cube(["masters", "stats"]).sum()
              >>> df1
               masters     stats  sum_id  sum_gpa  sum_admitted
@@ -15374,10 +15928,42 @@ class DataFrame():
             8      no  Advanced     189    34.95             9
             9     yes    Novice      98    13.74             1

+             # Example 2: Find the avg of all valid columns by grouping the DataFrame
+             #            with columns 'masters' and 'admitted'. Include grouping columns
+             #            in aggregate function 'avg'.
+             >>> df1 = df.cube(["masters", "admitted"], include_grouping_columns=True).avg()
+             >>> df1
+               masters  admitted     avg_id   avg_gpa  avg_admitted
+             0     yes       NaN  21.681818  3.532273      0.454545
+             1    None       1.0  18.846154  3.533462      1.000000
+             2      no       NaN  19.055556  3.553333      0.888889
+             3     yes       0.0  24.083333  3.613333      0.000000
+             4    None       NaN  20.500000  3.541750      0.650000
+             5    None       0.0  23.571429  3.557143      0.000000
+             6     yes       1.0  18.800000  3.435000      1.000000
+             7      no       1.0  18.875000  3.595000      1.000000
+             8      no       0.0  20.500000  3.220000      0.000000
+
+             # Example 3: Find the avg of all valid columns by grouping the DataFrame with
+             #            columns 'masters' and 'admitted'. Do not include grouping columns
+             #            in aggregate function 'avg'.
+             >>> df1 = df.cube(["masters", "admitted"], include_grouping_columns=False).avg()
+             >>> df1
+               masters  admitted     avg_id   avg_gpa
+             0      no       0.0  20.500000  3.220000
+             1    None       1.0  18.846154  3.533462
+             2      no       NaN  19.055556  3.553333
+             3     yes       0.0  24.083333  3.613333
+             4    None       NaN  20.500000  3.541750
+             5    None       0.0  23.571429  3.557143
+             6     yes       1.0  18.800000  3.435000
+             7     yes       NaN  21.681818  3.532273
+             8      no       1.0  18.875000  3.595000
          """
          # Validate columns argument.
          arg_info_matrix = []
          arg_info_matrix.append(["columns", columns, False, (str, list), True])
+         arg_info_matrix.append(["include_grouping_columns", include_grouping_columns, False, bool])

          # Validate argument types
          _Validators._validate_function_arguments(arg_info_matrix)
@@ -15387,10 +15973,10 @@ class DataFrame():

          # Query generation of cube API is same as the group by.
          # Only 'cube' is concatenated with 'group by' clause.
-         return self.groupby(columns, option="cube")
+         return self.groupby(columns, option="cube", include_grouping_columns=include_grouping_columns)

      @collect_queryband(queryband="DF_rollup")
-     def rollup(self, columns):
+     def rollup(self, columns, include_grouping_columns=False):
          """
          DESCRIPTION:
              rollup() function creates a multi-dimensional rollup for the DataFrame
@@ -15404,6 +15990,15 @@ class DataFrame():
                  Specifies the name(s) of input teradataml DataFrame column(s).
                  Types: str OR list of str(s)

+             include_grouping_columns:
+                 Optional Argument.
+                 Specifies whether to include aggregations on the grouping column(s) or not.
+                 When set to True, the resultant DataFrame will have the aggregations on the
+                 columns mentioned in "columns". Otherwise, the resultant DataFrame will not
+                 have aggregations on the columns mentioned in "columns".
+                 Default Value: False
+                 Types: bool
+
          RETURNS:
              teradataml DataFrameGroupBy
@@ -15411,9 +16006,27 @@ class DataFrame():
              TeradataMlException

          EXAMPLES :
-             # Example 1: Analyzes the data by grouping into masters and stats dimensions.
+             # Load the data to run the example.
              >>> load_example_data("dataframe","admissions_train")
+
+             # Create a DataFrame on 'admissions_train' table.
              >>> df = DataFrame("admissions_train")
+             >>> df
+                masters   gpa     stats programming  admitted
+             id
+             15     yes  4.00  Advanced    Advanced         1
+             34     yes  3.85  Advanced    Beginner         0
+             13      no  4.00  Advanced      Novice         1
+             38     yes  2.65  Advanced    Beginner         1
+             5       no  3.44    Novice      Novice         0
+             40     yes  3.95    Novice    Beginner         0
+             7      yes  2.33    Novice      Novice         1
+             22     yes  3.46    Novice    Beginner         0
+             26     yes  3.57  Advanced    Advanced         1
+             17      no  3.83  Advanced    Advanced         1
+
+             # Example 1: Find the sum of all valid columns by grouping
+             #            the DataFrame with columns 'masters' and 'stats'.
              >>> df1 = df.rollup(["masters", "stats"]).sum()
              >>> df1
               masters     stats  sum_id  sum_gpa  sum_admitted
@@ -15426,11 +16039,39 @@ class DataFrame():
             6     yes  Beginner      13    14.71             2
             7     yes  Advanced     366    49.26             7
             8      no  Advanced     189    34.95             9
-
+
+             # Example 2: Find the avg of all valid columns by grouping the DataFrame
+             #            with columns 'masters' and 'admitted'. Include grouping columns
+             #            in aggregate function 'avg'.
+             >>> df1 = df.rollup(["masters", "admitted"], include_grouping_columns=True).avg()
+             >>> df1
+               masters  admitted     avg_id   avg_gpa  avg_admitted
+             0      no       NaN  19.055556  3.553333      0.888889
+             1     yes       NaN  21.681818  3.532273      0.454545
+             2    None       NaN  20.500000  3.541750      0.650000
+             3     yes       0.0  24.083333  3.613333      0.000000
+             4      no       1.0  18.875000  3.595000      1.000000
+             5     yes       1.0  18.800000  3.435000      1.000000
+             6      no       0.0  20.500000  3.220000      0.000000
+
+             # Example 3: Find the avg of all valid columns by grouping the DataFrame with
+             #            columns 'masters' and 'admitted'. Do not include grouping columns
+             #            in aggregate function 'avg'.
+             >>> df1 = df.rollup(["masters", "admitted"], include_grouping_columns=False).avg()
+             >>> df1
+               masters  admitted     avg_id   avg_gpa
+             0      no       NaN  19.055556  3.553333
+             1     yes       NaN  21.681818  3.532273
+             2      no       0.0  20.500000  3.220000
+             3     yes       0.0  24.083333  3.613333
+             4      no       1.0  18.875000  3.595000
+             5     yes       1.0  18.800000  3.435000
+             6    None       NaN  20.500000  3.541750
          """
          # Validate columns argument.
          arg_info_matrix = []
          arg_info_matrix.append(["columns", columns, False, (str, list), True])
+         arg_info_matrix.append(["include_grouping_columns", include_grouping_columns, False, bool])

          # Validate argument types
          _Validators._validate_function_arguments(arg_info_matrix)
@@ -15440,8 +16081,255 @@ class DataFrame():

          # Query generation of rollup API is same as the group by.
          # Only 'rollup' is concatenated with 'group by' clause.
-         return self.groupby(columns, option="rollup")
+         return self.groupby(columns, option="rollup", include_grouping_columns=include_grouping_columns)
+
+     # Metadata functions for DataFrame created on datalake/OTF table.
+     @property
+     @collect_queryband(queryband="DF_snpsht")
+     @df_utils.check_otf_dataframe()
+     def snapshots(self):
+         """
+         DESCRIPTION:
+             Gets snapshot information for a DataLake table.
+
+         PARAMETERS:
+             None
+
+         RETURNS:
+             teradataml DataFrame.
+
+         RAISES:
+             TeradataMlException.
+
+         EXAMPLES:
+             # Example 1: Get the snapshot information for datalake table.
+             >>> from teradataml.dataframe.dataframe import in_schema
+             >>> in_schema_tbl = in_schema(schema_name="datalake_db",
+             ...                           table_name="datalake_table",
+             ...                           datalake_name="datalake")
+             >>> datalake_df = DataFrame(in_schema_tbl)
+             >>> datalake_df.snapshots
+                          snapshotId    snapshotTimestamp  timestampMSecs                                       manifestList                                             summary
+             0   6373759902296319074  2023-06-15 00:07:47   1686787667420  s3://vim-iceberg-v1/glue/metadata/snap-6373759...  {"added-data-files":"1","added-records":"5","a...}
+             1   4768076782814510171  2023-06-15 00:09:01   1686787741964  s3://vim-iceberg-v1/glue/metadata/snap-4768076...  {"added-data-files":"1","added-records":"2","a...}
+             2   7771482207931850214  2024-05-29 04:59:09   1716958749946  s3://vim-iceberg-v1/glue/metadata/snap-7771482...  {"deleted-data-files":"2","deleted-records":"7...}
+             3   1545363077953282623  2024-05-29 05:13:39   1716959619455  s3://vim-iceberg-v1/glue/metadata/snap-1545363...  {"changed-partition-count":"0","total-records"...}
+             4   2166707884289108360  2024-05-29 05:17:49   1716959869075  s3://vim-iceberg-v1/glue/metadata/snap-2166707...  {"changed-partition-count":"0","total-records"...}
+             5   8934190131471882700  2024-05-29 05:21:32   1716960092422  s3://vim-iceberg-v1/glue/metadata/snap-8934190...  {"changed-partition-count":"0","total-records"...}
+             6   3086605171258231948  2024-05-29 05:34:43   1716960883786  s3://vim-iceberg-v1/glue/metadata/snap-3086605...  {"changed-partition-count":"0","total-records"...}
+             7   7592503716012384122  2024-05-29 06:04:48   1716962688047  s3://vim-iceberg-v1/glue/metadata/snap-7592503...  {"changed-partition-count":"0","total-records"...}
+             8   2831061717890032890  2024-06-04 17:21:01   1717521661689  s3://vim-iceberg-v1/glue/metadata/snap-2831061...  {"added-data-files":"2","added-records":"7","a...}
+             9   8810491341502972715  2024-10-22 23:47:22   1729640842067  s3://vim-iceberg-v1/glue/metadata/snap-8810491...  {"added-data-files":"1","added-records":"1","a...}
+             10  3953136136558551163  2024-12-03 04:40:48   1733200848733  s3://vim-iceberg-v1/glue/metadata/snap-3953136...  {"added-data-files":"1","added-records":"4","a...}
+             11  6034775168901969481  2024-12-03 04:40:49   1733200849966  s3://vim-iceberg-v1/glue/metadata/snap-6034775...  {"deleted-data-files":"1","deleted-records":"5...}
+         """
+         return self._execute_metadata_query_and_generate_dataframe("TD_SNAPSHOTS")
+
+     @property
+     @collect_queryband(queryband="DF_prttns")
+     @df_utils.check_otf_dataframe()
+     def partitions(self):
+         """
+         DESCRIPTION:
+             Gets partition information for a DataLake table.
+
+         PARAMETERS:
+             None

+         RETURNS:
+             teradataml DataFrame.
+
+         RAISES:
+             TeradataMlException.
+
+         EXAMPLES:
+             # Example 1: Get the partition information for datalake table.
+             >>> from teradataml.dataframe.dataframe import in_schema
+             >>> in_schema_tbl = in_schema(schema_name="datalake_db",
+             ...                           table_name="datalake_table",
+             ...                           datalake_name="datalake")
+             >>> datalake_df = DataFrame(in_schema_tbl)
+             >>> datalake_df.partitions
+                  id name
+             0  1000   c2
+             1  1001   c3
+         """
+         return self._execute_metadata_query_and_generate_dataframe("TD_PARTITIONS")
+
+     @property
+     @collect_queryband(queryband="DF_mnfsts")
+     @df_utils.check_otf_dataframe()
+     def manifests(self):
+         """
+         DESCRIPTION:
+             Gets manifest information for a DataLake table.
+
+         PARAMETERS:
+             None
+
+         RETURNS:
+             teradataml DataFrame.
+
+         RAISES:
+             TeradataMlException.
+
+         EXAMPLES:
+             # Example 1: Get the manifest information for datalake table.
+             >>> from teradataml.dataframe.dataframe import in_schema
+             >>> in_schema_tbl = in_schema(schema_name="datalake_db",
+             ...                           table_name="datalake_table",
+             ...                           datalake_name="datalake")
+             >>> datalake_df = DataFrame(in_schema_tbl)
+             >>> datalake_df.manifests
+                         snapshotId    snapshotTimestamp                                manifestList                                manifestFile  manifestFileLength  datafilecount  totalrowcount
+             0  8068130797628952520  2025-05-02 11:45:26  s3://vim-iceberg-v1/otftestdb/nt_sales/...  s3://vim-iceberg-v1/otftestdb/nt_sales/...                7158              6              6
+         """
+         return self._execute_metadata_query_and_generate_dataframe("TD_MANIFESTS")
+
+     @property
+     @collect_queryband(queryband="DF_hstry")
+     @df_utils.check_otf_dataframe()
+     def history(self):
+         """
+         DESCRIPTION:
+             Gets the snapshot history related to a DataLake table.
+
+         PARAMETERS:
+             None
+
+         RETURNS:
+             teradataml DataFrame.
+
+         RAISES:
+             TeradataMlException.
+
+         EXAMPLES:
+             # Example 1: Get the snapshot history for datalake table.
+             >>> from teradataml.dataframe.dataframe import in_schema
+             >>> in_schema_tbl = in_schema(schema_name="datalake_db",
+             ...                           table_name="datalake_table",
+             ...                           datalake_name="datalake")
+             >>> datalake_df = DataFrame(in_schema_tbl)
+             >>> datalake_df.history
+                                 id            timestamp
+             0  8068130797628952520  2025-05-02 11:45:26
+         """
+         return self._execute_metadata_query_and_generate_dataframe("TD_HISTORY")
+
+     def _execute_metadata_query_and_generate_dataframe(self, func_name):
+         """Executes an OTF metadata query and returns the result as a teradataml DataFrame."""
+         query = SQLBundle()._get_sql_query(SQLConstants.SQL_TD_OTF_METADATA).format(func_name,
+                                                                                    self._table_name)
+         return DataFrame.from_query(query)
+
+     @collect_queryband(queryband="DF_gt_snpsht")
+     @df_utils.check_otf_dataframe()
+     def get_snapshot(self, as_of):
+         """
+         DESCRIPTION:
+             Gets the data from a DataLake table for the given snapshot id or timestamp string.
+             Notes:
+                 * The snapshot id can be obtained from the 'snapshots' property of the DataFrame.
+                 * The time travel value represented by 'as_of' should be in the format
+                   "YYYY-MM-DD HH:MM:SS.FFFFFFF" for a TIMESTAMP string or "YYYY-MM-DD" for a DATE string.
+
+         PARAMETERS:
+             as_of:
+                 Required Argument.
+                 Specifies the snapshot id or timestamp information for which the snapshot is to be fetched.
+                 Types: str or int
+
+         RETURNS:
+             teradataml DataFrame.
+
+         RAISES:
+             TeradataMlException.
+
+         EXAMPLES:
+             # DataFrame creation on OTF table.
+             >>> from teradataml.dataframe.dataframe import in_schema
+             >>> in_schema_tbl = in_schema(schema_name="datalake_db",
+             ...                           table_name="datalake_table",
+             ...                           datalake_name="datalake")
+             >>> datalake_df = DataFrame(in_schema_tbl)
+
+             # List snapshots first.
+             >>> datalake_df.snapshots
+                          snapshotId    snapshotTimestamp  timestampMSecs                                                         manifestList                                             summary
+             2046682612111137809  2025-06-03 13:26:15   1748957175692  s3://vim-iceberg-v1/datalake_db/datalake_table/metadata/snap-204...  {"added-data-files":"Red Inc","added-records"...}
+             282293708812257203   2025-06-03 05:53:19   1748929999245  s3://vim-iceberg-v1/datalake_db/datalake_table/metadata/snap-282...  {"added-data-files":"Blue Inc","added-records"...}
+
+             # Example 1: Get the snapshot using snapshot id.
+             >>> datalake_df.get_snapshot(2046682612111137809)
+                           Feb    Jan    Mar    Apr    datetime
+             accounts
+             Blue Inc     90.0   50.0   95.0  101.0  04/01/2017
+             Alpha Co    210.0  200.0  215.0  250.0  04/01/2017
+             Jones LLC   200.0  150.0  140.0  180.0  04/01/2017
+             Yellow Inc   90.0    NaN    NaN    NaN  04/01/2017
+             Orange Inc  210.0    NaN    NaN  250.0  04/01/2017
+             Red Inc     200.0  150.0  140.0    NaN  04/01/2017
+
+             # Example 2: Get the snapshot using snapshot id in string format.
+             >>> datalake_df.get_snapshot("2046682612111137809")
+                           Feb    Jan    Mar    Apr    datetime
+             accounts
+             Blue Inc     90.0   50.0   95.0  101.0  04/01/2017
+             Alpha Co    210.0  200.0  215.0  250.0  04/01/2017
+             Jones LLC   200.0  150.0  140.0  180.0  04/01/2017
+             Yellow Inc   90.0    NaN    NaN    NaN  04/01/2017
+             Orange Inc  210.0    NaN    NaN  250.0  04/01/2017
+             Red Inc     200.0  150.0  140.0    NaN  04/01/2017
+
+             # Example 3: Get the snapshot using timestamp string.
+             >>> datalake_df.get_snapshot("2025-06-03 13:26:16")
+                           Feb    Jan    Mar    Apr    datetime
+             accounts
+             Blue Inc     90.0   50.0   95.0  101.0  04/01/2017
+             Alpha Co    210.0  200.0  215.0  250.0  04/01/2017
+             Jones LLC   200.0  150.0  140.0  180.0  04/01/2017
+             Yellow Inc   90.0    NaN    NaN    NaN  04/01/2017
+             Orange Inc  210.0    NaN    NaN  250.0  04/01/2017
+             Red Inc     200.0  150.0  140.0    NaN  04/01/2017
+
+             # Example 4: Get the snapshot using date string.
+             >>> datalake_df.get_snapshot("2025-06-04")
+                           Feb    Jan    Mar    Apr    datetime
+             accounts
+             Blue Inc     90.0   50.0   95.0  101.0  04/01/2017
+             Alpha Co    210.0  200.0  215.0  250.0  04/01/2017
+             Jones LLC   200.0  150.0  140.0  180.0  04/01/2017
+             Yellow Inc   90.0    NaN    NaN    NaN  04/01/2017
+             Orange Inc  210.0    NaN    NaN  250.0  04/01/2017
+             Red Inc     200.0  150.0  140.0    NaN  04/01/2017
+         """
+         _Validators._validate_function_arguments([["as_of", as_of, False, (int, str)]])
+
+         # If as_of is already an int or a string representation of an int, quote it.
+         if isinstance(as_of, int) or (isinstance(as_of, str) and as_of.isdigit()):
+             snapshot_on = "'{}'".format(as_of)
+         else:
+             try:
+                 snapshot_on = UtilFuncs._get_time_formatted_string(as_of)
+             except ValueError as e:
+                 raise TeradataMlException(Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
+                     "get_snapshot", "Invalid value for 'as_of' argument: {}. "
+                     "Use valid format [\"YYYY-MM-DD HH:MM:SS.FFFFFFF\", \"YYYY-MM-DD HH:MM:SS\", "
+                     "\"YYYY-MM-DD\"]".format(as_of)),
+                     MessageCodes.FUNC_EXECUTION_FAILED)
+
+         query = SQLBundle()._get_sql_query(SQLConstants.SQL_TD_OTF_SNAPSHOT).format(self._table_name, snapshot_on)
+
+         try:
+             return DataFrame.from_query(query)
+         except TeradataMlException as e:
+             raise TeradataMlException(Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
+                 "get_snapshot()", "Invalid value for 'as_of' argument: {}. "
+                 "Use a valid timestamp or a correct snapshot id listed by the 'snapshots' property.".format(as_of)),
+                 MessageCodes.FUNC_EXECUTION_FAILED)

  class DataFrameGroupBy(DataFrame):
      """
@@ -15450,7 +16338,7 @@ class DataFrameGroupBy(DataFrame):

      """

-     def __init__(self, nodeid, metaexpr, column_names_and_types, columns, groupbyexpr, column_list, option=None):
+     def __init__(self, nodeid, metaexpr, column_names_and_types, columns, groupbyexpr, column_list, option=None, include_grouping_columns=False):
          """
          init() method for DataFrameGroupBy.

@@ -15491,6 +16379,15 @@ class DataFrameGroupBy(DataFrame):
                  Permitted Values: "CUBE", "ROLLUP", None
                  Types: str or NoneType

+             include_grouping_columns:
+                 Optional Argument.
+                 Specifies whether to include aggregations on the grouping column(s) or not.
+                 When set to True, the resultant DataFrame will have the aggregations on the
+                 columns mentioned in "columns". Otherwise, the resultant DataFrame will not
+                 have aggregations on the columns mentioned in "columns".
+                 Default Value: False
+                 Types: bool
+
          RETURNS:
              teradataml DataFrameGroupBy instance
          """
@@ -15500,6 +16397,7 @@ class DataFrameGroupBy(DataFrame):
          self._column_names_and_types = column_names_and_types
          self._columns = columns
          self.groupby_column_list = column_list
+         self._include_grouping_columns = include_grouping_columns

      def _get_assign_allowed_types(self):
          """