teradataml 20.0.0.4__py3-none-any.whl → 20.0.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/README.md +182 -13
- teradataml/__init__.py +2 -1
- teradataml/_version.py +2 -2
- teradataml/analytics/analytic_function_executor.py +8 -13
- teradataml/analytics/json_parser/analytic_functions_argument.py +4 -0
- teradataml/analytics/sqle/__init__.py +16 -1
- teradataml/analytics/utils.py +60 -1
- teradataml/automl/__init__.py +290 -106
- teradataml/automl/autodataprep/__init__.py +471 -0
- teradataml/automl/data_preparation.py +29 -10
- teradataml/automl/data_transformation.py +11 -0
- teradataml/automl/feature_engineering.py +64 -4
- teradataml/automl/feature_exploration.py +639 -25
- teradataml/automl/model_training.py +1 -1
- teradataml/clients/auth_client.py +12 -8
- teradataml/clients/keycloak_client.py +165 -0
- teradataml/common/constants.py +71 -26
- teradataml/common/exceptions.py +32 -0
- teradataml/common/messagecodes.py +28 -0
- teradataml/common/messages.py +13 -4
- teradataml/common/sqlbundle.py +3 -2
- teradataml/common/utils.py +345 -45
- teradataml/context/context.py +259 -93
- teradataml/data/apriori_example.json +22 -0
- teradataml/data/docs/sqle/docs_17_20/Apriori.py +138 -0
- teradataml/data/docs/sqle/docs_17_20/NERExtractor.py +121 -0
- teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +3 -3
- teradataml/data/docs/sqle/docs_17_20/SMOTE.py +212 -0
- teradataml/data/docs/sqle/docs_17_20/TextMorph.py +119 -0
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +54 -3
- teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/DFFT.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFT2.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFTConv.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +6 -6
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/MultivarRegr.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/PACF.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/PowerTransform.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/Resample.py +5 -5
- teradataml/data/docs/uaf/docs_17_20/SAX.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/SimpleExp.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Smoothma.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/UNDIFF.py +1 -1
- teradataml/data/jsons/byom/onnxembeddings.json +1 -0
- teradataml/data/jsons/sqle/17.20/NGramSplitter.json +6 -6
- teradataml/data/jsons/sqle/17.20/TD_Apriori.json +181 -0
- teradataml/data/jsons/sqle/17.20/TD_NERExtractor.json +145 -0
- teradataml/data/jsons/sqle/17.20/TD_SMOTE.json +267 -0
- teradataml/data/jsons/sqle/17.20/TD_TextMorph.json +134 -0
- teradataml/data/jsons/sqle/17.20/TD_TextParser.json +114 -9
- teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +420 -0
- teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +343 -0
- teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +359 -0
- teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +360 -0
- teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +343 -0
- teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +343 -0
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +2 -2
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +1 -1
- teradataml/data/ner_dict.csv +8 -0
- teradataml/data/ner_input_eng.csv +7 -0
- teradataml/data/ner_rule.csv +5 -0
- teradataml/data/pattern_matching_data.csv +11 -0
- teradataml/data/pos_input.csv +40 -0
- teradataml/data/sdk/modelops/modelops_spec.json +101737 -0
- teradataml/data/tdnerextractor_example.json +14 -0
- teradataml/data/teradataml_example.json +21 -1
- teradataml/data/textmorph_example.json +5 -0
- teradataml/data/to_num_data.csv +4 -0
- teradataml/data/tochar_data.csv +5 -0
- teradataml/data/trans_dense.csv +16 -0
- teradataml/data/trans_sparse.csv +55 -0
- teradataml/data/url_data.csv +10 -9
- teradataml/dataframe/copy_to.py +38 -27
- teradataml/dataframe/data_transfer.py +61 -45
- teradataml/dataframe/dataframe.py +1110 -132
- teradataml/dataframe/dataframe_utils.py +73 -27
- teradataml/dataframe/functions.py +1070 -9
- teradataml/dataframe/sql.py +750 -959
- teradataml/dbutils/dbutils.py +33 -13
- teradataml/dbutils/filemgr.py +14 -10
- teradataml/hyperparameter_tuner/utils.py +4 -2
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/_base.py +12 -157
- teradataml/options/configure.py +24 -9
- teradataml/scriptmgmt/UserEnv.py +317 -39
- teradataml/scriptmgmt/lls_utils.py +456 -135
- teradataml/sdk/README.md +79 -0
- teradataml/sdk/__init__.py +4 -0
- teradataml/sdk/_auth_modes.py +422 -0
- teradataml/sdk/_func_params.py +487 -0
- teradataml/sdk/_json_parser.py +453 -0
- teradataml/sdk/_openapi_spec_constants.py +249 -0
- teradataml/sdk/_utils.py +236 -0
- teradataml/sdk/api_client.py +897 -0
- teradataml/sdk/constants.py +62 -0
- teradataml/sdk/modelops/__init__.py +98 -0
- teradataml/sdk/modelops/_client.py +406 -0
- teradataml/sdk/modelops/_constants.py +304 -0
- teradataml/sdk/modelops/models.py +2308 -0
- teradataml/sdk/spinner.py +107 -0
- teradataml/store/__init__.py +1 -1
- teradataml/table_operators/Apply.py +16 -1
- teradataml/table_operators/Script.py +20 -1
- teradataml/table_operators/query_generator.py +4 -21
- teradataml/table_operators/table_operator_util.py +58 -9
- teradataml/utils/dtypes.py +4 -2
- teradataml/utils/internal_buffer.py +22 -2
- teradataml/utils/utils.py +0 -1
- teradataml/utils/validators.py +318 -58
- {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.6.dist-info}/METADATA +188 -14
- {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.6.dist-info}/RECORD +131 -84
- {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.6.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.6.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.6.dist-info}/zip-safe +0 -0
(In the reconstructed hunks below, deleted lines whose text was truncated in the diff view are shown as `…`.)

```diff
--- teradataml/dataframe/dataframe.py (20.0.0.4)
+++ teradataml/dataframe/dataframe.py (20.0.0.6)
@@ -12,63 +12,72 @@ This file implements the teradataml dataframe.
 A teradataml dataframe maps virtually to teradata tables and views.
 """
 import decimal
-import inspect
+import inspect
+import itertools
 import json
 import numbers
-import pandas as pd
 import re
-import sqlalchemy
 import sys
 import urllib.parse
+from collections import OrderedDict
+from collections.abc import Iterator
 
+import numpy as np
+import pandas as pd
+import sqlalchemy
 from sqlalchemy import Column
+from sqlalchemy.exc import NoSuchColumnError
+from sqlalchemy.sql import ClauseElement
+from teradatasql import OperationalError
+from teradatasqlalchemy.dialect import dialect as td_dialect
+from teradatasqlalchemy.dialect import preparer
+from teradatasqlalchemy.types import (BIGINT, BYTEINT, DECIMAL, FLOAT, INTEGER,
+                                      PERIOD_TIMESTAMP, SMALLINT, _TDType)
 
 import teradataml.context.context as tdmlctx
-
-from …
-
-from teradataml import …
-
-
-
-
-
-from teradatasqlalchemy.types import _TDType, BIGINT, INTEGER, PERIOD_TIMESTAMP, SMALLINT, BYTEINT, FLOAT, DECIMAL
-from teradataml.common.deprecations import argument_deprecation
-from teradataml.common.utils import UtilFuncs
+from teradataml import GarbageCollector, execute_sql
+from teradataml.common.bulk_exposed_utils import \
+    _validate_unimplemented_function
+from teradataml.common.constants import (AEDConstants, OutputStyle,
+                                         PTITableConstants, PythonTypes,
+                                         SourceType, SQLConstants,
+                                         SQLFunctionConstants,
+                                         TableOperatorConstants,
+                                         TeradataConstants, TeradataTypes)
 from teradataml.common.exceptions import TeradataMlException
-from teradataml.common.messages import Messages
 from teradataml.common.messagecodes import MessageCodes
-from teradataml.common.…
-from teradataml.common.…
-
-from teradataml.dataframe.dataframe_utils import DataFrameUtils as df_utils, DataFrameUtils
-from teradataml.dataframe.indexer import _LocationIndexer
-from teradataml.common.aed_utils import AedUtils
-from teradataml.options.display import display
-from teradataml.options.configure import configure
+from teradataml.common.messages import Messages
+from teradataml.common.sqlbundle import SQLBundle
+from teradataml.common.utils import UtilFuncs
 from teradataml.dataframe.copy_to import copy_to_sql
+from teradataml.dataframe.data_transfer import _DataTransferUtils
+from teradataml.dataframe.dataframe_utils import DataFrameUtils
+from teradataml.dataframe.dataframe_utils import DataFrameUtils as df_utils
+from teradataml.dataframe.indexer import _LocationIndexer
 from teradataml.dataframe.row import _Row
 from teradataml.dataframe.setop import concat
+from teradataml.dataframe.sql import _MetaExpression
+from teradataml.dataframe.sql_functions import case
+from teradataml.dataframe.sql_interfaces import ColumnExpression
+from teradataml.dataframe.window import Window
 from teradataml.dbutils.dbutils import list_td_reserved_keywords
+from teradataml.options.configure import configure
+from teradataml.options.display import display
 from teradataml.plot.plot import _Plot
 from teradataml.scriptmgmt.UserEnv import UserEnv
-from teradataml.…
-from teradataml.utils.validators import _Validators
+from teradataml.series.series import Series
 from teradataml.table_operators.table_operator_util import _TableOperatorUtils
-from teradatasqlalchemy.dialect import preparer, dialect as td_dialect
-from teradatasql import OperationalError
-from teradataml.dataframe.window import Window
-from teradataml.dataframe.data_transfer import _DataTransferUtils
-from teradataml.common.bulk_exposed_utils import _validate_unimplemented_function
 from teradataml.telemetry_utils.queryband import collect_queryband
-from teradataml.…
-from teradataml.utils.…
-
+from teradataml.utils.dtypes import _Dtypes, _ListOf, _TupleOf
+from teradataml.utils.validators import _Validators
+
+# Adding imports at the end to avoid circular imports.
+from teradataml.common.aed_utils import AedUtils
 
 # TODO use logger when available on master branch
 # logger = teradatapylog.getLogger()
 
+
 class in_schema:
     """
     Class takes a schema name, a table name and datalake name attributes
```
```diff
@@ -149,26 +158,37 @@ class DataFrame():
     on tables, views, and queries on Teradata Vantage.
     """
 
-    def __init__(self, …
+    def __init__(self, data=None, index=True, index_label=None, query=None, materialize=False, **kwargs):
         """
         Constructor for teradataml DataFrame.
 
         PARAMETERS:
-            …
+            data:
                 Optional Argument.
-                …
-                …
+                Specifies the input data to create a teradataml DataFrame.
+                Notes:
+                    If a dictionary is provided, it must follow the below requirements:
+                        * Keys must be strings (column names).
+                        * Values must be lists of equal length (column data).
+                        * Nested dictionaries are not supported.
+                Types: str OR pandas DataFrame OR in_schema OR numpy array OR list OR dictionary
 
             index:
                 Optional Argument.
-                …
+                If "data" is a string, then the argument specifies whether to use the index column
+                for sorting or not.
+                If "data" is a pandas DataFrame, then this argument specifies whether to
+                save Pandas DataFrame index as a column or not.
                 Default Value: True
                 Types: bool
 
             index_label:
                 Optional Argument.
-                …
-                …
+                If "data" is a string, then the argument specifies column(s) used for sorting.
+                If "data" is a pandas DataFrame, then the default behavior is applied.
+                Note:
+                    * Refer to the "index_label" parameter of copy_to_sql() for details on the default behaviour.
+                Types: str OR list of str
 
             query:
                 Optional Argument.
@@ -187,29 +207,127 @@ class DataFrame():
                 Default Value: False (No materialization)
                 Types: bool
 
+            kwargs:
+                table_name:
+                    Optional Argument.
+                    The table name or view name in Teradata Vantage referenced by this DataFrame.
+                    Note:
+                        * If "data" and "table_name" are both specified, then the "table_name" argument is ignored.
+                    Types: str or in_schema
+
+                primary_index:
+                    Optional Argument.
+                    Specifies which column(s) to use as primary index for the teradataml DataFrame.
+                    Note:
+                        * This argument is only applicable when creating a DataFrame from a pandas DataFrame.
+                    Types: str OR list of str
+
+                types:
+                    Optional Argument.
+                    Specifies required data types for requested columns to be saved in Teradata Vantage.
+                    Notes:
+                        * This argument is not applicable when "data" argument is of type str or in_schema.
+                        * Refer to the "types" parameter of copy_to_sql() for more details.
+                    Types: dict
+
+                columns:
+                    Optional Argument.
+                    Specifies the names of the columns to be used in the DataFrame.
+                    Notes:
+                        * This argument is not applicable when "data" argument is of type str or in_schema.
+                        * If "data" is a dictionary and this argument is specified, only the specified columns will be
+                          included in the DataFrame if the dictionary contains those keys. If the dictionary does not
+                          contain the specified keys, those columns will be added with NaN values.
+                    Types: str OR list of str
+
         EXAMPLES:
-            from teradataml.dataframe.dataframe import DataFrame
+            >>> from teradataml.dataframe.dataframe import DataFrame
+            >>> import pandas as pd
 
-            # Example 1:
-
-            # Created DataFrame using table name.
-            df = DataFrame("mytab")
+            # Example 1: Create a teradataml DataFrame from table name.
+            >>> df = DataFrame("mytab")
 
-            #
-            df = DataFrame("myview")
+            # Example 2: Create a teradataml DataFrame from view name.
+            >>> df = DataFrame("myview")
 
-            #
-            df = DataFrame("myview", False)
+            # Example 3: Create a teradataml DataFrame using view name without using index column for sorting.
+            >>> df = DataFrame("myview", False)
 
-            #
-            …
+            # Example 4: Create a teradataml DataFrame using table name and consider columns Col1 and Col2
+            #            while running DataFrame.head() or DataFrame.tail() methods.
+            >>> df = DataFrame("mytab", True, ["Col1", "Col2"])
 
+            # Example 5: Create a teradataml DataFrame from the existing Vantage table "dbcinfo"
+            #            in the non-default database "dbc" using the in_schema() object.
+            >>> from teradataml.dataframe.dataframe import in_schema
+            >>> df = DataFrame(in_schema("dbc", "dbcinfo"))
 
-            # Example …
-
-
-
-
+            # Example 6: Create a teradataml DataFrame from a pandas DataFrame.
+            >>> pdf = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
+            >>> df = DataFrame(pdf)
+            >>> df
+               col1  col2  index_label
+            0     3     6            2
+            1     2     5            1
+            2     1     4            0
+
+            # Example 7: Create a teradataml DataFrame from a pandas DataFrame without index column.
+            >>> pdf = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
+            >>> df = DataFrame(data=pdf, index=False)
+            >>> df
+               col1  col2
+            0     3     6
+            1     2     5
+            2     1     4
+
+            # Example 8: Create a teradataml DataFrame from a pandas DataFrame with
+            #            index label and primary index as 'id'.
+            >>> pdf = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
+            >>> df = DataFrame(pdf, index=True, index_label='id', primary_index='id')
+            >>> df
+                col1  col2
+            id
+            2      3     6
+            1      2     5
+            0      1     4
+
+            # Example 9: Create a teradataml DataFrame from list of lists.
+            >>> df = DataFrame([[1, 2], [3, 4]])
+            >>> df
+               col_0  col_1  index_label
+            0      3      4            1
+            1      1      2            0
+
+            # Example 10: Create a teradataml DataFrame from numpy array.
+            >>> import numpy as np
+            >>> df = DataFrame(np.array([[1, 2], [3, 4]]), index=True, index_label="id")
+            >>> df
+                col_0  col_1
+            id
+            1       3      4
+            0       1      2
+
+            # Example 11: Create a teradataml DataFrame from a dictionary.
+            >>> df = DataFrame({"col1": [1, 2], "col2": [3, 4]}, index=True, index_label="id")
+            >>> df
+                col1  col2
+            id
+            1      2     4
+            0      1     3
+
+            # Example 12: Create a teradataml DataFrame from list of dictionaries.
+            >>> df = DataFrame([{"col1": 1, "col2": 2}, {"col1": 3, "col2": 4}], index=False)
+            >>> df
+               col1  col2
+            0     3     4
+            1     1     2
+
+            # Example 13: Create a teradataml DataFrame from list of tuples.
+            >>> df = DataFrame([("Alice", 1), ("Bob", 2)])
+            >>> df
+               col_0  col_1  index_label
+            0  Alice      1            1
+            1    Bob      2            0
 
         RAISES:
             TeradataMlException - TDMLDF_CREATE_FAIL
@@ -243,17 +361,35 @@ class DataFrame():
         # Property to determine if table is an ART table or not.
         self._is_art = None
 
+        # This attribute stores the previous assign arguments in continuous assign calls.
+        self._previous_assign_args = None
+        # This attribute stores the root DataFrame columns.
+        self._root_columns = None
+
         self._datalake = None
         self._database = None
         self._table = None
         self._otf = False
 
-        …
-        …
-        …
-        …
+        table_name = kwargs.get("table_name", None)
+        primary_index = kwargs.get("primary_index", None)
+        columns = kwargs.get("columns", None)
+        types = kwargs.get("types", None)
+
+        # Check if the data is an instance of in_schema or if the data is None
+        # and table_name is an instance of in_schema, then assign the table_name,
+        # datalake_name and schema_name to the DataFrame object.
+        schema_obj = data if isinstance(data, in_schema) else (
+            table_name if data is None and isinstance(table_name, in_schema) else None)
+
+        if schema_obj:
+            self._table = schema_obj.table_name
+            self._datalake = schema_obj.datalake_name
+            self._database = schema_obj.schema_name
             self._otf = True if self._datalake else False
 
+        # Convert schema objects to strings.
+        data = str(data) if isinstance(data, in_schema) else data
         table_name = str(table_name) if isinstance(table_name, in_schema) else table_name
 
         # Below matrix is list of list, where in each row contains following elements:
@@ -272,18 +408,49 @@ class DataFrame():
         # 6. element6 --> A list of permitted values, an argument can accept.
         # If not specified, it is as good as passing None. If a list is passed, validation will be
         # performed for permitted values.
+
         awu_matrix = []
-        …
+        dtypes = (list, tuple, dict)
+        awu_matrix.append(["data", data, True, (str, pd.DataFrame, np.ndarray, dict, _ListOf(dtypes)), True])
         awu_matrix.append(["index", index, True, (bool)])
         awu_matrix.append(["index_label", index_label, True, (str, list)])
         awu_matrix.append(["query", query, True, (str), True])
         awu_matrix.append(["materialize", materialize, True, (bool)])
+        awu_matrix.append(["table_name", table_name, True, (str), True])
+        awu_matrix.append(["primary_index", primary_index, True, (str, list)])
+        awu_matrix.append(["types", types, True, (dict)])
+        awu_matrix.append(["columns", columns, True, (str, list), True])
 
         # Validate argument types
         _Validators._validate_function_arguments(awu_matrix)
 
+        # Convert columns to list if it is a string.
+        if isinstance(columns, str):
+            columns = [columns]
+
         try:
-            if table_name is not None:
+            if table_name is not None or data is not None:
+
+                # If data is list or numpy array or dictionary, then convert it to a pandas DataFrame.
+                if isinstance(data, (list, np.ndarray, dict)):
+                    data = pd.DataFrame(data, columns=columns)
+                # If the data is a pandas DataFrame, then store the data in a temporary table in Vantage.
+                if isinstance(data, pd.DataFrame):
+                    # Create a copy of the pandas DataFrame to avoid modifying the original,
+                    # because column names will be changed if they are integers.
+                    pd_data = data.copy()
+                    # If the columns are not of type string, then convert them to string.
+                    pd_data.columns = [f"col_{i}" if isinstance(i, int) else i for i in pd_data.columns]
+                    # Set the table_name to the name of the table created in the database.
+                    table_name = UtilFuncs._generate_temp_table_name(prefix="from_pandas",
+                                                                     table_type=TeradataConstants.TERADATA_TABLE)
+
+                    copy_to_sql(pd_data, table_name, index=index, index_label=index_label, primary_index=primary_index,
+                                types=types)
+                # If the data is a string, then set the table_name to the data.
+                elif isinstance(data, str):
+                    table_name = data
+
                 self._table_name = UtilFuncs._quote_table_names(table_name)
                 self._source_type = SourceType.TABLE.value
                 self._nodeid = self._aed_utils._aed_table(self._table_name)
```
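The constructor hunks above show 20.0.0.6 accepting in-memory data directly: list, dictionary, and numpy input is converted to a pandas DataFrame and materialized into a temporary table via copy_to_sql(). A minimal sketch of the equivalent entry points, assuming a Vantage context has already been created with create_context() (connection details and column names are placeholders, not from the diff):

```python
import pandas as pd
from teradataml import DataFrame

# Both calls materialize the same kind of temporary table under the hood:
# dict -> pd.DataFrame -> copy_to_sql -> DataFrame over the temp table.
df_from_dict = DataFrame({"col1": [1, 2], "col2": [3, 4]}, index=False)
df_from_pdf = DataFrame(pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}), index=False)
```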
```diff
@@ -337,6 +504,12 @@ class DataFrame():
             elif "[Error 3706] Syntax error" in str(oe):
                 raise ValueError(Messages.get_message(
                     MessageCodes.FROM_QUERY_SELECT_SUPPORTED).format("Check the syntax."))
+            elif "[Error 7825]" in str(oe):
+                # The UDF/XSP/UDM routine has thrown an SQLException
+                # with an SQL state in the range of 38001-38999 which
+                # is not a syntax error. Hence not a ValueError wrt query string.
+                # Expected when OTF snapshot related query is executed.
+                raise
             raise ValueError(Messages.get_message(
                 MessageCodes.FROM_QUERY_SELECT_SUPPORTED))
@@ -498,7 +671,7 @@ class DataFrame():
                 Types: str
 
         EXAMPLES:
-            >>> from teradataml…
+            >>> from teradataml import DataFrame
 
             # Example 1: The following example creates a DataFrame from a table or
             a view.
@@ -538,9 +711,9 @@ class DataFrame():
 
         """
         if schema_name:
-            return cls(in_schema(schema_name, table_name, datalake_name)…
-            …
-        return cls(table_name, index, index_label)
+            return cls(table_name=in_schema(schema_name, table_name, datalake_name),
+                       index=index, index_label=index_label)
+        return cls(table_name=table_name, index=index, index_label=index_label)
 
     @classmethod
     @collect_queryband(queryband="DF_fromQuery")
```
```diff
@@ -687,6 +860,300 @@ class DataFrame():
             df.__setattr__(arg, arg_value)
         return df
 
+    @classmethod
+    @collect_queryband(queryband="DF_fromPandas")
+    def from_pandas(cls, pandas_df, index=True, index_label=None, primary_index=None):
+        """
+        DESCRIPTION:
+            Creates a teradataml DataFrame from a pandas DataFrame.
+
+        PARAMETERS:
+            pandas_df:
+                Required Argument.
+                Specifies the pandas DataFrame to be converted to teradataml DataFrame.
+                Types: pandas DataFrame
+
+            index:
+                Optional Argument.
+                Specifies whether to save Pandas DataFrame index as a column or not.
+                Default Value: True
+                Types: bool
+
+            index_label:
+                Optional Argument.
+                Specifies the column label(s) for Pandas DataFrame index column(s).
+                Note:
+                    * Refer to the "index_label" parameter of copy_to_sql() for more details.
+                Default Value: None
+                Types: str OR list of str
+
+            primary_index:
+                Optional Argument.
+                Specifies which column(s) to use as primary index for the teradataml DataFrame.
+                Types: str OR list of str
+
+        RETURNS:
+            teradataml DataFrame
+
+        RAISES:
+            TeradataMlException
+
+        EXAMPLES:
+            >>> import pandas as pd
+            >>> from teradataml import DataFrame
+            >>> pdf = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
+            >>> pdf1 = pd.DataFrame([[1, 2], [3, 4]])
+
+            # Example 1: Create a teradataml DataFrame from a pandas DataFrame.
+            >>> df = DataFrame.from_pandas(pdf)
+            >>> df
+               col1  col2  index_label
+            0     3     6            2
+            1     2     5            1
+            2     1     4            0
+
+            # Example 2: Create a teradataml DataFrame from a pandas DataFrame
+            #            and do not save the index as a column.
+            >>> df = DataFrame.from_pandas(pdf, index=False)
+            >>> df
+               col1  col2
+            0     3     6
+            1     2     5
+            2     1     4
+
+            # Example 3: Create a teradataml DataFrame from a pandas DataFrame
+            #            with index label as 'id' and set it as primary index.
+            >>> df = DataFrame.from_pandas(pdf, index=True, index_label='id', primary_index='id')
+            >>> df
+                col1  col2
+            id
+            2      3     6
+            1      2     5
+            0      1     4
+
+            # Example 4: Create a teradataml DataFrame from a pandas DataFrame where
+            #            columns are not explicitly defined in the pandas DataFrame.
+            >>> df = DataFrame.from_pandas(pdf1)
+            >>> df
+               col_0  col_1  index_label
+            0      3      4            1
+            1      1      2            0
+        """
+        # Validate 'pandas_df' argument, other arguments, will be validated as part of DataFrame().
+        arg_type_matrix = []
+        arg_type_matrix.append(["pandas_df", pandas_df, False, (pd.DataFrame,), True])
+
+        _Validators._validate_function_arguments(arg_type_matrix)
+
+        return cls(pandas_df, index, index_label, primary_index=primary_index)
+
+    @classmethod
+    @collect_queryband(queryband="DF_fromDict")
+    def from_dict(cls, data, columns=None):
+        """
+        DESCRIPTION:
+            Creates a DataFrame from a dictionary containing values as lists or numpy arrays.
+
+        PARAMETERS:
+            data:
+                Required Argument.
+                Specifies the Python dictionary to create a teradataml DataFrame.
+                Notes:
+                    * Keys of the dictionary are used as column names.
+                    * Values of the dictionary should be lists or numpy arrays.
+                    * Nested dictionaries are not supported.
+                Types: dict
+
+            columns:
+                Optional Argument.
+                Specifies the column names for the DataFrame.
+                Types: str OR list of str
+
+        RETURNS:
+            teradataml DataFrame
+
+        RAISES:
+            TeradataMlException
+
+        EXAMPLES:
+            >>> from teradataml import DataFrame
+            >>> data_dict = {"name": ["Alice", "Bob", "Charlie"], "age": [25, 30, 28]}
+
+            # Example 1: Create a teradataml DataFrame from a dictionary where
+            #            keys are column names and values are lists of column data.
+            >>> df = DataFrame.from_dict(data_dict)
+            >>> df
+                  name  age
+            0  Charlie   28
+            1      Bob   30
+            2    Alice   25
+
+            # Example 2: Create a teradataml DataFrame from a dictionary where
+            #            keys are column names and values are numpy arrays.
+            >>> import numpy as np
+            >>> data_dict = {"col1": np.array([1, 2, 3]), "col2": np.array([4, 5, 6])}
+            >>> df = DataFrame.from_dict(data_dict)
+            >>> df
+               col1  col2
+            0     3     6
+            1     2     5
+            2     1     4
+        """
+        arg_type_matrix = []
+        arg_type_matrix.append(["data", data, False, (dict), True])
+        arg_type_matrix.append(["columns", columns, True, (str, list), True])
+
+        _Validators._validate_function_arguments(arg_type_matrix)
+
+        return cls(data, columns=columns, index=False)
+
+    @classmethod
+    @collect_queryband(queryband="DF_fromRecords")
+    def from_records(cls, data, columns=None, **kwargs):
+        """
+        DESCRIPTION:
+            Create a DataFrame from a list of lists/tuples/dictionaries/numpy arrays.
+
+        PARAMETERS:
+            data:
+                Required Argument.
+                Specifies the iterator of data or the list of lists/tuples/dictionaries/numpy arrays to
+                be converted to teradataml DataFrame.
+                Note:
+                    * Nested lists or tuples or dictionaries are not supported.
+                Types: Iterator, list
+
+            columns:
+                Optional Argument.
+                Specifies the column names for the DataFrame.
+                Note:
+                    * If the data is a list of lists/tuples/numpy arrays and this argument
+                      is not specified, column names will be auto-generated as 'col_0', 'col_1', etc.
+                Types: str OR list of str
+
+            kwargs:
+                exclude:
+                    Optional Argument.
+                    Specifies the columns to be excluded from the DataFrame.
+                    Types: list OR tuple
+
+                coerce_float:
+                    Optional Argument.
+                    Specifies whether to convert values of non-string, non-numeric objects (like decimal.Decimal)
+                    to floating point, useful for SQL result sets.
+                    Default Value: True
+                    Types: bool
+
+                nrows:
+                    Optional Argument.
+                    Specifies the number of rows to be read from the data if the data is iterator.
+                    Types: int
+
+        RETURNS:
+            teradataml DataFrame
+
+        RAISES:
+            TeradataMlException
+
+        EXAMPLES:
+            >>> from teradataml import DataFrame
+
+            # Example 1: Create a teradataml DataFrame from a list of lists.
+            >>> df = DataFrame.from_records([['Alice', 1], ['Bob', 2]], columns=['name', 'age'])
+            >>> df
+                name  age
+            0    Bob    2
+            1  Alice    1
+
+            # Example 2: Create a teradataml DataFrame from a list of tuples.
+            >>> df = DataFrame.from_records([('Alice', 1), ('Bob', 3)], columns=['name', 'age'])
+            >>> df
+                name  age
+            0    Bob    3
+            1  Alice    1
+
+            # Example 3: Create a teradataml DataFrame from a list of dictionaries.
+            >>> df = DataFrame.from_records([{'name': 'Alice', 'age': 4}, {'name': 'Bob', 'age': 2}])
+            >>> df
+                name  age
+            0    Bob    2
+            1  Alice    4
+
+            # Example 4: Create a teradataml DataFrame from a list where columns
+            #            are not explicitly defined.
+            >>> df = DataFrame.from_records([['Alice', 1], ['Bob', 2]])
+            >>> df
+               col_0  col_1
+            0    Bob      2
+            1  Alice      1
+
+            # Example 5: Create a teradataml DataFrame from a list by excluding 'grade' column.
+            >>> df = DataFrame.from_records([['Alice', 1, 'A'], ['Bob', 2, 'B']],
+            ...                             columns=['name', 'age', 'grade'],
+            ...                             exclude=['grade'])
+            >>> df
+                name  age
+            0    Bob    2
+            1  Alice    1
+
+            # Example 6: Create a teradataml DataFrame from a list of lists
+            #            with "coerce_float" set to False.
+            >>> df = DataFrame.from_records([[1, Decimal('2.5')], [3, Decimal('4.0')]],
+            ...                             columns=['col1', 'col2'], coerce_float=False)
+            >>> df
+               col1 col2
+            0     3  4.0
+            1     1  2.5
+            >>> df.tdtypes
+            col1                                    BIGINT()
+            col2    VARCHAR(length=1024, charset='UNICODE')
+
+            # Example 7: Create a teradataml DataFrame from a list of lists
+            #            with "coerce_float" set to True.
+            >>> from decimal import Decimal
+            >>> df = DataFrame.from_records([[1, Decimal('2.5')], [3, Decimal('4.0')]],
+            ...                             columns=['col1', 'col2'], coerce_float=True)
+            >>> df
+               col1  col2
+            0     3   4.0
+            1     1   2.5
+            >>> df.tdtypes
+            col1    BIGINT()
+            col2     FLOAT()
+
+            # Example 8: Create a teradataml DataFrame from an iterator with "nrows" set to 2.
+            >>> def data_gen():
+            ...     yield ['Alice', 1]
+            ...     yield ['Bob', 2]
+            ...     yield ['Charlie', 3]
+            >>> df = DataFrame.from_records(data_gen(), columns=['name', 'age'], nrows=2)
+            >>> df
+                name  age
+            0    Bob    2
+            1  Alice    1
+        """
+
+        exclude = kwargs.get("exclude", None)
+        coerce_float = kwargs.get("coerce_float", True)
+        nrows = kwargs.get("nrows", None)
+
+        arg_type_matrix = []
+        dtypes = (list, tuple, dict)
+        arg_type_matrix.append(["data", data, False, (Iterator, _ListOf(dtypes)), True])
+        arg_type_matrix.append(["columns", columns, True, (str, _ListOf(str)), True])
+        arg_type_matrix.append(["exclude", exclude, True, (_ListOf(str),), True])
+        arg_type_matrix.append(["coerce_float", coerce_float, True, (bool, ), True])
+        arg_type_matrix.append(["nrows", nrows, True, (int,), True])
+
+        _Validators._validate_function_arguments(arg_type_matrix)
+
+        if isinstance(columns, str):
+            columns = [columns]
+
+        df = pd.DataFrame.from_records(data, columns=columns, exclude=exclude,
+                                       coerce_float=coerce_float, nrows=nrows)
+        return cls(df, index=False)
+
     def create_temp_view(self, name):
         """
         DESCRIPTION:
```
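A condensed, hedged quick reference for the three classmethods added in this hunk, distilled from the docstring examples above (assumes an active Vantage context):

```python
import pandas as pd
from teradataml import DataFrame

pdf = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})

df1 = DataFrame.from_pandas(pdf, index=False)                 # pandas -> temp table
df2 = DataFrame.from_dict({"name": ["Alice", "Bob"], "age": [25, 30]})
df3 = DataFrame.from_records([("Alice", 1), ("Bob", 2)], columns=["name", "age"])
```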
```diff
@@ -1144,9 +1611,19 @@ class DataFrame():
                                                  datalake=self._datalake)
 
         # Extract column names and corresponding teradatasqlalchemy types.
-        …
-        …
-        …
+        try:
+            # For latest OTF help table query results.
+            col_names, col_types = df_utils._get_datalake_table_columns_info(self._database,
+                                                                             self._table,
+                                                                             self._datalake,
+                                                                             use_dialect=True)
+        except NoSuchColumnError:
+            # For older OTF help table query result.
+            col_names, col_types = df_utils._get_datalake_table_columns_info(self._database,
+                                                                             self._table,
+                                                                             self._datalake)
+
+        # Create a SQLAlchemy table object representing datalake table.
         t = sqlalchemy.Table(self._table, meta, schema=self._database,
             *(Column(col_name, col_type) for col_name, col_type in zip(col_names, col_types)))
         return _MetaExpression(t)
@@ -2924,9 +3401,8 @@ class DataFrame():
             msg = Messages.get_message(errcode)
             raise TeradataMlException(msg, errcode)
 
-    @argument_deprecation("20.0.0.5", "include", False, None)
     @collect_queryband(queryband="DF_describe")
-    def describe(self, percentiles=[.25, .5, .75], …
+    def describe(self, percentiles=[.25, .5, .75], verbose=False, distinct=False, statistics=None,
                  columns=None, pivot=False):
         """
         DESCRIPTION:
@@ -2956,18 +3432,6 @@ class DataFrame():
                 Default Values: [.25, .5, .75], which returns the 25th, 50th, and 75th percentiles.
                 Types: float or List of floats
 
-            include:
-                Optional Argument.
-                Values can be either None or "all".
-                If the value is "all", both numeric and non-numeric columns are included.
-                Computes count, mean, std, min, percentiles, and max for numeric columns.
-                Computes count and unique for non-numeric columns.
-                If the value is None, only numeric columns are used for collecting statistics.
-                Note:
-                    * Value 'all' is not applicable for 'Time Series Aggregate Mode'.
-                Default Values: None
-                Types: str
-
             verbose:
                 Optional Argument.
                 Specifies a boolean value to be used for time series aggregation, stating whether to get
@@ -2994,7 +3458,6 @@ class DataFrame():
                 Computes count and unique for non-numeric columns.
                 Notes:
                     1. statistics is not applicable for 'Time Series Aggregate Mode'.
-                    2. statistics should not be used with include as 'all'.
                 Permitted Values: count, mean, min, max, unique, std, describe, percentile
                 Default Values: None
                 Types: str or List of str
@@ -3310,7 +3773,6 @@ class DataFrame():
         awu_matrix = []
         awu_matrix.append(["columns", columns, True, (str, list), True])
         awu_matrix.append(["percentiles", percentiles, True, (float, list)])
-        awu_matrix.append(["include", include, True, (str), True, [None, "all"]])
        awu_matrix.append(["verbose", verbose, True, (bool)])
         awu_matrix.append(["distinct", distinct, True, (bool)])
         awu_matrix.append(["statistics", statistics, True, (str, list), True,
@@ -3334,22 +3796,11 @@ class DataFrame():
         if statistics:
             statistics = [stats.lower() for stats in UtilFuncs._as_list(statistics)]
 
-        # Argument include and statistics should not be used together
-        if include is not None and statistics is not None:
-            raise ValueError(Messages.get_message(MessageCodes.CANNOT_USE_TOGETHER_WITH).format(
-                'include', 'statistics'
-            ))
-
         # Percentiles must be a list of values between 0 and 1.
         if not isinstance(percentiles, list) or not all(p > 0 and p < 1 for p in percentiles):
             raise ValueError(Messages.get_message(MessageCodes.INVALID_ARG_VALUE, percentiles, "percentiles",
                              "percentiles must be a list of values between 0 and 1"))
 
-        # Argument 'include' with value 'all' is not allowed for DataFrameGroupByTime
-        if include is not None and include.lower() == "all" and isinstance(self, DataFrameGroupByTime):
-            raise ValueError(Messages.get_message(MessageCodes.ARG_VALUE_CLASS_DEPENDENCY).format(
-                'include', 'Aggregation', 'all', 'describe()', 'DataFrame or DataFrameGroupBy'))
-
         # Argument 'statistics' is not allowed for DataFrameGroupByTime
         if statistics is not None and isinstance(self, DataFrameGroupByTime):
             raise ValueError(Messages.get_message(MessageCodes.ARG_VALUE_CLASS_DEPENDENCY).format(
@@ -3383,7 +3834,7 @@ class DataFrame():
             # Construct the aggregate query.
             agg_query = df_utils._construct_describe_query(df=self, columns=columns, metaexpr=self._metaexpr,
                                                            percentiles=percentiles, function_label=function_label,
-                                                           groupby_column_list=groupby_column_list, include=…
+                                                           groupby_column_list=groupby_column_list, include=None,
                                                            is_time_series_aggregate=True, verbose=verbose,
                                                            distinct=distinct,
                                                            timebucket_duration=self._timebucket_duration,
@@ -3414,7 +3865,7 @@ class DataFrame():
             # Construct the aggregate query.
             agg_query = df_utils._construct_describe_query(df=self, columns=columns, metaexpr=self._metaexpr,
                                                            percentiles=percentiles, function_label=function_label,
-                                                           groupby_column_list=groupby_column_list, include=…
+                                                           groupby_column_list=groupby_column_list, include=None,
                                                            is_time_series_aggregate=False, verbose=verbose,
                                                            distinct=distinct, statistics=statistics)
```
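These hunks remove the describe() "include" argument, deprecated in 20.0.0.5 per the dropped @argument_deprecation decorator, and hard-wire include=None into the query construction. A hedged migration sketch, based on the remaining docstring text above ("statistics" computes count and unique for non-numeric columns); exact output depends on your data:

```python
from teradataml import DataFrame

df = DataFrame("admissions_train")

# Before (<= 20.0.0.4): df.describe(include="all")
# After (20.0.0.6): request the non-numeric summaries explicitly.
df.describe()                                  # numeric columns, default percentiles
df.describe(statistics=["count", "unique"])    # count/unique, non-numeric columns included
```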
```diff
@@ -5570,8 +6021,10 @@ class DataFrame():
                 Specifies the function(s) to apply on DataFrame columns.
 
                 Valid values for func are:
-                    'count', 'sum', 'min', 'max', 'mean', 'std', 'percentile', 'unique',
-                    …
+                    * 'count', 'sum', 'min', 'max', 'mean', 'std', 'percentile', 'percentile_<floatvalue>', 'unique',
+                      'median', 'var'
+                    * Note: In 'percentile_<floatvalue>', <floatvalue> specifies the desired percentile value to
+                      calculate aggregate. It should be in the range of 0.0 to 1.0 (both inclusive).
 
                 Acceptable formats for function(s) are
                 string, dictionary, list of strings/functions/ColumnExpression or ColumnExpression.
@@ -5605,12 +6058,17 @@ class DataFrame():
                    Output column names after the above operation are:
                    min_employee_no, sum_employee_no, var_employee_no, min_first_name
 
-                4. "…
+                4. "percentile_<floatvalue>" passed to agg.
+                   >>> df.agg({'employee_no' : ['percentile_0.25', 'percentile_0.75', 'min']})
+                   >>> df.agg(['percentile_0.25', 'percentile_0.75', 'sum'])
+                   >>> df.agg('percentile_0.25')
+
+                5. "func" passed as a ColumnExpression built using the aggregate functions.
                    >>> df.agg(df.first_name.count())
                    Output column name after the above operation is:
                    count(first_name)
 
-                …
+                6. "func" passed as a list of ColumnExpression built using the aggregate functions.
                    >>> df.agg([df.employee_no.min(), df.first_name.count()])
                    Output column names after the above operation are:
                    min(employee_no), count(first_name)
@@ -5698,6 +6156,12 @@ class DataFrame():
                min_employee_no  sum_employee_no  var_employee_no  min_first_name
             0              100              313        44.333333            abcd
 
+            # Get the minimum, 25 percentile value and variance of employee number, by passing dictionary of
+            # column names to string function/list of string functions as parameter.
+            >>> df.agg({'employee_no' : ['min', 'percentile_0.25', 'var']})
+               min_employee_no  percentile_0.25_employee_no  var_employee_no
+            0              100                          100        44.333333
+
             # Get the minimum and sum of all the columns in the dataframe,
             # by passing list of string functions as parameter.
             >>> df.agg(['min', 'sum'])
@@ -5743,9 +6207,15 @@ class DataFrame():
                mean_employee_no  unique_employee_no  unique_first_name  mean_joined_date  unique_joined_date
             0        104.333333                   3                  2          60/12/04                   2
 
+            # Get the percentile of each column in the dataframe with default value 0.5.
             >>> df.agg('percentile')
-            …
-            …
+               percentile_employee_no  percentile_marks
+            0                     101              None
+
+            # Get 80 percentile of each column in the datafame.
+            >>> df.agg('percentile_0.8')
+               percentile_0.8_employee_no  percentile_0.8_marks
+            0                         107                  None
 
             # Using another table 'sales' (having repeated values) to demonstrate operations
             # 'unique' and 'percentile'.
@@ -5762,9 +6232,11 @@ class DataFrame():
             Blue Inc    90.0    50    95   101  2017-04-01
             Red Inc    200.0   150   140  None  2017-04-01
 
-            …
-            …
-            …
+            # Get 80 and 40 percentile values of each column in the dataframe.
+            >>> df1 = df.select(['Feb', 'Jan', 'Mar', 'Apr'])
+            >>> df1.agg(['percentile_0.8', 'percentile_0.4'])
+               percentile_0.8_Feb  percentile_0.4_Feb  percentile_0.8_Jan  percentile_0.4_Jan  percentile_0.8_Mar  percentile_0.4_Mar  percentile_0.8_Apr  percentile_0.4_Apr
+            0               210.0               200.0                 170                 150                 170                 140                 250                 194
 
             >>> df.agg('unique')
                unique_accounts  unique_Feb  unique_Jan  unique_Mar  unique_Apr  unique_datetime
@@ -5888,8 +6360,11 @@ class DataFrame():
                 groupby_col_names.append(col)
                 groupby_col_types.append(self[col].type)
 
-                if …
-                …
+                include_grouping_columns = True if isinstance(self, DataFrameGroupBy) and \
+                    self._include_grouping_columns else False
+                if not include_grouping_columns and col in col_names:
+                    # If 'include_grouping_columns' argument is set to True and,
+                    # group by column is not specified in the columns argument,
                     # then, we should ignore this processing, otherwise we
                     # should process it in the same way to remove the reference
                     # for grouping column from aggregation list.
@@ -5951,6 +6426,8 @@ class DataFrame():
 
         except TeradataMlException:
             raise
+        except ValueError:
+            raise
         except Exception as err:
             raise TeradataMlException(Messages.get_message(
                 MessageCodes.EXECUTION_FAILED, "perform {} on DataFrame".format(operation), str(err)),
@@ -7760,7 +8237,7 @@ class DataFrame():
         """
         return (type(None), int, float, str, decimal.Decimal, ColumnExpression, ClauseElement)
 
-    def _generate_assign_metaexpr_aed_nodeid(self, drop_columns, **kwargs):
+    def _generate_assign_metaexpr_aed_nodeid(self, drop_columns, node_id, **kwargs):
         """
         DESCRIPTION:
             Function generates the MetaExpression and AED nodeid for DataFrame.assign()
@@ -7773,6 +8250,11 @@ class DataFrame():
                 Default Value: False
                 Types: bool
 
+            node_id:
+                Optional Argument.
+                Specifies the input nodeid for the assign operation.
+                Types: str
+
             kwargs:
                 keyword, value pairs
                 - keywords are the column names.
@@ -7800,7 +8282,7 @@ class DataFrame():
 
         # Join the expressions in result.
         assign_expression = ', '.join(list(map(lambda x: x[1], result)))
-        new_nodeid = self._aed_utils._aed_assign(…
+        new_nodeid = self._aed_utils._aed_assign(node_id,
                                                  assign_expression,
                                                  AEDConstants.AED_ASSIGN_DROP_EXISITING_COLUMNS.value)
@@ -7939,7 +8421,7 @@ class DataFrame():
                 env_mapper[env_name] = [colname]
             else:
                 env_mapper[env_name] = udf_expr.keys()
-
+        debug = False
         for env_name, cols in env_mapper.items():
             # Create a dictionary of output columns to column type.
             returns = OrderedDict([(column.name, column.type) for column in df._metaexpr.c])
@@ -7950,6 +8432,7 @@ class DataFrame():
             # Create a dictionary of output column name to udf arguments
             function_args = {}
             for colname, col in udf_expr.items():
+                debug |= col._debug
                 delimiter = col._delimiter
                 quotechar = col._quotechar
                 if colname in cols:
@@ -7982,7 +8465,9 @@ class DataFrame():
                 columns_definitions=columns_definitions,
                 output_type_converters={
                     col_name: _Dtypes._teradata_type_to_python_type(col_type)
-                    for col_name, col_type in returns.items()}
+                    for col_name, col_type in returns.items()},
+                debug=debug
+            )
 
             df = tbl_operators.execute()
         return df
@@ -8624,8 +9109,34 @@ class DataFrame():
         # from udf expression.
         if bool(regular_expr):
             try:
-                …
+                root_node_id = None
+                root_df_col = df.columns
+
+                # Get the previous node type, if it is assign and drop_columns is False,
+                # then check if the previous assign arguments exists and are not present
+                # in either the root dataframe columns or the current assign arguments.
+                # if these conditions are met, obtain the root node id (i.e., the first
+                # node of the assign operation) and merge the previous assign arguments with the current ones.
+
+                prev_node_type = df._aed_utils._aed_get_node_query_type(df._nodeid)
+                if not drop_columns and prev_node_type == "assign" and df._previous_assign_args is not None:
+                    if not df._root_columns & df._previous_assign_args.keys() and \
+                       not df._previous_assign_args.keys() & regular_expr.keys():
+                        # Get the root node id and root dataframe columns.
+                        root_df_col = df._root_columns
+                        root_node_id = df._aed_utils._aed_get_parent_nodeids(df._nodeid)[0]
+                        regular_expr = {**df._previous_assign_args, **regular_expr}
+
+                # If root_node_id is None, assign the current node id as root node of assign operation
+                node_id = root_node_id if root_node_id is not None else df._nodeid
+
+                # Generate new meta expression and node id for the new dataframe.
+                (new_meta, new_nodeid) = df._generate_assign_metaexpr_aed_nodeid(
+                    drop_columns, node_id = node_id, **regular_expr)
                 df = df._create_dataframe_from_node(new_nodeid, new_meta, df._index_label)
+                df._previous_assign_args = regular_expr
+                df._root_columns = root_df_col
+
             except Exception as err:
                 errcode = MessageCodes.TDMLDF_INFO_ERROR
                 msg = Messages.get_message(MessageCodes.TDMLDF_INFO_ERROR)
```
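The hunk above folds consecutive assign() calls into a single AED assign node when the previous assign's columns collide with neither the root DataFrame's columns nor the new expressions. A sketch of the user-visible call pattern being optimized (the merge itself is internal and transparent; the derived column names are illustrative):

```python
from teradataml import DataFrame

df = DataFrame("admissions_train")

# Two chained assigns; with this change they can be merged into one
# assign node because 'double_gpa' and 'half_gpa' are new, independent columns.
df = df.assign(double_gpa=df.gpa * 2)
df = df.assign(half_gpa=df.gpa / 2)
```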
@@ -8962,6 +9473,15 @@ class DataFrame():
|
|
|
8962
9473
|
Permitted Values: "CUBE", "ROLLUP", None
|
|
8963
9474
|
Types: str or NoneType
|
|
8964
9475
|
|
|
9476
|
+
include_grouping_columns:
|
|
9477
|
+
Optional Argument.
|
|
9478
|
+
Specifies whether to include aggregations on the grouping column(s) or not.
|
|
9479
|
+
When set to True, the resultant DataFrame will have the aggregations on the
|
|
9480
|
+
columns mentioned in "columns_expr". Otherwise, resultant DataFrame will not have
|
|
9481
|
+
aggregations on the columns mentioned in "columns_expr".
|
|
9482
|
+
Default Value: False
|
|
9483
|
+
Types: bool
|
|
9484
|
+
|
|
8965
9485
|
NOTES:
|
|
8966
9486
|
1. Users can still apply teradataml DataFrame methods (filters/sort/etc) on top of the result.
|
|
8967
9487
|
2. Consecutive operations of grouping, i.e., groupby_time(), resample() and groupby() are not permitted.
|
|
@@ -8978,14 +9498,54 @@ class DataFrame():
             TeradataMlException
 
         EXAMPLES:
+            # Load the data to run the example.
             >>> load_example_data("dataframe","admissions_train")
+
+            # Create a DataFrame on 'admissions_train' table.
             >>> df = DataFrame("admissions_train")
+            >>> df
+               masters   gpa     stats programming  admitted
+            id
+            15     yes  4.00  Advanced    Advanced         1
+            34     yes  3.85  Advanced    Beginner         0
+            13      no  4.00  Advanced      Novice         1
+            38     yes  2.65  Advanced    Beginner         1
+            5       no  3.44    Novice      Novice         0
+            40     yes  3.95    Novice    Beginner         0
+            7      yes  2.33    Novice      Novice         1
+            22     yes  3.46    Novice    Beginner         0
+            26     yes  3.57  Advanced    Advanced         1
+            17      no  3.83  Advanced    Advanced         1
+
+            # Example 1: Find the minimum value of all valid columns by
+            #            grouping the DataFrame with column 'masters'.
             >>> df1 = df.groupby(["masters"])
             >>> df1.min()
               masters  min_id  min_gpa min_stats min_programming  min_admitted
             0      no       3     1.87  Advanced        Advanced             0
             1     yes       1     1.98  Advanced        Advanced             0
 
+            # Example 2: Find the sum of all valid columns by grouping the DataFrame
+            #            with columns 'masters' and 'admitted'. Include grouping columns
+            #            in aggregate function 'sum'.
+            >>> df1 = df.groupby(["masters", "admitted"], include_grouping_columns=True)
+            >>> df1.sum()
+              masters  admitted  sum_id  sum_gpa  sum_admitted
+            0     yes         1     188    34.35            10
+            1     yes         0     289    43.36             0
+            2      no         0      41     6.44             0
+            3      no         1     302    57.52            16
+
+            # Example 3: Find the sum of all valid columns by grouping the DataFrame with
+            #            columns 'masters' and 'admitted'. Do not include grouping columns
+            #            in aggregate function 'sum'.
+            >>> df1 = df.groupby(["masters", "admitted"], include_grouping_columns=False)
+            >>> df1.sum()
+              masters  admitted  sum_id  sum_gpa
+            0     yes         0     289    43.36
+            1      no         0      41     6.44
+            2      no         1     302    57.52
+            3     yes         1     188    34.35
         """
         # Argument validations
         arg_info_matrix = []
@@ -8993,6 +9553,8 @@ class DataFrame():
         option = kwargs.get("option", None)
         arg_info_matrix.append(["option", option, True, (str, type(None)), True,
                                 ["CUBE", "ROLLUP", None]])
+        include_grouping_columns = kwargs.get("include_grouping_columns", False)
+        arg_info_matrix.append(["include_grouping_columns", include_grouping_columns, True, (bool)])
 
         # Validate argument types
         _Validators._validate_function_arguments(arg_info_matrix)
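The arg_info_matrix rows above follow an internal convention this diff never spells out. Inferred from the rows themselves, so treat it as an assumption rather than documented API: position 0 is the argument name, 1 the value, 2 whether the argument is optional, 3 the accepted type(s), and the optional trailing entries an empty-check flag plus a permitted-values list. A toy validator that mimics that shape:

    def _toy_validate(rows):
        # Inferred layout: [name, value, is_optional, types, check_empty?, permitted?]
        for row in rows:
            name, value, is_optional, types = row[0], row[1], row[2], row[3]
            if value is None:
                if not is_optional:
                    raise ValueError("{} is required".format(name))
                continue
            if not isinstance(value, types):
                raise TypeError("{} must be of type {}".format(name, types))
            permitted = row[5] if len(row) > 5 else None
            if permitted is not None and value not in permitted:
                raise ValueError("{} must be one of {}".format(name, permitted))

    _toy_validate([["option", "CUBE", True, (str, type(None)), True, ["CUBE", "ROLLUP", None]],
                   ["include_grouping_columns", False, True, bool]])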
@@ -9037,7 +9599,8 @@ class DataFrame():
 
             groupbyexpr = ', '.join(UtilFuncs._teradata_quote_arg(col, "\"", False) for col in column_list)
             groupbyObj = DataFrameGroupBy(self._nodeid, self._metaexpr, self._column_names_and_types, self.columns,
-                                          groupbyexpr, column_list, option)
+                                          groupbyexpr, column_list, option, include_grouping_columns)
+
             return groupbyObj
         except TeradataMlException:
             raise
@@ -11569,6 +12132,10 @@ class DataFrame():
         DESCRIPTION:
             Function to apply a user defined function to each row in the
             teradataml DataFrame, leveraging Vantage's Script Table Operator.
+            Notes:
+                1. The function requires the same Python version in both the Vantage and local environments.
+                2. Teradata recommends using the same version of the "dill" package in both
+                   the Vantage and local environments.
 
         PARAMETERS:
             user_function:
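Because the pickled callable is shipped to Vantage, a mismatch in interpreter or dill versions typically surfaces only at run time. A defensive pre-flight check on the client side might look like the sketch below; the expected remote versions are placeholders you would obtain from your own Vantage environment:

    import sys
    import dill

    # Replace these placeholders with the versions installed on Vantage.
    EXPECTED_PYTHON = (3, 9)
    EXPECTED_DILL = "0.3.8"

    local_python = sys.version_info[:2]
    if local_python != EXPECTED_PYTHON:
        print("Warning: local Python {}.{} differs from Vantage Python {}.{}".format(
            *local_python, *EXPECTED_PYTHON))
    if dill.__version__ != EXPECTED_DILL:
        print("Warning: local dill {} differs from Vantage dill {}".format(
            dill.__version__, EXPECTED_DILL))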
@@ -11749,6 +12316,15 @@ class DataFrame():
                 Default Value: True
                 Types: bool
 
+            debug:
+                Optional Argument.
+                Specifies whether to display the path of the script file generated during
+                function execution; this helps in debugging failures. When set to True,
+                the function displays the path of the script and does not remove the file
+                from the local file system. Otherwise, the file is removed.
+                Default Value: False
+                Types: bool
+
         RETURNS:
             1. teradataml DataFrame if exec_mode is "IN-DB".
             2. Pandas DataFrame if exec_mode is "LOCAL".
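A usage sketch of the new argument, assuming a connected Vantage session and the admissions_train table used elsewhere in this file. The row function here is hypothetical; the exact row type map_row passes is the library's concern, so treat this as a sketch rather than a reference:

    from teradataml import DataFrame, load_example_data

    load_example_data("dataframe", "admissions_train")
    df = DataFrame("admissions_train")

    # Hypothetical row function for illustration only.
    def uppercase_stats(row):
        row['stats'] = row['stats'].upper()
        return row

    # debug=True prints the path of the generated script and leaves the file
    # on the local file system so it can be inspected after a failure.
    result = df.map_row(uppercase_stats, debug=True)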
@@ -11901,6 +12477,7 @@ class DataFrame():
         sort_ascending = kwargs.pop('sort_ascending', True)
         auth = kwargs.pop('auth', None)
         charset = kwargs.pop('charset', None)
+        debug = kwargs.pop('debug', False)
 
         # Check for other extra/unknown arguments.
         unknown_args = list(kwargs.keys())
@@ -11919,7 +12496,7 @@ class DataFrame():
                                           sort_ascending=sort_ascending,
                                           returns=returns, delimiter=delimiter,
                                           quotechar=quotechar, auth=auth,
-                                          charset=charset, num_rows=num_rows)
+                                          charset=charset, num_rows=num_rows, debug=debug)
 
         return tbl_op_util.execute()
 
@@ -11936,6 +12513,10 @@ class DataFrame():
         DESCRIPTION:
             Function to apply a user defined function to a group or partition of rows
             in the teradataml DataFrame, leveraging Vantage's Script Table Operator.
+            Notes:
+                1. The function requires the same Python version in both the Vantage and local environments.
+                2. Teradata recommends using the same version of the "dill" package in both
+                   the Vantage and local environments.
 
         PARAMETERS:
             user_function:
@@ -12146,6 +12727,15 @@ class DataFrame():
                 Default Value: True
                 Types: bool
 
+            debug:
+                Optional Argument.
+                Specifies whether to display the path of the script file generated during
+                function execution; this helps in debugging failures. When set to True,
+                the function displays the path of the script and does not remove the file
+                from the local file system. Otherwise, the file is removed.
+                Default Value: False
+                Types: bool
+
         RETURNS:
             1. teradataml DataFrame if exec_mode is "IN-DB".
             2. Pandas DataFrame if exec_mode is "LOCAL".
@@ -12311,6 +12901,7 @@ class DataFrame():
         sort_ascending = kwargs.pop('sort_ascending', True)
         auth = kwargs.pop('auth', None)
         charset = kwargs.pop('charset', None)
+        debug = kwargs.pop('debug', False)
 
         # Check for other extra/unknown arguments.
         unknown_args = list(kwargs.keys())
@@ -12329,7 +12920,7 @@ class DataFrame():
                                           sort_ascending=sort_ascending,
                                           returns=returns, delimiter=delimiter,
                                           quotechar=quotechar, auth=auth,
-                                          charset=charset, num_rows=num_rows)
+                                          charset=charset, num_rows=num_rows, debug=debug)
 
         return tbl_op_util.execute()
 
@@ -12346,9 +12937,9 @@ class DataFrame():
             teradataml DataFrame, leveraging Apply Table Operator of Open
             Analytics Framework.
             Notes:
-
-
-
+                1. The function requires the same Python version in both the remote and local environments.
+                2. Teradata recommends using the same version of the "dill" package in both
+                   the remote and local environments.
 
         PARAMETERS:
             user_function:
@@ -12531,6 +13122,15 @@ class DataFrame():
                 Default value: "csv"
                 Types: str
 
+            debug:
+                Optional Argument.
+                Specifies whether to display the path of the script file generated during
+                function execution; this helps in debugging failures. When set to True,
+                the function displays the path of the script and does not remove the file
+                from the local file system. Otherwise, the file is removed.
+                Default Value: False
+                Types: bool
+
         RETURNS:
             teradataml DataFrame.
 
@@ -12707,6 +13307,7 @@ class DataFrame():
         is_local_order = kwargs.pop('is_local_order', False)
         nulls_first = kwargs.pop('nulls_first', True)
         sort_ascending = kwargs.pop('sort_ascending', True)
+        debug = kwargs.pop('debug', False)
 
         # Check for other extra/unknown arguments.
         unknown_args = list(kwargs.keys())
@@ -12729,7 +13330,8 @@ class DataFrame():
                                           charset=None,
                                           num_rows=num_rows,
                                           env_name=env_name,
-                                          style=style)
+                                          style=style,
+                                          debug=debug)
 
         return tbl_op_util.execute()
 
@@ -13075,7 +13677,7 @@ class DataFrame():
                                                              False)
         column_names = list(dict.fromkeys(column_names))
 
-        if list_td_reserved_keywords(column_names) or UtilFuncs.
+        if list_td_reserved_keywords(column_names) or UtilFuncs._is_non_ascii(column_names):
             column_names = UtilFuncs._teradata_quote_arg(column_names, "\"", False)
 
         col_names_types = df_utils._get_required_columns_types_from_metaexpr(self._metaexpr, column_names)
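The completed condition quotes a column name whenever it collides with a Teradata reserved keyword or contains non-ASCII characters; without quoting, the generated SQL would be ambiguous or invalid. A minimal illustration of the quoting rule, where the helper is a stand-in and not the library's internal API:

    RESERVED = {"SELECT", "ORDER", "USER"}  # tiny illustrative subset

    def quote_if_needed(name):
        needs_quoting = name.upper() in RESERVED or not name.isascii()
        return '"{}"'.format(name) if needs_quoting else name

    print(quote_if_needed("order"))   # "order"
    print(quote_if_needed("straße"))  # "straße"
    print(quote_if_needed("gpa"))     # gpa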
@@ -15261,7 +15863,7 @@ class DataFrame():
         return self.assign(**new_columns, drop_columns=True).select(self.columns)
 
     @collect_queryband(queryband="DF_cube")
-    def cube(self, columns):
+    def cube(self, columns, include_grouping_columns=False):
         """
         DESCRIPTION:
             cube() function creates a multi-dimensional cube for the DataFrame
@@ -15275,6 +15877,15 @@ class DataFrame():
                 Specifies the name(s) of input teradataml DataFrame column(s).
                 Types: str OR list of str(s)
 
+            include_grouping_columns:
+                Optional Argument.
+                Specifies whether to include aggregations on the grouping column(s) or not.
+                When set to True, the resultant DataFrame will have the aggregations on the
+                columns mentioned in "columns". Otherwise, the resultant DataFrame will not
+                have aggregations on the columns mentioned in "columns".
+                Default Value: False
+                Types: bool
+
         RETURNS:
             teradataml DataFrameGroupBy
 
@@ -15282,9 +15893,27 @@ class DataFrame():
             TeradataMlException
 
         EXAMPLES :
-            #
+            # Load the data to run the example.
             >>> load_example_data("dataframe","admissions_train")
+
+            # Create a DataFrame on 'admissions_train' table.
             >>> df = DataFrame("admissions_train")
+            >>> df
+               masters   gpa     stats programming  admitted
+            id
+            15     yes  4.00  Advanced    Advanced         1
+            34     yes  3.85  Advanced    Beginner         0
+            13      no  4.00  Advanced      Novice         1
+            38     yes  2.65  Advanced    Beginner         1
+            5       no  3.44    Novice      Novice         0
+            40     yes  3.95    Novice    Beginner         0
+            7      yes  2.33    Novice      Novice         1
+            22     yes  3.46    Novice    Beginner         0
+            26     yes  3.57  Advanced    Advanced         1
+            17      no  3.83  Advanced    Advanced         1
+
+            # Example 1: Find the sum of all valid columns by grouping the
+            #            DataFrame columns with 'masters' and 'stats'.
             >>> df1 = df.cube(["masters", "stats"]).sum()
             >>> df1
               masters     stats  sum_id  sum_gpa  sum_admitted
@@ -15299,10 +15928,42 @@ class DataFrame():
             8      no  Advanced     189    34.95             9
             9     yes    Novice      98    13.74             1
 
+            # Example 2: Find the avg of all valid columns by grouping the DataFrame
+            #            with columns 'masters' and 'admitted'. Include grouping columns
+            #            in aggregate function 'avg'.
+            >>> df1 = df.cube(["masters", "admitted"], include_grouping_columns=True).avg()
+            >>> df1
+              masters  admitted     avg_id   avg_gpa  avg_admitted
+            0     yes       NaN  21.681818  3.532273      0.454545
+            1    None       1.0  18.846154  3.533462      1.000000
+            2      no       NaN  19.055556  3.553333      0.888889
+            3     yes       0.0  24.083333  3.613333      0.000000
+            4    None       NaN  20.500000  3.541750      0.650000
+            5    None       0.0  23.571429  3.557143      0.000000
+            6     yes       1.0  18.800000  3.435000      1.000000
+            7      no       1.0  18.875000  3.595000      1.000000
+            8      no       0.0  20.500000  3.220000      0.000000
+
+            # Example 3: Find the avg of all valid columns by grouping the DataFrame with
+            #            columns 'masters' and 'admitted'. Do not include grouping columns
+            #            in aggregate function 'avg'.
+            >>> df1 = df.cube(["masters", "admitted"], include_grouping_columns=False).avg()
+            >>> df1
+              masters  admitted     avg_id   avg_gpa
+            0      no       0.0  20.500000  3.220000
+            1    None       1.0  18.846154  3.533462
+            2      no       NaN  19.055556  3.553333
+            3     yes       0.0  24.083333  3.613333
+            4    None       NaN  20.500000  3.541750
+            5    None       0.0  23.571429  3.557143
+            6     yes       1.0  18.800000  3.435000
+            7     yes       NaN  21.681818  3.532273
+            8      no       1.0  18.875000  3.595000
         """
         # Validate columns argument.
         arg_info_matrix = []
         arg_info_matrix.append(["columns", columns, False, (str, list), True])
+        arg_info_matrix.append(["include_grouping_columns", include_grouping_columns, False, bool])
 
         # Validate argument types
         _Validators._validate_function_arguments(arg_info_matrix)
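The next hunk confirms that cube() is a thin wrapper over groupby() with option="cube", so the result shape in Examples 2 and 3 above comes straight from Teradata's GROUP BY CUBE clause. A sketch of the SQL shape this produces for Example 2; this follows standard CUBE syntax and is not the library's literal output:

    # Roughly the query cube(["masters", "admitted"], include_grouping_columns=True).avg()
    # resolves to; column quoting and aliasing follow the library's own rules.
    query = ('SELECT masters, admitted, AVG(id) AS avg_id, AVG(gpa) AS avg_gpa, '
             'AVG(admitted) AS avg_admitted '
             'FROM "admissions_train" '
             'GROUP BY CUBE ("masters", "admitted")')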
@@ -15312,10 +15973,10 @@ class DataFrame():
 
         # Query generation of cube API is same as the group by.
         # Only 'cube' is concatenated with 'group by' clause.
-        return self.groupby(columns, option="cube")
+        return self.groupby(columns, option="cube", include_grouping_columns=include_grouping_columns)
 
     @collect_queryband(queryband="DF_rollup")
-    def rollup(self, columns):
+    def rollup(self, columns, include_grouping_columns=False):
         """
         DESCRIPTION:
             rollup() function creates a multi-dimensional rollup for the DataFrame
@@ -15329,6 +15990,15 @@ class DataFrame():
                 Specifies the name(s) of input teradataml DataFrame column(s).
                 Types: str OR list of str(s)
 
+            include_grouping_columns:
+                Optional Argument.
+                Specifies whether to include aggregations on the grouping column(s) or not.
+                When set to True, the resultant DataFrame will have the aggregations on the
+                columns mentioned in "columns". Otherwise, the resultant DataFrame will not
+                have aggregations on the columns mentioned in "columns".
+                Default Value: False
+                Types: bool
+
         RETURNS:
             teradataml DataFrameGroupBy
 
@@ -15336,9 +16006,27 @@ class DataFrame():
             TeradataMlException
 
         EXAMPLES :
-            #
+            # Load the data to run the example.
             >>> load_example_data("dataframe","admissions_train")
+
+            # Create a DataFrame on 'admissions_train' table.
             >>> df = DataFrame("admissions_train")
+            >>> df
+               masters   gpa     stats programming  admitted
+            id
+            15     yes  4.00  Advanced    Advanced         1
+            34     yes  3.85  Advanced    Beginner         0
+            13      no  4.00  Advanced      Novice         1
+            38     yes  2.65  Advanced    Beginner         1
+            5       no  3.44    Novice      Novice         0
+            40     yes  3.95    Novice    Beginner         0
+            7      yes  2.33    Novice      Novice         1
+            22     yes  3.46    Novice    Beginner         0
+            26     yes  3.57  Advanced    Advanced         1
+            17      no  3.83  Advanced    Advanced         1
+
+            # Example 1: Find the sum of all valid columns by grouping the
+            #            DataFrame columns with 'masters' and 'stats'.
             >>> df1 = df.rollup(["masters", "stats"]).sum()
             >>> df1
               masters     stats  sum_id  sum_gpa  sum_admitted
@@ -15351,11 +16039,39 @@ class DataFrame():
             6     yes  Beginner      13    14.71             2
             7     yes  Advanced     366    49.26             7
             8      no  Advanced     189    34.95             9
-
+
+            # Example 2: Find the avg of all valid columns by grouping the DataFrame
+            #            with columns 'masters' and 'admitted'. Include grouping columns
+            #            in aggregate function 'avg'.
+            >>> df1 = df.rollup(["masters", "admitted"], include_grouping_columns=True).avg()
+            >>> df1
+              masters  admitted     avg_id   avg_gpa  avg_admitted
+            0      no       NaN  19.055556  3.553333      0.888889
+            1     yes       NaN  21.681818  3.532273      0.454545
+            2    None       NaN  20.500000  3.541750      0.650000
+            3     yes       0.0  24.083333  3.613333      0.000000
+            4      no       1.0  18.875000  3.595000      1.000000
+            5     yes       1.0  18.800000  3.435000      1.000000
+            6      no       0.0  20.500000  3.220000      0.000000
+
+            # Example 3: Find the avg of all valid columns by grouping the DataFrame with
+            #            columns 'masters' and 'admitted'. Do not include grouping columns
+            #            in aggregate function 'avg'.
+            >>> df1 = df.rollup(["masters", "admitted"], include_grouping_columns=False).avg()
+            >>> df1
+              masters  admitted     avg_id   avg_gpa
+            0      no       NaN  19.055556  3.553333
+            1     yes       NaN  21.681818  3.532273
+            2      no       0.0  20.500000  3.220000
+            3     yes       0.0  24.083333  3.613333
+            4      no       1.0  18.875000  3.595000
+            5     yes       1.0  18.800000  3.435000
+            6    None       NaN  20.500000  3.541750
         """
         # Validate columns argument.
         arg_info_matrix = []
         arg_info_matrix.append(["columns", columns, False, (str, list), True])
+        arg_info_matrix.append(["include_grouping_columns", include_grouping_columns, False, bool])
 
         # Validate argument types
         _Validators._validate_function_arguments(arg_info_matrix)
@@ -15365,8 +16081,255 @@ class DataFrame():
 
         # Query generation of cube API is same as the group by.
         # Only 'rollup' is concatenated with 'group by' clause.
-        return self.groupby(columns, option="rollup")
+        return self.groupby(columns, option="rollup", include_grouping_columns=include_grouping_columns)
+
+    # Metadata functions for a DataFrame created on a datalake/OTF table.
+    @property
+    @collect_queryband(queryband="DF_snpsht")
+    @df_utils.check_otf_dataframe()
+    def snapshots(self):
+        """
+        DESCRIPTION:
+            Gets snapshot information for a DataLake table.
+
+        PARAMETERS:
+            None
+
+        RETURNS:
+            teradataml DataFrame.
 
+        RAISES:
+            TeradataMLException.
+
+        EXAMPLES:
+            # Example 1: Get the snapshot information for a datalake table.
+            >>> from teradataml.dataframe.dataframe import in_schema
+            >>> in_schema_tbl = in_schema(schema_name="datalake_db",
+            ...                           table_name="datalake_table",
+            ...                           datalake_name="datalake")
+            >>> datalake_df = DataFrame(in_schema_tbl)
+            >>> datalake_df.snapshots
+                          snapshotId    snapshotTimestamp  timestampMSecs                                        manifestList                                             summary
+            0    6373759902296319074  2023-06-15 00:07:47   1686787667420  s3://vim-iceberg-v1/glue/metadata/snap-6373759...  {"added-data-files":"1","added-records":"5","a...}
+            1    4768076782814510171  2023-06-15 00:09:01   1686787741964  s3://vim-iceberg-v1/glue/metadata/snap-4768076...  {"added-data-files":"1","added-records":"2","a...}
+            2    7771482207931850214  2024-05-29 04:59:09   1716958749946  s3://vim-iceberg-v1/glue/metadata/snap-7771482...  {"deleted-data-files":"2","deleted-records":"7...}
+            3    1545363077953282623  2024-05-29 05:13:39   1716959619455  s3://vim-iceberg-v1/glue/metadata/snap-1545363...  {"changed-partition-count":"0","total-records"...}
+            4    2166707884289108360  2024-05-29 05:17:49   1716959869075  s3://vim-iceberg-v1/glue/metadata/snap-2166707...  {"changed-partition-count":"0","total-records"...}
+            5    8934190131471882700  2024-05-29 05:21:32   1716960092422  s3://vim-iceberg-v1/glue/metadata/snap-8934190...  {"changed-partition-count":"0","total-records"...}
+            6    3086605171258231948  2024-05-29 05:34:43   1716960883786  s3://vim-iceberg-v1/glue/metadata/snap-3086605...  {"changed-partition-count":"0","total-records"...}
+            7    7592503716012384122  2024-05-29 06:04:48   1716962688047  s3://vim-iceberg-v1/glue/metadata/snap-7592503...  {"changed-partition-count":"0","total-records"...}
+            8    2831061717890032890  2024-06-04 17:21:01   1717521661689  s3://vim-iceberg-v1/glue/metadata/snap-2831061...  {"added-data-files":"2","added-records":"7","a...}
+            9    8810491341502972715  2024-10-22 23:47:22   1729640842067  s3://vim-iceberg-v1/glue/metadata/snap-8810491...  {"added-data-files":"1","added-records":"1","a...}
+            10   3953136136558551163  2024-12-03 04:40:48   1733200848733  s3://vim-iceberg-v1/glue/metadata/snap-3953136...  {"added-data-files":"1","added-records":"4","a...}
+            11   6034775168901969481  2024-12-03 04:40:49   1733200849966  s3://vim-iceberg-v1/glue/metadata/snap-6034775...  {"deleted-data-files":"1","deleted-records":"5...}
+        """
+        return self._execute_metadata_query_and_generate_dataframe("TD_SNAPSHOTS")
+
+    @property
+    @collect_queryband(queryband="DF_prttns")
+    @df_utils.check_otf_dataframe()
+    def partitions(self):
+        """
+        DESCRIPTION:
+            Gets partition information for a DataLake table.
+
+        PARAMETERS:
+            None
+
+        RETURNS:
+            teradataml DataFrame.
+
+        RAISES:
+            TeradataMLException.
+
+        EXAMPLES:
+            # Example 1: Get the partition information for a datalake table.
+            >>> from teradataml.dataframe.dataframe import in_schema
+            >>> in_schema_tbl = in_schema(schema_name="datalake_db",
+            ...                           table_name="datalake_table",
+            ...                           datalake_name="datalake")
+            >>> datalake_df = DataFrame(in_schema_tbl)
+            >>> datalake_df.partitions
+                 id name
+            0  1000   c2
+            1  1001   c3
+
+        """
+        return self._execute_metadata_query_and_generate_dataframe("TD_PARTITIONS")
+
+    @property
+    @collect_queryband(queryband="DF_mnfsts")
+    @df_utils.check_otf_dataframe()
+    def manifests(self):
+        """
+        DESCRIPTION:
+            Gets manifest information for a DataLake table.
+
+        PARAMETERS:
+            None
+
+        RETURNS:
+            teradataml DataFrame.
+
+        RAISES:
+            TeradataMLException.
+
+        EXAMPLES:
+            # Example 1: Get the manifest information for a datalake table.
+            >>> from teradataml.dataframe.dataframe import in_schema
+            >>> in_schema_tbl = in_schema(schema_name="datalake_db",
+            ...                           table_name="datalake_table",
+            ...                           datalake_name="datalake")
+            >>> datalake_df = DataFrame(in_schema_tbl)
+            >>> datalake_df.manifests
+                        snapshotId    snapshotTimestamp                                manifestList                                manifestFile  manifestFileLength  datafilecount  totalrowcount
+            0  8068130797628952520  2025-05-02 11:45:26  s3://vim-iceberg-v1/otftestdb/nt_sales/...  s3://vim-iceberg-v1/otftestdb/nt_sales/...                7158              6              6
+        """
+        return self._execute_metadata_query_and_generate_dataframe("TD_MANIFESTS")
+
+    @property
+    @collect_queryband(queryband="DF_hstry")
+    @df_utils.check_otf_dataframe()
+    def history(self):
+        """
+        DESCRIPTION:
+            Gets the snapshot history related to a DataLake table.
+
+        PARAMETERS:
+            None
+
+        RETURNS:
+            teradataml DataFrame.
+
+        RAISES:
+            TeradataMLException.
+
+        EXAMPLES:
+            # Example 1: Get the snapshot history for a datalake table.
+            >>> from teradataml.dataframe.dataframe import in_schema
+            >>> in_schema_tbl = in_schema(schema_name="datalake_db",
+            ...                           table_name="datalake_table",
+            ...                           datalake_name="datalake")
+            >>> datalake_df = DataFrame(in_schema_tbl)
+            >>> datalake_df.history
+                                id            timestamp
+            0  8068130797628952520  2025-05-02 11:45:26
+        """
+        return self._execute_metadata_query_and_generate_dataframe("TD_HISTORY")
+
+    def _execute_metadata_query_and_generate_dataframe(self, func_name):
+        """Executes an OTF metadata query and returns the result as a DataFrame."""
+        query = SQLBundle()._get_sql_query(SQLConstants.SQL_TD_OTF_METADATA).format(func_name,
+                                                                                    self._table_name)
+        return DataFrame.from_query(query)
+
+    @collect_queryband(queryband="DF_gt_snpsht")
+    @df_utils.check_otf_dataframe()
+    def get_snapshot(self, as_of):
+        """
+        DESCRIPTION:
+            Gets the data from a DataLake table for the given snapshot id or timestamp string.
+            Notes:
+                * The snapshot id can be obtained from the 'snapshots' property of the DataFrame.
+                * The time travel value represented by "as_of" should be in the format
+                  "YYYY-MM-DD HH:MM:SS.FFFFFFF" for a TIMESTAMP string or "YYYY-MM-DD" for a DATE string.
+
+        PARAMETERS:
+            as_of:
+                Required Argument.
+                Specifies the snapshot id or timestamp information for which the snapshot is to be fetched.
+                Types: str or int
+
+        RETURNS:
+            teradataml DataFrame.
+
+        RAISES:
+            TeradataMLException.
+
+        EXAMPLES:
+            # DataFrame creation on an OTF table.
+            >>> from teradataml.dataframe.dataframe import in_schema
+            >>> in_schema_tbl = in_schema(schema_name="datalake_db",
+            ...                           table_name="datalake_table",
+            ...                           datalake_name="datalake")
+            >>> datalake_df = DataFrame(in_schema_tbl)
+
+            # List the snapshots first.
+            >>> datalake_df.snapshots
+                         snapshotId    snapshotTimestamp  timestampMSecs                                                          manifestList                                             summary
+            2046682612111137809  2025-06-03 13:26:15  1748957175692  s3://vim-iceberg-v1/datalake_db/datalake_table/metadata/snap-204...  {"added-data-files":"Red Inc","added-records"...}
+            282293708812257203   2025-06-03 05:53:19  1748929999245  s3://vim-iceberg-v1/datalake_db/datalake_table/metadata/snap-282...  {"added-data-files":"Blue Inc","added-records"...}
+
+            # Example 1: Get the snapshot using a snapshot id.
+            >>> datalake_df.get_snapshot(2046682612111137809)
+                          Feb    Jan    Mar    Apr    datetime
+            accounts
+            Blue Inc     90.0   50.0   95.0  101.0  04/01/2017
+            Alpha Co    210.0  200.0  215.0  250.0  04/01/2017
+            Jones LLC   200.0  150.0  140.0  180.0  04/01/2017
+            Yellow Inc   90.0    NaN    NaN    NaN  04/01/2017
+            Orange Inc  210.0    NaN    NaN  250.0  04/01/2017
+            Red Inc     200.0  150.0  140.0    NaN  04/01/2017
+
+            # Example 2: Get the snapshot using a snapshot id in string format.
+            >>> datalake_df.get_snapshot("2046682612111137809")
+                          Feb    Jan    Mar    Apr    datetime
+            accounts
+            Blue Inc     90.0   50.0   95.0  101.0  04/01/2017
+            Alpha Co    210.0  200.0  215.0  250.0  04/01/2017
+            Jones LLC   200.0  150.0  140.0  180.0  04/01/2017
+            Yellow Inc   90.0    NaN    NaN    NaN  04/01/2017
+            Orange Inc  210.0    NaN    NaN  250.0  04/01/2017
+            Red Inc     200.0  150.0  140.0    NaN  04/01/2017
+
+            # Example 3: Get the snapshot using a timestamp string.
+            >>> datalake_df.get_snapshot("2025-06-03 13:26:16")
+                          Feb    Jan    Mar    Apr    datetime
+            accounts
+            Blue Inc     90.0   50.0   95.0  101.0  04/01/2017
+            Alpha Co    210.0  200.0  215.0  250.0  04/01/2017
+            Jones LLC   200.0  150.0  140.0  180.0  04/01/2017
+            Yellow Inc   90.0    NaN    NaN    NaN  04/01/2017
+            Orange Inc  210.0    NaN    NaN  250.0  04/01/2017
+            Red Inc     200.0  150.0  140.0    NaN  04/01/2017
+
+            # Example 4: Get the snapshot using a date string.
+            >>> datalake_df.get_snapshot("2025-06-04")
+                          Feb    Jan    Mar    Apr    datetime
+            accounts
+            Blue Inc     90.0   50.0   95.0  101.0  04/01/2017
+            Alpha Co    210.0  200.0  215.0  250.0  04/01/2017
+            Jones LLC   200.0  150.0  140.0  180.0  04/01/2017
+            Yellow Inc   90.0    NaN    NaN    NaN  04/01/2017
+            Orange Inc  210.0    NaN    NaN  250.0  04/01/2017
+            Red Inc     200.0  150.0  140.0    NaN  04/01/2017
+
+        """
+        _Validators._validate_function_arguments([["as_of", as_of, False, (int, str)]])
+
+        # If as_of is already an int or a string representation of an int, quote it as-is.
+        if isinstance(as_of, int) or (isinstance(as_of, str) and as_of.isdigit()):
+            snapshot_on = "'{}'".format(as_of)
+        else:
+            try:
+                snapshot_on = UtilFuncs._get_time_formatted_string(as_of)
+            except ValueError as e:
+                raise TeradataMlException(Messages.get_message(
+                    MessageCodes.FUNC_EXECUTION_FAILED,
+                    "get_snapshot", "Invalid value for 'as_of' argument: {}. "
+                    "Use a valid format [\"YYYY-MM-DD HH:MM:SS.FFFFFFF\", \"YYYY-MM-DD HH:MM:SS\", "
+                    "\"YYYY-MM-DD\"]".format(as_of)),
+                    MessageCodes.FUNC_EXECUTION_FAILED)
+
+        query = SQLBundle()._get_sql_query(SQLConstants.SQL_TD_OTF_SNAPSHOT).format(self._table_name, snapshot_on)
+
+        try:
+            return DataFrame.from_query(query)
+        except TeradataMlException as e:
+            raise TeradataMlException(Messages.get_message(
+                MessageCodes.FUNC_EXECUTION_FAILED,
+                "get_snapshot()", "Invalid value for 'as_of' argument: {}. "
+                "Use a valid timestamp or a snapshot id listed by the 'snapshots' property.".format(as_of)),
+                MessageCodes.FUNC_EXECUTION_FAILED)
 
 class DataFrameGroupBy(DataFrame):
     """
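All four metadata accessors and get_snapshot() are gated by df_utils.check_otf_dataframe(), whose body does not appear in this diff. A plausible reading, offered purely as an assumption (including the _datalake_name attribute used here), is a decorator factory that rejects DataFrames not created on a datalake/OTF table before the wrapped function runs:

    import functools

    def check_otf_dataframe():
        # Hypothetical sketch: the real implementation is not shown in this diff.
        def decorator(func):
            @functools.wraps(func)
            def wrapper(self, *args, **kwargs):
                if getattr(self, "_datalake_name", None) is None:
                    raise ValueError(
                        "{}() is supported only for DataFrames created on "
                        "a datalake/OTF table.".format(func.__name__))
                return func(self, *args, **kwargs)
            return wrapper
        return decorator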
@@ -15375,7 +16338,7 @@ class DataFrameGroupBy(DataFrame):
 
     """
 
-    def __init__(self, nodeid, metaexpr, column_names_and_types, columns, groupbyexpr, column_list, option=None):
+    def __init__(self, nodeid, metaexpr, column_names_and_types, columns, groupbyexpr, column_list, option=None, include_grouping_columns=False):
         """
         init() method for DataFrameGroupBy.
 
@@ -15416,6 +16379,15 @@ class DataFrameGroupBy(DataFrame):
                 Permitted Values: "CUBE", "ROLLUP", None
                 Types: str or NoneType
 
+            include_grouping_columns:
+                Optional Argument.
+                Specifies whether to include aggregations on the grouping column(s) or not.
+                When set to True, the resultant DataFrame will have the aggregations on the
+                columns mentioned in "columns". Otherwise, the resultant DataFrame will not
+                have aggregations on the columns mentioned in "columns".
+                Default Value: False
+                Types: bool
+
         RETURNS:
             teradataml DataFrameGroupBy instance
         """
@@ -15425,6 +16397,7 @@ class DataFrameGroupBy(DataFrame):
         self._column_names_and_types = column_names_and_types
         self._columns = columns
         self.groupby_column_list = column_list
+        self._include_grouping_columns = include_grouping_columns
 
     def _get_assign_allowed_types(self):
         """
@@ -15446,7 +16419,7 @@ class DataFrameGroupBy(DataFrame):
         from sqlalchemy.sql.functions import Function
         return (type(None), int, float, str, decimal.Decimal, Function, ColumnExpression, ClauseElement)
 
-    def _generate_assign_metaexpr_aed_nodeid(self, drop_columns, **kwargs):
+    def _generate_assign_metaexpr_aed_nodeid(self, drop_columns, node_id, **kwargs):
         """
         DESCRIPTION:
             Function generates the MetaExpression and AED nodeid for DataFrameGroupBy.assign()
@@ -15459,6 +16432,11 @@ class DataFrameGroupBy(DataFrame):
                 and grouping columns are returned. This is unused argument.
                 Types: bool
 
+            node_id:
+                Optional Argument.
+                Specifies the input nodeid for the assign operation. This argument is unused.
+                Types: str
+
             kwargs:
                 keyword, value pairs
                 - keywords are the column names.