teradataml 20.0.0.3__py3-none-any.whl → 20.0.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/README.md +193 -1
- teradataml/__init__.py +2 -1
- teradataml/_version.py +2 -2
- teradataml/analytics/analytic_function_executor.py +25 -18
- teradataml/analytics/byom/__init__.py +1 -1
- teradataml/analytics/json_parser/analytic_functions_argument.py +4 -0
- teradataml/analytics/sqle/__init__.py +20 -2
- teradataml/analytics/utils.py +15 -1
- teradataml/analytics/valib.py +18 -4
- teradataml/automl/__init__.py +341 -112
- teradataml/automl/autodataprep/__init__.py +471 -0
- teradataml/automl/data_preparation.py +84 -42
- teradataml/automl/data_transformation.py +69 -33
- teradataml/automl/feature_engineering.py +76 -9
- teradataml/automl/feature_exploration.py +639 -25
- teradataml/automl/model_training.py +35 -14
- teradataml/clients/auth_client.py +2 -2
- teradataml/common/__init__.py +1 -2
- teradataml/common/constants.py +122 -63
- teradataml/common/messagecodes.py +14 -3
- teradataml/common/messages.py +8 -4
- teradataml/common/sqlbundle.py +40 -10
- teradataml/common/utils.py +366 -74
- teradataml/common/warnings.py +11 -0
- teradataml/context/context.py +348 -86
- teradataml/data/amazon_reviews_25.csv +26 -0
- teradataml/data/apriori_example.json +22 -0
- teradataml/data/byom_example.json +11 -0
- teradataml/data/docs/byom/docs/DataRobotPredict.py +2 -2
- teradataml/data/docs/byom/docs/DataikuPredict.py +40 -1
- teradataml/data/docs/byom/docs/H2OPredict.py +2 -2
- teradataml/data/docs/byom/docs/ONNXEmbeddings.py +242 -0
- teradataml/data/docs/byom/docs/ONNXPredict.py +2 -2
- teradataml/data/docs/byom/docs/PMMLPredict.py +2 -2
- teradataml/data/docs/sqle/docs_17_20/Apriori.py +138 -0
- teradataml/data/docs/sqle/docs_17_20/NERExtractor.py +121 -0
- teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +3 -3
- teradataml/data/docs/sqle/docs_17_20/SMOTE.py +212 -0
- teradataml/data/docs/sqle/docs_17_20/Shap.py +28 -6
- teradataml/data/docs/sqle/docs_17_20/TextMorph.py +119 -0
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +54 -3
- teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/DFFT.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFT2.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFTConv.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +4 -1
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +6 -6
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/MultivarRegr.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/PACF.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/PowerTransform.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/Resample.py +5 -5
- teradataml/data/docs/uaf/docs_17_20/SAX.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/SimpleExp.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Smoothma.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/UNDIFF.py +1 -1
- teradataml/data/hnsw_alter_data.csv +5 -0
- teradataml/data/hnsw_data.csv +10 -0
- teradataml/data/jsons/byom/h2opredict.json +1 -1
- teradataml/data/jsons/byom/onnxembeddings.json +266 -0
- teradataml/data/jsons/sqle/17.20/NGramSplitter.json +6 -6
- teradataml/data/jsons/sqle/17.20/TD_Apriori.json +181 -0
- teradataml/data/jsons/sqle/17.20/TD_NERExtractor.json +145 -0
- teradataml/data/jsons/sqle/17.20/TD_SMOTE.json +267 -0
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +0 -1
- teradataml/data/jsons/sqle/17.20/TD_TextMorph.json +134 -0
- teradataml/data/jsons/sqle/17.20/TD_TextParser.json +114 -9
- teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +420 -0
- teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +343 -0
- teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +359 -0
- teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +360 -0
- teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +343 -0
- teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +343 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSW.json +296 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWPredict.json +206 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWSummary.json +32 -0
- teradataml/data/jsons/sqle/20.00/TD_KMeans.json +2 -2
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +3 -3
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +6 -6
- teradataml/data/ner_dict.csv +8 -0
- teradataml/data/ner_input_eng.csv +7 -0
- teradataml/data/ner_rule.csv +5 -0
- teradataml/data/pos_input.csv +40 -0
- teradataml/data/tdnerextractor_example.json +14 -0
- teradataml/data/teradataml_example.json +21 -0
- teradataml/data/textmorph_example.json +5 -0
- teradataml/data/to_num_data.csv +4 -0
- teradataml/data/tochar_data.csv +5 -0
- teradataml/data/trans_dense.csv +16 -0
- teradataml/data/trans_sparse.csv +55 -0
- teradataml/data/vectordistance_example.json +1 -1
- teradataml/dataframe/copy_to.py +45 -29
- teradataml/dataframe/data_transfer.py +72 -46
- teradataml/dataframe/dataframe.py +642 -166
- teradataml/dataframe/dataframe_utils.py +167 -22
- teradataml/dataframe/functions.py +135 -20
- teradataml/dataframe/setop.py +11 -6
- teradataml/dataframe/sql.py +330 -78
- teradataml/dbutils/dbutils.py +556 -140
- teradataml/dbutils/filemgr.py +14 -10
- teradataml/hyperparameter_tuner/optimizer.py +12 -1
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/{sklearn/_sklearn_wrapper.py → _base.py} +168 -1013
- teradataml/opensource/_class.py +141 -17
- teradataml/opensource/{constants.py → _constants.py} +7 -3
- teradataml/opensource/_lightgbm.py +52 -53
- teradataml/opensource/_sklearn.py +1008 -0
- teradataml/opensource/_wrapper_utils.py +5 -5
- teradataml/options/__init__.py +47 -15
- teradataml/options/configure.py +103 -26
- teradataml/options/display.py +13 -2
- teradataml/plot/axis.py +47 -8
- teradataml/plot/figure.py +33 -0
- teradataml/plot/plot.py +63 -13
- teradataml/scriptmgmt/UserEnv.py +307 -40
- teradataml/scriptmgmt/lls_utils.py +428 -145
- teradataml/store/__init__.py +2 -3
- teradataml/store/feature_store/feature_store.py +102 -7
- teradataml/table_operators/Apply.py +48 -19
- teradataml/table_operators/Script.py +23 -2
- teradataml/table_operators/TableOperator.py +3 -1
- teradataml/table_operators/table_operator_util.py +58 -9
- teradataml/utils/dtypes.py +49 -1
- teradataml/utils/internal_buffer.py +38 -0
- teradataml/utils/validators.py +377 -62
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/METADATA +200 -4
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/RECORD +146 -112
- teradataml/data/SQL_Fundamentals.pdf +0 -0
- teradataml/libaed_0_1.dylib +0 -0
- teradataml/libaed_0_1.so +0 -0
- teradataml/opensource/sklearn/__init__.py +0 -0
- teradataml/store/vector_store/__init__.py +0 -1586
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/zip-safe +0 -0
@@ -20,6 +20,9 @@ import re
 import sqlalchemy
 import sys
 import urllib.parse
+
+from sqlalchemy import Column
+
 import teradataml.context.context as tdmlctx
 
 from collections import OrderedDict, namedtuple
@@ -31,6 +34,7 @@ from teradataml.dataframe.sql_interfaces import ColumnExpression
 from teradataml.dataframe.sql_functions import case
 from teradataml.series.series import Series
 from teradatasqlalchemy.types import _TDType, BIGINT, INTEGER, PERIOD_TIMESTAMP, SMALLINT, BYTEINT, FLOAT, DECIMAL
+from teradataml.common.deprecations import argument_deprecation
 from teradataml.common.utils import UtilFuncs
 from teradataml.common.exceptions import TeradataMlException
 from teradataml.common.messages import Messages
@@ -42,6 +46,7 @@ from teradataml.dataframe.dataframe_utils import DataFrameUtils as df_utils, Dat
 from teradataml.dataframe.indexer import _LocationIndexer
 from teradataml.common.aed_utils import AedUtils
 from teradataml.options.display import display
+from teradataml.options.configure import configure
 from teradataml.dataframe.copy_to import copy_to_sql
 from teradataml.dataframe.row import _Row
 from teradataml.dataframe.setop import concat
@@ -63,7 +68,79 @@ from teradataml.common.constants import OutputStyle
 
 # TODO use logger when available on master branch
 # logger = teradatapylog.getLogger()
-
+
+class in_schema:
+    """
+    Class takes a schema name, a table name and datalake name attributes
+    and creates an object that can be passed to DataFrame.
+    Note:
+        teradataml recommends to use this class to access table(s)/view(s),
+        from the database other than the default database.
+    """
+    def __init__(self, schema_name, table_name, datalake_name=None):
+        """
+        Constructor for in_schema class.
+
+        PARAMETERS:
+            schema_name:
+                Required Argument.
+                Specifies the schema where the table resides in.
+                Types: str
+
+            table_name:
+                Required Argument.
+                Specifies the table name or view name in Vantage.
+                Types: str
+
+            datalake_name:
+                Optional Argument.
+                Specifies the datalake name.
+                Types: str
+
+        EXAMPLES:
+            from teradataml.dataframe.dataframe import in_schema, DataFrame
+
+            # Example 1: The following example creates a DataFrame from the
+            #            existing Vantage table "dbcinfo" in the non-default
+            #            database "dbc" using the in_schema instance.
+            df = DataFrame(in_schema("dbc", "dbcinfo"))
+
+            # Example 2: The following example uses from_table() function, existing
+            #            Vantage table "dbcinfo" and non-default database "dbc" to
+            #            create a teradataml DataFrame.
+            df = DataFrame.from_table(in_schema("dbc","dbcinfo"))
+
+            # Example 3: The following example uses "in_schema" object created
+            #            with "datalake_name" argument to create DataFrame on OTF table.
+            otf_df = DataFrame(in_schema("datalake_db","datalake_table","datalake"))
+
+        """
+        self.schema_name = schema_name
+        self.table_name = table_name
+        self.datalake_name = datalake_name
+
+        awu_matrix = []
+        awu_matrix.append(["schema_name", schema_name, False, (str), True])
+        awu_matrix.append(["table_name", table_name, False, (str), True])
+        awu_matrix.append(["datalake_name", datalake_name, True, (str), True])
+
+        # Validate argument types
+        _Validators._validate_function_arguments(awu_matrix)
+
+    def __str__(self):
+        """
+        Returns the string representation of in_schema instance.
+        """
+        tbl_name = '{}.{}'.format(UtilFuncs._teradata_quote_arg(self.schema_name, "\"", False),
+                                  UtilFuncs._teradata_quote_arg(self.table_name, "\"", False))
+
+        if not self.datalake_name:
+            return tbl_name
+
+        return '{}.{}'.format(UtilFuncs._teradata_quote_arg(self.datalake_name, "\"", False), tbl_name)
+
+
+in_schema = in_schema
 
 
 class DataFrame():
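
The in_schema class added above is now also the entry point for datalake (OTF) tables. A minimal usage sketch based on the docstring examples (the datalake and table names are hypothetical placeholders):

    from teradataml import DataFrame
    from teradataml.dataframe.dataframe import in_schema

    # Non-default database access, as in earlier releases.
    df = DataFrame(in_schema("dbc", "dbcinfo"))

    # New: a third argument targets an OTF table; __str__ now renders the
    # three-part name "datalake"."schema"."table" when datalake_name is set.
    otf_df = DataFrame(in_schema("lake_db", "lake_tbl", "my_lake"))
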
@@ -166,6 +243,24 @@ class DataFrame():
        # Property to determine if table is an ART table or not.
        self._is_art = None
 
+        # This attribute stores the previous assign arguments in continuous assign calls.
+        self._previous_assign_args = None
+        # This attribute stores the root DataFrame columns.
+        self._root_columns = None
+
+        self._datalake = None
+        self._database = None
+        self._table = None
+        self._otf = False
+
+        if isinstance(table_name, in_schema):
+            self._table = table_name.table_name
+            self._datalake = table_name.datalake_name
+            self._database = table_name.schema_name
+            self._otf = True if self._datalake else False
+
+        table_name = str(table_name) if isinstance(table_name, in_schema) else table_name
+
        # Below matrix is list of list, where in each row contains following elements:
        # Let's take an example of following, just to get an idea:
        # [element1, element2, element3, element4, element5, element6]
@@ -198,25 +293,45 @@ class DataFrame():
            self._source_type = SourceType.TABLE.value
            self._nodeid = self._aed_utils._aed_table(self._table_name)
        elif query is not None:
+            query = query.strip()
+            query = query[:-1] if query[-1] == ";" else query
+
            self._query = query
            self._source_type = SourceType.QUERY.value
 
-
-
+            temp_obj_params = {
+                "prefix": "_frmqry_v",
+                "use_default_database": True,
+                "quote": False
+            }
+            __execute = UtilFuncs._create_view
+
+            if configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE:
+                # If user requests to materialize the query, then we should create a
+                # volatile table if user intends to the same instead of view.
+                # Volatile table does not need to be added to the GC.
+                temp_obj_params["table_type"] = TeradataConstants.TERADATA_VOLATILE_TABLE
+                temp_obj_params["gc_on_quit"] = False
+                temp_obj_params["prefix"] = "_frmqry_vt"
+                __execute = UtilFuncs._create_table
+
+            elif materialize:
+                # If user requests to materialize the query, then we should create a
                # table instead of view and add the same in the GarbageCollector.
-
-
-
-
-            temp_table_name = UtilFuncs._generate_temp_table_name(prefix="_frmqry_v", use_default_database=True,
-                                                                  quote=False)
+                temp_obj_params["table_type"] = TeradataConstants.TERADATA_TABLE
+                temp_obj_params["gc_on_quit"] = True
+                temp_obj_params["prefix"] = "_frmqry_t"
+                __execute = UtilFuncs._create_table
 
+            temp_table_name = UtilFuncs._generate_temp_table_name(**temp_obj_params)
            self._table_name = temp_table_name
+            __execute_params = (self._table_name, self._query)
+
+            if configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE:
+                __execute_params = (self._table_name, self._query, True)
+
            try:
-
-                UtilFuncs._create_table(self._table_name, self._query)
-            else:
-                UtilFuncs._create_view(self._table_name, self._query)
+                __execute(*__execute_params)
            except OperationalError as oe:
                if "[Error 3707] Syntax error" in str(oe):
                    raise ValueError(Messages.get_message(
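
DataFrame.from_query() above now funnels view, table, and volatile-table creation through a single __execute callable and strips a trailing semicolon from the query. A hedged sketch of the new knob (the TeradataConstants import path is an assumption; the option name comes from the hunk):

    from teradataml import DataFrame, configure
    from teradataml.common.constants import TeradataConstants  # assumed path

    # Default: the query is wrapped in a view prefixed "_frmqry_v".
    df_view = DataFrame.from_query("SELECT 1 AS c;")

    # Opt in to volatile tables; these skip the garbage collector because
    # they disappear with the session, and use the "_frmqry_vt" prefix.
    configure.temp_object_type = TeradataConstants.TERADATA_VOLATILE_TABLE
    df_vt = DataFrame.from_query("SELECT 1 AS c")
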
@@ -245,6 +360,9 @@ class DataFrame():
            self.__data = None
            self.__data_columns = None
            self._alias = None
+            self._plot = None
+
+            self._eda_ui = None
 
        except TeradataMlException:
            raise
@@ -334,7 +452,9 @@ class DataFrame():
        _Validators._validate_function_arguments(arg_info_matrix)
        try:
            alias_df = self._from_node(self._nodeid, self._metaexpr, self._index_label,
-
+                                       reuse_metaexpr=False, _datalake=self._datalake,
+                                       _database=self._database, _table=self._table,
+                                       _otf=self._otf)
            # Assigning self attributes to newly created alias dataframe.
            alias_df._table_name = self._table_name
            alias_df._index = self._index
@@ -350,7 +470,8 @@ class DataFrame():
 
    @classmethod
    @collect_queryband(queryband="DF_fromTable")
-    def from_table(cls, table_name, index=True, index_label=None):
+    def from_table(cls, table_name, index=True, index_label=None,
+                   schema_name=None, datalake_name=None):
        """
        Class method for creating a DataFrame from a table or a view.
 
@@ -371,30 +492,48 @@ class DataFrame():
                Column/s used for sorting.
                Types: str
 
+            schema_name:
+                Optional Argument.
+                Specifies the schema where the table resides.
+                Types: str
+
+            datalake_name:
+                Optional Argument.
+                Specifies the datalake name.
+                Types: str
+
        EXAMPLES:
-            from teradataml.dataframe.dataframe import DataFrame
+            >>> from teradataml.dataframe.dataframe import DataFrame
 
            # Example 1: The following example creates a DataFrame from a table or
            #            a view.
            # Load the example data.
-            load_example_data("dataframe","sales")
+            >>> load_example_data("dataframe","sales")
 
            # Create DataFrame from table
-            df = DataFrame.from_table('sales')
+            >>> df = DataFrame.from_table('sales')
 
            # Create DataFrame from table and without index column sorting.
-            df = DataFrame.from_table("sales", False)
+            >>> df = DataFrame.from_table("sales", False)
 
            # Create DataFrame from table and sorting using the 'accounts'
            # column.
-            df = DataFrame.from_table("sales", True, "accounts")
+            >>> df = DataFrame.from_table("sales", True, "accounts")
 
            # Example 2: The following example creates a DataFrame from existing Vantage
            #            table "dbcinfo" in the non-default database "dbc" using the
            #            in_schema() function.
 
-            from teradataml.dataframe.dataframe import in_schema
-            df = DataFrame.from_table(in_schema("dbc", "dbcinfo"))
+            >>> from teradataml.dataframe.dataframe import in_schema
+            >>> df = DataFrame.from_table(in_schema("dbc", "dbcinfo"))
+
+            # Example 3: Create a DataFrame on existing DataLake
+            #            table "lake_table" in the "datalake_database" database
+            #            in "datalake" datalake.
+
+            >>> datalake_df = DataFrame.from_table(table_name="lake_table",
+            ...                                    schema_name="datalake_database",
+            ...                                    datalake_name="datalake")
 
        RETURNS:
            DataFrame
@@ -403,6 +542,9 @@ class DataFrame():
            TeradataMlException - TDMLDF_CREATE_FAIL
 
        """
+        if schema_name:
+            return cls(in_schema(schema_name, table_name, datalake_name))
+
        return cls(table_name, index, index_label)
 
    @classmethod
@@ -462,7 +604,7 @@ class DataFrame():
        return cls(index=index, index_label=index_label, query=query, materialize=materialize)
 
    @classmethod
-    def _from_node(cls, nodeid, metaexpr, index_label=None, undropped_index=None, reuse_metaexpr=True):
+    def _from_node(cls, nodeid, metaexpr, index_label=None, undropped_index=None, reuse_metaexpr=True, **kwargs):
        """
        Private class method for creating a DataFrame from a nodeid and parent metadata.
 
@@ -543,6 +685,11 @@ class DataFrame():
               in [col.name for col in df._metaexpr.c] for elem in undropped_index):
            df._undropped_index = undropped_index
 
+        # Populate remaining attributes.
+        for arg in kwargs:
+            # Pop each argument from kwargs and assign to new DataFrame.
+            arg_value = kwargs.get(arg)
+            df.__setattr__(arg, arg_value)
        return df
 
    def create_temp_view(self, name):
@@ -670,9 +817,10 @@ class DataFrame():
        return self
 
    @collect_queryband(queryband="DF_fillna")
-    def fillna(self, value=None, columns=None, literal_value=False):
+    def fillna(self, value=None, columns=None, literal_value=False, partition_column=None):
        """
-
+        DESCRIPTION:
+            Method to replace the null values in a column with the value specified.
 
        PARAMETERS:
            value:
@@ -705,6 +853,12 @@ class DataFrame():
                Default Value: False
                Types: bool
 
+            partition_column:
+                Optional Argument.
+                Specifies the column name to partition the data.
+                Default Value: None
+                Types: str
+
        RETURNS:
            teradataml DataFrame
 
@@ -745,6 +899,26 @@ class DataFrame():
            3      Blue Inc   90.0   50   95.0  101.0  17/01/04
            4      Alpha Co  210.0  200  215.0  250.0  17/01/04
            5    Orange Inc  210.0   50    NaN  250.0  17/01/04
+
+            # Example 3: Populate the null value in 'pclass' and
+            #            'fare' column with mean value with partition
+            #            column as 'sex'.
+            # Load the example data.
+            >>> load_example_data("teradataml", ["titanic"])
+            >>> df = DataFrame.from_table("titanic")
+
+            >>> df.fillna(value="mean", columns=["pclass", "fare"], partition_column="sex")
+               passenger  survived  pclass                                         name     sex   age  sibsp  parch            ticket      fare cabin embarked
+            0        284         1       3                   Dorking, Mr. Edward Arthur    male  19.0      0      0        A/5. 10482    8.0500  None        S
+            1        589         0       3                        Gilinski, Mr. Eliezer    male  22.0      0      0             14973    8.0500  None        S
+            2         17         0       3                         Rice, Master. Eugene    male   2.0      4      1            382652   29.1250  None        Q
+            3        282         0       3             Olsson, Mr. Nils Johan Goransson    male  28.0      0      0            347464    7.8542  None        S
+            4        608         1       1                  Daniel, Mr. Robert Williams    male  27.0      0      0            113804   30.5000  None        S
+            5        404         0       3               Hakkarainen, Mr. Pekka Pietari    male  28.0      1      0  STON/O2. 3101279   15.8500  None        S
+            6        427         1       2  Clarke, Mrs. Charles V (Ada Maria Winfield)  female  28.0      1      0              2003   26.0000  None        S
+            7        141         0       3                Boulos, Mrs. Joseph (Sultana)  female   NaN      0      2              2678   15.2458  None        C
+            8        610         1       1                    Shutes, Miss. Elizabeth W  female  40.0      0      0          PC 17582  153.4625  C125        S
+            9        875         1       2        Abelson, Mrs. Samuel (Hannah Wizosky)  female  28.0      1      0         P/PP 3381   24.0000  None        C
        """
        from teradataml import SimpleImputeFit, SimpleImputeTransform
 
@@ -752,6 +926,7 @@ class DataFrame():
        arg_info_matrix.append(["value", value, True, (int, float, str, dict, list)])
        arg_info_matrix.append(["columns", columns, True, (list, str, tuple)])
        arg_info_matrix.append(["literal_value", literal_value, True, (bool)])
+        arg_info_matrix.append(["partition_column", partition_column, True, (str)])
 
        # Validate argument types
        _Validators._validate_function_arguments(arg_info_matrix)
@@ -823,9 +998,15 @@ class DataFrame():
                                    literals=literals,
                                    literals_columns=literals_columns,
                                    stats=stats,
-                                    stats_columns=stats_columns
+                                    stats_columns=stats_columns,
+                                    partition_column=partition_column)
 
-
+        impute_transform = {
+            'data': self,
+            'data_partition_column': partition_column,
+            'object_partition_column': partition_column}
+
+        return fit_obj.transform(**impute_transform).result
 
    def __execute_node_and_set_table_name(self, nodeid, metaexpr=None):
        """
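
fillna() with the new partition_column argument computes the imputation statistic per partition instead of globally, wiring the same column into SimpleImputeFit and both partition arguments of the transform step. A minimal sketch taken from the docstring example above (assumes the bundled titanic demo data):

    from teradataml import DataFrame, load_example_data

    load_example_data("teradataml", ["titanic"])
    df = DataFrame.from_table("titanic")

    # 'mean' is evaluated separately for each value of 'sex'.
    filled = df.fillna(value="mean", columns=["pclass", "fare"],
                       partition_column="sex")
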
@@ -924,6 +1105,7 @@ class DataFrame():
        self._column_names_and_types = []
        self._td_column_names_and_types = []
        self._td_column_names_and_sqlalchemy_types = {}
+        self._column_types = {}
 
        for col in self._metaexpr.c:
            if isinstance(col.type, sqlalchemy.sql.sqltypes.NullType):
@@ -931,9 +1113,11 @@ class DataFrame():
            else:
                tdtype = "{}".format(col.type)
 
-
+            py_type = UtilFuncs._teradata_type_to_python_type(col.type)
+            self._column_names_and_types.append((str(col.name), py_type))
            self._td_column_names_and_types.append((str(col.name), tdtype))
            self._td_column_names_and_sqlalchemy_types[(str(col.name)).lower()] = col.type
+            self._column_types[(str(col.name)).lower()] = [py_type, col.type]
 
    def _get_metaexpr(self):
        """
@@ -952,7 +1136,24 @@ class DataFrame():
        meta = sqlalchemy.MetaData()
        db_schema = UtilFuncs._extract_db_name(self._table_name)
        db_table_name = UtilFuncs._extract_table_name(self._table_name)
-        t = sqlalchemy.Table(db_table_name, meta, schema=db_schema, autoload_with=eng)
+        if not self._datalake:
+            t = sqlalchemy.Table(db_table_name, meta, schema=db_schema, autoload_with=eng)
+            return _MetaExpression(t)
+
+        # Get metaexpression for datalake table.
+        # check existence of datalake table.
+        tdmlctx.get_connection().dialect.has_table(tdmlctx.get_connection(),
+                                                   self._table,
+                                                   schema=self._database,
+                                                   table_only=True,
+                                                   datalake=self._datalake)
+
+        # Extract column names and corresponding teradatasqlalchemy types.
+        col_names, col_types = df_utils._get_datalake_table_columns_info(self._database,
+                                                                         self._table,
+                                                                         self._datalake)
+        t = sqlalchemy.Table(self._table, meta, schema=self._database,
+                             *(Column(col_name, col_type) for col_name, col_type in zip(col_names, col_types)))
        return _MetaExpression(t)
 
    def __getattr__(self, name):
@@ -2729,8 +2930,8 @@ class DataFrame():
            raise TeradataMlException(msg, errcode)
 
    @collect_queryband(queryband="DF_describe")
-    def describe(self, percentiles=[.25, .5, .75], include=None, verbose=False, distinct=False, statistics=None,
-                 columns=None):
+    def describe(self, percentiles=[.25, .5, .75], verbose=False, distinct=False, statistics=None,
+                 columns=None, pivot=False):
        """
        DESCRIPTION:
            Generates statistics for numeric columns. This function can be used in two modes:
@@ -2759,18 +2960,6 @@ class DataFrame():
                Default Values: [.25, .5, .75], which returns the 25th, 50th, and 75th percentiles.
                Types: float or List of floats
 
-            include:
-                Optional Argument.
-                Values can be either None or "all".
-                If the value is "all", then both numeric and non-numeric columns are included.
-                Computes count, mean, std, min, percentiles, and max for numeric columns.
-                Computes count and unique for non-numeric columns.
-                If the value is None, only numeric columns are used for collecting statistics.
-                Note:
-                    Value 'all' is not applicable for 'Time Series Aggregate Mode'.
-                Default Values: None
-                Types: str
-
            verbose:
                Optional Argument.
                Specifies a boolean value to be used for time series aggregation, stating whether to get
@@ -2797,7 +2986,6 @@ class DataFrame():
                Computes count and unique for non-numeric columns.
                Notes:
                    1. statistics is not applicable for 'Time Series Aggregate Mode'.
-                    2. statistics should not be used with include as 'all'.
                Permitted Values: count, mean, min, max, unique, std, describe, percentile
                Default Values: None
                Types: str or List of str
@@ -2807,7 +2995,14 @@ class DataFrame():
                Specifies the name(s) of the columns we are collecting statistics for.
                Default Values: None
                Types: str or List of str
-
+
+            pivot:
+                Optional Argument.
+                Specifies a boolean value to pivot the output.
+                Note:
+                    * "pivot" is not supported for PTI tables.
+                Default Values: 'False'
+                Types: bool
 
        RETURNS:
            teradataml DataFrame
@@ -2829,7 +3024,7 @@ class DataFrame():
            Orange Inc  210.0  None  None  250  04/01/2017
 
            # Computes count, mean, std, min, percentiles, and max for numeric columns.
-            >>> df.describe()
+            >>> df.describe(pivot=True)
                    Apr     Feb     Mar    Jan
            func
            count     4       6       4      4
@@ -2841,8 +3036,45 @@ class DataFrame():
            75%     250   207.5  158.75  162.5
            max     250     210     215    200
 
+            # Computes count, mean, std, min, percentiles, and max for numeric columns with
+            # default arguments.
+            >>> df.describe()
+            ATTRIBUTE  StatName            StatValue
+            Jan        MAXIMUM             200.0
+            Jan        STANDARD DEVIATION  62.91528696058958
+            Jan        PERCENTILES(25)     125.0
+            Jan        PERCENTILES(50)     150.0
+            Mar        COUNT               4.0
+            Mar        MINIMUM             95.0
+            Mar        MAXIMUM             215.0
+            Mar        MEAN                147.5
+            Mar        STANDARD DEVIATION  49.749371855331
+            Mar        PERCENTILES(25)     128.75
+            Mar        PERCENTILES(50)     140.0
+            Apr        COUNT               4.0
+            Apr        MINIMUM             101.0
+            Apr        MAXIMUM             250.0
+            Apr        MEAN                195.25
+            Apr        STANDARD DEVIATION  70.97123830585646
+            Apr        PERCENTILES(25)     160.25
+            Apr        PERCENTILES(50)     215.0
+            Apr        PERCENTILES(75)     250.0
+            Feb        COUNT               6.0
+            Feb        MINIMUM             90.0
+            Feb        MAXIMUM             210.0
+            Feb        MEAN                166.66666666666666
+            Feb        STANDARD DEVIATION  59.553897157672786
+            Feb        PERCENTILES(25)     117.5
+            Feb        PERCENTILES(50)     200.0
+            Feb        PERCENTILES(75)     207.5
+            Mar        PERCENTILES(75)     158.75
+            Jan        PERCENTILES(75)     162.5
+            Jan        MEAN                137.5
+            Jan        MINIMUM             50.0
+            Jan        COUNT               4.0
+
            # Computes count, mean, std, min, percentiles, and max for numeric columns with 30th and 60th percentiles.
-            >>> df.describe(percentiles=[.3, .6])
+            >>> df.describe(percentiles=[.3, .6], pivot=True)
                    Apr     Feb     Mar    Jan
            func
            count     4       6       4      4
@@ -2855,7 +3087,7 @@ class DataFrame():
 
            # Computes count, mean, std, min, percentiles, and max for numeric columns group by "datetime" and "Feb".
            >>> df1 = df.groupby(["datetime", "Feb"])
-            >>> df1.describe()
+            >>> df1.describe(pivot=True)
                                 Jan   Mar   Apr
            datetime   Feb func
            04/01/2017 90.0 25%   50    95   101
@@ -2883,22 +3115,6 @@ class DataFrame():
                                min   200  215   250
                                std   None None  0
 
-            # Computes count, mean, std, min, percentiles, and max for numeric columns and
-            # computes count and unique for non-numeric columns
-            >>> df.describe(include="all")
-                   accounts      Feb     Jan     Mar     Apr  datetime
-            func
-            25%        None    117.5     125  128.75  160.25      None
-            75%        None    207.5   162.5  158.75     250      None
-            count         6        6       4       4       4         6
-            mean       None  166.667   137.5   147.5  195.25      None
-            max        None      210     200     215     250      None
-            min        None       90      50      95     101      None
-            50%        None      200     150     140     215      None
-            std        None   59.554  62.915  49.749  70.971      None
-            unique        6     None    None    None    None         1
-
-            #
            # Examples for describe() function as Time Series Aggregate.
            #
            >>> # Load the example datasets.
@@ -3081,15 +3297,15 @@ class DataFrame():
            >>>
        """
 
-        # Argument validations
+        # -------------Argument validations---------------#
        awu_matrix = []
        awu_matrix.append(["columns", columns, True, (str, list), True])
        awu_matrix.append(["percentiles", percentiles, True, (float, list)])
-        awu_matrix.append(["include", include, True, (str), True, [None, "all"]])
        awu_matrix.append(["verbose", verbose, True, (bool)])
        awu_matrix.append(["distinct", distinct, True, (bool)])
        awu_matrix.append(["statistics", statistics, True, (str, list), True,
                           ["count", "mean", "min", "max", "unique", "std", "describe", "percentile"]])
+        awu_matrix.append(["pivot", pivot, True, (bool)])
 
        # Validate argument types
        _Validators._validate_function_arguments(awu_matrix)
@@ -3108,22 +3324,11 @@ class DataFrame():
        if statistics:
            statistics = [stats.lower() for stats in UtilFuncs._as_list(statistics)]
 
-        # Argument include and statistics should not be used together
-        if include is not None and statistics is not None:
-            raise ValueError(Messages.get_message(MessageCodes.CANNOT_USE_TOGETHER_WITH).format(
-                'include', 'statistics'
-            ))
-
        # Percentiles must be a list of values between 0 and 1.
        if not isinstance(percentiles, list) or not all(p > 0 and p < 1 for p in percentiles):
            raise ValueError(Messages.get_message(MessageCodes.INVALID_ARG_VALUE, percentiles, "percentiles",
                                                  "percentiles must be a list of values between 0 and 1"))
 
-        # Argument 'include' with value 'all' is not allowed for DataFrameGroupByTime
-        if include is not None and include.lower() == "all" and isinstance(self, DataFrameGroupByTime):
-            raise ValueError(Messages.get_message(MessageCodes.ARG_VALUE_CLASS_DEPENDENCY).format(
-                'include', 'Aggregation', 'all', 'describe()', 'DataFrame or DataFrameGroupBy'))
-
        # Argument 'statistics' is not allowed for DataFrameGroupByTime
        if statistics is not None and isinstance(self, DataFrameGroupByTime):
            raise ValueError(Messages.get_message(MessageCodes.ARG_VALUE_CLASS_DEPENDENCY).format(
@@ -3133,26 +3338,31 @@ class DataFrame():
        if verbose and not isinstance(self, DataFrameGroupByTime):
            raise ValueError(Messages.get_message(MessageCodes.ARG_VALUE_CLASS_DEPENDENCY).format(
                'verbose', 'Aggregation', 'True', 'describe()', 'DataFrameGroupByTime'))
+        # -------------End of argument validations---------------#
 
        function_label = "func"
+        sort_cols = []
        try:
            self.__execute_node_and_set_table_name(self._nodeid)
 
            groupby_column_list = None
-            if isinstance(self, DataFrameGroupBy):
+            if isinstance(self, DataFrameGroupByTime) or isinstance(self, DataFrameGroupBy):
                groupby_column_list = self.groupby_column_list
-
-
+                if columns:
+                    df_utils._validate_describe_columns(columns=columns, metaexpr=self._metaexpr,
+                                                        groupby_column_list=groupby_column_list)
+                sort_cols = list(groupby_column_list)
 
-
-
-            df_utils._invalid_describe_column(df=self, columns=columns, metaexpr=self._metaexpr,
-                                              groupby_column_list=groupby_column_list)
+            # 'func' column will be always there in result.
+            sort_cols.append(function_label)
 
+            # Handle DataFrameGroupByTime using union all approach and
+            # other DataFrames using TD_UnivariateStatistics approach.
+            if isinstance(self, DataFrameGroupByTime):
                # Construct the aggregate query.
                agg_query = df_utils._construct_describe_query(df=self, columns=columns, metaexpr=self._metaexpr,
                                                               percentiles=percentiles, function_label=function_label,
-                                                               groupby_column_list=groupby_column_list, include=include,
+                                                               groupby_column_list=groupby_column_list, include=None,
                                                               is_time_series_aggregate=True, verbose=verbose,
                                                               distinct=distinct,
                                                               timebucket_duration=self._timebucket_duration,
@@ -3160,29 +3370,99 @@ class DataFrame():
                                                               timecode_column=self._timecode_column,
                                                               sequence_column=self._sequence_column,
                                                               fill=self._fill)
+
+                if groupby_column_list is not None:
+                    df = DataFrame.from_query(agg_query, index_label=sort_cols)
+                    df2 = df.sort(sort_cols)
+                    df2._metaexpr._n_rows = 100
+                    describe_df = df2
+                else:
+                    describe_df = DataFrame.from_query(agg_query, index_label=function_label)
+
+                # Check if numeric overflow can occur for result DataFrame.
+                if self._check_numeric_overflow(describe_df):
+                    result_df = self._promote_dataframe_types()
+                    describe_df = result_df.describe(pivot=True)
+                return describe_df
+
            else:
-                #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                # If pivot is True, then construct the aggregate query and return the result DataFrame.
+                # Otherwise, return the result DataFrame in the regular aggregate mode using UnivariateStatistics.
+
+                if pivot:
+                    # Construct the aggregate query.
+                    agg_query = df_utils._construct_describe_query(df=self, columns=columns, metaexpr=self._metaexpr,
+                                                                   percentiles=percentiles, function_label=function_label,
+                                                                   groupby_column_list=groupby_column_list, include=None,
+                                                                   is_time_series_aggregate=False, verbose=verbose,
+                                                                   distinct=distinct, statistics=statistics)
+
+                    if groupby_column_list is not None:
+                        sort_cols = [i for i in groupby_column_list]
+                        sort_cols.append(function_label)
+                        df = DataFrame.from_query(agg_query, index_label=sort_cols)
+                        df2 = df.sort(sort_cols)
+                        df2._metaexpr._n_rows = 100
+                        describe_df = df2
+                    else:
+                        describe_df = DataFrame.from_query(agg_query, index_label=function_label)
+
+                    # Check if numeric overflow can occur for result DataFrame.
+                    if self._check_numeric_overflow(describe_df):
+                        result_df = self._promote_dataframe_types()
+                        describe_df = result_df.describe(pivot=True)
+
+                    return describe_df
+
+                # If columns is None, then all dataframe columns are considered.
+                if columns is None:
+                    columns = self.columns
+                # Exclude groupby columns
+                if groupby_column_list is not None:
+                    columns = [col for col in columns if col not in groupby_column_list]
+
+                numeric_cols = []
+
+                # Extract numeric columns and their types of all columns
+                for col in self._metaexpr.c:
+                    if type(col.type) in UtilFuncs()._get_numeric_datatypes() and \
+                            col.name in columns:
+                        numeric_cols.append(col.name)
+
+                if numeric_cols:
+                    # Default statistics for 'Regular Aggregate Mode'
+                    sql_stat = ["COUNT", "MAXIMUM", "MEAN", "MINIMUM", "PERCENTILES", "STANDARD DEVIATION"]
+
+                    if statistics is not None:
+                        py_to_sql_func_map = {"count": "COUNT",
+                                              "max": "MAXIMUM",
+                                              "mean": "MEAN",
+                                              "unique": 'UNIQUE ENTITY COUNT',
+                                              "min": "MINIMUM",
+                                              "percentile": "PERCENTILES",
+                                              "std": "STANDARD DEVIATION"}
+                        # Convert statistics into corresponding SQL function names
+                        sql_stat = [py_to_sql_func_map[stat] for stat in UtilFuncs()._as_list(statistics)]
+
+                    # Convert percentiles to centiles for univariate statistics
+                    centiles = list(map(lambda n: int(n * 100), percentiles))
+
+                    # UnivariateStatistics parameters
+                    univar_param = {
+                        "newdata": self.select(self.columns),
+                        "target_columns": numeric_cols,
+                        "partition_columns": groupby_column_list,
+                        "centiles": centiles,
+                        "stats": sql_stat
+                    }
+
+                    from teradataml import UnivariateStatistics
+                    # Run UnivariateStatistics
+                    aggr_df = UnivariateStatistics(**univar_param).result
+
+                    # Return the result in teradataml format
+                    return aggr_df
 
-            # Check if numeric overflow can occur for result DataFrame.
-            if self._check_numeric_overflow(describe_df):
-                result_df = self._promote_dataframe_types()
-                describe_df = result_df.describe()
-            return describe_df
        except TeradataMlException:
            raise
        except Exception as err:
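
The default shape of describe() changes in this release: the regular (non-time-series) path now delegates to the UnivariateStatistics function and returns a long ATTRIBUTE/StatName/StatValue result, while pivot=True reproduces the old wide layout. A sketch of the translation the new code performs (names copied from the hunk above):

    # Python-style statistic names map to SQL function names ...
    py_to_sql_func_map = {"count": "COUNT", "max": "MAXIMUM", "mean": "MEAN",
                          "unique": "UNIQUE ENTITY COUNT", "min": "MINIMUM",
                          "percentile": "PERCENTILES", "std": "STANDARD DEVIATION"}
    # ... and fractional percentiles become integer centiles.
    centiles = [int(p * 100) for p in [.25, .5, .75]]   # -> [25, 50, 75]
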
@@ -5269,8 +5549,10 @@ class DataFrame():
                Specifies the function(s) to apply on DataFrame columns.
 
                Valid values for func are:
-                'count', 'sum', 'min', 'max', 'mean', 'std', 'percentile', 'unique',
-                'median', 'var'
+                    * 'count', 'sum', 'min', 'max', 'mean', 'std', 'percentile', 'percentile_<floatvalue>', 'unique',
+                      'median', 'var'
+                    * Note: In 'percentile_<floatvalue>', <floatvalue> specifies the desired percentile value to
+                      calculate aggregate. It should be in the range of 0.0 to 1.0 (both inclusive).
 
                Acceptable formats for function(s) are
                string, dictionary, list of strings/functions/ColumnExpression or ColumnExpression.
@@ -5304,12 +5586,17 @@ class DataFrame():
                Output column names after the above operation are:
                min_employee_no, sum_employee_no, var_employee_no, min_first_name
 
-                4. "func" passed as a ColumnExpression built using the aggregate functions.
+                4. "percentile_<floatvalue>" passed to agg.
+                    >>> df.agg({'employee_no' : ['percentile_0.25', 'percentile_0.75', 'min']})
+                    >>> df.agg(['percentile_0.25', 'percentile_0.75', 'sum'])
+                    >>> df.agg('percentile_0.25')
+
+                5. "func" passed as a ColumnExpression built using the aggregate functions.
                >>> df.agg(df.first_name.count())
                Output column name after the above operation is:
                count(first_name)
 
-                5. "func" passed as a list of ColumnExpression built using the aggregate functions.
+                6. "func" passed as a list of ColumnExpression built using the aggregate functions.
                >>> df.agg([df.employee_no.min(), df.first_name.count()])
                Output column names after the above operation are:
                min(employee_no), count(first_name)
@@ -5397,6 +5684,12 @@ class DataFrame():
                   min_employee_no  sum_employee_no  var_employee_no  min_first_name
                0              100              313        44.333333            abcd
 
+                # Get the minimum, 25 percentile value and variance of employee number, by passing dictionary of
+                # column names to string function/list of string functions as parameter.
+                >>> df.agg({'employee_no' : ['min', 'percentile_0.25', 'var']})
+                   min_employee_no  percentile_0.25_employee_no  var_employee_no
+                0              100                          100        44.333333
+
                # Get the minimum and sum of all the columns in the dataframe,
                # by passing list of string functions as parameter.
                >>> df.agg(['min', 'sum'])
@@ -5442,9 +5735,15 @@ class DataFrame():
                   mean_employee_no  unique_employee_no  unique_first_name  mean_joined_date  unique_joined_date
                0        104.333333                   3                  2          60/12/04                   2
 
+                # Get the percentile of each column in the dataframe with default value 0.5.
                >>> df.agg('percentile')
-
-
+                   percentile_employee_no  percentile_marks
+                0                     101              None
+
+                # Get 80 percentile of each column in the dataframe.
+                >>> df.agg('percentile_0.8')
+                   percentile_0.8_employee_no  percentile_0.8_marks
+                0                         107                  None
 
                # Using another table 'sales' (having repeated values) to demonstrate operations
                # 'unique' and 'percentile'.
@@ -5461,9 +5760,11 @@ class DataFrame():
                Blue Inc    90.0   50   95   101  2017-04-01
                Red Inc    200.0  150  140  None  2017-04-01
 
-
-
-
+                # Get 80 and 40 percentile values of each column in the dataframe.
+                >>> df1 = df.select(['Feb', 'Jan', 'Mar', 'Apr'])
+                >>> df1.agg(['percentile_0.8', 'percentile_0.4'])
+                   percentile_0.8_Feb  percentile_0.4_Feb  percentile_0.8_Jan  percentile_0.4_Jan  percentile_0.8_Mar  percentile_0.4_Mar  percentile_0.8_Apr  percentile_0.4_Apr
+                0               210.0               200.0                 170                 150                 170                 140                 250                 194
 
                >>> df.agg('unique')
                   unique_accounts  unique_Feb  unique_Jan  unique_Mar  unique_Apr  unique_datetime
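
agg() now understands 'percentile_<floatvalue>' tokens alongside the existing function names. A short sketch mirroring the docstring examples above (df is the DataFrame used there):

    # <floatvalue> must be between 0.0 and 1.0 inclusive; result columns are
    # named percentile_<floatvalue>_<column>.
    df.agg('percentile_0.8')
    df.select(['Feb', 'Jan', 'Mar', 'Apr']).agg(['percentile_0.8', 'percentile_0.4'])
    df.agg({'employee_no': ['min', 'percentile_0.25', 'var']})
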
@@ -5650,6 +5951,8 @@ class DataFrame():
 
        except TeradataMlException:
            raise
+        except ValueError:
+            raise
        except Exception as err:
            raise TeradataMlException(Messages.get_message(
                MessageCodes.EXECUTION_FAILED, "perform {} on DataFrame".format(operation), str(err)),
@@ -5765,7 +6068,35 @@ class DataFrame():
 
    def _repr_html_(self):
        """ Print method for teradataml for iPython rich display. """
+        self._generate_output_html()
+        if display.enable_ui:
+            # EDA Ui widget representation using teradatamlwidgets
+            if self._eda_ui is None:
+                from teradatamlwidgets.eda.Ui import Ui
+                self._eda_ui = Ui(df=self, html=self.html)
+            else:
+                self._eda_ui.display_ui()
+        return self.html
+
+    def get_eda_ui(self):
+        """
+        Returns the EDA representation UI.
 
+        PARAMETERS:
+            None.
+
+        EXCEPTIONS:
+            None.
+
+        RETURNS:
+            teradatamlwidgets.eda.Ui
+
+        EXAMPLE:
+            df = ui.get_eda_ui()
+        """
+        return self._eda_ui
+
+    def _generate_output_html(self, disable_types=True):
        # Check if class attributes __data and __data_columns are not None.
        # If not None, reuse the data and columns.
        # If None, generate latest results.
@@ -5778,17 +6109,25 @@ class DataFrame():
        dindent = indent + indent
 
        header_html = ['<style type="text/css">',
-                       'table {border:ridge 5px;}',
+                       'table { border:ridge 5px}',
                       'table td {border:inset 1px;}',
-                       'table tr#HeaderRow {background-color:grey; color:white;}'
+                       'table tr#HeaderRow {background-color:grey; color:white;}',
                       '</style>\n'
                       ]
        html = "\n{0}".format(indent).join(header_html)
-        html += '<html><table>\n{0}<tr id="HeaderRow">\n'.format(indent)
+        html += '<html><table style="min-width:1000px;">\n{0}<tr id="HeaderRow">\n'.format(indent)
 
-        columns_html = "</th>\n{0}<th>".format(dindent).join(self.__data_columns)
-        html += "{0}<th>{1}</th>\n".format(dindent, columns_html)
-        html += "{0}</tr>\n".format(indent)
+        columns_html = "</th><th>".join(self.__data_columns)
+        html += "<th>{0}</th>\n".format(columns_html)
+        html += "</tr>\n"
+
+        if not disable_types:
+            html += '<tr>\n'.format(indent)
+            col_types = [repr(self._td_column_names_and_sqlalchemy_types[column]) for column in
+                         self.__data_columns]
+            columns_types_html = "</td>\n{0}<td>".format(dindent).join(col_types)
+            html += "{0}<td>{1}</td>\n".format(dindent, columns_types_html)
+            html += "{0}</tr>\n".format(indent)
 
        for row in self.__data:
            row_html = ["{0}<td>{1}</td>\n".format(dindent,
@@ -5796,8 +6135,31 @@ class DataFrame():
        html += "{1}<tr>\n{0}{1}</tr>\n".format("".join(row_html), indent)
 
        html += "</table></html>"
+        self.html = html
 
-        return html
+    def get_output(self, output_index=0):
+        """
+        DESCRIPTION:
+            Returns the result of analytic function when analytic function is
+            run from 'Analyze' tab in EDA UI.
+            Note:
+                * The function does not return anything if analytic function is
+                  not run from EDA UI.
+
+        PARAMETERS:
+            output_index:
+                Optional Argument.
+                Specifies the index of the output dataframe to be returned.
+                Default Value: 0
+                Types: int
+
+        RAISES:
+            IndexError
+
+        RETURNS:
+            teradataml DataFrame object.
+        """
+        return self._eda_ui.get_output_dataframe(output_index=output_index)
 
    def __get_data_columns(self):
        """
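
_repr_html_ now builds its HTML through _generate_output_html() and, when display.enable_ui is set, hands the frame to the teradatamlwidgets EDA UI; get_eda_ui() and get_output() expose that widget and its results. A hedged sketch of the notebook flow this enables (behavior summarized from the hunks above, not verified):

    from teradataml import DataFrame, display

    display.enable_ui = True      # opt in to the EDA widget
    df = DataFrame("sales")
    df                            # rich display: builds df.html, shows the Ui
    ui = df.get_eda_ui()          # teradatamlwidgets.eda.Ui instance
    out = df.get_output(0)        # result of a function run from the Analyze tab
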
@@ -6857,7 +7219,8 @@ class DataFrame():
|
|
|
6857
7219
|
compiled_condition = condition.compile(compile_kwargs={'include_table': True,
|
|
6858
7220
|
'literal_binds': True,
|
|
6859
7221
|
'table_name_kind': '_join_alias',
|
|
6860
|
-
'compile_with_caller_table': True
|
|
7222
|
+
'compile_with_caller_table': True,
|
|
7223
|
+
'table_only': True})
|
|
6861
7224
|
|
|
6862
7225
|
all_join_conditions.append(compiled_condition)
|
|
6863
7226
|
|
|
@@ -7399,7 +7762,7 @@ class DataFrame():
|
|
|
7399
7762
|
"""
|
|
7400
7763
|
return (type(None), int, float, str, decimal.Decimal, ColumnExpression, ClauseElement)
|
|
7401
7764
|
|
|
7402
|
-
def _generate_assign_metaexpr_aed_nodeid(self, drop_columns, **kwargs):
|
|
7765
|
+
def _generate_assign_metaexpr_aed_nodeid(self, drop_columns, node_id, **kwargs):
|
|
7403
7766
|
"""
|
|
7404
7767
|
DESCRIPTION:
|
|
7405
7768
|
Function generates the MetaExpression and AED nodeid for DataFrame.assign()
|
|
@@ -7412,6 +7775,11 @@ class DataFrame():
|
|
|
7412
7775
|
Default Value: False
|
|
7413
7776
|
Types: bool
|
|
7414
7777
|
|
|
7778
|
+
node_id:
|
|
7779
|
+
Optional Argument.
|
|
7780
|
+
Specifies the input nodeid for the assign operation.
|
|
7781
|
+
Types: str
|
|
7782
|
+
|
|
7415
7783
|
kwargs:
|
|
7416
7784
|
keyword, value pairs
|
|
7417
7785
|
- keywords are the column names.
|
|
@@ -7439,7 +7807,7 @@ class DataFrame():
|
|
|
7439
7807
|
|
|
7440
7808
|
# Join the expressions in result.
|
|
7441
7809
|
assign_expression = ', '.join(list(map(lambda x: x[1], result)))
|
|
7442
|
-
new_nodeid = self._aed_utils._aed_assign(
|
|
7810
|
+
new_nodeid = self._aed_utils._aed_assign(node_id,
|
|
7443
7811
|
assign_expression,
|
|
7444
7812
|
AEDConstants.AED_ASSIGN_DROP_EXISITING_COLUMNS.value)
|
|
7445
7813
|
|
|
@@ -7571,14 +7939,14 @@ class DataFrame():
|
|
|
7571
7939
|
_Validators._check_auth_token("udf")
|
|
7572
7940
|
for colname, col in udf_expr.items():
|
|
7573
7941
|
env_name = UtilFuncs._get_env_name(col)
|
|
7574
|
-
# Store the env_name and its corresponding output column
|
|
7942
|
+
# Store the env_name and its corresponding output column
|
|
7575
7943
|
if env_name in env_mapper:
|
|
7576
7944
|
env_mapper[env_name].append(colname)
|
|
7577
7945
|
else:
|
|
7578
7946
|
env_mapper[env_name] = [colname]
|
|
7579
7947
|
else:
|
|
7580
7948
|
env_mapper[env_name] = udf_expr.keys()
|
|
7581
|
-
|
|
7949
|
+
debug = False
|
|
7582
7950
|
for env_name, cols in env_mapper.items():
|
|
7583
7951
|
# Create a dictionary of output columns to column type.
|
|
7584
7952
|
returns = OrderedDict([(column.name, column.type) for column in df._metaexpr.c])
|
|
@@ -7589,6 +7957,7 @@ class DataFrame():
|
|
|
7589
7957
|
# Create a dictionary of output column name to udf arguments
|
|
7590
7958
|
function_args = {}
|
|
7591
7959
|
for colname, col in udf_expr.items():
|
|
7960
|
+
debug |= col._debug
|
|
7592
7961
|
delimiter = col._delimiter
|
|
7593
7962
|
quotechar = col._quotechar
|
|
7594
7963
|
if colname in cols:
|
|
@@ -7621,15 +7990,17 @@ class DataFrame():
|
|
|
7621
7990
|
columns_definitions=columns_definitions,
|
|
7622
7991
|
output_type_converters={
|
|
7623
7992
|
col_name: _Dtypes._teradata_type_to_python_type(col_type)
|
|
7624
|
-
for col_name, col_type in returns.items()}
|
|
7993
|
+
for col_name, col_type in returns.items()},
|
|
7994
|
+
debug=debug
|
|
7995
|
+
)
|
|
7625
7996
|
|
|
7626
7997
|
df = tbl_operators.execute()
|
|
7627
7998
|
return df
|
|
7628
|
-
|
|
7999
|
+
|
|
7629
8000
|
def _assign_call_udf(self, call_udf_expr):
|
|
7630
8001
|
"""
|
|
7631
8002
|
DESCRIPTION:
|
|
7632
|
-
Internal function for DataFrame.assign() to execute the call_udf using
|
|
8003
|
+
Internal function for DataFrame.assign() to execute the call_udf using
|
|
7633
8004
|
Script/Apply Table Operator and create new column for teradataml DataFrame.
|
|
7634
8005
|
|
|
7635
8006
|
PARAMETER:
|
|
@@ -7656,7 +8027,7 @@ class DataFrame():
|
|
|
7656
8027
|
# Create a dictionary of output columns to column type (python types).
|
|
7657
8028
|
output_type_converters = {col_name: _Dtypes._teradata_type_to_python_type(col_type) \
|
|
7658
8029
|
for col_name, col_type in returns.items()}
|
|
7659
|
-
|
|
8030
|
+
|
|
7660
8031
|
for colname, col in call_udf_expr.items():
|
|
7661
8032
|
returns[colname] = col.type
|
|
7662
8033
|
output_type_converters[colname] = _Dtypes._teradata_type_to_python_type(col.type)
|
|
@@ -7782,7 +8153,7 @@ class DataFrame():
                Look at Example 18 to understand more.
            8. While passing multiple udf expressions, one can not pass one column output
               as another column input in the same ``assign`` call.
-           9. If user pass multiple udf expressions, delimiter and quotechar specified in
+           9. If user pass multiple udf expressions, delimiter and quotechar specified in
               last udf expression are considered for processing.
 
         RAISES:
@@ -8147,13 +8518,13 @@ class DataFrame():
            Red Inc    200.0  150.0  140.0    NaN  17/01/04  201.0  abc  RED INC  207
            >>>
 
-           # Example 19: Convert the values is 'accounts' column to upper case using a user
+           # Example 19: Convert the values is 'accounts' column to upper case using a user
            # defined function on Vantage Cloud Lake.
            # Create a Python 3.10.5 environment with given name and description in Vantage.
            >>> env = create_env('test_udf', 'python_3.10.5', 'Test environment for UDF')
            User environment 'test_udf' created.
            >>>
-           # Create a user defined functions to 'to_upper' to get the values in upper case
+           # Create a user defined functions to 'to_upper' to get the values in upper case
            # and pass the user env to run it on.
            >>> from teradataml.dataframe.functions import udf
            >>> @udf(env_name = env)
@@ -8165,7 +8536,7 @@ class DataFrame():
            # to the DataFrame.
            >>> df.assign(upper_stats = to_upper('accounts'))
                          Feb    Jan    Mar    Apr  datetime upper_stats
-           accounts
+           accounts
            Alpha Co    210.0  200.0  215.0  250.0  17/01/04    ALPHA CO
            Blue Inc     90.0   50.0   95.0  101.0  17/01/04    BLUE INC
            Yellow Inc   90.0    NaN    NaN    NaN  17/01/04  YELLOW INC
@@ -8184,12 +8555,12 @@ class DataFrame():
            # Register the created user defined function with name "upper".
            >>> register("upper", to_upper)
            >>>
-           # Call the user defined function registered with name "upper" and assign the
+           # Call the user defined function registered with name "upper" and assign the
            # ColumnExpression returned to the DataFrame.
            >>> res = df.assign(upper_col = call_udf("upper", ('accounts',)))
            >>> res
                          Feb    Jan    Mar    Apr  datetime   upper_col
-           accounts
+           accounts
            Alpha Co    210.0  200.0  215.0  250.0  17/01/04    ALPHA CO
            Blue Inc     90.0   50.0   95.0  101.0  17/01/04    BLUE INC
            Yellow Inc   90.0    NaN    NaN    NaN  17/01/04  YELLOW INC
@@ -8263,8 +8634,34 @@ class DataFrame():
         # from udf expression.
         if bool(regular_expr):
             try:
-
+                root_node_id = None
+                root_df_col = df.columns
+
+                # Get the previous node type, if it is assign and drop_columns is False,
+                # then check if the previous assign arguments exists and are not present
+                # in either the root dataframe columns or the current assign arguments.
+                # if these conditions are met, obtain the root node id (i.e., the first
+                # node of the assign operation) and merge the previous assign arguments with the current ones.
+
+                prev_node_type = df._aed_utils._aed_get_node_query_type(df._nodeid)
+                if not drop_columns and prev_node_type == "assign" and df._previous_assign_args is not None:
+                    if not df._root_columns & df._previous_assign_args.keys() and \
+                       not df._previous_assign_args.keys() & regular_expr.keys():
+                        # Get the root node id and root dataframe columns.
+                        root_df_col = df._root_columns
+                        root_node_id = df._aed_utils._aed_get_parent_nodeids(df._nodeid)[0]
+                        regular_expr = {**df._previous_assign_args, **regular_expr}
+
+                # If root_node_id is None, assign the current node id as root node of assign operation
+                node_id = root_node_id if root_node_id is not None else df._nodeid
+
+                # Generate new meta expression and node id for the new dataframe.
+                (new_meta, new_nodeid) = df._generate_assign_metaexpr_aed_nodeid(
+                    drop_columns, node_id = node_id, **regular_expr)
                 df = df._create_dataframe_from_node(new_nodeid, new_meta, df._index_label)
+                df._previous_assign_args = regular_expr
+                df._root_columns = root_df_col
+
             except Exception as err:
                 errcode = MessageCodes.TDMLDF_INFO_ERROR
                 msg = Messages.get_message(MessageCodes.TDMLDF_INFO_ERROR)
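Note: the hunk above is the heart of the new assign-node collapsing. When the previous node is also a non-dropping assign and the old and new column sets do not collide, the new assign is rewired onto the root node with the argument dictionaries merged, instead of chaining one AED node per call. Illustrative teradataml usage (the column expressions are invented; the observable result is unchanged, only the node graph shrinks):

    >>> df2 = df.assign(feb_half = df.Feb / 2)       # assign node created on df
    >>> df3 = df2.assign(feb_double = df.Feb * 2)    # merged onto df's assign node,
    ...                                              # not stacked on top of df2's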
@@ -8475,7 +8872,9 @@ class DataFrame():
         _Validators._validate_column_exists_in_dataframe(keys, self._metaexpr)
 
         try:
-
+
+            # Slicing creates a new list instance with the same contents.
+            new_index_list = self._index_label[:] if self._index_label is not None else []
 
             # Creating a list with requested index labels bases on append
             if append:
@@ -8490,7 +8889,7 @@ class DataFrame():
                 new_index_list = keys
 
             # Takes care of appending already existing index
-            new_index_list = list(
+            new_index_list = list(dict.fromkeys(new_index_list))
 
             # In case requested index is same as existing index, return same DF
             if new_index_list == self._index_label:
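Note: `dict.fromkeys` is an order-preserving de-duplication (dicts keep insertion order in Python 3.7+), which is exactly what the index-append logic above needs. Standalone illustration:

    >>> labels = ['buoyid', 'timecode', 'buoyid', 'salinity', 'timecode']
    >>> list(dict.fromkeys(labels))
    ['buoyid', 'timecode', 'salinity']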
@@ -9373,15 +9772,15 @@ class DataFrame():
            TypeError, ValueError, TeradataMLException
 
        EXAMPLES:
-
-
+           # Load the example datasets.
+           >>> load_example_data("dataframe", ["ocean_buoys", "ocean_buoys_nonpti"])
            >>>
 
-
-
-
-
-
+           # Create the required DataFrames.
+           # DataFrame on non-sequenced PTI table
+           >>> ocean_buoys = DataFrame("ocean_buoys")
+           # Check DataFrame columns and let's peek at the data
+           >>> ocean_buoys.columns
            ['buoyid', 'TD_TIMECODE', 'temperature', 'salinity']
            >>> ocean_buoys.head()
                               TD_TIMECODE  temperature  salinity
@@ -9397,10 +9796,10 @@ class DataFrame():
            0  2014-01-06 08:00:00.000000         10.0        55
            0  2014-01-06 08:10:00.000000         10.0        55
 
-
-
-
-
+           # DataFrame on NON-PTI table
+           >>> ocean_buoys_nonpti = DataFrame("ocean_buoys_nonpti")
+           # Check DataFrame columns and let's peek at the data
+           >>> ocean_buoys_nonpti.columns
            ['buoyid', 'timecode', 'temperature', 'salinity']
            >>> ocean_buoys_nonpti.head()
                   buoyid  temperature  salinity
@@ -9974,6 +10373,15 @@ class DataFrame():
        # If user did not pass any arguments which form join conditions,
        # Merge is performed using index columns of TeradataML DataFrames
        if on is None and left_on is None and right_on is None and not use_index:
+            # DataFrames created on OTF table will not have index.
+            if self._datalake is not None or right._datalake is not None:
+                msg_code = MessageCodes.EXECUTION_FAILED
+                emsg = "Either 'on' argument or both 'left_on' and 'right_on' arguments" \
+                       " must be provided to merge DataFrames when they are created on" \
+                       " OTF table(s)."
+                error_msg = Messages.get_message(msg_code, "merge dataframes", emsg)
+                raise TeradataMlException(error_msg, msg_code)
+
            if self._index_label is None or right._index_label is None:
                raise TeradataMlException(
                    Messages.get_message(MessageCodes.TDMLDF_INDEXES_ARE_NONE), MessageCodes.TDMLDF_INDEXES_ARE_NONE)
@@ -9981,6 +10389,12 @@ class DataFrame():
            use_index = True
 
        if use_index:
+            if self._datalake is not None or right._datalake is not None:
+                msg_code = MessageCodes.EXECUTION_FAILED
+                emsg = "Can not use Index to merge DataFrames when they are created on OTF table(s)."
+                error_msg = Messages.get_message(msg_code, "merge dataframes", emsg)
+                raise TeradataMlException(error_msg, msg_code)
+
            if self._index_label is None or right._index_label is None:
                raise TeradataMlException(
                    Messages.get_message(MessageCodes.TDMLDF_INDEXES_ARE_NONE), MessageCodes.TDMLDF_INDEXES_ARE_NONE)
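Note: the two guards above together mean index-based merges are unavailable for DataFrames created on OTF (datalake) tables; explicit join keys are required. A hedged usage sketch (the DataFrame and column names are invented):

    >>> # 'sales' and 'returns' assumed to be DataFrames created on OTF tables.
    >>> sales.merge(right=returns, on='order_id', how='inner')   # OK: explicit join key
    >>> sales.merge(right=returns, use_index=True)               # raises TeradataMlException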
@@ -10636,7 +11050,7 @@ class DataFrame():
                    2. seed is supported for stratify column.
                    3. Arguments "stratify_column", "seed", "id_column" are supported only
                       for stratifying the data.
-               Types: str
+               Types: str OR Feature
 
            seed:
                Optional Argument.
@@ -10662,7 +11076,7 @@ class DataFrame():
                       for stratifying the data.
                    2. "id_column" is supported only when "stratify_column" is used.
                       Ignored otherwise.
-               Types: str
+               Types: str OR Feature
 
        RETURNS:
            teradataml DataFrame
@@ -11191,6 +11605,10 @@ class DataFrame():
        DESCRIPTION:
            Function to apply a user defined function to each row in the
            teradataml DataFrame, leveraging Vantage's Script Table Operator.
+           Notes:
+               1. The function requires to use same Python version in both Vantage and local environment.
+               2. Teradata recommends to use "dill" package with same version in both Vantage and
+                  local environment.
 
        PARAMETERS:
            user_function:
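Note: the new version-matching requirement is easy to check up front. A minimal local probe, assuming `dill` is installed (the printed versions are illustrative):

    >>> import sys, dill
    >>> print('.'.join(map(str, sys.version_info[:3])), dill.__version__)
    3.10.5 0.3.8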
@@ -11371,6 +11789,15 @@ class DataFrame():
                Default Value: True
                Types: bool
 
+           debug:
+               Optional Argument.
+               Specifies whether to display the script file path generated during function execution or not. This
+               argument helps in debugging when there are any failures during function execution. When set
+               to True, function displays the path of the script and does not remove the file from local file system.
+               Otherwise, file is removed from the local file system.
+               Default Value: False
+               Types: bool
+
        RETURNS:
            1. teradataml DataFrame if exec_mode is "IN-DB".
            2. Pandas DataFrame if exec_mode is "LOCAL".
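Note: per the docstring just added, `debug=True` prints the generated script path and leaves the file on disk for inspection. A hedged usage sketch (the row function is invented):

    >>> def upcase_accounts(row):
    ...     row['accounts'] = row['accounts'].upper()
    ...     return row
    >>> df.map_row(upcase_accounts, debug=True)   # prints script path, keeps the local file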
@@ -11523,6 +11950,7 @@ class DataFrame():
        sort_ascending = kwargs.pop('sort_ascending', True)
        auth = kwargs.pop('auth', None)
        charset = kwargs.pop('charset', None)
+        debug = kwargs.pop('debug', False)
 
        # Check for other extra/unknown arguments.
        unknown_args = list(kwargs.keys())
@@ -11541,7 +11969,7 @@ class DataFrame():
                                         sort_ascending=sort_ascending,
                                         returns=returns, delimiter=delimiter,
                                         quotechar=quotechar, auth=auth,
-                                        charset=charset, num_rows=num_rows)
+                                        charset=charset, num_rows=num_rows, debug=debug)
 
        return tbl_op_util.execute()
 
@@ -11558,6 +11986,10 @@ class DataFrame():
        DESCRIPTION:
            Function to apply a user defined function to a group or partition of rows
            in the teradataml DataFrame, leveraging Vantage's Script Table Operator.
+           Notes:
+               1. The function requires to use same Python version in both Vantage and local environment.
+               2. Teradata recommends to use "dill" package with same version in both Vantage and
+                  local environment.
 
        PARAMETERS:
            user_function:
@@ -11768,6 +12200,15 @@ class DataFrame():
                Default Value: True
                Types: bool
 
+           debug:
+               Optional Argument.
+               Specifies whether to display the script file path generated during function execution or not. This
+               argument helps in debugging when there are any failures during function execution. When set
+               to True, function displays the path of the script and does not remove the file from local file system.
+               Otherwise, file is removed from the local file system.
+               Default Value: False
+               Types: bool
+
        RETURNS:
            1. teradataml DataFrame if exec_mode is "IN-DB".
            2. Pandas DataFrame if exec_mode is "LOCAL".
@@ -11933,6 +12374,7 @@ class DataFrame():
        sort_ascending = kwargs.pop('sort_ascending', True)
        auth = kwargs.pop('auth', None)
        charset = kwargs.pop('charset', None)
+        debug = kwargs.pop('debug', False)
 
        # Check for other extra/unknown arguments.
        unknown_args = list(kwargs.keys())
@@ -11951,7 +12393,7 @@ class DataFrame():
                                         sort_ascending=sort_ascending,
                                         returns=returns, delimiter=delimiter,
                                         quotechar=quotechar, auth=auth,
-                                        charset=charset, num_rows=num_rows)
+                                        charset=charset, num_rows=num_rows, debug=debug)
 
        return tbl_op_util.execute()
 
@@ -11968,9 +12410,9 @@ class DataFrame():
            teradataml DataFrame, leveraging Apply Table Operator of Open
            Analytics Framework.
            Notes:
-
-
-
+               1. The function requires to use same Python version in both remote environment and local environment.
+               2. Teradata recommends to use "dill" package with same version in both remote environment and
+                  local environment.
 
        PARAMETERS:
            user_function:
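Note: for `apply` the same matching applies to the remote Open Analytics environment rather than Vantage itself; the remote Python version is pinned when the environment is created, as in the Example 19 snippet earlier in this diff:

    >>> env = create_env('test_udf', 'python_3.10.5', 'Test environment for UDF')
    >>> # Run apply from a local Python 3.10.5 interpreter, with the same dill
    >>> # version installed both locally and in 'test_udf'.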
@@ -12153,6 +12595,15 @@ class DataFrame():
                Default value: "csv"
                Types: str
 
+           debug:
+               Optional Argument.
+               Specifies whether to display the script file path generated during function execution or not. This
+               argument helps in debugging when there are any failures during function execution. When set
+               to True, function displays the path of the script and does not remove the file from local file system.
+               Otherwise, file is removed from the local file system.
+               Default Value: False
+               Types: bool
+
        RETURNS:
            teradataml DataFrame.
 
@@ -12329,6 +12780,7 @@ class DataFrame():
        is_local_order = kwargs.pop('is_local_order', False)
        nulls_first = kwargs.pop('nulls_first', True)
        sort_ascending = kwargs.pop('sort_ascending', True)
+        debug = kwargs.pop('debug', False)
 
        # Check for other extra/unknown arguments.
        unknown_args = list(kwargs.keys())
@@ -12351,7 +12803,8 @@ class DataFrame():
                                         charset=None,
                                         num_rows=num_rows,
                                         env_name=env_name,
-                                        style=style
+                                        style=style,
+                                        debug=debug)
 
        return tbl_op_util.execute()
 
@@ -12696,8 +13149,8 @@ class DataFrame():
        _Validators._validate_column_exists_in_dataframe(column_names, self._metaexpr,
                                                         False)
        column_names = list(dict.fromkeys(column_names))
-
-        if list_td_reserved_keywords(column_names):
+
+        if list_td_reserved_keywords(column_names) or UtilFuncs._is_ascii(column_names):
            column_names = UtilFuncs._teradata_quote_arg(column_names, "\"", False)
 
        col_names_types = df_utils._get_required_columns_types_from_metaexpr(self._metaexpr, column_names)
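Note: the widened condition now quotes column names in more cases than reserved keywords alone. The exact contract of `UtilFuncs._is_ascii` is not visible in this hunk, so the following is only a plain-Python sketch of the presumed intent (quote names that are reserved words or not plain ASCII identifiers); the reserved-word set is a made-up stand-in:

    def needs_quoting(name, reserved=frozenset({'SELECT', 'TABLE', 'USER'})):
        # Quote reserved words and anything that is not a plain ASCII identifier.
        return name.upper() in reserved or not name.isascii() or not name.isidentifier()

    quoted = ['"{}"'.format(n) if needs_quoting(n) else n
              for n in ['accounts', 'USER', 'straße']]
    # ['accounts', '"USER"', '"straße"']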
@@ -14617,7 +15070,18 @@ class DataFrame():
            >>> plot.show()
 
        """
-
+
+        _plot = _Plot(x=x, y=y, scale=scale, kind=kind, **kwargs)
+        # If plot is already generated, return the same plot.
+        if self._plot is None:
+            self._plot = _plot
+            return _plot
+
+        if self._plot == _plot:
+            return self._plot
+        else:
+            self._plot = _plot
+            return _plot
 
    @collect_queryband(queryband="DF_itertuples")
    def itertuples(self, name='Row', num_rows=None):
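Note: the new `plot` body is a cache-by-equality pattern: a candidate `_Plot` is always built, but the previously cached instance is returned when the two compare equal, which presumes `_Plot` defines value-based equality. A generic sketch of the pattern:

    class CachedPlotter:
        def __init__(self):
            self._plot = None

        def plot(self, **spec):
            candidate = dict(spec)              # stand-in for a _Plot object
            if self._plot is not None and self._plot == candidate:
                return self._plot               # unchanged spec: reuse cached plot
            self._plot = candidate              # first or changed spec: cache anew
            return candidate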
@@ -15057,7 +15521,7 @@ class DataFrameGroupBy(DataFrame):
        from sqlalchemy.sql.functions import Function
        return (type(None), int, float, str, decimal.Decimal, Function, ColumnExpression, ClauseElement)
 
-    def _generate_assign_metaexpr_aed_nodeid(self, drop_columns, **kwargs):
+    def _generate_assign_metaexpr_aed_nodeid(self, drop_columns, node_id, **kwargs):
        """
        DESCRIPTION:
            Function generates the MetaExpression and AED nodeid for DataFrameGroupBy.assign()
@@ -15070,6 +15534,11 @@ class DataFrameGroupBy(DataFrame):
                and grouping columns are returned. This is unused argument.
                Types: bool
 
+           node_id:
+               Optional Argument.
+               Specifies the input nodeid for the assign operation. This is unused argument.
+               Types: str
+
            kwargs:
                keyword, value pairs
                - keywords are the column names.
@@ -17510,11 +17979,18 @@ class _TDUAF(DataFrame):
        table_name = self._db_utils._execute_node_return_db_object_name(self._data._nodeid, self._data._metaexpr)
 
        # UAF Functions do not accept double quotes.
+        tdp = preparer(td_dialect)
        db_name = UtilFuncs._extract_db_name(table_name)
-
-
+        datalake_name = UtilFuncs._extract_datalake_name(table_name)
+        if datalake_name:
+            table_name = '{}.{}.{}'.format(tdp.quote(datalake_name),
+                                           tdp.quote(db_name),
+                                           tdp.quote(UtilFuncs._extract_table_name(table_name)))
+        elif db_name:
+            table_name = '{}.{}'.format(tdp.quote(db_name),
+                                        tdp.quote(UtilFuncs._extract_table_name(table_name)))
        else:
-            table_name = UtilFuncs._extract_table_name(table_name)
+            table_name = tdp.quote(UtilFuncs._extract_table_name(table_name))
 
        sql_clauses.append("TABLE_NAME ({})")
        sql_values.append(table_name)
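Note: the rewritten branch assembles an optionally three-part quoted name (datalake.db.table) via the dialect preparer. A standalone sketch of the same dotted-name assembly, with a simple double-quote escaper standing in for teradataml's preparer:

    def quote(ident):
        # Double-quote an identifier, doubling any embedded quotes.
        return '"{}"'.format(ident.replace('"', '""'))

    def qualified_name(table, db=None, datalake=None):
        parts = [p for p in (datalake, db, table) if p]
        return '.'.join(quote(p) for p in parts)

    print(qualified_name('ocean_buoys', db='mydb', datalake='mylake'))
    # "mylake"."mydb"."ocean_buoys"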