teradataml 20.0.0.2__py3-none-any.whl → 20.0.0.4__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/README.md +315 -2
- teradataml/__init__.py +4 -0
- teradataml/_version.py +1 -1
- teradataml/analytics/analytic_function_executor.py +95 -8
- teradataml/analytics/byom/__init__.py +1 -1
- teradataml/analytics/json_parser/metadata.py +12 -3
- teradataml/analytics/json_parser/utils.py +7 -2
- teradataml/analytics/sqle/__init__.py +5 -1
- teradataml/analytics/table_operator/__init__.py +1 -1
- teradataml/analytics/uaf/__init__.py +1 -1
- teradataml/analytics/utils.py +4 -0
- teradataml/analytics/valib.py +18 -4
- teradataml/automl/__init__.py +51 -6
- teradataml/automl/data_preparation.py +59 -35
- teradataml/automl/data_transformation.py +58 -33
- teradataml/automl/feature_engineering.py +27 -12
- teradataml/automl/model_training.py +73 -46
- teradataml/common/constants.py +88 -29
- teradataml/common/garbagecollector.py +2 -1
- teradataml/common/messagecodes.py +19 -3
- teradataml/common/messages.py +6 -1
- teradataml/common/sqlbundle.py +64 -12
- teradataml/common/utils.py +246 -47
- teradataml/common/warnings.py +11 -0
- teradataml/context/context.py +161 -27
- teradataml/data/amazon_reviews_25.csv +26 -0
- teradataml/data/byom_example.json +11 -0
- teradataml/data/dataframe_example.json +18 -2
- teradataml/data/docs/byom/docs/DataRobotPredict.py +2 -2
- teradataml/data/docs/byom/docs/DataikuPredict.py +40 -1
- teradataml/data/docs/byom/docs/H2OPredict.py +2 -2
- teradataml/data/docs/byom/docs/ONNXEmbeddings.py +242 -0
- teradataml/data/docs/byom/docs/ONNXPredict.py +2 -2
- teradataml/data/docs/byom/docs/PMMLPredict.py +2 -2
- teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/Shap.py +34 -6
- teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +4 -4
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
- teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
- teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +4 -1
- teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
- teradataml/data/hnsw_alter_data.csv +5 -0
- teradataml/data/hnsw_data.csv +10 -0
- teradataml/data/jsons/byom/h2opredict.json +1 -1
- teradataml/data/jsons/byom/onnxembeddings.json +266 -0
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +0 -1
- teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
- teradataml/data/jsons/sqle/20.00/TD_HNSW.json +296 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWPredict.json +206 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWSummary.json +32 -0
- teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
- teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
- teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
- teradataml/data/jsons/uaf/17.20/TD_SAX.json +3 -1
- teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +15 -5
- teradataml/data/medical_readings.csv +101 -0
- teradataml/data/patient_profile.csv +101 -0
- teradataml/data/scripts/lightgbm/dataset.template +157 -0
- teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
- teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
- teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
- teradataml/data/scripts/sklearn/sklearn_fit.py +194 -167
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
- teradataml/data/scripts/sklearn/sklearn_function.template +14 -19
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
- teradataml/data/scripts/sklearn/sklearn_transform.py +129 -42
- teradataml/data/target_udt_data.csv +8 -0
- teradataml/data/templates/open_source_ml.json +3 -2
- teradataml/data/teradataml_example.json +8 -0
- teradataml/data/vectordistance_example.json +4 -0
- teradataml/dataframe/copy_to.py +8 -3
- teradataml/dataframe/data_transfer.py +11 -1
- teradataml/dataframe/dataframe.py +1049 -285
- teradataml/dataframe/dataframe_utils.py +152 -20
- teradataml/dataframe/functions.py +578 -35
- teradataml/dataframe/setop.py +11 -6
- teradataml/dataframe/sql.py +185 -16
- teradataml/dbutils/dbutils.py +1049 -115
- teradataml/dbutils/filemgr.py +48 -1
- teradataml/hyperparameter_tuner/optimizer.py +12 -1
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/__init__.py +1 -1
- teradataml/opensource/_base.py +1466 -0
- teradataml/opensource/_class.py +464 -0
- teradataml/opensource/{sklearn/constants.py → _constants.py} +21 -14
- teradataml/opensource/_lightgbm.py +949 -0
- teradataml/opensource/_sklearn.py +1008 -0
- teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +5 -6
- teradataml/options/__init__.py +54 -38
- teradataml/options/configure.py +131 -27
- teradataml/options/display.py +13 -2
- teradataml/plot/axis.py +47 -8
- teradataml/plot/figure.py +33 -0
- teradataml/plot/plot.py +63 -13
- teradataml/scriptmgmt/UserEnv.py +5 -5
- teradataml/scriptmgmt/lls_utils.py +130 -40
- teradataml/store/__init__.py +12 -0
- teradataml/store/feature_store/__init__.py +0 -0
- teradataml/store/feature_store/constants.py +291 -0
- teradataml/store/feature_store/feature_store.py +2318 -0
- teradataml/store/feature_store/models.py +1505 -0
- teradataml/table_operators/Apply.py +32 -18
- teradataml/table_operators/Script.py +3 -1
- teradataml/table_operators/TableOperator.py +3 -1
- teradataml/table_operators/query_generator.py +3 -0
- teradataml/table_operators/table_operator_query_generator.py +3 -1
- teradataml/table_operators/table_operator_util.py +37 -38
- teradataml/table_operators/templates/dataframe_register.template +69 -0
- teradataml/utils/dtypes.py +51 -2
- teradataml/utils/internal_buffer.py +18 -0
- teradataml/utils/validators.py +99 -8
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/METADATA +321 -5
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/RECORD +121 -94
- teradataml/libaed_0_1.dylib +0 -0
- teradataml/libaed_0_1.so +0 -0
- teradataml/opensource/sklearn/__init__.py +0 -1
- teradataml/opensource/sklearn/_class.py +0 -255
- teradataml/opensource/sklearn/_sklearn_wrapper.py +0 -1800
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/zip-safe +0 -0
--- teradataml/dataframe/dataframe.py (20.0.0.2)
+++ teradataml/dataframe/dataframe.py (20.0.0.4)
@@ -19,6 +19,10 @@ import pandas as pd
 import re
 import sqlalchemy
 import sys
+import urllib.parse
+
+from sqlalchemy import Column
+
 import teradataml.context.context as tdmlctx
 
 from collections import OrderedDict, namedtuple
@@ -30,6 +34,7 @@ from teradataml.dataframe.sql_interfaces import ColumnExpression
 from teradataml.dataframe.sql_functions import case
 from teradataml.series.series import Series
 from teradatasqlalchemy.types import _TDType, BIGINT, INTEGER, PERIOD_TIMESTAMP, SMALLINT, BYTEINT, FLOAT, DECIMAL
+from teradataml.common.deprecations import argument_deprecation
 from teradataml.common.utils import UtilFuncs
 from teradataml.common.exceptions import TeradataMlException
 from teradataml.common.messages import Messages
@@ -41,9 +46,11 @@ from teradataml.dataframe.dataframe_utils import DataFrameUtils as df_utils, Dat
 from teradataml.dataframe.indexer import _LocationIndexer
 from teradataml.common.aed_utils import AedUtils
 from teradataml.options.display import display
+from teradataml.options.configure import configure
 from teradataml.dataframe.copy_to import copy_to_sql
 from teradataml.dataframe.row import _Row
 from teradataml.dataframe.setop import concat
+from teradataml.dbutils.dbutils import list_td_reserved_keywords
 from teradataml.plot.plot import _Plot
 from teradataml.scriptmgmt.UserEnv import UserEnv
 from teradataml.utils.dtypes import _Dtypes, _ListOf, _TupleOf
@@ -57,10 +64,83 @@ from teradataml.common.bulk_exposed_utils import _validate_unimplemented_functio
 from teradataml.telemetry_utils.queryband import collect_queryband
 from teradataml.options.configure import configure
 from teradataml.utils.internal_buffer import _InternalBuffer
+from teradataml.common.constants import OutputStyle
 
 # TODO use logger when available on master branch
 # logger = teradatapylog.getLogger()
-
+
+
+class in_schema:
+    """
+    Class takes a schema name, a table name and datalake name attributes
+    and creates an object that can be passed to DataFrame.
+    Note:
+        teradataml recommends to use this class to access table(s)/view(s)
+        from a database other than the default database.
+    """
+    def __init__(self, schema_name, table_name, datalake_name=None):
+        """
+        Constructor for in_schema class.
+
+        PARAMETERS:
+            schema_name:
+                Required Argument.
+                Specifies the schema where the table resides in.
+                Types: str
+
+            table_name:
+                Required Argument.
+                Specifies the table name or view name in Vantage.
+                Types: str
+
+            datalake_name:
+                Optional Argument.
+                Specifies the datalake name.
+                Types: str
+
+        EXAMPLES:
+            from teradataml.dataframe.dataframe import in_schema, DataFrame
+
+            # Example 1: The following example creates a DataFrame from the
+            #            existing Vantage table "dbcinfo" in the non-default
+            #            database "dbc" using the in_schema instance.
+            df = DataFrame(in_schema("dbc", "dbcinfo"))
+
+            # Example 2: The following example uses from_table() function, existing
+            #            Vantage table "dbcinfo" and non-default database "dbc" to
+            #            create a teradataml DataFrame.
+            df = DataFrame.from_table(in_schema("dbc","dbcinfo"))
+
+            # Example 3: The following example uses "in_schema" object created
+            #            with "datalake_name" argument to create DataFrame on OTF table.
+            otf_df = DataFrame(in_schema("datalake_db","datalake_table","datalake"))
+
+        """
+        self.schema_name = schema_name
+        self.table_name = table_name
+        self.datalake_name = datalake_name
+
+        awu_matrix = []
+        awu_matrix.append(["schema_name", schema_name, False, (str), True])
+        awu_matrix.append(["table_name", table_name, False, (str), True])
+        awu_matrix.append(["datalake_name", datalake_name, True, (str), True])
+
+        # Validate argument types
+        _Validators._validate_function_arguments(awu_matrix)
+
+    def __str__(self):
+        """
+        Returns the string representation of in_schema instance.
+        """
+        tbl_name = '{}.{}'.format(UtilFuncs._teradata_quote_arg(self.schema_name, "\"", False),
+                                  UtilFuncs._teradata_quote_arg(self.table_name, "\"", False))
+
+        if not self.datalake_name:
+            return tbl_name
+
+        return '{}.{}'.format(UtilFuncs._teradata_quote_arg(self.datalake_name, "\"", False), tbl_name)
+
+
+in_schema = in_schema
 
 
 class DataFrame():
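The in_schema class added above is the entry point for the new datalake/OTF support that threads through this file. A minimal usage sketch, assuming a connected teradataml session and that the referenced objects exist (the datalake names below are placeholders):

    from teradataml import DataFrame, in_schema

    # Schema-qualified access: str(in_schema(...)) renders "dbc"."dbcinfo".
    df = DataFrame(in_schema("dbc", "dbcinfo"))

    # Datalake (OTF) access: a third argument prepends the datalake name,
    # so str(...) renders "datalake"."datalake_db"."datalake_table".
    otf_df = DataFrame(in_schema("datalake_db", "datalake_table", "datalake"))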
@@ -163,6 +243,19 @@ class DataFrame():
             # Property to determine if table is an ART table or not.
             self._is_art = None
 
+            self._datalake = None
+            self._database = None
+            self._table = None
+            self._otf = False
+
+            if isinstance(table_name, in_schema):
+                self._table = table_name.table_name
+                self._datalake = table_name.datalake_name
+                self._database = table_name.schema_name
+                self._otf = True if self._datalake else False
+
+            table_name = str(table_name) if isinstance(table_name, in_schema) else table_name
+
             # Below matrix is list of list, where in each row contains following elements:
             # Let's take an example of following, just to get an idea:
             # [element1, element2, element3, element4, element5, element6]
@@ -195,25 +288,45 @@ class DataFrame():
                 self._source_type = SourceType.TABLE.value
                 self._nodeid = self._aed_utils._aed_table(self._table_name)
             elif query is not None:
+                query = query.strip()
+                query = query[:-1] if query[-1] == ";" else query
+
                 self._query = query
                 self._source_type = SourceType.QUERY.value
 
-                if materialize:
-                    # If user requests to materialize the query, then we should create a
+                temp_obj_params = {
+                    "prefix": "_frmqry_v",
+                    "use_default_database": True,
+                    "quote": False
+                }
+                __execute = UtilFuncs._create_view
+
+                if configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE:
+                    # If user requests to materialize the query, then we should create a
+                    # volatile table if user intends to the same instead of view.
+                    # Volatile table does not need to be added to the GC.
+                    temp_obj_params["table_type"] = TeradataConstants.TERADATA_VOLATILE_TABLE
+                    temp_obj_params["gc_on_quit"] = False
+                    temp_obj_params["prefix"] = "_frmqry_vt"
+                    __execute = UtilFuncs._create_table
+
+                elif materialize:
+                    # If user requests to materialize the query, then we should create a
                     # table instead of view and add the same in the GarbageCollector.
-
-
-
-
-                temp_table_name = UtilFuncs._generate_temp_table_name(prefix="_frmqry_v", use_default_database=True,
-                                                                      quote=False)
+                    temp_obj_params["table_type"] = TeradataConstants.TERADATA_TABLE
+                    temp_obj_params["gc_on_quit"] = True
+                    temp_obj_params["prefix"] = "_frmqry_t"
+                    __execute = UtilFuncs._create_table
 
+                temp_table_name = UtilFuncs._generate_temp_table_name(**temp_obj_params)
                 self._table_name = temp_table_name
+                __execute_params = (self._table_name, self._query)
+
+                if configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE:
+                    __execute_params = (self._table_name, self._query, True)
+
                 try:
-                    if materialize:
-                        UtilFuncs._create_table(self._table_name, self._query)
-                    else:
-                        UtilFuncs._create_view(self._table_name, self._query)
+                    __execute(*__execute_params)
                 except OperationalError as oe:
                     if "[Error 3707] Syntax error" in str(oe):
                         raise ValueError(Messages.get_message(
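The refactor above replaces the old if/else around the create call with a parameter dict plus a callable chosen up front, so the third temp-object flavor (volatile tables, driven by configure.temp_object_type) slots in without duplicating the execution path. A self-contained sketch of the same dispatch pattern, with print stand-ins for the UtilFuncs helpers:

    # Stand-ins for UtilFuncs._create_view / UtilFuncs._create_table.
    def create_view(name, query):
        print("CREATE VIEW {} AS {}".format(name, query))

    def create_table(name, query, volatile=False):
        kind = "VOLATILE TABLE" if volatile else "TABLE"
        print("CREATE {} {} AS ({}) WITH DATA".format(kind, name, query))

    def build_temp_object(query, use_volatile=False, materialize=False):
        prefix, execute, extra = "_frmqry_v", create_view, ()
        if use_volatile:
            # Volatile tables vanish with the session, so no GC registration.
            prefix, execute, extra = "_frmqry_vt", create_table, (True,)
        elif materialize:
            # Permanent table; the real code registers it for garbage collection.
            prefix, execute, extra = "_frmqry_t", create_table, ()
        name = prefix + "_1"  # the real code generates a unique name
        execute(name, query, *extra)
        return name

    build_temp_object("SELECT 1 AS c", use_volatile=True)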
@@ -229,7 +342,7 @@ class DataFrame():
 
                 self._nodeid = self._aed_utils._aed_query(self._query, temp_table_name)
             else:
-                if inspect.stack()[1][3] not in ['_from_node', '__init__']:
+                if inspect.stack()[1][3] not in ['_from_node', '__init__', 'alias']:
                     raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_CREATE_FAIL),
                                               MessageCodes.TDMLDF_CREATE_FAIL)
 
@@ -241,6 +354,10 @@ class DataFrame():
             self._iloc = _LocationIndexer(self, integer_indexing=True)
             self.__data = None
             self.__data_columns = None
+            self._alias = None
+            self._plot = None
+
+            self._eda_ui = None
 
         except TeradataMlException:
             raise
@@ -250,9 +367,106 @@ class DataFrame():
             raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_CREATE_FAIL),
                                       MessageCodes.TDMLDF_CREATE_FAIL) from err
 
+    @property
+    def db_object_name(self):
+        """
+        DESCRIPTION:
+            Get the underlying database object name, on which DataFrame is
+            created.
+
+        RETURNS:
+            str representing object name of DataFrame
+
+        EXAMPLES:
+            >>> load_example_data("dataframe", "sales")
+            >>> df = DataFrame('sales')
+            >>> df.db_object_name
+            '"sales"'
+        """
+        if self._table_name is not None:
+            return self._table_name
+        else:
+            msg = "Object name is available once DataFrame is materialized. " \
+                  "Use DataFrame.materialize() to materialize DataFrame."
+            print(msg)
+
+    def alias(self, alias_name):
+        """
+        DESCRIPTION:
+            Method to create an aliased teradataml DataFrame.
+            Note:
+                * This method is recommended to be used before performing
+                  self join using DataFrame's join() API.
+
+        PARAMETERS:
+            alias_name:
+                Required Argument.
+                Specifies the alias name to be assigned to a teradataml DataFrame.
+                Types: str
+
+        RETURNS:
+            teradataml DataFrame
+
+        EXAMPLES:
+            >>> load_example_data("dataframe", "admissions_train")
+            >>> df = DataFrame("admissions_train")
+            >>> df
+               masters   gpa     stats programming  admitted
+            id
+            13      no  4.00  Advanced      Novice         1
+            26     yes  3.57  Advanced    Advanced         1
+            5       no  3.44    Novice      Novice         0
+            19     yes  1.98  Advanced    Advanced         0
+            15     yes  4.00  Advanced    Advanced         1
+            40     yes  3.95    Novice    Beginner         0
+            7      yes  2.33    Novice      Novice         1
+            22     yes  3.46    Novice    Beginner         0
+            36      no  3.00  Advanced      Novice         0
+            38     yes  2.65  Advanced    Beginner         1
+
+            # Example 1: Create an alias of teradataml DataFrame.
+
+            >>> df2 = df.alias("adm_trn")
+
+            # Print aliased DataFrame.
+            >>> df2
+               masters   gpa     stats programming  admitted
+            id
+            13      no  4.00  Advanced      Novice         1
+            26     yes  3.57  Advanced    Advanced         1
+            5       no  3.44    Novice      Novice         0
+            19     yes  1.98  Advanced    Advanced         0
+            15     yes  4.00  Advanced    Advanced         1
+            40     yes  3.95    Novice    Beginner         0
+            7      yes  2.33    Novice      Novice         1
+            22     yes  3.46    Novice    Beginner         0
+            36      no  3.00  Advanced      Novice         0
+            38     yes  2.65  Advanced    Beginner         1
+        """
+        arg_info_matrix = [["alias_name", alias_name, False, (str), True]]
+        _Validators._validate_function_arguments(arg_info_matrix)
+        try:
+            alias_df = self._from_node(self._nodeid, self._metaexpr, self._index_label,
+                                       reuse_metaexpr=False, _datalake=self._datalake,
+                                       _database=self._database, _table=self._table,
+                                       _otf=self._otf)
+            # Assigning self attributes to newly created alias dataframe.
+            alias_df._table_name = self._table_name
+            alias_df._index = self._index
+            alias_df._index_label = self._index_label
+            setattr(alias_df._metaexpr.t, "table_alias", alias_name)
+            alias_df._alias = alias_name
+            return alias_df
+        except Exception as err:
+            error_code = MessageCodes.EXECUTION_FAILED
+            error_msg = Messages.get_message(
+                error_code, "create alias dataFrame", '{}'.format(str(err)))
+            raise TeradataMlException(error_msg, error_code)
+
     @classmethod
     @collect_queryband(queryband="DF_fromTable")
-    def from_table(cls, table_name, index=True, index_label=None):
+    def from_table(cls, table_name, index=True, index_label=None,
+                   schema_name=None, datalake_name=None):
         """
         Class method for creating a DataFrame from a table or a view.
 
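alias() above is the building block for the self-join support described later in join() (see its Example 10). A short usage sketch, assuming a connected session with the admissions_train demo data loaded:

    from teradataml import DataFrame, load_example_data

    load_example_data("dataframe", "admissions_train")
    df = DataFrame("admissions_train")

    # Same node and metadata, but the metaexpression is tagged with the alias,
    # so both sides stay distinguishable in the generated SQL.
    rhs = df.alias("rhs")
    selfjoin = df.join(other=rhs, on=df.id == rhs.id, how="inner",
                       lprefix="l", rprefix="r")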
@@ -273,30 +487,48 @@ class DataFrame():
                 Column/s used for sorting.
                 Types: str
 
+            schema_name:
+                Optional Argument.
+                Specifies the schema where the table resides.
+                Types: str
+
+            datalake_name:
+                Optional Argument.
+                Specifies the datalake name.
+                Types: str
+
         EXAMPLES:
-            from teradataml.dataframe.dataframe import DataFrame
+            >>> from teradataml.dataframe.dataframe import DataFrame
 
             # Example 1: The following example creates a DataFrame from a table or
             #            a view.
             # Load the example data.
-            load_example_data("dataframe","sales")
+            >>> load_example_data("dataframe","sales")
 
             # Create DataFrame from table
-            df = DataFrame.from_table('sales')
+            >>> df = DataFrame.from_table('sales')
 
             # Create DataFrame from table and without index column sorting.
-            df = DataFrame.from_table("sales", False)
+            >>> df = DataFrame.from_table("sales", False)
 
             # Create DataFrame from table and sorting using the 'accounts'
             # column.
-            df = DataFrame.from_table("sales", True, "accounts")
+            >>> df = DataFrame.from_table("sales", True, "accounts")
 
             # Example 2: The following example creates a DataFrame from existing Vantage
             #            table "dbcinfo" in the non-default database "dbc" using the
             #            in_schema() function.
 
-            from teradataml.dataframe.dataframe import in_schema
-            df = DataFrame.from_table(in_schema("dbc", "dbcinfo"))
+            >>> from teradataml.dataframe.dataframe import in_schema
+            >>> df = DataFrame.from_table(in_schema("dbc", "dbcinfo"))
+
+            # Example 3: Create a DataFrame on existing DataLake
+            #            table "lake_table" in the "datalake_database" database
+            #            in "datalake" datalake.
+
+            >>> datalake_df = DataFrame.from_table(table_name="lake_table",
+            ...                                    schema_name="datalake_database",
+            ...                                    datalake_name="datalake")
 
         RETURNS:
             DataFrame
@@ -305,6 +537,9 @@ class DataFrame():
             TeradataMlException - TDMLDF_CREATE_FAIL
 
         """
+        if schema_name:
+            return cls(in_schema(schema_name, table_name, datalake_name))
+
         return cls(table_name, index, index_label)
 
     @classmethod
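As the new branch shows, passing schema_name (and optionally datalake_name) to from_table() is shorthand for wrapping the name in in_schema; the two calls below should be equivalent (object names are placeholders):

    from teradataml import DataFrame, in_schema

    df_a = DataFrame.from_table("lake_table",
                                schema_name="datalake_database",
                                datalake_name="datalake")
    df_b = DataFrame(in_schema("datalake_database", "lake_table", "datalake"))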
@@ -364,7 +599,7 @@ class DataFrame():
         return cls(index=index, index_label=index_label, query=query, materialize=materialize)
 
     @classmethod
-    def _from_node(cls, nodeid, metaexpr, index_label=None, undropped_index=None):
+    def _from_node(cls, nodeid, metaexpr, index_label=None, undropped_index=None, reuse_metaexpr=True, **kwargs):
         """
         Private class method for creating a DataFrame from a nodeid and parent metadata.
 
@@ -385,6 +620,12 @@ class DataFrame():
                 Optional Argument.
                 List specifying index column(s) to be retained as columns for printing.
 
+            reuse_metaexpr:
+                Optional Argument.
+                Specifies the flag to decide whether to use same _MetaExpression object or not.
+                Default Value: True
+                Types: bool
+
         EXAMPLES:
             from teradataml.dataframe.dataframe import DataFrame
             df = DataFrame._from_node(1234, metaexpr)
@@ -400,30 +641,50 @@ class DataFrame():
         df = cls()
         df._nodeid = nodeid
         df._source_type = SourceType.TABLE.value
-        df._get_metadata_from_metaexpr(metaexpr)
+
+        if not reuse_metaexpr:
+            # Create new _MetaExpression object using reference metaExpression
+            # for newly created DataFrame.
+            df._metaexpr = UtilFuncs._get_metaexpr_using_parent_metaexpr(nodeid, metaexpr)
+            # When metaexpression is created using only column information from parent DataFrame,
+            # underlying SQLAlchemy table is created with '' string as Table name.
+            # Assign name from reference metaexpression here.
+            df._metaexpr.t.name = metaexpr.t.name
+            # Populate corresponding information into newly created DataFrame object
+            # using newly created metaExpression.
+            df._get_metadata_from_metaexpr(df._metaexpr)
+        else:
+            # Populate corresponding information into newly created DataFrame object
+            # using reference metaExpression.
+            df._get_metadata_from_metaexpr(metaexpr)
 
         if isinstance(index_label, str):
             index_label = [index_label]
 
-        if index_label is not None and all(elem in [col.name for col in
+        if index_label is not None and all(elem in [col.name for col in df._metaexpr.c] for elem in index_label):
             df._index_label = index_label
         elif index_label is not None and all(UtilFuncs._teradata_quote_arg(elem, "\"", False)
-                                             in [col.name for col in
+                                             in [col.name for col in df._metaexpr.c] for elem in index_label):
             df._index_label = index_label
 
         # Set the flag suggesting that the _index_label is set,
-        # and that a database lookup
+        # and that a database lookup won't be required even when it is None.
         df._index_query_required = False
 
         if isinstance(undropped_index, str):
             undropped_index = [undropped_index]
 
-        if undropped_index is not None and all(elem in [col.name for col in
+        if undropped_index is not None and all(elem in [col.name for col in df._metaexpr.c] for elem in undropped_index):
             df._undropped_index = undropped_index
         elif undropped_index is not None and all(UtilFuncs._teradata_quote_arg(elem, "\"", False)
-                                                 in [col.name for col in
+                                                 in [col.name for col in df._metaexpr.c] for elem in undropped_index):
             df._undropped_index = undropped_index
 
+        # Populate remaining attributes.
+        for arg in kwargs:
+            # Pop each argument from kwargs and assign to new DataFrame.
+            arg_value = kwargs.get(arg)
+            df.__setattr__(arg, arg_value)
         return df
 
     def create_temp_view(self, name):
@@ -551,9 +812,10 @@ class DataFrame():
         return self
 
     @collect_queryband(queryband="DF_fillna")
-    def fillna(self, value=None, columns=None, literal_value=False):
+    def fillna(self, value=None, columns=None, literal_value=False, partition_column=None):
         """
-
+        DESCRIPTION:
+            Method to replace the null values in a column with the value specified.
 
         PARAMETERS:
             value:
@@ -586,6 +848,12 @@ class DataFrame():
                 Default Value: False
                 Types: bool
 
+            partition_column:
+                Optional Argument.
+                Specifies the column name to partition the data.
+                Default Value: None
+                Types: str
+
         RETURNS:
             teradataml DataFrame
 
@@ -626,6 +894,26 @@ class DataFrame():
             3    Blue Inc   90.0   50   95.0  101.0  17/01/04
             4    Alpha Co  210.0  200  215.0  250.0  17/01/04
             5  Orange Inc  210.0   50    NaN  250.0  17/01/04
+
+            # Example 3: Populate the null value in 'pclass' and
+            #            'fare' column with mean value with partition
+            #            column as 'sex'.
+            # Load the example data.
+            >>> load_example_data("teradataml", ["titanic"])
+            >>> df = DataFrame.from_table("titanic")
+
+            >>> df.fillna(value="mean", columns=["pclass", "fare"], partition_column="sex")
+               passenger  survived  pclass                                          name     sex   age  sibsp  parch            ticket      fare cabin embarked
+            0        284         1       3                   Dorking, Mr. Edward Arthur    male  19.0      0      0        A/5. 10482    8.0500  None        S
+            1        589         0       3                        Gilinski, Mr. Eliezer    male  22.0      0      0             14973    8.0500  None        S
+            2         17         0       3                         Rice, Master. Eugene    male   2.0      4      1            382652   29.1250  None        Q
+            3        282         0       3             Olsson, Mr. Nils Johan Goransson    male  28.0      0      0            347464    7.8542  None        S
+            4        608         1       1                  Daniel, Mr. Robert Williams    male  27.0      0      0            113804   30.5000  None        S
+            5        404         0       3               Hakkarainen, Mr. Pekka Pietari    male  28.0      1      0  STON/O2. 3101279   15.8500  None        S
+            6        427         1       2  Clarke, Mrs. Charles V (Ada Maria Winfield)  female  28.0      1      0              2003   26.0000  None        S
+            7        141         0       3                Boulos, Mrs. Joseph (Sultana)  female   NaN      0      2              2678   15.2458  None        C
+            8        610         1       1                    Shutes, Miss. Elizabeth W  female  40.0      0      0          PC 17582  153.4625  C125        S
+            9        875         1       2        Abelson, Mrs. Samuel (Hannah Wizosky)  female  28.0      1      0         P/PP 3381   24.0000  None        C
         """
         from teradataml import SimpleImputeFit, SimpleImputeTransform
 
@@ -633,6 +921,7 @@ class DataFrame():
         arg_info_matrix.append(["value", value, True, (int, float, str, dict, list)])
         arg_info_matrix.append(["columns", columns, True, (list, str, tuple)])
         arg_info_matrix.append(["literal_value", literal_value, True, (bool)])
+        arg_info_matrix.append(["partition_column", partition_column, True, (str)])
 
         # Validate argument types
         _Validators._validate_function_arguments(arg_info_matrix)
@@ -704,9 +993,15 @@ class DataFrame():
                                  literals=literals,
                                  literals_columns=literals_columns,
                                  stats=stats,
-                                 stats_columns=stats_columns)
+                                 stats_columns=stats_columns,
+                                 partition_column=partition_column)
 
-
+        impute_transform = {
+            'data': self,
+            'data_partition_column': partition_column,
+            'object_partition_column': partition_column}
+
+        return fit_obj.transform(**impute_transform).result
 
     def __execute_node_and_set_table_name(self, nodeid, metaexpr=None):
         """
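fillna() now fits a SimpleImputeFit object and returns the transform result, with the partition column wired into both the input data and the fitted object. Spelled out, the equivalent explicit calls look roughly like this (a sketch based only on the arguments visible above; assumes the titanic demo table):

    from teradataml import DataFrame, SimpleImputeFit, load_example_data

    load_example_data("teradataml", ["titanic"])
    df = DataFrame("titanic")

    # Roughly what df.fillna(value="mean", columns=["pclass", "fare"],
    #                        partition_column="sex") does internally.
    fit_obj = SimpleImputeFit(data=df, stats="mean",
                              stats_columns=["pclass", "fare"],
                              partition_column="sex")
    filled = fit_obj.transform(data=df, data_partition_column="sex",
                               object_partition_column="sex").result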
@@ -789,7 +1084,10 @@ class DataFrame():
         Private method for setting _metaexpr and retrieving column names and types.
 
         PARAMETERS:
-            metaexpr
+            metaexpr:
+                Required Argument.
+                Specifies parent meta data (_MetaExpression object).
+                Types: _MetaExpression
 
         RETURNS:
             None
@@ -802,15 +1100,19 @@ class DataFrame():
         self._column_names_and_types = []
         self._td_column_names_and_types = []
         self._td_column_names_and_sqlalchemy_types = {}
-
+        self._column_types = {}
+
+        for col in self._metaexpr.c:
             if isinstance(col.type, sqlalchemy.sql.sqltypes.NullType):
                 tdtype = TeradataTypes.TD_NULL_TYPE.value
             else:
                 tdtype = "{}".format(col.type)
 
-
+            py_type = UtilFuncs._teradata_type_to_python_type(col.type)
+            self._column_names_and_types.append((str(col.name), py_type))
             self._td_column_names_and_types.append((str(col.name), tdtype))
             self._td_column_names_and_sqlalchemy_types[(str(col.name)).lower()] = col.type
+            self._column_types[(str(col.name)).lower()] = [py_type, col.type]
 
     def _get_metaexpr(self):
         """
@@ -829,7 +1131,24 @@ class DataFrame():
         meta = sqlalchemy.MetaData()
         db_schema = UtilFuncs._extract_db_name(self._table_name)
         db_table_name = UtilFuncs._extract_table_name(self._table_name)
-        t = sqlalchemy.Table(db_table_name, meta, schema=db_schema, autoload_with=eng)
+        if not self._datalake:
+            t = sqlalchemy.Table(db_table_name, meta, schema=db_schema, autoload_with=eng)
+            return _MetaExpression(t)
+
+        # Get metaexpression for datalake table.
+        # Check existence of datalake table.
+        tdmlctx.get_connection().dialect.has_table(tdmlctx.get_connection(),
+                                                   self._table,
+                                                   schema=self._database,
+                                                   table_only=True,
+                                                   datalake=self._datalake)
+
+        # Extract column names and corresponding teradatasqlalchemy types.
+        col_names, col_types = df_utils._get_datalake_table_columns_info(self._database,
+                                                                         self._table,
+                                                                         self._datalake)
+        t = sqlalchemy.Table(self._table, meta, schema=self._database,
+                             *(Column(col_name, col_type) for col_name, col_type in zip(col_names, col_types)))
         return _MetaExpression(t)
 
     def __getattr__(self, name):
@@ -2066,7 +2385,7 @@ class DataFrame():
         else:
             col_filters = col_names
 
-        col_filters_decode = ["
+        col_filters_decode = ["CASE WHEN \"{}\" IS NULL THEN 0 ELSE 1 END".format(col_name) for col_name in col_filters]
         fmt_filter = " + ".join(col_filters_decode)
 
         if thresh is not None:
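The restored line in the dropna path builds one CASE WHEN fragment per inspected column and sums them, turning "number of non-null values in this row" into a single SQL expression that the thresh check can compare against. A standalone sketch of the string construction:

    cols = ["Jan", "Feb", "Mar"]
    fragments = ['CASE WHEN "{}" IS NULL THEN 0 ELSE 1 END'.format(c) for c in cols]
    non_null_count = " + ".join(fragments)
    # Used as e.g.: WHERE <non_null_count> >= thresh
    print(non_null_count)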
@@ -2605,9 +2924,10 @@ class DataFrame():
             msg = Messages.get_message(errcode)
             raise TeradataMlException(msg, errcode)
 
+    @argument_deprecation("20.0.0.5", "include", False, None)
     @collect_queryband(queryband="DF_describe")
     def describe(self, percentiles=[.25, .5, .75], include=None, verbose=False, distinct=False, statistics=None,
-                 columns=None):
+                 columns=None, pivot=False):
         """
         DESCRIPTION:
             Generates statistics for numeric columns. This function can be used in two modes:
@@ -2639,12 +2959,12 @@ class DataFrame():
             include:
                 Optional Argument.
                 Values can be either None or "all".
-                If the value is "all",
+                If the value is "all", both numeric and non-numeric columns are included.
                 Computes count, mean, std, min, percentiles, and max for numeric columns.
                 Computes count and unique for non-numeric columns.
                 If the value is None, only numeric columns are used for collecting statistics.
                 Note:
-                    Value 'all' is not applicable for 'Time Series Aggregate Mode'.
+                    * Value 'all' is not applicable for 'Time Series Aggregate Mode'.
                 Default Values: None
                 Types: str
 
@@ -2684,7 +3004,14 @@ class DataFrame():
                 Specifies the name(s) of the columns we are collecting statistics for.
                 Default Values: None
                 Types: str or List of str
-
+
+            pivot:
+                Optional Argument.
+                Specifies a boolean value to pivot the output.
+                Note:
+                    * "pivot" is not supported for PTI tables.
+                Default Values: 'False'
+                Types: bool
 
         RETURNS:
             teradataml DataFrame
@@ -2706,7 +3033,7 @@ class DataFrame():
             Orange Inc  210.0  None  None  250  04/01/2017
 
             # Computes count, mean, std, min, percentiles, and max for numeric columns.
-            >>> df.describe()
+            >>> df.describe(pivot=True)
                     Apr     Feb     Mar    Jan
             func
             count     4       6       4      4
@@ -2718,8 +3045,45 @@ class DataFrame():
             75%     250  207.5  158.75  162.5
             max     250    210     215    200
 
+            # Computes count, mean, std, min, percentiles, and max for numeric columns with
+            # default arguments.
+            >>> df.describe()
+            ATTRIBUTE            StatName           StatValue
+                  Jan             MAXIMUM               200.0
+                  Jan  STANDARD DEVIATION   62.91528696058958
+                  Jan     PERCENTILES(25)               125.0
+                  Jan     PERCENTILES(50)               150.0
+                  Mar               COUNT                 4.0
+                  Mar             MINIMUM                95.0
+                  Mar             MAXIMUM               215.0
+                  Mar                MEAN               147.5
+                  Mar  STANDARD DEVIATION     49.749371855331
+                  Mar     PERCENTILES(25)              128.75
+                  Mar     PERCENTILES(50)               140.0
+                  Apr               COUNT                 4.0
+                  Apr             MINIMUM               101.0
+                  Apr             MAXIMUM               250.0
+                  Apr                MEAN              195.25
+                  Apr  STANDARD DEVIATION   70.97123830585646
+                  Apr     PERCENTILES(25)              160.25
+                  Apr     PERCENTILES(50)               215.0
+                  Apr     PERCENTILES(75)               250.0
+                  Feb               COUNT                 6.0
+                  Feb             MINIMUM                90.0
+                  Feb             MAXIMUM               210.0
+                  Feb                MEAN  166.66666666666666
+                  Feb  STANDARD DEVIATION  59.553897157672786
+                  Feb     PERCENTILES(25)               117.5
+                  Feb     PERCENTILES(50)               200.0
+                  Feb     PERCENTILES(75)               207.5
+                  Mar     PERCENTILES(75)              158.75
+                  Jan     PERCENTILES(75)               162.5
+                  Jan                MEAN               137.5
+                  Jan             MINIMUM                50.0
+                  Jan               COUNT                 4.0
+
             # Computes count, mean, std, min, percentiles, and max for numeric columns with 30th and 60th percentiles.
-            >>> df.describe(percentiles=[.3, .6])
+            >>> df.describe(percentiles=[.3, .6], pivot=True)
                     Apr     Feb     Mar    Jan
             func
             count     4       6       4      4
@@ -2732,7 +3096,7 @@ class DataFrame():
 
             # Computes count, mean, std, min, percentiles, and max for numeric columns group by "datetime" and "Feb".
             >>> df1 = df.groupby(["datetime", "Feb"])
-            >>> df1.describe()
+            >>> df1.describe(pivot=True)
                                     Jan   Mar   Apr
             datetime   Feb    func
             04/01/2017 90.0   25%    50    95   101
@@ -2760,22 +3124,6 @@ class DataFrame():
                               min     200   215   250
                               std    None  None     0
 
-            # Computes count, mean, std, min, percentiles, and max for numeric columns and
-            # computes count and unique for non-numeric columns
-            >>> df.describe(include="all")
-                accounts      Feb     Jan     Mar     Apr datetime
-            func
-            25%     None    117.5     125  128.75  160.25     None
-            75%     None    207.5   162.5  158.75     250     None
-            count      6        6       4       4       4        6
-            mean    None  166.667   137.5   147.5  195.25     None
-            max     None      210     200     215     250     None
-            min     None       90      50      95     101     None
-            50%     None      200     150     140     215     None
-            std     None   59.554  62.915  49.749  70.971     None
-            unique     6     None    None    None    None        1
-
-            #
             # Examples for describe() function as Time Series Aggregate.
             #
             >>> # Load the example datasets.
@@ -2958,7 +3306,7 @@ class DataFrame():
             >>>
         """
 
-        # Argument validations
+        # -------------Argument validations---------------#
         awu_matrix = []
         awu_matrix.append(["columns", columns, True, (str, list), True])
         awu_matrix.append(["percentiles", percentiles, True, (float, list)])
@@ -2967,6 +3315,7 @@ class DataFrame():
         awu_matrix.append(["distinct", distinct, True, (bool)])
         awu_matrix.append(["statistics", statistics, True, (str, list), True,
                            ["count", "mean", "min", "max", "unique", "std", "describe", "percentile"]])
+        awu_matrix.append(["pivot", pivot, True, (bool)])
 
         # Validate argument types
         _Validators._validate_function_arguments(awu_matrix)
@@ -3010,22 +3359,27 @@ class DataFrame():
         if verbose and not isinstance(self, DataFrameGroupByTime):
             raise ValueError(Messages.get_message(MessageCodes.ARG_VALUE_CLASS_DEPENDENCY).format(
                 'verbose', 'Aggregation', 'True', 'describe()', 'DataFrameGroupByTime'))
+        # -------------End of argument validations---------------#
 
         function_label = "func"
+        sort_cols = []
        try:
             self.__execute_node_and_set_table_name(self._nodeid)
 
             groupby_column_list = None
-            if isinstance(self, DataFrameGroupBy):
+            if isinstance(self, DataFrameGroupByTime) or isinstance(self, DataFrameGroupBy):
                 groupby_column_list = self.groupby_column_list
-
-
+                if columns:
+                    df_utils._validate_describe_columns(columns=columns, metaexpr=self._metaexpr,
+                                                        groupby_column_list=groupby_column_list)
+                sort_cols = list(groupby_column_list)
 
-
-
-            df_utils._invalid_describe_column(df=self, columns=columns, metaexpr=self._metaexpr,
-                                              groupby_column_list=groupby_column_list)
+                # 'func' column will be always there in result.
+                sort_cols.append(function_label)
 
+            # Handle DataFrameGroupByTime using union all approach and
+            # other DataFrames using TD_UnivariateStatistics approach.
+            if isinstance(self, DataFrameGroupByTime):
                 # Construct the aggregate query.
                 agg_query = df_utils._construct_describe_query(df=self, columns=columns, metaexpr=self._metaexpr,
                                                                percentiles=percentiles, function_label=function_label,
@@ -3037,29 +3391,99 @@ class DataFrame():
                                                                timecode_column=self._timecode_column,
                                                                sequence_column=self._sequence_column,
                                                                fill=self._fill)
+
+                if groupby_column_list is not None:
+                    df = DataFrame.from_query(agg_query, index_label=sort_cols)
+                    df2 = df.sort(sort_cols)
+                    df2._metaexpr._n_rows = 100
+                    describe_df = df2
+                else:
+                    describe_df = DataFrame.from_query(agg_query, index_label=function_label)
+
+                # Check if numeric overflow can occur for result DataFrame.
+                if self._check_numeric_overflow(describe_df):
+                    result_df = self._promote_dataframe_types()
+                    describe_df = result_df.describe(pivot=True)
+                return describe_df
+
             else:
-                #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                # If pivot is True, then construct the aggregate query and return the result DataFrame.
+                # Otherwise, return the result DataFrame in the regular aggregate mode using UnivariateStatistics.
+
+                if pivot:
+                    # Construct the aggregate query.
+                    agg_query = df_utils._construct_describe_query(df=self, columns=columns, metaexpr=self._metaexpr,
+                                                                   percentiles=percentiles, function_label=function_label,
+                                                                   groupby_column_list=groupby_column_list, include=include,
+                                                                   is_time_series_aggregate=False, verbose=verbose,
+                                                                   distinct=distinct, statistics=statistics)
+
+                    if groupby_column_list is not None:
+                        sort_cols = [i for i in groupby_column_list]
+                        sort_cols.append(function_label)
+                        df = DataFrame.from_query(agg_query, index_label=sort_cols)
+                        df2 = df.sort(sort_cols)
+                        df2._metaexpr._n_rows = 100
+                        describe_df = df2
+                    else:
+                        describe_df = DataFrame.from_query(agg_query, index_label=function_label)
+
+                    # Check if numeric overflow can occur for result DataFrame.
+                    if self._check_numeric_overflow(describe_df):
+                        result_df = self._promote_dataframe_types()
+                        describe_df = result_df.describe(pivot=True)
+
+                    return describe_df
+
+                # If columns is None, then all dataframe columns are considered.
+                if columns is None:
+                    columns = self.columns
+                # Exclude groupby columns
+                if groupby_column_list is not None:
+                    columns = [col for col in columns if col not in groupby_column_list]
+
+                numeric_cols = []
+
+                # Extract numeric columns and their types of all columns
+                for col in self._metaexpr.c:
+                    if type(col.type) in UtilFuncs()._get_numeric_datatypes() and \
+                            col.name in columns:
+                        numeric_cols.append(col.name)
+
+                if numeric_cols:
+                    # Default statistics for 'Regular Aggregate Mode'
+                    sql_stat = ["COUNT", "MAXIMUM", "MEAN", "MINIMUM", "PERCENTILES", "STANDARD DEVIATION"]
+
+                    if statistics is not None:
+                        py_to_sql_func_map = {"count": "COUNT",
+                                              "max": "MAXIMUM",
+                                              "mean": "MEAN",
+                                              "unique": 'UNIQUE ENTITY COUNT',
+                                              "min": "MINIMUM",
+                                              "percentile": "PERCENTILES",
+                                              "std": "STANDARD DEVIATION"}
+                        # Convert statistics into corresponding SQL function names
+                        sql_stat = [py_to_sql_func_map[stat] for stat in UtilFuncs()._as_list(statistics)]
+
+                    # Convert percentiles to centiles for univariate statistics
+                    centiles = list(map(lambda n: int(n * 100), percentiles))
+
+                    # UnivariateStatistics parameters
+                    univar_param = {
+                        "newdata": self.select(self.columns),
+                        "target_columns": numeric_cols,
+                        "partition_columns": groupby_column_list,
+                        "centiles": centiles,
+                        "stats": sql_stat
+                    }
+
+                    from teradataml import UnivariateStatistics
+                    # Run UnivariateStatistics
+                    aggr_df = UnivariateStatistics(**univar_param).result
+
+                    # Return the result in teradataml format
+                    return aggr_df
 
-            # Check if numeric overflow can occur for result DataFrame.
-            if self._check_numeric_overflow(describe_df):
-                result_df = self._promote_dataframe_types()
-                describe_df = result_df.describe()
-            return describe_df
         except TeradataMlException:
             raise
         except Exception as err:
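Net effect of the describe() changes above: pivot=True keeps the old wide layout (one column per attribute, built from the union-all aggregate query), while the new default returns a long-format ATTRIBUTE/StatName/StatValue result computed through UnivariateStatistics; grouped-by-time frames always take the query path. A usage sketch against the sales demo table:

    from teradataml import DataFrame, load_example_data

    load_example_data("dataframe", "sales")
    df = DataFrame("sales")

    long_stats = df.describe()            # ATTRIBUTE / StatName / StatValue rows
    wide_stats = df.describe(pivot=True)  # one row per statistic, one column per attribute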
@@ -5555,7 +5979,7 @@ class DataFrame():
         try:
             # Printing the DF will actually run underlying select query and
             # will brought up numeric overflow if any. Only materializing won't work.
-
+            repr(result_df)
             return False
         except TeradataMlException as tme:
             if "Numeric overflow occurred during computation" in str(tme):
@@ -5642,7 +6066,35 @@ class DataFrame():
 
     def _repr_html_(self):
         """ Print method for teradataml for iPython rich display. """
+        self._generate_output_html()
+        if display.enable_ui:
+            # EDA Ui widget representation using teradatamlwidgets
+            if self._eda_ui is None:
+                from teradatamlwidgets.eda.Ui import Ui
+                self._eda_ui = Ui(df=self, html=self.html)
+            else:
+                self._eda_ui.display_ui()
+        return self.html
+
+    def get_eda_ui(self):
+        """
+        Returns the EDA representation UI.
+
+        PARAMETERS:
+            None.
+
+        EXCEPTIONS:
+            None.
+
+        RETURNS:
+            teradatamlwidgets.eda.Ui
+
+        EXAMPLE:
+            df = ui.get_eda_ui()
+        """
+        return self._eda_ui
 
+    def _generate_output_html(self, disable_types=True):
         # Check if class attributes __data and __data_columns are not None.
         # If not None, reuse the data and columns.
         # If None, generate latest results.
@@ -5655,17 +6107,25 @@ class DataFrame():
         dindent = indent + indent
 
         header_html = ['<style type="text/css">',
-                       'table {border:ridge 5px
+                       'table { border:ridge 5px}',
                        'table td {border:inset 1px;}',
-                       'table tr#HeaderRow {background-color:grey; color:white;}'
+                       'table tr#HeaderRow {background-color:grey; color:white;}',
                        '</style>\n'
                        ]
         html = "\n{0}".format(indent).join(header_html)
-        html += '<html><table>\n{0}<tr id="HeaderRow">\n'.format(indent)
+        html += '<html><table style="min-width:1000px;">\n{0}<tr id="HeaderRow">\n'.format(indent)
 
-        columns_html = "</th
-        html += "
-        html += "
+        columns_html = "</th><th>".join(self.__data_columns)
+        html += "<th>{0}</th>\n".format(columns_html)
+        html += "</tr>\n"
+
+        if not disable_types:
+            html += '<tr>\n'.format(indent)
+            col_types = [repr(self._td_column_names_and_sqlalchemy_types[column]) for column in
+                         self.__data_columns]
+            columns_types_html = "</td>\n{0}<td>".format(dindent).join(col_types)
+            html += "{0}<td>{1}</td>\n".format(dindent, columns_types_html)
+            html += "{0}</tr>\n".format(indent)
 
         for row in self.__data:
             row_html = ["{0}<td>{1}</td>\n".format(dindent,
@@ -5673,8 +6133,31 @@ class DataFrame():
             html += "{1}<tr>\n{0}{1}</tr>\n".format("".join(row_html), indent)
 
         html += "</table></html>"
+        self.html = html
+
+    def get_output(self, output_index=0):
+        """
+        DESCRIPTION:
+            Returns the result of analytic function when analytic function is
+            run from 'Analyze' tab in EDA UI.
+            Note:
+                * The function does not return anything if analytic function is
+                  not run from EDA UI.
 
-        return html
+        PARAMETERS:
+            output_index:
+                Optional Argument.
+                Specifies the index of the output dataframe to be returned.
+                Default Value: 0
+                Types: int
+
+        RAISES:
+            IndexError
+
+        RETURNS:
+            teradataml DataFrame object.
+        """
+        return self._eda_ui.get_output_dataframe(output_index=output_index)
 
     def __get_data_columns(self):
         """
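The EDA hooks above only activate when the widget front end is enabled: _repr_html_ builds self.html, hands it to a teradatamlwidgets Ui cached on the frame, and get_output() later fetches results of functions launched from that UI. A hedged sketch of the intended notebook flow (requires the optional teradatamlwidgets package; get_output() is only meaningful after running something from the UI's 'Analyze' tab):

    from teradataml import DataFrame, display

    display.enable_ui = True      # opt in to the widget-based rich display
    df = DataFrame("titanic")
    df                            # in a notebook cell, renders the EDA UI

    # After an analytic function has been run from the 'Analyze' tab:
    result = df.get_output(output_index=0)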
@@ -6019,6 +6502,8 @@ class DataFrame():
                    * "open_sessions" specifies the number of Teradata data transfer
                      sessions to be opened for fastexport. This argument is only applicable
                      in fastexport mode.
+                   * Function returns the pandas dataframe with Decimal columns types as float instead of object.
+                     If user wants datatype to be object, set argument "coerce_float" to False.
 
             Notes:
                 1. For additional information about "coerce_float" and
@@ -6334,15 +6819,22 @@ class DataFrame():
             Supported join operators are =, ==, <, <=, >, >=, <> and != (= and <> operators are
             not supported when using DataFrame columns as operands).
 
-
-            1. When multiple join conditions are given
-
-            2.
-
-
+            Notes:
+                1. When multiple join conditions are given as a list string/ColumnExpression,
+                   they are joined using AND operator.
+                2. Two or more on conditions can be combined using & and | operators
+                   and can be passed as single ColumnExpression.
+                   You can use (df1.a == df1.b) & (df1.c == df1.d) in place of
+                   [df1.a == df1.b, df1.c == df1.d].
+                3. Two or more on conditions can not be combined using pythonic 'and'
+                   and 'or'.
+                   You can use (df1.a == df1.b) & (df1.c == df1.d) in place of
+                   [df1.a == df1.b and df1.c == df1.d].
+                4. Performing self join using same DataFrame object in 'other'
+                   argument is not supported. In order to perform self join,
+                   first create aliased DataFrame using alias() API and pass it
+                   for 'other' argument. Refer to Example 10 in EXAMPLES section.
 
-            You can use [df1.a == df1.b, df1.c == df1.d] in place of
-            [(df1.a == df1.b) & (df1.c == df1.d)].
 
         PARAMETERS:
 
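The rewritten notes pin down how join conditions combine: list elements are ANDed, & and | build a single compound ColumnExpression, and Python's and/or must not be used. A compact sketch (df1 and df2 stand for any two teradataml DataFrames sharing columns a and b):

    # List elements are ANDed together:
    j1 = df1.join(df2, on=[df1.a == df2.a, df1.b == df2.b], how="inner",
                  lprefix="l", rprefix="r")

    # Equivalent single ColumnExpression using &; | would give OR instead:
    j2 = df1.join(df2, on=(df1.a == df2.a) & (df1.b == df2.b), how="inner",
                  lprefix="l", rprefix="r")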
@@ -6370,15 +6862,20 @@ class DataFrame():
|
|
|
6370
6862
|
is the column of left dataframe df1 and col2 is the column of right
|
|
6371
6863
|
dataframe df2.
|
|
6372
6864
|
Examples:
|
|
6373
|
-
1. [df1.a == df2.a, df1.b == df2.b] indicates df1.a = df2.a
|
|
6374
|
-
2. [df1.a == df2.b, df1.c == df2.d] indicates df1.a = df2.b
|
|
6375
|
-
3. [df1.a <= df2.b
|
|
6376
|
-
4. [df1.a < df2.b
|
|
6865
|
+
1. [df1.a == df2.a, df1.b == df2.b] indicates df1.a = df2.a AND df1.b = df2.b.
|
|
6866
|
+
2. [df1.a == df2.b, df1.c == df2.d] indicates df1.a = df2.b AND df1.c = df2.d.
|
|
6867
|
+
3. [df1.a <= df2.b & df1.c > df2.d] indicates df1.a <= df2.b AND df1.c > df2.d.
|
|
6868
|
+
4. [df1.a < df2.b | df1.c >= df2.d] indicates df1.a < df2.b OR df1.c >= df2.d.
|
|
6377
6869
|
5. df1.a != df2.b indicates df1.a != df2.b.
|
|
6378
6870
|
• The combination of both string comparisons and comparisons as column expressions.
|
|
6379
6871
|
Examples:
|
|
6380
|
-
1. ["a", df1.b == df2.b] indicates df1.a = df2.a
|
|
6381
|
-
2. [df1.a <= df2.b, "c > d"] indicates df1.a <= df2.b
|
|
6872
|
+
1. ["a", df1.b == df2.b] indicates df1.a = df2.a AND df1.b = df2.b.
|
|
6873
|
+
2. [df1.a <= df2.b, "c > d"] indicates df1.a <= df2.b AND df1.c > df2.d.
|
|
6874
|
+
• ColumnExpressions containing FunctionExpressions which represent SQL functions
|
|
6875
|
+
invoked on DataFrame Columns.
|
|
6876
|
+
Examples:
|
|
6877
|
+
1. (df1.a.round(1) - df2.a.round(1)).mod(2.5) > 2
|
|
6878
|
+
2. df1.a.floor() - df2.b.floor() > 2
|
|
6382
6879
|
|
|
6383
6880
|
Types: str (or) ColumnExpression (or) List of strings(str) or ColumnExpressions
|
|
6384
6881
|
|
|
@@ -6400,7 +6897,7 @@ class DataFrame():
|
|
|
6400
6897
|
Specifies the suffix to be added to the right table columns.
|
|
6401
6898
|
Default Value: None.
|
|
6402
6899
|
Types: str
|
|
6403
|
-
|
|
6900
|
+
|
|
6404
6901
|
lprefix:
|
|
6405
6902
|
Optional Argument.
|
|
6406
6903
|
Specifies the prefix to be added to the left table columns.
|
|
@@ -6450,7 +6947,7 @@ class DataFrame():
             0 2 2 analytics 2.3 2.3 b analytics b
             1 1 1 teradata 1.3 1.3 a teradata a
 
-            # Example 2: One "on" argument condition is ColumnExpression and other is string having two
+            # Example 2: One "on" argument condition is ColumnExpression and other is string having two
             # columns with left outer join.
             >>> df1.join(df2, on = [df1.col2 == df2.col4,"col5 = col7"], how = "left", lprefix = "t1", rprefix = "t2")
             t1_col1 t2_col1 col2 t1_col3 t2_col3 col5 col4 col7
@@ -6464,7 +6961,7 @@ class DataFrame():
             0 2 2 analytics 2.3 2.3 b analytics b
             1 1 1 teradata 1.3 1.3 a teradata a
 
-            # Example 4: One "on" argument condition is ColumnExpression and other is string having two
+            # Example 4: One "on" argument condition is ColumnExpression and other is string having two
             # columns with full join.
             >>> df1.join(other = df2, on = ["col2=col4",df1.col5 == df2.col7], how = "full", lprefix = "t1", rprefix = "t2")
             t1_col1 t2_col1 col2 t1_col3 t2_col3 col5 col4 col7
@@ -6542,7 +7039,53 @@ class DataFrame():
             3 Beginner Beginner 1 3.95 Beginner 3.70 Novice 0 1 no yes
             3 Beginner Beginner 2 3.76 Beginner 3.70 Novice 0 1 no yes
             3 Beginner Novice 3 3.70 Beginner 3.70 Novice 1 1 no no
+
+            # Example 10: Perform self join using aliased DataFrame.
+            # Create an aliased DataFrame.
+            >>> lhs = DataFrame("admissions_train").head(3).sort("id")
+            >>> rhs = lhs.alias("rhs")
+            # Use aliased DataFrame for self join.
+            >>> joined_df = lhs.join(other=rhs, how="cross", lprefix="l", rprefix="r")
+            >>> joined_df
+            l_id r_id l_masters r_masters l_gpa r_gpa l_stats r_stats l_programming r_programming l_admitted r_admitted
+            0 1 3 yes no 3.95 3.70 Beginner Novice Beginner Beginner 0 1
+            1 2 2 yes yes 3.76 3.76 Beginner Beginner Beginner Beginner 0 0
+            2 2 3 yes no 3.76 3.70 Beginner Novice Beginner Beginner 0 1
+            3 3 1 no yes 3.70 3.95 Novice Beginner Beginner Beginner 1 0
+            4 3 3 no no 3.70 3.70 Novice Novice Beginner Beginner 1 1
+            5 3 2 no yes 3.70 3.76 Novice Beginner Beginner Beginner 1 0
+            6 2 1 yes yes 3.76 3.95 Beginner Beginner Beginner Beginner 0 0
+            7 1 2 yes yes 3.95 3.76 Beginner Beginner Beginner Beginner 0 0
+            8 1 1 yes yes 3.95 3.95 Beginner Beginner Beginner Beginner 0 0
+
+            # Example 11: Perform join with compound 'on' condition having
+            # more than one binary operator.
+            >>> rhs_2 = lhs.assign(double_gpa=lhs.gpa * 2)
+            >>> joined_df_2 = lhs.join(rhs_2, on=rhs_2.double_gpa == lhs.gpa * 2, how="left", lprefix="l", rprefix="r")
+            >>> joined_df_2
+            l_id r_id l_masters r_masters l_gpa r_gpa l_stats r_stats l_programming r_programming l_admitted r_admitted double_gpa
+            0 3 3 no no 3.70 3.70 Novice Novice Beginner Beginner 1 1 7.40
+            1 2 2 yes yes 3.76 3.76 Beginner Beginner Beginner Beginner 0 0 7.52
+            2 1 1 yes yes 3.95 3.95 Beginner Beginner Beginner Beginner 0 0 7.90
+
+            # Example 12: Perform join on DataFrames with 'on' condition
+            # having FunctionExpression.
+            >>> df = DataFrame("admissions_train")
+            >>> df2 = df.alias("rhs_df")
+            >>> joined_df_3 = df.join(df2, on=(df.gpa.round(1) - df2.gpa.round(1)).mod(2.5) > 2,
+            ...                       how="inner", lprefix="l")
+            >>> joined_df_3.sort(["id", "l_id"])
+            l_id id l_masters masters l_gpa gpa l_stats stats l_programming programming l_admitted admitted
+            0 1 24 yes no 3.95 1.87 Beginner Advanced Beginner Novice 0 1
+            1 13 24 no no 4.0 1.87 Advanced Advanced Novice Novice 1 1
+            2 15 24 yes no 4.0 1.87 Advanced Advanced Advanced Novice 1 1
+            3 25 24 no no 3.96 1.87 Advanced Advanced Advanced Novice 1 1
+            4 27 24 yes no 3.96 1.87 Advanced Advanced Advanced Novice 0 1
+            5 29 24 yes no 4.0 1.87 Novice Advanced Beginner Novice 0 1
+            6 40 24 yes no 3.95 1.87 Novice Advanced Beginner Novice 0 1
+
         """
+
         # Argument validations
         awu_matrix = []
         awu_matrix.append(["other", other, False, (DataFrame)])
@@ -6556,17 +7099,11 @@ class DataFrame():
         # Validate argument types
         _Validators._validate_function_arguments(awu_matrix)
 
-        # If
-        #
-
-
-
-            raffix = rsuffix
-            affix_type = "suffix"
-        else:
-            laffix = lprefix
-            raffix = rprefix
-            affix_type = "prefix"
+        # If self and other DataFrames are pointing to same Table object,
+        # raise error.
+        if self._metaexpr.t is other._metaexpr.t:
+            raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_ALIAS_REQUIRED, "join"),
+                                      MessageCodes.TDMLDF_ALIAS_REQUIRED)
 
         how_lc = how.lower()
 
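The guard added in this hunk is an identity test, not an equality test: two DataFrame objects that merely read the same table compare fine, but an object sharing the very same underlying table object must be rejected. A standalone sketch of the distinction, with hypothetical stand-in classes rather than the teradataml internals:

    # 'is' compares object identity; alias() must therefore produce a frame
    # wrapping a distinct table object for the check above to let a join pass.
    class Table:
        def __init__(self, name):
            self.name = name

    class Frame:
        def __init__(self, table):
            self.t = table                        # stands in for _metaexpr.t

        def alias(self, name):
            return Frame(Table(self.t.name))      # new wrapper object

    lhs = Frame(Table("admissions_train"))
    assert lhs.t is lhs.t                         # same object: join() would raise
    assert lhs.t is not lhs.alias("rhs").t        # aliased copy: join() proceeds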
@@ -6584,12 +7121,33 @@ class DataFrame():
         for col in other.columns:
             other_columns_lower_actual_map[col.lower()] = col
 
-
-
-
-
-
-
+        # Set the affix variables (laffix and raffix) with provided value(s)
+        # of lsuffix, rsuffix, lprefix and rprefix.
+        # Also set affix_type appropriately.
+        laffix = None
+        raffix = None
+        affix_type = None
+        if lsuffix is not None or rsuffix is not None:
+            laffix = lsuffix
+            raffix = rsuffix
+            affix_type = "suffix"
+        elif lprefix is not None or rprefix is not None:
+            laffix = lprefix
+            raffix = rprefix
+            affix_type = "prefix"
+
+        # Same column names can be present in two dataframes involved
+        # in join operation in below two cases:
+        # Case 1: Self join.
+        # Case 2: Two tables having common column names.
+        # In any case, at least one kind of affix is required to generate
+        # distinct column names in resultant table. Throw error if no affix
+        # is available.
+        if not set(self_columns_lower_actual_map.keys()).isdisjoint(other_columns_lower_actual_map.keys()):
+            if affix_type is None:
+                raise TeradataMlException(
+                    Messages.get_message(MessageCodes.TDMLDF_REQUIRED_TABLE_ALIAS),
+                    MessageCodes.TDMLDF_REQUIRED_TABLE_ALIAS)
 
         # Both affixes should not be equal to perform join.
         if laffix == raffix and laffix is not None:
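The precedence encoded above, where suffixes win when both suffixes and prefixes are supplied and overlapping column names with no affix at all are an error, can be exercised in isolation. A minimal sketch with hypothetical helper names, not the library code itself:

    def pick_affixes(lsuffix=None, rsuffix=None, lprefix=None, rprefix=None):
        # Suffix arguments take precedence over prefix arguments.
        if lsuffix is not None or rsuffix is not None:
            return lsuffix, rsuffix, "suffix"
        if lprefix is not None or rprefix is not None:
            return lprefix, rprefix, "prefix"
        return None, None, None

    def check_overlap(left_cols, right_cols, affix_type):
        # Column-name comparison is case-insensitive, as in the hunk above.
        overlap = {c.lower() for c in left_cols} & {c.lower() for c in right_cols}
        if overlap and affix_type is None:
            raise ValueError("overlapping columns %s need a prefix or suffix"
                             % sorted(overlap))

    laffix, raffix, affix_type = pick_affixes(lprefix="t1", rprefix="t2")
    check_overlap(["id", "gpa"], ["ID", "city"], affix_type)   # ok: affix present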
@@ -6598,115 +7156,159 @@ class DataFrame():
                 "'l{affix_type}' and 'r{affix_type}'".format(affix_type=affix_type)),
                 MessageCodes.TDMLDF_INVALID_TABLE_ALIAS)
 
-        [... removed lines of the earlier implementation; content not preserved in this rendering ...]
-        #
-        [... removed lines of the earlier implementation; content not preserved in this rendering ...]
-                                self_columns_lower_actual_map.keys(),
-                                "left", affix_type)
-            select_columns.append("{0} as {1}".format(
-                self.__get_fully_qualified_col_name(column, "df2" if raffix is None else raffix),
-                df2_column_with_affix))
-
-            # As we are creating new column name, adding it to new metadata dict for new dataframe from join.
-            self.__add_column_type_item_to_dict(new_metaexpr_columns_types,
-                                                UtilFuncs._teradata_unquote_arg(df1_column_with_affix, "\""),
-                                                column, df1_columns_types)
-
-            self.__add_column_type_item_to_dict(new_metaexpr_columns_types,
-                                                UtilFuncs._teradata_unquote_arg(df2_column_with_affix, "\""),
-                                                other_column, df2_columns_types)
-
+        try:
+            # Set an attribute named '_join_alias' to underlying SQLAlchemy table objects
+            # and use it as default alias for compiling.
+            setattr(self._metaexpr.t, "_join_alias", "lhs")
+            setattr(other._metaexpr.t, "_join_alias", "rhs")
+            lhs_alias = "lhs"
+            rhs_alias = "rhs"
+
+            # Step 1: Generate the on clause string.
+            if how_lc != "cross":
+                on = UtilFuncs._as_list(on)
+
+                all_join_conditions = []
+                invalid_join_conditions = []
+                # Forming join condition
+                for condition in on:
+                    # Process only when the on condition is either a string or a ColumnExpression.
+                    if not isinstance(condition, (ColumnExpression, str)):
+                        invalid_join_conditions.append(condition)
+                        continue
+
+                    # Generate final on clause string from string representation of condition.
+                    if isinstance(condition, str):
+                        # Process the string manually.
+                        # 1. Parse the string to get operator.
+                        for op in TeradataConstants.TERADATA_JOIN_OPERATORS.value:
+                            if op in condition:
+                                conditional_separator = op
+                                break
+                        else:
+                            # If no join condition is mentioned, then string represents the column.
+                            # In this case, default operator is taken as equal.
+                            # If on is ['a'], then it is equal to 'lhs.a = rhs.a'
+                            columns = [condition, condition]
+                            condition = "{0} = {0}".format(condition)
+                            conditional_separator = "="
+                        # 2. Split the string using operator and extract LHS and RHS
+                        # columns from a binary expression.
+                        columns = [column.strip() for column in condition.split(sep=conditional_separator)
+                                   if len(column) > 0]
+
+                        if len(columns) != 2:
+                            invalid_join_conditions.append(condition)
+                            # TODO: Raise exception here only.
+                        else:
+                            # 3. Generate fully qualified names using affix and table alias
+                            # and create final on clause condition string.
+                            left_col = self.__add_alias_to_column(columns[0], self, lhs_alias)
+                            right_col = self.__add_alias_to_column(columns[1], other, rhs_alias)
+                            if conditional_separator == "!=":
+                                # "!=" is python way of expressing 'not equal to'. "<>" is Teradata way of
+                                # expressing 'not equal to'. Adding support for "!=".
+                                conditional_separator = "<>"
+                            all_join_conditions.append(
+                                '{0} {1} {2}'.format(left_col, conditional_separator, right_col))
+
+                    # Generate on clause string from column expression.
+                    if isinstance(condition, ColumnExpression):
+                        compiled_condition = condition.compile(compile_kwargs={'include_table': True,
+                                                                               'literal_binds': True,
+                                                                               'table_name_kind': '_join_alias',
+                                                                               'compile_with_caller_table': True,
+                                                                               'table_only': True})
+
+                        all_join_conditions.append(compiled_condition)
+
+                # Raise error if invalid on conditions are passed.
+                if len(invalid_join_conditions) > 0:
+                    raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_INVALID_JOIN_CONDITION,
+                                                                   ", ".join(invalid_join_conditions)),
+                                              MessageCodes.TDMLDF_INVALID_JOIN_CONDITION)
+
+                # Generate final on condition.
+                join_condition = " and ".join(all_join_conditions)
             else:
-            #
-
-                select_columns.append(UtilFuncs._teradata_quote_arg(column, "\"", False))
+                # In case of cross join no need of condition.
+                join_condition = ""
 
-
-
-
-
-
+            # Step 2: Generate the select clause string.
+            # Generate new column names for overlapping column names using lsuffix, rsuffix, lprefix, rprefix.
+            # Also, use table alias while addressing overlapping column names.
+            lhs_columns_types = df_utils._get_required_columns_types_from_metaexpr(self._metaexpr)
+            rhs_columns_types = df_utils._get_required_columns_types_from_metaexpr(other._metaexpr)
 
-
-
-                join_condition, "df1" if laffix is None else laffix,
-                "df2" if raffix is None else raffix)
+            select_columns = []
+            new_metaexpr_columns_types = OrderedDict()
 
-
-
+            # Processing columns in LHS DF/ self DF.
+            for column in self.columns:
+                if df_utils._check_column_exists(column.lower(), other_columns_lower_actual_map.keys()):
+                    # Check if column found in other DataFrame has same case or different.
+                    # Return the column name from the other DataFrame.
+                    other_column = other_columns_lower_actual_map[column.lower()]
+
+                    # Check if column name in LHS dataframe is same as that of in RHS dataframe.
+                    # If so, generate new name for LHS DF column using provided affix.
+                    df1_column_with_affix = self.__check_and_return_new_column_name(laffix, other_column,
+                                                                                    other_columns_lower_actual_map.keys(),
+                                                                                    "right", affix_type)
+
+                    # Generate select clause string for current column and append to list.
+                    select_columns.append("{0} as {1}".format(
+                        self.__get_fully_qualified_col_name(other_column, lhs_alias),
+                        df1_column_with_affix))
+
+                    # Check if column name in RHS dataframe is same as that of in LHS dataframe.
+                    # If so, generate new name for RHS DF column using provided affix.
+                    df2_column_with_affix = self.__check_and_return_new_column_name(raffix, column,
+                                                                                    self_columns_lower_actual_map.keys(),
+                                                                                    "left", affix_type)
+                    # Generate select clause string for current column and append to list.
+                    select_columns.append("{0} as {1}".format(
+                        self.__get_fully_qualified_col_name(column, rhs_alias),
+                        df2_column_with_affix))
+
+                    # As we are creating new column name, adding it to new metadata dict for new dataframe from join.
+                    self.__add_column_type_item_to_dict(new_metaexpr_columns_types,
+                                                        UtilFuncs._teradata_unquote_arg(df1_column_with_affix, "\""),
+                                                        column, lhs_columns_types)
+
+                    self.__add_column_type_item_to_dict(new_metaexpr_columns_types,
+                                                        UtilFuncs._teradata_unquote_arg(df2_column_with_affix, "\""),
+                                                        other_column, rhs_columns_types)
 
-
+                else:
+                    # As column with same name is not present in RHS DataFrame now,
+                    # directly adding column to new metadata dict.
+                    self.__add_column_type_item_to_dict(new_metaexpr_columns_types, column, column, lhs_columns_types)
+                    select_columns.append(UtilFuncs._teradata_quote_arg(column, "\"", False))
+
+            # Processing columns in RHS DF/ other DF.
+            # Here we will only be processing columns which are not overlapping.
+            for column in other.columns:
+                if not df_utils._check_column_exists(column.lower(), self_columns_lower_actual_map.keys()):
+                    # As column not present in left DataFrame, directly adding column to new metadata dict.
+                    self.__add_column_type_item_to_dict(new_metaexpr_columns_types, column, column, rhs_columns_types)
+                    select_columns.append(UtilFuncs._teradata_quote_arg(column, "\"", False))
+
+            # Step 3: Create a node in AED using _aed_join using appropriate alias for involved tables.
+            join_node_id = self._aed_utils._aed_join(self._nodeid, other._nodeid, ", ".join(select_columns),
+                                                     how_lc, join_condition, lhs_alias, rhs_alias)
+
+            # Step 4: Constructing new Metadata (_metaexpr) without DB; using dummy select_nodeid
+            # and underlying table name.
+            new_metaexpr = UtilFuncs._get_metaexpr_using_columns(join_node_id, new_metaexpr_columns_types.items())
+
+            # Return a new joined dataframe.
+            return self._create_dataframe_from_node(join_node_id, new_metaexpr, self._index_label)
+        finally:
+            # Delete the '_join_alias' attribute attached to underlying
+            # SQLALchemy table objects.
+            delattr(self._metaexpr.t, "_join_alias")
+            delattr(other._metaexpr.t, "_join_alias")
 
     def __add_alias_to_column(self, column, df, alias):
         """
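Step 1 above leans on Python's for/else idiom: the else branch runs only when the loop finished without a break, which is exactly the "no operator found, treat the string as a bare column name" case. A runnable sketch of that parsing, with an illustrative operator list rather than TERADATA_JOIN_OPERATORS itself (note the multi-character operators must be checked before "=", "<", ">"):

    OPERATORS = ["<=", ">=", "<>", "!=", "=", "<", ">"]

    def parse_condition(condition):
        for op in OPERATORS:
            if op in condition:
                separator = op
                break
        else:
            # Bare column name: 'a' means 'lhs.a = rhs.a'.
            condition = "{0} = {0}".format(condition)
            separator = "="
        columns = [c.strip() for c in condition.split(separator) if c.strip()]
        if separator == "!=":
            separator = "<>"          # SQL spelling of 'not equal to'
        return columns, separator

    print(parse_condition("a"))            # (['a', 'a'], '=')
    print(parse_condition("col2=col4"))    # (['col2', 'col4'], '=')
    print(parse_condition("a != b"))       # (['a', 'b'], '<>')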
@@ -6766,7 +7368,7 @@ class DataFrame():
         return "{0}.{1}".format(UtilFuncs._teradata_quote_arg(alias, "\"", False),
                                 UtilFuncs._teradata_quote_arg(column, "\"", False))
 
-    def __check_and_return_new_column_name(self, affix, column, col_list,
+    def __check_and_return_new_column_name(self, affix, column, col_list, other_df_side, affix_type):
         """
         Check new column name alias with column exists in col_list or not, if exists throws exception else
         returns new column name.
@@ -6775,7 +7377,7 @@ class DataFrame():
             affix - affix to be added to column.
             column - column name.
             col_list - list of columns to check in which new column is exists or not.
-
+            other_df_side - Side on which the other dataframe in current join operation resides.
             affix_type - Type of affix. Either "prefix" or "suffix".
 
         EXAMPLES:
@@ -6789,19 +7391,19 @@ class DataFrame():
             return UtilFuncs._teradata_quote_arg(column, "\"", False)
 
         # If Prefix, affix is added before column name else it is appended.
-
-
-
-        if df_utils._check_column_exists(
-            if
-
+        column_with_affix = "{0}_{1}" if affix_type == "prefix" else "{1}_{0}"
+        column_with_affix = column_with_affix.format(affix,
+                                                     UtilFuncs._teradata_unquote_arg(column, "\""))
+        if df_utils._check_column_exists(column_with_affix.lower(), col_list):
+            if other_df_side == "right":
+                affix_type = "l{}".format(affix_type)
             else:
-
+                affix_type = "r{}".format(affix_type)
             raise TeradataMlException(
-                Messages.get_message(MessageCodes.TDMLDF_COLUMN_ALREADY_EXISTS,
-
+                Messages.get_message(MessageCodes.TDMLDF_COLUMN_ALREADY_EXISTS, column_with_affix, other_df_side,
+                                     affix_type),
                 MessageCodes.TDMLDF_COLUMN_ALREADY_EXISTS)
-        return UtilFuncs._teradata_quote_arg(
+        return UtilFuncs._teradata_quote_arg(column_with_affix, "\"", False)
 
     def __add_column_type_item_to_dict(self, new_metadata_dict, new_column, column, column_types):
         """
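The renaming rule above is compact enough to show standalone: a prefix goes before the column name and a suffix after, joined with an underscore, by swapping the slots of one format template. A minimal sketch with hypothetical helper names; the real method additionally handles quoting and collision errors:

    def with_affix(column, affix, affix_type):
        # "{0}_{1}" puts the affix first (prefix); "{1}_{0}" puts it last (suffix).
        template = "{0}_{1}" if affix_type == "prefix" else "{1}_{0}"
        return template.format(affix, column)

    assert with_affix("id", "t1", "prefix") == "t1_id"
    assert with_affix("id", "t1", "suffix") == "id_t1"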
@@ -7327,21 +7929,17 @@ class DataFrame():
 
         exec_mode = 'REMOTE' if UtilFuncs._is_lake() else 'IN-DB'
         if exec_mode == 'REMOTE':
-        [... removed lines not preserved in this rendering ...]
-                if env_name in env_mapper:
-                    env_mapper[env_name].append(colname)
-                else:
-                    env_mapper[env_name] = [colname]
+            _Validators._check_auth_token("udf")
+            for colname, col in udf_expr.items():
+                env_name = UtilFuncs._get_env_name(col)
+                # Store the env_name and its corresponding output column
+                if env_name in env_mapper:
+                    env_mapper[env_name].append(colname)
+                else:
+                    env_mapper[env_name] = [colname]
         else:
             env_mapper[env_name] = udf_expr.keys()
-
+
         for env_name, cols in env_mapper.items():
             # Create a dictionary of output columns to column type.
             returns = OrderedDict([(column.name, column.type) for column in df._metaexpr.c])
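The REMOTE branch above groups output columns by environment name with an explicit if/else on dict membership; dict.setdefault is a compact, behavior-equivalent alternative. A runnable sketch with illustrative data:

    # Map each output column to the environment its udf runs in.
    env_of = {"upper_col": "env_a", "lower_col": "env_a", "sum_col": "env_b"}

    env_mapper = {}
    for colname, env_name in env_of.items():
        # setdefault inserts an empty list on first sight of env_name.
        env_mapper.setdefault(env_name, []).append(colname)

    print(env_mapper)  # {'env_a': ['upper_col', 'lower_col'], 'env_b': ['sum_col']}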
@@ -7389,6 +7987,97 @@ class DataFrame():
         df = tbl_operators.execute()
         return df
 
+    def _assign_call_udf(self, call_udf_expr):
+        """
+        DESCRIPTION:
+            Internal function for DataFrame.assign() to execute the call_udf using
+            Script/Apply Table Operator and create new column for teradataml DataFrame.
+
+        PARAMETER:
+            call_udf_expr:
+                Required Argument.
+                Specifies a dictionary of column name to call_udf expressions.
+                Types: dict
+
+        RETURNS:
+            teradataml DataFrame
+
+        RAISES:
+            None.
+
+        EXAMPLES:
+            # call_udf_expr is a dictionary of column names to call_udf expressions.
+            call_udf_expr = {'upper_col': <teradataml.dataframe.sql._SQLColumnExpression object at 0x0000028E59C44310>,
+                             'sum_col': <teradataml.dataframe.sql._SQLColumnExpression object at 0x0000028E59C41690>}
+            self._assign_register(call_udf_expr)
+        """
+        df = self
+        # Create a dictionary of output columns to column type (teradata type).
+        returns = OrderedDict([(column.name, column.type) for column in df._metaexpr.c])
+        # Create a dictionary of output columns to column type (python types).
+        output_type_converters = {col_name: _Dtypes._teradata_type_to_python_type(col_type) \
+                                  for col_name, col_type in returns.items()}
+
+        for colname, col in call_udf_expr.items():
+            returns[colname] = col.type
+            output_type_converters[colname] = _Dtypes._teradata_type_to_python_type(col.type)
+            script_name = col._udf_script
+            delimiter = col._delimiter
+            quotechar = col._quotechar
+
+        # Create a dictionary of arguments to be passed to the script.
+        script_data = {}
+        script_data['input_cols'] = df.columns
+        script_data['output_cols'] = list(returns.keys())
+        script_data['output_type_converters'] = output_type_converters
+        script_data['function_args'] = {colname: col._udf_args}
+        script_data['delimiter'] = delimiter
+        script_data['qoutechar'] = quotechar
+
+        # Convert the dictionary to a string.
+        # The string is URL encoded to pass it as a parameter to the script.
+        script_data = urllib.parse.quote_plus(json.dumps(script_data))
+
+        if UtilFuncs._is_lake():
+            from teradataml.table_operators.Apply import Apply
+            apply_op_obj = Apply(data=df,
+                                 script_name=script_name,
+                                 env_name=col._env_name,
+                                 returns = returns,
+                                 delimiter = delimiter,
+                                 quotechar=quotechar,
+                                 files_local_path=GarbageCollector._get_temp_dir_name(),
+                                 apply_command="python3 {} {}".format(script_name, script_data)
+                                 )
+            try:
+                df = apply_op_obj.execute_script(
+                    output_style=OutputStyle.OUTPUT_TABLE.value)
+            except Exception:
+                raise
+        else:
+            import teradataml.context.context as context
+            database = context._get_current_databasename()
+
+            check_reserved_keyword = False if sorted(list(returns.keys())) == sorted(df.columns) else True
+
+            from teradataml.table_operators.Script import Script
+            table_op_obj = Script(data=df,
+                                  script_name=script_name,
+                                  files_local_path=GarbageCollector._get_temp_dir_name(),
+                                  script_command="{}/bin/python3 ./{}/{} {}".format(
+                                      configure.indb_install_location, database, script_name, script_data),
+                                  returns=returns,
+                                  quotechar=quotechar,
+                                  delimiter = delimiter
+                                  )
+            table_op_obj.check_reserved_keyword = check_reserved_keyword
+            try:
+                df = table_op_obj.execute_script(
+                    output_style=OutputStyle.OUTPUT_TABLE.value)
+            except Exception:
+                raise
+        return df
+
     @collect_queryband(queryband="DF_assign")
     def assign(self, drop_columns=False, **kwargs):
         """
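A runnable sketch of how _assign_call_udf above ships its parameters: the dict is serialized to JSON, then URL-encoded with quote_plus so it survives being passed as a single command-line argument, and the receiving script reverses both steps. Standard library only:

    import json
    import urllib.parse

    script_data = {"input_cols": ["accounts"],
                   "output_cols": ["accounts", "upper_col"],
                   "delimiter": ","}

    # Sender side: dict -> JSON string -> URL-safe token.
    encoded = urllib.parse.quote_plus(json.dumps(script_data))
    print(encoded)  # e.g. %7B%22input_cols%22%3A+%5B%22accounts%22%5D...

    # Receiver side: token -> JSON string -> dict.
    decoded = json.loads(urllib.parse.unquote_plus(encoded))
    assert decoded == script_data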
@@ -7420,7 +8109,7 @@ class DataFrame():
                 * SQLAlchemy ClauseElements.
                   (See teradataml extension with SQLAlchemy in teradataml User Guide
                   and Function reference guide for more details)
-                * Function - udf.
+                * Function - udf, call_udf.
 
 
         RETURNS:
@@ -7454,7 +8143,7 @@ class DataFrame():
                Look at Example 18 to understand more.
             8. While passing multiple udf expressions, one can not pass one column output
                as another column input in the same ``assign`` call.
-            9. If user pass multiple udf expressions, delimiter and quotechar specified in
+            9. If user pass multiple udf expressions, delimiter and quotechar specified in
                last udf expression are considered for processing.
 
         RAISES:
@@ -7819,13 +8508,13 @@ class DataFrame():
             Red Inc 200.0 150.0 140.0 NaN 17/01/04 201.0 abc RED INC 207
             >>>
 
-            # Example 19: Convert the values is 'accounts' column to upper case using a user
+            # Example 19: Convert the values is 'accounts' column to upper case using a user
             # defined function on Vantage Cloud Lake.
             # Create a Python 3.10.5 environment with given name and description in Vantage.
             >>> env = create_env('test_udf', 'python_3.10.5', 'Test environment for UDF')
             User environment 'test_udf' created.
             >>>
-            # Create a user defined functions to 'to_upper' to get the values in upper case
+            # Create a user defined functions to 'to_upper' to get the values in upper case
            # and pass the user env to run it on.
             >>> from teradataml.dataframe.functions import udf
             >>> @udf(env_name = env)
@@ -7837,7 +8526,31 @@ class DataFrame():
             # to the DataFrame.
             >>> df.assign(upper_stats = to_upper('accounts'))
             Feb Jan Mar Apr datetime upper_stats
-            accounts
+            accounts
+            Alpha Co 210.0 200.0 215.0 250.0 17/01/04 ALPHA CO
+            Blue Inc 90.0 50.0 95.0 101.0 17/01/04 BLUE INC
+            Yellow Inc 90.0 NaN NaN NaN 17/01/04 YELLOW INC
+            Jones LLC 200.0 150.0 140.0 180.0 17/01/04 JONES LLC
+            Orange Inc 210.0 NaN NaN 250.0 17/01/04 ORANGE INC
+            Red Inc 200.0 150.0 140.0 NaN 17/01/04 RED INC
+            >>>
+
+            # Example 20: Register and Call the user defined function to get the values upper case.
+            >>> from teradataml.dataframe.functions import udf, register, call_udf
+            >>> @udf
+            ... def to_upper(s):
+            ...     if s is not None:
+            ...         return s.upper()
+            >>>
+            # Register the created user defined function with name "upper".
+            >>> register("upper", to_upper)
+            >>>
+            # Call the user defined function registered with name "upper" and assign the
+            # ColumnExpression returned to the DataFrame.
+            >>> res = df.assign(upper_col = call_udf("upper", ('accounts',)))
+            >>> res
+            Feb Jan Mar Apr datetime upper_col
+            accounts
             Alpha Co 210.0 200.0 215.0 250.0 17/01/04 ALPHA CO
             Blue Inc 90.0 50.0 95.0 101.0 17/01/04 BLUE INC
             Yellow Inc 90.0 NaN NaN NaN 17/01/04 YELLOW INC
@@ -7894,10 +8607,14 @@ class DataFrame():
         # column name to normal/regular expressions.
         udf_expr = {}
         regular_expr = {}
+        call_udf_expr = {}
         for colname, col in kwargs.items():
             # If value passed in kwargs is a ColumnExpression and is a udf, store it.
             if isinstance(col, ColumnExpression) and col._udf:
                 udf_expr[colname] = col
+            # If value passed in kwargs is a ColumnExpression and is a registerd udf script, store it.
+            elif isinstance(col, ColumnExpression) and col._udf_script:
+                call_udf_expr[colname] = col
             else:
                 regular_expr[colname] = col
         df = self
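The three-way routing above can be exercised standalone: each assign() keyword argument is dispatched on what kind of expression it is, and each bucket is later executed by a different path. A sketch with stub classes standing in for teradataml's ColumnExpression variants:

    class Expr:
        def __init__(self, udf=False, udf_script=None):
            self._udf = udf
            self._udf_script = udf_script

    kwargs = {"a": Expr(udf=True), "b": Expr(udf_script="upper.py"), "c": Expr()}

    udf_expr, call_udf_expr, regular_expr = {}, {}, {}
    for colname, col in kwargs.items():
        if col._udf:
            udf_expr[colname] = col          # inline @udf expression
        elif col._udf_script:
            call_udf_expr[colname] = col     # registered script via call_udf
        else:
            regular_expr[colname] = col      # ordinary column expression

    print(sorted(udf_expr), sorted(call_udf_expr), sorted(regular_expr))
    # ['a'] ['b'] ['c']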
@@ -7917,6 +8634,9 @@ class DataFrame():
         if bool(udf_expr):
             df = df._assign_udf(udf_expr)
 
+        if bool(call_udf_expr):
+            df = df._assign_call_udf(call_udf_expr)
+
         return df
 
 
@@ -8116,7 +8836,9 @@ class DataFrame():
         _Validators._validate_column_exists_in_dataframe(keys, self._metaexpr)
 
         try:
-
+
+            # Slicing creates a new list instance with the same contents.
+            new_index_list = self._index_label[:] if self._index_label is not None else []
 
             # Creating a list with requested index labels bases on append
             if append:
@@ -8131,7 +8853,7 @@ class DataFrame():
                 new_index_list = keys
 
         # Takes care of appending already existing index
-        new_index_list = list(
+        new_index_list = list(dict.fromkeys(new_index_list))
 
         # In case requested index is same as existing index, return same DF
         if new_index_list == self._index_label:
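dict.fromkeys, used above, removes duplicates while keeping first-seen order (dicts preserve insertion order since Python 3.7), which a plain set() would not guarantee. A one-liner demonstration:

    new_index_list = ["id", "gpa", "id", "masters", "gpa"]
    print(list(dict.fromkeys(new_index_list)))   # ['id', 'gpa', 'masters']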
@@ -9014,15 +9736,15 @@ class DataFrame():
             TypeError, ValueError, TeradataMLException
 
         EXAMPLES:
-
-
+            # Load the example datasets.
+            >>> load_example_data("dataframe", ["ocean_buoys", "ocean_buoys_nonpti"])
             >>>
 
-
-
-
-
-
+            # Create the required DataFrames.
+            # DataFrame on non-sequenced PTI table
+            >>> ocean_buoys = DataFrame("ocean_buoys")
+            # Check DataFrame columns and let's peek at the data
+            >>> ocean_buoys.columns
             ['buoyid', 'TD_TIMECODE', 'temperature', 'salinity']
             >>> ocean_buoys.head()
             TD_TIMECODE temperature salinity
@@ -9038,10 +9760,10 @@ class DataFrame():
             0 2014-01-06 08:00:00.000000 10.0 55
             0 2014-01-06 08:10:00.000000 10.0 55
 
-
-
-
-
+            # DataFrame on NON-PTI table
+            >>> ocean_buoys_nonpti = DataFrame("ocean_buoys_nonpti")
+            # Check DataFrame columns and let's peek at the data
+            >>> ocean_buoys_nonpti.columns
             ['buoyid', 'timecode', 'temperature', 'salinity']
             >>> ocean_buoys_nonpti.head()
             buoyid temperature salinity
@@ -9553,6 +10275,12 @@ class DataFrame():
         # Validate argument types
         _Validators._validate_function_arguments(awu_matrix)
 
+        # If self and right DataFrames are pointing to same Table object,
+        # raise error.
+        if self._metaexpr.t is right._metaexpr.t:
+            raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_ALIAS_REQUIRED, "merge"),
+                                      MessageCodes.TDMLDF_ALIAS_REQUIRED)
+
         if (right_on is not None and left_on is None) or (right_on is None and left_on is not None):
             raise TeradataMlException(
                 Messages.get_message(MessageCodes.MUST_PASS_ARGUMENT, "left_on", "right_on"),
@@ -9609,6 +10337,15 @@ class DataFrame():
         # If user did not pass any arguments which form join conditions,
         # Merge is performed using index columns of TeradataML DataFrames
         if on is None and left_on is None and right_on is None and not use_index:
+            # DataFrames created on OTF table will not have index.
+            if self._datalake is not None or right._datalake is not None:
+                msg_code = MessageCodes.EXECUTION_FAILED
+                emsg = "Either 'on' argument or both 'left_on' and 'right_on' arguments" \
+                       " must be provided to merge DataFrames when they are created on" \
+                       " OTF table(s)."
+                error_msg = Messages.get_message(msg_code, "merge dataframes", emsg)
+                raise TeradataMlException(error_msg, msg_code)
+
             if self._index_label is None or right._index_label is None:
                 raise TeradataMlException(
                     Messages.get_message(MessageCodes.TDMLDF_INDEXES_ARE_NONE), MessageCodes.TDMLDF_INDEXES_ARE_NONE)
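Given the check above, DataFrames backed by OTF (datalake) tables must spell out their join keys, since they carry no index to fall back on. A sketch, assuming a connected session and two such DataFrames that share an 'id' column:

    >>> # Index-based merging would raise the error added above.
    >>> lhs.merge(rhs, on="id", how="inner")
    >>> # Equivalently, name the key on each side.
    >>> lhs.merge(rhs, left_on="id", right_on="id", how="inner")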
@@ -9616,6 +10353,12 @@ class DataFrame():
             use_index = True
 
         if use_index:
+            if self._datalake is not None or right._datalake is not None:
+                msg_code = MessageCodes.EXECUTION_FAILED
+                emsg = "Can not use Index to merge DataFrames when they are created on OTF table(s)."
+                error_msg = Messages.get_message(msg_code, "merge dataframes", emsg)
+                raise TeradataMlException(error_msg, msg_code)
+
             if self._index_label is None or right._index_label is None:
                 raise TeradataMlException(
                     Messages.get_message(MessageCodes.TDMLDF_INDEXES_ARE_NONE), MessageCodes.TDMLDF_INDEXES_ARE_NONE)
@@ -10271,7 +11014,7 @@ class DataFrame():
                     2. seed is supported for stratify column.
                     3. Arguments "stratify_column", "seed", "id_column" are supported only
                        for stratifying the data.
-                Types: str
+                Types: str OR Feature
 
             seed:
                 Optional Argument.
@@ -10297,7 +11040,7 @@ class DataFrame():
                        for stratifying the data.
                     2. "id_column" is supported only when "stratify_column" is used.
                        Ignored otherwise.
-                Types: str
+                Types: str OR Feature
 
         RETURNS:
             teradataml DataFrame
@@ -12332,6 +13075,9 @@ class DataFrame():
                                                      False)
         column_names = list(dict.fromkeys(column_names))
 
+        if list_td_reserved_keywords(column_names) or UtilFuncs._is_ascii(column_names):
+            column_names = UtilFuncs._teradata_quote_arg(column_names, "\"", False)
+
         col_names_types = df_utils._get_required_columns_types_from_metaexpr(self._metaexpr, column_names)
         sel_nodeid = self._aed_utils._aed_select(self._nodeid, ','.join(column_names), True)
         new_metaexpr = UtilFuncs._get_metaexpr_using_columns(sel_nodeid, col_names_types.items())
@@ -14249,7 +14995,18 @@ class DataFrame():
             >>> plot.show()
 
         """
-
+
+        _plot = _Plot(x=x, y=y, scale=scale, kind=kind, **kwargs)
+        # If plot is already generated, return the same plot.
+        if self._plot is None:
+            self._plot = _plot
+            return _plot
+
+        if self._plot == _plot:
+            return self._plot
+        else:
+            self._plot = _plot
+            return _plot
 
     @collect_queryband(queryband="DF_itertuples")
     def itertuples(self, name='Row', num_rows=None):
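The caching pattern above rebuilds the plot object but hands back the cached instance when an equality check says nothing changed; _Plot's __eq__ is assumed to compare plot parameters. A standalone sketch with stub classes that do exactly that:

    class Plot:
        def __init__(self, **params):
            self.params = params

        def __eq__(self, other):
            # Equal when built from the same parameters.
            return isinstance(other, Plot) and self.params == other.params

    class Frame:
        _plot = None

        def plot(self, **params):
            new_plot = Plot(**params)
            if self._plot is None or self._plot != new_plot:
                self._plot = new_plot
            return self._plot

    f = Frame()
    p1 = f.plot(kind="line", x="id")
    p2 = f.plot(kind="line", x="id")
    assert p1 is p2                           # same parameters: cached object
    assert f.plot(kind="bar", x="id") is not p1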
@@ -17142,11 +17899,18 @@ class _TDUAF(DataFrame):
         table_name = self._db_utils._execute_node_return_db_object_name(self._data._nodeid, self._data._metaexpr)
 
         # UAF Functions do not accept double quotes.
+        tdp = preparer(td_dialect)
         db_name = UtilFuncs._extract_db_name(table_name)
-
-
+        datalake_name = UtilFuncs._extract_datalake_name(table_name)
+        if datalake_name:
+            table_name = '{}.{}.{}'.format(tdp.quote(datalake_name),
+                                           tdp.quote(db_name),
+                                           tdp.quote(UtilFuncs._extract_table_name(table_name)))
+        elif db_name:
+            table_name = '{}.{}'.format(tdp.quote(db_name),
+                                        tdp.quote(UtilFuncs._extract_table_name(table_name)))
         else:
-            table_name = UtilFuncs._extract_table_name(table_name)
+            table_name = tdp.quote(UtilFuncs._extract_table_name(table_name))
 
         sql_clauses.append("TABLE_NAME ({})")
         sql_values.append(table_name)
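A runnable sketch of the dotted-name handling above: each identifier part is quoted separately, and the optional datalake and database parts are prepended only when present. The quote() here is a simplified stand-in; the code in the hunk uses the SQLAlchemy identifier preparer for the Teradata dialect instead.

    def quote(identifier):
        # Double any embedded quote, then wrap, per SQL identifier rules.
        return '"{}"'.format(identifier.replace('"', '""'))

    def qualify(table, db=None, datalake=None):
        parts = [p for p in (datalake, db, table) if p]
        return ".".join(quote(p) for p in parts)

    print(qualify("sales"))                            # "sales"
    print(qualify("sales", db="mydb"))                 # "mydb"."sales"
    print(qualify("sales", db="mydb", datalake="dl"))  # "dl"."mydb"."sales"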