teradataml 20.0.0.2__py3-none-any.whl → 20.0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This release has been flagged as potentially problematic by the registry.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/README.md +196 -2
- teradataml/__init__.py +4 -0
- teradataml/_version.py +1 -1
- teradataml/analytics/analytic_function_executor.py +79 -4
- teradataml/analytics/json_parser/metadata.py +12 -3
- teradataml/analytics/json_parser/utils.py +7 -2
- teradataml/analytics/sqle/__init__.py +1 -0
- teradataml/analytics/table_operator/__init__.py +1 -1
- teradataml/analytics/uaf/__init__.py +1 -1
- teradataml/analytics/utils.py +4 -0
- teradataml/automl/data_preparation.py +3 -2
- teradataml/automl/feature_engineering.py +15 -7
- teradataml/automl/model_training.py +39 -33
- teradataml/common/__init__.py +2 -1
- teradataml/common/constants.py +35 -0
- teradataml/common/garbagecollector.py +2 -1
- teradataml/common/messagecodes.py +8 -2
- teradataml/common/messages.py +3 -1
- teradataml/common/sqlbundle.py +25 -3
- teradataml/common/utils.py +134 -9
- teradataml/context/context.py +20 -10
- teradataml/data/SQL_Fundamentals.pdf +0 -0
- teradataml/data/dataframe_example.json +18 -2
- teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/Shap.py +7 -1
- teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +4 -4
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
- teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
- teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
- teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
- teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
- teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
- teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
- teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
- teradataml/data/jsons/uaf/17.20/TD_SAX.json +3 -1
- teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +15 -5
- teradataml/data/medical_readings.csv +101 -0
- teradataml/data/patient_profile.csv +101 -0
- teradataml/data/scripts/lightgbm/dataset.template +157 -0
- teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
- teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
- teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
- teradataml/data/scripts/sklearn/sklearn_fit.py +194 -167
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
- teradataml/data/scripts/sklearn/sklearn_function.template +14 -19
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
- teradataml/data/scripts/sklearn/sklearn_transform.py +129 -42
- teradataml/data/target_udt_data.csv +8 -0
- teradataml/data/templates/open_source_ml.json +3 -2
- teradataml/data/vectordistance_example.json +4 -0
- teradataml/dataframe/dataframe.py +543 -175
- teradataml/dataframe/functions.py +553 -25
- teradataml/dataframe/sql.py +184 -15
- teradataml/dbutils/dbutils.py +556 -18
- teradataml/dbutils/filemgr.py +48 -1
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/__init__.py +1 -1
- teradataml/opensource/{sklearn/_class.py → _class.py} +102 -17
- teradataml/opensource/_lightgbm.py +950 -0
- teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +1 -2
- teradataml/opensource/{sklearn/constants.py → constants.py} +13 -10
- teradataml/opensource/sklearn/__init__.py +0 -1
- teradataml/opensource/sklearn/_sklearn_wrapper.py +798 -438
- teradataml/options/__init__.py +7 -23
- teradataml/options/configure.py +29 -3
- teradataml/scriptmgmt/UserEnv.py +3 -3
- teradataml/scriptmgmt/lls_utils.py +74 -21
- teradataml/store/__init__.py +13 -0
- teradataml/store/feature_store/__init__.py +0 -0
- teradataml/store/feature_store/constants.py +291 -0
- teradataml/store/feature_store/feature_store.py +2223 -0
- teradataml/store/feature_store/models.py +1505 -0
- teradataml/store/vector_store/__init__.py +1586 -0
- teradataml/table_operators/query_generator.py +3 -0
- teradataml/table_operators/table_operator_query_generator.py +3 -1
- teradataml/table_operators/table_operator_util.py +37 -38
- teradataml/table_operators/templates/dataframe_register.template +69 -0
- teradataml/utils/dtypes.py +4 -2
- teradataml/utils/validators.py +33 -1
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/METADATA +200 -5
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/RECORD +88 -65
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/zip-safe +0 -0
teradataml/dataframe/dataframe.py

```diff
@@ -19,6 +19,7 @@ import pandas as pd
 import re
 import sqlalchemy
 import sys
+import urllib.parse
 import teradataml.context.context as tdmlctx
 
 from collections import OrderedDict, namedtuple
@@ -44,6 +45,7 @@ from teradataml.options.display import display
 from teradataml.dataframe.copy_to import copy_to_sql
 from teradataml.dataframe.row import _Row
 from teradataml.dataframe.setop import concat
+from teradataml.dbutils.dbutils import list_td_reserved_keywords
 from teradataml.plot.plot import _Plot
 from teradataml.scriptmgmt.UserEnv import UserEnv
 from teradataml.utils.dtypes import _Dtypes, _ListOf, _TupleOf
@@ -57,6 +59,7 @@ from teradataml.common.bulk_exposed_utils import _validate_unimplemented_functio
 from teradataml.telemetry_utils.queryband import collect_queryband
 from teradataml.options.configure import configure
 from teradataml.utils.internal_buffer import _InternalBuffer
+from teradataml.common.constants import OutputStyle
 
 # TODO use logger when available on master branch
 # logger = teradatapylog.getLogger()
@@ -229,7 +232,7 @@ class DataFrame():
 
             self._nodeid = self._aed_utils._aed_query(self._query, temp_table_name)
         else:
-            if inspect.stack()[1][3] not in ['_from_node', '__init__']:
+            if inspect.stack()[1][3] not in ['_from_node', '__init__', 'alias']:
                 raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_CREATE_FAIL),
                                           MessageCodes.TDMLDF_CREATE_FAIL)
 
@@ -241,6 +244,7 @@ class DataFrame():
             self._iloc = _LocationIndexer(self, integer_indexing=True)
             self.__data = None
             self.__data_columns = None
+            self._alias = None
 
         except TeradataMlException:
             raise
@@ -250,6 +254,100 @@ class DataFrame():
             raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_CREATE_FAIL),
                                       MessageCodes.TDMLDF_CREATE_FAIL) from err
 
+    @property
+    def db_object_name(self):
+        """
+        DESCRIPTION:
+            Get the underlying database object name, on which DataFrame is
+            created.
+
+        RETURNS:
+            str representing object name of DataFrame
+
+        EXAMPLES:
+            >>> load_example_data("dataframe", "sales")
+            >>> df = DataFrame('sales')
+            >>> df.db_object_name
+            '"sales"'
+        """
+        if self._table_name is not None:
+            return self._table_name
+        else:
+            msg = "Object name is available once DataFrame is materialized. " \
+                  "Use DataFrame.materialize() to materialize DataFrame."
+            print(msg)
+
+    def alias(self, alias_name):
+        """
+        DESCRIPTION:
+            Method to create an aliased teradataml DataFrame.
+            Note:
+                * This method is recommended to be used before performing
+                  self join using DataFrame's join() API.
+
+        PARAMETERS:
+            alias_name:
+                Required Argument.
+                Specifies the alias name to be assigned to a teradataml DataFrame.
+                Types: str
+
+        RETURNS:
+            teradataml DataFrame
+
+        EXAMPLES:
+            >>> load_example_data("dataframe", "admissions_train")
+            >>> df = DataFrame("admissions_train")
+            >>> df
+               masters   gpa     stats programming  admitted
+            id
+            13      no  4.00  Advanced      Novice         1
+            26     yes  3.57  Advanced    Advanced         1
+            5       no  3.44    Novice      Novice         0
+            19     yes  1.98  Advanced    Advanced         0
+            15     yes  4.00  Advanced    Advanced         1
+            40     yes  3.95    Novice    Beginner         0
+            7      yes  2.33    Novice      Novice         1
+            22     yes  3.46    Novice    Beginner         0
+            36      no  3.00  Advanced      Novice         0
+            38     yes  2.65  Advanced    Beginner         1
+
+            # Example 1: Create an alias of teradataml DataFrame.
+            >>> df2 = df.alias("adm_trn")
+
+            # Print aliased DataFrame.
+            >>> df2
+               masters   gpa     stats programming  admitted
+            id
+            13      no  4.00  Advanced      Novice         1
+            26     yes  3.57  Advanced    Advanced         1
+            5       no  3.44    Novice      Novice         0
+            19     yes  1.98  Advanced    Advanced         0
+            15     yes  4.00  Advanced    Advanced         1
+            40     yes  3.95    Novice    Beginner         0
+            7      yes  2.33    Novice      Novice         1
+            22     yes  3.46    Novice    Beginner         0
+            36      no  3.00  Advanced      Novice         0
+            38     yes  2.65  Advanced    Beginner         1
+        """
+        arg_info_matrix = [["alias_name", alias_name, False, (str), True]]
+        _Validators._validate_function_arguments(arg_info_matrix)
+        try:
+            alias_df = self._from_node(self._nodeid, self._metaexpr, self._index_label,
+                                       reuse_metaexpr=False)
+            # Assigning self attributes to newly created alias dataframe.
+            alias_df._table_name = self._table_name
+            alias_df._index = self._index
+            alias_df._index_label = self._index_label
+            setattr(alias_df._metaexpr.t, "table_alias", alias_name)
+            alias_df._alias = alias_name
+            return alias_df
+        except Exception as err:
+            error_code = MessageCodes.EXECUTION_FAILED
+            error_msg = Messages.get_message(
+                error_code, "create alias dataFrame", '{}'.format(str(err)))
+            raise TeradataMlException(error_msg, error_code)
+
     @classmethod
     @collect_queryband(queryband="DF_fromTable")
     def from_table(cls, table_name, index=True, index_label=None):
```
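The two additions above are the core of this release's self-join support: `alias()` clones the DataFrame with a fresh `_MetaExpression` (note the new `'alias'` entry in the `inspect.stack()` allowlist), and `db_object_name` exposes the quoted name of the materialized object. A minimal usage sketch, assuming a Vantage connection has already been created via `create_context()` (not shown):

```python
from teradataml import DataFrame, load_example_data

load_example_data("dataframe", "admissions_train")
df = DataFrame("admissions_train")

# db_object_name returns the quoted underlying object name once the
# DataFrame is backed by a materialized table.
print(df.db_object_name)

# alias() returns a copy whose underlying SQLAlchemy table carries a
# distinct table_alias, which join() can then compile against.
rhs = df.alias("rhs")
self_joined = df.join(other=rhs, on="id", how="inner", lprefix="l", rprefix="r")
print(self_joined.head())
```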
```diff
@@ -364,7 +462,7 @@ class DataFrame():
         return cls(index=index, index_label=index_label, query=query, materialize=materialize)
 
     @classmethod
-    def _from_node(cls, nodeid, metaexpr, index_label=None, undropped_index=None):
+    def _from_node(cls, nodeid, metaexpr, index_label=None, undropped_index=None, reuse_metaexpr=True):
         """
         Private class method for creating a DataFrame from a nodeid and parent metadata.
 
@@ -385,6 +483,12 @@ class DataFrame():
                 Optional Argument.
                 List specifying index column(s) to be retained as columns for printing.
 
+            reuse_metaexpr:
+                Optional Argument.
+                Specifies the flag to decide whether to use same _MetaExpression object or not.
+                Default Value: True
+                Types: bool
+
         EXAMPLES:
             from teradataml.dataframe.dataframe import DataFrame
             df = DataFrame._from_node(1234, metaexpr)
@@ -400,28 +504,43 @@ class DataFrame():
         df = cls()
         df._nodeid = nodeid
         df._source_type = SourceType.TABLE.value
-
+
+        if not reuse_metaexpr:
+            # Create new _MetaExpression object using reference metaExpression
+            # for newly created DataFrame.
+            df._metaexpr = UtilFuncs._get_metaexpr_using_parent_metaexpr(nodeid, metaexpr)
+            # When metaexpression is created using only column information from parent DataFrame,
+            # underlying SQLAlchemy table is created with '' string as Table name.
+            # Assign name from reference mataexpression here.
+            df._metaexpr.t.name = metaexpr.t.name
+            # Populate corresponding information into newly created DataFrame object
+            # using newly created metaExpression.
+            df._get_metadata_from_metaexpr(df._metaexpr)
+        else:
+            # Populate corresponding information into newly created DataFrame object
+            # using reference metaExpression.
+            df._get_metadata_from_metaexpr(metaexpr)
 
         if isinstance(index_label, str):
             index_label = [index_label]
 
-        if index_label is not None and all(elem in [col.name for col in
+        if index_label is not None and all(elem in [col.name for col in df._metaexpr.c] for elem in index_label):
             df._index_label = index_label
         elif index_label is not None and all(UtilFuncs._teradata_quote_arg(elem, "\"", False)
-                                             in [col.name for col in
+                                             in [col.name for col in df._metaexpr.c] for elem in index_label):
             df._index_label = index_label
 
         # Set the flag suggesting that the _index_label is set,
-        # and that a database lookup
+        # and that a database lookup won't be required even when it is None.
         df._index_query_required = False
 
         if isinstance(undropped_index, str):
             undropped_index = [undropped_index]
 
-        if undropped_index is not None and all(elem in [col.name for col in
+        if undropped_index is not None and all(elem in [col.name for col in df._metaexpr.c] for elem in undropped_index):
             df._undropped_index = undropped_index
         elif undropped_index is not None and all(UtilFuncs._teradata_quote_arg(elem, "\"", False)
-                                                 in [col.name for col in
+                                                 in [col.name for col in df._metaexpr.c] for elem in undropped_index):
             df._undropped_index = undropped_index
 
         return df
@@ -789,7 +908,10 @@ class DataFrame():
         Private method for setting _metaexpr and retrieving column names and types.
 
         PARAMETERS:
-            metaexpr
+            metaexpr:
+                Required Argument.
+                Specifies parent meta data (_MetaExpression object).
+                Types: _MetaExpression
 
         RETURNS:
             None
@@ -802,7 +924,8 @@ class DataFrame():
         self._column_names_and_types = []
         self._td_column_names_and_types = []
         self._td_column_names_and_sqlalchemy_types = {}
-
+
+        for col in self._metaexpr.c:
             if isinstance(col.type, sqlalchemy.sql.sqltypes.NullType):
                 tdtype = TeradataTypes.TD_NULL_TYPE.value
             else:
@@ -2066,7 +2189,7 @@ class DataFrame():
         else:
             col_filters = col_names
 
-        col_filters_decode = ["
+        col_filters_decode = ["CASE WHEN \"{}\" IS NULL THEN 0 ELSE 1 END".format(col_name) for col_name in col_filters]
         fmt_filter = " + ".join(col_filters_decode)
 
         if thresh is not None:
```
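The last hunk above builds a per-row count of populated columns for the null-threshold check: each column contributes 1 when non-NULL, and the CASE expressions are summed. A sketch of the fragment it generates, with illustrative column names:

```python
# Mirror of the list comprehension in the hunk; column names are illustrative.
col_filters = ["accounts", "Feb", "Jan"]
col_filters_decode = ['CASE WHEN "{}" IS NULL THEN 0 ELSE 1 END'.format(c)
                      for c in col_filters]
# Summing the CASE expressions yields the per-row count of non-NULL columns,
# which the thresh argument can be compared against.
fmt_filter = " + ".join(col_filters_decode)
print(fmt_filter)
# CASE WHEN "accounts" IS NULL THEN 0 ELSE 1 END + CASE WHEN "Feb" IS NULL ...
```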
```diff
@@ -5555,7 +5678,7 @@ class DataFrame():
         try:
             # Printing the DF will actually run underlying select query and
             # will brought up numeric overflow if any. Only materializing won't work.
-
+            repr(result_df)
             return False
         except TeradataMlException as tme:
             if "Numeric overflow occurred during computation" in str(tme):
@@ -6019,6 +6142,8 @@ class DataFrame():
                 * "open_sessions" specifies the number of Teradata data transfer
                   sessions to be opened for fastexport. This argument is only applicable
                   in fastexport mode.
+                * Function returns the pandas dataframe with Decimal columns types as float instead of object.
+                  If user want datatype to be object, set argument "coerce_float" to False.
 
         Notes:
             1. For additional information about "coerce_float" and
```
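The new note on `to_pandas()` documents that DECIMAL columns arrive as float by default. A hedged sketch of both behaviors, assuming `df` is a teradataml DataFrame with a DECIMAL column:

```python
# Default: coerce_float=True maps Teradata DECIMAL columns to float64.
pdf = df.to_pandas()

# Keep exact precision: object dtype holding decimal.Decimal values.
pdf_exact = df.to_pandas(coerce_float=False)

print(pdf.dtypes)
print(pdf_exact.dtypes)
```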
```diff
@@ -6334,15 +6459,22 @@ class DataFrame():
            Supported join operators are =, ==, <, <=, >, >=, <> and != (= and <> operators are
            not supported when using DataFrame columns as operands).
 
-            1. When multiple join conditions are given
-            2.
+            Notes:
+                1. When multiple join conditions are given as a list string/ColumnExpression,
+                   they are joined using AND operator.
+                2. Two or more on conditions can be combined using & and | operators
+                   and can be passed as single ColumnExpression.
+                   You can use (df1.a == df1.b) & (df1.c == df1.d) in place of
+                   [df1.a == df1.b, df1.c == df1.d].
+                3. Two or more on conditions can not be combined using pythonic 'and'
+                   and 'or'.
+                   You can use (df1.a == df1.b) & (df1.c == df1.d) in place of
+                   [df1.a == df1.b and df1.c == df1.d].
+                4. Performing self join using same DataFrame object in 'other'
+                   argument is not supported. In order to perform self join,
+                   first create aliased DataFrame using alias() API and pass it
+                   for 'other' argument. Refer to Example 10 in EXAMPLES section.
 
-            You can use [df1.a == df1.b, df1.c == df1.d] in place of
-            [(df1.a == df1.b) & (df1.c == df1.d)].
 
         PARAMETERS:
 
@@ -6370,15 +6502,20 @@ class DataFrame():
                   is the column of left dataframe df1 and col2 is the column of right
                   dataframe df2.
                   Examples:
-                    1. [df1.a == df2.a, df1.b == df2.b] indicates df1.a = df2.a
-                    2. [df1.a == df2.b, df1.c == df2.d] indicates df1.a = df2.b
-                    3. [df1.a <= df2.b
-                    4. [df1.a < df2.b
+                    1. [df1.a == df2.a, df1.b == df2.b] indicates df1.a = df2.a AND df1.b = df2.b.
+                    2. [df1.a == df2.b, df1.c == df2.d] indicates df1.a = df2.b AND df1.c = df2.d.
+                    3. [df1.a <= df2.b & df1.c > df2.d] indicates df1.a <= df2.b AND df1.c > df2.d.
+                    4. [df1.a < df2.b | df1.c >= df2.d] indicates df1.a < df2.b OR df1.c >= df2.d.
                     5. df1.a != df2.b indicates df1.a != df2.b.
                 • The combination of both string comparisons and comparisons as column expressions.
                   Examples:
-                    1. ["a", df1.b == df2.b] indicates df1.a = df2.a
-                    2. [df1.a <= df2.b, "c > d"] indicates df1.a <= df2.b
+                    1. ["a", df1.b == df2.b] indicates df1.a = df2.a AND df1.b = df2.b.
+                    2. [df1.a <= df2.b, "c > d"] indicates df1.a <= df2.b AND df1.c > df2.d.
+                • ColumnExpressions containing FunctionExpressions which represent SQL functions
+                  invoked on DataFrame Columns.
+                  Examples:
+                    1. (df1.a.round(1) - df2.a.round(1)).mod(2.5) > 2
+                    2. df1.a.floor() - df2.b.floor() > 2
 
                 Types: str (or) ColumnExpression (or) List of strings(str) or ColumnExpressions
 
@@ -6400,7 +6537,7 @@ class DataFrame():
                 Specifies the suffix to be added to the right table columns.
                 Default Value: None.
                 Types: str
-
+
             lprefix:
                 Optional Argument.
                 Specifies the prefix to be added to the left table columns.
@@ -6450,7 +6587,7 @@ class DataFrame():
            0        2        2  analytics      2.3      2.3    b  analytics    b
            1        1        1   teradata      1.3      1.3    a   teradata    a
 
-            # Example 2: One "on" argument condition is ColumnExpression and other is string having two
+            # Example 2: One "on" argument condition is ColumnExpression and other is string having two
             # columns with left outer join.
             >>> df1.join(df2, on = [df1.col2 == df2.col4,"col5 = col7"], how = "left", lprefix = "t1", rprefix = "t2")
               t1_col1 t2_col1       col2  t1_col3  t2_col3 col5       col4 col7
@@ -6464,7 +6601,7 @@ class DataFrame():
            0        2        2  analytics      2.3      2.3    b  analytics    b
            1        1        1   teradata      1.3      1.3    a   teradata    a
 
-            # Example 4: One "on" argument condition is ColumnExpression and other is string having two
+            # Example 4: One "on" argument condition is ColumnExpression and other is string having two
             # columns with full join.
             >>> df1.join(other = df2, on = ["col2=col4",df1.col5 == df2.col7], how = "full", lprefix = "t1", rprefix = "t2")
               t1_col1 t2_col1       col2  t1_col3  t2_col3 col5       col4 col7
@@ -6542,7 +6679,53 @@ class DataFrame():
            3  Beginner  Beginner  1  3.95  Beginner  3.70  Novice  0  1  no  yes
            3  Beginner  Beginner  2  3.76  Beginner  3.70  Novice  0  1  no  yes
            3  Beginner    Novice  3  3.70  Beginner  3.70  Novice  1  1  no   no
+
+            # Example 10: Perform self join using aliased DataFrame.
+            # Create an aliased DataFrame.
+            >>> lhs = DataFrame("admissions_train").head(3).sort("id")
+            >>> rhs = lhs.alias("rhs")
+            # Use aliased DataFrame for self join.
+            >>> joined_df = lhs.join(other=rhs, how="cross", lprefix="l", rprefix="r")
+            >>> joined_df
+               l_id  r_id l_masters r_masters  l_gpa  r_gpa   l_stats   r_stats l_programming r_programming  l_admitted  r_admitted
+            0     1     3       yes        no   3.95   3.70  Beginner    Novice      Beginner      Beginner           0           1
+            1     2     2       yes       yes   3.76   3.76  Beginner  Beginner      Beginner      Beginner           0           0
+            2     2     3       yes        no   3.76   3.70  Beginner    Novice      Beginner      Beginner           0           1
+            3     3     1        no       yes   3.70   3.95    Novice  Beginner      Beginner      Beginner           1           0
+            4     3     3        no        no   3.70   3.70    Novice    Novice      Beginner      Beginner           1           1
+            5     3     2        no       yes   3.70   3.76    Novice  Beginner      Beginner      Beginner           1           0
+            6     2     1       yes       yes   3.76   3.95  Beginner  Beginner      Beginner      Beginner           0           0
+            7     1     2       yes       yes   3.95   3.76  Beginner  Beginner      Beginner      Beginner           0           0
+            8     1     1       yes       yes   3.95   3.95  Beginner  Beginner      Beginner      Beginner           0           0
+
+            # Example 11: Perform join with compound 'on' condition having
+            #             more than one binary operator.
+            >>> rhs_2 = lhs.assign(double_gpa=lhs.gpa * 2)
+            >>> joined_df_2 = lhs.join(rhs_2, on=rhs_2.double_gpa == lhs.gpa * 2, how="left", lprefix="l", rprefix="r")
+            >>> joined_df_2
+               l_id  r_id l_masters r_masters  l_gpa  r_gpa   l_stats   r_stats l_programming r_programming  l_admitted  r_admitted  double_gpa
+            0     3     3        no        no   3.70   3.70    Novice    Novice      Beginner      Beginner           1           1        7.40
+            1     2     2       yes       yes   3.76   3.76  Beginner  Beginner      Beginner      Beginner           0           0        7.52
+            2     1     1       yes       yes   3.95   3.95  Beginner  Beginner      Beginner      Beginner           0           0        7.90
+
+            # Example 12: Perform join on DataFrames with 'on' condition
+            #             having FunctionExpression.
+            >>> df = DataFrame("admissions_train")
+            >>> df2 = df.alias("rhs_df")
+            >>> joined_df_3 = df.join(df2, on=(df.gpa.round(1) - df2.gpa.round(1)).mod(2.5) > 2,
+            >>>                       how="inner", lprefix="l")
+            >>> joined_df_3.sort(["id", "l_id"])
+               l_id  id l_masters masters  l_gpa   gpa   l_stats     stats l_programming programming  l_admitted  admitted
+            0     1  24       yes      no   3.95  1.87  Beginner  Advanced      Beginner      Novice           0         1
+            1    13  24        no      no    4.0  1.87  Advanced  Advanced        Novice      Novice           1         1
+            2    15  24       yes      no    4.0  1.87  Advanced  Advanced      Advanced      Novice           1         1
+            3    25  24        no      no   3.96  1.87  Advanced  Advanced      Advanced      Novice           1         1
+            4    27  24       yes      no   3.96  1.87  Advanced  Advanced      Advanced      Novice           0         1
+            5    29  24       yes      no    4.0  1.87    Novice  Advanced      Beginner      Novice           0         1
+            6    40  24       yes      no   3.95  1.87    Novice  Advanced      Beginner      Novice           0         1
+
         """
+
         # Argument validations
         awu_matrix = []
         awu_matrix.append(["other", other, False, (DataFrame)])
```
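Notes 2 and 3 in the updated docstring draw a distinction worth showing concretely: ColumnExpression conditions compose with the bitwise `&`/`|` operators, never with Python's `and`/`or` (which force boolean evaluation). A sketch using the `df1`/`df2` frames from the surrounding examples:

```python
# AND of two conditions, passed as a single ColumnExpression.
cond = (df1.col2 == df2.col4) & (df1.col5 != df2.col7)
res = df1.join(df2, on=cond, how="inner", lprefix="t1", rprefix="t2")

# OR is only expressible this way; a list of conditions is always AND-ed.
cond_or = (df1.col2 == df2.col4) | (df1.col5 == df2.col7)
res_or = df1.join(df2, on=cond_or, how="inner", lprefix="t1", rprefix="t2")
```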
```diff
@@ -6556,17 +6739,11 @@ class DataFrame():
         # Validate argument types
         _Validators._validate_function_arguments(awu_matrix)
 
-        # If
-        #
-            raffix = rsuffix
-            affix_type = "suffix"
-        else:
-            laffix = lprefix
-            raffix = rprefix
-            affix_type = "prefix"
+        # If self and other DataFrames are pointing to same Table object,
+        # raise error.
+        if self._metaexpr.t is other._metaexpr.t:
+            raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_ALIAS_REQUIRED, "join"),
+                                      MessageCodes.TDMLDF_ALIAS_REQUIRED)
 
         how_lc = how.lower()
 
@@ -6584,12 +6761,33 @@ class DataFrame():
         for col in other.columns:
             other_columns_lower_actual_map[col.lower()] = col
 
+        # Set the affix variables (laffix and raffix) with provided value(s)
+        # of lsuffix, rsuffix, lprefix and rprefix.
+        # Also set affix_type appropriately.
+        laffix = None
+        raffix = None
+        affix_type = None
+        if lsuffix is not None or rsuffix is not None:
+            laffix = lsuffix
+            raffix = rsuffix
+            affix_type = "suffix"
+        elif lprefix is not None or rprefix is not None:
+            laffix = lprefix
+            raffix = rprefix
+            affix_type = "prefix"
+
+        # Same column names can be present in two dataframes involved
+        # in join operation in below two cases:
+        # Case 1: Self join.
+        # Case 2: Two tables having common column names.
+        # In any case, at least one kind of affix is required to generate
+        # distinct column names in resultant table. Throw error if no affix
+        # is available.
+        if not set(self_columns_lower_actual_map.keys()).isdisjoint(other_columns_lower_actual_map.keys()):
+            if affix_type is None:
+                raise TeradataMlException(
+                    Messages.get_message(MessageCodes.TDMLDF_REQUIRED_TABLE_ALIAS),
+                    MessageCodes.TDMLDF_REQUIRED_TABLE_ALIAS)
 
         # Both affixes should not be equal to perform join.
         if laffix == raffix and laffix is not None:
@@ -6598,115 +6796,158 @@ class DataFrame():
                 "'l{affix_type}' and 'r{affix_type}'".format(affix_type=affix_type)),
                 MessageCodes.TDMLDF_INVALID_TABLE_ALIAS)
 
-            #
-            df2_column_with_affix = self.__check_and_return_new_column_name(raffix, column,
-                                                                            self_columns_lower_actual_map.keys(),
-                                                                            "left", affix_type)
-            select_columns.append("{0} as {1}".format(
-                self.__get_fully_qualified_col_name(column, "df2" if raffix is None else raffix),
-                df2_column_with_affix))
-
-            # As we are creating new column name, adding it to new metadata dict for new dataframe from join.
-            self.__add_column_type_item_to_dict(new_metaexpr_columns_types,
-                                                UtilFuncs._teradata_unquote_arg(df1_column_with_affix, "\""),
-                                                column, df1_columns_types)
-
-            self.__add_column_type_item_to_dict(new_metaexpr_columns_types,
-                                                UtilFuncs._teradata_unquote_arg(df2_column_with_affix, "\""),
-                                                other_column, df2_columns_types)
+        try:
+            # Set an attribute named '_join_alias' to underlying SQLAlchemy table objects
+            # and use it as default alias for compiling.
+            setattr(self._metaexpr.t, "_join_alias", "lhs")
+            setattr(other._metaexpr.t, "_join_alias", "rhs")
+            lhs_alias = "lhs"
+            rhs_alias = "rhs"
+
+            # Step 1: Generate the on clause string.
+            if how_lc != "cross":
+                on = UtilFuncs._as_list(on)
+
+                all_join_conditions = []
+                invalid_join_conditions = []
+                # Forming join condition
+                for condition in on:
+                    # Process only when the on condition is either a string or a ColumnExpression.
+                    if not isinstance(condition, (ColumnExpression, str)):
+                        invalid_join_conditions.append(condition)
+                        continue
+
+                    # Generate final on clause string from string representation of condition.
+                    if isinstance(condition, str):
+                        # Process the string manually.
+                        # 1. Parse the string to get operator.
+                        for op in TeradataConstants.TERADATA_JOIN_OPERATORS.value:
+                            if op in condition:
+                                conditional_separator = op
+                                break
+                        else:
+                            # If no join condition is mentioned, then string represents the column.
+                            # In this case, default operator is taken as equal.
+                            # If on is ['a'], then it is equal to 'lhs.a = rhs.a'
+                            columns = [condition, condition]
+                            condition = "{0} = {0}".format(condition)
+                            conditional_separator = "="
+                        # 2. Split the string using operator and extract LHS and RHS
+                        #    columns from a binary expression.
+                        columns = [column.strip() for column in condition.split(sep=conditional_separator)
+                                   if len(column) > 0]
+
+                        if len(columns) != 2:
+                            invalid_join_conditions.append(condition)
+                            # TODO: Raise exception here only.
+                        else:
+                            # 3. Generate fully qualified names using affix and table alias
+                            #    and create final on clause condition string.
+                            left_col = self.__add_alias_to_column(columns[0], self, lhs_alias)
+                            right_col = self.__add_alias_to_column(columns[1], other, rhs_alias)
+                            if conditional_separator == "!=":
+                                # "!=" is python way of expressing 'not equal to'. "<>" is Teradata way of
+                                # expressing 'not equal to'. Adding support for "!=".
+                                conditional_separator = "<>"
+                            all_join_conditions.append(
+                                '{0} {1} {2}'.format(left_col, conditional_separator, right_col))
+
+                    # Generate on clause string from column expression.
+                    if isinstance(condition, ColumnExpression):
+                        compiled_condition = condition.compile(compile_kwargs={'include_table': True,
+                                                                               'literal_binds': True,
+                                                                               'table_name_kind': '_join_alias',
+                                                                               'compile_with_caller_table': True})
+
+                        all_join_conditions.append(compiled_condition)
+
+                # Raise error if invalid on conditions are passed.
+                if len(invalid_join_conditions) > 0:
+                    raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_INVALID_JOIN_CONDITION,
+                                                                   ", ".join(invalid_join_conditions)),
+                                              MessageCodes.TDMLDF_INVALID_JOIN_CONDITION)
+
+                # Generate final on condition.
+                join_condition = " and ".join(all_join_conditions)
             else:
-                #
-                select_columns.append(UtilFuncs._teradata_quote_arg(column, "\"", False))
+                # In case of cross join no need of condition.
+                join_condition = ""
 
-                join_condition, "df1" if laffix is None else laffix,
-                "df2" if raffix is None else raffix)
+            # Step 2: Generate the select clause string.
+            # Generate new column names for overlapping column names using lsuffix, rsuffix, lprefix, rprefix.
+            # Also, use table alias while addressing overlapping column names.
+            lhs_columns_types = df_utils._get_required_columns_types_from_metaexpr(self._metaexpr)
+            rhs_columns_types = df_utils._get_required_columns_types_from_metaexpr(other._metaexpr)
+
+            select_columns = []
+            new_metaexpr_columns_types = OrderedDict()
+
+            # Processing columns in LHS DF/ self DF.
+            for column in self.columns:
+                if df_utils._check_column_exists(column.lower(), other_columns_lower_actual_map.keys()):
+                    # Check if column found in other DataFrame has same case or different.
+                    # Return the column name from the other DataFrame.
+                    other_column = other_columns_lower_actual_map[column.lower()]
+
+                    # Check if column name in LHS dataframe is same as that of in RHS dataframe.
+                    # If so, generate new name for LHS DF column using provided affix.
+                    df1_column_with_affix = self.__check_and_return_new_column_name(laffix, other_column,
+                                                                                    other_columns_lower_actual_map.keys(),
+                                                                                    "right", affix_type)
+
+                    # Generate select clause string for current column and append to list.
+                    select_columns.append("{0} as {1}".format(
+                        self.__get_fully_qualified_col_name(other_column, lhs_alias),
+                        df1_column_with_affix))
+
+                    # Check if column name in RHS dataframe is same as that of in LHS dataframe.
+                    # If so, generate new name for RHS DF column using provided affix.
+                    df2_column_with_affix = self.__check_and_return_new_column_name(raffix, column,
+                                                                                    self_columns_lower_actual_map.keys(),
+                                                                                    "left", affix_type)
+                    # Generate select clause string for current column and append to list.
+                    select_columns.append("{0} as {1}".format(
+                        self.__get_fully_qualified_col_name(column, rhs_alias),
+                        df2_column_with_affix))
+
+                    # As we are creating new column name, adding it to new metadata dict for new dataframe from join.
+                    self.__add_column_type_item_to_dict(new_metaexpr_columns_types,
+                                                        UtilFuncs._teradata_unquote_arg(df1_column_with_affix, "\""),
+                                                        column, lhs_columns_types)
+
+                    self.__add_column_type_item_to_dict(new_metaexpr_columns_types,
+                                                        UtilFuncs._teradata_unquote_arg(df2_column_with_affix, "\""),
+                                                        other_column, rhs_columns_types)
 
+                else:
+                    # As column with same name is not present in RHS DataFrame now,
+                    # directly adding column to new metadata dict.
+                    self.__add_column_type_item_to_dict(new_metaexpr_columns_types, column, column, lhs_columns_types)
+                    select_columns.append(UtilFuncs._teradata_quote_arg(column, "\"", False))
+
+            # Processing columns in RHS DF/ other DF.
+            # Here we will only be processing columns which are not overlapping.
+            for column in other.columns:
+                if not df_utils._check_column_exists(column.lower(), self_columns_lower_actual_map.keys()):
+                    # As column not present in left DataFrame, directly adding column to new metadata dict.
+                    self.__add_column_type_item_to_dict(new_metaexpr_columns_types, column, column, rhs_columns_types)
+                    select_columns.append(UtilFuncs._teradata_quote_arg(column, "\"", False))
+
+            # Step 3: Create a node in AED using _aed_join using appropriate alias for involved tables.
+            join_node_id = self._aed_utils._aed_join(self._nodeid, other._nodeid, ", ".join(select_columns),
+                                                     how_lc, join_condition, lhs_alias, rhs_alias)
+
+            # Step 4: Constructing new Metadata (_metaexpr) without DB; using dummy select_nodeid
+            # and underlying table name.
+            new_metaexpr = UtilFuncs._get_metaexpr_using_columns(join_node_id, new_metaexpr_columns_types.items())
+
+            # Return a new joined dataframe.
+            return self._create_dataframe_from_node(join_node_id, new_metaexpr, self._index_label)
+        finally:
+            # Delete the '_join_alias' attribute attached to underlying
+            # SQLALchemy table objects.
+            delattr(self._metaexpr.t, "_join_alias")
+            delattr(other._metaexpr.t, "_join_alias")
 
     def __add_alias_to_column(self, column, df, alias):
         """
```
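The rewrite above restructures `join()` into four explicit steps, and the key trick is the temporary `_join_alias` attribute pinned onto each side's SQLAlchemy Table so compiled conditions qualify columns as `lhs.<col>` / `rhs.<col>`. A condensed sketch of Step 1's ColumnExpression path, mirroring the compile call in the hunk (the attribute names and compile kwargs come straight from the diff; the surrounding setup is illustrative):

```python
# Pin a compile-time alias onto each side's SQLAlchemy Table, then compile
# the on-condition so columns render as lhs."col" / rhs."col".
setattr(lhs._metaexpr.t, "_join_alias", "lhs")
setattr(rhs._metaexpr.t, "_join_alias", "rhs")
try:
    condition = lhs.gpa > rhs.gpa
    on_sql = condition.compile(compile_kwargs={"include_table": True,
                                               "literal_binds": True,
                                               "table_name_kind": "_join_alias",
                                               "compile_with_caller_table": True})
    print(on_sql)  # e.g. lhs."gpa" > rhs."gpa"
finally:
    # The finally block in the hunk removes the attribute again so the
    # original DataFrames are left untouched.
    delattr(lhs._metaexpr.t, "_join_alias")
    delattr(rhs._metaexpr.t, "_join_alias")
```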
```diff
@@ -6766,7 +7007,7 @@ class DataFrame():
         return "{0}.{1}".format(UtilFuncs._teradata_quote_arg(alias, "\"", False),
                                 UtilFuncs._teradata_quote_arg(column, "\"", False))
 
-    def __check_and_return_new_column_name(self, affix, column, col_list,
+    def __check_and_return_new_column_name(self, affix, column, col_list, other_df_side, affix_type):
         """
         Check new column name alias with column exists in col_list or not, if exists throws exception else
         returns new column name.
@@ -6775,7 +7016,7 @@ class DataFrame():
            affix - affix to be added to column.
            column - column name.
            col_list - list of columns to check in which new column is exists or not.
-
+           other_df_side - Side on which the other dataframe in current join operation resides.
            affix_type - Type of affix. Either "prefix" or "suffix".
 
         EXAMPLES:
@@ -6789,19 +7030,19 @@ class DataFrame():
             return UtilFuncs._teradata_quote_arg(column, "\"", False)
 
         # If Prefix, affix is added before column name else it is appended.
-        if df_utils._check_column_exists(
-            if
+        column_with_affix = "{0}_{1}" if affix_type == "prefix" else "{1}_{0}"
+        column_with_affix = column_with_affix.format(affix,
+                                                     UtilFuncs._teradata_unquote_arg(column, "\""))
+        if df_utils._check_column_exists(column_with_affix.lower(), col_list):
+            if other_df_side == "right":
+                affix_type = "l{}".format(affix_type)
             else:
+                affix_type = "r{}".format(affix_type)
             raise TeradataMlException(
-                Messages.get_message(MessageCodes.TDMLDF_COLUMN_ALREADY_EXISTS,
+                Messages.get_message(MessageCodes.TDMLDF_COLUMN_ALREADY_EXISTS, column_with_affix, other_df_side,
+                                     affix_type),
                 MessageCodes.TDMLDF_COLUMN_ALREADY_EXISTS)
-        return UtilFuncs._teradata_quote_arg(
+        return UtilFuncs._teradata_quote_arg(column_with_affix, "\"", False)
 
     def __add_column_type_item_to_dict(self, new_metadata_dict, new_column, column, column_types):
         """
```
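The helper above encodes a simple naming rule for disambiguating overlapping columns. A sketch with illustrative values:

```python
# prefix -> "<affix>_<column>"; suffix -> "<column>_<affix>".
affix, column, affix_type = "t1", "col2", "prefix"
template = "{0}_{1}" if affix_type == "prefix" else "{1}_{0}"
new_name = template.format(affix, column)
print(new_name)  # t1_col2; an exception is raised if this collides
                 # with an existing column on the other side.
```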
```diff
@@ -7327,18 +7568,14 @@ class DataFrame():
 
         exec_mode = 'REMOTE' if UtilFuncs._is_lake() else 'IN-DB'
         if exec_mode == 'REMOTE':
-            if env_name in env_mapper:
-                env_mapper[env_name].append(colname)
-            else:
-                env_mapper[env_name] = [colname]
+            _Validators._check_auth_token("udf")
+            for colname, col in udf_expr.items():
+                env_name = UtilFuncs._get_env_name(col)
+                # Store the env_name and its corresponding output column
+                if env_name in env_mapper:
+                    env_mapper[env_name].append(colname)
+                else:
+                    env_mapper[env_name] = [colname]
         else:
             env_mapper[env_name] = udf_expr.keys()
 
@@ -7388,6 +7625,97 @@ class DataFrame():
 
         df = tbl_operators.execute()
         return df
+
+    def _assign_call_udf(self, call_udf_expr):
+        """
+        DESCRIPTION:
+            Internal function for DataFrame.assign() to execute the call_udf using
+            Script/Apply Table Operator and create new column for teradataml DataFrame.
+
+        PARAMETER:
+            call_udf_expr:
+                Required Argument.
+                Specifies a dictionary of column name to call_udf expressions.
+                Types: dict
+
+        RETURNS:
+            teradataml DataFrame
+
+        RAISES:
+            None.
+
+        EXAMPLES:
+            # call_udf_expr is a dictionary of column names to call_udf expressions.
+            call_udf_expr = {'upper_col': <teradataml.dataframe.sql._SQLColumnExpression object at 0x0000028E59C44310>,
+                             'sum_col': <teradataml.dataframe.sql._SQLColumnExpression object at 0x0000028E59C41690>}
+            self._assign_register(call_udf_expr)
+        """
+        df = self
+        # Create a dictionary of output columns to column type (teradata type).
+        returns = OrderedDict([(column.name, column.type) for column in df._metaexpr.c])
+        # Create a dictionary of output columns to column type (python types).
+        output_type_converters = {col_name: _Dtypes._teradata_type_to_python_type(col_type) \
+                                  for col_name, col_type in returns.items()}
+
+        for colname, col in call_udf_expr.items():
+            returns[colname] = col.type
+            output_type_converters[colname] = _Dtypes._teradata_type_to_python_type(col.type)
+            script_name = col._udf_script
+            delimiter = col._delimiter
+            quotechar = col._quotechar
+
+        # Create a dictionary of arguments to be passed to the script.
+        script_data = {}
+        script_data['input_cols'] = df.columns
+        script_data['output_cols'] = list(returns.keys())
+        script_data['output_type_converters'] = output_type_converters
+        script_data['function_args'] = {colname: col._udf_args}
+        script_data['delimiter'] = delimiter
+        script_data['qoutechar'] = quotechar
+
+        # Convert the dictionary to a string.
+        # The string is URL encoded to pass it as a parameter to the script.
+        script_data = urllib.parse.quote_plus(json.dumps(script_data))
+
+        if UtilFuncs._is_lake():
+            from teradataml.table_operators.Apply import Apply
+            apply_op_obj = Apply(data=df,
+                                 script_name=script_name,
+                                 env_name=col._env_name,
+                                 returns = returns,
+                                 delimiter = delimiter,
+                                 quotechar=quotechar,
+                                 files_local_path=GarbageCollector._get_temp_dir_name(),
+                                 apply_command="python3 {} {}".format(script_name, script_data)
+                                 )
+            try:
+                df = apply_op_obj.execute_script(
+                    output_style=OutputStyle.OUTPUT_TABLE.value)
+            except Exception:
+                raise
+        else:
+            import teradataml.context.context as context
+            database = context._get_current_databasename()
+
+            check_reserved_keyword = False if sorted(list(returns.keys())) == sorted(df.columns) else True
+
+            from teradataml.table_operators.Script import Script
+            table_op_obj = Script(data=df,
+                                  script_name=script_name,
+                                  files_local_path=GarbageCollector._get_temp_dir_name(),
+                                  script_command="{}/bin/python3 ./{}/{} {}".format(
+                                      configure.indb_install_location, database, script_name, script_data),
+                                  returns=returns,
+                                  quotechar=quotechar,
+                                  delimiter = delimiter
+                                  )
+            table_op_obj.check_reserved_keyword = check_reserved_keyword
+            try:
+                df = table_op_obj.execute_script(
+                    output_style=OutputStyle.OUTPUT_TABLE.value)
+            except Exception:
+                raise
+        return df
 
     @collect_queryband(queryband="DF_assign")
     def assign(self, drop_columns=False, **kwargs):
```
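`_assign_call_udf` above ships the column metadata to the in-database script as a single command-line token. A sketch of the round trip (the payload keys are copied from the hunk; the values and script name are illustrative):

```python
import json
import urllib.parse

script_data = {
    "input_cols": ["accounts", "Feb"],
    "output_cols": ["accounts", "Feb", "upper_col"],
    "delimiter": ",",
}
# Driver side: JSON-serialize, then URL-encode so the payload survives
# shell tokenization in the Script/Apply command line.
encoded = urllib.parse.quote_plus(json.dumps(script_data))
command = "python3 {} {}".format("to_upper.py", encoded)

# Script side: reverse the encoding to recover the dictionary.
decoded = json.loads(urllib.parse.unquote_plus(encoded))
assert decoded == script_data
```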
```diff
@@ -7420,7 +7748,7 @@ class DataFrame():
                 * SQLAlchemy ClauseElements.
                   (See teradataml extension with SQLAlchemy in teradataml User Guide
                   and Function reference guide for more details)
-                * Function - udf.
+                * Function - udf, call_udf.
 
 
         RETURNS:
@@ -7845,6 +8173,30 @@ class DataFrame():
            Orange Inc  210.0    NaN    NaN  250.0  17/01/04  ORANGE INC
            Red Inc     200.0  150.0  140.0    NaN  17/01/04     RED INC
            >>>
+
+            # Example 20: Register and Call the user defined function to get the values upper case.
+            >>> from teradataml.dataframe.functions import udf, register, call_udf
+            >>> @udf
+            ... def to_upper(s):
+            ...     if s is not None:
+            ...         return s.upper()
+            >>>
+            # Register the created user defined function with name "upper".
+            >>> register("upper", to_upper)
+            >>>
+            # Call the user defined function registered with name "upper" and assign the
+            # ColumnExpression returned to the DataFrame.
+            >>> res = df.assign(upper_col = call_udf("upper", ('accounts',)))
+            >>> res
+                          Feb    Jan    Mar    Apr  datetime   upper_col
+            accounts
+            Alpha Co    210.0  200.0  215.0  250.0  17/01/04    ALPHA CO
+            Blue Inc     90.0   50.0   95.0  101.0  17/01/04    BLUE INC
+            Yellow Inc   90.0    NaN    NaN    NaN  17/01/04  YELLOW INC
+            Jones LLC   200.0  150.0  140.0  180.0  17/01/04   JONES LLC
+            Orange Inc  210.0    NaN    NaN  250.0  17/01/04  ORANGE INC
+            Red Inc     200.0  150.0  140.0    NaN  17/01/04     RED INC
+            >>>
         """
         # Argument validations
         awu_matrix = []
```
```diff
@@ -7894,10 +8246,14 @@ class DataFrame():
         # column name to normal/regular expressions.
         udf_expr = {}
         regular_expr = {}
+        call_udf_expr = {}
         for colname, col in kwargs.items():
             # If value passed in kwargs is a ColumnExpression and is a udf, store it.
             if isinstance(col, ColumnExpression) and col._udf:
                 udf_expr[colname] = col
+            # If value passed in kwargs is a ColumnExpression and is a registerd udf script, store it.
+            elif isinstance(col, ColumnExpression) and col._udf_script:
+                call_udf_expr[colname] = col
             else:
                 regular_expr[colname] = col
         df = self
@@ -7917,6 +8273,9 @@ class DataFrame():
         if bool(udf_expr):
             df = df._assign_udf(udf_expr)
 
+        if bool(call_udf_expr):
+            df = df._assign_call_udf(call_udf_expr)
+
         return df
 
 
@@ -9553,6 +9912,12 @@ class DataFrame():
         # Validate argument types
         _Validators._validate_function_arguments(awu_matrix)
 
+        # If self and right DataFrames are pointing to same Table object,
+        # raise error.
+        if self._metaexpr.t is right._metaexpr.t:
+            raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_ALIAS_REQUIRED, "merge"),
+                                      MessageCodes.TDMLDF_ALIAS_REQUIRED)
+
         if (right_on is not None and left_on is None) or (right_on is None and left_on is not None):
             raise TeradataMlException(
                 Messages.get_message(MessageCodes.MUST_PASS_ARGUMENT, "left_on", "right_on"),
```
```diff
@@ -12331,6 +12696,9 @@ class DataFrame():
         _Validators._validate_column_exists_in_dataframe(column_names, self._metaexpr,
                                                          False)
         column_names = list(dict.fromkeys(column_names))
+
+        if list_td_reserved_keywords(column_names):
+            column_names = UtilFuncs._teradata_quote_arg(column_names, "\"", False)
 
         col_names_types = df_utils._get_required_columns_types_from_metaexpr(self._metaexpr, column_names)
         sel_nodeid = self._aed_utils._aed_select(self._nodeid, ','.join(column_names), True)
```