teradataml 20.0.0.2__py3-none-any.whl → 20.0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of teradataml might be problematic.

Files changed (88)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/README.md +196 -2
  3. teradataml/__init__.py +4 -0
  4. teradataml/_version.py +1 -1
  5. teradataml/analytics/analytic_function_executor.py +79 -4
  6. teradataml/analytics/json_parser/metadata.py +12 -3
  7. teradataml/analytics/json_parser/utils.py +7 -2
  8. teradataml/analytics/sqle/__init__.py +1 -0
  9. teradataml/analytics/table_operator/__init__.py +1 -1
  10. teradataml/analytics/uaf/__init__.py +1 -1
  11. teradataml/analytics/utils.py +4 -0
  12. teradataml/automl/data_preparation.py +3 -2
  13. teradataml/automl/feature_engineering.py +15 -7
  14. teradataml/automl/model_training.py +39 -33
  15. teradataml/common/__init__.py +2 -1
  16. teradataml/common/constants.py +35 -0
  17. teradataml/common/garbagecollector.py +2 -1
  18. teradataml/common/messagecodes.py +8 -2
  19. teradataml/common/messages.py +3 -1
  20. teradataml/common/sqlbundle.py +25 -3
  21. teradataml/common/utils.py +134 -9
  22. teradataml/context/context.py +20 -10
  23. teradataml/data/SQL_Fundamentals.pdf +0 -0
  24. teradataml/data/dataframe_example.json +18 -2
  25. teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +1 -1
  26. teradataml/data/docs/sqle/docs_17_20/Shap.py +7 -1
  27. teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +4 -4
  28. teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
  29. teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
  30. teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
  31. teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
  32. teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
  33. teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
  34. teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
  35. teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
  36. teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
  37. teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
  38. teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
  39. teradataml/data/jsons/uaf/17.20/TD_SAX.json +3 -1
  40. teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +15 -5
  41. teradataml/data/medical_readings.csv +101 -0
  42. teradataml/data/patient_profile.csv +101 -0
  43. teradataml/data/scripts/lightgbm/dataset.template +157 -0
  44. teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
  45. teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
  46. teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
  47. teradataml/data/scripts/sklearn/sklearn_fit.py +194 -167
  48. teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
  49. teradataml/data/scripts/sklearn/sklearn_function.template +14 -19
  50. teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
  51. teradataml/data/scripts/sklearn/sklearn_transform.py +129 -42
  52. teradataml/data/target_udt_data.csv +8 -0
  53. teradataml/data/templates/open_source_ml.json +3 -2
  54. teradataml/data/vectordistance_example.json +4 -0
  55. teradataml/dataframe/dataframe.py +543 -175
  56. teradataml/dataframe/functions.py +553 -25
  57. teradataml/dataframe/sql.py +184 -15
  58. teradataml/dbutils/dbutils.py +556 -18
  59. teradataml/dbutils/filemgr.py +48 -1
  60. teradataml/lib/aed_0_1.dll +0 -0
  61. teradataml/opensource/__init__.py +1 -1
  62. teradataml/opensource/{sklearn/_class.py → _class.py} +102 -17
  63. teradataml/opensource/_lightgbm.py +950 -0
  64. teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +1 -2
  65. teradataml/opensource/{sklearn/constants.py → constants.py} +13 -10
  66. teradataml/opensource/sklearn/__init__.py +0 -1
  67. teradataml/opensource/sklearn/_sklearn_wrapper.py +798 -438
  68. teradataml/options/__init__.py +7 -23
  69. teradataml/options/configure.py +29 -3
  70. teradataml/scriptmgmt/UserEnv.py +3 -3
  71. teradataml/scriptmgmt/lls_utils.py +74 -21
  72. teradataml/store/__init__.py +13 -0
  73. teradataml/store/feature_store/__init__.py +0 -0
  74. teradataml/store/feature_store/constants.py +291 -0
  75. teradataml/store/feature_store/feature_store.py +2223 -0
  76. teradataml/store/feature_store/models.py +1505 -0
  77. teradataml/store/vector_store/__init__.py +1586 -0
  78. teradataml/table_operators/query_generator.py +3 -0
  79. teradataml/table_operators/table_operator_query_generator.py +3 -1
  80. teradataml/table_operators/table_operator_util.py +37 -38
  81. teradataml/table_operators/templates/dataframe_register.template +69 -0
  82. teradataml/utils/dtypes.py +4 -2
  83. teradataml/utils/validators.py +33 -1
  84. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/METADATA +200 -5
  85. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/RECORD +88 -65
  86. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/WHEEL +0 -0
  87. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/top_level.txt +0 -0
  88. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/zip-safe +0 -0
teradataml/dataframe/dataframe.py

@@ -19,6 +19,7 @@ import pandas as pd
  import re
  import sqlalchemy
  import sys
+ import urllib.parse
  import teradataml.context.context as tdmlctx

  from collections import OrderedDict, namedtuple
@@ -44,6 +45,7 @@ from teradataml.options.display import display
  from teradataml.dataframe.copy_to import copy_to_sql
  from teradataml.dataframe.row import _Row
  from teradataml.dataframe.setop import concat
+ from teradataml.dbutils.dbutils import list_td_reserved_keywords
  from teradataml.plot.plot import _Plot
  from teradataml.scriptmgmt.UserEnv import UserEnv
  from teradataml.utils.dtypes import _Dtypes, _ListOf, _TupleOf
@@ -57,6 +59,7 @@ from teradataml.common.bulk_exposed_utils import _validate_unimplemented_functio
  from teradataml.telemetry_utils.queryband import collect_queryband
  from teradataml.options.configure import configure
  from teradataml.utils.internal_buffer import _InternalBuffer
+ from teradataml.common.constants import OutputStyle

  # TODO use logger when available on master branch
  # logger = teradatapylog.getLogger()
@@ -229,7 +232,7 @@ class DataFrame():

  self._nodeid = self._aed_utils._aed_query(self._query, temp_table_name)
  else:
- if inspect.stack()[1][3] not in ['_from_node', '__init__']:
+ if inspect.stack()[1][3] not in ['_from_node', '__init__', 'alias']:
  raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_CREATE_FAIL),
  MessageCodes.TDMLDF_CREATE_FAIL)

@@ -241,6 +244,7 @@ class DataFrame():
  self._iloc = _LocationIndexer(self, integer_indexing=True)
  self.__data = None
  self.__data_columns = None
+ self._alias = None

  except TeradataMlException:
  raise
@@ -250,6 +254,100 @@
  raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_CREATE_FAIL),
  MessageCodes.TDMLDF_CREATE_FAIL) from err

+ @property
+ def db_object_name(self):
+ """
+ DESCRIPTION:
+ Get the underlying database object name, on which DataFrame is
+ created.
+
+ RETURNS:
+ str representing object name of DataFrame
+
+ EXAMPLES:
+ >>> load_example_data("dataframe", "sales")
+ >>> df = DataFrame('sales')
+ >>> df.db_object_name
+ '"sales"'
+ """
+ if self._table_name is not None:
+ return self._table_name
+ else:
+ msg = "Object name is available once DataFrame is materialized. " \
+ "Use DataFrame.materialize() to materialize DataFrame."
+ print(msg)
+
+ def alias(self, alias_name):
+ """
+ DESCRIPTION:
+ Method to create an aliased teradataml DataFrame.
+ Note:
+ * This method is recommended to be used before performing
+ self join using DataFrame's join() API.
+
+ PARAMETERS:
+ alias_name:
+ Required Argument.
+ Specifies the alias name to be assigned to a teradataml DataFrame.
+ Types: str
+
+ RETURNS:
+ teradataml DataFrame
+
+ EXAMPLES:
+ >>> load_example_data("dataframe", "admissions_train")
+ >>> df = DataFrame("admissions_train")
+ >>> df
+ masters gpa stats programming admitted
+ id
+ 13 no 4.00 Advanced Novice 1
+ 26 yes 3.57 Advanced Advanced 1
+ 5 no 3.44 Novice Novice 0
+ 19 yes 1.98 Advanced Advanced 0
+ 15 yes 4.00 Advanced Advanced 1
+ 40 yes 3.95 Novice Beginner 0
+ 7 yes 2.33 Novice Novice 1
+ 22 yes 3.46 Novice Beginner 0
+ 36 no 3.00 Advanced Novice 0
+ 38 yes 2.65 Advanced Beginner 1
+
+ # Example 1: Create an alias of teradataml DataFrame.
+
+ >>> df2 = df.alias("adm_trn")
+
+ # Print aliased DataFrame.
+ >>> df2
+ masters gpa stats programming admitted
+ id
+ 13 no 4.00 Advanced Novice 1
+ 26 yes 3.57 Advanced Advanced 1
+ 5 no 3.44 Novice Novice 0
+ 19 yes 1.98 Advanced Advanced 0
+ 15 yes 4.00 Advanced Advanced 1
+ 40 yes 3.95 Novice Beginner 0
+ 7 yes 2.33 Novice Novice 1
+ 22 yes 3.46 Novice Beginner 0
+ 36 no 3.00 Advanced Novice 0
+ 38 yes 2.65 Advanced Beginner 1
+ """
+ arg_info_matrix = [["alias_name", alias_name, False, (str), True]]
+ _Validators._validate_function_arguments(arg_info_matrix)
+ try:
+ alias_df = self._from_node(self._nodeid, self._metaexpr, self._index_label,
+ reuse_metaexpr=False)
+ # Assigning self attributes to newly created alias dataframe.
+ alias_df._table_name = self._table_name
+ alias_df._index = self._index
+ alias_df._index_label = self._index_label
+ setattr(alias_df._metaexpr.t, "table_alias", alias_name)
+ alias_df._alias = alias_name
+ return alias_df
+ except Exception as err:
+ error_code = MessageCodes.EXECUTION_FAILED
+ error_msg = Messages.get_message(
+ error_code, "create alias dataFrame", '{}'.format(str(err)))
+ raise TeradataMlException(error_msg, error_code)
+
  @classmethod
  @collect_queryband(queryband="DF_fromTable")
  def from_table(cls, table_name, index=True, index_label=None):
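Taken together, db_object_name and alias() give DataFrame a cleaner identity story: the former exposes the materialized table name, the latter produces an independently compilable copy of the same table, which the reworked join()/merge() now require for self joins (see below). A minimal sketch of the intended flow, assuming a connected session and the bundled admissions_train example data (mirrors Example 10 of the join() docstring):

    >>> from teradataml import DataFrame, load_example_data
    >>> load_example_data("dataframe", "admissions_train")
    >>> lhs = DataFrame("admissions_train").head(3).sort("id")
    >>> rhs = lhs.alias("rhs")   # fresh _MetaExpression over the same table
    >>> lhs.join(other=rhs, how="cross", lprefix="l", rprefix="r")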
@@ -364,7 +462,7 @@ class DataFrame():
  return cls(index=index, index_label=index_label, query=query, materialize=materialize)

  @classmethod
- def _from_node(cls, nodeid, metaexpr, index_label=None, undropped_index=None):
+ def _from_node(cls, nodeid, metaexpr, index_label=None, undropped_index=None, reuse_metaexpr=True):
  """
  Private class method for creating a DataFrame from a nodeid and parent metadata.

@@ -385,6 +483,12 @@
  Optional Argument.
  List specifying index column(s) to be retained as columns for printing.

+ reuse_metaexpr:
+ Optional Argument.
+ Specifies the flag to decide whether to use same _MetaExpression object or not.
+ Default Value: True
+ Types: bool
+
  EXAMPLES:
  from teradataml.dataframe.dataframe import DataFrame
  df = DataFrame._from_node(1234, metaexpr)
@@ -400,28 +504,43 @@
  df = cls()
  df._nodeid = nodeid
  df._source_type = SourceType.TABLE.value
- df._get_metadata_from_metaexpr(metaexpr)
+
+ if not reuse_metaexpr:
+ # Create new _MetaExpression object using reference metaExpression
+ # for newly created DataFrame.
+ df._metaexpr = UtilFuncs._get_metaexpr_using_parent_metaexpr(nodeid, metaexpr)
+ # When metaexpression is created using only column information from parent DataFrame,
+ # underlying SQLAlchemy table is created with '' string as Table name.
+ # Assign name from reference mataexpression here.
+ df._metaexpr.t.name = metaexpr.t.name
+ # Populate corresponding information into newly created DataFrame object
+ # using newly created metaExpression.
+ df._get_metadata_from_metaexpr(df._metaexpr)
+ else:
+ # Populate corresponding information into newly created DataFrame object
+ # using reference metaExpression.
+ df._get_metadata_from_metaexpr(metaexpr)

  if isinstance(index_label, str):
  index_label = [index_label]

- if index_label is not None and all(elem in [col.name for col in metaexpr.c] for elem in index_label):
+ if index_label is not None and all(elem in [col.name for col in df._metaexpr.c] for elem in index_label):
  df._index_label = index_label
  elif index_label is not None and all(UtilFuncs._teradata_quote_arg(elem, "\"", False)
- in [col.name for col in metaexpr.c] for elem in index_label):
+ in [col.name for col in df._metaexpr.c] for elem in index_label):
  df._index_label = index_label

  # Set the flag suggesting that the _index_label is set,
- # and that a database lookup wont be required even when it is None.
+ # and that a database lookup won't be required even when it is None.
  df._index_query_required = False

  if isinstance(undropped_index, str):
  undropped_index = [undropped_index]

- if undropped_index is not None and all(elem in [col.name for col in metaexpr.c] for elem in undropped_index):
+ if undropped_index is not None and all(elem in [col.name for col in df._metaexpr.c] for elem in undropped_index):
  df._undropped_index = undropped_index
  elif undropped_index is not None and all(UtilFuncs._teradata_quote_arg(elem, "\"", False)
- in [col.name for col in metaexpr.c] for elem in undropped_index):
+ in [col.name for col in df._metaexpr.c] for elem in undropped_index):
  df._undropped_index = undropped_index

  return df
@@ -789,7 +908,10 @@
  Private method for setting _metaexpr and retrieving column names and types.

  PARAMETERS:
- metaexpr - Parent meta data (_MetaExpression object).
+ metaexpr:
+ Required Argument.
+ Specifies parent meta data (_MetaExpression object).
+ Types: _MetaExpression

  RETURNS:
  None
@@ -802,7 +924,8 @@
  self._column_names_and_types = []
  self._td_column_names_and_types = []
  self._td_column_names_and_sqlalchemy_types = {}
- for col in metaexpr.c:
+
+ for col in self._metaexpr.c:
  if isinstance(col.type, sqlalchemy.sql.sqltypes.NullType):
  tdtype = TeradataTypes.TD_NULL_TYPE.value
  else:
@@ -2066,7 +2189,7 @@
  else:
  col_filters = col_names

- col_filters_decode = ["decode(\"{}\", null, 0, 1)".format(col_name) for col_name in col_filters]
+ col_filters_decode = ["CASE WHEN \"{}\" IS NULL THEN 0 ELSE 1 END".format(col_name) for col_name in col_filters]
  fmt_filter = " + ".join(col_filters_decode)

  if thresh is not None:
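This swaps the Teradata-specific DECODE shorthand for an ANSI-standard CASE expression in the non-NULL counting filter (the thresh logic); both forms yield 1 for a non-NULL column value and 0 otherwise, so the summed threshold comparison is unchanged. A sketch of the string being built, with illustrative column names:

    >>> col_filters = ["Jan", "Feb"]
    >>> parts = ["CASE WHEN \"{}\" IS NULL THEN 0 ELSE 1 END".format(c) for c in col_filters]
    >>> " + ".join(parts)
    'CASE WHEN "Jan" IS NULL THEN 0 ELSE 1 END + CASE WHEN "Feb" IS NULL THEN 0 ELSE 1 END'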
@@ -5555,7 +5678,7 @@
  try:
  # Printing the DF will actually run underlying select query and
  # will brought up numeric overflow if any. Only materializing won't work.
- print(result_df)
+ repr(result_df)
  return False
  except TeradataMlException as tme:
  if "Numeric overflow occurred during computation" in str(tme):
@@ -6019,6 +6142,8 @@
  * "open_sessions" specifies the number of Teradata data transfer
  sessions to be opened for fastexport. This argument is only applicable
  in fastexport mode.
+ * Function returns the pandas dataframe with Decimal columns types as float instead of object.
+ If user want datatype to be object, set argument "coerce_float" to False.

  Notes:
  1. For additional information about "coerce_float" and
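In other words, the documented default now maps DECIMAL columns to float on export, and coerce_float=False keeps them as Python Decimal objects (pandas dtype object). A sketch, assuming a connected session:

    >>> df = DataFrame("sales")
    >>> df.to_pandas().dtypes                    # DECIMAL columns arrive as float
    >>> df.to_pandas(coerce_float=False).dtypes  # DECIMAL columns stay object/Decimal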
@@ -6334,15 +6459,22 @@
  Supported join operators are =, ==, <, <=, >, >=, <> and != (= and <> operators are
  not supported when using DataFrame columns as operands).

- Note:
- 1. When multiple join conditions are given, they are joined using AND boolean
- operator. Other boolean operators are not supported.
- 2. Nesting of join on conditions in column expressions using & and | is not
- supported. The example for unsupported nested join on conditions is:
- on = [(df1.a == df1.b) & (df1.c == df1.d)]
+ Notes:
+ 1. When multiple join conditions are given as a list string/ColumnExpression,
+ they are joined using AND operator.
+ 2. Two or more on conditions can be combined using & and | operators
+ and can be passed as single ColumnExpression.
+ You can use (df1.a == df1.b) & (df1.c == df1.d) in place of
+ [df1.a == df1.b, df1.c == df1.d].
+ 3. Two or more on conditions can not be combined using pythonic 'and'
+ and 'or'.
+ You can use (df1.a == df1.b) & (df1.c == df1.d) in place of
+ [df1.a == df1.b and df1.c == df1.d].
+ 4. Performing self join using same DataFrame object in 'other'
+ argument is not supported. In order to perform self join,
+ first create aliased DataFrame using alias() API and pass it
+ for 'other' argument. Refer to Example 10 in EXAMPLES section.

- You can use [df1.a == df1.b, df1.c == df1.d] in place of
- [(df1.a == df1.b) & (df1.c == df1.d)].

  PARAMETERS:

@@ -6370,15 +6502,20 @@
  is the column of left dataframe df1 and col2 is the column of right
  dataframe df2.
  Examples:
- 1. [df1.a == df2.a, df1.b == df2.b] indicates df1.a = df2.a and df1.b = df2.b.
- 2. [df1.a == df2.b, df1.c == df2.d] indicates df1.a = df2.b and df1.c = df2.d.
- 3. [df1.a <= df2.b and df1.c > df2.d] indicates df1.a <= df2.b and df1.c > df2.d.
- 4. [df1.a < df2.b and df1.c >= df2.d] indicates df1.a < df2.b and df1.c >= df2.d.
+ 1. [df1.a == df2.a, df1.b == df2.b] indicates df1.a = df2.a AND df1.b = df2.b.
+ 2. [df1.a == df2.b, df1.c == df2.d] indicates df1.a = df2.b AND df1.c = df2.d.
+ 3. [df1.a <= df2.b & df1.c > df2.d] indicates df1.a <= df2.b AND df1.c > df2.d.
+ 4. [df1.a < df2.b | df1.c >= df2.d] indicates df1.a < df2.b OR df1.c >= df2.d.
  5. df1.a != df2.b indicates df1.a != df2.b.
  • The combination of both string comparisons and comparisons as column expressions.
  Examples:
- 1. ["a", df1.b == df2.b] indicates df1.a = df2.a and df1.b = df2.b.
- 2. [df1.a <= df2.b, "c > d"] indicates df1.a <= df2.b and df1.c > df2.d.
+ 1. ["a", df1.b == df2.b] indicates df1.a = df2.a AND df1.b = df2.b.
+ 2. [df1.a <= df2.b, "c > d"] indicates df1.a <= df2.b AND df1.c > df2.d.
+ • ColumnExpressions containing FunctionExpressions which represent SQL functions
+ invoked on DataFrame Columns.
+ Examples:
+ 1. (df1.a.round(1) - df2.a.round(1)).mod(2.5) > 2
+ 2. df1.a.floor() - df2.b.floor() > 2

  Types: str (or) ColumnExpression (or) List of strings(str) or ColumnExpressions

@@ -6400,7 +6537,7 @@
  Specifies the suffix to be added to the right table columns.
  Default Value: None.
  Types: str
-
+
  lprefix:
  Optional Argument.
  Specifies the prefix to be added to the left table columns.
@@ -6450,7 +6587,7 @@
  0 2 2 analytics 2.3 2.3 b analytics b
  1 1 1 teradata 1.3 1.3 a teradata a

- # Example 2: One "on" argument condition is ColumnExpression and other is string having two
+ # Example 2: One "on" argument condition is ColumnExpression and other is string having two
  # columns with left outer join.
  >>> df1.join(df2, on = [df1.col2 == df2.col4,"col5 = col7"], how = "left", lprefix = "t1", rprefix = "t2")
  t1_col1 t2_col1 col2 t1_col3 t2_col3 col5 col4 col7
@@ -6464,7 +6601,7 @@
  0 2 2 analytics 2.3 2.3 b analytics b
  1 1 1 teradata 1.3 1.3 a teradata a

- # Example 4: One "on" argument condition is ColumnExpression and other is string having two
+ # Example 4: One "on" argument condition is ColumnExpression and other is string having two
  # columns with full join.
  >>> df1.join(other = df2, on = ["col2=col4",df1.col5 == df2.col7], how = "full", lprefix = "t1", rprefix = "t2")
  t1_col1 t2_col1 col2 t1_col3 t2_col3 col5 col4 col7
@@ -6542,7 +6679,53 @@
  3 Beginner Beginner 1 3.95 Beginner 3.70 Novice 0 1 no yes
  3 Beginner Beginner 2 3.76 Beginner 3.70 Novice 0 1 no yes
  3 Beginner Novice 3 3.70 Beginner 3.70 Novice 1 1 no no
+
+ # Example 10: Perform self join using aliased DataFrame.
+ # Create an aliased DataFrame.
+ >>> lhs = DataFrame("admissions_train").head(3).sort("id")
+ >>> rhs = lhs.alias("rhs")
+ # Use aliased DataFrame for self join.
+ >>> joined_df = lhs.join(other=rhs, how="cross", lprefix="l", rprefix="r")
+ >>> joined_df
+ l_id r_id l_masters r_masters l_gpa r_gpa l_stats r_stats l_programming r_programming l_admitted r_admitted
+ 0 1 3 yes no 3.95 3.70 Beginner Novice Beginner Beginner 0 1
+ 1 2 2 yes yes 3.76 3.76 Beginner Beginner Beginner Beginner 0 0
+ 2 2 3 yes no 3.76 3.70 Beginner Novice Beginner Beginner 0 1
+ 3 3 1 no yes 3.70 3.95 Novice Beginner Beginner Beginner 1 0
+ 4 3 3 no no 3.70 3.70 Novice Novice Beginner Beginner 1 1
+ 5 3 2 no yes 3.70 3.76 Novice Beginner Beginner Beginner 1 0
+ 6 2 1 yes yes 3.76 3.95 Beginner Beginner Beginner Beginner 0 0
+ 7 1 2 yes yes 3.95 3.76 Beginner Beginner Beginner Beginner 0 0
+ 8 1 1 yes yes 3.95 3.95 Beginner Beginner Beginner Beginner 0 0
+
+ # Example 11: Perform join with compound 'on' condition having
+ # more than one binary operator.
+ >>> rhs_2 = lhs.assign(double_gpa=lhs.gpa * 2)
+ >>> joined_df_2 = lhs.join(rhs_2, on=rhs_2.double_gpa == lhs.gpa * 2, how="left", lprefix="l", rprefix="r")
+ >>> joined_df_2
+ l_id r_id l_masters r_masters l_gpa r_gpa l_stats r_stats l_programming r_programming l_admitted r_admitted double_gpa
+ 0 3 3 no no 3.70 3.70 Novice Novice Beginner Beginner 1 1 7.40
+ 1 2 2 yes yes 3.76 3.76 Beginner Beginner Beginner Beginner 0 0 7.52
+ 2 1 1 yes yes 3.95 3.95 Beginner Beginner Beginner Beginner 0 0 7.90
+
+ # Example 12: Perform join on DataFrames with 'on' condition
+ # having FunctionExpression.
+ >>> df = DataFrame("admissions_train")
+ >>> df2 = df.alias("rhs_df")
+ >>> joined_df_3 = df.join(df2, on=(df.gpa.round(1) - df2.gpa.round(1)).mod(2.5) > 2,
+ >>> how="inner", lprefix="l")
+ >>> joined_df_3.sort(["id", "l_id"])
+ l_id id l_masters masters l_gpa gpa l_stats stats l_programming programming l_admitted admitted
+ 0 1 24 yes no 3.95 1.87 Beginner Advanced Beginner Novice 0 1
+ 1 13 24 no no 4.0 1.87 Advanced Advanced Novice Novice 1 1
+ 2 15 24 yes no 4.0 1.87 Advanced Advanced Advanced Novice 1 1
+ 3 25 24 no no 3.96 1.87 Advanced Advanced Advanced Novice 1 1
+ 4 27 24 yes no 3.96 1.87 Advanced Advanced Advanced Novice 0 1
+ 5 29 24 yes no 4.0 1.87 Novice Advanced Beginner Novice 0 1
+ 6 40 24 yes no 3.95 1.87 Novice Advanced Beginner Novice 0 1
+
  """
+
  # Argument validations
  awu_matrix = []
  awu_matrix.append(["other", other, False, (DataFrame)])
@@ -6556,17 +6739,11 @@
  # Validate argument types
  _Validators._validate_function_arguments(awu_matrix)

- # If user has not provided suffix argument(s), then prefix argument(s) value(s) are passed by
- # user hence we will set the affix variables (laffix and raffix) with provided value(s).
- # affix_type is also set appropriately.
- if lsuffix is not None or rsuffix is not None:
- laffix = lsuffix
- raffix = rsuffix
- affix_type = "suffix"
- else:
- laffix = lprefix
- raffix = rprefix
- affix_type = "prefix"
+ # If self and other DataFrames are pointing to same Table object,
+ # raise error.
+ if self._metaexpr.t is other._metaexpr.t:
+ raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_ALIAS_REQUIRED, "join"),
+ MessageCodes.TDMLDF_ALIAS_REQUIRED)

  how_lc = how.lower()

@@ -6584,12 +6761,33 @@
  for col in other.columns:
  other_columns_lower_actual_map[col.lower()] = col

- for column in self_columns_lower_actual_map.keys():
- if column in other_columns_lower_actual_map.keys():
- if laffix is None and raffix is None:
- raise TeradataMlException(
- Messages.get_message(MessageCodes.TDMLDF_REQUIRED_TABLE_ALIAS),
- MessageCodes.TDMLDF_REQUIRED_TABLE_ALIAS)
+ # Set the affix variables (laffix and raffix) with provided value(s)
+ # of lsuffix, rsuffix, lprefix and rprefix.
+ # Also set affix_type appropriately.
+ laffix = None
+ raffix = None
+ affix_type = None
+ if lsuffix is not None or rsuffix is not None:
+ laffix = lsuffix
+ raffix = rsuffix
+ affix_type = "suffix"
+ elif lprefix is not None or rprefix is not None:
+ laffix = lprefix
+ raffix = rprefix
+ affix_type = "prefix"
+
+ # Same column names can be present in two dataframes involved
+ # in join operation in below two cases:
+ # Case 1: Self join.
+ # Case 2: Two tables having common column names.
+ # In any case, at least one kind of affix is required to generate
+ # distinct column names in resultant table. Throw error if no affix
+ # is available.
+ if not set(self_columns_lower_actual_map.keys()).isdisjoint(other_columns_lower_actual_map.keys()):
+ if affix_type is None:
+ raise TeradataMlException(
+ Messages.get_message(MessageCodes.TDMLDF_REQUIRED_TABLE_ALIAS),
+ MessageCodes.TDMLDF_REQUIRED_TABLE_ALIAS)

  # Both affixes should not be equal to perform join.
  if laffix == raffix and laffix is not None:
@@ -6598,115 +6796,158 @@
  "'l{affix_type}' and 'r{affix_type}'".format(affix_type=affix_type)),
  MessageCodes.TDMLDF_INVALID_TABLE_ALIAS)

- if how_lc != "cross":
- if isinstance(on, str) or isinstance(on, ColumnExpression):
- on = [on]
-
- all_join_conditions = []
- invalid_join_conditions = []
- # Forming join condition
- for condition in on:
- ori_condition = condition
-
- if not isinstance(condition, (ColumnExpression, str)):
- invalid_join_conditions.append(condition)
- continue
-
- # Process only when the on condition is string or a ColumnExpression
- if isinstance(condition, ColumnExpression):
- columns = condition.original_column_expr
- condition = condition.compile()
-
- for op in TeradataConstants.TERADATA_JOIN_OPERATORS.value:
- if op in condition:
- conditional_separator = op
- break
- else:
- # If no join condition is mentioned, default is taken as equal.
- # If on is ['a'], then it is equal to 'df1.a = df2.a'
- columns = [condition, condition]
- condition = "{0} = {0}".format(condition)
- conditional_separator = "="
-
- if isinstance(ori_condition, str):
- columns = [column.strip() for column in condition.split(sep=conditional_separator)
- if len(column) > 0]
-
- if len(columns) != 2:
- invalid_join_conditions.append(condition)
- else:
- left_col = self.__add_alias_to_column(columns[0], self, laffix if laffix is not None else "df1")
- right_col = self.__add_alias_to_column(columns[1], other, raffix if raffix is not None else "df2")
- if conditional_separator == "!=":
- # "!=" is python way of expressing 'not equal to'. "<>" is Teradata way of
- # expressing 'not equal to'. Adding support for "!=".
- conditional_separator = "<>"
- all_join_conditions.append('{0} {1} {2}'.format(left_col, conditional_separator, right_col))
-
- if len(invalid_join_conditions) > 0:
- raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_INVALID_JOIN_CONDITION,
- ", ".join(invalid_join_conditions)),
- MessageCodes.TDMLDF_INVALID_JOIN_CONDITION)
-
- join_condition = " and ".join(all_join_conditions)
- else:
- join_condition = ""
-
- df1_columns_types = df_utils._get_required_columns_types_from_metaexpr(self._metaexpr)
- df2_columns_types = df_utils._get_required_columns_types_from_metaexpr(other._metaexpr)
-
- select_columns = []
- new_metaexpr_columns_types = OrderedDict()
-
- for column in self.columns:
- if df_utils._check_column_exists(column.lower(), other_columns_lower_actual_map.keys()):
- # Check if column found in other DataFrame has same case or different.
- # Return the column name from the other DataFrame.
- other_column = other_columns_lower_actual_map[column.lower()]
-
- df1_column_with_affix = self.__check_and_return_new_column_name(laffix, other_column,
- other_columns_lower_actual_map.keys(),
- "right", affix_type)
- select_columns.append("{0} as {1}".format(
- self.__get_fully_qualified_col_name(other_column, "df1" if laffix is None else laffix),
- df1_column_with_affix))
-
- df2_column_with_affix = self.__check_and_return_new_column_name(raffix, column,
- self_columns_lower_actual_map.keys(),
- "left", affix_type)
- select_columns.append("{0} as {1}".format(
- self.__get_fully_qualified_col_name(column, "df2" if raffix is None else raffix),
- df2_column_with_affix))
-
- # As we are creating new column name, adding it to new metadata dict for new dataframe from join.
- self.__add_column_type_item_to_dict(new_metaexpr_columns_types,
- UtilFuncs._teradata_unquote_arg(df1_column_with_affix, "\""),
- column, df1_columns_types)
-
- self.__add_column_type_item_to_dict(new_metaexpr_columns_types,
- UtilFuncs._teradata_unquote_arg(df2_column_with_affix, "\""),
- other_column, df2_columns_types)
-
+ try:
+ # Set an attribute named '_join_alias' to underlying SQLAlchemy table objects
+ # and use it as default alias for compiling.
+ setattr(self._metaexpr.t, "_join_alias", "lhs")
+ setattr(other._metaexpr.t, "_join_alias", "rhs")
+ lhs_alias = "lhs"
+ rhs_alias = "rhs"
+
+ # Step 1: Generate the on clause string.
+ if how_lc != "cross":
+ on = UtilFuncs._as_list(on)
+
+ all_join_conditions = []
+ invalid_join_conditions = []
+ # Forming join condition
+ for condition in on:
+ # Process only when the on condition is either a string or a ColumnExpression.
+ if not isinstance(condition, (ColumnExpression, str)):
+ invalid_join_conditions.append(condition)
+ continue
+
+ # Generate final on clause string from string representation of condition.
+ if isinstance(condition, str):
+ # Process the string manually.
+ # 1. Parse the string to get operator.
+ for op in TeradataConstants.TERADATA_JOIN_OPERATORS.value:
+ if op in condition:
+ conditional_separator = op
+ break
+ else:
+ # If no join condition is mentioned, then string represents the column.
+ # In this case, default operator is taken as equal.
+ # If on is ['a'], then it is equal to 'lhs.a = rhs.a'
+ columns = [condition, condition]
+ condition = "{0} = {0}".format(condition)
+ conditional_separator = "="
+ # 2. Split the string using operator and extract LHS and RHS
+ # columns from a binary expression.
+ columns = [column.strip() for column in condition.split(sep=conditional_separator)
+ if len(column) > 0]
+
+ if len(columns) != 2:
+ invalid_join_conditions.append(condition)
+ # TODO: Raise exception here only.
+ else:
+ # 3. Generate fully qualified names using affix and table alias
+ # and create final on clause condition string.
+ left_col = self.__add_alias_to_column(columns[0], self, lhs_alias)
+ right_col = self.__add_alias_to_column(columns[1], other, rhs_alias)
+ if conditional_separator == "!=":
+ # "!=" is python way of expressing 'not equal to'. "<>" is Teradata way of
+ # expressing 'not equal to'. Adding support for "!=".
+ conditional_separator = "<>"
+ all_join_conditions.append(
+ '{0} {1} {2}'.format(left_col, conditional_separator, right_col))
+
+ # Generate on clause string from column expression.
+ if isinstance(condition, ColumnExpression):
+ compiled_condition = condition.compile(compile_kwargs={'include_table': True,
+ 'literal_binds': True,
+ 'table_name_kind': '_join_alias',
+ 'compile_with_caller_table': True})
+
+ all_join_conditions.append(compiled_condition)
+
+ # Raise error if invalid on conditions are passed.
+ if len(invalid_join_conditions) > 0:
+ raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_INVALID_JOIN_CONDITION,
+ ", ".join(invalid_join_conditions)),
+ MessageCodes.TDMLDF_INVALID_JOIN_CONDITION)
+
+ # Generate final on condition.
+ join_condition = " and ".join(all_join_conditions)
  else:
- # As column not present in right DataFrame, directly adding column to new metadata dict.
- self.__add_column_type_item_to_dict(new_metaexpr_columns_types, column, column, df1_columns_types)
- select_columns.append(UtilFuncs._teradata_quote_arg(column, "\"", False))
+ # In case of cross join no need of condition.
+ join_condition = ""

- for column in other.columns:
- if not df_utils._check_column_exists(column.lower(), self_columns_lower_actual_map.keys()):
- # As column not present in left DataFrame, directly adding column to new metadata dict.
- self.__add_column_type_item_to_dict(new_metaexpr_columns_types, column, column, df2_columns_types)
- select_columns.append(UtilFuncs._teradata_quote_arg(column, "\"", False))
+ # Step 2: Generate the select clause string.
+ # Generate new column names for overlapping column names using lsuffix, rsuffix, lprefix, rprefix.
+ # Also, use table alias while addressing overlapping column names.
+ lhs_columns_types = df_utils._get_required_columns_types_from_metaexpr(self._metaexpr)
+ rhs_columns_types = df_utils._get_required_columns_types_from_metaexpr(other._metaexpr)

- # Create a node in AED using _aed_join
- join_node_id = self._aed_utils._aed_join(self._nodeid, other._nodeid, ", ".join(select_columns), how_lc,
- join_condition, "df1" if laffix is None else laffix,
- "df2" if raffix is None else raffix)
+ select_columns = []
+ new_metaexpr_columns_types = OrderedDict()

- # Constructing new Metadata (_metaexpr) without DB; using dummy select_nodeid and underlying table name.
- new_metaexpr = UtilFuncs._get_metaexpr_using_columns(join_node_id, new_metaexpr_columns_types.items())
+ # Processing columns in LHS DF/ self DF.
+ for column in self.columns:
+ if df_utils._check_column_exists(column.lower(), other_columns_lower_actual_map.keys()):
+ # Check if column found in other DataFrame has same case or different.
+ # Return the column name from the other DataFrame.
+ other_column = other_columns_lower_actual_map[column.lower()]
+
+ # Check if column name in LHS dataframe is same as that of in RHS dataframe.
+ # If so, generate new name for LHS DF column using provided affix.
+ df1_column_with_affix = self.__check_and_return_new_column_name(laffix, other_column,
+ other_columns_lower_actual_map.keys(),
+ "right", affix_type)
+
+ # Generate select clause string for current column and append to list.
+ select_columns.append("{0} as {1}".format(
+ self.__get_fully_qualified_col_name(other_column, lhs_alias),
+ df1_column_with_affix))
+
+ # Check if column name in RHS dataframe is same as that of in LHS dataframe.
+ # If so, generate new name for RHS DF column using provided affix.
+ df2_column_with_affix = self.__check_and_return_new_column_name(raffix, column,
+ self_columns_lower_actual_map.keys(),
+ "left", affix_type)
+ # Generate select clause string for current column and append to list.
+ select_columns.append("{0} as {1}".format(
+ self.__get_fully_qualified_col_name(column, rhs_alias),
+ df2_column_with_affix))
+
+ # As we are creating new column name, adding it to new metadata dict for new dataframe from join.
+ self.__add_column_type_item_to_dict(new_metaexpr_columns_types,
+ UtilFuncs._teradata_unquote_arg(df1_column_with_affix, "\""),
+ column, lhs_columns_types)
+
+ self.__add_column_type_item_to_dict(new_metaexpr_columns_types,
+ UtilFuncs._teradata_unquote_arg(df2_column_with_affix, "\""),
+ other_column, rhs_columns_types)

- return self._create_dataframe_from_node(join_node_id, new_metaexpr, self._index_label)
+ else:
+ # As column with same name is not present in RHS DataFrame now,
+ # directly adding column to new metadata dict.
+ self.__add_column_type_item_to_dict(new_metaexpr_columns_types, column, column, lhs_columns_types)
+ select_columns.append(UtilFuncs._teradata_quote_arg(column, "\"", False))
+
+ # Processing columns in RHS DF/ other DF.
+ # Here we will only be processing columns which are not overlapping.
+ for column in other.columns:
+ if not df_utils._check_column_exists(column.lower(), self_columns_lower_actual_map.keys()):
+ # As column not present in left DataFrame, directly adding column to new metadata dict.
+ self.__add_column_type_item_to_dict(new_metaexpr_columns_types, column, column, rhs_columns_types)
+ select_columns.append(UtilFuncs._teradata_quote_arg(column, "\"", False))
+
+ # Step 3: Create a node in AED using _aed_join using appropriate alias for involved tables.
+ join_node_id = self._aed_utils._aed_join(self._nodeid, other._nodeid, ", ".join(select_columns),
+ how_lc, join_condition, lhs_alias, rhs_alias)
+
+ # Step 4: Constructing new Metadata (_metaexpr) without DB; using dummy select_nodeid
+ # and underlying table name.
+ new_metaexpr = UtilFuncs._get_metaexpr_using_columns(join_node_id, new_metaexpr_columns_types.items())
+
+ # Return a new joined dataframe.
+ return self._create_dataframe_from_node(join_node_id, new_metaexpr, self._index_label)
+ finally:
+ # Delete the '_join_alias' attribute attached to underlying
+ # SQLALchemy table objects.
+ delattr(self._metaexpr.t, "_join_alias")
+ delattr(other._metaexpr.t, "_join_alias")

  def __add_alias_to_column(self, column, df, alias):
  """
@@ -6766,7 +7007,7 @@
  return "{0}.{1}".format(UtilFuncs._teradata_quote_arg(alias, "\"", False),
  UtilFuncs._teradata_quote_arg(column, "\"", False))

- def __check_and_return_new_column_name(self, affix, column, col_list, df_side, affix_type):
+ def __check_and_return_new_column_name(self, affix, column, col_list, other_df_side, affix_type):
  """
  Check new column name alias with column exists in col_list or not, if exists throws exception else
  returns new column name.
@@ -6775,7 +7016,7 @@
  affix - affix to be added to column.
  column - column name.
  col_list - list of columns to check in which new column is exists or not.
- df_side - Side of the dataframe.
+ other_df_side - Side on which the other dataframe in current join operation resides.
  affix_type - Type of affix. Either "prefix" or "suffix".

  EXAMPLES:
@@ -6789,19 +7030,19 @@
  return UtilFuncs._teradata_quote_arg(column, "\"", False)

  # If Prefix, affix is added before column name else it is appended.
- df1_column_with_affix = "{0}_{1}" if affix_type == "prefix" else "{1}_{0}"
- df1_column_with_affix = df1_column_with_affix.format(affix,
- UtilFuncs._teradata_unquote_arg(column, "\""))
- if df_utils._check_column_exists(df1_column_with_affix.lower(), col_list):
- if df_side == "right":
- suffix_side = "l{}".format(affix_type)
+ column_with_affix = "{0}_{1}" if affix_type == "prefix" else "{1}_{0}"
+ column_with_affix = column_with_affix.format(affix,
+ UtilFuncs._teradata_unquote_arg(column, "\""))
+ if df_utils._check_column_exists(column_with_affix.lower(), col_list):
+ if other_df_side == "right":
+ affix_type = "l{}".format(affix_type)
  else:
- suffix_side = "r{}".format(affix_type)
+ affix_type = "r{}".format(affix_type)
  raise TeradataMlException(
- Messages.get_message(MessageCodes.TDMLDF_COLUMN_ALREADY_EXISTS, df1_column_with_affix, df_side,
- suffix_side),
+ Messages.get_message(MessageCodes.TDMLDF_COLUMN_ALREADY_EXISTS, column_with_affix, other_df_side,
+ affix_type),
  MessageCodes.TDMLDF_COLUMN_ALREADY_EXISTS)
- return UtilFuncs._teradata_quote_arg(df1_column_with_affix, "\"", False)
+ return UtilFuncs._teradata_quote_arg(column_with_affix, "\"", False)

  def __add_column_type_item_to_dict(self, new_metadata_dict, new_column, column, column_types):
  """
@@ -7327,18 +7568,14 @@

  exec_mode = 'REMOTE' if UtilFuncs._is_lake() else 'IN-DB'
  if exec_mode == 'REMOTE':
- if _InternalBuffer.get("auth_token") is None:
- raise TeradataMlException(Messages.get_message(
- MessageCodes.FUNC_EXECUTION_FAILED, "'udf'", 'Authentication token is required to run udf. Set token using set_auth_token().'),
- MessageCodes.FUNC_EXECUTION_FAILED)
- else:
- for colname, col in udf_expr.items():
- env_name = UtilFuncs._get_env_name(col)
- # Store the env_name and its corresponding output column
- if env_name in env_mapper:
- env_mapper[env_name].append(colname)
- else:
- env_mapper[env_name] = [colname]
+ _Validators._check_auth_token("udf")
+ for colname, col in udf_expr.items():
+ env_name = UtilFuncs._get_env_name(col)
+ # Store the env_name and its corresponding output column
+ if env_name in env_mapper:
+ env_mapper[env_name].append(colname)
+ else:
+ env_mapper[env_name] = [colname]
  else:
  env_mapper[env_name] = udf_expr.keys()

@@ -7388,6 +7625,97 @@

  df = tbl_operators.execute()
  return df
+
+ def _assign_call_udf(self, call_udf_expr):
+ """
+ DESCRIPTION:
+ Internal function for DataFrame.assign() to execute the call_udf using
+ Script/Apply Table Operator and create new column for teradataml DataFrame.
+
+ PARAMETER:
+ call_udf_expr:
+ Required Argument.
+ Specifies a dictionary of column name to call_udf expressions.
+ Types: dict
+
+ RETURNS:
+ teradataml DataFrame
+
+ RAISES:
+ None.
+
+ EXAMPLES:
+ # call_udf_expr is a dictionary of column names to call_udf expressions.
+ call_udf_expr = {'upper_col': <teradataml.dataframe.sql._SQLColumnExpression object at 0x0000028E59C44310>,
+ 'sum_col': <teradataml.dataframe.sql._SQLColumnExpression object at 0x0000028E59C41690>}
+ self._assign_register(call_udf_expr)
+ """
+ df = self
+ # Create a dictionary of output columns to column type (teradata type).
+ returns = OrderedDict([(column.name, column.type) for column in df._metaexpr.c])
+ # Create a dictionary of output columns to column type (python types).
+ output_type_converters = {col_name: _Dtypes._teradata_type_to_python_type(col_type) \
+ for col_name, col_type in returns.items()}
+
+ for colname, col in call_udf_expr.items():
+ returns[colname] = col.type
+ output_type_converters[colname] = _Dtypes._teradata_type_to_python_type(col.type)
+ script_name = col._udf_script
+ delimiter = col._delimiter
+ quotechar = col._quotechar
+
+ # Create a dictionary of arguments to be passed to the script.
+ script_data = {}
+ script_data['input_cols'] = df.columns
+ script_data['output_cols'] = list(returns.keys())
+ script_data['output_type_converters'] = output_type_converters
+ script_data['function_args'] = {colname: col._udf_args}
+ script_data['delimiter'] = delimiter
+ script_data['qoutechar'] = quotechar
+
+ # Convert the dictionary to a string.
+ # The string is URL encoded to pass it as a parameter to the script.
+ script_data = urllib.parse.quote_plus(json.dumps(script_data))
+
+ if UtilFuncs._is_lake():
+ from teradataml.table_operators.Apply import Apply
+ apply_op_obj = Apply(data=df,
+ script_name=script_name,
+ env_name=col._env_name,
+ returns = returns,
+ delimiter = delimiter,
+ quotechar=quotechar,
+ files_local_path=GarbageCollector._get_temp_dir_name(),
+ apply_command="python3 {} {}".format(script_name, script_data)
+ )
+ try:
+ df = apply_op_obj.execute_script(
+ output_style=OutputStyle.OUTPUT_TABLE.value)
+ except Exception:
+ raise
+ else:
+ import teradataml.context.context as context
+ database = context._get_current_databasename()
+
+ check_reserved_keyword = False if sorted(list(returns.keys())) == sorted(df.columns) else True
+
+ from teradataml.table_operators.Script import Script
+ table_op_obj = Script(data=df,
+ script_name=script_name,
+ files_local_path=GarbageCollector._get_temp_dir_name(),
+ script_command="{}/bin/python3 ./{}/{} {}".format(
+ configure.indb_install_location, database, script_name, script_data),
+ returns=returns,
+ quotechar=quotechar,
+ delimiter = delimiter
+ )
+ table_op_obj.check_reserved_keyword = check_reserved_keyword
+ try:
+ df = table_op_obj.execute_script(
+ output_style=OutputStyle.OUTPUT_TABLE.value)
+ except Exception:
+ raise
+ return df

  @collect_queryband(queryband="DF_assign")
  def assign(self, drop_columns=False, **kwargs):
@@ -7420,7 +7748,7 @@
  * SQLAlchemy ClauseElements.
  (See teradataml extension with SQLAlchemy in teradataml User Guide
  and Function reference guide for more details)
- * Function - udf.
+ * Function - udf, call_udf.


  RETURNS:
@@ -7845,6 +8173,30 @@
  Orange Inc 210.0 NaN NaN 250.0 17/01/04 ORANGE INC
  Red Inc 200.0 150.0 140.0 NaN 17/01/04 RED INC
  >>>
+
+ # Example 20: Register and Call the user defined function to get the values upper case.
+ >>> from teradataml.dataframe.functions import udf, register, call_udf
+ >>> @udf
+ ... def to_upper(s):
+ ... if s is not None:
+ ... return s.upper()
+ >>>
+ # Register the created user defined function with name "upper".
+ >>> register("upper", to_upper)
+ >>>
+ # Call the user defined function registered with name "upper" and assign the
+ # ColumnExpression returned to the DataFrame.
+ >>> res = df.assign(upper_col = call_udf("upper", ('accounts',)))
+ >>> res
+ Feb Jan Mar Apr datetime upper_col
+ accounts
+ Alpha Co 210.0 200.0 215.0 250.0 17/01/04 ALPHA CO
+ Blue Inc 90.0 50.0 95.0 101.0 17/01/04 BLUE INC
+ Yellow Inc 90.0 NaN NaN NaN 17/01/04 YELLOW INC
+ Jones LLC 200.0 150.0 140.0 180.0 17/01/04 JONES LLC
+ Orange Inc 210.0 NaN NaN 250.0 17/01/04 ORANGE INC
+ Red Inc 200.0 150.0 140.0 NaN 17/01/04 RED INC
+ >>>
  """
  # Argument validations
  awu_matrix = []
@@ -7894,10 +8246,14 @@
  # column name to normal/regular expressions.
  udf_expr = {}
  regular_expr = {}
+ call_udf_expr = {}
  for colname, col in kwargs.items():
  # If value passed in kwargs is a ColumnExpression and is a udf, store it.
  if isinstance(col, ColumnExpression) and col._udf:
  udf_expr[colname] = col
+ # If value passed in kwargs is a ColumnExpression and is a registerd udf script, store it.
+ elif isinstance(col, ColumnExpression) and col._udf_script:
+ call_udf_expr[colname] = col
  else:
  regular_expr[colname] = col
  df = self
@@ -7917,6 +8273,9 @@
  if bool(udf_expr):
  df = df._assign_udf(udf_expr)

+ if bool(call_udf_expr):
+ df = df._assign_call_udf(call_udf_expr)
+
  return df


@@ -9553,6 +9912,12 @@
  # Validate argument types
  _Validators._validate_function_arguments(awu_matrix)

+ # If self and right DataFrames are pointing to same Table object,
+ # raise error.
+ if self._metaexpr.t is right._metaexpr.t:
+ raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_ALIAS_REQUIRED, "merge"),
+ MessageCodes.TDMLDF_ALIAS_REQUIRED)
+
  if (right_on is not None and left_on is None) or (right_on is None and left_on is not None):
  raise TeradataMlException(
  Messages.get_message(MessageCodes.MUST_PASS_ARGUMENT, "left_on", "right_on"),
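merge() gets the same guard as join(): passing the identical DataFrame object as 'right' now raises TDMLDF_ALIAS_REQUIRED instead of producing an ambiguous query. A sketch of the pattern this implies for a self merge, assuming the admissions_train example data and merge's usual suffix arguments:

    >>> df = DataFrame("admissions_train")
    >>> df.merge(df, on="id", how="inner")   # now raises: alias required
    >>> df.merge(df.alias("rhs"), on="id", how="inner", lsuffix="l", rsuffix="r")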
@@ -12331,6 +12696,9 @@
  _Validators._validate_column_exists_in_dataframe(column_names, self._metaexpr,
  False)
  column_names = list(dict.fromkeys(column_names))
+
+ if list_td_reserved_keywords(column_names):
+ column_names = UtilFuncs._teradata_quote_arg(column_names, "\"", False)

  col_names_types = df_utils._get_required_columns_types_from_metaexpr(self._metaexpr, column_names)
  sel_nodeid = self._aed_utils._aed_select(self._nodeid, ','.join(column_names), True)
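This is the first use of the new list_td_reserved_keywords import from dbutils: as the code reads, it returns a truthy value when any requested column name collides with a Teradata reserved word, in which case every name in the list is double-quoted before the SELECT node is built. A behavior sketch with an illustrative column name:

    >>> column_names = ["user", "gpa"]   # 'user' collides with a reserved word (illustrative)
    >>> if list_td_reserved_keywords(column_names):
    ...     # quote every requested name so the generated SELECT parses
    ...     column_names = UtilFuncs._teradata_quote_arg(column_names, "\"", False)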