teradataml 20.0.0.2__py3-none-any.whl → 20.0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of teradataml might be problematic.

Files changed (126)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/README.md +315 -2
  3. teradataml/__init__.py +4 -0
  4. teradataml/_version.py +1 -1
  5. teradataml/analytics/analytic_function_executor.py +95 -8
  6. teradataml/analytics/byom/__init__.py +1 -1
  7. teradataml/analytics/json_parser/metadata.py +12 -3
  8. teradataml/analytics/json_parser/utils.py +7 -2
  9. teradataml/analytics/sqle/__init__.py +5 -1
  10. teradataml/analytics/table_operator/__init__.py +1 -1
  11. teradataml/analytics/uaf/__init__.py +1 -1
  12. teradataml/analytics/utils.py +4 -0
  13. teradataml/analytics/valib.py +18 -4
  14. teradataml/automl/__init__.py +51 -6
  15. teradataml/automl/data_preparation.py +59 -35
  16. teradataml/automl/data_transformation.py +58 -33
  17. teradataml/automl/feature_engineering.py +27 -12
  18. teradataml/automl/model_training.py +73 -46
  19. teradataml/common/constants.py +88 -29
  20. teradataml/common/garbagecollector.py +2 -1
  21. teradataml/common/messagecodes.py +19 -3
  22. teradataml/common/messages.py +6 -1
  23. teradataml/common/sqlbundle.py +64 -12
  24. teradataml/common/utils.py +246 -47
  25. teradataml/common/warnings.py +11 -0
  26. teradataml/context/context.py +161 -27
  27. teradataml/data/amazon_reviews_25.csv +26 -0
  28. teradataml/data/byom_example.json +11 -0
  29. teradataml/data/dataframe_example.json +18 -2
  30. teradataml/data/docs/byom/docs/DataRobotPredict.py +2 -2
  31. teradataml/data/docs/byom/docs/DataikuPredict.py +40 -1
  32. teradataml/data/docs/byom/docs/H2OPredict.py +2 -2
  33. teradataml/data/docs/byom/docs/ONNXEmbeddings.py +242 -0
  34. teradataml/data/docs/byom/docs/ONNXPredict.py +2 -2
  35. teradataml/data/docs/byom/docs/PMMLPredict.py +2 -2
  36. teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +1 -1
  37. teradataml/data/docs/sqle/docs_17_20/Shap.py +34 -6
  38. teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +4 -4
  39. teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
  40. teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
  41. teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
  42. teradataml/data/docs/uaf/docs_17_20/DWT2D.py +4 -1
  43. teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
  44. teradataml/data/hnsw_alter_data.csv +5 -0
  45. teradataml/data/hnsw_data.csv +10 -0
  46. teradataml/data/jsons/byom/h2opredict.json +1 -1
  47. teradataml/data/jsons/byom/onnxembeddings.json +266 -0
  48. teradataml/data/jsons/sqle/17.20/TD_Shap.json +0 -1
  49. teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
  50. teradataml/data/jsons/sqle/20.00/TD_HNSW.json +296 -0
  51. teradataml/data/jsons/sqle/20.00/TD_HNSWPredict.json +206 -0
  52. teradataml/data/jsons/sqle/20.00/TD_HNSWSummary.json +32 -0
  53. teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
  54. teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
  55. teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
  56. teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
  57. teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
  58. teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
  59. teradataml/data/jsons/uaf/17.20/TD_SAX.json +3 -1
  60. teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +15 -5
  61. teradataml/data/medical_readings.csv +101 -0
  62. teradataml/data/patient_profile.csv +101 -0
  63. teradataml/data/scripts/lightgbm/dataset.template +157 -0
  64. teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
  65. teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
  66. teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
  67. teradataml/data/scripts/sklearn/sklearn_fit.py +194 -167
  68. teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
  69. teradataml/data/scripts/sklearn/sklearn_function.template +14 -19
  70. teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
  71. teradataml/data/scripts/sklearn/sklearn_transform.py +129 -42
  72. teradataml/data/target_udt_data.csv +8 -0
  73. teradataml/data/templates/open_source_ml.json +3 -2
  74. teradataml/data/teradataml_example.json +8 -0
  75. teradataml/data/vectordistance_example.json +4 -0
  76. teradataml/dataframe/copy_to.py +8 -3
  77. teradataml/dataframe/data_transfer.py +11 -1
  78. teradataml/dataframe/dataframe.py +1049 -285
  79. teradataml/dataframe/dataframe_utils.py +152 -20
  80. teradataml/dataframe/functions.py +578 -35
  81. teradataml/dataframe/setop.py +11 -6
  82. teradataml/dataframe/sql.py +185 -16
  83. teradataml/dbutils/dbutils.py +1049 -115
  84. teradataml/dbutils/filemgr.py +48 -1
  85. teradataml/hyperparameter_tuner/optimizer.py +12 -1
  86. teradataml/lib/aed_0_1.dll +0 -0
  87. teradataml/opensource/__init__.py +1 -1
  88. teradataml/opensource/_base.py +1466 -0
  89. teradataml/opensource/_class.py +464 -0
  90. teradataml/opensource/{sklearn/constants.py → _constants.py} +21 -14
  91. teradataml/opensource/_lightgbm.py +949 -0
  92. teradataml/opensource/_sklearn.py +1008 -0
  93. teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +5 -6
  94. teradataml/options/__init__.py +54 -38
  95. teradataml/options/configure.py +131 -27
  96. teradataml/options/display.py +13 -2
  97. teradataml/plot/axis.py +47 -8
  98. teradataml/plot/figure.py +33 -0
  99. teradataml/plot/plot.py +63 -13
  100. teradataml/scriptmgmt/UserEnv.py +5 -5
  101. teradataml/scriptmgmt/lls_utils.py +130 -40
  102. teradataml/store/__init__.py +12 -0
  103. teradataml/store/feature_store/__init__.py +0 -0
  104. teradataml/store/feature_store/constants.py +291 -0
  105. teradataml/store/feature_store/feature_store.py +2318 -0
  106. teradataml/store/feature_store/models.py +1505 -0
  107. teradataml/table_operators/Apply.py +32 -18
  108. teradataml/table_operators/Script.py +3 -1
  109. teradataml/table_operators/TableOperator.py +3 -1
  110. teradataml/table_operators/query_generator.py +3 -0
  111. teradataml/table_operators/table_operator_query_generator.py +3 -1
  112. teradataml/table_operators/table_operator_util.py +37 -38
  113. teradataml/table_operators/templates/dataframe_register.template +69 -0
  114. teradataml/utils/dtypes.py +51 -2
  115. teradataml/utils/internal_buffer.py +18 -0
  116. teradataml/utils/validators.py +99 -8
  117. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/METADATA +321 -5
  118. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/RECORD +121 -94
  119. teradataml/libaed_0_1.dylib +0 -0
  120. teradataml/libaed_0_1.so +0 -0
  121. teradataml/opensource/sklearn/__init__.py +0 -1
  122. teradataml/opensource/sklearn/_class.py +0 -255
  123. teradataml/opensource/sklearn/_sklearn_wrapper.py +0 -1800
  124. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/WHEEL +0 -0
  125. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/top_level.txt +0 -0
  126. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/zip-safe +0 -0
@@ -19,6 +19,10 @@ import pandas as pd
19
19
  import re
20
20
  import sqlalchemy
21
21
  import sys
22
+ import urllib.parse
23
+
24
+ from sqlalchemy import Column
25
+
22
26
  import teradataml.context.context as tdmlctx
23
27
 
24
28
  from collections import OrderedDict, namedtuple
@@ -30,6 +34,7 @@ from teradataml.dataframe.sql_interfaces import ColumnExpression
30
34
  from teradataml.dataframe.sql_functions import case
31
35
  from teradataml.series.series import Series
32
36
  from teradatasqlalchemy.types import _TDType, BIGINT, INTEGER, PERIOD_TIMESTAMP, SMALLINT, BYTEINT, FLOAT, DECIMAL
37
+ from teradataml.common.deprecations import argument_deprecation
33
38
  from teradataml.common.utils import UtilFuncs
34
39
  from teradataml.common.exceptions import TeradataMlException
35
40
  from teradataml.common.messages import Messages
@@ -41,9 +46,11 @@ from teradataml.dataframe.dataframe_utils import DataFrameUtils as df_utils, Dat
41
46
  from teradataml.dataframe.indexer import _LocationIndexer
42
47
  from teradataml.common.aed_utils import AedUtils
43
48
  from teradataml.options.display import display
49
+ from teradataml.options.configure import configure
44
50
  from teradataml.dataframe.copy_to import copy_to_sql
45
51
  from teradataml.dataframe.row import _Row
46
52
  from teradataml.dataframe.setop import concat
53
+ from teradataml.dbutils.dbutils import list_td_reserved_keywords
47
54
  from teradataml.plot.plot import _Plot
48
55
  from teradataml.scriptmgmt.UserEnv import UserEnv
49
56
  from teradataml.utils.dtypes import _Dtypes, _ListOf, _TupleOf
@@ -57,10 +64,83 @@ from teradataml.common.bulk_exposed_utils import _validate_unimplemented_functio
57
64
  from teradataml.telemetry_utils.queryband import collect_queryband
58
65
  from teradataml.options.configure import configure
59
66
  from teradataml.utils.internal_buffer import _InternalBuffer
67
+ from teradataml.common.constants import OutputStyle
60
68
 
61
69
  # TODO use logger when available on master branch
62
70
  # logger = teradatapylog.getLogger()
63
- in_schema = UtilFuncs._in_schema
71
+
72
+ class in_schema:
73
+ """
74
+ Class takes a schema name, a table name and an optional datalake name
75
+ and creates an object that can be passed to DataFrame.
76
+ Note:
77
+ teradataml recommends using this class to access table(s)/view(s)
78
+ from a database other than the default database.
79
+ """
80
+ def __init__(self, schema_name, table_name, datalake_name=None):
81
+ """
82
+ Constructor for in_schema class.
83
+
84
+ PARAMETERS:
85
+ schema_name:
86
+ Required Argument.
87
+ Specifies the schema where the table resides.
88
+ Types: str
89
+
90
+ table_name:
91
+ Required Argument.
92
+ Specifies the table name or view name in Vantage.
93
+ Types: str
94
+
95
+ datalake_name:
96
+ Optional Argument.
97
+ Specifies the datalake name.
98
+ Types: str
99
+
100
+ EXAMPLES:
101
+ from teradataml.dataframe.dataframe import in_schema, DataFrame
102
+
103
+ # Example 1: The following example creates a DataFrame from the
104
+ # existing Vantage table "dbcinfo" in the non-default
105
+ # database "dbc" using the in_schema instance.
106
+ df = DataFrame(in_schema("dbc", "dbcinfo"))
107
+
108
+ # Example 2: The following example uses from_table() function, existing
109
+ # Vantage table "dbcinfo" and non-default database "dbc" to
110
+ # create a teradataml DataFrame.
111
+ df = DataFrame.from_table(in_schema("dbc","dbcinfo"))
112
+
113
+ # Example 3: The following example uses "in_schema" object created
114
+ # with "datalake_name" argument to create DataFrame on OTF table.
115
+ otf_df = DataFrame(in_schema("datalake_db","datalake_table","datalake"))
116
+
117
+ """
118
+ self.schema_name = schema_name
119
+ self.table_name = table_name
120
+ self.datalake_name = datalake_name
121
+
122
+ awu_matrix = []
123
+ awu_matrix.append(["schema_name", schema_name, False, (str), True])
124
+ awu_matrix.append(["table_name", table_name, False, (str), True])
125
+ awu_matrix.append(["datalake_name", datalake_name, True, (str), True])
126
+
127
+ # Validate argument types
128
+ _Validators._validate_function_arguments(awu_matrix)
129
+
130
+ def __str__(self):
131
+ """
132
+ Returns the string representation of in_schema instance.
133
+ """
134
+ tbl_name = '{}.{}'.format(UtilFuncs._teradata_quote_arg(self.schema_name, "\"", False),
135
+ UtilFuncs._teradata_quote_arg(self.table_name, "\"", False))
136
+
137
+ if not self.datalake_name:
138
+ return tbl_name
139
+
140
+ return '{}.{}'.format(UtilFuncs._teradata_quote_arg(self.datalake_name, "\"", False), tbl_name)
141
+
142
+
143
+ in_schema = in_schema
64
144
 
65
145
 
66
146
  class DataFrame():
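The in_schema class added above replaces the old UtilFuncs._in_schema alias with a small object whose __str__ builds the fully quoted object name. A minimal sketch of the expected rendering, inferred only from the __str__ implementation shown above (the literal output strings are an inference, not output captured from the package):

>>> from teradataml.dataframe.dataframe import in_schema
>>> str(in_schema("dbc", "dbcinfo"))                              # schema plus table
'"dbc"."dbcinfo"'
>>> str(in_schema("datalake_db", "datalake_table", "datalake"))   # datalake name prepended
'"datalake"."datalake_db"."datalake_table"'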
@@ -163,6 +243,19 @@ class DataFrame():
163
243
  # Property to determine if table is an ART table or not.
164
244
  self._is_art = None
165
245
 
246
+ self._datalake = None
247
+ self._database = None
248
+ self._table = None
249
+ self._otf = False
250
+
251
+ if isinstance(table_name, in_schema):
252
+ self._table = table_name.table_name
253
+ self._datalake = table_name.datalake_name
254
+ self._database = table_name.schema_name
255
+ self._otf = True if self._datalake else False
256
+
257
+ table_name = str(table_name) if isinstance(table_name, in_schema) else table_name
258
+
166
259
  # Below matrix is list of list, where in each row contains following elements:
167
260
  # Let's take an example of following, just to get an idea:
168
261
  # [element1, element2, element3, element4, element5, element6]
@@ -195,25 +288,45 @@ class DataFrame():
195
288
  self._source_type = SourceType.TABLE.value
196
289
  self._nodeid = self._aed_utils._aed_table(self._table_name)
197
290
  elif query is not None:
291
+ query = query.strip()
292
+ query = query[:-1] if query[-1] == ";" else query
293
+
198
294
  self._query = query
199
295
  self._source_type = SourceType.QUERY.value
200
296
 
201
- if materialize:
202
- # If user requests to materialize the the query, then we should create a
297
+ temp_obj_params = {
298
+ "prefix": "_frmqry_v",
299
+ "use_default_database": True,
300
+ "quote": False
301
+ }
302
+ __execute = UtilFuncs._create_view
303
+
304
+ if configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE:
305
+ # If user requests to materialize the query, then we should create a
306
+ # volatile table instead of a view, if the user has configured the temporary object type so.
307
+ # Volatile table does not need to be added to the GC.
308
+ temp_obj_params["table_type"] = TeradataConstants.TERADATA_VOLATILE_TABLE
309
+ temp_obj_params["gc_on_quit"] = False
310
+ temp_obj_params["prefix"] = "_frmqry_vt"
311
+ __execute = UtilFuncs._create_table
312
+
313
+ elif materialize:
314
+ # If user requests to materialize the query, then we should create a
203
315
  # table instead of view and add the same in the GarbageCollector.
204
- temp_table_name = UtilFuncs._generate_temp_table_name(prefix="_frmqry_t", use_default_database=True,
205
- quote=False,
206
- table_type=TeradataConstants.TERADATA_TABLE)
207
- else:
208
- temp_table_name = UtilFuncs._generate_temp_table_name(prefix="_frmqry_v", use_default_database=True,
209
- quote=False)
316
+ temp_obj_params["table_type"] = TeradataConstants.TERADATA_TABLE
317
+ temp_obj_params["gc_on_quit"] = True
318
+ temp_obj_params["prefix"] = "_frmqry_t"
319
+ __execute = UtilFuncs._create_table
210
320
 
321
+ temp_table_name = UtilFuncs._generate_temp_table_name(**temp_obj_params)
211
322
  self._table_name = temp_table_name
323
+ __execute_params = (self._table_name, self._query)
324
+
325
+ if configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE:
326
+ __execute_params = (self._table_name, self._query, True)
327
+
212
328
  try:
213
- if materialize:
214
- UtilFuncs._create_table(self._table_name, self._query)
215
- else:
216
- UtilFuncs._create_view(self._table_name, self._query)
329
+ __execute(*__execute_params)
217
330
  except OperationalError as oe:
218
331
  if "[Error 3707] Syntax error" in str(oe):
219
332
  raise ValueError(Messages.get_message(
@@ -229,7 +342,7 @@ class DataFrame():
229
342
 
230
343
  self._nodeid = self._aed_utils._aed_query(self._query, temp_table_name)
231
344
  else:
232
- if inspect.stack()[1][3] not in ['_from_node', '__init__']:
345
+ if inspect.stack()[1][3] not in ['_from_node', '__init__', 'alias']:
233
346
  raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_CREATE_FAIL),
234
347
  MessageCodes.TDMLDF_CREATE_FAIL)
235
348
 
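With the __init__ changes above, DataFrame.from_query() strips a trailing semicolon from the query and backs it with a view, a garbage-collected table (materialize=True), or a volatile table when configure.temp_object_type is set to the volatile-table constant. A rough usage sketch; assigning the TeradataConstants member directly to configure.temp_object_type, and that import path, are assumptions based on the comparison in the code rather than documented settings:

>>> from teradataml import DataFrame
>>> from teradataml.options.configure import configure
>>> from teradataml.common.constants import TeradataConstants          # assumed import path
>>> df = DataFrame.from_query("SELECT * FROM sales;")                  # trailing ";" is now stripped
>>> configure.temp_object_type = TeradataConstants.TERADATA_VOLATILE_TABLE   # hypothetical toggle
>>> df_vt = DataFrame.from_query("SELECT * FROM sales")                # backed by a "_frmqry_vt" volatile table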
@@ -241,6 +354,10 @@ class DataFrame():
241
354
  self._iloc = _LocationIndexer(self, integer_indexing=True)
242
355
  self.__data = None
243
356
  self.__data_columns = None
357
+ self._alias = None
358
+ self._plot = None
359
+
360
+ self._eda_ui = None
244
361
 
245
362
  except TeradataMlException:
246
363
  raise
@@ -250,9 +367,106 @@ class DataFrame():
250
367
  raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_CREATE_FAIL),
251
368
  MessageCodes.TDMLDF_CREATE_FAIL) from err
252
369
 
370
+ @property
371
+ def db_object_name(self):
372
+ """
373
+ DESCRIPTION:
374
+ Get the underlying database object name, on which DataFrame is
375
+ created.
376
+
377
+ RETURNS:
378
+ str representing object name of DataFrame
379
+
380
+ EXAMPLES:
381
+ >>> load_example_data("dataframe", "sales")
382
+ >>> df = DataFrame('sales')
383
+ >>> df.db_object_name
384
+ '"sales"'
385
+ """
386
+ if self._table_name is not None:
387
+ return self._table_name
388
+ else:
389
+ msg = "Object name is available once DataFrame is materialized. " \
390
+ "Use DataFrame.materialize() to materialize DataFrame."
391
+ print(msg)
392
+
393
+ def alias(self, alias_name):
394
+ """
395
+ DESCRIPTION:
396
+ Method to create an aliased teradataml DataFrame.
397
+ Note:
398
+ * This method is recommended to be used before performing
399
+ self join using DataFrame's join() API.
400
+
401
+ PARAMETERS:
402
+ alias_name:
403
+ Required Argument.
404
+ Specifies the alias name to be assigned to a teradataml DataFrame.
405
+ Types: str
406
+
407
+ RETURNS:
408
+ teradataml DataFrame
409
+
410
+ EXAMPLES:
411
+ >>> load_example_data("dataframe", "admissions_train")
412
+ >>> df = DataFrame("admissions_train")
413
+ >>> df
414
+ masters gpa stats programming admitted
415
+ id
416
+ 13 no 4.00 Advanced Novice 1
417
+ 26 yes 3.57 Advanced Advanced 1
418
+ 5 no 3.44 Novice Novice 0
419
+ 19 yes 1.98 Advanced Advanced 0
420
+ 15 yes 4.00 Advanced Advanced 1
421
+ 40 yes 3.95 Novice Beginner 0
422
+ 7 yes 2.33 Novice Novice 1
423
+ 22 yes 3.46 Novice Beginner 0
424
+ 36 no 3.00 Advanced Novice 0
425
+ 38 yes 2.65 Advanced Beginner 1
426
+
427
+ # Example 1: Create an alias of teradataml DataFrame.
428
+
429
+ >>> df2 = df.alias("adm_trn")
430
+
431
+ # Print aliased DataFrame.
432
+ >>> df2
433
+ masters gpa stats programming admitted
434
+ id
435
+ 13 no 4.00 Advanced Novice 1
436
+ 26 yes 3.57 Advanced Advanced 1
437
+ 5 no 3.44 Novice Novice 0
438
+ 19 yes 1.98 Advanced Advanced 0
439
+ 15 yes 4.00 Advanced Advanced 1
440
+ 40 yes 3.95 Novice Beginner 0
441
+ 7 yes 2.33 Novice Novice 1
442
+ 22 yes 3.46 Novice Beginner 0
443
+ 36 no 3.00 Advanced Novice 0
444
+ 38 yes 2.65 Advanced Beginner 1
445
+ """
446
+ arg_info_matrix = [["alias_name", alias_name, False, (str), True]]
447
+ _Validators._validate_function_arguments(arg_info_matrix)
448
+ try:
449
+ alias_df = self._from_node(self._nodeid, self._metaexpr, self._index_label,
450
+ reuse_metaexpr=False, _datalake=self._datalake,
451
+ _database=self._database, _table=self._table,
452
+ _otf=self._otf)
453
+ # Assigning self attributes to newly created alias dataframe.
454
+ alias_df._table_name = self._table_name
455
+ alias_df._index = self._index
456
+ alias_df._index_label = self._index_label
457
+ setattr(alias_df._metaexpr.t, "table_alias", alias_name)
458
+ alias_df._alias = alias_name
459
+ return alias_df
460
+ except Exception as err:
461
+ error_code = MessageCodes.EXECUTION_FAILED
462
+ error_msg = Messages.get_message(
463
+ error_code, "create alias dataFrame", '{}'.format(str(err)))
464
+ raise TeradataMlException(error_msg, error_code)
465
+
253
466
  @classmethod
254
467
  @collect_queryband(queryband="DF_fromTable")
255
- def from_table(cls, table_name, index=True, index_label=None):
468
+ def from_table(cls, table_name, index=True, index_label=None,
469
+ schema_name=None, datalake_name=None):
256
470
  """
257
471
  Class method for creating a DataFrame from a table or a view.
258
472
 
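db_object_name and alias() above expose the underlying object name and aliased views of a DataFrame. A small illustrative sketch, assuming the sales example table is loaded (the quoted name is taken from the docstring above):

>>> df = DataFrame("sales")
>>> df.db_object_name
'"sales"'
>>> sales_rhs = df.alias("sales_rhs")   # aliased copy, usable as the right side of a self join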
@@ -273,30 +487,48 @@ class DataFrame():
273
487
  Column/s used for sorting.
274
488
  Types: str
275
489
 
490
+ schema_name:
491
+ Optional Argument.
492
+ Specifies the schema where the table resides.
493
+ Types: str
494
+
495
+ datalake_name:
496
+ Optional Argument.
497
+ Specifies the datalake name.
498
+ Types: str
499
+
276
500
  EXAMPLES:
277
- from teradataml.dataframe.dataframe import DataFrame
501
+ >>> from teradataml.dataframe.dataframe import DataFrame
278
502
 
279
503
  # Example 1: The following example creates a DataFrame from a table or
280
504
  a view.
281
505
  # Load the example data.
282
- load_example_data("dataframe","sales")
506
+ >>> load_example_data("dataframe","sales")
283
507
 
284
508
  # Create DataFrame from table
285
- df = DataFrame.from_table('sales')
509
+ >>> df = DataFrame.from_table('sales')
286
510
 
287
511
  # Create DataFrame from table and without index column sorting.
288
- df = DataFrame.from_table("sales", False)
512
+ >>> df = DataFrame.from_table("sales", False)
289
513
 
290
514
  # Create DataFrame from table and sorting using the 'accounts'
291
515
  # column.
292
- df = DataFrame.from_table("sales", True, "accounts")
516
+ >>> df = DataFrame.from_table("sales", True, "accounts")
293
517
 
294
518
  # Example 2: The following example creates a DataFrame from existing Vantage
295
519
  # table "dbcinfo" in the non-default database "dbc" using the
296
520
  # in_schema() function.
297
521
 
298
- from teradataml.dataframe.dataframe import in_schema
299
- df = DataFrame.from_table(in_schema("dbc", "dbcinfo"))
522
+ >>> from teradataml.dataframe.dataframe import in_schema
523
+ >>> df = DataFrame.from_table(in_schema("dbc", "dbcinfo"))
524
+
525
+ # Example 3: Create a DataFrame on existing DataLake
526
+ # table "lake_table" in the "datalake_database" database
527
+ # in "datalake" datalake.
528
+
529
+ >>> datalake_df = DataFrame.from_table(table_name="lake_table",
530
+ ... schema_name="datalake_database",
531
+ ... datalake_name="datalake" )
300
532
 
301
533
  RETURNS:
302
534
  DataFrame
@@ -305,6 +537,9 @@ class DataFrame():
305
537
  TeradataMlException - TDMLDF_CREATE_FAIL
306
538
 
307
539
  """
540
+ if schema_name:
541
+ return cls(in_schema(schema_name, table_name, datalake_name))
542
+
308
543
  return cls(table_name, index, index_label)
309
544
 
310
545
  @classmethod
@@ -364,7 +599,7 @@ class DataFrame():
364
599
  return cls(index=index, index_label=index_label, query=query, materialize=materialize)
365
600
 
366
601
  @classmethod
367
- def _from_node(cls, nodeid, metaexpr, index_label=None, undropped_index=None):
602
+ def _from_node(cls, nodeid, metaexpr, index_label=None, undropped_index=None, reuse_metaexpr=True, **kwargs):
368
603
  """
369
604
  Private class method for creating a DataFrame from a nodeid and parent metadata.
370
605
 
@@ -385,6 +620,12 @@ class DataFrame():
385
620
  Optional Argument.
386
621
  List specifying index column(s) to be retained as columns for printing.
387
622
 
623
+ reuse_metaexpr:
624
+ Optional Argument.
625
+ Specifies the flag to decide whether to use same _MetaExpression object or not.
626
+ Default Value: True
627
+ Types: bool
628
+
388
629
  EXAMPLES:
389
630
  from teradataml.dataframe.dataframe import DataFrame
390
631
  df = DataFrame._from_node(1234, metaexpr)
@@ -400,30 +641,50 @@ class DataFrame():
400
641
  df = cls()
401
642
  df._nodeid = nodeid
402
643
  df._source_type = SourceType.TABLE.value
403
- df._get_metadata_from_metaexpr(metaexpr)
644
+
645
+ if not reuse_metaexpr:
646
+ # Create new _MetaExpression object using reference metaExpression
647
+ # for newly created DataFrame.
648
+ df._metaexpr = UtilFuncs._get_metaexpr_using_parent_metaexpr(nodeid, metaexpr)
649
+ # When metaexpression is created using only column information from parent DataFrame,
650
+ # underlying SQLAlchemy table is created with '' string as Table name.
651
+ # Assign name from reference metaexpression here.
652
+ df._metaexpr.t.name = metaexpr.t.name
653
+ # Populate corresponding information into newly created DataFrame object
654
+ # using newly created metaExpression.
655
+ df._get_metadata_from_metaexpr(df._metaexpr)
656
+ else:
657
+ # Populate corresponding information into newly created DataFrame object
658
+ # using reference metaExpression.
659
+ df._get_metadata_from_metaexpr(metaexpr)
404
660
 
405
661
  if isinstance(index_label, str):
406
662
  index_label = [index_label]
407
663
 
408
- if index_label is not None and all(elem in [col.name for col in metaexpr.c] for elem in index_label):
664
+ if index_label is not None and all(elem in [col.name for col in df._metaexpr.c] for elem in index_label):
409
665
  df._index_label = index_label
410
666
  elif index_label is not None and all(UtilFuncs._teradata_quote_arg(elem, "\"", False)
411
- in [col.name for col in metaexpr.c] for elem in index_label):
667
+ in [col.name for col in df._metaexpr.c] for elem in index_label):
412
668
  df._index_label = index_label
413
669
 
414
670
  # Set the flag suggesting that the _index_label is set,
415
- # and that a database lookup wont be required even when it is None.
671
+ # and that a database lookup won't be required even when it is None.
416
672
  df._index_query_required = False
417
673
 
418
674
  if isinstance(undropped_index, str):
419
675
  undropped_index = [undropped_index]
420
676
 
421
- if undropped_index is not None and all(elem in [col.name for col in metaexpr.c] for elem in undropped_index):
677
+ if undropped_index is not None and all(elem in [col.name for col in df._metaexpr.c] for elem in undropped_index):
422
678
  df._undropped_index = undropped_index
423
679
  elif undropped_index is not None and all(UtilFuncs._teradata_quote_arg(elem, "\"", False)
424
- in [col.name for col in metaexpr.c] for elem in undropped_index):
680
+ in [col.name for col in df._metaexpr.c] for elem in undropped_index):
425
681
  df._undropped_index = undropped_index
426
682
 
683
+ # Populate remaining attributes.
684
+ for arg in kwargs:
685
+ # Pop each argument from kwargs and assign to new DataFrame.
686
+ arg_value = kwargs.get(arg)
687
+ df.__setattr__(arg, arg_value)
427
688
  return df
428
689
 
429
690
  def create_temp_view(self, name):
@@ -551,9 +812,10 @@ class DataFrame():
551
812
  return self
552
813
 
553
814
  @collect_queryband(queryband="DF_fillna")
554
- def fillna(self, value=None, columns=None, literal_value=False):
815
+ def fillna(self, value=None, columns=None, literal_value=False, partition_column=None):
555
816
  """
556
- Method to replace the null values in a column with the value specified.
817
+ DESCRIPTION:
818
+ Method to replace the null values in a column with the value specified.
557
819
 
558
820
  PARAMETERS:
559
821
  value:
@@ -586,6 +848,12 @@ class DataFrame():
586
848
  Default Value: False
587
849
  Types: bool
588
850
 
851
+ partition_column:
852
+ Optional Argument.
853
+ Specifies the column name to partition the data.
854
+ Default Value: None
855
+ Types: str
856
+
589
857
  RETURNS:
590
858
  teradataml DataFrame
591
859
 
@@ -626,6 +894,26 @@ class DataFrame():
626
894
  3 Blue Inc 90.0 50 95.0 101.0 17/01/04
627
895
  4 Alpha Co 210.0 200 215.0 250.0 17/01/04
628
896
  5 Orange Inc 210.0 50 NaN 250.0 17/01/04
897
+
898
+ # Example 3: Populate the null values in the 'pclass' and
899
+ # 'fare' columns with the mean value, partitioning
900
+ # the data by the 'sex' column.
901
+ # Load the example data.
902
+ >>> load_example_data("teradataml", ["titanic"])
903
+ >>> df = DataFrame.from_table("titanic")
904
+
905
+ >>> df.fillna(value="mean", columns=["pclass", "fare"], partition_column="sex")
906
+ passenger survived pclass name sex age sibsp parch ticket fare cabin embarked
907
+ 0 284 1 3 Dorking, Mr. Edward Arthur male 19.0 0 0 A/5. 10482 8.0500 None S
908
+ 1 589 0 3 Gilinski, Mr. Eliezer male 22.0 0 0 14973 8.0500 None S
909
+ 2 17 0 3 Rice, Master. Eugene male 2.0 4 1 382652 29.1250 None Q
910
+ 3 282 0 3 Olsson, Mr. Nils Johan Goransson male 28.0 0 0 347464 7.8542 None S
911
+ 4 608 1 1 Daniel, Mr. Robert Williams male 27.0 0 0 113804 30.5000 None S
912
+ 5 404 0 3 Hakkarainen, Mr. Pekka Pietari male 28.0 1 0 STON/O2. 3101279 15.8500 None S
913
+ 6 427 1 2 Clarke, Mrs. Charles V (Ada Maria Winfield) female 28.0 1 0 2003 26.0000 None S
914
+ 7 141 0 3 Boulos, Mrs. Joseph (Sultana) female NaN 0 2 2678 15.2458 None C
915
+ 8 610 1 1 Shutes, Miss. Elizabeth W female 40.0 0 0 PC 17582 153.4625 C125 S
916
+ 9 875 1 2 Abelson, Mrs. Samuel (Hannah Wizosky) female 28.0 1 0 P/PP 3381 24.0000 None C
629
917
  """
630
918
  from teradataml import SimpleImputeFit, SimpleImputeTransform
631
919
 
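The hunks that follow route the new partition_column argument of fillna() into SimpleImputeFit and its transform() call. A hedged sketch of the roughly equivalent explicit calls for the titanic example above; the argument names are copied from the code in the next hunks rather than from the SimpleImputeFit documentation:

>>> from teradataml import SimpleImputeFit
>>> fit_obj = SimpleImputeFit(data=df, stats="mean", stats_columns=["pclass", "fare"],
...                           partition_column="sex")
>>> filled = fit_obj.transform(data=df, data_partition_column="sex",
...                            object_partition_column="sex").result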
@@ -633,6 +921,7 @@ class DataFrame():
633
921
  arg_info_matrix.append(["value", value, True, (int, float, str, dict, list)])
634
922
  arg_info_matrix.append(["columns", columns, True, (list, str, tuple)])
635
923
  arg_info_matrix.append(["literal_value", literal_value, True, (bool)])
924
+ arg_info_matrix.append(["partition_column", partition_column, True, (str)])
636
925
 
637
926
  # Validate argument types
638
927
  _Validators._validate_function_arguments(arg_info_matrix)
@@ -704,9 +993,15 @@ class DataFrame():
704
993
  literals=literals,
705
994
  literals_columns=literals_columns,
706
995
  stats=stats,
707
- stats_columns=stats_columns)
996
+ stats_columns=stats_columns,
997
+ partition_column=partition_column)
708
998
 
709
- return fit_obj.transform(data=self).result
999
+ impute_transform = {
1000
+ 'data': self,
1001
+ 'data_partition_column': partition_column,
1002
+ 'object_partition_column': partition_column}
1003
+
1004
+ return fit_obj.transform(**impute_transform).result
710
1005
 
711
1006
  def __execute_node_and_set_table_name(self, nodeid, metaexpr=None):
712
1007
  """
@@ -789,7 +1084,10 @@ class DataFrame():
789
1084
  Private method for setting _metaexpr and retrieving column names and types.
790
1085
 
791
1086
  PARAMETERS:
792
- metaexpr - Parent meta data (_MetaExpression object).
1087
+ metaexpr:
1088
+ Required Argument.
1089
+ Specifies parent meta data (_MetaExpression object).
1090
+ Types: _MetaExpression
793
1091
 
794
1092
  RETURNS:
795
1093
  None
@@ -802,15 +1100,19 @@ class DataFrame():
802
1100
  self._column_names_and_types = []
803
1101
  self._td_column_names_and_types = []
804
1102
  self._td_column_names_and_sqlalchemy_types = {}
805
- for col in metaexpr.c:
1103
+ self._column_types = {}
1104
+
1105
+ for col in self._metaexpr.c:
806
1106
  if isinstance(col.type, sqlalchemy.sql.sqltypes.NullType):
807
1107
  tdtype = TeradataTypes.TD_NULL_TYPE.value
808
1108
  else:
809
1109
  tdtype = "{}".format(col.type)
810
1110
 
811
- self._column_names_and_types.append((str(col.name), UtilFuncs._teradata_type_to_python_type(col.type)))
1111
+ py_type = UtilFuncs._teradata_type_to_python_type(col.type)
1112
+ self._column_names_and_types.append((str(col.name), py_type))
812
1113
  self._td_column_names_and_types.append((str(col.name), tdtype))
813
1114
  self._td_column_names_and_sqlalchemy_types[(str(col.name)).lower()] = col.type
1115
+ self._column_types[(str(col.name)).lower()] = [py_type, col.type]
814
1116
 
815
1117
  def _get_metaexpr(self):
816
1118
  """
@@ -829,7 +1131,24 @@ class DataFrame():
829
1131
  meta = sqlalchemy.MetaData()
830
1132
  db_schema = UtilFuncs._extract_db_name(self._table_name)
831
1133
  db_table_name = UtilFuncs._extract_table_name(self._table_name)
832
- t = sqlalchemy.Table(db_table_name, meta, schema=db_schema, autoload_with=eng)
1134
+ if not self._datalake:
1135
+ t = sqlalchemy.Table(db_table_name, meta, schema=db_schema, autoload_with=eng)
1136
+ return _MetaExpression(t)
1137
+
1138
+ # Get metaexpression for datalake table.
1139
+ # check existence of datalake table.
1140
+ tdmlctx.get_connection().dialect.has_table(tdmlctx.get_connection(),
1141
+ self._table,
1142
+ schema=self._database,
1143
+ table_only=True,
1144
+ datalake=self._datalake)
1145
+
1146
+ # Extract column names and corresponding teradatasqlalchemy types.
1147
+ col_names, col_types = df_utils._get_datalake_table_columns_info(self._database,
1148
+ self._table,
1149
+ self._datalake)
1150
+ t = sqlalchemy.Table(self._table, meta, schema=self._database,
1151
+ *(Column(col_name, col_type) for col_name, col_type in zip(col_names, col_types)))
833
1152
  return _MetaExpression(t)
834
1153
 
835
1154
  def __getattr__(self, name):
@@ -2066,7 +2385,7 @@ class DataFrame():
2066
2385
  else:
2067
2386
  col_filters = col_names
2068
2387
 
2069
- col_filters_decode = ["decode(\"{}\", null, 0, 1)".format(col_name) for col_name in col_filters]
2388
+ col_filters_decode = ["CASE WHEN \"{}\" IS NULL THEN 0 ELSE 1 END".format(col_name) for col_name in col_filters]
2070
2389
  fmt_filter = " + ".join(col_filters_decode)
2071
2390
 
2072
2391
  if thresh is not None:
@@ -2605,9 +2924,10 @@ class DataFrame():
2605
2924
  msg = Messages.get_message(errcode)
2606
2925
  raise TeradataMlException(msg, errcode)
2607
2926
 
2927
+ @argument_deprecation("20.0.0.5", "include", False, None)
2608
2928
  @collect_queryband(queryband="DF_describe")
2609
2929
  def describe(self, percentiles=[.25, .5, .75], include=None, verbose=False, distinct=False, statistics=None,
2610
- columns=None):
2930
+ columns=None, pivot=False):
2611
2931
  """
2612
2932
  DESCRIPTION:
2613
2933
  Generates statistics for numeric columns. This function can be used in two modes:
@@ -2639,12 +2959,12 @@ class DataFrame():
2639
2959
  include:
2640
2960
  Optional Argument.
2641
2961
  Values can be either None or "all".
2642
- If the value is "all", then both numeric and non-numeric columns are included.
2962
+ If the value is "all", both numeric and non-numeric columns are included.
2643
2963
  Computes count, mean, std, min, percentiles, and max for numeric columns.
2644
2964
  Computes count and unique for non-numeric columns.
2645
2965
  If the value is None, only numeric columns are used for collecting statistics.
2646
2966
  Note:
2647
- Value 'all' is not applicable for 'Time Series Aggregate Mode'.
2967
+ * Value 'all' is not applicable for 'Time Series Aggregate Mode'.
2648
2968
  Default Values: None
2649
2969
  Types: str
2650
2970
 
@@ -2684,7 +3004,14 @@ class DataFrame():
2684
3004
  Specifies the name(s) of the columns we are collecting statistics for.
2685
3005
  Default Values: None
2686
3006
  Types: str or List of str
2687
-
3007
+
3008
+ pivot:
3009
+ Optional Argument.
3010
+ Specifies a boolean value to pivot the output.
3011
+ Note:
3012
+ * "pivot" is not supported for PTI tables.
3013
+ Default Values: False
3014
+ Types: bool
2688
3015
 
2689
3016
  RETURNS:
2690
3017
  teradataml DataFrame
@@ -2706,7 +3033,7 @@ class DataFrame():
2706
3033
  Orange Inc 210.0 None None 250 04/01/2017
2707
3034
 
2708
3035
  # Computes count, mean, std, min, percentiles, and max for numeric columns.
2709
- >>> df.describe()
3036
+ >>> df.describe(pivot=True)
2710
3037
  Apr Feb Mar Jan
2711
3038
  func
2712
3039
  count 4 6 4 4
@@ -2718,8 +3045,45 @@ class DataFrame():
2718
3045
  75% 250 207.5 158.75 162.5
2719
3046
  max 250 210 215 200
2720
3047
 
3048
+ # Computes count, mean, std, min, percentiles, and max for numeric columns with
3049
+ # default arguments.
3050
+ >>> df.describe()
3051
+ ATTRIBUTE StatName StatValue
3052
+ Jan MAXIMUM 200.0
3053
+ Jan STANDARD DEVIATION 62.91528696058958
3054
+ Jan PERCENTILES(25) 125.0
3055
+ Jan PERCENTILES(50) 150.0
3056
+ Mar COUNT 4.0
3057
+ Mar MINIMUM 95.0
3058
+ Mar MAXIMUM 215.0
3059
+ Mar MEAN 147.5
3060
+ Mar STANDARD DEVIATION 49.749371855331
3061
+ Mar PERCENTILES(25) 128.75
3062
+ Mar PERCENTILES(50) 140.0
3063
+ Apr COUNT 4.0
3064
+ Apr MINIMUM 101.0
3065
+ Apr MAXIMUM 250.0
3066
+ Apr MEAN 195.25
3067
+ Apr STANDARD DEVIATION 70.97123830585646
3068
+ Apr PERCENTILES(25) 160.25
3069
+ Apr PERCENTILES(50) 215.0
3070
+ Apr PERCENTILES(75) 250.0
3071
+ Feb COUNT 6.0
3072
+ Feb MINIMUM 90.0
3073
+ Feb MAXIMUM 210.0
3074
+ Feb MEAN 166.66666666666666
3075
+ Feb STANDARD DEVIATION 59.553897157672786
3076
+ Feb PERCENTILES(25) 117.5
3077
+ Feb PERCENTILES(50) 200.0
3078
+ Feb PERCENTILES(75) 207.5
3079
+ Mar PERCENTILES(75) 158.75
3080
+ Jan PERCENTILES(75) 162.5
3081
+ Jan MEAN 137.5
3082
+ Jan MINIMUM 50.0
3083
+ Jan COUNT 4.0
3084
+
2721
3085
  # Computes count, mean, std, min, percentiles, and max for numeric columns with 30th and 60th percentiles.
2722
- >>> df.describe(percentiles=[.3, .6])
3086
+ >>> df.describe(percentiles=[.3, .6], pivot=True)
2723
3087
  Apr Feb Mar Jan
2724
3088
  func
2725
3089
  count 4 6 4 4
@@ -2732,7 +3096,7 @@ class DataFrame():
2732
3096
 
2733
3097
  # Computes count, mean, std, min, percentiles, and max for numeric columns group by "datetime" and "Feb".
2734
3098
  >>> df1 = df.groupby(["datetime", "Feb"])
2735
- >>> df1.describe()
3099
+ >>> df1.describe(pivot=True)
2736
3100
  Jan Mar Apr
2737
3101
  datetime Feb func
2738
3102
  04/01/2017 90.0 25% 50 95 101
@@ -2760,22 +3124,6 @@ class DataFrame():
2760
3124
  min 200 215 250
2761
3125
  std None None 0
2762
3126
 
2763
- # Computes count, mean, std, min, percentiles, and max for numeric columns and
2764
- # computes count and unique for non-numeric columns
2765
- >>> df.describe(include="all")
2766
- accounts Feb Jan Mar Apr datetime
2767
- func
2768
- 25% None 117.5 125 128.75 160.25 None
2769
- 75% None 207.5 162.5 158.75 250 None
2770
- count 6 6 4 4 4 6
2771
- mean None 166.667 137.5 147.5 195.25 None
2772
- max None 210 200 215 250 None
2773
- min None 90 50 95 101 None
2774
- 50% None 200 150 140 215 None
2775
- std None 59.554 62.915 49.749 70.971 None
2776
- unique 6 None None None None 1
2777
-
2778
- #
2779
3127
  # Examples for describe() function as Time Series Aggregate.
2780
3128
  #
2781
3129
  >>> # Load the example datasets.
@@ -2958,7 +3306,7 @@ class DataFrame():
2958
3306
  >>>
2959
3307
  """
2960
3308
 
2961
- # Argument validations
3309
+ # -------------Argument validations---------------#
2962
3310
  awu_matrix = []
2963
3311
  awu_matrix.append(["columns", columns, True, (str, list), True])
2964
3312
  awu_matrix.append(["percentiles", percentiles, True, (float, list)])
@@ -2967,6 +3315,7 @@ class DataFrame():
2967
3315
  awu_matrix.append(["distinct", distinct, True, (bool)])
2968
3316
  awu_matrix.append(["statistics", statistics, True, (str, list), True,
2969
3317
  ["count", "mean", "min", "max", "unique", "std", "describe", "percentile"]])
3318
+ awu_matrix.append(["pivot", pivot, True, (bool)])
2970
3319
 
2971
3320
  # Validate argument types
2972
3321
  _Validators._validate_function_arguments(awu_matrix)
@@ -3010,22 +3359,27 @@ class DataFrame():
3010
3359
  if verbose and not isinstance(self, DataFrameGroupByTime):
3011
3360
  raise ValueError(Messages.get_message(MessageCodes.ARG_VALUE_CLASS_DEPENDENCY).format(
3012
3361
  'verbose', 'Aggregation', 'True', 'describe()', 'DataFrameGroupByTime'))
3362
+ # -------------End of argument validations---------------#
3013
3363
 
3014
3364
  function_label = "func"
3365
+ sort_cols = []
3015
3366
  try:
3016
3367
  self.__execute_node_and_set_table_name(self._nodeid)
3017
3368
 
3018
3369
  groupby_column_list = None
3019
- if isinstance(self, DataFrameGroupBy):
3370
+ if isinstance(self, DataFrameGroupByTime) or isinstance(self, DataFrameGroupBy):
3020
3371
  groupby_column_list = self.groupby_column_list
3021
- df_utils._invalid_describe_column(df=self, columns=columns, metaexpr=self._metaexpr,
3022
- groupby_column_list=groupby_column_list)
3372
+ if columns:
3373
+ df_utils._validate_describe_columns(columns=columns, metaexpr=self._metaexpr,
3374
+ groupby_column_list=groupby_column_list)
3375
+ sort_cols = list(groupby_column_list)
3023
3376
 
3024
- if isinstance(self, DataFrameGroupByTime):
3025
- groupby_column_list = self.groupby_column_list
3026
- df_utils._invalid_describe_column(df=self, columns=columns, metaexpr=self._metaexpr,
3027
- groupby_column_list=groupby_column_list)
3377
+ # 'func' column will be always there in result.
3378
+ sort_cols.append(function_label)
3028
3379
 
3380
+ # Handle DataFrameGroupByTime using union all approach and
3381
+ # other DataFrames using TD_UnivariateStatistics approach.
3382
+ if isinstance(self, DataFrameGroupByTime):
3029
3383
  # Construct the aggregate query.
3030
3384
  agg_query = df_utils._construct_describe_query(df=self, columns=columns, metaexpr=self._metaexpr,
3031
3385
  percentiles=percentiles, function_label=function_label,
@@ -3037,29 +3391,99 @@ class DataFrame():
3037
3391
  timecode_column=self._timecode_column,
3038
3392
  sequence_column=self._sequence_column,
3039
3393
  fill=self._fill)
3394
+
3395
+ if groupby_column_list is not None:
3396
+ df = DataFrame.from_query(agg_query, index_label=sort_cols)
3397
+ df2 = df.sort(sort_cols)
3398
+ df2._metaexpr._n_rows = 100
3399
+ describe_df = df2
3400
+ else:
3401
+ describe_df = DataFrame.from_query(agg_query, index_label=function_label)
3402
+
3403
+ # Check if numeric overflow can occur for result DataFrame.
3404
+ if self._check_numeric_overflow(describe_df):
3405
+ result_df = self._promote_dataframe_types()
3406
+ describe_df = result_df.describe(pivot=True)
3407
+ return describe_df
3408
+
3040
3409
  else:
3041
- # Construct the aggregate query.
3042
- agg_query = df_utils._construct_describe_query(df=self, columns=columns, metaexpr=self._metaexpr,
3043
- percentiles=percentiles, function_label=function_label,
3044
- groupby_column_list=groupby_column_list, include=include,
3045
- is_time_series_aggregate=False, verbose=verbose,
3046
- distinct=distinct, statistics=statistics)
3047
-
3048
- if groupby_column_list is not None:
3049
- sort_cols = [i for i in groupby_column_list]
3050
- sort_cols.append(function_label)
3051
- df = DataFrame.from_query(agg_query, index_label=sort_cols)
3052
- df2 = df.sort(sort_cols)
3053
- df2._metaexpr._n_rows = 100
3054
- describe_df = df2
3055
- else:
3056
- describe_df = DataFrame.from_query(agg_query, index_label=function_label)
3410
+ # If pivot is True, then construct the aggregate query and return the result DataFrame.
3411
+ # Otherwise, return the result DataFrame in the regular aggregate mode using UnivariateStatistics.
3412
+
3413
+ if pivot:
3414
+ # Construct the aggregate query.
3415
+ agg_query = df_utils._construct_describe_query(df=self, columns=columns, metaexpr=self._metaexpr,
3416
+ percentiles=percentiles, function_label=function_label,
3417
+ groupby_column_list=groupby_column_list, include=include,
3418
+ is_time_series_aggregate=False, verbose=verbose,
3419
+ distinct=distinct, statistics=statistics)
3420
+
3421
+ if groupby_column_list is not None:
3422
+ sort_cols = [i for i in groupby_column_list]
3423
+ sort_cols.append(function_label)
3424
+ df = DataFrame.from_query(agg_query, index_label=sort_cols)
3425
+ df2 = df.sort(sort_cols)
3426
+ df2._metaexpr._n_rows = 100
3427
+ describe_df = df2
3428
+ else:
3429
+ describe_df = DataFrame.from_query(agg_query, index_label=function_label)
3430
+
3431
+ # Check if numeric overflow can occur for result DataFrame.
3432
+ if self._check_numeric_overflow(describe_df):
3433
+ result_df = self._promote_dataframe_types()
3434
+ describe_df = result_df.describe(pivot=True)
3435
+
3436
+ return describe_df
3437
+
3438
+ # If columns is None, then all dataframe columns are considered.
3439
+ if columns is None:
3440
+ columns = self.columns
3441
+ # Exclude groupby columns
3442
+ if groupby_column_list is not None:
3443
+ columns = [col for col in columns if col not in groupby_column_list]
3444
+
3445
+ numeric_cols = []
3446
+
3447
+ # Extract numeric columns and their types of all columns
3448
+ for col in self._metaexpr.c:
3449
+ if type(col.type) in UtilFuncs()._get_numeric_datatypes() and \
3450
+ col.name in columns:
3451
+ numeric_cols.append(col.name)
3452
+
3453
+ if numeric_cols:
3454
+ # Default statistics for 'Regular Aggregate Mode'
3455
+ sql_stat = ["COUNT", "MAXIMUM", "MEAN", "MINIMUM", "PERCENTILES", "STANDARD DEVIATION"]
3456
+
3457
+ if statistics is not None:
3458
+ py_to_sql_func_map = {"count": "COUNT",
3459
+ "max": "MAXIMUM",
3460
+ "mean": "MEAN",
3461
+ "unique": 'UNIQUE ENTITY COUNT',
3462
+ "min": "MINIMUM",
3463
+ "percentile": "PERCENTILES",
3464
+ "std": "STANDARD DEVIATION"}
3465
+ # Convert statistics into corresponding SQL function names
3466
+ sql_stat = [py_to_sql_func_map[stat] for stat in UtilFuncs()._as_list(statistics)]
3467
+
3468
+ # Convert percentiles to centiles for univariate statistics
3469
+ centiles = list(map(lambda n: int(n * 100), percentiles))
3470
+
3471
+ # UnivariateStatistics parameters
3472
+ univar_param = {
3473
+ "newdata": self.select(self.columns),
3474
+ "target_columns": numeric_cols,
3475
+ "partition_columns": groupby_column_list,
3476
+ "centiles": centiles,
3477
+ "stats": sql_stat
3478
+ }
3479
+
3480
+ from teradataml import UnivariateStatistics
3481
+ # Run UnivariateStatistics
3482
+ aggr_df = UnivariateStatistics(**univar_param).result
3483
+
3484
+ # Return the result in teradataml format
3485
+ return aggr_df
3057
3486
 
3058
- # Check if numeric overflow can occur for result DataFrame.
3059
- if self._check_numeric_overflow(describe_df):
3060
- result_df = self._promote_dataframe_types()
3061
- describe_df = result_df.describe()
3062
- return describe_df
3063
3487
  except TeradataMlException:
3064
3488
  raise
3065
3489
  except Exception as err:
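In the non-pivot path added above, describe() now delegates the regular aggregate mode to the TD_UnivariateStatistics SQLE function instead of building a UNION ALL query. A hedged sketch of the roughly equivalent direct call, reusing the parameter names from the univar_param dictionary in this hunk:

>>> from teradataml import UnivariateStatistics
>>> us = UnivariateStatistics(newdata=df.select(df.columns),
...                           target_columns=["Feb", "Jan", "Mar", "Apr"],
...                           centiles=[25, 50, 75],
...                           stats=["COUNT", "MAXIMUM", "MEAN", "MINIMUM",
...                                  "PERCENTILES", "STANDARD DEVIATION"])
>>> us.result   # long-format statistics, similar to the default df.describe() output shown earlier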
@@ -5555,7 +5979,7 @@ class DataFrame():
5555
5979
  try:
5556
5980
  # Printing the DF will actually run underlying select query and
5557
5981
  # will brought up numeric overflow if any. Only materializing won't work.
5558
- print(result_df)
5982
+ repr(result_df)
5559
5983
  return False
5560
5984
  except TeradataMlException as tme:
5561
5985
  if "Numeric overflow occurred during computation" in str(tme):
@@ -5642,7 +6066,35 @@ class DataFrame():
5642
6066
 
5643
6067
  def _repr_html_(self):
5644
6068
  """ Print method for teradataml for iPython rich display. """
6069
+ self._generate_output_html()
6070
+ if display.enable_ui:
6071
+ # EDA Ui widget representation using teradatamlwidgets
6072
+ if self._eda_ui is None:
6073
+ from teradatamlwidgets.eda.Ui import Ui
6074
+ self._eda_ui = Ui(df=self, html=self.html)
6075
+ else:
6076
+ self._eda_ui.display_ui()
6077
+ return self.html
6078
+
6079
+ def get_eda_ui(self):
6080
+ """
6081
+ Returns the EDA representation UI.
6082
+
6083
+ PARAMETERS:
6084
+ None.
6085
+
6086
+ EXCEPTIONS:
6087
+ None.
6088
+
6089
+ RETURNS:
6090
+ teradatamlwidgets.eda.Ui
6091
+
6092
+ EXAMPLE:
6093
+ ui = df.get_eda_ui()
6094
+ """
6095
+ return self._eda_ui
5645
6096
 
6097
+ def _generate_output_html(self, disable_types=True):
5646
6098
  # Check if class attributes __data and __data_columns are not None.
5647
6099
  # If not None, reuse the data and columns.
5648
6100
  # If None, generate latest results.
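The _repr_html_ rework above hooks the notebook display into the optional teradatamlwidgets EDA UI when display.enable_ui is set. A sketch of the intended flow; the enable_ui toggle and the widget behaviour are read off the code above and depend on the separate teradatamlwidgets package being installed:

>>> from teradataml.options.display import display
>>> display.enable_ui = True      # assumed flag, as checked by _repr_html_
>>> df                            # in Jupyter, renders the EDA widget and the HTML table
>>> ui = df.get_eda_ui()          # handle to the widget
>>> result = df.get_output(0)     # DataFrame produced from the Analyze tab, if any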
@@ -5655,17 +6107,25 @@ class DataFrame():
5655
6107
  dindent = indent + indent
5656
6108
 
5657
6109
  header_html = ['<style type="text/css">',
5658
- 'table {border:ridge 5px;}',
6110
+ 'table { border:ridge 5px}',
5659
6111
  'table td {border:inset 1px;}',
5660
- 'table tr#HeaderRow {background-color:grey; color:white;}'
6112
+ 'table tr#HeaderRow {background-color:grey; color:white;}',
5661
6113
  '</style>\n'
5662
6114
  ]
5663
6115
  html = "\n{0}".format(indent).join(header_html)
5664
- html += '<html><table>\n{0}<tr id="HeaderRow">\n'.format(indent)
6116
+ html += '<html><table style="min-width:1000px;">\n{0}<tr id="HeaderRow">\n'.format(indent)
5665
6117
 
5666
- columns_html = "</th>\n{0}<th>".format(dindent).join(self.__data_columns)
5667
- html += "{0}<th>{1}</th>\n".format(dindent, columns_html)
5668
- html += "{0}</tr>\n".format(indent)
6118
+ columns_html = "</th><th>".join(self.__data_columns)
6119
+ html += "<th>{0}</th>\n".format(columns_html)
6120
+ html += "</tr>\n"
6121
+
6122
+ if not disable_types:
6123
+ html += '<tr>\n'.format(indent)
6124
+ col_types = [repr(self._td_column_names_and_sqlalchemy_types[column]) for column in
6125
+ self.__data_columns]
6126
+ columns_types_html = "</td>\n{0}<td>".format(dindent).join(col_types)
6127
+ html += "{0}<td>{1}</td>\n".format(dindent, columns_types_html)
6128
+ html += "{0}</tr>\n".format(indent)
5669
6129
 
5670
6130
  for row in self.__data:
5671
6131
  row_html = ["{0}<td>{1}</td>\n".format(dindent,
@@ -5673,8 +6133,31 @@ class DataFrame():
5673
6133
  html += "{1}<tr>\n{0}{1}</tr>\n".format("".join(row_html), indent)
5674
6134
 
5675
6135
  html += "</table></html>"
6136
+ self.html = html
6137
+
6138
+ def get_output(self, output_index=0):
6139
+ """
6140
+ DESCRIPTION:
6141
+ Returns the result of analytic function when analytic function is
6142
+ run from 'Analyze' tab in EDA UI.
6143
+ Note:
6144
+ * The function does not return anything if analytic function is
6145
+ not run from EDA UI.
5676
6146
 
5677
- return html
6147
+ PARAMETERS:
6148
+ output_index:
6149
+ Optional Argument.
6150
+ Specifies the index of the output dataframe to be returned.
6151
+ Default Value: 0
6152
+ Types: int
6153
+
6154
+ RAISES:
6155
+ IndexError
6156
+
6157
+ RETURNS:
6158
+ teradataml DataFrame object.
6159
+ """
6160
+ return self._eda_ui.get_output_dataframe(output_index=output_index)
5678
6161
 
5679
6162
  def __get_data_columns(self):
5680
6163
  """
@@ -6019,6 +6502,8 @@ class DataFrame():
6019
6502
  * "open_sessions" specifies the number of Teradata data transfer
6020
6503
  sessions to be opened for fastexport. This argument is only applicable
6021
6504
  in fastexport mode.
6505
+ * Function returns the pandas DataFrame with Decimal column types as float instead of object.
6506
+ If you want the datatype to be object, set the argument "coerce_float" to False.
6022
6507
 
6023
6508
  Notes:
6024
6509
  1. For additional information about "coerce_float" and
@@ -6334,15 +6819,22 @@ class DataFrame():
6334
6819
  Supported join operators are =, ==, <, <=, >, >=, <> and != (= and <> operators are
6335
6820
  not supported when using DataFrame columns as operands).
6336
6821
 
6337
- Note:
6338
- 1. When multiple join conditions are given, they are joined using AND boolean
6339
- operator. Other boolean operators are not supported.
6340
- 2. Nesting of join on conditions in column expressions using & and | is not
6341
- supported. The example for unsupported nested join on conditions is:
6342
- on = [(df1.a == df1.b) & (df1.c == df1.d)]
6822
+ Notes:
6823
+ 1. When multiple join conditions are given as a list string/ColumnExpression,
6824
+ they are joined using AND operator.
6825
+ 2. Two or more on conditions can be combined using & and | operators
6826
+ and can be passed as single ColumnExpression.
6827
+ You can use (df1.a == df1.b) & (df1.c == df1.d) in place of
6828
+ [df1.a == df1.b, df1.c == df1.d].
6829
+ 3. Two or more on conditions can not be combined using pythonic 'and'
6830
+ and 'or'.
6831
+ You can use (df1.a == df1.b) & (df1.c == df1.d) in place of
6832
+ [df1.a == df1.b and df1.c == df1.d].
6833
+ 4. Performing self join using same DataFrame object in 'other'
6834
+ argument is not supported. In order to perform self join,
6835
+ first create aliased DataFrame using alias() API and pass it
6836
+ for 'other' argument. Refer to Example 10 in EXAMPLES section.
6343
6837
 
6344
- You can use [df1.a == df1.b, df1.c == df1.d] in place of
6345
- [(df1.a == df1.b) & (df1.c == df1.d)].
6346
6838
 
6347
6839
  PARAMETERS:
6348
6840
 
@@ -6370,15 +6862,20 @@ class DataFrame():
6370
6862
  is the column of left dataframe df1 and col2 is the column of right
6371
6863
  dataframe df2.
6372
6864
  Examples:
6373
- 1. [df1.a == df2.a, df1.b == df2.b] indicates df1.a = df2.a and df1.b = df2.b.
6374
- 2. [df1.a == df2.b, df1.c == df2.d] indicates df1.a = df2.b and df1.c = df2.d.
6375
- 3. [df1.a <= df2.b and df1.c > df2.d] indicates df1.a <= df2.b and df1.c > df2.d.
6376
- 4. [df1.a < df2.b and df1.c >= df2.d] indicates df1.a < df2.b and df1.c >= df2.d.
6865
+ 1. [df1.a == df2.a, df1.b == df2.b] indicates df1.a = df2.a AND df1.b = df2.b.
6866
+ 2. [df1.a == df2.b, df1.c == df2.d] indicates df1.a = df2.b AND df1.c = df2.d.
6867
+ 3. [df1.a <= df2.b & df1.c > df2.d] indicates df1.a <= df2.b AND df1.c > df2.d.
6868
+ 4. [df1.a < df2.b | df1.c >= df2.d] indicates df1.a < df2.b OR df1.c >= df2.d.
6377
6869
  5. df1.a != df2.b indicates df1.a != df2.b.
6378
6870
  • The combination of both string comparisons and comparisons as column expressions.
6379
6871
  Examples:
6380
- 1. ["a", df1.b == df2.b] indicates df1.a = df2.a and df1.b = df2.b.
6381
- 2. [df1.a <= df2.b, "c > d"] indicates df1.a <= df2.b and df1.c > df2.d.
6872
+ 1. ["a", df1.b == df2.b] indicates df1.a = df2.a AND df1.b = df2.b.
6873
+ 2. [df1.a <= df2.b, "c > d"] indicates df1.a <= df2.b AND df1.c > df2.d.
6874
+ • ColumnExpressions containing FunctionExpressions which represent SQL functions
6875
+ invoked on DataFrame Columns.
6876
+ Examples:
6877
+ 1. (df1.a.round(1) - df2.a.round(1)).mod(2.5) > 2
6878
+ 2. df1.a.floor() - df2.b.floor() > 2
6382
6879
 
6383
6880
  Types: str (or) ColumnExpression (or) List of strings(str) or ColumnExpressions
6384
6881
 
@@ -6400,7 +6897,7 @@ class DataFrame():
6400
6897
  Specifies the suffix to be added to the right table columns.
6401
6898
  Default Value: None.
6402
6899
  Types: str
6403
-
6900
+
6404
6901
  lprefix:
6405
6902
  Optional Argument.
6406
6903
  Specifies the prefix to be added to the left table columns.
@@ -6450,7 +6947,7 @@ class DataFrame():
6450
6947
  0 2 2 analytics 2.3 2.3 b analytics b
6451
6948
  1 1 1 teradata 1.3 1.3 a teradata a
6452
6949
 
6453
- # Example 2: One "on" argument condition is ColumnExpression and other is string having two
6950
+ # Example 2: One "on" argument condition is ColumnExpression and other is string having two
6454
6951
  # columns with left outer join.
6455
6952
  >>> df1.join(df2, on = [df1.col2 == df2.col4,"col5 = col7"], how = "left", lprefix = "t1", rprefix = "t2")
6456
6953
  t1_col1 t2_col1 col2 t1_col3 t2_col3 col5 col4 col7
@@ -6464,7 +6961,7 @@ class DataFrame():
6464
6961
  0 2 2 analytics 2.3 2.3 b analytics b
6465
6962
  1 1 1 teradata 1.3 1.3 a teradata a
6466
6963
 
6467
- # Example 4: One "on" argument condition is ColumnExpression and other is string having two
6964
+ # Example 4: One "on" argument condition is ColumnExpression and other is string having two
6468
6965
  # columns with full join.
6469
6966
  >>> df1.join(other = df2, on = ["col2=col4",df1.col5 == df2.col7], how = "full", lprefix = "t1", rprefix = "t2")
6470
6967
  t1_col1 t2_col1 col2 t1_col3 t2_col3 col5 col4 col7
@@ -6542,7 +7039,53 @@ class DataFrame():
6542
7039
  3 Beginner Beginner 1 3.95 Beginner 3.70 Novice 0 1 no yes
6543
7040
  3 Beginner Beginner 2 3.76 Beginner 3.70 Novice 0 1 no yes
6544
7041
  3 Beginner Novice 3 3.70 Beginner 3.70 Novice 1 1 no no
7042
+
7043
+ # Example 10: Perform self join using aliased DataFrame.
7044
+ # Create an aliased DataFrame.
7045
+ >>> lhs = DataFrame("admissions_train").head(3).sort("id")
7046
+ >>> rhs = lhs.alias("rhs")
7047
+ # Use aliased DataFrame for self join.
7048
+ >>> joined_df = lhs.join(other=rhs, how="cross", lprefix="l", rprefix="r")
7049
+ >>> joined_df
7050
+ l_id r_id l_masters r_masters l_gpa r_gpa l_stats r_stats l_programming r_programming l_admitted r_admitted
7051
+ 0 1 3 yes no 3.95 3.70 Beginner Novice Beginner Beginner 0 1
7052
+ 1 2 2 yes yes 3.76 3.76 Beginner Beginner Beginner Beginner 0 0
7053
+ 2 2 3 yes no 3.76 3.70 Beginner Novice Beginner Beginner 0 1
7054
+ 3 3 1 no yes 3.70 3.95 Novice Beginner Beginner Beginner 1 0
7055
+ 4 3 3 no no 3.70 3.70 Novice Novice Beginner Beginner 1 1
7056
+ 5 3 2 no yes 3.70 3.76 Novice Beginner Beginner Beginner 1 0
7057
+ 6 2 1 yes yes 3.76 3.95 Beginner Beginner Beginner Beginner 0 0
7058
+ 7 1 2 yes yes 3.95 3.76 Beginner Beginner Beginner Beginner 0 0
7059
+ 8 1 1 yes yes 3.95 3.95 Beginner Beginner Beginner Beginner 0 0
7060
+
7061
+ # Example 11: Perform join with compound 'on' condition having
7062
+ # more than one binary operator.
7063
+ >>> rhs_2 = lhs.assign(double_gpa=lhs.gpa * 2)
7064
+ >>> joined_df_2 = lhs.join(rhs_2, on=rhs_2.double_gpa == lhs.gpa * 2, how="left", lprefix="l", rprefix="r")
7065
+ >>> joined_df_2
7066
+ l_id r_id l_masters r_masters l_gpa r_gpa l_stats r_stats l_programming r_programming l_admitted r_admitted double_gpa
7067
+ 0 3 3 no no 3.70 3.70 Novice Novice Beginner Beginner 1 1 7.40
7068
+ 1 2 2 yes yes 3.76 3.76 Beginner Beginner Beginner Beginner 0 0 7.52
7069
+ 2 1 1 yes yes 3.95 3.95 Beginner Beginner Beginner Beginner 0 0 7.90
7070
+
7071
+ # Example 12: Perform join on DataFrames with 'on' condition
7072
+ # having FunctionExpression.
7073
+ >>> df = DataFrame("admissions_train")
7074
+ >>> df2 = df.alias("rhs_df")
7075
+ >>> joined_df_3 = df.join(df2, on=(df.gpa.round(1) - df2.gpa.round(1)).mod(2.5) > 2,
7076
+            ...                        how="inner", lprefix="l")
7077
+ >>> joined_df_3.sort(["id", "l_id"])
7078
+ l_id id l_masters masters l_gpa gpa l_stats stats l_programming programming l_admitted admitted
7079
+ 0 1 24 yes no 3.95 1.87 Beginner Advanced Beginner Novice 0 1
7080
+ 1 13 24 no no 4.0 1.87 Advanced Advanced Novice Novice 1 1
7081
+ 2 15 24 yes no 4.0 1.87 Advanced Advanced Advanced Novice 1 1
7082
+ 3 25 24 no no 3.96 1.87 Advanced Advanced Advanced Novice 1 1
7083
+ 4 27 24 yes no 3.96 1.87 Advanced Advanced Advanced Novice 0 1
7084
+ 5 29 24 yes no 4.0 1.87 Novice Advanced Beginner Novice 0 1
7085
+ 6 40 24 yes no 3.95 1.87 Novice Advanced Beginner Novice 0 1
7086
+
6545
7087
  """
7088
+
6546
7089
  # Argument validations
6547
7090
  awu_matrix = []
6548
7091
  awu_matrix.append(["other", other, False, (DataFrame)])
@@ -6556,17 +7099,11 @@ class DataFrame():
6556
7099
  # Validate argument types
6557
7100
  _Validators._validate_function_arguments(awu_matrix)
6558
7101
 
6559
- # If user has not provided suffix argument(s), then prefix argument(s) value(s) are passed by
6560
- # user hence we will set the affix variables (laffix and raffix) with provided value(s).
6561
- # affix_type is also set appropriately.
6562
- if lsuffix is not None or rsuffix is not None:
6563
- laffix = lsuffix
6564
- raffix = rsuffix
6565
- affix_type = "suffix"
6566
- else:
6567
- laffix = lprefix
6568
- raffix = rprefix
6569
- affix_type = "prefix"
7102
+ # If self and other DataFrames are pointing to same Table object,
7103
+ # raise error.
7104
+ if self._metaexpr.t is other._metaexpr.t:
7105
+ raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_ALIAS_REQUIRED, "join"),
7106
+ MessageCodes.TDMLDF_ALIAS_REQUIRED)
6570
7107
 
6571
7108
  how_lc = how.lower()
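A minimal standalone sketch of why the identity check above demands an alias for self joins, using hypothetical stand-in classes rather than the teradataml types: an aliased DataFrame wraps a distinct table object, so the `is` comparison no longer matches.

    class _Table:
        pass

    class _Frame:
        def __init__(self, table):
            self.table = table
        def alias(self):
            # An alias exposes the same data through a new table object.
            return _Frame(_Table())

    lhs = _Frame(_Table())
    same = lhs                     # no alias: both sides share one table object
    aliased = lhs.alias()          # alias: a distinct table object

    assert lhs.table is same.table          # the condition under which the real join raises TDMLDF_ALIAS_REQUIRED
    assert lhs.table is not aliased.table   # the condition under which the join proceeds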
6572
7109
 
@@ -6584,12 +7121,33 @@ class DataFrame():
6584
7121
  for col in other.columns:
6585
7122
  other_columns_lower_actual_map[col.lower()] = col
6586
7123
 
6587
- for column in self_columns_lower_actual_map.keys():
6588
- if column in other_columns_lower_actual_map.keys():
6589
- if laffix is None and raffix is None:
6590
- raise TeradataMlException(
6591
- Messages.get_message(MessageCodes.TDMLDF_REQUIRED_TABLE_ALIAS),
6592
- MessageCodes.TDMLDF_REQUIRED_TABLE_ALIAS)
7124
+ # Set the affix variables (laffix and raffix) with provided value(s)
7125
+ # of lsuffix, rsuffix, lprefix and rprefix.
7126
+ # Also set affix_type appropriately.
7127
+ laffix = None
7128
+ raffix = None
7129
+ affix_type = None
7130
+ if lsuffix is not None or rsuffix is not None:
7131
+ laffix = lsuffix
7132
+ raffix = rsuffix
7133
+ affix_type = "suffix"
7134
+ elif lprefix is not None or rprefix is not None:
7135
+ laffix = lprefix
7136
+ raffix = rprefix
7137
+ affix_type = "prefix"
7138
+
7139
+ # Same column names can be present in two dataframes involved
7140
+        # in a join operation in the following two cases:
7141
+ # Case 1: Self join.
7142
+ # Case 2: Two tables having common column names.
7143
+ # In any case, at least one kind of affix is required to generate
7144
+ # distinct column names in resultant table. Throw error if no affix
7145
+ # is available.
7146
+ if not set(self_columns_lower_actual_map.keys()).isdisjoint(other_columns_lower_actual_map.keys()):
7147
+ if affix_type is None:
7148
+ raise TeradataMlException(
7149
+ Messages.get_message(MessageCodes.TDMLDF_REQUIRED_TABLE_ALIAS),
7150
+ MessageCodes.TDMLDF_REQUIRED_TABLE_ALIAS)
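A hedged standalone sketch of the affix precedence applied above: suffix arguments win over prefix arguments when both are supplied, and no affix at all leaves overlapping names unresolved, which is what triggers the error. The helper name is illustrative only.

    def pick_affix(lsuffix=None, rsuffix=None, lprefix=None, rprefix=None):
        # Mirrors the precedence above: suffix arguments are considered first.
        if lsuffix is not None or rsuffix is not None:
            return lsuffix, rsuffix, "suffix"
        if lprefix is not None or rprefix is not None:
            return lprefix, rprefix, "prefix"
        return None, None, None

    assert pick_affix(lprefix="t1", rprefix="t2") == ("t1", "t2", "prefix")
    assert pick_affix(lsuffix="l", lprefix="t1") == ("l", None, "suffix")
    assert pick_affix() == (None, None, None)   # with overlapping columns, this is the case that raises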
6593
7151
 
6594
7152
  # Both affixes should not be equal to perform join.
6595
7153
  if laffix == raffix and laffix is not None:
@@ -6598,115 +7156,159 @@ class DataFrame():
6598
7156
  "'l{affix_type}' and 'r{affix_type}'".format(affix_type=affix_type)),
6599
7157
  MessageCodes.TDMLDF_INVALID_TABLE_ALIAS)
6600
7158
 
6601
- if how_lc != "cross":
6602
- if isinstance(on, str) or isinstance(on, ColumnExpression):
6603
- on = [on]
6604
-
6605
- all_join_conditions = []
6606
- invalid_join_conditions = []
6607
- # Forming join condition
6608
- for condition in on:
6609
- ori_condition = condition
6610
-
6611
- if not isinstance(condition, (ColumnExpression, str)):
6612
- invalid_join_conditions.append(condition)
6613
- continue
6614
-
6615
- # Process only when the on condition is string or a ColumnExpression
6616
- if isinstance(condition, ColumnExpression):
6617
- columns = condition.original_column_expr
6618
- condition = condition.compile()
6619
-
6620
- for op in TeradataConstants.TERADATA_JOIN_OPERATORS.value:
6621
- if op in condition:
6622
- conditional_separator = op
6623
- break
6624
- else:
6625
- # If no join condition is mentioned, default is taken as equal.
6626
- # If on is ['a'], then it is equal to 'df1.a = df2.a'
6627
- columns = [condition, condition]
6628
- condition = "{0} = {0}".format(condition)
6629
- conditional_separator = "="
6630
-
6631
- if isinstance(ori_condition, str):
6632
- columns = [column.strip() for column in condition.split(sep=conditional_separator)
6633
- if len(column) > 0]
6634
-
6635
- if len(columns) != 2:
6636
- invalid_join_conditions.append(condition)
6637
- else:
6638
- left_col = self.__add_alias_to_column(columns[0], self, laffix if laffix is not None else "df1")
6639
- right_col = self.__add_alias_to_column(columns[1], other, raffix if raffix is not None else "df2")
6640
- if conditional_separator == "!=":
6641
- # "!=" is python way of expressing 'not equal to'. "<>" is Teradata way of
6642
- # expressing 'not equal to'. Adding support for "!=".
6643
- conditional_separator = "<>"
6644
- all_join_conditions.append('{0} {1} {2}'.format(left_col, conditional_separator, right_col))
6645
-
6646
- if len(invalid_join_conditions) > 0:
6647
- raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_INVALID_JOIN_CONDITION,
6648
- ", ".join(invalid_join_conditions)),
6649
- MessageCodes.TDMLDF_INVALID_JOIN_CONDITION)
6650
-
6651
- join_condition = " and ".join(all_join_conditions)
6652
- else:
6653
- join_condition = ""
6654
-
6655
- df1_columns_types = df_utils._get_required_columns_types_from_metaexpr(self._metaexpr)
6656
- df2_columns_types = df_utils._get_required_columns_types_from_metaexpr(other._metaexpr)
6657
-
6658
- select_columns = []
6659
- new_metaexpr_columns_types = OrderedDict()
6660
-
6661
- for column in self.columns:
6662
- if df_utils._check_column_exists(column.lower(), other_columns_lower_actual_map.keys()):
6663
- # Check if column found in other DataFrame has same case or different.
6664
- # Return the column name from the other DataFrame.
6665
- other_column = other_columns_lower_actual_map[column.lower()]
6666
-
6667
- df1_column_with_affix = self.__check_and_return_new_column_name(laffix, other_column,
6668
- other_columns_lower_actual_map.keys(),
6669
- "right", affix_type)
6670
- select_columns.append("{0} as {1}".format(
6671
- self.__get_fully_qualified_col_name(other_column, "df1" if laffix is None else laffix),
6672
- df1_column_with_affix))
6673
-
6674
- df2_column_with_affix = self.__check_and_return_new_column_name(raffix, column,
6675
- self_columns_lower_actual_map.keys(),
6676
- "left", affix_type)
6677
- select_columns.append("{0} as {1}".format(
6678
- self.__get_fully_qualified_col_name(column, "df2" if raffix is None else raffix),
6679
- df2_column_with_affix))
6680
-
6681
- # As we are creating new column name, adding it to new metadata dict for new dataframe from join.
6682
- self.__add_column_type_item_to_dict(new_metaexpr_columns_types,
6683
- UtilFuncs._teradata_unquote_arg(df1_column_with_affix, "\""),
6684
- column, df1_columns_types)
6685
-
6686
- self.__add_column_type_item_to_dict(new_metaexpr_columns_types,
6687
- UtilFuncs._teradata_unquote_arg(df2_column_with_affix, "\""),
6688
- other_column, df2_columns_types)
6689
-
7159
+ try:
7160
+ # Set an attribute named '_join_alias' to underlying SQLAlchemy table objects
7161
+ # and use it as default alias for compiling.
7162
+ setattr(self._metaexpr.t, "_join_alias", "lhs")
7163
+ setattr(other._metaexpr.t, "_join_alias", "rhs")
7164
+ lhs_alias = "lhs"
7165
+ rhs_alias = "rhs"
7166
+
7167
+ # Step 1: Generate the on clause string.
7168
+ if how_lc != "cross":
7169
+ on = UtilFuncs._as_list(on)
7170
+
7171
+ all_join_conditions = []
7172
+ invalid_join_conditions = []
7173
+ # Forming join condition
7174
+ for condition in on:
7175
+ # Process only when the on condition is either a string or a ColumnExpression.
7176
+ if not isinstance(condition, (ColumnExpression, str)):
7177
+ invalid_join_conditions.append(condition)
7178
+ continue
7179
+
7180
+ # Generate final on clause string from string representation of condition.
7181
+ if isinstance(condition, str):
7182
+ # Process the string manually.
7183
+ # 1. Parse the string to get operator.
7184
+ for op in TeradataConstants.TERADATA_JOIN_OPERATORS.value:
7185
+ if op in condition:
7186
+ conditional_separator = op
7187
+ break
7188
+ else:
7189
+ # If no join condition is mentioned, then string represents the column.
7190
+ # In this case, default operator is taken as equal.
7191
+ # If on is ['a'], then it is equal to 'lhs.a = rhs.a'
7192
+ columns = [condition, condition]
7193
+ condition = "{0} = {0}".format(condition)
7194
+ conditional_separator = "="
7195
+ # 2. Split the string using operator and extract LHS and RHS
7196
+ # columns from a binary expression.
7197
+ columns = [column.strip() for column in condition.split(sep=conditional_separator)
7198
+ if len(column) > 0]
7199
+
7200
+ if len(columns) != 2:
7201
+ invalid_join_conditions.append(condition)
7202
+                            # TODO: Raise the exception right here.
7203
+ else:
7204
+ # 3. Generate fully qualified names using affix and table alias
7205
+ # and create final on clause condition string.
7206
+ left_col = self.__add_alias_to_column(columns[0], self, lhs_alias)
7207
+ right_col = self.__add_alias_to_column(columns[1], other, rhs_alias)
7208
+ if conditional_separator == "!=":
7209
+ # "!=" is python way of expressing 'not equal to'. "<>" is Teradata way of
7210
+ # expressing 'not equal to'. Adding support for "!=".
7211
+ conditional_separator = "<>"
7212
+ all_join_conditions.append(
7213
+ '{0} {1} {2}'.format(left_col, conditional_separator, right_col))
7214
+
7215
+ # Generate on clause string from column expression.
7216
+ if isinstance(condition, ColumnExpression):
7217
+ compiled_condition = condition.compile(compile_kwargs={'include_table': True,
7218
+ 'literal_binds': True,
7219
+ 'table_name_kind': '_join_alias',
7220
+ 'compile_with_caller_table': True,
7221
+ 'table_only': True})
7222
+
7223
+ all_join_conditions.append(compiled_condition)
7224
+
7225
+ # Raise error if invalid on conditions are passed.
7226
+ if len(invalid_join_conditions) > 0:
7227
+ raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_INVALID_JOIN_CONDITION,
7228
+ ", ".join(invalid_join_conditions)),
7229
+ MessageCodes.TDMLDF_INVALID_JOIN_CONDITION)
7230
+
7231
+ # Generate final on condition.
7232
+ join_condition = " and ".join(all_join_conditions)
6690
7233
  else:
6691
- # As column not present in right DataFrame, directly adding column to new metadata dict.
6692
- self.__add_column_type_item_to_dict(new_metaexpr_columns_types, column, column, df1_columns_types)
6693
- select_columns.append(UtilFuncs._teradata_quote_arg(column, "\"", False))
7234
+            # A cross join does not need an on condition.
7235
+ join_condition = ""
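A simplified standalone sketch of the string handling in Step 1 above. The operator list is abridged, and the qualification is hard-coded; the real code pulls the operators from TeradataConstants.TERADATA_JOIN_OPERATORS and qualifies names through __add_alias_to_column with the affixes.

    OPERATORS = ("<=", ">=", "<>", "!=", "=", "<", ">")

    def parse_on(condition, lhs="lhs", rhs="rhs"):
        for op in OPERATORS:
            if op in condition:
                sep = op
                break
        else:
            # A bare column name means an equi-join on that column.
            condition, sep = "{0} = {0}".format(condition), "="
        left, right = [c.strip() for c in condition.split(sep) if len(c) > 0]
        sep = "<>" if sep == "!=" else sep     # Python '!=' becomes Teradata '<>'
        return "{0}.{1} {2} {3}.{4}".format(lhs, left, sep, rhs, right)

    assert parse_on("col2=col4") == "lhs.col2 = rhs.col4"
    assert parse_on("col5") == "lhs.col5 = rhs.col5"
    assert parse_on("a != b") == "lhs.a <> rhs.b"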
6694
7236
 
6695
- for column in other.columns:
6696
- if not df_utils._check_column_exists(column.lower(), self_columns_lower_actual_map.keys()):
6697
- # As column not present in left DataFrame, directly adding column to new metadata dict.
6698
- self.__add_column_type_item_to_dict(new_metaexpr_columns_types, column, column, df2_columns_types)
6699
- select_columns.append(UtilFuncs._teradata_quote_arg(column, "\"", False))
7237
+ # Step 2: Generate the select clause string.
7238
+ # Generate new column names for overlapping column names using lsuffix, rsuffix, lprefix, rprefix.
7239
+ # Also, use table alias while addressing overlapping column names.
7240
+ lhs_columns_types = df_utils._get_required_columns_types_from_metaexpr(self._metaexpr)
7241
+ rhs_columns_types = df_utils._get_required_columns_types_from_metaexpr(other._metaexpr)
6700
7242
 
6701
- # Create a node in AED using _aed_join
6702
- join_node_id = self._aed_utils._aed_join(self._nodeid, other._nodeid, ", ".join(select_columns), how_lc,
6703
- join_condition, "df1" if laffix is None else laffix,
6704
- "df2" if raffix is None else raffix)
7243
+ select_columns = []
7244
+ new_metaexpr_columns_types = OrderedDict()
6705
7245
 
6706
- # Constructing new Metadata (_metaexpr) without DB; using dummy select_nodeid and underlying table name.
6707
- new_metaexpr = UtilFuncs._get_metaexpr_using_columns(join_node_id, new_metaexpr_columns_types.items())
7246
+ # Processing columns in LHS DF/ self DF.
7247
+ for column in self.columns:
7248
+ if df_utils._check_column_exists(column.lower(), other_columns_lower_actual_map.keys()):
7249
+ # Check if column found in other DataFrame has same case or different.
7250
+ # Return the column name from the other DataFrame.
7251
+ other_column = other_columns_lower_actual_map[column.lower()]
7252
+
7253
+                    # Check whether the column name in the LHS DataFrame is the same as a column name in the RHS DataFrame.
7254
+ # If so, generate new name for LHS DF column using provided affix.
7255
+ df1_column_with_affix = self.__check_and_return_new_column_name(laffix, other_column,
7256
+ other_columns_lower_actual_map.keys(),
7257
+ "right", affix_type)
7258
+
7259
+ # Generate select clause string for current column and append to list.
7260
+ select_columns.append("{0} as {1}".format(
7261
+ self.__get_fully_qualified_col_name(other_column, lhs_alias),
7262
+ df1_column_with_affix))
7263
+
7264
+                    # Check whether the column name in the RHS DataFrame is the same as a column name in the LHS DataFrame.
7265
+ # If so, generate new name for RHS DF column using provided affix.
7266
+ df2_column_with_affix = self.__check_and_return_new_column_name(raffix, column,
7267
+ self_columns_lower_actual_map.keys(),
7268
+ "left", affix_type)
7269
+ # Generate select clause string for current column and append to list.
7270
+ select_columns.append("{0} as {1}".format(
7271
+ self.__get_fully_qualified_col_name(column, rhs_alias),
7272
+ df2_column_with_affix))
7273
+
7274
+ # As we are creating new column name, adding it to new metadata dict for new dataframe from join.
7275
+ self.__add_column_type_item_to_dict(new_metaexpr_columns_types,
7276
+ UtilFuncs._teradata_unquote_arg(df1_column_with_affix, "\""),
7277
+ column, lhs_columns_types)
7278
+
7279
+ self.__add_column_type_item_to_dict(new_metaexpr_columns_types,
7280
+ UtilFuncs._teradata_unquote_arg(df2_column_with_affix, "\""),
7281
+ other_column, rhs_columns_types)
6708
7282
 
6709
- return self._create_dataframe_from_node(join_node_id, new_metaexpr, self._index_label)
7283
+ else:
7284
+                    # As a column with the same name is not present in the RHS DataFrame,
7285
+ # directly adding column to new metadata dict.
7286
+ self.__add_column_type_item_to_dict(new_metaexpr_columns_types, column, column, lhs_columns_types)
7287
+ select_columns.append(UtilFuncs._teradata_quote_arg(column, "\"", False))
7288
+
7289
+ # Processing columns in RHS DF/ other DF.
7290
+ # Here we will only be processing columns which are not overlapping.
7291
+ for column in other.columns:
7292
+ if not df_utils._check_column_exists(column.lower(), self_columns_lower_actual_map.keys()):
7293
+ # As column not present in left DataFrame, directly adding column to new metadata dict.
7294
+ self.__add_column_type_item_to_dict(new_metaexpr_columns_types, column, column, rhs_columns_types)
7295
+ select_columns.append(UtilFuncs._teradata_quote_arg(column, "\"", False))
7296
+
7297
+            # Step 3: Create a node in AED using _aed_join with appropriate aliases for the involved tables.
7298
+ join_node_id = self._aed_utils._aed_join(self._nodeid, other._nodeid, ", ".join(select_columns),
7299
+ how_lc, join_condition, lhs_alias, rhs_alias)
7300
+
7301
+ # Step 4: Constructing new Metadata (_metaexpr) without DB; using dummy select_nodeid
7302
+ # and underlying table name.
7303
+ new_metaexpr = UtilFuncs._get_metaexpr_using_columns(join_node_id, new_metaexpr_columns_types.items())
7304
+
7305
+ # Return a new joined dataframe.
7306
+ return self._create_dataframe_from_node(join_node_id, new_metaexpr, self._index_label)
7307
+ finally:
7308
+ # Delete the '_join_alias' attribute attached to underlying
7309
+            # SQLAlchemy table objects.
7310
+ delattr(self._metaexpr.t, "_join_alias")
7311
+ delattr(other._metaexpr.t, "_join_alias")
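A minimal sketch of the tag-and-clean-up pattern used above, with hypothetical stand-in objects: the temporary attribute is attached for the duration of the join and always removed in the finally block, even if compilation fails.

    class _Tbl:
        pass

    lhs_t, rhs_t = _Tbl(), _Tbl()
    try:
        setattr(lhs_t, "_join_alias", "lhs")
        setattr(rhs_t, "_join_alias", "rhs")
        # ... compile the join SQL using the aliases here ...
    finally:
        delattr(lhs_t, "_join_alias")
        delattr(rhs_t, "_join_alias")

    assert not hasattr(lhs_t, "_join_alias") and not hasattr(rhs_t, "_join_alias")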
6710
7312
 
6711
7313
  def __add_alias_to_column(self, column, df, alias):
6712
7314
  """
@@ -6766,7 +7368,7 @@ class DataFrame():
6766
7368
  return "{0}.{1}".format(UtilFuncs._teradata_quote_arg(alias, "\"", False),
6767
7369
  UtilFuncs._teradata_quote_arg(column, "\"", False))
6768
7370
 
6769
- def __check_and_return_new_column_name(self, affix, column, col_list, df_side, affix_type):
7371
+ def __check_and_return_new_column_name(self, affix, column, col_list, other_df_side, affix_type):
6770
7372
  """
6771
7373
        Check whether the new column name (with affix) already exists in col_list; if it exists, throws an exception, else
6772
7374
  returns new column name.
@@ -6775,7 +7377,7 @@ class DataFrame():
6775
7377
  affix - affix to be added to column.
6776
7378
  column - column name.
6777
7379
            col_list - list of columns against which the new column name is checked for existence.
6778
- df_side - Side of the dataframe.
7380
+ other_df_side - Side on which the other dataframe in current join operation resides.
6779
7381
  affix_type - Type of affix. Either "prefix" or "suffix".
6780
7382
 
6781
7383
  EXAMPLES:
@@ -6789,19 +7391,19 @@ class DataFrame():
6789
7391
  return UtilFuncs._teradata_quote_arg(column, "\"", False)
6790
7392
 
6791
7393
  # If Prefix, affix is added before column name else it is appended.
6792
- df1_column_with_affix = "{0}_{1}" if affix_type == "prefix" else "{1}_{0}"
6793
- df1_column_with_affix = df1_column_with_affix.format(affix,
6794
- UtilFuncs._teradata_unquote_arg(column, "\""))
6795
- if df_utils._check_column_exists(df1_column_with_affix.lower(), col_list):
6796
- if df_side == "right":
6797
- suffix_side = "l{}".format(affix_type)
7394
+ column_with_affix = "{0}_{1}" if affix_type == "prefix" else "{1}_{0}"
7395
+ column_with_affix = column_with_affix.format(affix,
7396
+ UtilFuncs._teradata_unquote_arg(column, "\""))
7397
+ if df_utils._check_column_exists(column_with_affix.lower(), col_list):
7398
+ if other_df_side == "right":
7399
+ affix_type = "l{}".format(affix_type)
6798
7400
  else:
6799
- suffix_side = "r{}".format(affix_type)
7401
+ affix_type = "r{}".format(affix_type)
6800
7402
  raise TeradataMlException(
6801
- Messages.get_message(MessageCodes.TDMLDF_COLUMN_ALREADY_EXISTS, df1_column_with_affix, df_side,
6802
- suffix_side),
7403
+ Messages.get_message(MessageCodes.TDMLDF_COLUMN_ALREADY_EXISTS, column_with_affix, other_df_side,
7404
+ affix_type),
6803
7405
  MessageCodes.TDMLDF_COLUMN_ALREADY_EXISTS)
6804
- return UtilFuncs._teradata_quote_arg(df1_column_with_affix, "\"", False)
7406
+ return UtilFuncs._teradata_quote_arg(column_with_affix, "\"", False)
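A hedged standalone sketch of the renaming rule above. The helper below is hypothetical; the real method also quotes the result and raises TDMLDF_COLUMN_ALREADY_EXISTS rather than ValueError on a collision.

    def new_column_name(affix, column, existing, affix_type):
        if affix is None:
            return column
        pattern = "{0}_{1}" if affix_type == "prefix" else "{1}_{0}"
        name = pattern.format(affix, column)
        if name.lower() in (c.lower() for c in existing):
            raise ValueError("column '{}' already exists".format(name))
        return name

    assert new_column_name("t1", "col1", ["col1", "col2"], "prefix") == "t1_col1"
    assert new_column_name("l", "gpa", ["gpa"], "suffix") == "gpa_l"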
6805
7407
 
6806
7408
  def __add_column_type_item_to_dict(self, new_metadata_dict, new_column, column, column_types):
6807
7409
  """
@@ -7327,21 +7929,17 @@ class DataFrame():
7327
7929
 
7328
7930
  exec_mode = 'REMOTE' if UtilFuncs._is_lake() else 'IN-DB'
7329
7931
  if exec_mode == 'REMOTE':
7330
- if _InternalBuffer.get("auth_token") is None:
7331
- raise TeradataMlException(Messages.get_message(
7332
- MessageCodes.FUNC_EXECUTION_FAILED, "'udf'", 'Authentication token is required to run udf. Set token using set_auth_token().'),
7333
- MessageCodes.FUNC_EXECUTION_FAILED)
7334
- else:
7335
- for colname, col in udf_expr.items():
7336
- env_name = UtilFuncs._get_env_name(col)
7337
- # Store the env_name and its corresponding output column
7338
- if env_name in env_mapper:
7339
- env_mapper[env_name].append(colname)
7340
- else:
7341
- env_mapper[env_name] = [colname]
7932
+ _Validators._check_auth_token("udf")
7933
+ for colname, col in udf_expr.items():
7934
+ env_name = UtilFuncs._get_env_name(col)
7935
+ # Store the env_name and its corresponding output column
7936
+ if env_name in env_mapper:
7937
+ env_mapper[env_name].append(colname)
7938
+ else:
7939
+ env_mapper[env_name] = [colname]
7342
7940
  else:
7343
7941
  env_mapper[env_name] = udf_expr.keys()
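A hedged standalone sketch of the env bucketing performed in the REMOTE branch above. The env names here are plain strings for illustration; the real code derives them from each ColumnExpression via UtilFuncs._get_env_name().

    udf_cols = {"upper_col": "env_a", "lower_col": "env_a", "len_col": "env_b"}

    env_mapper = {}
    for colname, env_name in udf_cols.items():
        if env_name in env_mapper:
            env_mapper[env_name].append(colname)
        else:
            env_mapper[env_name] = [colname]

    assert env_mapper == {"env_a": ["upper_col", "lower_col"], "env_b": ["len_col"]}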
7344
-
7942
+
7345
7943
  for env_name, cols in env_mapper.items():
7346
7944
  # Create a dictionary of output columns to column type.
7347
7945
  returns = OrderedDict([(column.name, column.type) for column in df._metaexpr.c])
@@ -7389,6 +7987,97 @@ class DataFrame():
7389
7987
  df = tbl_operators.execute()
7390
7988
  return df
7391
7989
 
7990
+ def _assign_call_udf(self, call_udf_expr):
7991
+ """
7992
+ DESCRIPTION:
7993
+ Internal function for DataFrame.assign() to execute the call_udf using
7994
+ Script/Apply Table Operator and create new column for teradataml DataFrame.
7995
+
7996
+ PARAMETER:
7997
+ call_udf_expr:
7998
+ Required Argument.
7999
+ Specifies a dictionary of column name to call_udf expressions.
8000
+ Types: dict
8001
+
8002
+ RETURNS:
8003
+ teradataml DataFrame
8004
+
8005
+ RAISES:
8006
+ None.
8007
+
8008
+ EXAMPLES:
8009
+ # call_udf_expr is a dictionary of column names to call_udf expressions.
8010
+ call_udf_expr = {'upper_col': <teradataml.dataframe.sql._SQLColumnExpression object at 0x0000028E59C44310>,
8011
+ 'sum_col': <teradataml.dataframe.sql._SQLColumnExpression object at 0x0000028E59C41690>}
8012
+            self._assign_call_udf(call_udf_expr)
8013
+ """
8014
+ df = self
8015
+ # Create a dictionary of output columns to column type (teradata type).
8016
+ returns = OrderedDict([(column.name, column.type) for column in df._metaexpr.c])
8017
+ # Create a dictionary of output columns to column type (python types).
8018
+ output_type_converters = {col_name: _Dtypes._teradata_type_to_python_type(col_type) \
8019
+ for col_name, col_type in returns.items()}
8020
+
8021
+ for colname, col in call_udf_expr.items():
8022
+ returns[colname] = col.type
8023
+ output_type_converters[colname] = _Dtypes._teradata_type_to_python_type(col.type)
8024
+ script_name = col._udf_script
8025
+ delimiter = col._delimiter
8026
+ quotechar = col._quotechar
8027
+
8028
+ # Create a dictionary of arguments to be passed to the script.
8029
+ script_data = {}
8030
+ script_data['input_cols'] = df.columns
8031
+ script_data['output_cols'] = list(returns.keys())
8032
+ script_data['output_type_converters'] = output_type_converters
8033
+ script_data['function_args'] = {colname: col._udf_args}
8034
+ script_data['delimiter'] = delimiter
8035
+ script_data['qoutechar'] = quotechar
8036
+
8037
+ # Convert the dictionary to a string.
8038
+ # The string is URL encoded to pass it as a parameter to the script.
8039
+ script_data = urllib.parse.quote_plus(json.dumps(script_data))
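A hedged sketch of the payload handling above, with illustrative keys only: the per-call metadata survives a JSON dump plus URL-encoding round trip and contains no spaces, so it can ride along as a single command-line argument to the script.

    import json
    import urllib.parse

    script_data = {"input_cols": ["accounts", "Feb"],
                   "output_cols": ["accounts", "Feb", "upper_col"],
                   "delimiter": ","}
    encoded = urllib.parse.quote_plus(json.dumps(script_data))

    assert " " not in encoded                                        # safe to embed in a command string
    assert json.loads(urllib.parse.unquote_plus(encoded)) == script_data   # round-trips losslessly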
8040
+
8041
+ if UtilFuncs._is_lake():
8042
+ from teradataml.table_operators.Apply import Apply
8043
+ apply_op_obj = Apply(data=df,
8044
+ script_name=script_name,
8045
+ env_name=col._env_name,
8046
+ returns = returns,
8047
+ delimiter = delimiter,
8048
+ quotechar=quotechar,
8049
+ files_local_path=GarbageCollector._get_temp_dir_name(),
8050
+ apply_command="python3 {} {}".format(script_name, script_data)
8051
+ )
8052
+ try:
8053
+ df = apply_op_obj.execute_script(
8054
+ output_style=OutputStyle.OUTPUT_TABLE.value)
8055
+ except Exception:
8056
+ raise
8057
+ else:
8058
+ import teradataml.context.context as context
8059
+ database = context._get_current_databasename()
8060
+
8061
+ check_reserved_keyword = False if sorted(list(returns.keys())) == sorted(df.columns) else True
8062
+
8063
+ from teradataml.table_operators.Script import Script
8064
+ table_op_obj = Script(data=df,
8065
+ script_name=script_name,
8066
+ files_local_path=GarbageCollector._get_temp_dir_name(),
8067
+ script_command="{}/bin/python3 ./{}/{} {}".format(
8068
+ configure.indb_install_location, database, script_name, script_data),
8069
+ returns=returns,
8070
+ quotechar=quotechar,
8071
+ delimiter = delimiter
8072
+ )
8073
+ table_op_obj.check_reserved_keyword = check_reserved_keyword
8074
+ try:
8075
+ df = table_op_obj.execute_script(
8076
+ output_style=OutputStyle.OUTPUT_TABLE.value)
8077
+ except Exception:
8078
+ raise
8079
+ return df
8080
+
7392
8081
  @collect_queryband(queryband="DF_assign")
7393
8082
  def assign(self, drop_columns=False, **kwargs):
7394
8083
  """
@@ -7420,7 +8109,7 @@ class DataFrame():
7420
8109
  * SQLAlchemy ClauseElements.
7421
8110
  (See teradataml extension with SQLAlchemy in teradataml User Guide
7422
8111
  and Function reference guide for more details)
7423
- * Function - udf.
8112
+ * Function - udf, call_udf.
7424
8113
 
7425
8114
 
7426
8115
  RETURNS:
@@ -7454,7 +8143,7 @@ class DataFrame():
7454
8143
  Look at Example 18 to understand more.
7455
8144
  8. While passing multiple udf expressions, one can not pass one column output
7456
8145
  as another column input in the same ``assign`` call.
7457
- 9. If user pass multiple udf expressions, delimiter and quotechar specified in
8146
+             9. If user passes multiple udf expressions, delimiter and quotechar specified in
7458
8147
  last udf expression are considered for processing.
7459
8148
 
7460
8149
  RAISES:
@@ -7819,13 +8508,13 @@ class DataFrame():
7819
8508
  Red Inc 200.0 150.0 140.0 NaN 17/01/04 201.0 abc RED INC 207
7820
8509
  >>>
7821
8510
 
7822
- # Example 19: Convert the values is 'accounts' column to upper case using a user
8511
+            # Example 19: Convert the values in 'accounts' column to upper case using a user
7823
8512
  # defined function on Vantage Cloud Lake.
7824
8513
  # Create a Python 3.10.5 environment with given name and description in Vantage.
7825
8514
  >>> env = create_env('test_udf', 'python_3.10.5', 'Test environment for UDF')
7826
8515
  User environment 'test_udf' created.
7827
8516
  >>>
7828
- # Create a user defined functions to 'to_upper' to get the values in upper case
8517
+            # Create a user defined function 'to_upper' to get the values in upper case
7829
8518
  # and pass the user env to run it on.
7830
8519
  >>> from teradataml.dataframe.functions import udf
7831
8520
  >>> @udf(env_name = env)
@@ -7837,7 +8526,31 @@ class DataFrame():
7837
8526
  # to the DataFrame.
7838
8527
  >>> df.assign(upper_stats = to_upper('accounts'))
7839
8528
  Feb Jan Mar Apr datetime upper_stats
7840
- accounts
8529
+ accounts
8530
+ Alpha Co 210.0 200.0 215.0 250.0 17/01/04 ALPHA CO
8531
+ Blue Inc 90.0 50.0 95.0 101.0 17/01/04 BLUE INC
8532
+ Yellow Inc 90.0 NaN NaN NaN 17/01/04 YELLOW INC
8533
+ Jones LLC 200.0 150.0 140.0 180.0 17/01/04 JONES LLC
8534
+ Orange Inc 210.0 NaN NaN 250.0 17/01/04 ORANGE INC
8535
+ Red Inc 200.0 150.0 140.0 NaN 17/01/04 RED INC
8536
+ >>>
8537
+
8538
+            # Example 20: Register and call the user defined function to get the values in upper case.
8539
+ >>> from teradataml.dataframe.functions import udf, register, call_udf
8540
+ >>> @udf
8541
+ ... def to_upper(s):
8542
+ ... if s is not None:
8543
+ ... return s.upper()
8544
+ >>>
8545
+ # Register the created user defined function with name "upper".
8546
+ >>> register("upper", to_upper)
8547
+ >>>
8548
+ # Call the user defined function registered with name "upper" and assign the
8549
+ # ColumnExpression returned to the DataFrame.
8550
+ >>> res = df.assign(upper_col = call_udf("upper", ('accounts',)))
8551
+ >>> res
8552
+ Feb Jan Mar Apr datetime upper_col
8553
+ accounts
7841
8554
  Alpha Co 210.0 200.0 215.0 250.0 17/01/04 ALPHA CO
7842
8555
  Blue Inc 90.0 50.0 95.0 101.0 17/01/04 BLUE INC
7843
8556
  Yellow Inc 90.0 NaN NaN NaN 17/01/04 YELLOW INC
@@ -7894,10 +8607,14 @@ class DataFrame():
7894
8607
  # column name to normal/regular expressions.
7895
8608
  udf_expr = {}
7896
8609
  regular_expr = {}
8610
+ call_udf_expr = {}
7897
8611
  for colname, col in kwargs.items():
7898
8612
  # If value passed in kwargs is a ColumnExpression and is a udf, store it.
7899
8613
  if isinstance(col, ColumnExpression) and col._udf:
7900
8614
  udf_expr[colname] = col
8615
+            # If value passed in kwargs is a ColumnExpression and is a registered udf script, store it.
8616
+ elif isinstance(col, ColumnExpression) and col._udf_script:
8617
+ call_udf_expr[colname] = col
7901
8618
  else:
7902
8619
  regular_expr[colname] = col
7903
8620
  df = self
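A hedged standalone sketch of the kwargs routing above, using stand-in objects; the real check additionally requires each value to be a ColumnExpression before inspecting _udf and _udf_script.

    class _Expr:
        def __init__(self, udf=False, udf_script=None):
            self._udf, self._udf_script = udf, udf_script

    kwargs = {"a": _Expr(udf=True), "b": _Expr(udf_script="upper.py"), "c": _Expr()}

    udf_expr, call_udf_expr, regular_expr = {}, {}, {}
    for colname, col in kwargs.items():
        if col._udf:
            udf_expr[colname] = col
        elif col._udf_script:
            call_udf_expr[colname] = col
        else:
            regular_expr[colname] = col

    assert set(udf_expr) == {"a"} and set(call_udf_expr) == {"b"} and set(regular_expr) == {"c"}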
@@ -7917,6 +8634,9 @@ class DataFrame():
7917
8634
  if bool(udf_expr):
7918
8635
  df = df._assign_udf(udf_expr)
7919
8636
 
8637
+ if bool(call_udf_expr):
8638
+ df = df._assign_call_udf(call_udf_expr)
8639
+
7920
8640
  return df
7921
8641
 
7922
8642
 
@@ -8116,7 +8836,9 @@ class DataFrame():
8116
8836
  _Validators._validate_column_exists_in_dataframe(keys, self._metaexpr)
8117
8837
 
8118
8838
  try:
8119
- new_index_list = self._index_label if self._index_label is not None else []
8839
+
8840
+ # Slicing creates a new list instance with the same contents.
8841
+ new_index_list = self._index_label[:] if self._index_label is not None else []
8120
8842
 
8121
8843
            # Creating a list with requested index labels based on append
8122
8844
  if append:
@@ -8131,7 +8853,7 @@ class DataFrame():
8131
8853
  new_index_list = keys
8132
8854
 
8133
8855
  # Takes care of appending already existing index
8134
- new_index_list = list(set(new_index_list))
8856
+ new_index_list = list(dict.fromkeys(new_index_list))
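Both calls de-duplicate, but dict.fromkeys keeps the first-seen order of the index labels while set() does not, which is presumably the motivation for the change above; a quick standalone check:

    labels = ["id", "masters", "id", "gpa"]

    assert list(dict.fromkeys(labels)) == ["id", "masters", "gpa"]   # order preserved
    assert sorted(set(labels)) == ["gpa", "id", "masters"]           # a set forgets insertion order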
8135
8857
 
8136
8858
  # In case requested index is same as existing index, return same DF
8137
8859
  if new_index_list == self._index_label:
@@ -9014,15 +9736,15 @@ class DataFrame():
9014
9736
  TypeError, ValueError, TeradataMLException
9015
9737
 
9016
9738
  EXAMPLES:
9017
- >>> # Load the example datasets.
9018
- ... load_example_data("dataframe", ["ocean_buoys", "ocean_buoys_nonpti"])
9739
+ # Load the example datasets.
9740
+ >>> load_example_data("dataframe", ["ocean_buoys", "ocean_buoys_nonpti"])
9019
9741
  >>>
9020
9742
 
9021
- >>> # Create the required DataFrames.
9022
- ... # DataFrame on non-sequenced PTI table
9023
- ... ocean_buoys = DataFrame("ocean_buoys")
9024
- >>> # Check DataFrame columns and let's peek at the data
9025
- ... ocean_buoys.columns
9743
+ # Create the required DataFrames.
9744
+ # DataFrame on non-sequenced PTI table
9745
+ >>> ocean_buoys = DataFrame("ocean_buoys")
9746
+ # Check DataFrame columns and let's peek at the data
9747
+ >>> ocean_buoys.columns
9026
9748
  ['buoyid', 'TD_TIMECODE', 'temperature', 'salinity']
9027
9749
  >>> ocean_buoys.head()
9028
9750
  TD_TIMECODE temperature salinity
@@ -9038,10 +9760,10 @@ class DataFrame():
9038
9760
  0 2014-01-06 08:00:00.000000 10.0 55
9039
9761
  0 2014-01-06 08:10:00.000000 10.0 55
9040
9762
 
9041
- >>> # DataFrame on NON-PTI table
9042
- ... ocean_buoys_nonpti = DataFrame("ocean_buoys_nonpti")
9043
- >>> # Check DataFrame columns and let's peek at the data
9044
- ... ocean_buoys_nonpti.columns
9763
+ # DataFrame on NON-PTI table
9764
+ >>> ocean_buoys_nonpti = DataFrame("ocean_buoys_nonpti")
9765
+ # Check DataFrame columns and let's peek at the data
9766
+ >>> ocean_buoys_nonpti.columns
9045
9767
  ['buoyid', 'timecode', 'temperature', 'salinity']
9046
9768
  >>> ocean_buoys_nonpti.head()
9047
9769
  buoyid temperature salinity
@@ -9553,6 +10275,12 @@ class DataFrame():
9553
10275
  # Validate argument types
9554
10276
  _Validators._validate_function_arguments(awu_matrix)
9555
10277
 
10278
+ # If self and right DataFrames are pointing to same Table object,
10279
+ # raise error.
10280
+ if self._metaexpr.t is right._metaexpr.t:
10281
+ raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_ALIAS_REQUIRED, "merge"),
10282
+ MessageCodes.TDMLDF_ALIAS_REQUIRED)
10283
+
9556
10284
  if (right_on is not None and left_on is None) or (right_on is None and left_on is not None):
9557
10285
  raise TeradataMlException(
9558
10286
  Messages.get_message(MessageCodes.MUST_PASS_ARGUMENT, "left_on", "right_on"),
@@ -9609,6 +10337,15 @@ class DataFrame():
9609
10337
  # If user did not pass any arguments which form join conditions,
9610
10338
  # Merge is performed using index columns of TeradataML DataFrames
9611
10339
  if on is None and left_on is None and right_on is None and not use_index:
10340
+            # DataFrames created on OTF table(s) do not have an index.
10341
+ if self._datalake is not None or right._datalake is not None:
10342
+ msg_code = MessageCodes.EXECUTION_FAILED
10343
+ emsg = "Either 'on' argument or both 'left_on' and 'right_on' arguments" \
10344
+ " must be provided to merge DataFrames when they are created on" \
10345
+ " OTF table(s)."
10346
+ error_msg = Messages.get_message(msg_code, "merge dataframes", emsg)
10347
+ raise TeradataMlException(error_msg, msg_code)
10348
+
9612
10349
  if self._index_label is None or right._index_label is None:
9613
10350
  raise TeradataMlException(
9614
10351
  Messages.get_message(MessageCodes.TDMLDF_INDEXES_ARE_NONE), MessageCodes.TDMLDF_INDEXES_ARE_NONE)
@@ -9616,6 +10353,12 @@ class DataFrame():
9616
10353
  use_index = True
9617
10354
 
9618
10355
  if use_index:
10356
+ if self._datalake is not None or right._datalake is not None:
10357
+ msg_code = MessageCodes.EXECUTION_FAILED
10358
+ emsg = "Can not use Index to merge DataFrames when they are created on OTF table(s)."
10359
+ error_msg = Messages.get_message(msg_code, "merge dataframes", emsg)
10360
+ raise TeradataMlException(error_msg, msg_code)
10361
+
9619
10362
  if self._index_label is None or right._index_label is None:
9620
10363
  raise TeradataMlException(
9621
10364
  Messages.get_message(MessageCodes.TDMLDF_INDEXES_ARE_NONE), MessageCodes.TDMLDF_INDEXES_ARE_NONE)
@@ -10271,7 +11014,7 @@ class DataFrame():
10271
11014
  2. seed is supported for stratify column.
10272
11015
  3. Arguments "stratify_column", "seed", "id_column" are supported only
10273
11016
  for stratifying the data.
10274
- Types: str
11017
+ Types: str OR Feature
10275
11018
 
10276
11019
  seed:
10277
11020
  Optional Argument.
@@ -10297,7 +11040,7 @@ class DataFrame():
10297
11040
  for stratifying the data.
10298
11041
  2. "id_column" is supported only when "stratify_column" is used.
10299
11042
  Ignored otherwise.
10300
- Types: str
11043
+ Types: str OR Feature
10301
11044
 
10302
11045
  RETURNS:
10303
11046
  teradataml DataFrame
@@ -12332,6 +13075,9 @@ class DataFrame():
12332
13075
  False)
12333
13076
  column_names = list(dict.fromkeys(column_names))
12334
13077
 
13078
+ if list_td_reserved_keywords(column_names) or UtilFuncs._is_ascii(column_names):
13079
+ column_names = UtilFuncs._teradata_quote_arg(column_names, "\"", False)
13080
+
12335
13081
  col_names_types = df_utils._get_required_columns_types_from_metaexpr(self._metaexpr, column_names)
12336
13082
  sel_nodeid = self._aed_utils._aed_select(self._nodeid, ','.join(column_names), True)
12337
13083
  new_metaexpr = UtilFuncs._get_metaexpr_using_columns(sel_nodeid, col_names_types.items())
@@ -14249,7 +14995,18 @@ class DataFrame():
14249
14995
  >>> plot.show()
14250
14996
 
14251
14997
  """
14252
- return _Plot(x=x, y=y, scale=scale, kind=kind, **kwargs)
14998
+
14999
+ _plot = _Plot(x=x, y=y, scale=scale, kind=kind, **kwargs)
15000
+        # Cache the plot: reuse the previously generated plot when the new request is equal to it.
15001
+ if self._plot is None:
15002
+ self._plot = _plot
15003
+ return _plot
15004
+
15005
+ if self._plot == _plot:
15006
+ return self._plot
15007
+ else:
15008
+ self._plot = _plot
15009
+ return _plot
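A hedged standalone sketch of the caching behaviour introduced above, with a stand-in Plot class: an equal request returns the cached object, a different one rebuilds and replaces it.

    class Plot:
        def __init__(self, kind):
            self.kind = kind
        def __eq__(self, other):
            return isinstance(other, Plot) and self.kind == other.kind

    class PlotCache:
        def __init__(self):
            self._plot = None
        def plot(self, kind):
            new = Plot(kind)
            if self._plot is None or self._plot != new:
                self._plot = new
            return self._plot

    cache = PlotCache()
    first = cache.plot("line")
    assert cache.plot("line") is first      # equal request: cached object reused
    assert cache.plot("bar") is not first   # different request: new plot cached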
14253
15010
 
14254
15011
  @collect_queryband(queryband="DF_itertuples")
14255
15012
  def itertuples(self, name='Row', num_rows=None):
@@ -17142,11 +17899,18 @@ class _TDUAF(DataFrame):
17142
17899
  table_name = self._db_utils._execute_node_return_db_object_name(self._data._nodeid, self._data._metaexpr)
17143
17900
 
17144
17901
  # UAF Functions do not accept double quotes.
17902
+ tdp = preparer(td_dialect)
17145
17903
  db_name = UtilFuncs._extract_db_name(table_name)
17146
- if db_name:
17147
- table_name = '"{}"."{}"'.format(db_name, UtilFuncs._extract_table_name(table_name))
17904
+ datalake_name = UtilFuncs._extract_datalake_name(table_name)
17905
+ if datalake_name:
17906
+ table_name = '{}.{}.{}'.format(tdp.quote(datalake_name),
17907
+ tdp.quote(db_name),
17908
+ tdp.quote(UtilFuncs._extract_table_name(table_name)))
17909
+ elif db_name:
17910
+ table_name = '{}.{}'.format(tdp.quote(db_name),
17911
+ tdp.quote(UtilFuncs._extract_table_name(table_name)))
17148
17912
  else:
17149
- table_name = UtilFuncs._extract_table_name(table_name)
17913
+ table_name = tdp.quote(UtilFuncs._extract_table_name(table_name))
17150
17914
 
17151
17915
  sql_clauses.append("TABLE_NAME ({})")
17152
17916
  sql_values.append(table_name)
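A hedged sketch of the three-part (datalake) versus two-part name assembly above. The quote callable is a stand-in for the dialect preparer, which only adds double quotes when the identifier actually needs them; the identity default keeps the example self-contained.

    def qualified_name(table, db=None, datalake=None, quote=lambda name: name):
        if datalake:
            return "{}.{}.{}".format(quote(datalake), quote(db), quote(table))
        if db:
            return "{}.{}".format(quote(db), quote(table))
        return quote(table)

    assert qualified_name("t1", db="db1", datalake="lake1") == "lake1.db1.t1"
    assert qualified_name("t1", db="db1") == "db1.t1"
    assert qualified_name("t1") == "t1"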