teradataml 20.0.0.3__py3-none-any.whl → 20.0.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of teradataml might be problematic.

Files changed (151)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/README.md +193 -1
  3. teradataml/__init__.py +2 -1
  4. teradataml/_version.py +2 -2
  5. teradataml/analytics/analytic_function_executor.py +25 -18
  6. teradataml/analytics/byom/__init__.py +1 -1
  7. teradataml/analytics/json_parser/analytic_functions_argument.py +4 -0
  8. teradataml/analytics/sqle/__init__.py +20 -2
  9. teradataml/analytics/utils.py +15 -1
  10. teradataml/analytics/valib.py +18 -4
  11. teradataml/automl/__init__.py +341 -112
  12. teradataml/automl/autodataprep/__init__.py +471 -0
  13. teradataml/automl/data_preparation.py +84 -42
  14. teradataml/automl/data_transformation.py +69 -33
  15. teradataml/automl/feature_engineering.py +76 -9
  16. teradataml/automl/feature_exploration.py +639 -25
  17. teradataml/automl/model_training.py +35 -14
  18. teradataml/clients/auth_client.py +2 -2
  19. teradataml/common/__init__.py +1 -2
  20. teradataml/common/constants.py +122 -63
  21. teradataml/common/messagecodes.py +14 -3
  22. teradataml/common/messages.py +8 -4
  23. teradataml/common/sqlbundle.py +40 -10
  24. teradataml/common/utils.py +366 -74
  25. teradataml/common/warnings.py +11 -0
  26. teradataml/context/context.py +348 -86
  27. teradataml/data/amazon_reviews_25.csv +26 -0
  28. teradataml/data/apriori_example.json +22 -0
  29. teradataml/data/byom_example.json +11 -0
  30. teradataml/data/docs/byom/docs/DataRobotPredict.py +2 -2
  31. teradataml/data/docs/byom/docs/DataikuPredict.py +40 -1
  32. teradataml/data/docs/byom/docs/H2OPredict.py +2 -2
  33. teradataml/data/docs/byom/docs/ONNXEmbeddings.py +242 -0
  34. teradataml/data/docs/byom/docs/ONNXPredict.py +2 -2
  35. teradataml/data/docs/byom/docs/PMMLPredict.py +2 -2
  36. teradataml/data/docs/sqle/docs_17_20/Apriori.py +138 -0
  37. teradataml/data/docs/sqle/docs_17_20/NERExtractor.py +121 -0
  38. teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +3 -3
  39. teradataml/data/docs/sqle/docs_17_20/SMOTE.py +212 -0
  40. teradataml/data/docs/sqle/docs_17_20/Shap.py +28 -6
  41. teradataml/data/docs/sqle/docs_17_20/TextMorph.py +119 -0
  42. teradataml/data/docs/sqle/docs_17_20/TextParser.py +54 -3
  43. teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -1
  44. teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +2 -2
  45. teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +2 -2
  46. teradataml/data/docs/uaf/docs_17_20/DFFT.py +1 -1
  47. teradataml/data/docs/uaf/docs_17_20/DFFT2.py +1 -1
  48. teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +1 -1
  49. teradataml/data/docs/uaf/docs_17_20/DFFTConv.py +1 -1
  50. teradataml/data/docs/uaf/docs_17_20/DWT2D.py +4 -1
  51. teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +4 -4
  52. teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +2 -2
  53. teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +2 -2
  54. teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +6 -6
  55. teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
  56. teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +1 -1
  57. teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +4 -4
  58. teradataml/data/docs/uaf/docs_17_20/MultivarRegr.py +1 -1
  59. teradataml/data/docs/uaf/docs_17_20/PACF.py +1 -1
  60. teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
  61. teradataml/data/docs/uaf/docs_17_20/PowerTransform.py +3 -3
  62. teradataml/data/docs/uaf/docs_17_20/Resample.py +5 -5
  63. teradataml/data/docs/uaf/docs_17_20/SAX.py +3 -3
  64. teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
  65. teradataml/data/docs/uaf/docs_17_20/SimpleExp.py +1 -1
  66. teradataml/data/docs/uaf/docs_17_20/Smoothma.py +3 -3
  67. teradataml/data/docs/uaf/docs_17_20/UNDIFF.py +1 -1
  68. teradataml/data/hnsw_alter_data.csv +5 -0
  69. teradataml/data/hnsw_data.csv +10 -0
  70. teradataml/data/jsons/byom/h2opredict.json +1 -1
  71. teradataml/data/jsons/byom/onnxembeddings.json +266 -0
  72. teradataml/data/jsons/sqle/17.20/NGramSplitter.json +6 -6
  73. teradataml/data/jsons/sqle/17.20/TD_Apriori.json +181 -0
  74. teradataml/data/jsons/sqle/17.20/TD_NERExtractor.json +145 -0
  75. teradataml/data/jsons/sqle/17.20/TD_SMOTE.json +267 -0
  76. teradataml/data/jsons/sqle/17.20/TD_Shap.json +0 -1
  77. teradataml/data/jsons/sqle/17.20/TD_TextMorph.json +134 -0
  78. teradataml/data/jsons/sqle/17.20/TD_TextParser.json +114 -9
  79. teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +328 -0
  80. teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +420 -0
  81. teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +343 -0
  82. teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +328 -0
  83. teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +328 -0
  84. teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +328 -0
  85. teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +328 -0
  86. teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +359 -0
  87. teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +360 -0
  88. teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +343 -0
  89. teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +343 -0
  90. teradataml/data/jsons/sqle/20.00/TD_HNSW.json +296 -0
  91. teradataml/data/jsons/sqle/20.00/TD_HNSWPredict.json +206 -0
  92. teradataml/data/jsons/sqle/20.00/TD_HNSWSummary.json +32 -0
  93. teradataml/data/jsons/sqle/20.00/TD_KMeans.json +2 -2
  94. teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +3 -3
  95. teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +6 -6
  96. teradataml/data/ner_dict.csv +8 -0
  97. teradataml/data/ner_input_eng.csv +7 -0
  98. teradataml/data/ner_rule.csv +5 -0
  99. teradataml/data/pos_input.csv +40 -0
  100. teradataml/data/tdnerextractor_example.json +14 -0
  101. teradataml/data/teradataml_example.json +21 -0
  102. teradataml/data/textmorph_example.json +5 -0
  103. teradataml/data/to_num_data.csv +4 -0
  104. teradataml/data/tochar_data.csv +5 -0
  105. teradataml/data/trans_dense.csv +16 -0
  106. teradataml/data/trans_sparse.csv +55 -0
  107. teradataml/data/vectordistance_example.json +1 -1
  108. teradataml/dataframe/copy_to.py +45 -29
  109. teradataml/dataframe/data_transfer.py +72 -46
  110. teradataml/dataframe/dataframe.py +642 -166
  111. teradataml/dataframe/dataframe_utils.py +167 -22
  112. teradataml/dataframe/functions.py +135 -20
  113. teradataml/dataframe/setop.py +11 -6
  114. teradataml/dataframe/sql.py +330 -78
  115. teradataml/dbutils/dbutils.py +556 -140
  116. teradataml/dbutils/filemgr.py +14 -10
  117. teradataml/hyperparameter_tuner/optimizer.py +12 -1
  118. teradataml/lib/aed_0_1.dll +0 -0
  119. teradataml/opensource/{sklearn/_sklearn_wrapper.py → _base.py} +168 -1013
  120. teradataml/opensource/_class.py +141 -17
  121. teradataml/opensource/{constants.py → _constants.py} +7 -3
  122. teradataml/opensource/_lightgbm.py +52 -53
  123. teradataml/opensource/_sklearn.py +1008 -0
  124. teradataml/opensource/_wrapper_utils.py +5 -5
  125. teradataml/options/__init__.py +47 -15
  126. teradataml/options/configure.py +103 -26
  127. teradataml/options/display.py +13 -2
  128. teradataml/plot/axis.py +47 -8
  129. teradataml/plot/figure.py +33 -0
  130. teradataml/plot/plot.py +63 -13
  131. teradataml/scriptmgmt/UserEnv.py +307 -40
  132. teradataml/scriptmgmt/lls_utils.py +428 -145
  133. teradataml/store/__init__.py +2 -3
  134. teradataml/store/feature_store/feature_store.py +102 -7
  135. teradataml/table_operators/Apply.py +48 -19
  136. teradataml/table_operators/Script.py +23 -2
  137. teradataml/table_operators/TableOperator.py +3 -1
  138. teradataml/table_operators/table_operator_util.py +58 -9
  139. teradataml/utils/dtypes.py +49 -1
  140. teradataml/utils/internal_buffer.py +38 -0
  141. teradataml/utils/validators.py +377 -62
  142. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/METADATA +200 -4
  143. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/RECORD +146 -112
  144. teradataml/data/SQL_Fundamentals.pdf +0 -0
  145. teradataml/libaed_0_1.dylib +0 -0
  146. teradataml/libaed_0_1.so +0 -0
  147. teradataml/opensource/sklearn/__init__.py +0 -0
  148. teradataml/store/vector_store/__init__.py +0 -1586
  149. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/WHEEL +0 -0
  150. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/top_level.txt +0 -0
  151. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/zip-safe +0 -0
@@ -20,6 +20,9 @@ import re
  import sqlalchemy
  import sys
  import urllib.parse
+
+ from sqlalchemy import Column
+
  import teradataml.context.context as tdmlctx
 
  from collections import OrderedDict, namedtuple
@@ -31,6 +34,7 @@ from teradataml.dataframe.sql_interfaces import ColumnExpression
  from teradataml.dataframe.sql_functions import case
  from teradataml.series.series import Series
  from teradatasqlalchemy.types import _TDType, BIGINT, INTEGER, PERIOD_TIMESTAMP, SMALLINT, BYTEINT, FLOAT, DECIMAL
+ from teradataml.common.deprecations import argument_deprecation
  from teradataml.common.utils import UtilFuncs
  from teradataml.common.exceptions import TeradataMlException
  from teradataml.common.messages import Messages
@@ -42,6 +46,7 @@ from teradataml.dataframe.dataframe_utils import DataFrameUtils as df_utils, Dat
  from teradataml.dataframe.indexer import _LocationIndexer
  from teradataml.common.aed_utils import AedUtils
  from teradataml.options.display import display
+ from teradataml.options.configure import configure
  from teradataml.dataframe.copy_to import copy_to_sql
  from teradataml.dataframe.row import _Row
  from teradataml.dataframe.setop import concat
@@ -63,7 +68,79 @@ from teradataml.common.constants import OutputStyle
 
  # TODO use logger when available on master branch
  # logger = teradatapylog.getLogger()
- in_schema = UtilFuncs._in_schema
+
+ class in_schema:
+ """
+ Class takes a schema name, a table name and a datalake name
+ and creates an object that can be passed to DataFrame.
+ Note:
+ teradataml recommends using this class to access table(s)/view(s)
+ from a database other than the default database.
+ """
+ def __init__(self, schema_name, table_name, datalake_name=None):
+ """
+ Constructor for in_schema class.
+
+ PARAMETERS:
+ schema_name:
+ Required Argument.
+ Specifies the schema where the table resides.
+ Types: str
+
+ table_name:
+ Required Argument.
+ Specifies the table name or view name in Vantage.
+ Types: str
+
+ datalake_name:
+ Optional Argument.
+ Specifies the datalake name.
+ Types: str
+
+ EXAMPLES:
+ from teradataml.dataframe.dataframe import in_schema, DataFrame
+
+ # Example 1: The following example creates a DataFrame from the
+ # existing Vantage table "dbcinfo" in the non-default
+ # database "dbc" using the in_schema instance.
+ df = DataFrame(in_schema("dbc", "dbcinfo"))
+
+ # Example 2: The following example uses from_table() function, existing
+ # Vantage table "dbcinfo" and non-default database "dbc" to
+ # create a teradataml DataFrame.
+ df = DataFrame.from_table(in_schema("dbc","dbcinfo"))
+
+ # Example 3: The following example uses "in_schema" object created
+ # with "datalake_name" argument to create DataFrame on OTF table.
+ otf_df = DataFrame(in_schema("datalake_db","datalake_table","datalake"))
+
+ """
+ self.schema_name = schema_name
+ self.table_name = table_name
+ self.datalake_name = datalake_name
+
+ awu_matrix = []
+ awu_matrix.append(["schema_name", schema_name, False, (str), True])
+ awu_matrix.append(["table_name", table_name, False, (str), True])
+ awu_matrix.append(["datalake_name", datalake_name, True, (str), True])
+
+ # Validate argument types
+ _Validators._validate_function_arguments(awu_matrix)
+
+ def __str__(self):
+ """
+ Returns the string representation of in_schema instance.
+ """
+ tbl_name = '{}.{}'.format(UtilFuncs._teradata_quote_arg(self.schema_name, "\"", False),
+ UtilFuncs._teradata_quote_arg(self.table_name, "\"", False))
+
+ if not self.datalake_name:
+ return tbl_name
+
+ return '{}.{}'.format(UtilFuncs._teradata_quote_arg(self.datalake_name, "\"", False), tbl_name)
+
+
+ in_schema = in_schema
 
 
  class DataFrame():
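A quick illustration of how the __str__ logic above renders an in_schema object. This is a minimal sketch, assuming UtilFuncs._teradata_quote_arg simply wraps each name in double quotes:

>>> from teradataml.dataframe.dataframe import in_schema
>>> # Two-part name when no datalake is given.
>>> str(in_schema("sales_db", "orders"))
'"sales_db"."orders"'
>>> # Three-part name when a datalake is given: "<datalake>"."<schema>"."<table>".
>>> str(in_schema("datalake_db", "datalake_table", "datalake"))
'"datalake"."datalake_db"."datalake_table"'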
@@ -166,6 +243,24 @@ class DataFrame():
  # Property to determine if table is an ART table or not.
  self._is_art = None
 
+ # This attribute stores the previous assign arguments in continuous assign calls.
+ self._previous_assign_args = None
+ # This attribute stores the root DataFrame columns.
+ self._root_columns = None
+
+ self._datalake = None
+ self._database = None
+ self._table = None
+ self._otf = False
+
+ if isinstance(table_name, in_schema):
+ self._table = table_name.table_name
+ self._datalake = table_name.datalake_name
+ self._database = table_name.schema_name
+ self._otf = True if self._datalake else False
+
+ table_name = str(table_name) if isinstance(table_name, in_schema) else table_name
+
  # Below matrix is list of list, where in each row contains following elements:
  # Let's take an example of following, just to get an idea:
  # [element1, element2, element3, element4, element5, element6]
@@ -198,25 +293,45 @@ class DataFrame():
  self._source_type = SourceType.TABLE.value
  self._nodeid = self._aed_utils._aed_table(self._table_name)
  elif query is not None:
+ query = query.strip()
+ query = query[:-1] if query[-1] == ";" else query
+
  self._query = query
  self._source_type = SourceType.QUERY.value
 
- if materialize:
- # If user requests to materialize the the query, then we should create a
+ temp_obj_params = {
+ "prefix": "_frmqry_v",
+ "use_default_database": True,
+ "quote": False
+ }
+ __execute = UtilFuncs._create_view
+
+ if configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE:
+ # If the user requests it, then we should create a
+ # volatile table instead of a view.
+ # Volatile table does not need to be added to the GC.
+ temp_obj_params["table_type"] = TeradataConstants.TERADATA_VOLATILE_TABLE
+ temp_obj_params["gc_on_quit"] = False
+ temp_obj_params["prefix"] = "_frmqry_vt"
+ __execute = UtilFuncs._create_table
+
+ elif materialize:
+ # If user requests to materialize the query, then we should create a
  # table instead of view and add the same in the GarbageCollector.
- temp_table_name = UtilFuncs._generate_temp_table_name(prefix="_frmqry_t", use_default_database=True,
- quote=False,
- table_type=TeradataConstants.TERADATA_TABLE)
- else:
- temp_table_name = UtilFuncs._generate_temp_table_name(prefix="_frmqry_v", use_default_database=True,
- quote=False)
+ temp_obj_params["table_type"] = TeradataConstants.TERADATA_TABLE
+ temp_obj_params["gc_on_quit"] = True
+ temp_obj_params["prefix"] = "_frmqry_t"
+ __execute = UtilFuncs._create_table
 
+ temp_table_name = UtilFuncs._generate_temp_table_name(**temp_obj_params)
  self._table_name = temp_table_name
+ __execute_params = (self._table_name, self._query)
+
+ if configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE:
+ __execute_params = (self._table_name, self._query, True)
+
  try:
- if materialize:
- UtilFuncs._create_table(self._table_name, self._query)
- else:
- UtilFuncs._create_view(self._table_name, self._query)
+ __execute(*__execute_params)
  except OperationalError as oe:
  if "[Error 3707] Syntax error" in str(oe):
  raise ValueError(Messages.get_message(
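Based on the branching above, the object that backs DataFrame.from_query() now depends on configure.temp_object_type as well as materialize. A hedged sketch of how a caller might opt into volatile tables — the code compares against TeradataConstants.TERADATA_VOLATILE_TABLE, but the exact user-facing value accepted by configure (for example a string alias) is an assumption here:

>>> from teradataml import DataFrame, configure
>>> from teradataml.common.constants import TeradataConstants
>>> # Default: a view named "_frmqry_v..." backs the query (trailing ';' is now stripped).
>>> df_view = DataFrame.from_query("SELECT * FROM sales;")
>>> # materialize=True: a permanent "_frmqry_t..." table, registered with the GarbageCollector.
>>> df_tab = DataFrame.from_query("SELECT * FROM sales", materialize=True)
>>> # Assumed option value: a volatile "_frmqry_vt..." table that is not garbage collected.
>>> configure.temp_object_type = TeradataConstants.TERADATA_VOLATILE_TABLE
>>> df_vt = DataFrame.from_query("SELECT * FROM sales")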
@@ -245,6 +360,9 @@ class DataFrame():
  self.__data = None
  self.__data_columns = None
  self._alias = None
+ self._plot = None
+
+ self._eda_ui = None
 
  except TeradataMlException:
  raise
@@ -334,7 +452,9 @@ class DataFrame():
  _Validators._validate_function_arguments(arg_info_matrix)
  try:
  alias_df = self._from_node(self._nodeid, self._metaexpr, self._index_label,
- reuse_metaexpr=False)
+ reuse_metaexpr=False, _datalake=self._datalake,
+ _database=self._database, _table=self._table,
+ _otf=self._otf)
  # Assigning self attributes to newly created alias dataframe.
  alias_df._table_name = self._table_name
  alias_df._index = self._index
@@ -350,7 +470,8 @@ class DataFrame():
 
  @classmethod
  @collect_queryband(queryband="DF_fromTable")
- def from_table(cls, table_name, index=True, index_label=None):
+ def from_table(cls, table_name, index=True, index_label=None,
+ schema_name=None, datalake_name=None):
  """
  Class method for creating a DataFrame from a table or a view.
 
@@ -371,30 +492,48 @@ class DataFrame():
  Column/s used for sorting.
  Types: str
 
+ schema_name:
+ Optional Argument.
+ Specifies the schema where the table resides.
+ Types: str
+
+ datalake_name:
+ Optional Argument.
+ Specifies the datalake name.
+ Types: str
+
  EXAMPLES:
- from teradataml.dataframe.dataframe import DataFrame
+ >>> from teradataml.dataframe.dataframe import DataFrame
 
  # Example 1: The following example creates a DataFrame from a table or
  a view.
  # Load the example data.
- load_example_data("dataframe","sales")
+ >>> load_example_data("dataframe","sales")
 
  # Create DataFrame from table
- df = DataFrame.from_table('sales')
+ >>> df = DataFrame.from_table('sales')
 
  # Create DataFrame from table and without index column sorting.
- df = DataFrame.from_table("sales", False)
+ >>> df = DataFrame.from_table("sales", False)
 
  # Create DataFrame from table and sorting using the 'accounts'
  # column.
- df = DataFrame.from_table("sales", True, "accounts")
+ >>> df = DataFrame.from_table("sales", True, "accounts")
 
  # Example 2: The following example creates a DataFrame from existing Vantage
  # table "dbcinfo" in the non-default database "dbc" using the
  # in_schema() function.
 
- from teradataml.dataframe.dataframe import in_schema
- df = DataFrame.from_table(in_schema("dbc", "dbcinfo"))
+ >>> from teradataml.dataframe.dataframe import in_schema
+ >>> df = DataFrame.from_table(in_schema("dbc", "dbcinfo"))
+
+ # Example 3: Create a DataFrame on existing DataLake
+ # table "lake_table" in the "datalake_database" database
+ # in "datalake" datalake.
+
+ >>> datalake_df = DataFrame.from_table(table_name="lake_table",
+ ... schema_name="datalake_database",
+ ... datalake_name="datalake")
 
  RETURNS:
  DataFrame
@@ -403,6 +542,9 @@ class DataFrame():
  TeradataMlException - TDMLDF_CREATE_FAIL
 
  """
+ if schema_name:
+ return cls(in_schema(schema_name, table_name, datalake_name))
+
  return cls(table_name, index, index_label)
 
  @classmethod
@@ -462,7 +604,7 @@ class DataFrame():
  return cls(index=index, index_label=index_label, query=query, materialize=materialize)
 
  @classmethod
- def _from_node(cls, nodeid, metaexpr, index_label=None, undropped_index=None, reuse_metaexpr=True):
+ def _from_node(cls, nodeid, metaexpr, index_label=None, undropped_index=None, reuse_metaexpr=True, **kwargs):
  """
  Private class method for creating a DataFrame from a nodeid and parent metadata.
 
@@ -543,6 +685,11 @@ class DataFrame():
  in [col.name for col in df._metaexpr.c] for elem in undropped_index):
  df._undropped_index = undropped_index
 
+ # Populate remaining attributes.
+ for arg in kwargs:
+ # Pop each argument from kwargs and assign to new DataFrame.
+ arg_value = kwargs.get(arg)
+ df.__setattr__(arg, arg_value)
  return df
 
  def create_temp_view(self, name):
@@ -670,9 +817,10 @@ class DataFrame():
  return self
 
  @collect_queryband(queryband="DF_fillna")
- def fillna(self, value=None, columns=None, literal_value=False):
+ def fillna(self, value=None, columns=None, literal_value=False, partition_column=None):
  """
- Method to replace the null values in a column with the value specified.
+ DESCRIPTION:
+ Method to replace the null values in a column with the value specified.
 
  PARAMETERS:
  value:
@@ -705,6 +853,12 @@ class DataFrame():
  Default Value: False
  Types: bool
 
+ partition_column:
+ Optional Argument.
+ Specifies the column name to partition the data.
+ Default Value: None
+ Types: str
+
  RETURNS:
  teradataml DataFrame
 
@@ -745,6 +899,26 @@ class DataFrame():
  3 Blue Inc 90.0 50 95.0 101.0 17/01/04
  4 Alpha Co 210.0 200 215.0 250.0 17/01/04
  5 Orange Inc 210.0 50 NaN 250.0 17/01/04
+
+ # Example 3: Populate the null value in 'pclass' and
+ # 'fare' column with mean value with partition
+ # column as 'sex'.
+ # Load the example data.
+ >>> load_example_data("teradataml", ["titanic"])
+ >>> df = DataFrame.from_table("titanic")
+
+ >>> df.fillna(value="mean", columns=["pclass", "fare"], partition_column="sex")
+ passenger survived pclass name sex age sibsp parch ticket fare cabin embarked
+ 0 284 1 3 Dorking, Mr. Edward Arthur male 19.0 0 0 A/5. 10482 8.0500 None S
+ 1 589 0 3 Gilinski, Mr. Eliezer male 22.0 0 0 14973 8.0500 None S
+ 2 17 0 3 Rice, Master. Eugene male 2.0 4 1 382652 29.1250 None Q
+ 3 282 0 3 Olsson, Mr. Nils Johan Goransson male 28.0 0 0 347464 7.8542 None S
+ 4 608 1 1 Daniel, Mr. Robert Williams male 27.0 0 0 113804 30.5000 None S
+ 5 404 0 3 Hakkarainen, Mr. Pekka Pietari male 28.0 1 0 STON/O2. 3101279 15.8500 None S
+ 6 427 1 2 Clarke, Mrs. Charles V (Ada Maria Winfield) female 28.0 1 0 2003 26.0000 None S
+ 7 141 0 3 Boulos, Mrs. Joseph (Sultana) female NaN 0 2 2678 15.2458 None C
+ 8 610 1 1 Shutes, Miss. Elizabeth W female 40.0 0 0 PC 17582 153.4625 C125 S
+ 9 875 1 2 Abelson, Mrs. Samuel (Hannah Wizosky) female 28.0 1 0 P/PP 3381 24.0000 None C
  """
  from teradataml import SimpleImputeFit, SimpleImputeTransform
 
@@ -752,6 +926,7 @@ class DataFrame():
  arg_info_matrix.append(["value", value, True, (int, float, str, dict, list)])
  arg_info_matrix.append(["columns", columns, True, (list, str, tuple)])
  arg_info_matrix.append(["literal_value", literal_value, True, (bool)])
+ arg_info_matrix.append(["partition_column", partition_column, True, (str)])
 
  # Validate argument types
  _Validators._validate_function_arguments(arg_info_matrix)
@@ -823,9 +998,15 @@ class DataFrame():
  literals=literals,
  literals_columns=literals_columns,
  stats=stats,
- stats_columns=stats_columns)
+ stats_columns=stats_columns,
+ partition_column=partition_column)
 
- return fit_obj.transform(data=self).result
+ impute_transform = {
+ 'data': self,
+ 'data_partition_column': partition_column,
+ 'object_partition_column': partition_column}
+
+ return fit_obj.transform(**impute_transform).result
 
  def __execute_node_and_set_table_name(self, nodeid, metaexpr=None):
  """
@@ -924,6 +1105,7 @@ class DataFrame():
  self._column_names_and_types = []
  self._td_column_names_and_types = []
  self._td_column_names_and_sqlalchemy_types = {}
+ self._column_types = {}
 
  for col in self._metaexpr.c:
  if isinstance(col.type, sqlalchemy.sql.sqltypes.NullType):
@@ -931,9 +1113,11 @@ class DataFrame():
  else:
  tdtype = "{}".format(col.type)
 
- self._column_names_and_types.append((str(col.name), UtilFuncs._teradata_type_to_python_type(col.type)))
+ py_type = UtilFuncs._teradata_type_to_python_type(col.type)
+ self._column_names_and_types.append((str(col.name), py_type))
  self._td_column_names_and_types.append((str(col.name), tdtype))
  self._td_column_names_and_sqlalchemy_types[(str(col.name)).lower()] = col.type
+ self._column_types[(str(col.name)).lower()] = [py_type, col.type]
 
  def _get_metaexpr(self):
  """
@@ -952,7 +1136,24 @@ class DataFrame():
  meta = sqlalchemy.MetaData()
  db_schema = UtilFuncs._extract_db_name(self._table_name)
  db_table_name = UtilFuncs._extract_table_name(self._table_name)
- t = sqlalchemy.Table(db_table_name, meta, schema=db_schema, autoload_with=eng)
+ if not self._datalake:
+ t = sqlalchemy.Table(db_table_name, meta, schema=db_schema, autoload_with=eng)
+ return _MetaExpression(t)
+
+ # Get metaexpression for datalake table.
+ # Check existence of datalake table.
+ tdmlctx.get_connection().dialect.has_table(tdmlctx.get_connection(),
+ self._table,
+ schema=self._database,
+ table_only=True,
+ datalake=self._datalake)
+
+ # Extract column names and corresponding teradatasqlalchemy types.
+ col_names, col_types = df_utils._get_datalake_table_columns_info(self._database,
+ self._table,
+ self._datalake)
+ t = sqlalchemy.Table(self._table, meta, schema=self._database,
+ *(Column(col_name, col_type) for col_name, col_type in zip(col_names, col_types)))
  return _MetaExpression(t)
 
  def __getattr__(self, name):
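A minimal sketch of the idea behind the datalake branch above: when the table cannot be reflected with autoload_with, a sqlalchemy.Table is assembled from explicitly supplied Column objects. The column names and types here are made up for illustration:

>>> import sqlalchemy
>>> from sqlalchemy import Column
>>> from teradatasqlalchemy.types import INTEGER, VARCHAR
>>> meta = sqlalchemy.MetaData()
>>> col_names, col_types = ["id", "name"], [INTEGER(), VARCHAR(100)]
>>> t = sqlalchemy.Table("datalake_table", meta,
...                      *(Column(n, ty) for n, ty in zip(col_names, col_types)),
...                      schema="datalake_db")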
@@ -2729,8 +2930,8 @@ class DataFrame():
  raise TeradataMlException(msg, errcode)
 
  @collect_queryband(queryband="DF_describe")
- def describe(self, percentiles=[.25, .5, .75], include=None, verbose=False, distinct=False, statistics=None,
- columns=None):
+ def describe(self, percentiles=[.25, .5, .75], verbose=False, distinct=False, statistics=None,
+ columns=None, pivot=False):
  """
  DESCRIPTION:
  Generates statistics for numeric columns. This function can be used in two modes:
@@ -2759,18 +2960,6 @@ class DataFrame():
  Default Values: [.25, .5, .75], which returns the 25th, 50th, and 75th percentiles.
  Types: float or List of floats
 
- include:
- Optional Argument.
- Values can be either None or "all".
- If the value is "all", then both numeric and non-numeric columns are included.
- Computes count, mean, std, min, percentiles, and max for numeric columns.
- Computes count and unique for non-numeric columns.
- If the value is None, only numeric columns are used for collecting statistics.
- Note:
- Value 'all' is not applicable for 'Time Series Aggregate Mode'.
- Default Values: None
- Types: str
-
  verbose:
  Optional Argument.
  Specifies a boolean value to be used for time series aggregation, stating whether to get
@@ -2797,7 +2986,6 @@ class DataFrame():
  Computes count and unique for non-numeric columns.
  Notes:
  1. statistics is not applicable for 'Time Series Aggregate Mode'.
- 2. statistics should not be used with include as 'all'.
  Permitted Values: count, mean, min, max, unique, std, describe, percentile
  Default Values: None
  Types: str or List of str
@@ -2807,7 +2995,14 @@ class DataFrame():
  Specifies the name(s) of the columns we are collecting statistics for.
  Default Values: None
  Types: str or List of str
-
+
+ pivot:
+ Optional Argument.
+ Specifies a boolean value to pivot the output.
+ Note:
+ * "pivot" is not supported for PTI tables.
+ Default Values: False
+ Types: bool
 
  RETURNS:
  teradataml DataFrame
@@ -2829,7 +3024,7 @@ class DataFrame():
  Orange Inc 210.0 None None 250 04/01/2017
 
  # Computes count, mean, std, min, percentiles, and max for numeric columns.
- >>> df.describe()
+ >>> df.describe(pivot=True)
  Apr Feb Mar Jan
  func
  count 4 6 4 4
@@ -2841,8 +3036,45 @@ class DataFrame():
  75% 250 207.5 158.75 162.5
  max 250 210 215 200
 
+ # Computes count, mean, std, min, percentiles, and max for numeric columns with
+ # default arguments.
+ >>> df.describe()
+ ATTRIBUTE StatName StatValue
+ Jan MAXIMUM 200.0
+ Jan STANDARD DEVIATION 62.91528696058958
+ Jan PERCENTILES(25) 125.0
+ Jan PERCENTILES(50) 150.0
+ Mar COUNT 4.0
+ Mar MINIMUM 95.0
+ Mar MAXIMUM 215.0
+ Mar MEAN 147.5
+ Mar STANDARD DEVIATION 49.749371855331
+ Mar PERCENTILES(25) 128.75
+ Mar PERCENTILES(50) 140.0
+ Apr COUNT 4.0
+ Apr MINIMUM 101.0
+ Apr MAXIMUM 250.0
+ Apr MEAN 195.25
+ Apr STANDARD DEVIATION 70.97123830585646
+ Apr PERCENTILES(25) 160.25
+ Apr PERCENTILES(50) 215.0
+ Apr PERCENTILES(75) 250.0
+ Feb COUNT 6.0
+ Feb MINIMUM 90.0
+ Feb MAXIMUM 210.0
+ Feb MEAN 166.66666666666666
+ Feb STANDARD DEVIATION 59.553897157672786
+ Feb PERCENTILES(25) 117.5
+ Feb PERCENTILES(50) 200.0
+ Feb PERCENTILES(75) 207.5
+ Mar PERCENTILES(75) 158.75
+ Jan PERCENTILES(75) 162.5
+ Jan MEAN 137.5
+ Jan MINIMUM 50.0
+ Jan COUNT 4.0
+
  # Computes count, mean, std, min, percentiles, and max for numeric columns with 30th and 60th percentiles.
- >>> df.describe(percentiles=[.3, .6])
+ >>> df.describe(percentiles=[.3, .6], pivot=True)
  Apr Feb Mar Jan
  func
  count 4 6 4 4
@@ -2855,7 +3087,7 @@ class DataFrame():
 
  # Computes count, mean, std, min, percentiles, and max for numeric columns group by "datetime" and "Feb".
  >>> df1 = df.groupby(["datetime", "Feb"])
- >>> df1.describe()
+ >>> df1.describe(pivot=True)
  Jan Mar Apr
  datetime Feb func
  04/01/2017 90.0 25% 50 95 101
@@ -2883,22 +3115,6 @@ class DataFrame():
  min 200 215 250
  std None None 0
 
- # Computes count, mean, std, min, percentiles, and max for numeric columns and
- # computes count and unique for non-numeric columns
- >>> df.describe(include="all")
- accounts Feb Jan Mar Apr datetime
- func
- 25% None 117.5 125 128.75 160.25 None
- 75% None 207.5 162.5 158.75 250 None
- count 6 6 4 4 4 6
- mean None 166.667 137.5 147.5 195.25 None
- max None 210 200 215 250 None
- min None 90 50 95 101 None
- 50% None 200 150 140 215 None
- std None 59.554 62.915 49.749 70.971 None
- unique 6 None None None None 1
-
- #
  # Examples for describe() function as Time Series Aggregate.
  #
  >>> # Load the example datasets.
@@ -3081,15 +3297,15 @@ class DataFrame():
  >>>
  """
 
- # Argument validations
+ # -------------Argument validations---------------#
  awu_matrix = []
  awu_matrix.append(["columns", columns, True, (str, list), True])
  awu_matrix.append(["percentiles", percentiles, True, (float, list)])
- awu_matrix.append(["include", include, True, (str), True, [None, "all"]])
  awu_matrix.append(["verbose", verbose, True, (bool)])
  awu_matrix.append(["distinct", distinct, True, (bool)])
  awu_matrix.append(["statistics", statistics, True, (str, list), True,
  ["count", "mean", "min", "max", "unique", "std", "describe", "percentile"]])
+ awu_matrix.append(["pivot", pivot, True, (bool)])
 
  # Validate argument types
  _Validators._validate_function_arguments(awu_matrix)
@@ -3108,22 +3324,11 @@ class DataFrame():
  if statistics:
  statistics = [stats.lower() for stats in UtilFuncs._as_list(statistics)]
 
- # Argument include and statistics should not be used together
- if include is not None and statistics is not None:
- raise ValueError(Messages.get_message(MessageCodes.CANNOT_USE_TOGETHER_WITH).format(
- 'include', 'statistics'
- ))
-
  # Percentiles must be a list of values between 0 and 1.
  if not isinstance(percentiles, list) or not all(p > 0 and p < 1 for p in percentiles):
  raise ValueError(Messages.get_message(MessageCodes.INVALID_ARG_VALUE, percentiles, "percentiles",
  "percentiles must be a list of values between 0 and 1"))
 
- # Argument 'include' with value 'all' is not allowed for DataFrameGroupByTime
- if include is not None and include.lower() == "all" and isinstance(self, DataFrameGroupByTime):
- raise ValueError(Messages.get_message(MessageCodes.ARG_VALUE_CLASS_DEPENDENCY).format(
- 'include', 'Aggregation', 'all', 'describe()', 'DataFrame or DataFrameGroupBy'))
-
  # Argument 'statistics' is not allowed for DataFrameGroupByTime
  if statistics is not None and isinstance(self, DataFrameGroupByTime):
  raise ValueError(Messages.get_message(MessageCodes.ARG_VALUE_CLASS_DEPENDENCY).format(
@@ -3133,26 +3338,31 @@ class DataFrame():
  if verbose and not isinstance(self, DataFrameGroupByTime):
  raise ValueError(Messages.get_message(MessageCodes.ARG_VALUE_CLASS_DEPENDENCY).format(
  'verbose', 'Aggregation', 'True', 'describe()', 'DataFrameGroupByTime'))
+ # -------------End of argument validations---------------#
 
  function_label = "func"
+ sort_cols = []
  try:
  self.__execute_node_and_set_table_name(self._nodeid)
 
  groupby_column_list = None
- if isinstance(self, DataFrameGroupBy):
+ if isinstance(self, DataFrameGroupByTime) or isinstance(self, DataFrameGroupBy):
  groupby_column_list = self.groupby_column_list
- df_utils._invalid_describe_column(df=self, columns=columns, metaexpr=self._metaexpr,
- groupby_column_list=groupby_column_list)
+ if columns:
+ df_utils._validate_describe_columns(columns=columns, metaexpr=self._metaexpr,
+ groupby_column_list=groupby_column_list)
+ sort_cols = list(groupby_column_list)
 
- if isinstance(self, DataFrameGroupByTime):
- groupby_column_list = self.groupby_column_list
- df_utils._invalid_describe_column(df=self, columns=columns, metaexpr=self._metaexpr,
- groupby_column_list=groupby_column_list)
+ # 'func' column will be always there in result.
+ sort_cols.append(function_label)
 
+ # Handle DataFrameGroupByTime using union all approach and
+ # other DataFrames using TD_UnivariateStatistics approach.
+ if isinstance(self, DataFrameGroupByTime):
  # Construct the aggregate query.
  agg_query = df_utils._construct_describe_query(df=self, columns=columns, metaexpr=self._metaexpr,
  percentiles=percentiles, function_label=function_label,
- groupby_column_list=groupby_column_list, include=include,
+ groupby_column_list=groupby_column_list, include=None,
  is_time_series_aggregate=True, verbose=verbose,
  distinct=distinct,
  timebucket_duration=self._timebucket_duration,
@@ -3160,29 +3370,99 @@ class DataFrame():
  timecode_column=self._timecode_column,
  sequence_column=self._sequence_column,
  fill=self._fill)
+
+ if groupby_column_list is not None:
+ df = DataFrame.from_query(agg_query, index_label=sort_cols)
+ df2 = df.sort(sort_cols)
+ df2._metaexpr._n_rows = 100
+ describe_df = df2
+ else:
+ describe_df = DataFrame.from_query(agg_query, index_label=function_label)
+
+ # Check if numeric overflow can occur for result DataFrame.
+ if self._check_numeric_overflow(describe_df):
+ result_df = self._promote_dataframe_types()
+ describe_df = result_df.describe(pivot=True)
+ return describe_df
+
  else:
- # Construct the aggregate query.
- agg_query = df_utils._construct_describe_query(df=self, columns=columns, metaexpr=self._metaexpr,
- percentiles=percentiles, function_label=function_label,
- groupby_column_list=groupby_column_list, include=include,
- is_time_series_aggregate=False, verbose=verbose,
- distinct=distinct, statistics=statistics)
-
- if groupby_column_list is not None:
- sort_cols = [i for i in groupby_column_list]
- sort_cols.append(function_label)
- df = DataFrame.from_query(agg_query, index_label=sort_cols)
- df2 = df.sort(sort_cols)
- df2._metaexpr._n_rows = 100
- describe_df = df2
- else:
- describe_df = DataFrame.from_query(agg_query, index_label=function_label)
+ # If pivot is True, then construct the aggregate query and return the result DataFrame.
+ # Otherwise, return the result DataFrame in the regular aggregate mode using UnivariateStatistics.
+
+ if pivot:
+ # Construct the aggregate query.
+ agg_query = df_utils._construct_describe_query(df=self, columns=columns, metaexpr=self._metaexpr,
+ percentiles=percentiles, function_label=function_label,
+ groupby_column_list=groupby_column_list, include=None,
+ is_time_series_aggregate=False, verbose=verbose,
+ distinct=distinct, statistics=statistics)
+
+ if groupby_column_list is not None:
+ sort_cols = [i for i in groupby_column_list]
+ sort_cols.append(function_label)
+ df = DataFrame.from_query(agg_query, index_label=sort_cols)
+ df2 = df.sort(sort_cols)
+ df2._metaexpr._n_rows = 100
+ describe_df = df2
+ else:
+ describe_df = DataFrame.from_query(agg_query, index_label=function_label)
+
+ # Check if numeric overflow can occur for result DataFrame.
+ if self._check_numeric_overflow(describe_df):
+ result_df = self._promote_dataframe_types()
+ describe_df = result_df.describe(pivot=True)
+
+ return describe_df
+
+ # If columns is None, then all dataframe columns are considered.
+ if columns is None:
+ columns = self.columns
+ # Exclude groupby columns
+ if groupby_column_list is not None:
+ columns = [col for col in columns if col not in groupby_column_list]
+
+ numeric_cols = []
+
+ # Extract numeric columns and their types of all columns
+ for col in self._metaexpr.c:
+ if type(col.type) in UtilFuncs()._get_numeric_datatypes() and \
+ col.name in columns:
+ numeric_cols.append(col.name)
+
+ if numeric_cols:
+ # Default statistics for 'Regular Aggregate Mode'
+ sql_stat = ["COUNT", "MAXIMUM", "MEAN", "MINIMUM", "PERCENTILES", "STANDARD DEVIATION"]
+
+ if statistics is not None:
+ py_to_sql_func_map = {"count": "COUNT",
+ "max": "MAXIMUM",
+ "mean": "MEAN",
+ "unique": 'UNIQUE ENTITY COUNT',
+ "min": "MINIMUM",
+ "percentile": "PERCENTILES",
+ "std": "STANDARD DEVIATION"}
+ # Convert statistics into corresponding SQL function names
+ sql_stat = [py_to_sql_func_map[stat] for stat in UtilFuncs()._as_list(statistics)]
+
+ # Convert percentiles to centiles for univariate statistics
+ centiles = list(map(lambda n: int(n * 100), percentiles))
+
+ # UnivariateStatistics parameters
+ univar_param = {
+ "newdata": self.select(self.columns),
+ "target_columns": numeric_cols,
+ "partition_columns": groupby_column_list,
+ "centiles": centiles,
+ "stats": sql_stat
+ }
+
+ from teradataml import UnivariateStatistics
+ # Run UnivariateStatistics
+ aggr_df = UnivariateStatistics(**univar_param).result
+
+ # Return the result in teradataml format
+ return aggr_df
 
- # Check if numeric overflow can occur for result DataFrame.
- if self._check_numeric_overflow(describe_df):
- result_df = self._promote_dataframe_types()
- describe_df = result_df.describe()
- return describe_df
  except TeradataMlException:
  raise
  except Exception as err:
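For the non-pivot path, a sketch of the UnivariateStatistics call that describe() now delegates to, mirroring the univar_param dictionary above (column names are illustrative; percentiles are converted to whole-number centiles):

>>> from teradataml import UnivariateStatistics
>>> stats_df = UnivariateStatistics(newdata=df.select(df.columns),
...                                 target_columns=["Feb", "Jan", "Mar", "Apr"],
...                                 partition_columns=None,
...                                 centiles=[25, 50, 75],
...                                 stats=["COUNT", "MAXIMUM", "MEAN", "MINIMUM",
...                                        "PERCENTILES", "STANDARD DEVIATION"]).result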
@@ -5269,8 +5549,10 @@ class DataFrame():
  Specifies the function(s) to apply on DataFrame columns.
 
  Valid values for func are:
- 'count', 'sum', 'min', 'max', 'mean', 'std', 'percentile', 'unique',
- 'median', 'var'
+ * 'count', 'sum', 'min', 'max', 'mean', 'std', 'percentile', 'percentile_<floatvalue>', 'unique',
+ 'median', 'var'
+ * Note: In 'percentile_<floatvalue>', <floatvalue> specifies the desired percentile value to
+ calculate aggregate. It should be in the range of 0.0 to 1.0 (both inclusive).
 
  Acceptable formats for function(s) are
  string, dictionary, list of strings/functions/ColumnExpression or ColumnExpression.
@@ -5304,12 +5586,17 @@ class DataFrame():
  Output column names after the above operation are:
  min_employee_no, sum_employee_no, var_employee_no, min_first_name
 
- 4. "func" passed as a ColumnExpression built using the aggregate functions.
+ 4. "percentile_<floatvalue>" passed to agg.
+ >>> df.agg({'employee_no' : ['percentile_0.25', 'percentile_0.75', 'min']})
+ >>> df.agg(['percentile_0.25', 'percentile_0.75', 'sum'])
+ >>> df.agg('percentile_0.25')
+
+ 5. "func" passed as a ColumnExpression built using the aggregate functions.
  >>> df.agg(df.first_name.count())
  Output column name after the above operation is:
  count(first_name)
 
- 5. "func" passed as a list of ColumnExpression built using the aggregate functions.
+ 6. "func" passed as a list of ColumnExpression built using the aggregate functions.
  >>> df.agg([df.employee_no.min(), df.first_name.count()])
  Output column names after the above operation are:
  min(employee_no), count(first_name)
@@ -5397,6 +5684,12 @@ class DataFrame():
  min_employee_no sum_employee_no var_employee_no min_first_name
  0 100 313 44.333333 abcd
 
+ # Get the minimum, 25 percentile value and variance of employee number, by passing dictionary of
+ # column names to string function/list of string functions as parameter.
+ >>> df.agg({'employee_no' : ['min', 'percentile_0.25', 'var']})
+ min_employee_no percentile_0.25_employee_no var_employee_no
+ 0 100 100 44.333333
+
  # Get the minimum and sum of all the columns in the dataframe,
  # by passing list of string functions as parameter.
  >>> df.agg(['min', 'sum'])
@@ -5442,9 +5735,15 @@ class DataFrame():
  mean_employee_no unique_employee_no unique_first_name mean_joined_date unique_joined_date
  0 104.333333 3 2 60/12/04 2
 
+ # Get the percentile of each column in the dataframe with default value 0.5.
  >>> df.agg('percentile')
- percentile_employee_no percentile_marks
- 0 101 None
+ percentile_employee_no percentile_marks
+ 0 101 None
+
+ # Get 80 percentile of each column in the dataframe.
+ >>> df.agg('percentile_0.8')
+ percentile_0.8_employee_no percentile_0.8_marks
+ 0 107 None
 
  # Using another table 'sales' (having repeated values) to demonstrate operations
  # 'unique' and 'percentile'.
@@ -5461,9 +5760,11 @@ class DataFrame():
  Blue Inc 90.0 50 95 101 2017-04-01
  Red Inc 200.0 150 140 None 2017-04-01
 
- >>> df.agg('percentile')
- percentile_Feb percentile_Jan percentile_Mar percentile_Apr
- 0 200.0 150 140 215
+ # Get 80 and 40 percentile values of each column in the dataframe.
+ >>> df1 = df.select(['Feb', 'Jan', 'Mar', 'Apr'])
+ >>> df1.agg(['percentile_0.8', 'percentile_0.4'])
+ percentile_0.8_Feb percentile_0.4_Feb percentile_0.8_Jan percentile_0.4_Jan percentile_0.8_Mar percentile_0.4_Mar percentile_0.8_Apr percentile_0.4_Apr
+ 0 210.0 200.0 170 150 170 140 250 194
 
  >>> df.agg('unique')
  unique_accounts unique_Feb unique_Jan unique_Mar unique_Apr unique_datetime
@@ -5650,6 +5951,8 @@ class DataFrame():
 
  except TeradataMlException:
  raise
+ except ValueError:
+ raise
  except Exception as err:
  raise TeradataMlException(Messages.get_message(
  MessageCodes.EXECUTION_FAILED, "perform {} on DataFrame".format(operation), str(err)),
@@ -5765,7 +6068,35 @@ class DataFrame():
 
  def _repr_html_(self):
  """ Print method for teradataml for iPython rich display. """
+ self._generate_output_html()
+ if display.enable_ui:
+ # EDA Ui widget representation using teradatamlwidgets
+ if self._eda_ui is None:
+ from teradatamlwidgets.eda.Ui import Ui
+ self._eda_ui = Ui(df=self, html=self.html)
+ else:
+ self._eda_ui.display_ui()
+ return self.html
+
+ def get_eda_ui(self):
+ """
+ Returns the EDA representation UI.
 
+ PARAMETERS:
+ None.
+
+ EXCEPTIONS:
+ None.
+
+ RETURNS:
+ teradatamlwidgets.eda.Ui
+
+ EXAMPLE:
+ ui = df.get_eda_ui()
+ """
+ return self._eda_ui
+
+ def _generate_output_html(self, disable_types=True):
  # Check if class attributes __data and __data_columns are not None.
  # If not None, reuse the data and columns.
  # If None, generate latest results.
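A short, hedged usage sketch of the new EDA hooks shown above, assuming a notebook session with display.enable_ui turned on and the teradatamlwidgets package installed:

>>> from teradataml import DataFrame
>>> from teradataml.options.display import display
>>> display.enable_ui = True             # turn on the widget-based EDA representation
>>> df = DataFrame("titanic")
>>> df                                   # _repr_html_ now builds/refreshes the EDA UI
>>> ui = df.get_eda_ui()                 # handle to the teradatamlwidgets Ui object
>>> out = df.get_output(output_index=0)  # result of a function run from the 'Analyze' tab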
@@ -5778,17 +6109,25 @@ class DataFrame():
  dindent = indent + indent
 
  header_html = ['<style type="text/css">',
- 'table {border:ridge 5px;}',
+ 'table { border:ridge 5px}',
  'table td {border:inset 1px;}',
- 'table tr#HeaderRow {background-color:grey; color:white;}'
+ 'table tr#HeaderRow {background-color:grey; color:white;}',
  '</style>\n'
  ]
  html = "\n{0}".format(indent).join(header_html)
- html += '<html><table>\n{0}<tr id="HeaderRow">\n'.format(indent)
+ html += '<html><table style="min-width:1000px;">\n{0}<tr id="HeaderRow">\n'.format(indent)
 
- columns_html = "</th>\n{0}<th>".format(dindent).join(self.__data_columns)
- html += "{0}<th>{1}</th>\n".format(dindent, columns_html)
- html += "{0}</tr>\n".format(indent)
+ columns_html = "</th><th>".join(self.__data_columns)
+ html += "<th>{0}</th>\n".format(columns_html)
+ html += "</tr>\n"
+
+ if not disable_types:
+ html += '<tr>\n'.format(indent)
+ col_types = [repr(self._td_column_names_and_sqlalchemy_types[column]) for column in
+ self.__data_columns]
+ columns_types_html = "</td>\n{0}<td>".format(dindent).join(col_types)
+ html += "{0}<td>{1}</td>\n".format(dindent, columns_types_html)
+ html += "{0}</tr>\n".format(indent)
 
  for row in self.__data:
  row_html = ["{0}<td>{1}</td>\n".format(dindent,
@@ -5796,8 +6135,31 @@ class DataFrame():
  html += "{1}<tr>\n{0}{1}</tr>\n".format("".join(row_html), indent)
 
  html += "</table></html>"
+ self.html = html
 
- return html
+ def get_output(self, output_index=0):
+ """
+ DESCRIPTION:
+ Returns the result of analytic function when analytic function is
+ run from 'Analyze' tab in EDA UI.
+ Note:
+ * The function does not return anything if analytic function is
+ not run from EDA UI.
+
+ PARAMETERS:
+ output_index:
+ Optional Argument.
+ Specifies the index of the output dataframe to be returned.
+ Default Value: 0
+ Types: int
+
+ RAISES:
+ IndexError
+
+ RETURNS:
+ teradataml DataFrame object.
+ """
+ return self._eda_ui.get_output_dataframe(output_index=output_index)
 
  def __get_data_columns(self):
  """
@@ -6857,7 +7219,8 @@ class DataFrame():
  compiled_condition = condition.compile(compile_kwargs={'include_table': True,
  'literal_binds': True,
  'table_name_kind': '_join_alias',
- 'compile_with_caller_table': True})
+ 'compile_with_caller_table': True,
+ 'table_only': True})
 
  all_join_conditions.append(compiled_condition)
 
@@ -7399,7 +7762,7 @@ class DataFrame():
  """
  return (type(None), int, float, str, decimal.Decimal, ColumnExpression, ClauseElement)
 
- def _generate_assign_metaexpr_aed_nodeid(self, drop_columns, **kwargs):
+ def _generate_assign_metaexpr_aed_nodeid(self, drop_columns, node_id, **kwargs):
  """
  DESCRIPTION:
  Function generates the MetaExpression and AED nodeid for DataFrame.assign()
@@ -7412,6 +7775,11 @@ class DataFrame():
  Default Value: False
  Types: bool
 
+ node_id:
+ Optional Argument.
+ Specifies the input nodeid for the assign operation.
+ Types: str
+
  kwargs:
  keyword, value pairs
  - keywords are the column names.
@@ -7439,7 +7807,7 @@ class DataFrame():
 
  # Join the expressions in result.
  assign_expression = ', '.join(list(map(lambda x: x[1], result)))
- new_nodeid = self._aed_utils._aed_assign(self._nodeid,
+ new_nodeid = self._aed_utils._aed_assign(node_id,
  assign_expression,
  AEDConstants.AED_ASSIGN_DROP_EXISITING_COLUMNS.value)
 
@@ -7571,14 +7939,14 @@ class DataFrame():
  _Validators._check_auth_token("udf")
  for colname, col in udf_expr.items():
  env_name = UtilFuncs._get_env_name(col)
- # Store the env_name and its corresponding output column
+ # Store the env_name and its corresponding output column
  if env_name in env_mapper:
  env_mapper[env_name].append(colname)
  else:
  env_mapper[env_name] = [colname]
  else:
  env_mapper[env_name] = udf_expr.keys()
-
+ debug = False
  for env_name, cols in env_mapper.items():
  # Create a dictionary of output columns to column type.
  returns = OrderedDict([(column.name, column.type) for column in df._metaexpr.c])
@@ -7589,6 +7957,7 @@ class DataFrame():
  # Create a dictionary of output column name to udf arguments
  function_args = {}
  for colname, col in udf_expr.items():
+ debug |= col._debug
  delimiter = col._delimiter
  quotechar = col._quotechar
  if colname in cols:
@@ -7621,15 +7990,17 @@ class DataFrame():
  columns_definitions=columns_definitions,
  output_type_converters={
  col_name: _Dtypes._teradata_type_to_python_type(col_type)
- for col_name, col_type in returns.items()})
+ for col_name, col_type in returns.items()},
+ debug=debug
+ )
 
  df = tbl_operators.execute()
  return df
-
+
  def _assign_call_udf(self, call_udf_expr):
  """
  DESCRIPTION:
- Internal function for DataFrame.assign() to execute the call_udf using
+ Internal function for DataFrame.assign() to execute the call_udf using
  Script/Apply Table Operator and create new column for teradataml DataFrame.
 
  PARAMETER:
@@ -7656,7 +8027,7 @@ class DataFrame():
  # Create a dictionary of output columns to column type (python types).
  output_type_converters = {col_name: _Dtypes._teradata_type_to_python_type(col_type) \
  for col_name, col_type in returns.items()}
-
+
  for colname, col in call_udf_expr.items():
  returns[colname] = col.type
  output_type_converters[colname] = _Dtypes._teradata_type_to_python_type(col.type)
@@ -7782,7 +8153,7 @@ class DataFrame():
  Look at Example 18 to understand more.
  8. While passing multiple udf expressions, one can not pass one column output
  as another column input in the same ``assign`` call.
- 9. If user pass multiple udf expressions, delimiter and quotechar specified in
+ 9. If user pass multiple udf expressions, delimiter and quotechar specified in
  last udf expression are considered for processing.
 
  RAISES:
@@ -8147,13 +8518,13 @@ class DataFrame():
  Red Inc 200.0 150.0 140.0 NaN 17/01/04 201.0 abc RED INC 207
  >>>
 
- # Example 19: Convert the values is 'accounts' column to upper case using a user
+ # Example 19: Convert the values is 'accounts' column to upper case using a user
  # defined function on Vantage Cloud Lake.
  # Create a Python 3.10.5 environment with given name and description in Vantage.
  >>> env = create_env('test_udf', 'python_3.10.5', 'Test environment for UDF')
  User environment 'test_udf' created.
  >>>
- # Create a user defined functions to 'to_upper' to get the values in upper case
+ # Create a user defined functions to 'to_upper' to get the values in upper case
  # and pass the user env to run it on.
  >>> from teradataml.dataframe.functions import udf
  >>> @udf(env_name = env)
@@ -8165,7 +8536,7 @@ class DataFrame():
  # to the DataFrame.
  >>> df.assign(upper_stats = to_upper('accounts'))
  Feb Jan Mar Apr datetime upper_stats
- accounts
+ accounts
  Alpha Co 210.0 200.0 215.0 250.0 17/01/04 ALPHA CO
  Blue Inc 90.0 50.0 95.0 101.0 17/01/04 BLUE INC
  Yellow Inc 90.0 NaN NaN NaN 17/01/04 YELLOW INC
@@ -8184,12 +8555,12 @@ class DataFrame():
  # Register the created user defined function with name "upper".
  >>> register("upper", to_upper)
  >>>
- # Call the user defined function registered with name "upper" and assign the
+ # Call the user defined function registered with name "upper" and assign the
  # ColumnExpression returned to the DataFrame.
  >>> res = df.assign(upper_col = call_udf("upper", ('accounts',)))
  >>> res
  Feb Jan Mar Apr datetime upper_col
- accounts
+ accounts
  Alpha Co 210.0 200.0 215.0 250.0 17/01/04 ALPHA CO
  Blue Inc 90.0 50.0 95.0 101.0 17/01/04 BLUE INC
  Yellow Inc 90.0 NaN NaN NaN 17/01/04 YELLOW INC
@@ -8263,8 +8634,34 @@ class DataFrame():
  # from udf expression.
  if bool(regular_expr):
  try:
- (new_meta, new_nodeid) = df._generate_assign_metaexpr_aed_nodeid(drop_columns, **regular_expr)
+ root_node_id = None
+ root_df_col = df.columns
+
+ # Get the previous node type, if it is assign and drop_columns is False,
+ # then check if the previous assign arguments exists and are not present
+ # in either the root dataframe columns or the current assign arguments.
+ # if these conditions are met, obtain the root node id (i.e., the first
+ # node of the assign operation) and merge the previous assign arguments with the current ones.
+
+ prev_node_type = df._aed_utils._aed_get_node_query_type(df._nodeid)
+ if not drop_columns and prev_node_type == "assign" and df._previous_assign_args is not None:
+ if not df._root_columns & df._previous_assign_args.keys() and \
+ not df._previous_assign_args.keys() & regular_expr.keys():
+ # Get the root node id and root dataframe columns.
+ root_df_col = df._root_columns
+ root_node_id = df._aed_utils._aed_get_parent_nodeids(df._nodeid)[0]
+ regular_expr = {**df._previous_assign_args, **regular_expr}
+
+ # If root_node_id is None, assign the current node id as root node of assign operation
+ node_id = root_node_id if root_node_id is not None else df._nodeid
+
+ # Generate new meta expression and node id for the new dataframe.
+ (new_meta, new_nodeid) = df._generate_assign_metaexpr_aed_nodeid(
+ drop_columns, node_id = node_id, **regular_expr)
  df = df._create_dataframe_from_node(new_nodeid, new_meta, df._index_label)
+ df._previous_assign_args = regular_expr
+ df._root_columns = root_df_col
+
  except Exception as err:
  errcode = MessageCodes.TDMLDF_INFO_ERROR
  msg = Messages.get_message(MessageCodes.TDMLDF_INFO_ERROR)
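The hunk above lets consecutive assign() calls collapse into a single AED node when the previous assign's outputs neither shadow the root DataFrame's columns nor feed the new expressions. A rough, standalone illustration of that key-overlap check using plain dicts (names are illustrative, not the teradataml internals):

    # Outputs produced by the previous assign and the ones being added now.
    previous_args = {"upper_col": "expr_1"}
    new_args = {"lower_col": "expr_2"}
    root_columns = {"accounts", "Feb", "Jan"}

    # Merge only when the previous outputs do not overlap the root columns
    # and do not overlap the new output names; then both sets of expressions
    # can be applied against the original (root) node in one step.
    if not (root_columns & previous_args.keys()) and \
            not (previous_args.keys() & new_args.keys()):
        merged = {**previous_args, **new_args}
    else:
        merged = new_args  # keep the assigns as separate nodes
    print(merged)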
@@ -8475,7 +8872,9 @@ class DataFrame():
  _Validators._validate_column_exists_in_dataframe(keys, self._metaexpr)
 
  try:
- new_index_list = self._index_label if self._index_label is not None else []
+
+ # Slicing creates a new list instance with the same contents.
+ new_index_list = self._index_label[:] if self._index_label is not None else []
 
  # Creating a list with requested index labels bases on append
  if append:
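The `[:]` slice in the hunk above matters because binding the attribute directly would alias the DataFrame's own index-label list, so later appends would mutate it in place. A quick plain-Python illustration:

    index_label = ["id"]
    alias = index_label        # same object: appending via alias also changes index_label
    copy = index_label[:]      # new list with the same contents
    copy.append("ts")
    print(index_label)                                # ['id'] - unchanged
    print(alias is index_label, copy is index_label)  # True False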
@@ -8490,7 +8889,7 @@ class DataFrame():
  new_index_list = keys
 
  # Takes care of appending already existing index
- new_index_list = list(set(new_index_list))
+ new_index_list = list(dict.fromkeys(new_index_list))
 
  # In case requested index is same as existing index, return same DF
  if new_index_list == self._index_label:
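The switch from set() to dict.fromkeys() above still removes duplicates but keeps the order in which the index columns were requested, which the subsequent equality check against the existing index relies on. For example:

    keys = ["buoyid", "salinity", "buoyid"]
    print(list(set(keys)))            # order is arbitrary, e.g. ['salinity', 'buoyid']
    print(list(dict.fromkeys(keys)))  # ['buoyid', 'salinity'] - first-seen order preserved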
@@ -9373,15 +9772,15 @@ class DataFrame():
  TypeError, ValueError, TeradataMLException
 
  EXAMPLES:
- >>> # Load the example datasets.
- ... load_example_data("dataframe", ["ocean_buoys", "ocean_buoys_nonpti"])
+ # Load the example datasets.
+ >>> load_example_data("dataframe", ["ocean_buoys", "ocean_buoys_nonpti"])
  >>>
 
- >>> # Create the required DataFrames.
- ... # DataFrame on non-sequenced PTI table
- ... ocean_buoys = DataFrame("ocean_buoys")
- >>> # Check DataFrame columns and let's peek at the data
- ... ocean_buoys.columns
+ # Create the required DataFrames.
+ # DataFrame on non-sequenced PTI table
+ >>> ocean_buoys = DataFrame("ocean_buoys")
+ # Check DataFrame columns and let's peek at the data
+ >>> ocean_buoys.columns
  ['buoyid', 'TD_TIMECODE', 'temperature', 'salinity']
  >>> ocean_buoys.head()
  TD_TIMECODE temperature salinity
@@ -9397,10 +9796,10 @@ class DataFrame():
  0 2014-01-06 08:00:00.000000 10.0 55
  0 2014-01-06 08:10:00.000000 10.0 55
 
- >>> # DataFrame on NON-PTI table
- ... ocean_buoys_nonpti = DataFrame("ocean_buoys_nonpti")
- >>> # Check DataFrame columns and let's peek at the data
- ... ocean_buoys_nonpti.columns
+ # DataFrame on NON-PTI table
+ >>> ocean_buoys_nonpti = DataFrame("ocean_buoys_nonpti")
+ # Check DataFrame columns and let's peek at the data
+ >>> ocean_buoys_nonpti.columns
  ['buoyid', 'timecode', 'temperature', 'salinity']
  >>> ocean_buoys_nonpti.head()
  buoyid temperature salinity
@@ -9974,6 +10373,15 @@ class DataFrame():
  # If user did not pass any arguments which form join conditions,
  # Merge is performed using index columns of TeradataML DataFrames
  if on is None and left_on is None and right_on is None and not use_index:
+ # DataFrames created on OTF table will not have index.
+ if self._datalake is not None or right._datalake is not None:
+ msg_code = MessageCodes.EXECUTION_FAILED
+ emsg = "Either 'on' argument or both 'left_on' and 'right_on' arguments" \
+ " must be provided to merge DataFrames when they are created on" \
+ " OTF table(s)."
+ error_msg = Messages.get_message(msg_code, "merge dataframes", emsg)
+ raise TeradataMlException(error_msg, msg_code)
+
  if self._index_label is None or right._index_label is None:
  raise TeradataMlException(
  Messages.get_message(MessageCodes.TDMLDF_INDEXES_ARE_NONE), MessageCodes.TDMLDF_INDEXES_ARE_NONE)
@@ -9981,6 +10389,12 @@ class DataFrame():
  use_index = True
 
  if use_index:
+ if self._datalake is not None or right._datalake is not None:
+ msg_code = MessageCodes.EXECUTION_FAILED
+ emsg = "Can not use Index to merge DataFrames when they are created on OTF table(s)."
+ error_msg = Messages.get_message(msg_code, "merge dataframes", emsg)
+ raise TeradataMlException(error_msg, msg_code)
+
  if self._index_label is None or right._index_label is None:
  raise TeradataMlException(
  Messages.get_message(MessageCodes.TDMLDF_INDEXES_ARE_NONE), MessageCodes.TDMLDF_INDEXES_ARE_NONE)
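Together, the two hunks above mean that DataFrames backed by OTF (datalake) tables have no index to fall back on, so merge() needs explicit join columns. A hedged usage sketch (the DataFrames and the column name are illustrative; assume otf_df was created on an OTF table and other_df on a regular table):

    >>> # Explicit join columns work as usual.
    >>> otf_df.merge(right=other_df, on="store_id", how="inner")
    >>> # Falling back to the index (implicitly, or via use_index=True) now raises
    >>> # a TeradataMlException for OTF-backed DataFrames, per the checks above.
    >>> otf_df.merge(right=other_df, use_index=True)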
@@ -10636,7 +11050,7 @@ class DataFrame():
  2. seed is supported for stratify column.
  3. Arguments "stratify_column", "seed", "id_column" are supported only
  for stratifying the data.
- Types: str
+ Types: str OR Feature
 
  seed:
  Optional Argument.
@@ -10662,7 +11076,7 @@ class DataFrame():
  for stratifying the data.
  2. "id_column" is supported only when "stratify_column" is used.
  Ignored otherwise.
- Types: str
+ Types: str OR Feature
 
  RETURNS:
  teradataml DataFrame
@@ -11191,6 +11605,10 @@ class DataFrame():
  DESCRIPTION:
  Function to apply a user defined function to each row in the
  teradataml DataFrame, leveraging Vantage's Script Table Operator.
+ Notes:
+ 1. The function requires the same Python version in both the Vantage and the local environment.
+ 2. Teradata recommends using the "dill" package with the same version in both the Vantage and
+ the local environment.
 
  PARAMETERS:
  user_function:
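A small, self-contained way to check the local side of the version parity that the notes above ask for; the corresponding Python and dill versions installed in Vantage would be compared against these values (plain Python, not teradataml API):

    import sys
    import dill

    # Local interpreter and dill versions to compare with the Vantage environment
    # before calling map_row()/map_partition().
    print("python:", ".".join(map(str, sys.version_info[:3])))
    print("dill  :", dill.__version__)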
@@ -11371,6 +11789,15 @@ class DataFrame():
  Default Value: True
  Types: bool
 
+ debug:
+ Optional Argument.
+ Specifies whether to display the path of the script file generated during function execution.
+ This argument helps in debugging failures during function execution. When set to True,
+ the function displays the path of the script and does not remove the file from the local
+ file system. Otherwise, the file is removed from the local file system.
+ Default Value: False
+ Types: bool
+
  RETURNS:
  1. teradataml DataFrame if exec_mode is "IN-DB".
  2. Pandas DataFrame if exec_mode is "LOCAL".
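A hedged usage sketch of the new "debug" argument documented above (the user function and DataFrame are illustrative, not taken from the original examples):

    >>> def increase_by_one(row):
    ...     row['Feb'] = row['Feb'] + 1
    ...     return row
    >>> # Keep the generated script on the local file system and print its path,
    >>> # which helps when the Script Table Operator run fails.
    >>> df.map_row(increase_by_one, debug=True)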
@@ -11523,6 +11950,7 @@ class DataFrame():
  sort_ascending = kwargs.pop('sort_ascending', True)
  auth = kwargs.pop('auth', None)
  charset = kwargs.pop('charset', None)
+ debug = kwargs.pop('debug', False)
 
  # Check for other extra/unknown arguments.
  unknown_args = list(kwargs.keys())
@@ -11541,7 +11969,7 @@ class DataFrame():
  sort_ascending=sort_ascending,
  returns=returns, delimiter=delimiter,
  quotechar=quotechar, auth=auth,
- charset=charset, num_rows=num_rows)
+ charset=charset, num_rows=num_rows, debug=debug)
 
  return tbl_op_util.execute()
 
@@ -11558,6 +11986,10 @@ class DataFrame():
  DESCRIPTION:
  Function to apply a user defined function to a group or partition of rows
  in the teradataml DataFrame, leveraging Vantage's Script Table Operator.
+ Notes:
+ 1. The function requires the same Python version in both the Vantage and the local environment.
+ 2. Teradata recommends using the "dill" package with the same version in both the Vantage and
+ the local environment.
 
  PARAMETERS:
  user_function:
@@ -11768,6 +12200,15 @@ class DataFrame():
  Default Value: True
  Types: bool
 
+ debug:
+ Optional Argument.
+ Specifies whether to display the path of the script file generated during function execution.
+ This argument helps in debugging failures during function execution. When set to True,
+ the function displays the path of the script and does not remove the file from the local
+ file system. Otherwise, the file is removed from the local file system.
+ Default Value: False
+ Types: bool
+
  RETURNS:
  1. teradataml DataFrame if exec_mode is "IN-DB".
  2. Pandas DataFrame if exec_mode is "LOCAL".
@@ -11933,6 +12374,7 @@ class DataFrame():
  sort_ascending = kwargs.pop('sort_ascending', True)
  auth = kwargs.pop('auth', None)
  charset = kwargs.pop('charset', None)
+ debug = kwargs.pop('debug', False)
 
  # Check for other extra/unknown arguments.
  unknown_args = list(kwargs.keys())
@@ -11951,7 +12393,7 @@ class DataFrame():
  sort_ascending=sort_ascending,
  returns=returns, delimiter=delimiter,
  quotechar=quotechar, auth=auth,
- charset=charset, num_rows=num_rows)
+ charset=charset, num_rows=num_rows, debug=debug)
 
  return tbl_op_util.execute()
 
@@ -11968,9 +12410,9 @@ class DataFrame():
  teradataml DataFrame, leveraging Apply Table Operator of Open
  Analytics Framework.
  Notes:
- 1. The function requires dill package with same version in both remote environment
- and local environment.
- 2. Teradata recommends to use same Python version in both remote and local environment.
+ 1. The function requires the same Python version in both the remote and the local environment.
+ 2. Teradata recommends using the "dill" package with the same version in both the remote and
+ the local environment.
 
  PARAMETERS:
  user_function:
@@ -12153,6 +12595,15 @@ class DataFrame():
  Default value: "csv"
  Types: str
 
+ debug:
+ Optional Argument.
+ Specifies whether to display the path of the script file generated during function execution.
+ This argument helps in debugging failures during function execution. When set to True,
+ the function displays the path of the script and does not remove the file from the local
+ file system. Otherwise, the file is removed from the local file system.
+ Default Value: False
+ Types: bool
+
  RETURNS:
  teradataml DataFrame.
 
@@ -12329,6 +12780,7 @@ class DataFrame():
  is_local_order = kwargs.pop('is_local_order', False)
  nulls_first = kwargs.pop('nulls_first', True)
  sort_ascending = kwargs.pop('sort_ascending', True)
+ debug = kwargs.pop('debug', False)
 
  # Check for other extra/unknown arguments.
  unknown_args = list(kwargs.keys())
@@ -12351,7 +12803,8 @@ class DataFrame():
  charset=None,
  num_rows=num_rows,
  env_name=env_name,
- style=style)
+ style=style,
+ debug=debug)
 
  return tbl_op_util.execute()
 
@@ -12696,8 +13149,8 @@ class DataFrame():
  _Validators._validate_column_exists_in_dataframe(column_names, self._metaexpr,
  False)
  column_names = list(dict.fromkeys(column_names))
-
- if list_td_reserved_keywords(column_names):
+
+ if list_td_reserved_keywords(column_names) or UtilFuncs._is_ascii(column_names):
  column_names = UtilFuncs._teradata_quote_arg(column_names, "\"", False)
 
  col_names_types = df_utils._get_required_columns_types_from_metaexpr(self._metaexpr, column_names)
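The hunk above widens the condition under which column names are double-quoted before being embedded in SQL. The quoting itself is essentially identifier escaping; a plain-Python illustration of the effect (not the teradataml helper itself):

    def quote_identifier(name: str) -> str:
        # Double any embedded quotes, then wrap the whole name in double quotes,
        # e.g. month -> "month", so reserved keywords still parse as column names.
        return '"' + name.replace('"', '""') + '"'

    print(quote_identifier("month"))   # "month" - MONTH is a Teradata reserved keyword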
@@ -14617,7 +15070,18 @@ class DataFrame():
  >>> plot.show()
 
  """
- return _Plot(x=x, y=y, scale=scale, kind=kind, **kwargs)
+
+ _plot = _Plot(x=x, y=y, scale=scale, kind=kind, **kwargs)
+ # If plot is already generated, return the same plot.
+ if self._plot is None:
+ self._plot = _plot
+ return _plot
+
+ if self._plot == _plot:
+ return self._plot
+ else:
+ self._plot = _plot
+ return _plot
 
  @collect_queryband(queryband="DF_itertuples")
  def itertuples(self, name='Row', num_rows=None):
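The hunk above memoizes the last plot built on the DataFrame and hands back the cached object when an equivalent plot is requested again. The same pattern in a stripped-down form (assumes the plot object supports value-based equality, as the comparison in the hunk implies):

    class PlotCache:
        def __init__(self):
            self._last = None

        def get(self, new_plot):
            # Reuse the cached plot when an equal one was already built;
            # otherwise remember the new plot and return it.
            if self._last is not None and self._last == new_plot:
                return self._last
            self._last = new_plot
            return new_plot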
@@ -15057,7 +15521,7 @@ class DataFrameGroupBy(DataFrame):
  from sqlalchemy.sql.functions import Function
  return (type(None), int, float, str, decimal.Decimal, Function, ColumnExpression, ClauseElement)
 
- def _generate_assign_metaexpr_aed_nodeid(self, drop_columns, **kwargs):
+ def _generate_assign_metaexpr_aed_nodeid(self, drop_columns, node_id, **kwargs):
  """
  DESCRIPTION:
  Function generates the MetaExpression and AED nodeid for DataFrameGroupBy.assign()
@@ -15070,6 +15534,11 @@ class DataFrameGroupBy(DataFrame):
  and grouping columns are returned. This is unused argument.
  Types: bool
 
+ node_id:
+ Optional Argument.
+ Specifies the input nodeid for the assign operation. This is unused argument.
+ Types: str
+
  kwargs:
  keyword, value pairs
  - keywords are the column names.
@@ -17510,11 +17979,18 @@ class _TDUAF(DataFrame):
  table_name = self._db_utils._execute_node_return_db_object_name(self._data._nodeid, self._data._metaexpr)
 
  # UAF Functions do not accept double quotes.
+ tdp = preparer(td_dialect)
  db_name = UtilFuncs._extract_db_name(table_name)
- if db_name:
- table_name = '"{}"."{}"'.format(db_name, UtilFuncs._extract_table_name(table_name))
+ datalake_name = UtilFuncs._extract_datalake_name(table_name)
+ if datalake_name:
+ table_name = '{}.{}.{}'.format(tdp.quote(datalake_name),
+ tdp.quote(db_name),
+ tdp.quote(UtilFuncs._extract_table_name(table_name)))
+ elif db_name:
+ table_name = '{}.{}'.format(tdp.quote(db_name),
+ tdp.quote(UtilFuncs._extract_table_name(table_name)))
  else:
- table_name = UtilFuncs._extract_table_name(table_name)
+ table_name = tdp.quote(UtilFuncs._extract_table_name(table_name))
 
  sql_clauses.append("TABLE_NAME ({})")
  sql_values.append(table_name)
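The hunk above assembles a fully qualified datalake.database.table name, quoting each part only when the dialect requires it. Roughly the same behavior can be seen with SQLAlchemy's identifier preparer directly (a generic dialect is used here purely for illustration; it is not the td_dialect object from the source):

    from sqlalchemy.engine import default

    # quote() adds double quotes only for identifiers that need them
    # (reserved words, spaces, mixed case) and leaves plain names alone.
    tdp = default.DefaultDialect().identifier_preparer
    parts = ["my_datalake", "sales_db", "Orders 2024"]
    print(".".join(tdp.quote(p) for p in parts))   # my_datalake.sales_db."Orders 2024"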