teradataml 20.0.0.3__py3-none-any.whl → 20.0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of teradataml might be problematic.

Files changed (84)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/README.md +119 -0
  3. teradataml/_version.py +1 -1
  4. teradataml/analytics/analytic_function_executor.py +18 -6
  5. teradataml/analytics/byom/__init__.py +1 -1
  6. teradataml/analytics/sqle/__init__.py +4 -1
  7. teradataml/analytics/valib.py +18 -4
  8. teradataml/automl/__init__.py +51 -6
  9. teradataml/automl/data_preparation.py +56 -33
  10. teradataml/automl/data_transformation.py +58 -33
  11. teradataml/automl/feature_engineering.py +12 -5
  12. teradataml/automl/model_training.py +34 -13
  13. teradataml/common/__init__.py +1 -2
  14. teradataml/common/constants.py +64 -40
  15. teradataml/common/messagecodes.py +13 -3
  16. teradataml/common/messages.py +4 -1
  17. teradataml/common/sqlbundle.py +40 -10
  18. teradataml/common/utils.py +113 -39
  19. teradataml/common/warnings.py +11 -0
  20. teradataml/context/context.py +141 -17
  21. teradataml/data/amazon_reviews_25.csv +26 -0
  22. teradataml/data/byom_example.json +11 -0
  23. teradataml/data/docs/byom/docs/DataRobotPredict.py +2 -2
  24. teradataml/data/docs/byom/docs/DataikuPredict.py +40 -1
  25. teradataml/data/docs/byom/docs/H2OPredict.py +2 -2
  26. teradataml/data/docs/byom/docs/ONNXEmbeddings.py +242 -0
  27. teradataml/data/docs/byom/docs/ONNXPredict.py +2 -2
  28. teradataml/data/docs/byom/docs/PMMLPredict.py +2 -2
  29. teradataml/data/docs/sqle/docs_17_20/Shap.py +28 -6
  30. teradataml/data/docs/uaf/docs_17_20/DWT2D.py +4 -1
  31. teradataml/data/hnsw_alter_data.csv +5 -0
  32. teradataml/data/hnsw_data.csv +10 -0
  33. teradataml/data/jsons/byom/h2opredict.json +1 -1
  34. teradataml/data/jsons/byom/onnxembeddings.json +266 -0
  35. teradataml/data/jsons/sqle/17.20/TD_Shap.json +0 -1
  36. teradataml/data/jsons/sqle/20.00/TD_HNSW.json +296 -0
  37. teradataml/data/jsons/sqle/20.00/TD_HNSWPredict.json +206 -0
  38. teradataml/data/jsons/sqle/20.00/TD_HNSWSummary.json +32 -0
  39. teradataml/data/jsons/sqle/20.00/TD_KMeans.json +2 -2
  40. teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +1 -1
  41. teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +5 -5
  42. teradataml/data/teradataml_example.json +8 -0
  43. teradataml/data/vectordistance_example.json +1 -1
  44. teradataml/dataframe/copy_to.py +8 -3
  45. teradataml/dataframe/data_transfer.py +11 -1
  46. teradataml/dataframe/dataframe.py +517 -121
  47. teradataml/dataframe/dataframe_utils.py +152 -20
  48. teradataml/dataframe/functions.py +26 -11
  49. teradataml/dataframe/setop.py +11 -6
  50. teradataml/dataframe/sql.py +2 -2
  51. teradataml/dbutils/dbutils.py +525 -129
  52. teradataml/hyperparameter_tuner/optimizer.py +12 -1
  53. teradataml/opensource/{sklearn/_sklearn_wrapper.py → _base.py} +317 -1011
  54. teradataml/opensource/_class.py +141 -17
  55. teradataml/opensource/{constants.py → _constants.py} +7 -3
  56. teradataml/opensource/_lightgbm.py +52 -53
  57. teradataml/opensource/_sklearn.py +1008 -0
  58. teradataml/opensource/_wrapper_utils.py +5 -5
  59. teradataml/options/__init__.py +47 -15
  60. teradataml/options/configure.py +103 -25
  61. teradataml/options/display.py +13 -2
  62. teradataml/plot/axis.py +47 -8
  63. teradataml/plot/figure.py +33 -0
  64. teradataml/plot/plot.py +63 -13
  65. teradataml/scriptmgmt/UserEnv.py +2 -2
  66. teradataml/scriptmgmt/lls_utils.py +63 -26
  67. teradataml/store/__init__.py +1 -2
  68. teradataml/store/feature_store/feature_store.py +102 -7
  69. teradataml/table_operators/Apply.py +32 -18
  70. teradataml/table_operators/Script.py +3 -1
  71. teradataml/table_operators/TableOperator.py +3 -1
  72. teradataml/utils/dtypes.py +47 -0
  73. teradataml/utils/internal_buffer.py +18 -0
  74. teradataml/utils/validators.py +68 -9
  75. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.4.dist-info}/METADATA +123 -2
  76. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.4.dist-info}/RECORD +79 -75
  77. teradataml/data/SQL_Fundamentals.pdf +0 -0
  78. teradataml/libaed_0_1.dylib +0 -0
  79. teradataml/libaed_0_1.so +0 -0
  80. teradataml/opensource/sklearn/__init__.py +0 -0
  81. teradataml/store/vector_store/__init__.py +0 -1586
  82. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.4.dist-info}/WHEEL +0 -0
  83. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.4.dist-info}/top_level.txt +0 -0
  84. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.4.dist-info}/zip-safe +0 -0
@@ -20,6 +20,9 @@ import re
20
20
  import sqlalchemy
21
21
  import sys
22
22
  import urllib.parse
23
+
24
+ from sqlalchemy import Column
25
+
23
26
  import teradataml.context.context as tdmlctx
24
27
 
25
28
  from collections import OrderedDict, namedtuple
@@ -31,6 +34,7 @@ from teradataml.dataframe.sql_interfaces import ColumnExpression
31
34
  from teradataml.dataframe.sql_functions import case
32
35
  from teradataml.series.series import Series
33
36
  from teradatasqlalchemy.types import _TDType, BIGINT, INTEGER, PERIOD_TIMESTAMP, SMALLINT, BYTEINT, FLOAT, DECIMAL
37
+ from teradataml.common.deprecations import argument_deprecation
34
38
  from teradataml.common.utils import UtilFuncs
35
39
  from teradataml.common.exceptions import TeradataMlException
36
40
  from teradataml.common.messages import Messages
@@ -42,6 +46,7 @@ from teradataml.dataframe.dataframe_utils import DataFrameUtils as df_utils, Dat
42
46
  from teradataml.dataframe.indexer import _LocationIndexer
43
47
  from teradataml.common.aed_utils import AedUtils
44
48
  from teradataml.options.display import display
49
+ from teradataml.options.configure import configure
45
50
  from teradataml.dataframe.copy_to import copy_to_sql
46
51
  from teradataml.dataframe.row import _Row
47
52
  from teradataml.dataframe.setop import concat
@@ -63,7 +68,79 @@ from teradataml.common.constants import OutputStyle
63
68
 
64
69
  # TODO use logger when available on master branch
65
70
  # logger = teradatapylog.getLogger()
66
- in_schema = UtilFuncs._in_schema
71
+
72
+ class in_schema:
73
+ """
74
+ Class takes a schema name, a table name and datalake name attributes
75
+ and creates an object that can be passed to DataFrame.
76
+ Note:
77
+ teradataml recommends using this class to access table(s)/view(s)
78
+ from a database other than the default database.
79
+ """
80
+ def __init__(self, schema_name, table_name, datalake_name=None):
81
+ """
82
+ Constructor for in_schema class.
83
+
84
+ PARAMETERS:
85
+ schema_name:
86
+ Required Argument.
87
+ Specifies the schema where the table resides in.
88
+ Types: str
89
+
90
+ table_name:
91
+ Required Argument.
92
+ Specifies the table name or view name in Vantage.
93
+ Types: str
94
+
95
+ datalake_name:
96
+ Optional Argument.
97
+ Specifies the datalake name.
98
+ Types: str
99
+
100
+ EXAMPLES:
101
+ from teradataml.dataframe.dataframe import in_schema, DataFrame
102
+
103
+ # Example 1: The following example creates a DataFrame from the
104
+ # existing Vantage table "dbcinfo" in the non-default
105
+ # database "dbc" using the in_schema instance.
106
+ df = DataFrame(in_schema("dbc", "dbcinfo"))
107
+
108
+ # Example 2: The following example uses from_table() function, existing
109
+ # Vantage table "dbcinfo" and non-default database "dbc" to
110
+ # create a teradataml DataFrame.
111
+ df = DataFrame.from_table(in_schema("dbc","dbcinfo"))
112
+
113
+ # Example 3: The following example uses "in_schema" object created
114
+ # with "datalake_name" argument to create DataFrame on OTF table.
115
+ otf_df = DataFrame(in_schema("datalake_db","datalake_table","datalake"))
116
+
117
+ """
118
+ self.schema_name = schema_name
119
+ self.table_name = table_name
120
+ self.datalake_name = datalake_name
121
+
122
+ awu_matrix = []
123
+ awu_matrix.append(["schema_name", schema_name, False, (str), True])
124
+ awu_matrix.append(["table_name", table_name, False, (str), True])
125
+ awu_matrix.append(["datalake_name", datalake_name, True, (str), True])
126
+
127
+ # Validate argument types
128
+ _Validators._validate_function_arguments(awu_matrix)
129
+
130
+ def __str__(self):
131
+ """
132
+ Returns the string representation of in_schema instance.
133
+ """
134
+ tbl_name = '{}.{}'.format(UtilFuncs._teradata_quote_arg(self.schema_name, "\"", False),
135
+ UtilFuncs._teradata_quote_arg(self.table_name, "\"", False))
136
+
137
+ if not self.datalake_name:
138
+ return tbl_name
139
+
140
+ return '{}.{}'.format(UtilFuncs._teradata_quote_arg(self.datalake_name, "\"", False), tbl_name)
141
+
142
+
143
+ in_schema = in_schema
67
144
 
68
145
 
69
146
  class DataFrame():
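The in_schema class above now carries an optional datalake name, so the same DataFrame constructor covers both regular Vantage tables and OTF (datalake) tables. A minimal sketch using the placeholder names from the docstring examples ("dbc"/"dbcinfo" and "datalake_db"/"datalake_table"/"datalake"):

>>> from teradataml.dataframe.dataframe import DataFrame, in_schema
>>> # Table or view in a non-default database.
>>> df = DataFrame(in_schema("dbc", "dbcinfo"))
>>> # OTF table in a datalake; __str__ renders it as "datalake"."datalake_db"."datalake_table".
>>> otf_df = DataFrame(in_schema("datalake_db", "datalake_table", "datalake"))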
@@ -166,6 +243,19 @@ class DataFrame():
166
243
  # Property to determine if table is an ART table or not.
167
244
  self._is_art = None
168
245
 
246
+ self._datalake = None
247
+ self._database = None
248
+ self._table = None
249
+ self._otf = False
250
+
251
+ if isinstance(table_name, in_schema):
252
+ self._table = table_name.table_name
253
+ self._datalake = table_name.datalake_name
254
+ self._database = table_name.schema_name
255
+ self._otf = True if self._datalake else False
256
+
257
+ table_name = str(table_name) if isinstance(table_name, in_schema) else table_name
258
+
169
259
  # Below matrix is list of list, where in each row contains following elements:
170
260
  # Let's take an example of following, just to get an idea:
171
261
  # [element1, element2, element3, element4, element5, element6]
@@ -198,25 +288,45 @@ class DataFrame():
198
288
  self._source_type = SourceType.TABLE.value
199
289
  self._nodeid = self._aed_utils._aed_table(self._table_name)
200
290
  elif query is not None:
291
+ query = query.strip()
292
+ query = query[:-1] if query[-1] == ";" else query
293
+
201
294
  self._query = query
202
295
  self._source_type = SourceType.QUERY.value
203
296
 
204
- if materialize:
205
- # If user requests to materialize the the query, then we should create a
297
+ temp_obj_params = {
298
+ "prefix": "_frmqry_v",
299
+ "use_default_database": True,
300
+ "quote": False
301
+ }
302
+ __execute = UtilFuncs._create_view
303
+
304
+ if configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE:
305
+ # If user requests volatile tables as temporary objects, then we should
306
+ # create a volatile table instead of a view.
307
+ # Volatile table does not need to be added to the GC.
308
+ temp_obj_params["table_type"] = TeradataConstants.TERADATA_VOLATILE_TABLE
309
+ temp_obj_params["gc_on_quit"] = False
310
+ temp_obj_params["prefix"] = "_frmqry_vt"
311
+ __execute = UtilFuncs._create_table
312
+
313
+ elif materialize:
314
+ # If user requests to materialize the query, then we should create a
206
315
  # table instead of view and add the same in the GarbageCollector.
207
- temp_table_name = UtilFuncs._generate_temp_table_name(prefix="_frmqry_t", use_default_database=True,
208
- quote=False,
209
- table_type=TeradataConstants.TERADATA_TABLE)
210
- else:
211
- temp_table_name = UtilFuncs._generate_temp_table_name(prefix="_frmqry_v", use_default_database=True,
212
- quote=False)
316
+ temp_obj_params["table_type"] = TeradataConstants.TERADATA_TABLE
317
+ temp_obj_params["gc_on_quit"] = True
318
+ temp_obj_params["prefix"] = "_frmqry_t"
319
+ __execute = UtilFuncs._create_table
213
320
 
321
+ temp_table_name = UtilFuncs._generate_temp_table_name(**temp_obj_params)
214
322
  self._table_name = temp_table_name
323
+ __execute_params = (self._table_name, self._query)
324
+
325
+ if configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE:
326
+ __execute_params = (self._table_name, self._query, True)
327
+
215
328
  try:
216
- if materialize:
217
- UtilFuncs._create_table(self._table_name, self._query)
218
- else:
219
- UtilFuncs._create_view(self._table_name, self._query)
329
+ __execute(*__execute_params)
220
330
  except OperationalError as oe:
221
331
  if "[Error 3707] Syntax error" in str(oe):
222
332
  raise ValueError(Messages.get_message(
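Construction from a query now funnels both paths through one __execute callable: a view by default, a GC-registered table when materialize=True, or a volatile table when configure.temp_object_type asks for one. A sketch of the user-visible effect, assuming temp_object_type accepts the TeradataConstants value it is compared against (the exact accepted values live in options/configure.py):

>>> from teradataml import DataFrame
>>> from teradataml.options.configure import configure
>>> from teradataml.common.constants import TeradataConstants
>>> # Assumption: with this set, from_query backs the result with a volatile
>>> # "_frmqry_vt" table instead of a "_frmqry_v" view, and skips the GarbageCollector.
>>> configure.temp_object_type = TeradataConstants.TERADATA_VOLATILE_TABLE
>>> df = DataFrame.from_query("SELECT * FROM sales WHERE Jan > 150;")  # trailing ';' is stripped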
@@ -245,6 +355,9 @@ class DataFrame():
245
355
  self.__data = None
246
356
  self.__data_columns = None
247
357
  self._alias = None
358
+ self._plot = None
359
+
360
+ self._eda_ui = None
248
361
 
249
362
  except TeradataMlException:
250
363
  raise
@@ -334,7 +447,9 @@ class DataFrame():
334
447
  _Validators._validate_function_arguments(arg_info_matrix)
335
448
  try:
336
449
  alias_df = self._from_node(self._nodeid, self._metaexpr, self._index_label,
337
- reuse_metaexpr=False)
450
+ reuse_metaexpr=False, _datalake=self._datalake,
451
+ _database=self._database, _table=self._table,
452
+ _otf=self._otf)
338
453
  # Assigning self attributes to newly created alias dataframe.
339
454
  alias_df._table_name = self._table_name
340
455
  alias_df._index = self._index
@@ -350,7 +465,8 @@ class DataFrame():
350
465
 
351
466
  @classmethod
352
467
  @collect_queryband(queryband="DF_fromTable")
353
- def from_table(cls, table_name, index=True, index_label=None):
468
+ def from_table(cls, table_name, index=True, index_label=None,
469
+ schema_name=None, datalake_name=None):
354
470
  """
355
471
  Class method for creating a DataFrame from a table or a view.
356
472
 
@@ -371,30 +487,48 @@ class DataFrame():
371
487
  Column/s used for sorting.
372
488
  Types: str
373
489
 
490
+ schema_name:
491
+ Optional Argument.
492
+ Specifies the schema where the table resides.
493
+ Types: str
494
+
495
+ datalake_name:
496
+ Optional Argument.
497
+ Specifies the datalake name.
498
+ Types: str
499
+
374
500
  EXAMPLES:
375
- from teradataml.dataframe.dataframe import DataFrame
501
+ >>> from teradataml.dataframe.dataframe import DataFrame
376
502
 
377
503
  # Example 1: The following example creates a DataFrame from a table or
378
504
  a view.
379
505
  # Load the example data.
380
- load_example_data("dataframe","sales")
506
+ >>> load_example_data("dataframe","sales")
381
507
 
382
508
  # Create DataFrame from table
383
- df = DataFrame.from_table('sales')
509
+ >>> df = DataFrame.from_table('sales')
384
510
 
385
511
  # Create DataFrame from table and without index column sorting.
386
- df = DataFrame.from_table("sales", False)
512
+ >>> df = DataFrame.from_table("sales", False)
387
513
 
388
514
  # Create DataFrame from table and sorting using the 'accounts'
389
515
  # column.
390
- df = DataFrame.from_table("sales", True, "accounts")
516
+ >>> df = DataFrame.from_table("sales", True, "accounts")
391
517
 
392
518
  # Example 2: The following example creates a DataFrame from existing Vantage
393
519
  # table "dbcinfo" in the non-default database "dbc" using the
394
520
  # in_schema() function.
395
521
 
396
- from teradataml.dataframe.dataframe import in_schema
397
- df = DataFrame.from_table(in_schema("dbc", "dbcinfo"))
522
+ >>> from teradataml.dataframe.dataframe import in_schema
523
+ >>> df = DataFrame.from_table(in_schema("dbc", "dbcinfo"))
524
+
525
+ # Example 3: Create a DataFrame on existing DataLake
526
+ # table "lake_table" in the "datalake_database" database
527
+ # in "datalake" datalake.
528
+
529
+ >>> datalake_df = DataFrame.from_table(table_name="lake_table",
530
+ ... schema_name="datalake_database",
531
+ ... datalake_name="datalake" )
398
532
 
399
533
  RETURNS:
400
534
  DataFrame
@@ -403,6 +537,9 @@ class DataFrame():
403
537
  TeradataMlException - TDMLDF_CREATE_FAIL
404
538
 
405
539
  """
540
+ if schema_name:
541
+ return cls(in_schema(schema_name, table_name, datalake_name))
542
+
406
543
  return cls(table_name, index, index_label)
407
544
 
408
545
  @classmethod
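When schema_name is given, from_table simply wraps the arguments in in_schema, so the two spellings below should be equivalent (placeholder names from Example 3 of the docstring):

>>> df1 = DataFrame.from_table(table_name="lake_table",
...                            schema_name="datalake_database",
...                            datalake_name="datalake")
>>> df2 = DataFrame(in_schema("datalake_database", "lake_table", "datalake"))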
@@ -462,7 +599,7 @@ class DataFrame():
462
599
  return cls(index=index, index_label=index_label, query=query, materialize=materialize)
463
600
 
464
601
  @classmethod
465
- def _from_node(cls, nodeid, metaexpr, index_label=None, undropped_index=None, reuse_metaexpr=True):
602
+ def _from_node(cls, nodeid, metaexpr, index_label=None, undropped_index=None, reuse_metaexpr=True, **kwargs):
466
603
  """
467
604
  Private class method for creating a DataFrame from a nodeid and parent metadata.
468
605
 
@@ -543,6 +680,11 @@ class DataFrame():
543
680
  in [col.name for col in df._metaexpr.c] for elem in undropped_index):
544
681
  df._undropped_index = undropped_index
545
682
 
683
+ # Populate remaining attributes.
684
+ for arg in kwargs:
685
+ # Pop each argument from kwargs and assign to new DataFrame.
686
+ arg_value = kwargs.get(arg)
687
+ df.__setattr__(arg, arg_value)
546
688
  return df
547
689
 
548
690
  def create_temp_view(self, name):
@@ -670,9 +812,10 @@ class DataFrame():
670
812
  return self
671
813
 
672
814
  @collect_queryband(queryband="DF_fillna")
673
- def fillna(self, value=None, columns=None, literal_value=False):
815
+ def fillna(self, value=None, columns=None, literal_value=False, partition_column=None):
674
816
  """
675
- Method to replace the null values in a column with the value specified.
817
+ DESCRIPTION:
818
+ Method to replace the null values in a column with the value specified.
676
819
 
677
820
  PARAMETERS:
678
821
  value:
@@ -705,6 +848,12 @@ class DataFrame():
705
848
  Default Value: False
706
849
  Types: bool
707
850
 
851
+ partition_column:
852
+ Optional Argument.
853
+ Specifies the column name to partition the data.
854
+ Default Value: None
855
+ Types: str
856
+
708
857
  RETURNS:
709
858
  teradataml DataFrame
710
859
 
@@ -745,6 +894,26 @@ class DataFrame():
745
894
  3 Blue Inc 90.0 50 95.0 101.0 17/01/04
746
895
  4 Alpha Co 210.0 200 215.0 250.0 17/01/04
747
896
  5 Orange Inc 210.0 50 NaN 250.0 17/01/04
897
+
898
+ # Example 3: Populate the null values in 'pclass' and
899
+ # 'fare' columns with the mean value, using
900
+ # 'sex' as the partition column.
901
+ # Load the example data.
902
+ >>> load_example_data("teradataml", ["titanic"])
903
+ >>> df = DataFrame.from_table("titanic")
904
+
905
+ >>> df.fillna(value="mean", columns=["pclass", "fare"], partition_column="sex")
906
+ passenger survived pclass name sex age sibsp parch ticket fare cabin embarked
907
+ 0 284 1 3 Dorking, Mr. Edward Arthur male 19.0 0 0 A/5. 10482 8.0500 None S
908
+ 1 589 0 3 Gilinski, Mr. Eliezer male 22.0 0 0 14973 8.0500 None S
909
+ 2 17 0 3 Rice, Master. Eugene male 2.0 4 1 382652 29.1250 None Q
910
+ 3 282 0 3 Olsson, Mr. Nils Johan Goransson male 28.0 0 0 347464 7.8542 None S
911
+ 4 608 1 1 Daniel, Mr. Robert Williams male 27.0 0 0 113804 30.5000 None S
912
+ 5 404 0 3 Hakkarainen, Mr. Pekka Pietari male 28.0 1 0 STON/O2. 3101279 15.8500 None S
913
+ 6 427 1 2 Clarke, Mrs. Charles V (Ada Maria Winfield) female 28.0 1 0 2003 26.0000 None S
914
+ 7 141 0 3 Boulos, Mrs. Joseph (Sultana) female NaN 0 2 2678 15.2458 None C
915
+ 8 610 1 1 Shutes, Miss. Elizabeth W female 40.0 0 0 PC 17582 153.4625 C125 S
916
+ 9 875 1 2 Abelson, Mrs. Samuel (Hannah Wizosky) female 28.0 1 0 P/PP 3381 24.0000 None C
748
917
  """
749
918
  from teradataml import SimpleImputeFit, SimpleImputeTransform
750
919
 
@@ -752,6 +921,7 @@ class DataFrame():
752
921
  arg_info_matrix.append(["value", value, True, (int, float, str, dict, list)])
753
922
  arg_info_matrix.append(["columns", columns, True, (list, str, tuple)])
754
923
  arg_info_matrix.append(["literal_value", literal_value, True, (bool)])
924
+ arg_info_matrix.append(["partition_column", partition_column, True, (str)])
755
925
 
756
926
  # Validate argument types
757
927
  _Validators._validate_function_arguments(arg_info_matrix)
@@ -823,9 +993,15 @@ class DataFrame():
823
993
  literals=literals,
824
994
  literals_columns=literals_columns,
825
995
  stats=stats,
826
- stats_columns=stats_columns)
996
+ stats_columns=stats_columns,
997
+ partition_column=partition_column)
827
998
 
828
- return fit_obj.transform(data=self).result
999
+ impute_transform = {
1000
+ 'data': self,
1001
+ 'data_partition_column': partition_column,
1002
+ 'object_partition_column': partition_column}
1003
+
1004
+ return fit_obj.transform(**impute_transform).result
829
1005
 
830
1006
  def __execute_node_and_set_table_name(self, nodeid, metaexpr=None):
831
1007
  """
@@ -924,6 +1100,7 @@ class DataFrame():
924
1100
  self._column_names_and_types = []
925
1101
  self._td_column_names_and_types = []
926
1102
  self._td_column_names_and_sqlalchemy_types = {}
1103
+ self._column_types = {}
927
1104
 
928
1105
  for col in self._metaexpr.c:
929
1106
  if isinstance(col.type, sqlalchemy.sql.sqltypes.NullType):
@@ -931,9 +1108,11 @@ class DataFrame():
931
1108
  else:
932
1109
  tdtype = "{}".format(col.type)
933
1110
 
934
- self._column_names_and_types.append((str(col.name), UtilFuncs._teradata_type_to_python_type(col.type)))
1111
+ py_type = UtilFuncs._teradata_type_to_python_type(col.type)
1112
+ self._column_names_and_types.append((str(col.name), py_type))
935
1113
  self._td_column_names_and_types.append((str(col.name), tdtype))
936
1114
  self._td_column_names_and_sqlalchemy_types[(str(col.name)).lower()] = col.type
1115
+ self._column_types[(str(col.name)).lower()] = [py_type, col.type]
937
1116
 
938
1117
  def _get_metaexpr(self):
939
1118
  """
@@ -952,7 +1131,24 @@ class DataFrame():
952
1131
  meta = sqlalchemy.MetaData()
953
1132
  db_schema = UtilFuncs._extract_db_name(self._table_name)
954
1133
  db_table_name = UtilFuncs._extract_table_name(self._table_name)
955
- t = sqlalchemy.Table(db_table_name, meta, schema=db_schema, autoload_with=eng)
1134
+ if not self._datalake:
1135
+ t = sqlalchemy.Table(db_table_name, meta, schema=db_schema, autoload_with=eng)
1136
+ return _MetaExpression(t)
1137
+
1138
+ # Get metaexpression for datalake table.
1139
+ # check existence of datalake table.
1140
+ tdmlctx.get_connection().dialect.has_table(tdmlctx.get_connection(),
1141
+ self._table,
1142
+ schema=self._database,
1143
+ table_only=True,
1144
+ datalake=self._datalake)
1145
+
1146
+ # Extract column names and corresponding teradatasqlalchemy types.
1147
+ col_names, col_types = df_utils._get_datalake_table_columns_info(self._database,
1148
+ self._table,
1149
+ self._datalake)
1150
+ t = sqlalchemy.Table(self._table, meta, schema=self._database,
1151
+ *(Column(col_name, col_type) for col_name, col_type in zip(col_names, col_types)))
956
1152
  return _MetaExpression(t)
957
1153
 
958
1154
  def __getattr__(self, name):
@@ -2728,9 +2924,10 @@ class DataFrame():
2728
2924
  msg = Messages.get_message(errcode)
2729
2925
  raise TeradataMlException(msg, errcode)
2730
2926
 
2927
+ @argument_deprecation("20.0.0.5", "include", False, None)
2731
2928
  @collect_queryband(queryband="DF_describe")
2732
2929
  def describe(self, percentiles=[.25, .5, .75], include=None, verbose=False, distinct=False, statistics=None,
2733
- columns=None):
2930
+ columns=None, pivot=False):
2734
2931
  """
2735
2932
  DESCRIPTION:
2736
2933
  Generates statistics for numeric columns. This function can be used in two modes:
@@ -2762,12 +2959,12 @@ class DataFrame():
2762
2959
  include:
2763
2960
  Optional Argument.
2764
2961
  Values can be either None or "all".
2765
- If the value is "all", then both numeric and non-numeric columns are included.
2962
+ If the value is "all", both numeric and non-numeric columns are included.
2766
2963
  Computes count, mean, std, min, percentiles, and max for numeric columns.
2767
2964
  Computes count and unique for non-numeric columns.
2768
2965
  If the value is None, only numeric columns are used for collecting statistics.
2769
2966
  Note:
2770
- Value 'all' is not applicable for 'Time Series Aggregate Mode'.
2967
+ * Value 'all' is not applicable for 'Time Series Aggregate Mode'.
2771
2968
  Default Values: None
2772
2969
  Types: str
2773
2970
 
@@ -2807,7 +3004,14 @@ class DataFrame():
2807
3004
  Specifies the name(s) of the columns we are collecting statistics for.
2808
3005
  Default Values: None
2809
3006
  Types: str or List of str
2810
-
3007
+
3008
+ pivot:
3009
+ Optional Argument.
3010
+ Specifies a boolean value to pivot the output.
3011
+ Note:
3012
+ * "pivot" is not supported for PTI tables.
3013
+ Default Value: False
3014
+ Types: bool
2811
3015
 
2812
3016
  RETURNS:
2813
3017
  teradataml DataFrame
@@ -2829,7 +3033,7 @@ class DataFrame():
2829
3033
  Orange Inc 210.0 None None 250 04/01/2017
2830
3034
 
2831
3035
  # Computes count, mean, std, min, percentiles, and max for numeric columns.
2832
- >>> df.describe()
3036
+ >>> df.describe(pivot=True)
2833
3037
  Apr Feb Mar Jan
2834
3038
  func
2835
3039
  count 4 6 4 4
@@ -2841,8 +3045,45 @@ class DataFrame():
2841
3045
  75% 250 207.5 158.75 162.5
2842
3046
  max 250 210 215 200
2843
3047
 
3048
+ # Computes count, mean, std, min, percentiles, and max for numeric columns with
3049
+ # default arguments.
3050
+ >>> df.describe()
3051
+ ATTRIBUTE StatName StatValue
3052
+ Jan MAXIMUM 200.0
3053
+ Jan STANDARD DEVIATION 62.91528696058958
3054
+ Jan PERCENTILES(25) 125.0
3055
+ Jan PERCENTILES(50) 150.0
3056
+ Mar COUNT 4.0
3057
+ Mar MINIMUM 95.0
3058
+ Mar MAXIMUM 215.0
3059
+ Mar MEAN 147.5
3060
+ Mar STANDARD DEVIATION 49.749371855331
3061
+ Mar PERCENTILES(25) 128.75
3062
+ Mar PERCENTILES(50) 140.0
3063
+ Apr COUNT 4.0
3064
+ Apr MINIMUM 101.0
3065
+ Apr MAXIMUM 250.0
3066
+ Apr MEAN 195.25
3067
+ Apr STANDARD DEVIATION 70.97123830585646
3068
+ Apr PERCENTILES(25) 160.25
3069
+ Apr PERCENTILES(50) 215.0
3070
+ Apr PERCENTILES(75) 250.0
3071
+ Feb COUNT 6.0
3072
+ Feb MINIMUM 90.0
3073
+ Feb MAXIMUM 210.0
3074
+ Feb MEAN 166.66666666666666
3075
+ Feb STANDARD DEVIATION 59.553897157672786
3076
+ Feb PERCENTILES(25) 117.5
3077
+ Feb PERCENTILES(50) 200.0
3078
+ Feb PERCENTILES(75) 207.5
3079
+ Mar PERCENTILES(75) 158.75
3080
+ Jan PERCENTILES(75) 162.5
3081
+ Jan MEAN 137.5
3082
+ Jan MINIMUM 50.0
3083
+ Jan COUNT 4.0
3084
+
2844
3085
  # Computes count, mean, std, min, percentiles, and max for numeric columns with 30th and 60th percentiles.
2845
- >>> df.describe(percentiles=[.3, .6])
3086
+ >>> df.describe(percentiles=[.3, .6], pivot=True)
2846
3087
  Apr Feb Mar Jan
2847
3088
  func
2848
3089
  count 4 6 4 4
@@ -2855,7 +3096,7 @@ class DataFrame():
2855
3096
 
2856
3097
  # Computes count, mean, std, min, percentiles, and max for numeric columns group by "datetime" and "Feb".
2857
3098
  >>> df1 = df.groupby(["datetime", "Feb"])
2858
- >>> df1.describe()
3099
+ >>> df1.describe(pivot=True)
2859
3100
  Jan Mar Apr
2860
3101
  datetime Feb func
2861
3102
  04/01/2017 90.0 25% 50 95 101
@@ -2883,22 +3124,6 @@ class DataFrame():
2883
3124
  min 200 215 250
2884
3125
  std None None 0
2885
3126
 
2886
- # Computes count, mean, std, min, percentiles, and max for numeric columns and
2887
- # computes count and unique for non-numeric columns
2888
- >>> df.describe(include="all")
2889
- accounts Feb Jan Mar Apr datetime
2890
- func
2891
- 25% None 117.5 125 128.75 160.25 None
2892
- 75% None 207.5 162.5 158.75 250 None
2893
- count 6 6 4 4 4 6
2894
- mean None 166.667 137.5 147.5 195.25 None
2895
- max None 210 200 215 250 None
2896
- min None 90 50 95 101 None
2897
- 50% None 200 150 140 215 None
2898
- std None 59.554 62.915 49.749 70.971 None
2899
- unique 6 None None None None 1
2900
-
2901
- #
2902
3127
  # Examples for describe() function as Time Series Aggregate.
2903
3128
  #
2904
3129
  >>> # Load the example datasets.
@@ -3081,7 +3306,7 @@ class DataFrame():
3081
3306
  >>>
3082
3307
  """
3083
3308
 
3084
- # Argument validations
3309
+ # -------------Argument validations---------------#
3085
3310
  awu_matrix = []
3086
3311
  awu_matrix.append(["columns", columns, True, (str, list), True])
3087
3312
  awu_matrix.append(["percentiles", percentiles, True, (float, list)])
@@ -3090,6 +3315,7 @@ class DataFrame():
3090
3315
  awu_matrix.append(["distinct", distinct, True, (bool)])
3091
3316
  awu_matrix.append(["statistics", statistics, True, (str, list), True,
3092
3317
  ["count", "mean", "min", "max", "unique", "std", "describe", "percentile"]])
3318
+ awu_matrix.append(["pivot", pivot, True, (bool)])
3093
3319
 
3094
3320
  # Validate argument types
3095
3321
  _Validators._validate_function_arguments(awu_matrix)
@@ -3133,22 +3359,27 @@ class DataFrame():
3133
3359
  if verbose and not isinstance(self, DataFrameGroupByTime):
3134
3360
  raise ValueError(Messages.get_message(MessageCodes.ARG_VALUE_CLASS_DEPENDENCY).format(
3135
3361
  'verbose', 'Aggregation', 'True', 'describe()', 'DataFrameGroupByTime'))
3362
+ # -------------End of argument validations---------------#
3136
3363
 
3137
3364
  function_label = "func"
3365
+ sort_cols = []
3138
3366
  try:
3139
3367
  self.__execute_node_and_set_table_name(self._nodeid)
3140
3368
 
3141
3369
  groupby_column_list = None
3142
- if isinstance(self, DataFrameGroupBy):
3370
+ if isinstance(self, DataFrameGroupByTime) or isinstance(self, DataFrameGroupBy):
3143
3371
  groupby_column_list = self.groupby_column_list
3144
- df_utils._invalid_describe_column(df=self, columns=columns, metaexpr=self._metaexpr,
3145
- groupby_column_list=groupby_column_list)
3372
+ if columns:
3373
+ df_utils._validate_describe_columns(columns=columns, metaexpr=self._metaexpr,
3374
+ groupby_column_list=groupby_column_list)
3375
+ sort_cols = list(groupby_column_list)
3146
3376
 
3147
- if isinstance(self, DataFrameGroupByTime):
3148
- groupby_column_list = self.groupby_column_list
3149
- df_utils._invalid_describe_column(df=self, columns=columns, metaexpr=self._metaexpr,
3150
- groupby_column_list=groupby_column_list)
3377
+ # 'func' column will always be there in the result.
3378
+ sort_cols.append(function_label)
3151
3379
 
3380
+ # Handle DataFrameGroupByTime using union all approach and
3381
+ # other DataFrames using TD_UnivariateStatistics approach.
3382
+ if isinstance(self, DataFrameGroupByTime):
3152
3383
  # Construct the aggregate query.
3153
3384
  agg_query = df_utils._construct_describe_query(df=self, columns=columns, metaexpr=self._metaexpr,
3154
3385
  percentiles=percentiles, function_label=function_label,
@@ -3160,29 +3391,99 @@ class DataFrame():
3160
3391
  timecode_column=self._timecode_column,
3161
3392
  sequence_column=self._sequence_column,
3162
3393
  fill=self._fill)
3394
+
3395
+ if groupby_column_list is not None:
3396
+ df = DataFrame.from_query(agg_query, index_label=sort_cols)
3397
+ df2 = df.sort(sort_cols)
3398
+ df2._metaexpr._n_rows = 100
3399
+ describe_df = df2
3400
+ else:
3401
+ describe_df = DataFrame.from_query(agg_query, index_label=function_label)
3402
+
3403
+ # Check if numeric overflow can occur for result DataFrame.
3404
+ if self._check_numeric_overflow(describe_df):
3405
+ result_df = self._promote_dataframe_types()
3406
+ describe_df = result_df.describe(pivot=True)
3407
+ return describe_df
3408
+
3163
3409
  else:
3164
- # Construct the aggregate query.
3165
- agg_query = df_utils._construct_describe_query(df=self, columns=columns, metaexpr=self._metaexpr,
3166
- percentiles=percentiles, function_label=function_label,
3167
- groupby_column_list=groupby_column_list, include=include,
3168
- is_time_series_aggregate=False, verbose=verbose,
3169
- distinct=distinct, statistics=statistics)
3170
-
3171
- if groupby_column_list is not None:
3172
- sort_cols = [i for i in groupby_column_list]
3173
- sort_cols.append(function_label)
3174
- df = DataFrame.from_query(agg_query, index_label=sort_cols)
3175
- df2 = df.sort(sort_cols)
3176
- df2._metaexpr._n_rows = 100
3177
- describe_df = df2
3178
- else:
3179
- describe_df = DataFrame.from_query(agg_query, index_label=function_label)
3410
+ # If pivot is True, then construct the aggregate query and return the result DataFrame.
3411
+ # Otherwise, return the result DataFrame in the regular aggregate mode using UnivariateStatistics.
3412
+
3413
+ if pivot:
3414
+ # Construct the aggregate query.
3415
+ agg_query = df_utils._construct_describe_query(df=self, columns=columns, metaexpr=self._metaexpr,
3416
+ percentiles=percentiles, function_label=function_label,
3417
+ groupby_column_list=groupby_column_list, include=include,
3418
+ is_time_series_aggregate=False, verbose=verbose,
3419
+ distinct=distinct, statistics=statistics)
3420
+
3421
+ if groupby_column_list is not None:
3422
+ sort_cols = [i for i in groupby_column_list]
3423
+ sort_cols.append(function_label)
3424
+ df = DataFrame.from_query(agg_query, index_label=sort_cols)
3425
+ df2 = df.sort(sort_cols)
3426
+ df2._metaexpr._n_rows = 100
3427
+ describe_df = df2
3428
+ else:
3429
+ describe_df = DataFrame.from_query(agg_query, index_label=function_label)
3430
+
3431
+ # Check if numeric overflow can occur for result DataFrame.
3432
+ if self._check_numeric_overflow(describe_df):
3433
+ result_df = self._promote_dataframe_types()
3434
+ describe_df = result_df.describe(pivot=True)
3435
+
3436
+ return describe_df
3437
+
3438
+ # If columns is None, then all dataframe columns are considered.
3439
+ if columns is None:
3440
+ columns = self.columns
3441
+ # Exclude groupby columns
3442
+ if groupby_column_list is not None:
3443
+ columns = [col for col in columns if col not in groupby_column_list]
3444
+
3445
+ numeric_cols = []
3446
+
3447
+ # Extract numeric columns and their types of all columns
3448
+ for col in self._metaexpr.c:
3449
+ if type(col.type) in UtilFuncs()._get_numeric_datatypes() and \
3450
+ col.name in columns:
3451
+ numeric_cols.append(col.name)
3452
+
3453
+ if numeric_cols:
3454
+ # Default statistics for 'Regular Aggregate Mode'
3455
+ sql_stat = ["COUNT", "MAXIMUM", "MEAN", "MINIMUM", "PERCENTILES", "STANDARD DEVIATION"]
3456
+
3457
+ if statistics is not None:
3458
+ py_to_sql_func_map = {"count": "COUNT",
3459
+ "max": "MAXIMUM",
3460
+ "mean": "MEAN",
3461
+ "unique": 'UNIQUE ENTITY COUNT',
3462
+ "min": "MINIMUM",
3463
+ "percentile": "PERCENTILES",
3464
+ "std": "STANDARD DEVIATION"}
3465
+ # Convert statistics into corresponding SQL function names
3466
+ sql_stat = [py_to_sql_func_map[stat] for stat in UtilFuncs()._as_list(statistics)]
3467
+
3468
+ # Convert percentiles to centiles for univariate statistics
3469
+ centiles = list(map(lambda n: int(n * 100), percentiles))
3470
+
3471
+ # UnivariateStatistics parameters
3472
+ univar_param = {
3473
+ "newdata": self.select(self.columns),
3474
+ "target_columns": numeric_cols,
3475
+ "partition_columns": groupby_column_list,
3476
+ "centiles": centiles,
3477
+ "stats": sql_stat
3478
+ }
3479
+
3480
+ from teradataml import UnivariateStatistics
3481
+ # Run UnivariateStatistics
3482
+ aggr_df = UnivariateStatistics(**univar_param).result
3483
+
3484
+ # Return the result in teradataml format
3485
+ return aggr_df
3180
3486
 
3181
- # Check if numeric overflow can occur for result DataFrame.
3182
- if self._check_numeric_overflow(describe_df):
3183
- result_df = self._promote_dataframe_types()
3184
- describe_df = result_df.describe()
3185
- return describe_df
3186
3487
  except TeradataMlException:
3187
3488
  raise
3188
3489
  except Exception as err:
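In the new default (non-pivot) mode, describe() hands numeric columns to TD_UnivariateStatistics: statistic names are mapped to SQL stat names ("mean" -> MEAN, "std" -> STANDARD DEVIATION, ...) and percentiles are converted to centiles. A sketch of the call shapes; output in this mode follows the ATTRIBUTE/StatName/StatValue layout shown in the docstring above:

>>> # percentiles [.3, .6] become centiles [30, 60] for TD_UnivariateStatistics.
>>> df.describe(statistics=["mean", "std"], percentiles=[.3, .6])
>>> # The pre-20.0.0.4 pivoted layout remains available:
>>> df.describe(pivot=True)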
@@ -5765,7 +6066,35 @@ class DataFrame():
5765
6066
 
5766
6067
  def _repr_html_(self):
5767
6068
  """ Print method for teradataml for iPython rich display. """
6069
+ self._generate_output_html()
6070
+ if display.enable_ui:
6071
+ # EDA Ui widget representation using teradatamlwidgets
6072
+ if self._eda_ui is None:
6073
+ from teradatamlwidgets.eda.Ui import Ui
6074
+ self._eda_ui = Ui(df=self, html=self.html)
6075
+ else:
6076
+ self._eda_ui.display_ui()
6077
+ return self.html
6078
+
6079
+ def get_eda_ui(self):
6080
+ """
6081
+ Returns the EDA representation UI.
6082
+
6083
+ PARAMETERS:
6084
+ None.
5768
6085
 
6086
+ EXCEPTIONS:
6087
+ None.
6088
+
6089
+ RETURNS:
6090
+ teradatamlwidgets.eda.Ui
6091
+
6092
+ EXAMPLE:
6093
+ eda_ui = df.get_eda_ui()
6094
+ """
6095
+ return self._eda_ui
6096
+
6097
+ def _generate_output_html(self, disable_types=True):
5769
6098
  # Check if class attributes __data and __data_columns are not None.
5770
6099
  # If not None, reuse the data and columns.
5771
6100
  # If None, generate latest results.
@@ -5778,17 +6107,25 @@ class DataFrame():
5778
6107
  dindent = indent + indent
5779
6108
 
5780
6109
  header_html = ['<style type="text/css">',
5781
- 'table {border:ridge 5px;}',
6110
+ 'table { border:ridge 5px}',
5782
6111
  'table td {border:inset 1px;}',
5783
- 'table tr#HeaderRow {background-color:grey; color:white;}'
6112
+ 'table tr#HeaderRow {background-color:grey; color:white;}',
5784
6113
  '</style>\n'
5785
6114
  ]
5786
6115
  html = "\n{0}".format(indent).join(header_html)
5787
- html += '<html><table>\n{0}<tr id="HeaderRow">\n'.format(indent)
6116
+ html += '<html><table style="min-width:1000px;">\n{0}<tr id="HeaderRow">\n'.format(indent)
6117
+
6118
+ columns_html = "</th><th>".join(self.__data_columns)
6119
+ html += "<th>{0}</th>\n".format(columns_html)
6120
+ html += "</tr>\n"
5788
6121
 
5789
- columns_html = "</th>\n{0}<th>".format(dindent).join(self.__data_columns)
5790
- html += "{0}<th>{1}</th>\n".format(dindent, columns_html)
5791
- html += "{0}</tr>\n".format(indent)
6122
+ if not disable_types:
6123
+ html += '<tr>\n'.format(indent)
6124
+ col_types = [repr(self._td_column_names_and_sqlalchemy_types[column]) for column in
6125
+ self.__data_columns]
6126
+ columns_types_html = "</td>\n{0}<td>".format(dindent).join(col_types)
6127
+ html += "{0}<td>{1}</td>\n".format(dindent, columns_types_html)
6128
+ html += "{0}</tr>\n".format(indent)
5792
6129
 
5793
6130
  for row in self.__data:
5794
6131
  row_html = ["{0}<td>{1}</td>\n".format(dindent,
@@ -5796,8 +6133,31 @@ class DataFrame():
5796
6133
  html += "{1}<tr>\n{0}{1}</tr>\n".format("".join(row_html), indent)
5797
6134
 
5798
6135
  html += "</table></html>"
6136
+ self.html = html
6137
+
6138
+ def get_output(self, output_index=0):
6139
+ """
6140
+ DESCRIPTION:
6141
+ Returns the result of an analytic function when the analytic function is
6142
+ run from the 'Analyze' tab in the EDA UI.
6143
+ Note:
6144
+ * The function does not return anything if the analytic function is
6145
+ not run from the EDA UI.
6146
+
6147
+ PARAMETERS:
6148
+ output_index:
6149
+ Optional Argument.
6150
+ Specifies the index of the output dataframe to be returned.
6151
+ Default Value: 0
6152
+ Types: int
6153
+
6154
+ RAISES:
6155
+ IndexError
5799
6156
 
5800
- return html
6157
+ RETURNS:
6158
+ teradataml DataFrame object.
6159
+ """
6160
+ return self._eda_ui.get_output_dataframe(output_index=output_index)
5801
6161
 
5802
6162
  def __get_data_columns(self):
5803
6163
  """
@@ -6857,7 +7217,8 @@ class DataFrame():
6857
7217
  compiled_condition = condition.compile(compile_kwargs={'include_table': True,
6858
7218
  'literal_binds': True,
6859
7219
  'table_name_kind': '_join_alias',
6860
- 'compile_with_caller_table': True})
7220
+ 'compile_with_caller_table': True,
7221
+ 'table_only': True})
6861
7222
 
6862
7223
  all_join_conditions.append(compiled_condition)
6863
7224
 
@@ -7571,14 +7932,14 @@ class DataFrame():
7571
7932
  _Validators._check_auth_token("udf")
7572
7933
  for colname, col in udf_expr.items():
7573
7934
  env_name = UtilFuncs._get_env_name(col)
7574
- # Store the env_name and its corresponding output column
7935
+ # Store the env_name and its corresponding output column
7575
7936
  if env_name in env_mapper:
7576
7937
  env_mapper[env_name].append(colname)
7577
7938
  else:
7578
7939
  env_mapper[env_name] = [colname]
7579
7940
  else:
7580
7941
  env_mapper[env_name] = udf_expr.keys()
7581
-
7942
+
7582
7943
  for env_name, cols in env_mapper.items():
7583
7944
  # Create a dictionary of output columns to column type.
7584
7945
  returns = OrderedDict([(column.name, column.type) for column in df._metaexpr.c])
@@ -7625,11 +7986,11 @@ class DataFrame():
7625
7986
 
7626
7987
  df = tbl_operators.execute()
7627
7988
  return df
7628
-
7989
+
7629
7990
  def _assign_call_udf(self, call_udf_expr):
7630
7991
  """
7631
7992
  DESCRIPTION:
7632
- Internal function for DataFrame.assign() to execute the call_udf using
7993
+ Internal function for DataFrame.assign() to execute the call_udf using
7633
7994
  Script/Apply Table Operator and create new column for teradataml DataFrame.
7634
7995
 
7635
7996
  PARAMETER:
@@ -7656,7 +8017,7 @@ class DataFrame():
7656
8017
  # Create a dictionary of output columns to column type (python types).
7657
8018
  output_type_converters = {col_name: _Dtypes._teradata_type_to_python_type(col_type) \
7658
8019
  for col_name, col_type in returns.items()}
7659
-
8020
+
7660
8021
  for colname, col in call_udf_expr.items():
7661
8022
  returns[colname] = col.type
7662
8023
  output_type_converters[colname] = _Dtypes._teradata_type_to_python_type(col.type)
@@ -7782,7 +8143,7 @@ class DataFrame():
7782
8143
  Look at Example 18 to understand more.
7783
8144
  8. While passing multiple udf expressions, one can not pass one column output
7784
8145
  as another column input in the same ``assign`` call.
7785
- 9. If user pass multiple udf expressions, delimiter and quotechar specified in
8146
+ 9. If user passes multiple udf expressions, delimiter and quotechar specified in
7786
8147
  last udf expression are considered for processing.
7787
8148
 
7788
8149
  RAISES:
@@ -8147,13 +8508,13 @@ class DataFrame():
8147
8508
  Red Inc 200.0 150.0 140.0 NaN 17/01/04 201.0 abc RED INC 207
8148
8509
  >>>
8149
8510
 
8150
- # Example 19: Convert the values is 'accounts' column to upper case using a user
8511
+ # Example 19: Convert the values in 'accounts' column to upper case using a user
8151
8512
  # defined function on Vantage Cloud Lake.
8152
8513
  # Create a Python 3.10.5 environment with given name and description in Vantage.
8153
8514
  >>> env = create_env('test_udf', 'python_3.10.5', 'Test environment for UDF')
8154
8515
  User environment 'test_udf' created.
8155
8516
  >>>
8156
- # Create a user defined functions to 'to_upper' to get the values in upper case
8517
+ # Create a user defined function 'to_upper' to get the values in upper case
8157
8518
  # and pass the user env to run it on.
8158
8519
  >>> from teradataml.dataframe.functions import udf
8159
8520
  >>> @udf(env_name = env)
@@ -8165,7 +8526,7 @@ class DataFrame():
8165
8526
  # to the DataFrame.
8166
8527
  >>> df.assign(upper_stats = to_upper('accounts'))
8167
8528
  Feb Jan Mar Apr datetime upper_stats
8168
- accounts
8529
+ accounts
8169
8530
  Alpha Co 210.0 200.0 215.0 250.0 17/01/04 ALPHA CO
8170
8531
  Blue Inc 90.0 50.0 95.0 101.0 17/01/04 BLUE INC
8171
8532
  Yellow Inc 90.0 NaN NaN NaN 17/01/04 YELLOW INC
@@ -8184,12 +8545,12 @@ class DataFrame():
8184
8545
  # Register the created user defined function with name "upper".
8185
8546
  >>> register("upper", to_upper)
8186
8547
  >>>
8187
- # Call the user defined function registered with name "upper" and assign the
8548
+ # Call the user defined function registered with name "upper" and assign the
8188
8549
  # ColumnExpression returned to the DataFrame.
8189
8550
  >>> res = df.assign(upper_col = call_udf("upper", ('accounts',)))
8190
8551
  >>> res
8191
8552
  Feb Jan Mar Apr datetime upper_col
8192
- accounts
8553
+ accounts
8193
8554
  Alpha Co 210.0 200.0 215.0 250.0 17/01/04 ALPHA CO
8194
8555
  Blue Inc 90.0 50.0 95.0 101.0 17/01/04 BLUE INC
8195
8556
  Yellow Inc 90.0 NaN NaN NaN 17/01/04 YELLOW INC
@@ -8475,7 +8836,9 @@ class DataFrame():
8475
8836
  _Validators._validate_column_exists_in_dataframe(keys, self._metaexpr)
8476
8837
 
8477
8838
  try:
8478
- new_index_list = self._index_label if self._index_label is not None else []
8839
+
8840
+ # Slicing creates a new list instance with the same contents.
8841
+ new_index_list = self._index_label[:] if self._index_label is not None else []
8479
8842
 
8480
8843
  # Creating a list with requested index labels bases on append
8481
8844
  if append:
@@ -8490,7 +8853,7 @@ class DataFrame():
8490
8853
  new_index_list = keys
8491
8854
 
8492
8855
  # Takes care of appending already existing index
8493
- new_index_list = list(set(new_index_list))
8856
+ new_index_list = list(dict.fromkeys(new_index_list))
8494
8857
 
8495
8858
  # In case requested index is same as existing index, return same DF
8496
8859
  if new_index_list == self._index_label:
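Worth noting: list(dict.fromkeys(...)) removes duplicates while preserving the caller's key order, which list(set(...)) did not guarantee. For example:

>>> list(set(["b", "a", "b"]))            # order not guaranteed, e.g. ['a', 'b']
>>> list(dict.fromkeys(["b", "a", "b"]))  # always ['b', 'a']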
@@ -9373,15 +9736,15 @@ class DataFrame():
9373
9736
  TypeError, ValueError, TeradataMLException
9374
9737
 
9375
9738
  EXAMPLES:
9376
- >>> # Load the example datasets.
9377
- ... load_example_data("dataframe", ["ocean_buoys", "ocean_buoys_nonpti"])
9739
+ # Load the example datasets.
9740
+ >>> load_example_data("dataframe", ["ocean_buoys", "ocean_buoys_nonpti"])
9378
9741
  >>>
9379
9742
 
9380
- >>> # Create the required DataFrames.
9381
- ... # DataFrame on non-sequenced PTI table
9382
- ... ocean_buoys = DataFrame("ocean_buoys")
9383
- >>> # Check DataFrame columns and let's peek at the data
9384
- ... ocean_buoys.columns
9743
+ # Create the required DataFrames.
9744
+ # DataFrame on non-sequenced PTI table
9745
+ >>> ocean_buoys = DataFrame("ocean_buoys")
9746
+ # Check DataFrame columns and let's peek at the data
9747
+ >>> ocean_buoys.columns
9385
9748
  ['buoyid', 'TD_TIMECODE', 'temperature', 'salinity']
9386
9749
  >>> ocean_buoys.head()
9387
9750
  TD_TIMECODE temperature salinity
@@ -9397,10 +9760,10 @@ class DataFrame():
9397
9760
  0 2014-01-06 08:00:00.000000 10.0 55
9398
9761
  0 2014-01-06 08:10:00.000000 10.0 55
9399
9762
 
9400
- >>> # DataFrame on NON-PTI table
9401
- ... ocean_buoys_nonpti = DataFrame("ocean_buoys_nonpti")
9402
- >>> # Check DataFrame columns and let's peek at the data
9403
- ... ocean_buoys_nonpti.columns
9763
+ # DataFrame on NON-PTI table
9764
+ >>> ocean_buoys_nonpti = DataFrame("ocean_buoys_nonpti")
9765
+ # Check DataFrame columns and let's peek at the data
9766
+ >>> ocean_buoys_nonpti.columns
9404
9767
  ['buoyid', 'timecode', 'temperature', 'salinity']
9405
9768
  >>> ocean_buoys_nonpti.head()
9406
9769
  buoyid temperature salinity
@@ -9974,6 +10337,15 @@ class DataFrame():
9974
10337
  # If user did not pass any arguments which form join conditions,
9975
10338
  # Merge is performed using index columns of TeradataML DataFrames
9976
10339
  if on is None and left_on is None and right_on is None and not use_index:
10340
+ # DataFrames created on OTF table will not have index.
10341
+ if self._datalake is not None or right._datalake is not None:
10342
+ msg_code = MessageCodes.EXECUTION_FAILED
10343
+ emsg = "Either 'on' argument or both 'left_on' and 'right_on' arguments" \
10344
+ " must be provided to merge DataFrames when they are created on" \
10345
+ " OTF table(s)."
10346
+ error_msg = Messages.get_message(msg_code, "merge dataframes", emsg)
10347
+ raise TeradataMlException(error_msg, msg_code)
10348
+
9977
10349
  if self._index_label is None or right._index_label is None:
9978
10350
  raise TeradataMlException(
9979
10351
  Messages.get_message(MessageCodes.TDMLDF_INDEXES_ARE_NONE), MessageCodes.TDMLDF_INDEXES_ARE_NONE)
@@ -9981,6 +10353,12 @@ class DataFrame():
9981
10353
  use_index = True
9982
10354
 
9983
10355
  if use_index:
10356
+ if self._datalake is not None or right._datalake is not None:
10357
+ msg_code = MessageCodes.EXECUTION_FAILED
10358
+ emsg = "Can not use Index to merge DataFrames when they are created on OTF table(s)."
10359
+ error_msg = Messages.get_message(msg_code, "merge dataframes", emsg)
10360
+ raise TeradataMlException(error_msg, msg_code)
10361
+
9984
10362
  if self._index_label is None or right._index_label is None:
9985
10363
  raise TeradataMlException(
9986
10364
  Messages.get_message(MessageCodes.TDMLDF_INDEXES_ARE_NONE), MessageCodes.TDMLDF_INDEXES_ARE_NONE)
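The practical consequence for OTF-backed DataFrames: merges must name the join keys explicitly, because such DataFrames carry no index. A sketch with hypothetical table and column names:

>>> lake_df = DataFrame(in_schema("datalake_db", "orders", "datalake"))   # hypothetical OTF table
>>> local_df = DataFrame("customers")                                     # hypothetical Vantage table
>>> merged = lake_df.merge(local_df, on="customer_id", how="inner")       # explicit join column: OK
>>> lake_df.merge(local_df)   # raises TeradataMlException: index-based merge on OTF table(s)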
@@ -10636,7 +11014,7 @@ class DataFrame():
10636
11014
  2. seed is supported for stratify column.
10637
11015
  3. Arguments "stratify_column", "seed", "id_column" are supported only
10638
11016
  for stratifying the data.
10639
- Types: str
11017
+ Types: str OR Feature
10640
11018
 
10641
11019
  seed:
10642
11020
  Optional Argument.
@@ -10662,7 +11040,7 @@ class DataFrame():
10662
11040
  for stratifying the data.
10663
11041
  2. "id_column" is supported only when "stratify_column" is used.
10664
11042
  Ignored otherwise.
10665
- Types: str
11043
+ Types: str OR Feature
10666
11044
 
10667
11045
  RETURNS:
10668
11046
  teradataml DataFrame
@@ -12696,8 +13074,8 @@ class DataFrame():
12696
13074
  _Validators._validate_column_exists_in_dataframe(column_names, self._metaexpr,
12697
13075
  False)
12698
13076
  column_names = list(dict.fromkeys(column_names))
12699
-
12700
- if list_td_reserved_keywords(column_names):
13077
+
13078
+ if list_td_reserved_keywords(column_names) or UtilFuncs._is_ascii(column_names):
12701
13079
  column_names = UtilFuncs._teradata_quote_arg(column_names, "\"", False)
12702
13080
 
12703
13081
  col_names_types = df_utils._get_required_columns_types_from_metaexpr(self._metaexpr, column_names)
@@ -14617,7 +14995,18 @@ class DataFrame():
14617
14995
  >>> plot.show()
14618
14996
 
14619
14997
  """
14620
- return _Plot(x=x, y=y, scale=scale, kind=kind, **kwargs)
14998
+
14999
+ _plot = _Plot(x=x, y=y, scale=scale, kind=kind, **kwargs)
15000
+ # If plot is already generated, return the same plot.
15001
+ if self._plot is None:
15002
+ self._plot = _plot
15003
+ return _plot
15004
+
15005
+ if self._plot == _plot:
15006
+ return self._plot
15007
+ else:
15008
+ self._plot = _plot
15009
+ return _plot
14621
15010
 
14622
15011
  @collect_queryband(queryband="DF_itertuples")
14623
15012
  def itertuples(self, name='Row', num_rows=None):
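plot() now keeps the last _Plot on the DataFrame and hands it back when an equal plot is requested again; otherwise the cached plot is replaced. A sketch (whether the cached object is reused depends on _Plot equality for the given arguments):

>>> p1 = df.plot(x=df.Jan, y=df.Feb, kind="line")
>>> p1.show()
>>> p2 = df.plot(x=df.Jan, y=df.Feb, kind="line")   # equal arguments: the cached plot is returned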
@@ -17510,11 +17899,18 @@ class _TDUAF(DataFrame):
17510
17899
  table_name = self._db_utils._execute_node_return_db_object_name(self._data._nodeid, self._data._metaexpr)
17511
17900
 
17512
17901
  # UAF Functions do not accept double quotes.
17902
+ tdp = preparer(td_dialect)
17513
17903
  db_name = UtilFuncs._extract_db_name(table_name)
17514
- if db_name:
17515
- table_name = '"{}"."{}"'.format(db_name, UtilFuncs._extract_table_name(table_name))
17904
+ datalake_name = UtilFuncs._extract_datalake_name(table_name)
17905
+ if datalake_name:
17906
+ table_name = '{}.{}.{}'.format(tdp.quote(datalake_name),
17907
+ tdp.quote(db_name),
17908
+ tdp.quote(UtilFuncs._extract_table_name(table_name)))
17909
+ elif db_name:
17910
+ table_name = '{}.{}'.format(tdp.quote(db_name),
17911
+ tdp.quote(UtilFuncs._extract_table_name(table_name)))
17516
17912
  else:
17517
- table_name = UtilFuncs._extract_table_name(table_name)
17913
+ table_name = tdp.quote(UtilFuncs._extract_table_name(table_name))
17518
17914
 
17519
17915
  sql_clauses.append("TABLE_NAME ({})")
17520
17916
  sql_values.append(table_name)