teradataml-20.0.0.3-py3-none-any.whl → teradataml-20.0.0.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of teradataml might be problematic.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/README.md +119 -0
- teradataml/_version.py +1 -1
- teradataml/analytics/analytic_function_executor.py +18 -6
- teradataml/analytics/byom/__init__.py +1 -1
- teradataml/analytics/sqle/__init__.py +4 -1
- teradataml/analytics/valib.py +18 -4
- teradataml/automl/__init__.py +51 -6
- teradataml/automl/data_preparation.py +56 -33
- teradataml/automl/data_transformation.py +58 -33
- teradataml/automl/feature_engineering.py +12 -5
- teradataml/automl/model_training.py +34 -13
- teradataml/common/__init__.py +1 -2
- teradataml/common/constants.py +64 -40
- teradataml/common/messagecodes.py +13 -3
- teradataml/common/messages.py +4 -1
- teradataml/common/sqlbundle.py +40 -10
- teradataml/common/utils.py +113 -39
- teradataml/common/warnings.py +11 -0
- teradataml/context/context.py +141 -17
- teradataml/data/amazon_reviews_25.csv +26 -0
- teradataml/data/byom_example.json +11 -0
- teradataml/data/docs/byom/docs/DataRobotPredict.py +2 -2
- teradataml/data/docs/byom/docs/DataikuPredict.py +40 -1
- teradataml/data/docs/byom/docs/H2OPredict.py +2 -2
- teradataml/data/docs/byom/docs/ONNXEmbeddings.py +242 -0
- teradataml/data/docs/byom/docs/ONNXPredict.py +2 -2
- teradataml/data/docs/byom/docs/PMMLPredict.py +2 -2
- teradataml/data/docs/sqle/docs_17_20/Shap.py +28 -6
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +4 -1
- teradataml/data/hnsw_alter_data.csv +5 -0
- teradataml/data/hnsw_data.csv +10 -0
- teradataml/data/jsons/byom/h2opredict.json +1 -1
- teradataml/data/jsons/byom/onnxembeddings.json +266 -0
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +0 -1
- teradataml/data/jsons/sqle/20.00/TD_HNSW.json +296 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWPredict.json +206 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWSummary.json +32 -0
- teradataml/data/jsons/sqle/20.00/TD_KMeans.json +2 -2
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +1 -1
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +5 -5
- teradataml/data/teradataml_example.json +8 -0
- teradataml/data/vectordistance_example.json +1 -1
- teradataml/dataframe/copy_to.py +8 -3
- teradataml/dataframe/data_transfer.py +11 -1
- teradataml/dataframe/dataframe.py +517 -121
- teradataml/dataframe/dataframe_utils.py +152 -20
- teradataml/dataframe/functions.py +26 -11
- teradataml/dataframe/setop.py +11 -6
- teradataml/dataframe/sql.py +2 -2
- teradataml/dbutils/dbutils.py +525 -129
- teradataml/hyperparameter_tuner/optimizer.py +12 -1
- teradataml/opensource/{sklearn/_sklearn_wrapper.py → _base.py} +317 -1011
- teradataml/opensource/_class.py +141 -17
- teradataml/opensource/{constants.py → _constants.py} +7 -3
- teradataml/opensource/_lightgbm.py +52 -53
- teradataml/opensource/_sklearn.py +1008 -0
- teradataml/opensource/_wrapper_utils.py +5 -5
- teradataml/options/__init__.py +47 -15
- teradataml/options/configure.py +103 -25
- teradataml/options/display.py +13 -2
- teradataml/plot/axis.py +47 -8
- teradataml/plot/figure.py +33 -0
- teradataml/plot/plot.py +63 -13
- teradataml/scriptmgmt/UserEnv.py +2 -2
- teradataml/scriptmgmt/lls_utils.py +63 -26
- teradataml/store/__init__.py +1 -2
- teradataml/store/feature_store/feature_store.py +102 -7
- teradataml/table_operators/Apply.py +32 -18
- teradataml/table_operators/Script.py +3 -1
- teradataml/table_operators/TableOperator.py +3 -1
- teradataml/utils/dtypes.py +47 -0
- teradataml/utils/internal_buffer.py +18 -0
- teradataml/utils/validators.py +68 -9
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.4.dist-info}/METADATA +123 -2
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.4.dist-info}/RECORD +79 -75
- teradataml/data/SQL_Fundamentals.pdf +0 -0
- teradataml/libaed_0_1.dylib +0 -0
- teradataml/libaed_0_1.so +0 -0
- teradataml/opensource/sklearn/__init__.py +0 -0
- teradataml/store/vector_store/__init__.py +0 -1586
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.4.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.4.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.4.dist-info}/zip-safe +0 -0
teradataml/dataframe/dataframe.py (+517 -121):

@@ -20,6 +20,9 @@ import re
 import sqlalchemy
 import sys
 import urllib.parse
+
+from sqlalchemy import Column
+
 import teradataml.context.context as tdmlctx
 
 from collections import OrderedDict, namedtuple
@@ -31,6 +34,7 @@ from teradataml.dataframe.sql_interfaces import ColumnExpression
 from teradataml.dataframe.sql_functions import case
 from teradataml.series.series import Series
 from teradatasqlalchemy.types import _TDType, BIGINT, INTEGER, PERIOD_TIMESTAMP, SMALLINT, BYTEINT, FLOAT, DECIMAL
+from teradataml.common.deprecations import argument_deprecation
 from teradataml.common.utils import UtilFuncs
 from teradataml.common.exceptions import TeradataMlException
 from teradataml.common.messages import Messages
@@ -42,6 +46,7 @@ from teradataml.dataframe.dataframe_utils import DataFrameUtils as df_utils, Dat
 from teradataml.dataframe.indexer import _LocationIndexer
 from teradataml.common.aed_utils import AedUtils
 from teradataml.options.display import display
+from teradataml.options.configure import configure
 from teradataml.dataframe.copy_to import copy_to_sql
 from teradataml.dataframe.row import _Row
 from teradataml.dataframe.setop import concat
@@ -63,7 +68,79 @@ from teradataml.common.constants import OutputStyle
 
 # TODO use logger when available on master branch
 # logger = teradatapylog.getLogger()
-
+
+class in_schema:
+    """
+    Class takes a schema name, a table name and datalake name attributes
+    and creates an object that can be passed to DataFrame.
+    Note:
+        teradataml recommends to use this class to access table(s)/view(s),
+        from the database other than the default database.
+    """
+    def __init__(self, schema_name, table_name, datalake_name=None):
+        """
+        Constructor for in_schema class.
+
+        PARAMETERS:
+            schema_name:
+                Required Argument.
+                Specifies the schema where the table resides in.
+                Types: str
+
+            table_name:
+                Required Argument.
+                Specifies the table name or view name in Vantage.
+                Types: str
+
+            datalake_name:
+                Optional Argument.
+                Specifies the datalake name.
+                Types: str
+
+        EXAMPLES:
+            from teradataml.dataframe.dataframe import in_schema, DataFrame
+
+            # Example 1: The following example creates a DataFrame from the
+            #            existing Vantage table "dbcinfo" in the non-default
+            #            database "dbc" using the in_schema instance.
+            df = DataFrame(in_schema("dbc", "dbcinfo"))
+
+            # Example 2: The following example uses from_table() function, existing
+            #            Vantage table "dbcinfo" and non-default database "dbc" to
+            #            create a teradataml DataFrame.
+            df = DataFrame.from_table(in_schema("dbc","dbcinfo"))
+
+            # Example 3: The following example uses "in_schema" object created
+            #            with "datalake_name" argument to create DataFrame on OTF table.
+            otf_df = DataFrame(in_schema("datalake_db","datalake_table","datalake"))
+
+        """
+        self.schema_name = schema_name
+        self.table_name = table_name
+        self.datalake_name = datalake_name
+
+        awu_matrix = []
+        awu_matrix.append(["schema_name", schema_name, False, (str), True])
+        awu_matrix.append(["table_name", table_name, False, (str), True])
+        awu_matrix.append(["datalake_name", datalake_name, True, (str), True])
+
+        # Validate argument types
+        _Validators._validate_function_arguments(awu_matrix)
+
+    def __str__(self):
+        """
+        Returns the string representation of in_schema instance.
+        """
+        tbl_name = '{}.{}'.format(UtilFuncs._teradata_quote_arg(self.schema_name, "\"", False),
+                                  UtilFuncs._teradata_quote_arg(self.table_name, "\"", False))
+
+        if not self.datalake_name:
+            return tbl_name
+
+        return '{}.{}'.format(UtilFuncs._teradata_quote_arg(self.datalake_name, "\"", False), tbl_name)
+
+
+in_schema = in_schema
 
 
 class DataFrame():
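The new in_schema class above is the entry point for the datalake/OTF support that runs through the rest of this diff. A usage sketch, assuming an active Vantage connection and that the referenced objects exist (the credentials below are placeholders):

from teradataml import create_context, DataFrame
from teradataml.dataframe.dataframe import in_schema

# Placeholder credentials; any connected context works.
create_context(host="<host>", username="<user>", password="<pwd>")

# Two-part name: database.table.
df = DataFrame(in_schema("dbc", "dbcinfo"))

# Three-part OTF name: datalake.database.table.
otf_df = DataFrame(in_schema("datalake_db", "datalake_table", "datalake"))

# __str__ renders the quoted, dot-separated name used in SQL.
print(in_schema("datalake_db", "datalake_table", "datalake"))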
@@ -166,6 +243,19 @@ class DataFrame():
         # Property to determine if table is an ART table or not.
         self._is_art = None
 
+        self._datalake = None
+        self._database = None
+        self._table = None
+        self._otf = False
+
+        if isinstance(table_name, in_schema):
+            self._table = table_name.table_name
+            self._datalake = table_name.datalake_name
+            self._database = table_name.schema_name
+            self._otf = True if self._datalake else False
+
+        table_name = str(table_name) if isinstance(table_name, in_schema) else table_name
+
         # Below matrix is list of list, where in each row contains following elements:
         # Let's take an example of following, just to get an idea:
         # [element1, element2, element3, element4, element5, element6]
@@ -198,25 +288,45 @@ class DataFrame():
             self._source_type = SourceType.TABLE.value
             self._nodeid = self._aed_utils._aed_table(self._table_name)
         elif query is not None:
+            query = query.strip()
+            query = query[:-1] if query[-1] == ";" else query
+
             self._query = query
             self._source_type = SourceType.QUERY.value
 
-            ... (2 removed lines not rendered in this diff view)
+            temp_obj_params = {
+                "prefix": "_frmqry_v",
+                "use_default_database": True,
+                "quote": False
+            }
+            __execute = UtilFuncs._create_view
+
+            if configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE:
+                # If user requests to materialize the query, then we should create a
+                # volatile table if user intends to the same instead of view.
+                # Volatile table does not need to be added to the GC.
+                temp_obj_params["table_type"] = TeradataConstants.TERADATA_VOLATILE_TABLE
+                temp_obj_params["gc_on_quit"] = False
+                temp_obj_params["prefix"] = "_frmqry_vt"
+                __execute = UtilFuncs._create_table
+
+            elif materialize:
+                # If user requests to materialize the query, then we should create a
                 # table instead of view and add the same in the GarbageCollector.
-            ... (4 removed lines not rendered in this diff view)
-            temp_table_name = UtilFuncs._generate_temp_table_name(prefix="_frmqry_v", use_default_database=True,
-                                                                  quote=False)
+                temp_obj_params["table_type"] = TeradataConstants.TERADATA_TABLE
+                temp_obj_params["gc_on_quit"] = True
+                temp_obj_params["prefix"] = "_frmqry_t"
+                __execute = UtilFuncs._create_table
 
+            temp_table_name = UtilFuncs._generate_temp_table_name(**temp_obj_params)
             self._table_name = temp_table_name
+            __execute_params = (self._table_name, self._query)
+
+            if configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE:
+                __execute_params = (self._table_name, self._query, True)
+
             try:
-                if materialize:
-                    UtilFuncs._create_table(self._table_name, self._query)
-                else:
-                    UtilFuncs._create_view(self._table_name, self._query)
+                __execute(*__execute_params)
             except OperationalError as oe:
                 if "[Error 3707] Syntax error" in str(oe):
                     raise ValueError(Messages.get_message(
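From this version, DataFrame.from_query() picks its backing object from three options: a view (the default), a GC-registered table when materialize=True, or a volatile table when configure.temp_object_type asks for one. A sketch of driving that switch; the user-facing option value "VT" is an assumption, since the hunk only shows the internal TeradataConstants comparison:

from teradataml import DataFrame
from teradataml.options.configure import configure

configure.temp_object_type = "VT"  # assumption: user-facing value for volatile tables

# The backing object becomes a volatile table (prefix "_frmqry_vt") that is
# dropped with the session, so it is not registered with the garbage collector.
df = DataFrame.from_query("SELECT 1 AS one;")  # the trailing ';' is now stripped first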
@@ -245,6 +355,9 @@ class DataFrame():
             self.__data = None
             self.__data_columns = None
             self._alias = None
+            self._plot = None
+
+            self._eda_ui = None
 
         except TeradataMlException:
             raise
@@ -334,7 +447,9 @@ class DataFrame():
         _Validators._validate_function_arguments(arg_info_matrix)
         try:
             alias_df = self._from_node(self._nodeid, self._metaexpr, self._index_label,
-                                       reuse_metaexpr=False)
+                                       reuse_metaexpr=False, _datalake=self._datalake,
+                                       _database=self._database, _table=self._table,
+                                       _otf=self._otf)
             # Assigning self attributes to newly created alias dataframe.
             alias_df._table_name = self._table_name
             alias_df._index = self._index
@@ -350,7 +465,8 @@ class DataFrame():
 
     @classmethod
     @collect_queryband(queryband="DF_fromTable")
-    def from_table(cls, table_name, index=True, index_label=None):
+    def from_table(cls, table_name, index=True, index_label=None,
+                   schema_name=None, datalake_name=None):
         """
         Class method for creating a DataFrame from a table or a view.
 
@@ -371,30 +487,48 @@ class DataFrame():
                 Column/s used for sorting.
                 Types: str
 
+            schema_name:
+                Optional Argument.
+                Specifies the schema where the table resides.
+                Types: str
+
+            datalake_name:
+                Optional Argument.
+                Specifies the datalake name.
+                Types: str
+
         EXAMPLES:
-            from teradataml.dataframe.dataframe import DataFrame
+            >>> from teradataml.dataframe.dataframe import DataFrame
 
             # Example 1: The following example creates a DataFrame from a table or
             #            a view.
             # Load the example data.
-            load_example_data("dataframe","sales")
+            >>> load_example_data("dataframe","sales")
 
             # Create DataFrame from table
-            df = DataFrame.from_table('sales')
+            >>> df = DataFrame.from_table('sales')
 
             # Create DataFrame from table and without index column sorting.
-            df = DataFrame.from_table("sales", False)
+            >>> df = DataFrame.from_table("sales", False)
 
             # Create DataFrame from table and sorting using the 'accounts'
             # column.
-            df = DataFrame.from_table("sales", True, "accounts")
+            >>> df = DataFrame.from_table("sales", True, "accounts")
 
             # Example 2: The following example creates a DataFrame from existing Vantage
             #            table "dbcinfo" in the non-default database "dbc" using the
             #            in_schema() function.
 
-            from teradataml.dataframe.dataframe import in_schema
-            df = DataFrame.from_table(in_schema("dbc", "dbcinfo"))
+            >>> from teradataml.dataframe.dataframe import in_schema
+            >>> df = DataFrame.from_table(in_schema("dbc", "dbcinfo"))
+
+            # Example 3: Create a DataFrame on existing DataLake
+            #            table "lake_table" in the "datalake_database" database
+            #            in "datalake" datalake.
+
+            >>> datalake_df = DataFrame.from_table(table_name="lake_table",
+            ...                                    schema_name="datalake_database",
+            ...                                    datalake_name="datalake" )
 
         RETURNS:
             DataFrame
@@ -403,6 +537,9 @@ class DataFrame():
             TeradataMlException - TDMLDF_CREATE_FAIL
 
         """
+        if schema_name:
+            return cls(in_schema(schema_name, table_name, datalake_name))
+
         return cls(table_name, index, index_label)
 
     @classmethod
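The body change shows the new keywords are sugar for in_schema: when schema_name is given, from_table() returns early with an in_schema-wrapped name, so the two spellings below are equivalent. Note that the early return does not forward index and index_label.

from teradataml import DataFrame
from teradataml.dataframe.dataframe import in_schema

df_a = DataFrame.from_table(in_schema("datalake_database", "lake_table", "datalake"))
df_b = DataFrame.from_table(table_name="lake_table",
                            schema_name="datalake_database",
                            datalake_name="datalake")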
@@ -462,7 +599,7 @@ class DataFrame():
         return cls(index=index, index_label=index_label, query=query, materialize=materialize)
 
     @classmethod
-    def _from_node(cls, nodeid, metaexpr, index_label=None, undropped_index=None, reuse_metaexpr=True):
+    def _from_node(cls, nodeid, metaexpr, index_label=None, undropped_index=None, reuse_metaexpr=True, **kwargs):
         """
         Private class method for creating a DataFrame from a nodeid and parent metadata.
 
@@ -543,6 +680,11 @@ class DataFrame():
                in [col.name for col in df._metaexpr.c] for elem in undropped_index):
             df._undropped_index = undropped_index
 
+        # Populate remaining attributes.
+        for arg in kwargs:
+            # Pop each argument from kwargs and assign to new DataFrame.
+            arg_value = kwargs.get(arg)
+            df.__setattr__(arg, arg_value)
         return df
 
     def create_temp_view(self, name):
@@ -670,9 +812,10 @@ class DataFrame():
         return self
 
     @collect_queryband(queryband="DF_fillna")
-    def fillna(self, value=None, columns=None, literal_value=False):
+    def fillna(self, value=None, columns=None, literal_value=False, partition_column=None):
         """
-        Method to replace the null values in a column with the value specified.
+        DESCRIPTION:
+            Method to replace the null values in a column with the value specified.
 
         PARAMETERS:
             value:
@@ -705,6 +848,12 @@ class DataFrame():
                 Default Value: False
                 Types: bool
 
+            partition_column:
+                Optional Argument.
+                Specifies the column name to partition the data.
+                Default Value: None
+                Types: str
+
         RETURNS:
             teradataml DataFrame
 
@@ -745,6 +894,26 @@ class DataFrame():
             3    Blue Inc   90.0   50   95.0  101.0  17/01/04
             4    Alpha Co  210.0  200  215.0  250.0  17/01/04
             5  Orange Inc  210.0   50    NaN  250.0  17/01/04
+
+            # Example 3: Populate the null value in 'pclass' and
+            #            'fare' column with mean value with partition
+            #            column as 'sex'.
+            # Load the example data.
+            >>> load_example_data("teradataml", ["titanic"])
+            >>> df = DataFrame.from_table("titanic")
+
+            >>> df.fillna(value="mean", columns=["pclass", "fare"], partition_column="sex")
+               passenger  survived  pclass                                         name     sex   age  sibsp  parch            ticket      fare cabin embarked
+            0        284         1       3                   Dorking, Mr. Edward Arthur    male  19.0      0      0        A/5. 10482    8.0500  None        S
+            1        589         0       3                        Gilinski, Mr. Eliezer    male  22.0      0      0             14973    8.0500  None        S
+            2         17         0       3                         Rice, Master. Eugene    male   2.0      4      1            382652   29.1250  None        Q
+            3        282         0       3             Olsson, Mr. Nils Johan Goransson    male  28.0      0      0            347464    7.8542  None        S
+            4        608         1       1                  Daniel, Mr. Robert Williams    male  27.0      0      0            113804   30.5000  None        S
+            5        404         0       3               Hakkarainen, Mr. Pekka Pietari    male  28.0      1      0  STON/O2. 3101279   15.8500  None        S
+            6        427         1       2  Clarke, Mrs. Charles V (Ada Maria Winfield)  female  28.0      1      0              2003   26.0000  None        S
+            7        141         0       3                Boulos, Mrs. Joseph (Sultana)  female   NaN      0      2              2678   15.2458  None        C
+            8        610         1       1                    Shutes, Miss. Elizabeth W  female  40.0      0      0          PC 17582  153.4625  C125        S
+            9        875         1       2        Abelson, Mrs. Samuel (Hannah Wizosky)  female  28.0      1      0         P/PP 3381   24.0000  None        C
         """
         from teradataml import SimpleImputeFit, SimpleImputeTransform
 
@@ -752,6 +921,7 @@ class DataFrame():
         arg_info_matrix.append(["value", value, True, (int, float, str, dict, list)])
         arg_info_matrix.append(["columns", columns, True, (list, str, tuple)])
         arg_info_matrix.append(["literal_value", literal_value, True, (bool)])
+        arg_info_matrix.append(["partition_column", partition_column, True, (str)])
 
         # Validate argument types
         _Validators._validate_function_arguments(arg_info_matrix)
@@ -823,9 +993,15 @@ class DataFrame():
                                  literals=literals,
                                  literals_columns=literals_columns,
                                  stats=stats,
-                                 stats_columns=stats_columns)
+                                 stats_columns=stats_columns,
+                                 partition_column=partition_column)
 
-        return fit_obj.transform(data=self).result
+        impute_transform = {
+            'data': self,
+            'data_partition_column': partition_column,
+            'object_partition_column': partition_column}
+
+        return fit_obj.transform(**impute_transform).result
 
     def __execute_node_and_set_table_name(self, nodeid, metaexpr=None):
         """
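The wiring above means a partitioned fillna() is now a SimpleImputeFit partitioned on the same column for both fit and transform. Roughly equivalent direct calls, using the keyword names visible in the hunk; the fit-side data argument and the single-string stats form are assumptions:

from teradataml import DataFrame, SimpleImputeFit

df = DataFrame.from_table("titanic")
fit_obj = SimpleImputeFit(data=df,                 # assumption: fit input keyword
                          stats_columns=["pclass", "fare"],
                          stats="mean",
                          partition_column="sex")
filled = fit_obj.transform(data=df,
                           data_partition_column="sex",
                           object_partition_column="sex").result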
@@ -924,6 +1100,7 @@ class DataFrame():
         self._column_names_and_types = []
         self._td_column_names_and_types = []
         self._td_column_names_and_sqlalchemy_types = {}
+        self._column_types = {}
 
         for col in self._metaexpr.c:
             if isinstance(col.type, sqlalchemy.sql.sqltypes.NullType):
@@ -931,9 +1108,11 @@ class DataFrame():
             else:
                 tdtype = "{}".format(col.type)
 
-            self._column_names_and_types.append((str(col.name), UtilFuncs._teradata_type_to_python_type(col.type)))
+            py_type = UtilFuncs._teradata_type_to_python_type(col.type)
+            self._column_names_and_types.append((str(col.name), py_type))
             self._td_column_names_and_types.append((str(col.name), tdtype))
             self._td_column_names_and_sqlalchemy_types[(str(col.name)).lower()] = col.type
+            self._column_types[(str(col.name)).lower()] = [py_type, col.type]
 
     def _get_metaexpr(self):
         """
@@ -952,7 +1131,24 @@ class DataFrame():
         meta = sqlalchemy.MetaData()
         db_schema = UtilFuncs._extract_db_name(self._table_name)
         db_table_name = UtilFuncs._extract_table_name(self._table_name)
-        t = sqlalchemy.Table(db_table_name, meta, schema=db_schema, autoload_with=eng)
+        if not self._datalake:
+            t = sqlalchemy.Table(db_table_name, meta, schema=db_schema, autoload_with=eng)
+            return _MetaExpression(t)
+
+        # Get metaexpression for datalake table.
+        # check existence of datalake table.
+        tdmlctx.get_connection().dialect.has_table(tdmlctx.get_connection(),
+                                                   self._table,
+                                                   schema=self._database,
+                                                   table_only=True,
+                                                   datalake=self._datalake)
+
+        # Extract column names and corresponding teradatasqlalchemy types.
+        col_names, col_types = df_utils._get_datalake_table_columns_info(self._database,
+                                                                         self._table,
+                                                                         self._datalake)
+        t = sqlalchemy.Table(self._table, meta, schema=self._database,
+                             *(Column(col_name, col_type) for col_name, col_type in zip(col_names, col_types)))
         return _MetaExpression(t)
 
     def __getattr__(self, name):
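For datalake tables the metaexpression can no longer come from autoload reflection, so the branch above builds the sqlalchemy Table from explicit Column objects. A standalone illustration of that construction (plain sqlalchemy, no Teradata connection needed; names and types are stand-ins for the _get_datalake_table_columns_info() output):

import sqlalchemy
from sqlalchemy import Column, Integer, String

meta = sqlalchemy.MetaData()
col_names = ["id", "name"]
col_types = [Integer(), String(30)]

t = sqlalchemy.Table("lake_table", meta,
                     *(Column(n, ty) for n, ty in zip(col_names, col_types)),
                     schema="datalake_db")
print([c.name for c in t.columns])   # ['id', 'name']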
@@ -2728,9 +2924,10 @@ class DataFrame():
             msg = Messages.get_message(errcode)
             raise TeradataMlException(msg, errcode)
 
+    @argument_deprecation("20.0.0.5", "include", False, None)
     @collect_queryband(queryband="DF_describe")
     def describe(self, percentiles=[.25, .5, .75], include=None, verbose=False, distinct=False, statistics=None,
-                 columns=None):
+                 columns=None, pivot=False):
         """
         DESCRIPTION:
             Generates statistics for numeric columns. This function can be used in two modes:
@@ -2762,12 +2959,12 @@ class DataFrame():
             include:
                 Optional Argument.
                 Values can be either None or "all".
-                If the value is "all",
+                If the value is "all", both numeric and non-numeric columns are included.
                 Computes count, mean, std, min, percentiles, and max for numeric columns.
                 Computes count and unique for non-numeric columns.
                 If the value is None, only numeric columns are used for collecting statistics.
                 Note:
-                    Value 'all' is not applicable for 'Time Series Aggregate Mode'.
+                    * Value 'all' is not applicable for 'Time Series Aggregate Mode'.
                 Default Values: None
                 Types: str
 
@@ -2807,7 +3004,14 @@ class DataFrame():
                 Specifies the name(s) of the columns we are collecting statistics for.
                 Default Values: None
                 Types: str or List of str
-
+
+            pivot:
+                Optional Argument.
+                Specifies a boolean value to pivot the output.
+                Note:
+                    * "pivot" is not supported for PTI tables.
+                Default Values: 'False'
+                Types: bool
 
         RETURNS:
             teradataml DataFrame
@@ -2829,7 +3033,7 @@ class DataFrame():
             Orange Inc  210.0  None  None  250  04/01/2017
 
             # Computes count, mean, std, min, percentiles, and max for numeric columns.
-            >>> df.describe()
+            >>> df.describe(pivot=True)
                       Apr    Feb     Mar    Jan
             func
             count       4      6       4      4
@@ -2841,8 +3045,45 @@ class DataFrame():
             75%       250  207.5  158.75  162.5
             max       250    210     215    200
 
+            # Computes count, mean, std, min, percentiles, and max for numeric columns with
+            # default arguments.
+            >>> df.describe()
+            ATTRIBUTE            StatName           StatValue
+                  Jan             MAXIMUM               200.0
+                  Jan  STANDARD DEVIATION   62.91528696058958
+                  Jan     PERCENTILES(25)               125.0
+                  Jan     PERCENTILES(50)               150.0
+                  Mar               COUNT                 4.0
+                  Mar             MINIMUM                95.0
+                  Mar             MAXIMUM               215.0
+                  Mar                MEAN               147.5
+                  Mar  STANDARD DEVIATION     49.749371855331
+                  Mar     PERCENTILES(25)              128.75
+                  Mar     PERCENTILES(50)               140.0
+                  Apr               COUNT                 4.0
+                  Apr             MINIMUM               101.0
+                  Apr             MAXIMUM               250.0
+                  Apr                MEAN              195.25
+                  Apr  STANDARD DEVIATION   70.97123830585646
+                  Apr     PERCENTILES(25)              160.25
+                  Apr     PERCENTILES(50)               215.0
+                  Apr     PERCENTILES(75)               250.0
+                  Feb               COUNT                 6.0
+                  Feb             MINIMUM                90.0
+                  Feb             MAXIMUM               210.0
+                  Feb                MEAN  166.66666666666666
+                  Feb  STANDARD DEVIATION  59.553897157672786
+                  Feb     PERCENTILES(25)               117.5
+                  Feb     PERCENTILES(50)               200.0
+                  Feb     PERCENTILES(75)               207.5
+                  Mar     PERCENTILES(75)              158.75
+                  Jan     PERCENTILES(75)               162.5
+                  Jan                MEAN               137.5
+                  Jan             MINIMUM                50.0
+                  Jan               COUNT                 4.0
+
             # Computes count, mean, std, min, percentiles, and max for numeric columns with 30th and 60th percentiles.
-            >>> df.describe(percentiles=[.3, .6])
+            >>> df.describe(percentiles=[.3, .6], pivot=True)
                       Apr    Feb     Mar    Jan
             func
             count       4      6       4      4
@@ -2855,7 +3096,7 @@ class DataFrame():
 
             # Computes count, mean, std, min, percentiles, and max for numeric columns group by "datetime" and "Feb".
             >>> df1 = df.groupby(["datetime", "Feb"])
-            >>> df1.describe()
+            >>> df1.describe(pivot=True)
                                    Jan   Mar  Apr
             datetime   Feb   func
             04/01/2017 90.0  25%    50    95  101
@@ -2883,22 +3124,6 @@ class DataFrame():
                              min   200   215  250
                              std  None  None    0
 
-            # Computes count, mean, std, min, percentiles, and max for numeric columns and
-            # computes count and unique for non-numeric columns
-            >>> df.describe(include="all")
-                   accounts     Feb    Jan     Mar     Apr datetime
-            func
-            25%        None   117.5    125  128.75  160.25     None
-            75%        None   207.5  162.5  158.75     250     None
-            count         6       6      4       4       4        6
-            mean       None 166.667  137.5   147.5  195.25     None
-            max        None     210    200     215     250     None
-            min        None      90     50      95     101     None
-            50%        None     200    150     140     215     None
-            std        None  59.554 62.915  49.749  70.971     None
-            unique        6    None   None    None    None        1
-
-            #
             # Examples for describe() function as Time Series Aggregate.
             #
             >>> # Load the example datasets.
@@ -3081,7 +3306,7 @@ class DataFrame():
             >>>
         """
 
-        # Argument validations
+        # -------------Argument validations---------------#
         awu_matrix = []
         awu_matrix.append(["columns", columns, True, (str, list), True])
         awu_matrix.append(["percentiles", percentiles, True, (float, list)])
@@ -3090,6 +3315,7 @@ class DataFrame():
         awu_matrix.append(["distinct", distinct, True, (bool)])
         awu_matrix.append(["statistics", statistics, True, (str, list), True,
                            ["count", "mean", "min", "max", "unique", "std", "describe", "percentile"]])
+        awu_matrix.append(["pivot", pivot, True, (bool)])
 
         # Validate argument types
         _Validators._validate_function_arguments(awu_matrix)
@@ -3133,22 +3359,27 @@ class DataFrame():
         if verbose and not isinstance(self, DataFrameGroupByTime):
             raise ValueError(Messages.get_message(MessageCodes.ARG_VALUE_CLASS_DEPENDENCY).format(
                 'verbose', 'Aggregation', 'True', 'describe()', 'DataFrameGroupByTime'))
+        # -------------End of argument validations---------------#
 
         function_label = "func"
+        sort_cols = []
         try:
             self.__execute_node_and_set_table_name(self._nodeid)
 
             groupby_column_list = None
-            if isinstance(self, DataFrameGroupBy):
+            if isinstance(self, DataFrameGroupByTime) or isinstance(self, DataFrameGroupBy):
                 groupby_column_list = self.groupby_column_list
-
+                if columns:
+                    df_utils._validate_describe_columns(columns=columns, metaexpr=self._metaexpr,
+                                                        groupby_column_list=groupby_column_list)
+                sort_cols = list(groupby_column_list)
 
-
-
-            df_utils._invalid_describe_column(df=self, columns=columns, metaexpr=self._metaexpr,
-                                              groupby_column_list=groupby_column_list)
+            # 'func' column will be always there in result.
+            sort_cols.append(function_label)
 
+            # Handle DataFrameGroupByTime using union all approach and
+            # other DataFrames using TD_UnivariateStatistics approach.
+            if isinstance(self, DataFrameGroupByTime):
                 # Construct the aggregate query.
                 agg_query = df_utils._construct_describe_query(df=self, columns=columns, metaexpr=self._metaexpr,
                                                                percentiles=percentiles, function_label=function_label,
@@ -3160,29 +3391,99 @@ class DataFrame():
                                                                timecode_column=self._timecode_column,
                                                                sequence_column=self._sequence_column,
                                                                fill=self._fill)
+
+                if groupby_column_list is not None:
+                    df = DataFrame.from_query(agg_query, index_label=sort_cols)
+                    df2 = df.sort(sort_cols)
+                    df2._metaexpr._n_rows = 100
+                    describe_df = df2
+                else:
+                    describe_df = DataFrame.from_query(agg_query, index_label=function_label)
+
+                # Check if numeric overflow can occur for result DataFrame.
+                if self._check_numeric_overflow(describe_df):
+                    result_df = self._promote_dataframe_types()
+                    describe_df = result_df.describe(pivot=True)
+                return describe_df
+
             else:
-                #
-                ... (15 further removed lines of the old regular-aggregate branch are not rendered in this diff view)
+                # If pivot is True, then construct the aggregate query and return the result DataFrame.
+                # Otherwise, return the result DataFrame in the regular aggregate mode using UnivariateStatistics.
+
+                if pivot:
+                    # Construct the aggregate query.
+                    agg_query = df_utils._construct_describe_query(df=self, columns=columns, metaexpr=self._metaexpr,
+                                                                   percentiles=percentiles, function_label=function_label,
+                                                                   groupby_column_list=groupby_column_list, include=include,
+                                                                   is_time_series_aggregate=False, verbose=verbose,
+                                                                   distinct=distinct, statistics=statistics)
+
+                    if groupby_column_list is not None:
+                        sort_cols = [i for i in groupby_column_list]
+                        sort_cols.append(function_label)
+                        df = DataFrame.from_query(agg_query, index_label=sort_cols)
+                        df2 = df.sort(sort_cols)
+                        df2._metaexpr._n_rows = 100
+                        describe_df = df2
+                    else:
+                        describe_df = DataFrame.from_query(agg_query, index_label=function_label)
+
+                    # Check if numeric overflow can occur for result DataFrame.
+                    if self._check_numeric_overflow(describe_df):
+                        result_df = self._promote_dataframe_types()
+                        describe_df = result_df.describe(pivot=True)
+
+                    return describe_df
+
+                # If columns is None, then all dataframe columns are considered.
+                if columns is None:
+                    columns = self.columns
+                # Exclude groupby columns
+                if groupby_column_list is not None:
+                    columns = [col for col in columns if col not in groupby_column_list]
+
+                numeric_cols = []
+
+                # Extract numeric columns and their types of all columns
+                for col in self._metaexpr.c:
+                    if type(col.type) in UtilFuncs()._get_numeric_datatypes() and \
+                            col.name in columns:
+                        numeric_cols.append(col.name)
+
+                if numeric_cols:
+                    # Default statistics for 'Regular Aggregate Mode'
+                    sql_stat = ["COUNT", "MAXIMUM", "MEAN", "MINIMUM", "PERCENTILES", "STANDARD DEVIATION"]
+
+                    if statistics is not None:
+                        py_to_sql_func_map = {"count": "COUNT",
+                                              "max": "MAXIMUM",
+                                              "mean": "MEAN",
+                                              "unique": 'UNIQUE ENTITY COUNT',
+                                              "min": "MINIMUM",
+                                              "percentile": "PERCENTILES",
+                                              "std": "STANDARD DEVIATION"}
+                        # Convert statistics into corresponding SQL function names
+                        sql_stat = [py_to_sql_func_map[stat] for stat in UtilFuncs()._as_list(statistics)]
+
+                    # Convert percentiles to centiles for univariate statistics
+                    centiles = list(map(lambda n: int(n * 100), percentiles))
+
+                    # UnivariateStatistics parameters
+                    univar_param = {
+                        "newdata": self.select(self.columns),
+                        "target_columns": numeric_cols,
+                        "partition_columns": groupby_column_list,
+                        "centiles": centiles,
+                        "stats": sql_stat
+                    }
+
+                    from teradataml import UnivariateStatistics
+                    # Run UnivariateStatistics
+                    aggr_df = UnivariateStatistics(**univar_param).result
+
+                    # Return the result in teradataml format
+                    return aggr_df
 
-            # Check if numeric overflow can occur for result DataFrame.
-            if self._check_numeric_overflow(describe_df):
-                result_df = self._promote_dataframe_types()
-                describe_df = result_df.describe()
-            return describe_df
         except TeradataMlException:
             raise
         except Exception as err:
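So in regular aggregate mode, describe() is now a thin front end over the UnivariateStatistics analytic function, with pandas-style percentiles converted to integer centiles. A sketch of the equivalent direct call, reusing the parameter names visible above; the sales DataFrame and its column names are assumptions taken from the docstring examples:

from teradataml import DataFrame, UnivariateStatistics

df = DataFrame("sales")
centiles = [int(p * 100) for p in [.25, .5, .75]]   # [25, 50, 75]
stats_df = UnivariateStatistics(newdata=df.select(df.columns),
                                target_columns=["Feb", "Jan", "Mar", "Apr"],
                                centiles=centiles,
                                stats=["COUNT", "MAXIMUM", "MEAN", "MINIMUM",
                                       "PERCENTILES", "STANDARD DEVIATION"]).result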
@@ -5765,7 +6066,35 @@ class DataFrame():
 
     def _repr_html_(self):
         """ Print method for teradataml for iPython rich display. """
+        self._generate_output_html()
+        if display.enable_ui:
+            # EDA Ui widget representation using teradatamlwidgets
+            if self._eda_ui is None:
+                from teradatamlwidgets.eda.Ui import Ui
+                self._eda_ui = Ui(df=self, html=self.html)
+            else:
+                self._eda_ui.display_ui()
+        return self.html
+
+    def get_eda_ui(self):
+        """
+        Returns the EDA representation UI.
+
+        PARAMETERS:
+            None.
 
+        EXCEPTIONS:
+            None.
+
+        RETURNS:
+            teradatamlwidgets.eda.Ui
+
+        EXAMPLE:
+            df = ui.get_eda_ui()
+        """
+        return self._eda_ui
+
+    def _generate_output_html(self, disable_types=True):
         # Check if class attributes __data and __data_columns are not None.
         # If not None, reuse the data and columns.
         # If None, generate latest results.
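A sketch of the notebook flow these hooks enable; it assumes Jupyter and the optional teradatamlwidgets package, which the lazy import above pulls in only when the UI is actually requested:

from teradataml import DataFrame
from teradataml.options.display import display

display.enable_ui = True      # rich EDA widget instead of the plain HTML table
df = DataFrame("titanic")
df                            # in a notebook, _repr_html_ now builds/shows the EDA UI

ui = df.get_eda_ui()          # handle on the widget
out = df.get_output(0)        # first result of a function run from the Analyze tab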
@@ -5778,17 +6107,25 @@ class DataFrame():
         dindent = indent + indent
 
         header_html = ['<style type="text/css">',
-                       'table {border:ridge 5px;}',
+                       'table { border:ridge 5px}',
                        'table td {border:inset 1px;}',
-                       'table tr#HeaderRow {background-color:grey; color:white;}'
+                       'table tr#HeaderRow {background-color:grey; color:white;}',
                        '</style>\n'
                        ]
         html = "\n{0}".format(indent).join(header_html)
-        html += '<html><table>\n{0}<tr id="HeaderRow">\n'.format(indent)
+        html += '<html><table style="min-width:1000px;">\n{0}<tr id="HeaderRow">\n'.format(indent)
+
+        columns_html = "</th><th>".join(self.__data_columns)
+        html += "<th>{0}</th>\n".format(columns_html)
+        html += "</tr>\n"
 
-        ... (3 removed lines not rendered in this diff view)
+        if not disable_types:
+            html += '<tr>\n'.format(indent)
+            col_types = [repr(self._td_column_names_and_sqlalchemy_types[column]) for column in
+                         self.__data_columns]
+            columns_types_html = "</td>\n{0}<td>".format(dindent).join(col_types)
+            html += "{0}<td>{1}</td>\n".format(dindent, columns_types_html)
+            html += "{0}</tr>\n".format(indent)
 
         for row in self.__data:
             row_html = ["{0}<td>{1}</td>\n".format(dindent,
@@ -5796,8 +6133,31 @@ class DataFrame():
             html += "{1}<tr>\n{0}{1}</tr>\n".format("".join(row_html), indent)
 
         html += "</table></html>"
+        self.html = html
+
+    def get_output(self, output_index=0):
+        """
+        DESCRIPTION:
+            Returns the result of analytic function when analytic function is
+            run from 'Analyze' tab in EDA UI.
+            Note:
+                * The function does not return anything if analytic function is
+                  not run from EDA UI.
+
+        PARAMETERS:
+            output_index:
+                Optional Argument.
+                Specifies the index of the output dataframe to be returned.
+                Default Value: 0
+                Types: int
+
+        RAISES:
+            IndexError
 
-        return html
+        RETURNS:
+            teradataml DataFrame object.
+        """
+        return self._eda_ui.get_output_dataframe(output_index=output_index)
 
     def __get_data_columns(self):
         """
@@ -6857,7 +7217,8 @@ class DataFrame():
             compiled_condition = condition.compile(compile_kwargs={'include_table': True,
                                                                    'literal_binds': True,
                                                                    'table_name_kind': '_join_alias',
-                                                                   'compile_with_caller_table': True})
+                                                                   'compile_with_caller_table': True,
+                                                                   'table_only': True})
 
             all_join_conditions.append(compiled_condition)
 
@@ -7571,14 +7932,14 @@ class DataFrame():
             _Validators._check_auth_token("udf")
             for colname, col in udf_expr.items():
                 env_name = UtilFuncs._get_env_name(col)
-                # Store the env_name and its corresponding output column 
+                # Store the env_name and its corresponding output column
                 if env_name in env_mapper:
                     env_mapper[env_name].append(colname)
                 else:
                     env_mapper[env_name] = [colname]
         else:
             env_mapper[env_name] = udf_expr.keys()
-        
+
         for env_name, cols in env_mapper.items():
             # Create a dictionary of output columns to column type.
             returns = OrderedDict([(column.name, column.type) for column in df._metaexpr.c])
@@ -7625,11 +7986,11 @@ class DataFrame():
 
         df = tbl_operators.execute()
         return df
-    
+
     def _assign_call_udf(self, call_udf_expr):
         """
         DESCRIPTION:
-            Internal function for DataFrame.assign() to execute the call_udf using 
+            Internal function for DataFrame.assign() to execute the call_udf using
             Script/Apply Table Operator and create new column for teradataml DataFrame.
 
         PARAMETER:
@@ -7656,7 +8017,7 @@ class DataFrame():
         # Create a dictionary of output columns to column type (python types).
         output_type_converters = {col_name: _Dtypes._teradata_type_to_python_type(col_type) \
                                   for col_name, col_type in returns.items()}
-        
+
         for colname, col in call_udf_expr.items():
             returns[colname] = col.type
             output_type_converters[colname] = _Dtypes._teradata_type_to_python_type(col.type)
@@ -7782,7 +8143,7 @@ class DataFrame():
                Look at Example 18 to understand more.
            8. While passing multiple udf expressions, one can not pass one column output
               as another column input in the same ``assign`` call.
-           9. If user pass multiple udf expressions, delimiter and quotechar specified in 
+           9. If user pass multiple udf expressions, delimiter and quotechar specified in
              last udf expression are considered for processing.
 
         RAISES:
@@ -8147,13 +8508,13 @@ class DataFrame():
             Red Inc     200.0  150.0  140.0    NaN  17/01/04     201.0      abc    RED INC  207
             >>>
 
-            # Example 19: Convert the values is 'accounts' column to upper case using a user 
+            # Example 19: Convert the values is 'accounts' column to upper case using a user
             # defined function on Vantage Cloud Lake.
             # Create a Python 3.10.5 environment with given name and description in Vantage.
             >>> env = create_env('test_udf', 'python_3.10.5', 'Test environment for UDF')
             User environment 'test_udf' created.
             >>>
-            # Create a user defined functions to 'to_upper' to get the values in upper case 
+            # Create a user defined functions to 'to_upper' to get the values in upper case
             # and pass the user env to run it on.
             >>> from teradataml.dataframe.functions import udf
             >>> @udf(env_name = env)
@@ -8165,7 +8526,7 @@ class DataFrame():
             # to the DataFrame.
             >>> df.assign(upper_stats = to_upper('accounts'))
                          Feb    Jan    Mar    Apr  datetime upper_stats
-            accounts                                                    
+            accounts
             Alpha Co   210.0  200.0  215.0  250.0  17/01/04    ALPHA CO
             Blue Inc    90.0   50.0   95.0  101.0  17/01/04    BLUE INC
             Yellow Inc  90.0    NaN    NaN    NaN  17/01/04  YELLOW INC
@@ -8184,12 +8545,12 @@ class DataFrame():
             # Register the created user defined function with name "upper".
             >>> register("upper", to_upper)
             >>>
-            # Call the user defined function registered with name "upper" and assign the 
+            # Call the user defined function registered with name "upper" and assign the
             # ColumnExpression returned to the DataFrame.
             >>> res = df.assign(upper_col = call_udf("upper", ('accounts',)))
             >>> res
                          Feb    Jan    Mar    Apr  datetime upper_col
-            accounts                                                  
+            accounts
             Alpha Co   210.0  200.0  215.0  250.0  17/01/04  ALPHA CO
             Blue Inc    90.0   50.0   95.0  101.0  17/01/04  BLUE INC
             Yellow Inc  90.0    NaN    NaN    NaN  17/01/04  YELLOW INC
@@ -8475,7 +8836,9 @@ class DataFrame():
         _Validators._validate_column_exists_in_dataframe(keys, self._metaexpr)
 
         try:
-            new_index_list = self._index_label if self._index_label is not None else []
+
+            # Slicing creates a new list instance with the same contents.
+            new_index_list = self._index_label[:] if self._index_label is not None else []
 
             # Creating a list with requested index labels bases on append
             if append:
@@ -8490,7 +8853,7 @@ class DataFrame():
                 new_index_list = keys
 
             # Takes care of appending already existing index
-            new_index_list = list(set(new_index_list))
+            new_index_list = list(dict.fromkeys(new_index_list))
 
             # In case requested index is same as existing index, return same DF
             if new_index_list == self._index_label:
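The switch to dict.fromkeys is worth calling out: it de-duplicates while preserving first-seen order, which matters for index labels and which a set-based rewrite would not guarantee. A two-line demonstration:

labels = ["id", "ts", "id", "region", "ts"]
print(list(dict.fromkeys(labels)))   # ['id', 'ts', 'region'] -- order kept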
@@ -9373,15 +9736,15 @@ class DataFrame():
             TypeError, ValueError, TeradataMlException
 
         EXAMPLES:
-            # Load the example datasets.
-            load_example_data("dataframe", ["ocean_buoys", "ocean_buoys_nonpti"])
+            # Load the example datasets.
+            >>> load_example_data("dataframe", ["ocean_buoys", "ocean_buoys_nonpti"])
             >>>
 
-            # Create the required DataFrames.
-            # DataFrame on non-sequenced PTI table
-            ocean_buoys = DataFrame("ocean_buoys")
-            # Check DataFrame columns and let's peek at the data
-            ocean_buoys.columns
+            # Create the required DataFrames.
+            # DataFrame on non-sequenced PTI table
+            >>> ocean_buoys = DataFrame("ocean_buoys")
+            # Check DataFrame columns and let's peek at the data
+            >>> ocean_buoys.columns
             ['buoyid', 'TD_TIMECODE', 'temperature', 'salinity']
             >>> ocean_buoys.head()
                                     TD_TIMECODE  temperature  salinity
@@ -9397,10 +9760,10 @@ class DataFrame():
             0  2014-01-06 08:00:00.000000         10.0        55
             0  2014-01-06 08:10:00.000000         10.0        55
 
-            # DataFrame on NON-PTI table
-            ocean_buoys_nonpti = DataFrame("ocean_buoys_nonpti")
-            # Check DataFrame columns and let's peek at the data
-            ocean_buoys_nonpti.columns
+            # DataFrame on NON-PTI table
+            >>> ocean_buoys_nonpti = DataFrame("ocean_buoys_nonpti")
+            # Check DataFrame columns and let's peek at the data
+            >>> ocean_buoys_nonpti.columns
             ['buoyid', 'timecode', 'temperature', 'salinity']
             >>> ocean_buoys_nonpti.head()
                                  buoyid  temperature  salinity
@@ -9974,6 +10337,15 @@ class DataFrame():
         # If user did not pass any arguments which form join conditions,
        # Merge is performed using index columns of TeradataML DataFrames
         if on is None and left_on is None and right_on is None and not use_index:
+            # DataFrames created on OTF table will not have index.
+            if self._datalake is not None or right._datalake is not None:
+                msg_code = MessageCodes.EXECUTION_FAILED
+                emsg = "Either 'on' argument or both 'left_on' and 'right_on' arguments" \
+                       " must be provided to merge DataFrames when they are created on" \
+                       " OTF table(s)."
+                error_msg = Messages.get_message(msg_code, "merge dataframes", emsg)
+                raise TeradataMlException(error_msg, msg_code)
+
             if self._index_label is None or right._index_label is None:
                 raise TeradataMlException(
                     Messages.get_message(MessageCodes.TDMLDF_INDEXES_ARE_NONE), MessageCodes.TDMLDF_INDEXES_ARE_NONE)
@@ -9981,6 +10353,12 @@ class DataFrame():
             use_index = True
 
         if use_index:
+            if self._datalake is not None or right._datalake is not None:
+                msg_code = MessageCodes.EXECUTION_FAILED
+                emsg = "Can not use Index to merge DataFrames when they are created on OTF table(s)."
+                error_msg = Messages.get_message(msg_code, "merge dataframes", emsg)
+                raise TeradataMlException(error_msg, msg_code)
+
             if self._index_label is None or right._index_label is None:
                 raise TeradataMlException(
                     Messages.get_message(MessageCodes.TDMLDF_INDEXES_ARE_NONE), MessageCodes.TDMLDF_INDEXES_ARE_NONE)
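Both guards enforce the same rule: OTF-backed DataFrames carry no index, so merges between them need explicit join keys. A sketch, assuming two datalake tables that share an "id" column:

from teradataml import DataFrame
from teradataml.dataframe.dataframe import in_schema

left = DataFrame(in_schema("lake_db", "orders", "datalake"))
right = DataFrame(in_schema("lake_db", "customers", "datalake"))

joined = left.merge(right, on="id", how="inner")   # OK: explicit key
# left.merge(right, use_index=True)                # raises TeradataMlException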
@@ -10636,7 +11014,7 @@ class DataFrame():
                     2. seed is supported for stratify column.
                     3. Arguments "stratify_column", "seed", "id_column" are supported only
                        for stratifying the data.
-                    Types: str
+                    Types: str OR Feature
 
             seed:
                 Optional Argument.
@@ -10662,7 +11040,7 @@ class DataFrame():
                        for stratifying the data.
                     2. "id_column" is supported only when "stratify_column" is used.
                        Ignored otherwise.
-                    Types: str
+                    Types: str OR Feature
 
         RETURNS:
             teradataml DataFrame
@@ -12696,8 +13074,8 @@ class DataFrame():
         _Validators._validate_column_exists_in_dataframe(column_names, self._metaexpr,
                                                          False)
         column_names = list(dict.fromkeys(column_names))
-
-        if list_td_reserved_keywords(column_names):
+
+        if list_td_reserved_keywords(column_names) or UtilFuncs._is_ascii(column_names):
             column_names = UtilFuncs._teradata_quote_arg(column_names, "\"", False)
 
         col_names_types = df_utils._get_required_columns_types_from_metaexpr(self._metaexpr, column_names)
@@ -14617,7 +14995,18 @@ class DataFrame():
             >>> plot.show()
 
         """
-        return _Plot(x=x, y=y, scale=scale, kind=kind, **kwargs)
+
+        _plot = _Plot(x=x, y=y, scale=scale, kind=kind, **kwargs)
+        # If plot is already generated, return the same plot.
+        if self._plot is None:
+            self._plot = _plot
+            return _plot
+
+        if self._plot == _plot:
+            return self._plot
+        else:
+            self._plot = _plot
+            return _plot
 
     @collect_queryband(queryband="DF_itertuples")
     def itertuples(self, name='Row', num_rows=None):
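plot() now caches the last _Plot and hands it back when an equal plot is requested again. A standalone sketch of that compare-then-cache pattern; PlotSpec and Frame here are hypothetical stand-ins, not teradataml classes:

from dataclasses import dataclass

@dataclass(frozen=True)
class PlotSpec:
    x: str
    y: str
    kind: str = "line"

class Frame:
    def __init__(self):
        self._plot = None

    def plot(self, x, y, kind="line"):
        new = PlotSpec(x, y, kind)
        if self._plot == new:     # equal request: reuse the cached object
            return self._plot
        self._plot = new          # otherwise cache and return the new one
        return new

f = Frame()
p1 = f.plot("Jan", "Feb")
p2 = f.plot("Jan", "Feb")
assert p2 is p1                   # second call returned the cached plot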
@@ -17510,11 +17899,18 @@ class _TDUAF(DataFrame):
         table_name = self._db_utils._execute_node_return_db_object_name(self._data._nodeid, self._data._metaexpr)
 
         # UAF Functions do not accept double quotes.
+        tdp = preparer(td_dialect)
         db_name = UtilFuncs._extract_db_name(table_name)
-        if db_name:
-            table_name = '{}.{}'.format(db_name, UtilFuncs._extract_table_name(table_name))
+        datalake_name = UtilFuncs._extract_datalake_name(table_name)
+        if datalake_name:
+            table_name = '{}.{}.{}'.format(tdp.quote(datalake_name),
+                                           tdp.quote(db_name),
+                                           tdp.quote(UtilFuncs._extract_table_name(table_name)))
+        elif db_name:
+            table_name = '{}.{}'.format(tdp.quote(db_name),
+                                        tdp.quote(UtilFuncs._extract_table_name(table_name)))
         else:
-            table_name = UtilFuncs._extract_table_name(table_name)
+            table_name = tdp.quote(UtilFuncs._extract_table_name(table_name))
 
         sql_clauses.append("TABLE_NAME ({})")
         sql_values.append(table_name)