teradataml 20.0.0.2__py3-none-any.whl → 20.0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of teradataml might be problematic. Click here for more details.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/README.md +315 -2
- teradataml/__init__.py +4 -0
- teradataml/_version.py +1 -1
- teradataml/analytics/analytic_function_executor.py +95 -8
- teradataml/analytics/byom/__init__.py +1 -1
- teradataml/analytics/json_parser/metadata.py +12 -3
- teradataml/analytics/json_parser/utils.py +7 -2
- teradataml/analytics/sqle/__init__.py +5 -1
- teradataml/analytics/table_operator/__init__.py +1 -1
- teradataml/analytics/uaf/__init__.py +1 -1
- teradataml/analytics/utils.py +4 -0
- teradataml/analytics/valib.py +18 -4
- teradataml/automl/__init__.py +51 -6
- teradataml/automl/data_preparation.py +59 -35
- teradataml/automl/data_transformation.py +58 -33
- teradataml/automl/feature_engineering.py +27 -12
- teradataml/automl/model_training.py +73 -46
- teradataml/common/constants.py +88 -29
- teradataml/common/garbagecollector.py +2 -1
- teradataml/common/messagecodes.py +19 -3
- teradataml/common/messages.py +6 -1
- teradataml/common/sqlbundle.py +64 -12
- teradataml/common/utils.py +246 -47
- teradataml/common/warnings.py +11 -0
- teradataml/context/context.py +161 -27
- teradataml/data/amazon_reviews_25.csv +26 -0
- teradataml/data/byom_example.json +11 -0
- teradataml/data/dataframe_example.json +18 -2
- teradataml/data/docs/byom/docs/DataRobotPredict.py +2 -2
- teradataml/data/docs/byom/docs/DataikuPredict.py +40 -1
- teradataml/data/docs/byom/docs/H2OPredict.py +2 -2
- teradataml/data/docs/byom/docs/ONNXEmbeddings.py +242 -0
- teradataml/data/docs/byom/docs/ONNXPredict.py +2 -2
- teradataml/data/docs/byom/docs/PMMLPredict.py +2 -2
- teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/Shap.py +34 -6
- teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +4 -4
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
- teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
- teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +4 -1
- teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
- teradataml/data/hnsw_alter_data.csv +5 -0
- teradataml/data/hnsw_data.csv +10 -0
- teradataml/data/jsons/byom/h2opredict.json +1 -1
- teradataml/data/jsons/byom/onnxembeddings.json +266 -0
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +0 -1
- teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
- teradataml/data/jsons/sqle/20.00/TD_HNSW.json +296 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWPredict.json +206 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWSummary.json +32 -0
- teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
- teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
- teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
- teradataml/data/jsons/uaf/17.20/TD_SAX.json +3 -1
- teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +15 -5
- teradataml/data/medical_readings.csv +101 -0
- teradataml/data/patient_profile.csv +101 -0
- teradataml/data/scripts/lightgbm/dataset.template +157 -0
- teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
- teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
- teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
- teradataml/data/scripts/sklearn/sklearn_fit.py +194 -167
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
- teradataml/data/scripts/sklearn/sklearn_function.template +14 -19
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
- teradataml/data/scripts/sklearn/sklearn_transform.py +129 -42
- teradataml/data/target_udt_data.csv +8 -0
- teradataml/data/templates/open_source_ml.json +3 -2
- teradataml/data/teradataml_example.json +8 -0
- teradataml/data/vectordistance_example.json +4 -0
- teradataml/dataframe/copy_to.py +8 -3
- teradataml/dataframe/data_transfer.py +11 -1
- teradataml/dataframe/dataframe.py +1049 -285
- teradataml/dataframe/dataframe_utils.py +152 -20
- teradataml/dataframe/functions.py +578 -35
- teradataml/dataframe/setop.py +11 -6
- teradataml/dataframe/sql.py +185 -16
- teradataml/dbutils/dbutils.py +1049 -115
- teradataml/dbutils/filemgr.py +48 -1
- teradataml/hyperparameter_tuner/optimizer.py +12 -1
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/__init__.py +1 -1
- teradataml/opensource/_base.py +1466 -0
- teradataml/opensource/_class.py +464 -0
- teradataml/opensource/{sklearn/constants.py → _constants.py} +21 -14
- teradataml/opensource/_lightgbm.py +949 -0
- teradataml/opensource/_sklearn.py +1008 -0
- teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +5 -6
- teradataml/options/__init__.py +54 -38
- teradataml/options/configure.py +131 -27
- teradataml/options/display.py +13 -2
- teradataml/plot/axis.py +47 -8
- teradataml/plot/figure.py +33 -0
- teradataml/plot/plot.py +63 -13
- teradataml/scriptmgmt/UserEnv.py +5 -5
- teradataml/scriptmgmt/lls_utils.py +130 -40
- teradataml/store/__init__.py +12 -0
- teradataml/store/feature_store/__init__.py +0 -0
- teradataml/store/feature_store/constants.py +291 -0
- teradataml/store/feature_store/feature_store.py +2318 -0
- teradataml/store/feature_store/models.py +1505 -0
- teradataml/table_operators/Apply.py +32 -18
- teradataml/table_operators/Script.py +3 -1
- teradataml/table_operators/TableOperator.py +3 -1
- teradataml/table_operators/query_generator.py +3 -0
- teradataml/table_operators/table_operator_query_generator.py +3 -1
- teradataml/table_operators/table_operator_util.py +37 -38
- teradataml/table_operators/templates/dataframe_register.template +69 -0
- teradataml/utils/dtypes.py +51 -2
- teradataml/utils/internal_buffer.py +18 -0
- teradataml/utils/validators.py +99 -8
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/METADATA +321 -5
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/RECORD +121 -94
- teradataml/libaed_0_1.dylib +0 -0
- teradataml/libaed_0_1.so +0 -0
- teradataml/opensource/sklearn/__init__.py +0 -1
- teradataml/opensource/sklearn/_class.py +0 -255
- teradataml/opensource/sklearn/_sklearn_wrapper.py +0 -1800
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/zip-safe +0 -0
|
@@ -12,13 +12,14 @@ This file implements util functions of data frame.
|
|
|
12
12
|
"""
|
|
13
13
|
|
|
14
14
|
import numbers
|
|
15
|
+
import re
|
|
15
16
|
import pandas as pd
|
|
16
17
|
from collections import OrderedDict
|
|
17
18
|
|
|
18
19
|
from teradataml.common.utils import UtilFuncs
|
|
19
20
|
from teradataml.common.aed_utils import AedUtils
|
|
20
21
|
from teradataml.common.constants import AEDConstants, PTITableConstants, \
|
|
21
|
-
SQLPattern, PythonTypes
|
|
22
|
+
SQLPattern, PythonTypes, TeradataConstants, SQLConstants
|
|
22
23
|
from teradataml.common.sqlbundle import SQLBundle
|
|
23
24
|
from teradataml.common.exceptions import TeradataMlException
|
|
24
25
|
from teradataml.common.messages import Messages
|
|
@@ -30,6 +31,7 @@ from teradataml.dbutils.dbutils import _execute_query_and_generate_pandas_df
|
|
|
30
31
|
|
|
31
32
|
from teradataml.options.display import display
|
|
32
33
|
from teradataml.options.configure import configure
|
|
34
|
+
from teradataml.utils.dtypes import _DtypesMappers
|
|
33
35
|
from teradataml.utils.utils import execute_sql
|
|
34
36
|
|
|
35
37
|
from teradatasqlalchemy.types import FLOAT, NUMBER, DECIMAL, PERIOD_TIMESTAMP
|
|
@@ -77,7 +79,10 @@ class DataFrameUtils():
|
|
|
77
79
|
is_persist = True
|
|
78
80
|
|
|
79
81
|
try:
|
|
80
|
-
if
|
|
82
|
+
if configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE:
|
|
83
|
+
UtilFuncs._create_table(view_names[index], queries[index], volatile=True)
|
|
84
|
+
|
|
85
|
+
elif node_query_types[index] == AEDConstants.AED_QUERY_NODE_TYPE_ML_QUERY_MULTI_OUTPUT.value or\
|
|
81
86
|
("OUT TABLE " in queries[index] and SQLPattern.SQLMR.value.match(queries[index])) or \
|
|
82
87
|
is_persist:
|
|
83
88
|
# TODO:: OR condition in above needs to be removed once AED support is added.
|
|
@@ -1291,43 +1296,32 @@ class DataFrameUtils():
|
|
|
1291
1296
|
|
|
1292
1297
|
aggregate_expr = ", ".join(select_columns)
|
|
1293
1298
|
return aggregate_expr, new_column_names, new_column_types
|
|
1294
|
-
|
|
1299
|
+
|
|
1295
1300
|
@staticmethod
|
|
1296
|
-
def
|
|
1301
|
+
def _validate_describe_columns(columns, metaexpr, groupby_column_list):
|
|
1297
1302
|
"""
|
|
1298
|
-
Internal function to validate columns provided to describe()
|
|
1303
|
+
Internal function to validate columns provided to describe() are correct or not,
|
|
1299
1304
|
when DataFrame is output of groupby and groupby_time.
|
|
1300
|
-
|
|
1301
1305
|
PARAMETERS:
|
|
1302
|
-
df:
|
|
1303
|
-
Required Argument.
|
|
1304
|
-
Specifies teradataml DataFrame we are collecting statistics for.
|
|
1305
|
-
Types: str
|
|
1306
|
-
|
|
1307
1306
|
columns:
|
|
1308
1307
|
Optional Argument.
|
|
1309
1308
|
Specifies the name(s) of columns we are collecting statistics for.
|
|
1310
1309
|
Types: str or List of strings (str)
|
|
1311
|
-
|
|
1312
1310
|
metaexpr:
|
|
1313
1311
|
Required Argument.
|
|
1314
1312
|
Specifies the meta expression for the dataframe.
|
|
1315
1313
|
Types: _MetaExpression
|
|
1316
|
-
|
|
1317
1314
|
groupby_column_list:
|
|
1318
1315
|
Optional Argument.
|
|
1319
1316
|
Specifies the group by columns for the dataframe.
|
|
1320
1317
|
Default Values: None.
|
|
1321
1318
|
Types: str or List of strings (str)
|
|
1322
|
-
|
|
1323
1319
|
Returns:
|
|
1324
1320
|
None
|
|
1325
|
-
|
|
1326
1321
|
Raises:
|
|
1327
1322
|
TeradataMLException
|
|
1328
1323
|
"""
|
|
1329
|
-
invalid_columns = [_column for _column in groupby_column_list if
|
|
1330
|
-
and _column in columns]
|
|
1324
|
+
invalid_columns = [_column for _column in groupby_column_list if _column in columns]
|
|
1331
1325
|
if len(invalid_columns) > 0:
|
|
1332
1326
|
all_columns = [col.name for col in metaexpr.c]
|
|
1333
1327
|
valid_columns = [item for item in all_columns if item not in groupby_column_list]
|
|
@@ -1849,7 +1843,10 @@ class DataFrameUtils():
|
|
|
1849
1843
|
db_schema = UtilFuncs._extract_db_name(tab_name_first)
|
|
1850
1844
|
db_table_name = UtilFuncs._extract_table_name(tab_name_first)
|
|
1851
1845
|
|
|
1852
|
-
|
|
1846
|
+
if db_schema:
|
|
1847
|
+
return DataFrame(in_schema(db_schema, db_table_name))
|
|
1848
|
+
|
|
1849
|
+
return DataFrame(db_table_name)
|
|
1853
1850
|
|
|
1854
1851
|
pids_first = None
|
|
1855
1852
|
parent_df = None
|
|
@@ -1865,11 +1862,146 @@ class DataFrameUtils():
|
|
|
1865
1862
|
db_schema = UtilFuncs._extract_db_name(tab_name_first)
|
|
1866
1863
|
db_table_name = UtilFuncs._extract_table_name(tab_name_first)
|
|
1867
1864
|
|
|
1868
|
-
|
|
1865
|
+
if db_schema:
|
|
1866
|
+
parent_df = DataFrame(in_schema(db_schema, db_table_name))
|
|
1867
|
+
else:
|
|
1868
|
+
parent_df = DataFrame(db_table_name)
|
|
1869
1869
|
pids_first = pids
|
|
1870
1870
|
else:
|
|
1871
1871
|
if pids_first != pids:
|
|
1872
1872
|
raise TeradataMlException(Messages.get_message(MessageCodes.DFS_NO_COMMON_PARENT),
|
|
1873
1873
|
MessageCodes.DFS_NO_COMMON_PARENT)
|
|
1874
1874
|
|
|
1875
|
-
return parent_df
|
|
1875
|
+
return parent_df
|
|
1876
|
+
|
|
1877
|
+
@staticmethod
def _get_sqlalchemy_type_from_str(td_type):
    """
    DESCRIPTION:
        Function to get teradatasqlalchemy type from string representation
        of that type.

    PARAMETERS:
        td_type:
            Required Argument.
            Specifies string representation of teradatasqlalchemy type.
            Types: str

    RETURNS:
        Instance of the teradatasqlalchemy type registered for the type name
        in _DtypesMappers.DATALAKE_STR_to_TDSQLALCHEMY_DATATYPE_MAPPER.

    RAISES:
        ValueError - If "td_type" does not start with a type name made of
                     upper case letters, digits or underscores, or if the
                     type name is not present in the datalake type mapper.

    EXAMPLES:
        >>> dt = DataFrameUtils._get_sqlalchemy_type_from_str("DECIMAL(4,4)")
        >>> dt
        DECIMAL(precision=4, scale=4)
        >>> type(dt)
        teradatasqlalchemy.types.DECIMAL

        >>> dt = DataFrameUtils._get_sqlalchemy_type_from_str("VARCHAR(32000) CHARACTER SET UNICODE")
        >>> dt
        VARCHAR(length=32000, charset='UNICODE')
        >>> type(dt)
        teradatasqlalchemy.types.VARCHAR
    """
    type_mapper = _DtypesMappers.DATALAKE_STR_to_TDSQLALCHEMY_DATATYPE_MAPPER

    # 4 groups of pattern:
    #   1. Type name
    #   2. Comma separated parameters enclosed in parentheses
    #   3. Comma separated parameters without parenthesis
    #   4. Remaining string
    # Raw string so that "\(" reaches the regex engine instead of being
    # treated as an (invalid) string escape sequence.
    pattern = r"([A-Z0-9_]+)(\((.*)\))?(.*)"

    m = re.match(pattern, td_type)

    # Validate the match BEFORE reading its groups; otherwise an unmatched
    # string fails with AttributeError instead of the documented ValueError.
    if m is None or m.group(1) not in type_mapper:
        raise ValueError("Invalid Teradata type: {} from datalake".format(td_type))

    td_str_type = m.group(1)
    td_str_params = m.group(3)
    td_str_remain = m.group(4)

    # Group 3 is None when the type string carries no parenthesised parameters.
    params = td_str_params.split(",") if td_str_params is not None else []

    # Only the attributes actually present in the string are passed on, so a
    # missing length/charset/precision/scale falls back to the constructor
    # default of the target type instead of crashing here.
    type_kwargs = {}

    if td_str_type in ("VARCHAR", "CHAR"):
        # If VARCHAR or CHAR, extract length and charset from string.
        if params:
            type_kwargs["length"] = int(params[0])
        # Remainder is expected to look like "CHARACTER SET <charset>";
        # the charset is the third whitespace separated token.
        remain_tokens = td_str_remain.split()
        if len(remain_tokens) > 2:
            type_kwargs["charset"] = remain_tokens[2]

    elif td_str_type == "BLOB":
        # Ignoring the charset as BLOB does not have it.
        # If BLOB, extract length from string.
        if params:
            type_kwargs["length"] = int(params[0])

    elif td_str_type == "DECIMAL":
        # If DECIMAL, extract precision and scale from string.
        if params:
            type_kwargs["precision"] = int(params[0])
        if len(params) > 1:
            type_kwargs["scale"] = int(params[1])

    # TODO: Test for other data types once OTF team finalize all data types.
    return type_mapper[td_str_type](**type_kwargs)
|
|
1941
|
+
|
|
1942
|
+
@staticmethod
def _get_datalake_table_columns_info(schema, table_name, datalake):
    """
    DESCRIPTION:
        Function to fetch the column names of a datalake table along with
        the teradatasqlalchemy type of every column. The information is
        derived from the output of the
        'help table <datalake>.<db_name>.<table_name>' SQL query.

    PARAMETERS:
        schema:
            Required Argument.
            Specifies name of schema.
            Types: str

        table_name:
            Required Argument.
            Specifies name of table.
            Types: str

        datalake:
            Required Argument.
            Specifies name of datalake.
            Types: str

    RETURNS:
        tuple of two lists - column names and the corresponding
        teradatasqlalchemy types.

    RAISES:
        TeradataMlException - If the help table output has no
                              'TeradataType' column.

    EXAMPLES:
        >>> DataFrameUtils._get_datalake_table_columns_info(table_name = 'sales',
        ...                                                 schema='otftestdb',
        ...                                                 datalake='datalake_iceberg_glue')
        (['id', 'masters', 'gpa', 'stats', 'programming', 'admitted'],
         [INTEGER(),
          VARCHAR(length=2000, charset='UNICODE'),
          FLOAT(),
          VARCHAR(length=2000, charset='UNICODE'),
          VARCHAR(length=2000, charset='UNICODE'),
          INTEGER()])
    """
    # Build the quoted, fully qualified three part name of the table.
    quoter = preparer(td_dialect())
    bundle = SQLBundle()
    name_parts = (quoter.quote(datalake), quoter.quote(schema), quoter.quote(table_name))
    qualified_name = '{}.{}.{}'.format(*name_parts)
    help_query = bundle._get_sql_query(SQLConstants.SQL_HELP_TABLE).format(qualified_name)

    cursor = execute_sql(help_query)

    # Help Table returns column names and corresponding IcebergType,
    # TeradataInternalType, TeradataType. Locate the position of the
    # 'TeradataType' column; when several match, the last one wins.
    matching_positions = [position
                          for position, metadata in enumerate(cursor.description)
                          if metadata[0].lower() == 'teradatatype']

    # Without a 'TeradataType' column the types can not be derived.
    if not matching_positions:
        raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_CREATE_FAIL),
                                  MessageCodes.TDMLDF_CREATE_FAIL)
    type_position = matching_positions[-1]

    names = []
    sqlalchemy_types = []
    for row in cursor.fetchall():
        # First field of every row is the column name.
        names.append(row[0])
        sqlalchemy_types.append(DataFrameUtils._get_sqlalchemy_type_from_str(row[type_position]))
    return names, sqlalchemy_types
|