teradataml 20.0.0.3__py3-none-any.whl → 20.0.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of teradataml might be problematic. Click here for more details.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/README.md +193 -1
- teradataml/__init__.py +2 -1
- teradataml/_version.py +2 -2
- teradataml/analytics/analytic_function_executor.py +25 -18
- teradataml/analytics/byom/__init__.py +1 -1
- teradataml/analytics/json_parser/analytic_functions_argument.py +4 -0
- teradataml/analytics/sqle/__init__.py +20 -2
- teradataml/analytics/utils.py +15 -1
- teradataml/analytics/valib.py +18 -4
- teradataml/automl/__init__.py +341 -112
- teradataml/automl/autodataprep/__init__.py +471 -0
- teradataml/automl/data_preparation.py +84 -42
- teradataml/automl/data_transformation.py +69 -33
- teradataml/automl/feature_engineering.py +76 -9
- teradataml/automl/feature_exploration.py +639 -25
- teradataml/automl/model_training.py +35 -14
- teradataml/clients/auth_client.py +2 -2
- teradataml/common/__init__.py +1 -2
- teradataml/common/constants.py +122 -63
- teradataml/common/messagecodes.py +14 -3
- teradataml/common/messages.py +8 -4
- teradataml/common/sqlbundle.py +40 -10
- teradataml/common/utils.py +366 -74
- teradataml/common/warnings.py +11 -0
- teradataml/context/context.py +348 -86
- teradataml/data/amazon_reviews_25.csv +26 -0
- teradataml/data/apriori_example.json +22 -0
- teradataml/data/byom_example.json +11 -0
- teradataml/data/docs/byom/docs/DataRobotPredict.py +2 -2
- teradataml/data/docs/byom/docs/DataikuPredict.py +40 -1
- teradataml/data/docs/byom/docs/H2OPredict.py +2 -2
- teradataml/data/docs/byom/docs/ONNXEmbeddings.py +242 -0
- teradataml/data/docs/byom/docs/ONNXPredict.py +2 -2
- teradataml/data/docs/byom/docs/PMMLPredict.py +2 -2
- teradataml/data/docs/sqle/docs_17_20/Apriori.py +138 -0
- teradataml/data/docs/sqle/docs_17_20/NERExtractor.py +121 -0
- teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +3 -3
- teradataml/data/docs/sqle/docs_17_20/SMOTE.py +212 -0
- teradataml/data/docs/sqle/docs_17_20/Shap.py +28 -6
- teradataml/data/docs/sqle/docs_17_20/TextMorph.py +119 -0
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +54 -3
- teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/DFFT.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFT2.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFTConv.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +4 -1
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +6 -6
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/MultivarRegr.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/PACF.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/PowerTransform.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/Resample.py +5 -5
- teradataml/data/docs/uaf/docs_17_20/SAX.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/SimpleExp.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Smoothma.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/UNDIFF.py +1 -1
- teradataml/data/hnsw_alter_data.csv +5 -0
- teradataml/data/hnsw_data.csv +10 -0
- teradataml/data/jsons/byom/h2opredict.json +1 -1
- teradataml/data/jsons/byom/onnxembeddings.json +266 -0
- teradataml/data/jsons/sqle/17.20/NGramSplitter.json +6 -6
- teradataml/data/jsons/sqle/17.20/TD_Apriori.json +181 -0
- teradataml/data/jsons/sqle/17.20/TD_NERExtractor.json +145 -0
- teradataml/data/jsons/sqle/17.20/TD_SMOTE.json +267 -0
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +0 -1
- teradataml/data/jsons/sqle/17.20/TD_TextMorph.json +134 -0
- teradataml/data/jsons/sqle/17.20/TD_TextParser.json +114 -9
- teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +420 -0
- teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +343 -0
- teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +359 -0
- teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +360 -0
- teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +343 -0
- teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +343 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSW.json +296 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWPredict.json +206 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWSummary.json +32 -0
- teradataml/data/jsons/sqle/20.00/TD_KMeans.json +2 -2
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +3 -3
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +6 -6
- teradataml/data/ner_dict.csv +8 -0
- teradataml/data/ner_input_eng.csv +7 -0
- teradataml/data/ner_rule.csv +5 -0
- teradataml/data/pos_input.csv +40 -0
- teradataml/data/tdnerextractor_example.json +14 -0
- teradataml/data/teradataml_example.json +21 -0
- teradataml/data/textmorph_example.json +5 -0
- teradataml/data/to_num_data.csv +4 -0
- teradataml/data/tochar_data.csv +5 -0
- teradataml/data/trans_dense.csv +16 -0
- teradataml/data/trans_sparse.csv +55 -0
- teradataml/data/vectordistance_example.json +1 -1
- teradataml/dataframe/copy_to.py +45 -29
- teradataml/dataframe/data_transfer.py +72 -46
- teradataml/dataframe/dataframe.py +642 -166
- teradataml/dataframe/dataframe_utils.py +167 -22
- teradataml/dataframe/functions.py +135 -20
- teradataml/dataframe/setop.py +11 -6
- teradataml/dataframe/sql.py +330 -78
- teradataml/dbutils/dbutils.py +556 -140
- teradataml/dbutils/filemgr.py +14 -10
- teradataml/hyperparameter_tuner/optimizer.py +12 -1
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/{sklearn/_sklearn_wrapper.py → _base.py} +168 -1013
- teradataml/opensource/_class.py +141 -17
- teradataml/opensource/{constants.py → _constants.py} +7 -3
- teradataml/opensource/_lightgbm.py +52 -53
- teradataml/opensource/_sklearn.py +1008 -0
- teradataml/opensource/_wrapper_utils.py +5 -5
- teradataml/options/__init__.py +47 -15
- teradataml/options/configure.py +103 -26
- teradataml/options/display.py +13 -2
- teradataml/plot/axis.py +47 -8
- teradataml/plot/figure.py +33 -0
- teradataml/plot/plot.py +63 -13
- teradataml/scriptmgmt/UserEnv.py +307 -40
- teradataml/scriptmgmt/lls_utils.py +428 -145
- teradataml/store/__init__.py +2 -3
- teradataml/store/feature_store/feature_store.py +102 -7
- teradataml/table_operators/Apply.py +48 -19
- teradataml/table_operators/Script.py +23 -2
- teradataml/table_operators/TableOperator.py +3 -1
- teradataml/table_operators/table_operator_util.py +58 -9
- teradataml/utils/dtypes.py +49 -1
- teradataml/utils/internal_buffer.py +38 -0
- teradataml/utils/validators.py +377 -62
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/METADATA +200 -4
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/RECORD +146 -112
- teradataml/data/SQL_Fundamentals.pdf +0 -0
- teradataml/libaed_0_1.dylib +0 -0
- teradataml/libaed_0_1.so +0 -0
- teradataml/opensource/sklearn/__init__.py +0 -0
- teradataml/store/vector_store/__init__.py +0 -1586
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/zip-safe +0 -0
|
@@ -12,13 +12,14 @@ This file implements util functions of data frame.
|
|
|
12
12
|
"""
|
|
13
13
|
|
|
14
14
|
import numbers
|
|
15
|
+
import re
|
|
15
16
|
import pandas as pd
|
|
16
17
|
from collections import OrderedDict
|
|
17
18
|
|
|
18
19
|
from teradataml.common.utils import UtilFuncs
|
|
19
20
|
from teradataml.common.aed_utils import AedUtils
|
|
20
21
|
from teradataml.common.constants import AEDConstants, PTITableConstants, \
|
|
21
|
-
SQLPattern, PythonTypes
|
|
22
|
+
SQLPattern, PythonTypes, TeradataConstants, SQLConstants
|
|
22
23
|
from teradataml.common.sqlbundle import SQLBundle
|
|
23
24
|
from teradataml.common.exceptions import TeradataMlException
|
|
24
25
|
from teradataml.common.messages import Messages
|
|
@@ -30,6 +31,7 @@ from teradataml.dbutils.dbutils import _execute_query_and_generate_pandas_df
|
|
|
30
31
|
|
|
31
32
|
from teradataml.options.display import display
|
|
32
33
|
from teradataml.options.configure import configure
|
|
34
|
+
from teradataml.utils.dtypes import _DtypesMappers
|
|
33
35
|
from teradataml.utils.utils import execute_sql
|
|
34
36
|
|
|
35
37
|
from teradatasqlalchemy.types import FLOAT, NUMBER, DECIMAL, PERIOD_TIMESTAMP
|
|
@@ -77,7 +79,10 @@ class DataFrameUtils():
|
|
|
77
79
|
is_persist = True
|
|
78
80
|
|
|
79
81
|
try:
|
|
80
|
-
if
|
|
82
|
+
if configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE:
|
|
83
|
+
UtilFuncs._create_table(view_names[index], queries[index], volatile=True)
|
|
84
|
+
|
|
85
|
+
elif node_query_types[index] == AEDConstants.AED_QUERY_NODE_TYPE_ML_QUERY_MULTI_OUTPUT.value or\
|
|
81
86
|
("OUT TABLE " in queries[index] and SQLPattern.SQLMR.value.match(queries[index])) or \
|
|
82
87
|
is_persist:
|
|
83
88
|
# TODO:: OR condition in above needs to be removed once AED support is added.
|
|
@@ -647,7 +652,7 @@ class DataFrameUtils():
|
|
|
647
652
|
all_operations = list(set(all_operations))
|
|
648
653
|
invalid_aggregates = []
|
|
649
654
|
for operation in all_operations:
|
|
650
|
-
if operation not in valid_aggregate_operations \
|
|
655
|
+
if operation not in valid_aggregate_operations and not operation.startswith('percentile_') \
|
|
651
656
|
and operation not in UtilFuncs._get_valid_time_series_aggregate_operations():
|
|
652
657
|
invalid_aggregates.append(operation)
|
|
653
658
|
if len(invalid_aggregates) > 0: # If any of the aggregate operations specified is not valid
|
|
@@ -730,7 +735,20 @@ class DataFrameUtils():
|
|
|
730
735
|
quoted_columns = UtilFuncs._process_for_teradata_keyword(kwargs[key_to_process])
|
|
731
736
|
kwargs[key_to_process] = quoted_columns
|
|
732
737
|
|
|
733
|
-
|
|
738
|
+
if operation.startswith('percentile_'):
|
|
739
|
+
try:
|
|
740
|
+
_operation_value = operation.split('_')
|
|
741
|
+
_floatvalue = float(_operation_value[1])
|
|
742
|
+
if _floatvalue < 0.0 or _floatvalue > 1.0 or len(_operation_value)>2:
|
|
743
|
+
raise ValueError
|
|
744
|
+
except ValueError:
|
|
745
|
+
mssg = "Invalid aggregate operation '{}' requested on TeradataML DataFrame." \
|
|
746
|
+
" Valid operation should be in format 'percentile_<floatvalue>' and <floatvalue> " \
|
|
747
|
+
"should be in range [0.0, 1.0].".format(operation)
|
|
748
|
+
raise ValueError(mssg) from None
|
|
749
|
+
func_expression = getattr(df[column], 'percentile')(percentile=_floatvalue)
|
|
750
|
+
else:
|
|
751
|
+
func_expression = getattr(df[column], operation)(describe_op=describe_op, **kwargs)
|
|
734
752
|
new_column_name = column if describe_op else "{1}_{0}".format(column, operation)
|
|
735
753
|
# column_supported, new_column_name, new_column_type, column_aggr_expr, invalid_column_str
|
|
736
754
|
return True, new_column_name, NUMBER() if describe_op else func_expression.type, \
|
|
@@ -1291,43 +1309,32 @@ class DataFrameUtils():
|
|
|
1291
1309
|
|
|
1292
1310
|
aggregate_expr = ", ".join(select_columns)
|
|
1293
1311
|
return aggregate_expr, new_column_names, new_column_types
|
|
1294
|
-
|
|
1312
|
+
|
|
1295
1313
|
@staticmethod
|
|
1296
|
-
def
|
|
1314
|
+
def _validate_describe_columns(columns, metaexpr, groupby_column_list):
|
|
1297
1315
|
"""
|
|
1298
|
-
Internal function to validate columns provided to describe()
|
|
1316
|
+
Internal function to validate columns provided to describe() are correct or not,
|
|
1299
1317
|
when DataFrame is output of groupby and groupby_time.
|
|
1300
|
-
|
|
1301
1318
|
PARAMETERS:
|
|
1302
|
-
df:
|
|
1303
|
-
Required Argument.
|
|
1304
|
-
Specifies teradataml DataFrame we are collecting statistics for.
|
|
1305
|
-
Types: str
|
|
1306
|
-
|
|
1307
1319
|
columns:
|
|
1308
1320
|
Optional Argument.
|
|
1309
1321
|
Specifies the name(s) of columns we are collecting statistics for.
|
|
1310
1322
|
Types: str or List of strings (str)
|
|
1311
|
-
|
|
1312
1323
|
metaexpr:
|
|
1313
1324
|
Required Argument.
|
|
1314
1325
|
Specifies the meta expression for the dataframe.
|
|
1315
1326
|
Types: _MetaExpression
|
|
1316
|
-
|
|
1317
1327
|
groupby_column_list:
|
|
1318
1328
|
Optional Argument.
|
|
1319
1329
|
Specifies the group by columns for the dataframe.
|
|
1320
1330
|
Default Values: None.
|
|
1321
1331
|
Types: str or List of strings (str)
|
|
1322
|
-
|
|
1323
1332
|
Returns:
|
|
1324
1333
|
None
|
|
1325
|
-
|
|
1326
1334
|
Raises:
|
|
1327
1335
|
TeradataMLException
|
|
1328
1336
|
"""
|
|
1329
|
-
invalid_columns = [_column for _column in groupby_column_list if
|
|
1330
|
-
and _column in columns]
|
|
1337
|
+
invalid_columns = [_column for _column in groupby_column_list if _column in columns]
|
|
1331
1338
|
if len(invalid_columns) > 0:
|
|
1332
1339
|
all_columns = [col.name for col in metaexpr.c]
|
|
1333
1340
|
valid_columns = [item for item in all_columns if item not in groupby_column_list]
|
|
@@ -1849,7 +1856,10 @@ class DataFrameUtils():
|
|
|
1849
1856
|
db_schema = UtilFuncs._extract_db_name(tab_name_first)
|
|
1850
1857
|
db_table_name = UtilFuncs._extract_table_name(tab_name_first)
|
|
1851
1858
|
|
|
1852
|
-
|
|
1859
|
+
if db_schema:
|
|
1860
|
+
return DataFrame(in_schema(db_schema, db_table_name))
|
|
1861
|
+
|
|
1862
|
+
return DataFrame(db_table_name)
|
|
1853
1863
|
|
|
1854
1864
|
pids_first = None
|
|
1855
1865
|
parent_df = None
|
|
@@ -1865,11 +1875,146 @@ class DataFrameUtils():
|
|
|
1865
1875
|
db_schema = UtilFuncs._extract_db_name(tab_name_first)
|
|
1866
1876
|
db_table_name = UtilFuncs._extract_table_name(tab_name_first)
|
|
1867
1877
|
|
|
1868
|
-
|
|
1878
|
+
if db_schema:
|
|
1879
|
+
parent_df = DataFrame(in_schema(db_schema, db_table_name))
|
|
1880
|
+
else:
|
|
1881
|
+
parent_df = DataFrame(db_table_name)
|
|
1869
1882
|
pids_first = pids
|
|
1870
1883
|
else:
|
|
1871
1884
|
if pids_first != pids:
|
|
1872
1885
|
raise TeradataMlException(Messages.get_message(MessageCodes.DFS_NO_COMMON_PARENT),
|
|
1873
1886
|
MessageCodes.DFS_NO_COMMON_PARENT)
|
|
1874
1887
|
|
|
1875
|
-
return parent_df
|
|
1888
|
+
return parent_df
|
|
1889
|
+
|
|
1890
|
+
@staticmethod
|
|
1891
|
+
def _get_sqlalchemy_type_from_str(td_type):
|
|
1892
|
+
"""
|
|
1893
|
+
Function to get teradatasqlalchemy type from string representation of that type.
|
|
1894
|
+
|
|
1895
|
+
PARAMETERS:
|
|
1896
|
+
td_type:
|
|
1897
|
+
Required Argument.
|
|
1898
|
+
Specifies string representation of teradatasqlalchemy type.
|
|
1899
|
+
Types: str
|
|
1900
|
+
|
|
1901
|
+
RAISES:
|
|
1902
|
+
ValueError
|
|
1903
|
+
|
|
1904
|
+
EXAMPLES:
|
|
1905
|
+
>>> dt = DataFrameUtils._get_sqlalchemy_type_from_str("DECIMAL(4,4)")
|
|
1906
|
+
>>> dt
|
|
1907
|
+
DECIMAL(precision=4, scale=4)
|
|
1908
|
+
>>> type(dt)
|
|
1909
|
+
teradatasqlalchemy.types.DECIMAL
|
|
1910
|
+
|
|
1911
|
+
>>> dt = DataFrameUtils._get_sqlalchemy_type_from_str("VARCHAR(32000) CHARACTER SET UNICODE")
|
|
1912
|
+
>>> dt
|
|
1913
|
+
VARCHAR(length=32000, charset='UNICODE')
|
|
1914
|
+
>>> type(dt)
|
|
1915
|
+
teradatasqlalchemy.types.VARCHAR
|
|
1916
|
+
"""
|
|
1917
|
+
# 4 groups of pattern:
|
|
1918
|
+
# 1. Type name
|
|
1919
|
+
# 2. Comma separated parameters enclosed in parentheses
|
|
1920
|
+
# 3. Comma separated parameters without parenthesis
|
|
1921
|
+
# 4. Remaining string
|
|
1922
|
+
pattern = "([A-Z0-9_]+)(\((.*)\))?(.*)"
|
|
1923
|
+
|
|
1924
|
+
m = re.match(pattern, td_type)
|
|
1925
|
+
td_str_type = m.group(1)
|
|
1926
|
+
td_str_params = m.group(3)
|
|
1927
|
+
td_str_remain = m.group(4)
|
|
1928
|
+
|
|
1929
|
+
if m is None or td_str_type not in _DtypesMappers.DATALAKE_STR_to_TDSQLALCHEMY_DATATYPE_MAPPER.keys():
|
|
1930
|
+
raise ValueError("Invalid Teradata type: {} from datalake".format(td_type))
|
|
1931
|
+
|
|
1932
|
+
if td_str_type in ["VARCHAR", "CHAR"]:
|
|
1933
|
+
# If VARCHAR or CHAR, extract, length and charset from string.
|
|
1934
|
+
length = int(td_str_params.split(",")[0])
|
|
1935
|
+
charset = td_str_remain.strip().split(" ")[2]
|
|
1936
|
+
return _DtypesMappers.DATALAKE_STR_to_TDSQLALCHEMY_DATATYPE_MAPPER[td_str_type]\
|
|
1937
|
+
(length=length, charset=charset)
|
|
1938
|
+
|
|
1939
|
+
if td_str_type in ["BLOB"]:
|
|
1940
|
+
# Ignoring the charset as BLOB does not have it.
|
|
1941
|
+
# If BLOB, extract length from string.
|
|
1942
|
+
length = int(td_str_params.split(",")[0])
|
|
1943
|
+
return _DtypesMappers.DATALAKE_STR_to_TDSQLALCHEMY_DATATYPE_MAPPER[td_str_type]\
|
|
1944
|
+
(length=length)
|
|
1945
|
+
|
|
1946
|
+
if td_str_type in ["DECIMAL"]:
|
|
1947
|
+
# If DECIMAL, extract precision and scale from string.
|
|
1948
|
+
args = td_str_params.split(",")
|
|
1949
|
+
return _DtypesMappers.DATALAKE_STR_to_TDSQLALCHEMY_DATATYPE_MAPPER[td_str_type]\
|
|
1950
|
+
(precision=int(args[0]), scale=int(args[1]))
|
|
1951
|
+
|
|
1952
|
+
# TODO: Test for other data types once OTF team finalize all data types.
|
|
1953
|
+
return _DtypesMappers.DATALAKE_STR_to_TDSQLALCHEMY_DATATYPE_MAPPER[td_str_type]()
|
|
1954
|
+
|
|
1955
|
+
@staticmethod
|
|
1956
|
+
def _get_datalake_table_columns_info(schema, table_name, datalake):
|
|
1957
|
+
"""
|
|
1958
|
+
Function to get column names and corresponding teradatasqlalchemy types
|
|
1959
|
+
of a datalake table using results of 'help table <datalake>.<db_name>.<table_name>'
|
|
1960
|
+
SQL query.
|
|
1961
|
+
|
|
1962
|
+
PARAMETERS:
|
|
1963
|
+
schema:
|
|
1964
|
+
Required Argument.
|
|
1965
|
+
Specifies name of schema.
|
|
1966
|
+
Types: str
|
|
1967
|
+
|
|
1968
|
+
table_name:
|
|
1969
|
+
Required Argument.
|
|
1970
|
+
Specifies name of table.
|
|
1971
|
+
Types: str
|
|
1972
|
+
|
|
1973
|
+
datalake:
|
|
1974
|
+
Required Argument.
|
|
1975
|
+
Specifies name of datalake.
|
|
1976
|
+
Types: str
|
|
1977
|
+
|
|
1978
|
+
RAISES:
|
|
1979
|
+
TeradataMlException
|
|
1980
|
+
|
|
1981
|
+
EXAMPLES:
|
|
1982
|
+
>>> DataFrameUtils._get_datalake_table_columns_info(table_name = 'sales',
|
|
1983
|
+
... schema='otftestdb',
|
|
1984
|
+
... datalake='datalake_iceberg_glue')
|
|
1985
|
+
(['id', 'masters', 'gpa', 'stats', 'programming', 'admitted'],
|
|
1986
|
+
[INTEGER(),
|
|
1987
|
+
VARCHAR(length=2000, charset='UNICODE'),
|
|
1988
|
+
FLOAT(),
|
|
1989
|
+
VARCHAR(length=2000, charset='UNICODE'),
|
|
1990
|
+
VARCHAR(length=2000, charset='UNICODE'),
|
|
1991
|
+
INTEGER()])
|
|
1992
|
+
"""
|
|
1993
|
+
# Get the column information from the strings type.
|
|
1994
|
+
prepared = preparer(td_dialect())
|
|
1995
|
+
sqlbundle = SQLBundle()
|
|
1996
|
+
full_tbl_name = '{}.{}.{}'.format(prepared.quote(datalake),
|
|
1997
|
+
prepared.quote(schema),
|
|
1998
|
+
prepared.quote(table_name))
|
|
1999
|
+
help_table_sql = sqlbundle._get_sql_query(SQLConstants.SQL_HELP_TABLE).format(full_tbl_name)
|
|
2000
|
+
|
|
2001
|
+
cur = execute_sql(help_table_sql)
|
|
2002
|
+
td_types_col_index = -1
|
|
2003
|
+
for i, col_metadata in enumerate(cur.description):
|
|
2004
|
+
# Help Table returns column names and
|
|
2005
|
+
# corresponding IcebergType, TeradataInternalType,
|
|
2006
|
+
# TeradataType. We need to extract column index for
|
|
2007
|
+
# 'TeradataType' column.
|
|
2008
|
+
if col_metadata[0].lower() == 'teradatatype':
|
|
2009
|
+
td_types_col_index = i
|
|
2010
|
+
|
|
2011
|
+
col_names = []
|
|
2012
|
+
col_types = []
|
|
2013
|
+
if td_types_col_index > -1:
|
|
2014
|
+
for col_info in cur.fetchall():
|
|
2015
|
+
col_names.append(col_info[0])
|
|
2016
|
+
col_types.append(DataFrameUtils._get_sqlalchemy_type_from_str(col_info[td_types_col_index]))
|
|
2017
|
+
else:
|
|
2018
|
+
raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_CREATE_FAIL),
|
|
2019
|
+
MessageCodes.TDMLDF_CREATE_FAIL)
|
|
2020
|
+
return col_names, col_types
|
|
@@ -1,28 +1,36 @@
|
|
|
1
1
|
import pandas as pd
|
|
2
2
|
from inspect import getsource
|
|
3
3
|
import re
|
|
4
|
-
from
|
|
4
|
+
from teradataml.dataframe.copy_to import copy_to_sql
|
|
5
|
+
from teradataml.dataframe.dataframe import DataFrame
|
|
5
6
|
from teradataml.dbutils.filemgr import install_file, list_files, remove_file
|
|
6
|
-
from teradataml.
|
|
7
|
+
from teradataml.utils.utils import execute_sql
|
|
7
8
|
import teradatasqlalchemy as tdsqlalchemy
|
|
8
9
|
from teradataml.utils.validators import _Validators
|
|
9
10
|
from teradataml.dataframe.sql import _SQLColumnExpression
|
|
10
11
|
from teradatasqlalchemy import VARCHAR, CLOB, CHAR
|
|
11
|
-
from teradataml.common.constants import TeradataTypes
|
|
12
|
+
from teradataml.common.constants import TableOperatorConstants, TeradataConstants, TeradataTypes
|
|
12
13
|
from teradataml.common.utils import UtilFuncs
|
|
13
|
-
from teradataml.utils.dtypes import _Dtypes
|
|
14
14
|
from teradataml.dataframe.sql_interfaces import ColumnExpression
|
|
15
15
|
from teradataml.table_operators.table_operator_util import _TableOperatorUtils
|
|
16
|
-
from teradataml.utils.internal_buffer import _InternalBuffer
|
|
17
16
|
from teradataml.common.exceptions import TeradataMlException
|
|
18
17
|
from teradataml.common.messages import Messages
|
|
19
18
|
from teradataml.common.messagecodes import MessageCodes
|
|
20
19
|
from teradataml.scriptmgmt.lls_utils import get_env
|
|
21
20
|
|
|
22
|
-
def udf(user_function=None, returns=VARCHAR(1024), env_name = None, delimiter=',', quotechar=None):
|
|
21
|
+
def udf(user_function=None, returns=VARCHAR(1024), env_name = None, delimiter=',', quotechar=None, debug=False):
|
|
23
22
|
"""
|
|
24
23
|
DESCRIPTION:
|
|
25
24
|
Creates a user defined function (UDF).
|
|
25
|
+
|
|
26
|
+
Notes:
|
|
27
|
+
1. Date and time data types must be formatted to supported formats.
|
|
28
|
+
(See Prerequisite Input and Output Structures in Open Analytics Framework for more details.)
|
|
29
|
+
2. Packages required to run the user defined function must be installed in remote user
|
|
30
|
+
environment using install_lib method of UserEnv class. Import statements of these
|
|
31
|
+
packages should be inside the user defined function itself.
|
|
32
|
+
3. Do not call a regular function defined outside the udf() from the user defined function.
|
|
33
|
+
The function definition and call must be inside the udf(). Look at Example 9 to understand more.
|
|
26
34
|
|
|
27
35
|
PARAMETERS:
|
|
28
36
|
user_function:
|
|
@@ -31,7 +39,7 @@ def udf(user_function=None, returns=VARCHAR(1024), env_name = None, delimiter=',
|
|
|
31
39
|
teradataml DataFrame.
|
|
32
40
|
Types: function
|
|
33
41
|
Note:
|
|
34
|
-
|
|
42
|
+
Lambda functions are not supported. Re-write the lambda function as regular Python function to use with UDF.
|
|
35
43
|
|
|
36
44
|
returns:
|
|
37
45
|
Optional Argument.
|
|
@@ -76,21 +84,21 @@ def udf(user_function=None, returns=VARCHAR(1024), env_name = None, delimiter=',
|
|
|
76
84
|
* This argument cannot be same as "delimiter" argument.
|
|
77
85
|
* This argument cannot be a newline character.
|
|
78
86
|
|
|
87
|
+
debug:
|
|
88
|
+
Optional Argument.
|
|
89
|
+
Specifies whether to display the script file path generated during function execution or not. This
|
|
90
|
+
argument helps in debugging when there are any failures during function execution. When set
|
|
91
|
+
to True, function displays the path of the script and does not remove the file from local file system.
|
|
92
|
+
Otherwise, file is removed from the local file system.
|
|
93
|
+
Default Value: False
|
|
94
|
+
Types: bool
|
|
95
|
+
|
|
79
96
|
RETURNS:
|
|
80
97
|
ColumnExpression
|
|
81
98
|
|
|
82
99
|
RAISES:
|
|
83
100
|
TeradataMLException
|
|
84
101
|
|
|
85
|
-
NOTES:
|
|
86
|
-
1. While working on date and time data types one must format these to supported formats.
|
|
87
|
-
(See Requisite Input and Output Structures in Open Analytics Framework for more details.)
|
|
88
|
-
2. Required packages to run the user defined function must be installed in remote user
|
|
89
|
-
environment using install_lib function Of UserEnv class. Import statements of these
|
|
90
|
-
packages should be inside the user defined function itself.
|
|
91
|
-
3. One can't call a regular function defined outside the udf from the user defined function.
|
|
92
|
-
The function definition and call must be inside the udf. Look at Example 9 to understand more.
|
|
93
|
-
|
|
94
102
|
EXAMPLES:
|
|
95
103
|
# Load the data to run the example.
|
|
96
104
|
>>> load_example_data("dataframe", "sales")
|
|
@@ -324,14 +332,14 @@ def udf(user_function=None, returns=VARCHAR(1024), env_name = None, delimiter=',
|
|
|
324
332
|
def wrapper(f):
|
|
325
333
|
def func_(*args):
|
|
326
334
|
return _SQLColumnExpression(expression=None, udf=f, udf_type=returns, udf_args=args,\
|
|
327
|
-
env_name=env_name, delimiter=delimiter, quotechar=quotechar)
|
|
335
|
+
env_name=env_name, delimiter=delimiter, quotechar=quotechar, debug=debug)
|
|
328
336
|
return func_
|
|
329
337
|
return wrapper
|
|
330
338
|
# Notation: @udf
|
|
331
339
|
else:
|
|
332
340
|
def func_(*args):
|
|
333
341
|
return _SQLColumnExpression(expression=None, udf=user_function, udf_type=returns, udf_args=args,\
|
|
334
|
-
env_name=env_name, delimiter=delimiter, quotechar=quotechar)
|
|
342
|
+
env_name=env_name, delimiter=delimiter, quotechar=quotechar, debug=debug)
|
|
335
343
|
return func_
|
|
336
344
|
|
|
337
345
|
|
|
@@ -340,6 +348,12 @@ def register(name, user_function, returns=VARCHAR(1024)):
|
|
|
340
348
|
DESCRIPTION:
|
|
341
349
|
Registers a user defined function (UDF).
|
|
342
350
|
|
|
351
|
+
Notes:
|
|
352
|
+
1. Date and time data types must be formatted to supported formats.
|
|
353
|
+
(See Prerequisite Input and Output Structures in Open Analytics Framework for more details.)
|
|
354
|
+
2. On VantageCloud Lake, user defined function is registered by default in the 'openml_env' environment.
|
|
355
|
+
User can register it in their own user environment, using the 'openml_user_env' configuration option.
|
|
356
|
+
|
|
343
357
|
PARAMETERS:
|
|
344
358
|
name:
|
|
345
359
|
Required Argument.
|
|
@@ -351,6 +365,8 @@ def register(name, user_function, returns=VARCHAR(1024)):
|
|
|
351
365
|
Specifies the user defined function to create a column for
|
|
352
366
|
teradataml DataFrame.
|
|
353
367
|
Types: function, udf
|
|
368
|
+
Note:
|
|
369
|
+
Lambda functions are not supported. Re-write the lambda function as regular Python function to use with UDF.
|
|
354
370
|
|
|
355
371
|
returns:
|
|
356
372
|
Optional Argument.
|
|
@@ -459,10 +475,17 @@ def call_udf(udf_name, func_args = () , **kwargs):
|
|
|
459
475
|
DESCRIPTION:
|
|
460
476
|
Call a registered user defined function (UDF).
|
|
461
477
|
|
|
478
|
+
Notes:
|
|
479
|
+
1. Packages required to run the registered user defined function must be installed in remote user
|
|
480
|
+
environment using install_lib method of UserEnv class. Import statements of these
|
|
481
|
+
packages should be inside the user defined function itself.
|
|
482
|
+
2. On VantageCloud Lake, user defined function runs by default in the 'openml_env' environment.
|
|
483
|
+
User can use their own user environment, using the 'openml_user_env' configuration option.
|
|
484
|
+
|
|
462
485
|
PARAMETERS:
|
|
463
486
|
udf_name:
|
|
464
487
|
Required Argument.
|
|
465
|
-
Specifies the name of the registered user defined.
|
|
488
|
+
Specifies the name of the registered user defined function.
|
|
466
489
|
Types: str
|
|
467
490
|
|
|
468
491
|
func_args:
|
|
@@ -864,4 +887,96 @@ def _create_return_type(returns):
|
|
|
864
887
|
return_str = str(returns)
|
|
865
888
|
# Replace the space with underscore in the return type.
|
|
866
889
|
return_str = return_str.replace(" ", "_")
|
|
867
|
-
return return_str
|
|
890
|
+
return return_str
|
|
891
|
+
|
|
892
|
+
def td_range(start, end=None, step=1):
|
|
893
|
+
"""
|
|
894
|
+
DESCRIPTION:
|
|
895
|
+
Creates a DataFrame with a specified range of numbers.
|
|
896
|
+
|
|
897
|
+
Notes:
|
|
898
|
+
1. The range is inclusive of the start and exclusive of the end.
|
|
899
|
+
2. If only start is provided, then end is set to start and start is set to 0.
|
|
900
|
+
|
|
901
|
+
PARAMETERS:
|
|
902
|
+
start:
|
|
903
|
+
Required Argument.
|
|
904
|
+
Specifies the starting number of the range.
|
|
905
|
+
Types: int
|
|
906
|
+
|
|
907
|
+
end:
|
|
908
|
+
Optional Argument.
|
|
909
|
+
Specifies the end number of the range(exclusive).
|
|
910
|
+
Default Value: None
|
|
911
|
+
Types: int
|
|
912
|
+
|
|
913
|
+
step:
|
|
914
|
+
Optional Argument.
|
|
915
|
+
Specifies the step size of the range.
|
|
916
|
+
Default Value: 1
|
|
917
|
+
Types: int
|
|
918
|
+
|
|
919
|
+
RETURNS:
|
|
920
|
+
teradataml DataFrame
|
|
921
|
+
|
|
922
|
+
RAISES:
|
|
923
|
+
TeradataMlException
|
|
924
|
+
|
|
925
|
+
EXAMPLES:
|
|
926
|
+
# Example 1: Create a DataFrame with a range of numbers from 0 to 5.
|
|
927
|
+
>>> from teradataml.dataframe.functions import td_range
|
|
928
|
+
>>> df = td_range(5)
|
|
929
|
+
>>> df.sort('id')
|
|
930
|
+
id
|
|
931
|
+
0 0
|
|
932
|
+
1 1
|
|
933
|
+
2 2
|
|
934
|
+
3 3
|
|
935
|
+
4 4
|
|
936
|
+
|
|
937
|
+
# Example 2: Create a DataFrame with a range of numbers from 5 to 1 with step size of -2.
|
|
938
|
+
>>> from teradataml.dataframe.functions import td_range
|
|
939
|
+
>>> td_range(5, 1, -2)
|
|
940
|
+
id
|
|
941
|
+
0 3
|
|
942
|
+
1 5
|
|
943
|
+
|
|
944
|
+
# Example 3: Create a DataFrame with a range of numbers from 1 to 5 with default step size of 1.
|
|
945
|
+
>>> from teradataml.dataframe.functions import td_range
|
|
946
|
+
>>> td_range(1, 5)
|
|
947
|
+
id
|
|
948
|
+
0 3
|
|
949
|
+
1 4
|
|
950
|
+
2 2
|
|
951
|
+
3 1
|
|
952
|
+
|
|
953
|
+
"""
|
|
954
|
+
# Validate the arguments.
|
|
955
|
+
arg_matrix = []
|
|
956
|
+
arg_matrix.append(["start", start, False, int])
|
|
957
|
+
arg_matrix.append(["end", end, True, int])
|
|
958
|
+
arg_matrix.append(["step", step, True, int])
|
|
959
|
+
_Validators._validate_function_arguments(arg_matrix)
|
|
960
|
+
|
|
961
|
+
# If only start is provided, then set end to start and start to 0.
|
|
962
|
+
if end is None:
|
|
963
|
+
end = start
|
|
964
|
+
start = 0
|
|
965
|
+
|
|
966
|
+
# If start is greater than end, then set the operation to "-" and operator to ">".
|
|
967
|
+
# If end is less than start, then set the operation to "+" and operator to "<".
|
|
968
|
+
if end < start:
|
|
969
|
+
operation, operator, step = "-", ">", -step
|
|
970
|
+
else:
|
|
971
|
+
operation, operator = "+", "<"
|
|
972
|
+
|
|
973
|
+
# Create a temporary table with the start value.
|
|
974
|
+
table_name = UtilFuncs._generate_temp_table_name(prefix="tdml_range_df",
|
|
975
|
+
table_type=TeradataConstants.TERADATA_TABLE)
|
|
976
|
+
execute_sql(f"CREATE MULTISET TABLE {table_name} AS (SELECT {start} AS id) WITH DATA;")
|
|
977
|
+
|
|
978
|
+
# Create a DataFrame from the range query.
|
|
979
|
+
range_query = TableOperatorConstants.RANGE_QUERY.value \
|
|
980
|
+
.format(table_name, step, end, operation, operator)
|
|
981
|
+
df = DataFrame.from_query(range_query)
|
|
982
|
+
return df
|
teradataml/dataframe/setop.py
CHANGED
|
@@ -149,7 +149,7 @@ def __check_concat_compatibility(df_list, join, sort, ignore_index):
|
|
|
149
149
|
# Iterate on all DFs to be applied for set operation.
|
|
150
150
|
for df in dfs_to_operate_on:
|
|
151
151
|
# Process each column in the DF of the iteration.
|
|
152
|
-
for c in df._metaexpr.
|
|
152
|
+
for c in df._metaexpr.c:
|
|
153
153
|
col_name = c.name
|
|
154
154
|
# Process the column name if it is not already processed.
|
|
155
155
|
# Processing of set operation is column name based so if the DF in the nth iteration had column 'xyz',
|
|
@@ -193,6 +193,8 @@ def __check_concat_compatibility(df_list, join, sort, ignore_index):
|
|
|
193
193
|
col_dict[col_name]['col_present'] = col_present_in_dfs
|
|
194
194
|
# The type to be used for the column is the one of the first DF it is present in.
|
|
195
195
|
col_dict[col_name]['col_type'] = col_types_in_dfs[0]
|
|
196
|
+
# Column name stored with quotes if required.
|
|
197
|
+
col_dict[col_name]['name'] = c.compile()
|
|
196
198
|
|
|
197
199
|
# If the type of the column in all DFs is not the same, then the operation is not lazy.
|
|
198
200
|
if not all(ctype == col_dict[col_name]['col_type']
|
|
@@ -217,6 +219,8 @@ def __check_concat_compatibility(df_list, join, sort, ignore_index):
|
|
|
217
219
|
col_dict[col_name]['col_present'] = col_present_in_dfs
|
|
218
220
|
# The type to be used for the column is the one of the first DF it is present in.
|
|
219
221
|
col_dict[col_name]['col_type'] = non_none_type_to_add
|
|
222
|
+
# Column name stored with quotes if required.
|
|
223
|
+
col_dict[col_name]['name'] = c.compile()
|
|
220
224
|
|
|
221
225
|
# If the type of the column in all DFs is not the same, then the operation is not lazy.
|
|
222
226
|
if not all(True if ctype is None else ctype == non_none_type_to_add
|
|
@@ -667,15 +671,16 @@ def concat(df_list, join='OUTER', allow_duplicates=True, sort=False, ignore_inde
|
|
|
667
671
|
|
|
668
672
|
# Now create the list of columns for each DataFrame to concatenate
|
|
669
673
|
type_compiler = td_type_compiler(td_dialect)
|
|
674
|
+
|
|
670
675
|
for col_name, value in master_columns_dict.items():
|
|
671
676
|
for i in range(len(col_list)):
|
|
677
|
+
# Quoting is already done for column names if column name starts with number or it is reserved keywords.
|
|
678
|
+
# Here checking again if it is teradata keyword or not for quotes.
|
|
679
|
+
column_name = UtilFuncs._process_for_teradata_keyword(value['name'])
|
|
672
680
|
if not value['col_present'][i]:
|
|
673
|
-
col_list[i].append('CAST(NULL as {}) as {}'.format(type_compiler.process(value['col_type']),
|
|
674
|
-
UtilFuncs._teradata_quote_arg(col_name, "\"",
|
|
675
|
-
False)))
|
|
681
|
+
col_list[i].append('CAST(NULL as {}) as {}'.format(type_compiler.process(value['col_type']), column_name))
|
|
676
682
|
else:
|
|
677
|
-
|
|
678
|
-
col_list[i].append(col_name)
|
|
683
|
+
col_list[i].append(column_name)
|
|
679
684
|
|
|
680
685
|
input_table_columns = []
|
|
681
686
|
for i in range(len(col_list)):
|