teradataml 20.0.0.3__py3-none-any.whl → 20.0.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of teradataml might be problematic.

Files changed (151)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/README.md +193 -1
  3. teradataml/__init__.py +2 -1
  4. teradataml/_version.py +2 -2
  5. teradataml/analytics/analytic_function_executor.py +25 -18
  6. teradataml/analytics/byom/__init__.py +1 -1
  7. teradataml/analytics/json_parser/analytic_functions_argument.py +4 -0
  8. teradataml/analytics/sqle/__init__.py +20 -2
  9. teradataml/analytics/utils.py +15 -1
  10. teradataml/analytics/valib.py +18 -4
  11. teradataml/automl/__init__.py +341 -112
  12. teradataml/automl/autodataprep/__init__.py +471 -0
  13. teradataml/automl/data_preparation.py +84 -42
  14. teradataml/automl/data_transformation.py +69 -33
  15. teradataml/automl/feature_engineering.py +76 -9
  16. teradataml/automl/feature_exploration.py +639 -25
  17. teradataml/automl/model_training.py +35 -14
  18. teradataml/clients/auth_client.py +2 -2
  19. teradataml/common/__init__.py +1 -2
  20. teradataml/common/constants.py +122 -63
  21. teradataml/common/messagecodes.py +14 -3
  22. teradataml/common/messages.py +8 -4
  23. teradataml/common/sqlbundle.py +40 -10
  24. teradataml/common/utils.py +366 -74
  25. teradataml/common/warnings.py +11 -0
  26. teradataml/context/context.py +348 -86
  27. teradataml/data/amazon_reviews_25.csv +26 -0
  28. teradataml/data/apriori_example.json +22 -0
  29. teradataml/data/byom_example.json +11 -0
  30. teradataml/data/docs/byom/docs/DataRobotPredict.py +2 -2
  31. teradataml/data/docs/byom/docs/DataikuPredict.py +40 -1
  32. teradataml/data/docs/byom/docs/H2OPredict.py +2 -2
  33. teradataml/data/docs/byom/docs/ONNXEmbeddings.py +242 -0
  34. teradataml/data/docs/byom/docs/ONNXPredict.py +2 -2
  35. teradataml/data/docs/byom/docs/PMMLPredict.py +2 -2
  36. teradataml/data/docs/sqle/docs_17_20/Apriori.py +138 -0
  37. teradataml/data/docs/sqle/docs_17_20/NERExtractor.py +121 -0
  38. teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +3 -3
  39. teradataml/data/docs/sqle/docs_17_20/SMOTE.py +212 -0
  40. teradataml/data/docs/sqle/docs_17_20/Shap.py +28 -6
  41. teradataml/data/docs/sqle/docs_17_20/TextMorph.py +119 -0
  42. teradataml/data/docs/sqle/docs_17_20/TextParser.py +54 -3
  43. teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -1
  44. teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +2 -2
  45. teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +2 -2
  46. teradataml/data/docs/uaf/docs_17_20/DFFT.py +1 -1
  47. teradataml/data/docs/uaf/docs_17_20/DFFT2.py +1 -1
  48. teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +1 -1
  49. teradataml/data/docs/uaf/docs_17_20/DFFTConv.py +1 -1
  50. teradataml/data/docs/uaf/docs_17_20/DWT2D.py +4 -1
  51. teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +4 -4
  52. teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +2 -2
  53. teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +2 -2
  54. teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +6 -6
  55. teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
  56. teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +1 -1
  57. teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +4 -4
  58. teradataml/data/docs/uaf/docs_17_20/MultivarRegr.py +1 -1
  59. teradataml/data/docs/uaf/docs_17_20/PACF.py +1 -1
  60. teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
  61. teradataml/data/docs/uaf/docs_17_20/PowerTransform.py +3 -3
  62. teradataml/data/docs/uaf/docs_17_20/Resample.py +5 -5
  63. teradataml/data/docs/uaf/docs_17_20/SAX.py +3 -3
  64. teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
  65. teradataml/data/docs/uaf/docs_17_20/SimpleExp.py +1 -1
  66. teradataml/data/docs/uaf/docs_17_20/Smoothma.py +3 -3
  67. teradataml/data/docs/uaf/docs_17_20/UNDIFF.py +1 -1
  68. teradataml/data/hnsw_alter_data.csv +5 -0
  69. teradataml/data/hnsw_data.csv +10 -0
  70. teradataml/data/jsons/byom/h2opredict.json +1 -1
  71. teradataml/data/jsons/byom/onnxembeddings.json +266 -0
  72. teradataml/data/jsons/sqle/17.20/NGramSplitter.json +6 -6
  73. teradataml/data/jsons/sqle/17.20/TD_Apriori.json +181 -0
  74. teradataml/data/jsons/sqle/17.20/TD_NERExtractor.json +145 -0
  75. teradataml/data/jsons/sqle/17.20/TD_SMOTE.json +267 -0
  76. teradataml/data/jsons/sqle/17.20/TD_Shap.json +0 -1
  77. teradataml/data/jsons/sqle/17.20/TD_TextMorph.json +134 -0
  78. teradataml/data/jsons/sqle/17.20/TD_TextParser.json +114 -9
  79. teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +328 -0
  80. teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +420 -0
  81. teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +343 -0
  82. teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +328 -0
  83. teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +328 -0
  84. teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +328 -0
  85. teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +328 -0
  86. teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +359 -0
  87. teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +360 -0
  88. teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +343 -0
  89. teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +343 -0
  90. teradataml/data/jsons/sqle/20.00/TD_HNSW.json +296 -0
  91. teradataml/data/jsons/sqle/20.00/TD_HNSWPredict.json +206 -0
  92. teradataml/data/jsons/sqle/20.00/TD_HNSWSummary.json +32 -0
  93. teradataml/data/jsons/sqle/20.00/TD_KMeans.json +2 -2
  94. teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +3 -3
  95. teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +6 -6
  96. teradataml/data/ner_dict.csv +8 -0
  97. teradataml/data/ner_input_eng.csv +7 -0
  98. teradataml/data/ner_rule.csv +5 -0
  99. teradataml/data/pos_input.csv +40 -0
  100. teradataml/data/tdnerextractor_example.json +14 -0
  101. teradataml/data/teradataml_example.json +21 -0
  102. teradataml/data/textmorph_example.json +5 -0
  103. teradataml/data/to_num_data.csv +4 -0
  104. teradataml/data/tochar_data.csv +5 -0
  105. teradataml/data/trans_dense.csv +16 -0
  106. teradataml/data/trans_sparse.csv +55 -0
  107. teradataml/data/vectordistance_example.json +1 -1
  108. teradataml/dataframe/copy_to.py +45 -29
  109. teradataml/dataframe/data_transfer.py +72 -46
  110. teradataml/dataframe/dataframe.py +642 -166
  111. teradataml/dataframe/dataframe_utils.py +167 -22
  112. teradataml/dataframe/functions.py +135 -20
  113. teradataml/dataframe/setop.py +11 -6
  114. teradataml/dataframe/sql.py +330 -78
  115. teradataml/dbutils/dbutils.py +556 -140
  116. teradataml/dbutils/filemgr.py +14 -10
  117. teradataml/hyperparameter_tuner/optimizer.py +12 -1
  118. teradataml/lib/aed_0_1.dll +0 -0
  119. teradataml/opensource/{sklearn/_sklearn_wrapper.py → _base.py} +168 -1013
  120. teradataml/opensource/_class.py +141 -17
  121. teradataml/opensource/{constants.py → _constants.py} +7 -3
  122. teradataml/opensource/_lightgbm.py +52 -53
  123. teradataml/opensource/_sklearn.py +1008 -0
  124. teradataml/opensource/_wrapper_utils.py +5 -5
  125. teradataml/options/__init__.py +47 -15
  126. teradataml/options/configure.py +103 -26
  127. teradataml/options/display.py +13 -2
  128. teradataml/plot/axis.py +47 -8
  129. teradataml/plot/figure.py +33 -0
  130. teradataml/plot/plot.py +63 -13
  131. teradataml/scriptmgmt/UserEnv.py +307 -40
  132. teradataml/scriptmgmt/lls_utils.py +428 -145
  133. teradataml/store/__init__.py +2 -3
  134. teradataml/store/feature_store/feature_store.py +102 -7
  135. teradataml/table_operators/Apply.py +48 -19
  136. teradataml/table_operators/Script.py +23 -2
  137. teradataml/table_operators/TableOperator.py +3 -1
  138. teradataml/table_operators/table_operator_util.py +58 -9
  139. teradataml/utils/dtypes.py +49 -1
  140. teradataml/utils/internal_buffer.py +38 -0
  141. teradataml/utils/validators.py +377 -62
  142. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/METADATA +200 -4
  143. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/RECORD +146 -112
  144. teradataml/data/SQL_Fundamentals.pdf +0 -0
  145. teradataml/libaed_0_1.dylib +0 -0
  146. teradataml/libaed_0_1.so +0 -0
  147. teradataml/opensource/sklearn/__init__.py +0 -0
  148. teradataml/store/vector_store/__init__.py +0 -1586
  149. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/WHEEL +0 -0
  150. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/top_level.txt +0 -0
  151. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/zip-safe +0 -0
@@ -12,13 +12,14 @@ This file implements util functions of data frame.
 """
 
 import numbers
+import re
 import pandas as pd
 from collections import OrderedDict
 
 from teradataml.common.utils import UtilFuncs
 from teradataml.common.aed_utils import AedUtils
 from teradataml.common.constants import AEDConstants, PTITableConstants, \
-    SQLPattern, PythonTypes
+    SQLPattern, PythonTypes, TeradataConstants, SQLConstants
 from teradataml.common.sqlbundle import SQLBundle
 from teradataml.common.exceptions import TeradataMlException
 from teradataml.common.messages import Messages
@@ -30,6 +31,7 @@ from teradataml.dbutils.dbutils import _execute_query_and_generate_pandas_df
 
 from teradataml.options.display import display
 from teradataml.options.configure import configure
+from teradataml.utils.dtypes import _DtypesMappers
 from teradataml.utils.utils import execute_sql
 
 from teradatasqlalchemy.types import FLOAT, NUMBER, DECIMAL, PERIOD_TIMESTAMP
@@ -77,7 +79,10 @@ class DataFrameUtils():
             is_persist = True
 
         try:
-            if node_query_types[index] == AEDConstants.AED_QUERY_NODE_TYPE_ML_QUERY_MULTI_OUTPUT.value or\
+            if configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE:
+                UtilFuncs._create_table(view_names[index], queries[index], volatile=True)
+
+            elif node_query_types[index] == AEDConstants.AED_QUERY_NODE_TYPE_ML_QUERY_MULTI_OUTPUT.value or\
                ("OUT TABLE " in queries[index] and SQLPattern.SQLMR.value.match(queries[index])) or \
                is_persist:
                 # TODO:: OR condition in above needs to be removed once AED support is added.
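
This hunk short-circuits temporary object creation: when configure.temp_object_type selects volatile tables, intermediate results are materialized as session-scoped volatile tables instead of views. A minimal opt-in sketch, assuming the "VT" shorthand added to teradataml/options/configure.py in this release maps to TeradataConstants.TERADATA_VOLATILE_TABLE:

    from teradataml import DataFrame, configure

    # Assumption: "VT" selects volatile tables for teradataml's internal
    # temporary objects; they vanish automatically when the session ends.
    configure.temp_object_type = "VT"

    df = DataFrame("sales")
    # Intermediate objects created while evaluating this expression are
    # now VOLATILE tables rather than views.
    df.groupby("accounts").sum()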
@@ -647,7 +652,7 @@
         all_operations = list(set(all_operations))
         invalid_aggregates = []
         for operation in all_operations:
-            if operation not in valid_aggregate_operations \
+            if operation not in valid_aggregate_operations and not operation.startswith('percentile_') \
                     and operation not in UtilFuncs._get_valid_time_series_aggregate_operations():
                 invalid_aggregates.append(operation)
         if len(invalid_aggregates) > 0:  # If any of the aggregate operations specified is not valid
@@ -730,7 +735,20 @@
             quoted_columns = UtilFuncs._process_for_teradata_keyword(kwargs[key_to_process])
             kwargs[key_to_process] = quoted_columns
 
-        func_expression = getattr(df[column], operation)(describe_op=describe_op, **kwargs)
+        if operation.startswith('percentile_'):
+            try:
+                _operation_value = operation.split('_')
+                _floatvalue = float(_operation_value[1])
+                if _floatvalue < 0.0 or _floatvalue > 1.0 or len(_operation_value) > 2:
+                    raise ValueError
+            except ValueError:
+                mssg = "Invalid aggregate operation '{}' requested on TeradataML DataFrame." \
+                       " Valid operation should be in format 'percentile_<floatvalue>' and <floatvalue> " \
+                       "should be in range [0.0, 1.0].".format(operation)
+                raise ValueError(mssg) from None
+            func_expression = getattr(df[column], 'percentile')(percentile=_floatvalue)
+        else:
+            func_expression = getattr(df[column], operation)(describe_op=describe_op, **kwargs)
         new_column_name = column if describe_op else "{1}_{0}".format(column, operation)
         # column_supported, new_column_name, new_column_type, column_aggr_expr, invalid_column_str
         return True, new_column_name, NUMBER() if describe_op else func_expression.type, \
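
Together with the validation change above, this routes percentile_<fraction> aggregates to the column's percentile method. An illustrative sketch using the 'sales' example table (output omitted):

    >>> from teradataml import DataFrame
    >>> df = DataFrame("sales")

    # Median and 90th percentile of the Feb column; the suffix after
    # 'percentile_' must parse as a float in [0.0, 1.0].
    >>> df.agg({"Feb": ["percentile_0.5", "percentile_0.9"]})

    # A malformed or out-of-range suffix, e.g. 'percentile_1.5' or
    # 'percentile_0.5_x', raises ValueError.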
@@ -1291,43 +1309,32 @@
 
         aggregate_expr = ", ".join(select_columns)
         return aggregate_expr, new_column_names, new_column_types
-
+
     @staticmethod
-    def _invalid_describe_column(df, columns, metaexpr, groupby_column_list):
+    def _validate_describe_columns(columns, metaexpr, groupby_column_list):
         """
-        Internal function to validate columns provided to describe() is correct or not,
+        Internal function to validate columns provided to describe() are correct or not,
         when DataFrame is output of groupby and groupby_time.
-
         PARAMETERS:
-            df:
-                Required Argument.
-                Specifies teradataml DataFrame we are collecting statistics for.
-                Types: str
-
             columns:
                 Optional Argument.
                 Specifies the name(s) of columns we are collecting statistics for.
                 Types: str or List of strings (str)
-
             metaexpr:
                 Required Argument.
                 Specifies the meta expression for the dataframe.
                 Types: _MetaExpression
-
             groupby_column_list:
                 Optional Argument.
                 Specifies the group by columns for the dataframe.
                 Default Values: None.
                 Types: str or List of strings (str)
-
         Returns:
             None
-
         Raises:
             TeradataMLException
         """
-        invalid_columns = [_column for _column in groupby_column_list if columns is not None
-                           and _column in columns]
+        invalid_columns = [_column for _column in groupby_column_list if _column in columns]
         if len(invalid_columns) > 0:
             all_columns = [col.name for col in metaexpr.c]
             valid_columns = [item for item in all_columns if item not in groupby_column_list]
@@ -1849,7 +1856,10 @@
             db_schema = UtilFuncs._extract_db_name(tab_name_first)
             db_table_name = UtilFuncs._extract_table_name(tab_name_first)
 
-            return DataFrame(in_schema(db_schema, db_table_name))
+            if db_schema:
+                return DataFrame(in_schema(db_schema, db_table_name))
+
+            return DataFrame(db_table_name)
 
         pids_first = None
         parent_df = None
@@ -1865,11 +1875,146 @@
             db_schema = UtilFuncs._extract_db_name(tab_name_first)
             db_table_name = UtilFuncs._extract_table_name(tab_name_first)
 
-            parent_df = DataFrame(in_schema(db_schema, db_table_name))
+            if db_schema:
+                parent_df = DataFrame(in_schema(db_schema, db_table_name))
+            else:
+                parent_df = DataFrame(db_table_name)
             pids_first = pids
         else:
             if pids_first != pids:
                 raise TeradataMlException(Messages.get_message(MessageCodes.DFS_NO_COMMON_PARENT),
                                           MessageCodes.DFS_NO_COMMON_PARENT)
 
-        return parent_df
+        return parent_df
+
+    @staticmethod
+    def _get_sqlalchemy_type_from_str(td_type):
+        """
+        Function to get teradatasqlalchemy type from string representation of that type.
+
+        PARAMETERS:
+            td_type:
+                Required Argument.
+                Specifies string representation of teradatasqlalchemy type.
+                Types: str
+
+        RAISES:
+            ValueError
+
+        EXAMPLES:
+            >>> dt = DataFrameUtils._get_sqlalchemy_type_from_str("DECIMAL(4,4)")
+            >>> dt
+            DECIMAL(precision=4, scale=4)
+            >>> type(dt)
+            teradatasqlalchemy.types.DECIMAL
+
+            >>> dt = DataFrameUtils._get_sqlalchemy_type_from_str("VARCHAR(32000) CHARACTER SET UNICODE")
+            >>> dt
+            VARCHAR(length=32000, charset='UNICODE')
+            >>> type(dt)
+            teradatasqlalchemy.types.VARCHAR
+        """
+        # 4 groups of pattern:
+        #   1. Type name
+        #   2. Comma separated parameters enclosed in parentheses
+        #   3. Comma separated parameters without parentheses
+        #   4. Remaining string
+        pattern = r"([A-Z0-9_]+)(\((.*)\))?(.*)"
+
+        m = re.match(pattern, td_type)
+        # Validate the match before dereferencing any groups.
+        if m is None or m.group(1) not in _DtypesMappers.DATALAKE_STR_to_TDSQLALCHEMY_DATATYPE_MAPPER:
+            raise ValueError("Invalid Teradata type: {} from datalake".format(td_type))
+
+        td_str_type = m.group(1)
+        td_str_params = m.group(3)
+        td_str_remain = m.group(4)
+
+        if td_str_type in ["VARCHAR", "CHAR"]:
+            # If VARCHAR or CHAR, extract length and charset from string.
+            length = int(td_str_params.split(",")[0])
+            charset = td_str_remain.strip().split(" ")[2]
+            return _DtypesMappers.DATALAKE_STR_to_TDSQLALCHEMY_DATATYPE_MAPPER[td_str_type]\
+                (length=length, charset=charset)
+
+        if td_str_type in ["BLOB"]:
+            # Ignoring the charset as BLOB does not have it.
+            # If BLOB, extract length from string.
+            length = int(td_str_params.split(",")[0])
+            return _DtypesMappers.DATALAKE_STR_to_TDSQLALCHEMY_DATATYPE_MAPPER[td_str_type]\
+                (length=length)
+
+        if td_str_type in ["DECIMAL"]:
+            # If DECIMAL, extract precision and scale from string.
+            args = td_str_params.split(",")
+            return _DtypesMappers.DATALAKE_STR_to_TDSQLALCHEMY_DATATYPE_MAPPER[td_str_type]\
+                (precision=int(args[0]), scale=int(args[1]))
+
+        # TODO: Test for other data types once OTF team finalizes all data types.
+        return _DtypesMappers.DATALAKE_STR_to_TDSQLALCHEMY_DATATYPE_MAPPER[td_str_type]()
+
+    @staticmethod
+    def _get_datalake_table_columns_info(schema, table_name, datalake):
+        """
+        Function to get column names and corresponding teradatasqlalchemy types
+        of a datalake table using results of the
+        'HELP TABLE <datalake>.<db_name>.<table_name>' SQL query.
+
+        PARAMETERS:
+            schema:
+                Required Argument.
+                Specifies name of schema.
+                Types: str
+
+            table_name:
+                Required Argument.
+                Specifies name of table.
+                Types: str
+
+            datalake:
+                Required Argument.
+                Specifies name of datalake.
+                Types: str
+
+        RAISES:
+            TeradataMlException
+
+        EXAMPLES:
+            >>> DataFrameUtils._get_datalake_table_columns_info(table_name='sales',
+            ...                                                 schema='otftestdb',
+            ...                                                 datalake='datalake_iceberg_glue')
+            (['id', 'masters', 'gpa', 'stats', 'programming', 'admitted'],
+             [INTEGER(),
+              VARCHAR(length=2000, charset='UNICODE'),
+              FLOAT(),
+              VARCHAR(length=2000, charset='UNICODE'),
+              VARCHAR(length=2000, charset='UNICODE'),
+              INTEGER()])
+        """
+        # Get the column information from the string types.
+        prepared = preparer(td_dialect())
+        sqlbundle = SQLBundle()
+        full_tbl_name = '{}.{}.{}'.format(prepared.quote(datalake),
+                                          prepared.quote(schema),
+                                          prepared.quote(table_name))
+        help_table_sql = sqlbundle._get_sql_query(SQLConstants.SQL_HELP_TABLE).format(full_tbl_name)
+
+        cur = execute_sql(help_table_sql)
+        td_types_col_index = -1
+        for i, col_metadata in enumerate(cur.description):
+            # HELP TABLE returns column names and the corresponding
+            # IcebergType, TeradataInternalType and TeradataType.
+            # We need to extract the column index of the 'TeradataType' column.
+            if col_metadata[0].lower() == 'teradatatype':
+                td_types_col_index = i
+
+        col_names = []
+        col_types = []
+        if td_types_col_index > -1:
+            for col_info in cur.fetchall():
+                col_names.append(col_info[0])
+                col_types.append(DataFrameUtils._get_sqlalchemy_type_from_str(col_info[td_types_col_index]))
+        else:
+            raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_CREATE_FAIL),
+                                      MessageCodes.TDMLDF_CREATE_FAIL)
+        return col_names, col_types
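
The type strings returned by HELP TABLE are parsed with a single regular expression. A self-contained sketch of just the parsing step, outside teradataml (KNOWN_TYPES is a stand-in for the real DATALAKE_STR_to_TDSQLALCHEMY_DATATYPE_MAPPER):

    import re

    # Stand-in for _DtypesMappers.DATALAKE_STR_to_TDSQLALCHEMY_DATATYPE_MAPPER.
    KNOWN_TYPES = {"DECIMAL", "VARCHAR", "CHAR", "BLOB", "INTEGER", "FLOAT"}

    def parse_td_type(td_type):
        # Group 1: type name, group 3: parenthesized parameters,
        # group 4: remainder (e.g. 'CHARACTER SET UNICODE').
        m = re.match(r"([A-Z0-9_]+)(\((.*)\))?(.*)", td_type)
        if m is None or m.group(1) not in KNOWN_TYPES:
            raise ValueError("Invalid Teradata type: {}".format(td_type))
        return m.group(1), m.group(3), m.group(4).strip()

    print(parse_td_type("DECIMAL(4,4)"))
    # ('DECIMAL', '4,4', '')
    print(parse_td_type("VARCHAR(32000) CHARACTER SET UNICODE"))
    # ('VARCHAR', '32000', 'CHARACTER SET UNICODE')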
@@ -1,28 +1,36 @@
 import pandas as pd
 from inspect import getsource
 import re
-from types import FunctionType
+from teradataml.dataframe.copy_to import copy_to_sql
+from teradataml.dataframe.dataframe import DataFrame
 from teradataml.dbutils.filemgr import install_file, list_files, remove_file
-from teradataml.options.configure import configure
+from teradataml.utils.utils import execute_sql
 import teradatasqlalchemy as tdsqlalchemy
 from teradataml.utils.validators import _Validators
 from teradataml.dataframe.sql import _SQLColumnExpression
 from teradatasqlalchemy import VARCHAR, CLOB, CHAR
-from teradataml.common.constants import TeradataTypes
+from teradataml.common.constants import TableOperatorConstants, TeradataConstants, TeradataTypes
 from teradataml.common.utils import UtilFuncs
-from teradataml.utils.dtypes import _Dtypes
 from teradataml.dataframe.sql_interfaces import ColumnExpression
 from teradataml.table_operators.table_operator_util import _TableOperatorUtils
-from teradataml.utils.internal_buffer import _InternalBuffer
 from teradataml.common.exceptions import TeradataMlException
 from teradataml.common.messages import Messages
 from teradataml.common.messagecodes import MessageCodes
 from teradataml.scriptmgmt.lls_utils import get_env
 
-def udf(user_function=None, returns=VARCHAR(1024), env_name = None, delimiter=',', quotechar=None):
+def udf(user_function=None, returns=VARCHAR(1024), env_name = None, delimiter=',', quotechar=None, debug=False):
     """
     DESCRIPTION:
         Creates a user defined function (UDF).
+
+    Notes:
+        1. Date and time data types must be formatted to supported formats.
+           (See Prerequisite Input and Output Structures in Open Analytics Framework for more details.)
+        2. Packages required to run the user defined function must be installed in remote user
+           environment using install_lib method of UserEnv class. Import statements of these
+           packages should be inside the user defined function itself.
+        3. Do not call a regular function defined outside the udf() from the user defined function.
+           The function definition and call must be inside the udf(). Look at Example 9 to understand more.
 
     PARAMETERS:
         user_function:
@@ -31,7 +39,7 @@ def udf(user_function=None, returns=VARCHAR(1024), env_name = None, delimiter=',
             teradataml DataFrame.
             Types: function
             Note:
-                1. Lambda Function are not supported.
+                Lambda functions are not supported. Re-write the lambda function as regular Python function to use with UDF.
 
         returns:
             Optional Argument.
@@ -76,21 +84,21 @@ def udf(user_function=None, returns=VARCHAR(1024), env_name = None, delimiter=',
                 * This argument cannot be same as "delimiter" argument.
                 * This argument cannot be a newline character.
 
+        debug:
+            Optional Argument.
+            Specifies whether to display the script file path generated during function execution or not. This
+            argument helps in debugging when there are any failures during function execution. When set
+            to True, function displays the path of the script and does not remove the file from local file system.
+            Otherwise, file is removed from the local file system.
+            Default Value: False
+            Types: bool
+
     RETURNS:
         ColumnExpression
 
     RAISES:
         TeradataMLException
 
-    NOTES:
-        1. While working on date and time data types one must format these to supported formats.
-           (See Requisite Input and Output Structures in Open Analytics Framework for more details.)
-        2. Required packages to run the user defined function must be installed in remote user
-           environment using install_lib function Of UserEnv class. Import statements of these
-           packages should be inside the user defined function itself.
-        3. One can't call a regular function defined outside the udf from the user defined function.
-           The function definition and call must be inside the udf. Look at Example 9 to understand more.
-
     EXAMPLES:
         # Load the data to run the example.
         >>> load_example_data("dataframe", "sales")
@@ -324,14 +332,14 @@ def udf(user_function=None, returns=VARCHAR(1024), env_name = None, delimiter=',
         def wrapper(f):
             def func_(*args):
                 return _SQLColumnExpression(expression=None, udf=f, udf_type=returns, udf_args=args,\
-                                            env_name=env_name, delimiter=delimiter, quotechar=quotechar)
+                                            env_name=env_name, delimiter=delimiter, quotechar=quotechar, debug=debug)
             return func_
         return wrapper
     # Notation: @udf
     else:
         def func_(*args):
             return _SQLColumnExpression(expression=None, udf=user_function, udf_type=returns, udf_args=args,\
-                                        env_name=env_name, delimiter=delimiter, quotechar=quotechar)
+                                        env_name=env_name, delimiter=delimiter, quotechar=quotechar, debug=debug)
         return func_
 
 
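A usage sketch for the new "debug" flag, in the docstring's own example style ('sales' and its 'accounts' column come from load_example_data above):

    >>> from teradataml.dataframe.functions import udf
    >>> from teradatasqlalchemy.types import VARCHAR

    # debug=True prints the path of the generated script file and keeps
    # it on the local file system for inspection after a failure.
    >>> @udf(returns=VARCHAR(100), debug=True)
    ... def to_upper(s):
    ...     if s is not None:
    ...         return s.upper()
    >>> df.assign(upper_accounts=to_upper('accounts'))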
@@ -340,6 +348,12 @@ def register(name, user_function, returns=VARCHAR(1024)):
     DESCRIPTION:
         Registers a user defined function (UDF).
 
+    Notes:
+        1. Date and time data types must be formatted to supported formats.
+           (See Requisite Input and Output Structures in Open Analytics Framework for more details.)
+        2. On VantageCloud Lake, user defined function is registered by default in the 'openml_env' environment.
+           User can register it in their own user environment, using the 'openml_user_env' configuration option.
+
     PARAMETERS:
         name:
             Required Argument.
@@ -351,6 +365,8 @@ def register(name, user_function, returns=VARCHAR(1024)):
             Specifies the user defined function to create a column for
             teradataml DataFrame.
             Types: function, udf
+            Note:
+                Lambda functions are not supported. Re-write the lambda function as regular Python function to use with UDF.
 
         returns:
             Optional Argument.
@@ -459,10 +475,17 @@ def call_udf(udf_name, func_args = () , **kwargs):
     DESCRIPTION:
         Call a registered user defined function (UDF).
 
+    Notes:
+        1. Packages required to run the registered user defined function must be installed in remote user
+           environment using install_lib method of UserEnv class. Import statements of these
+           packages should be inside the user defined function itself.
+        2. On VantageCloud Lake, user defined function runs by default in the 'openml_env' environment.
+           User can use their own user environment, using the 'openml_user_env' configuration option.
+
     PARAMETERS:
         udf_name:
             Required Argument.
-            Specifies the name of the registered user defined.
+            Specifies the name of the registered user defined function.
             Types: str
 
         func_args:
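
register() and call_udf() pair up so a UDF can be defined once and reused by name. A hedged sketch based on the signatures above (names are illustrative):

    >>> from teradataml.dataframe.functions import udf, register, call_udf

    >>> @udf
    ... def to_upper(s):
    ...     return s.upper() if s is not None else None

    # Register under a name, then invoke it by that name with the column
    # arguments packed into func_args.
    >>> register("to_upper_func", to_upper)
    >>> df.assign(upper_accounts=call_udf("to_upper_func", ('accounts',)))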
@@ -864,4 +887,96 @@ def _create_return_type(returns):
     return_str = str(returns)
     # Replace the space with underscore in the return type.
     return_str = return_str.replace(" ", "_")
-    return return_str
+    return return_str
+
+
+def td_range(start, end=None, step=1):
+    """
+    DESCRIPTION:
+        Creates a DataFrame with a specified range of numbers.
+
+    Notes:
+        1. The range is inclusive of the start and exclusive of the end.
+        2. If only start is provided, then end is set to start and start is set to 0.
+
+    PARAMETERS:
+        start:
+            Required Argument.
+            Specifies the starting number of the range.
+            Types: int
+
+        end:
+            Optional Argument.
+            Specifies the end number of the range (exclusive).
+            Default Value: None
+            Types: int
+
+        step:
+            Optional Argument.
+            Specifies the step size of the range.
+            Default Value: 1
+            Types: int
+
+    RETURNS:
+        teradataml DataFrame
+
+    RAISES:
+        TeradataMlException
+
+    EXAMPLES:
+        # Example 1: Create a DataFrame with a range of numbers from 0 to 5.
+        >>> from teradataml.dataframe.functions import td_range
+        >>> df = td_range(5)
+        >>> df.sort('id')
+           id
+        0   0
+        1   1
+        2   2
+        3   3
+        4   4
+
+        # Example 2: Create a DataFrame with a range of numbers from 5 to 1 with step size of -2.
+        >>> from teradataml.dataframe.functions import td_range
+        >>> td_range(5, 1, -2)
+           id
+        0   3
+        1   5
+
+        # Example 3: Create a DataFrame with a range of numbers from 1 to 5 with default step size of 1.
+        >>> from teradataml.dataframe.functions import td_range
+        >>> td_range(1, 5)
+           id
+        0   3
+        1   4
+        2   2
+        3   1
+    """
+    # Validate the arguments.
+    arg_matrix = []
+    arg_matrix.append(["start", start, False, int])
+    arg_matrix.append(["end", end, True, int])
+    arg_matrix.append(["step", step, True, int])
+    _Validators._validate_function_arguments(arg_matrix)
+
+    # If only start is provided, then set end to start and start to 0.
+    if end is None:
+        end = start
+        start = 0
+
+    # If end is less than start, count down: operation "-" and comparison ">".
+    # Otherwise count up: operation "+" and comparison "<".
+    if end < start:
+        operation, operator, step = "-", ">", -step
+    else:
+        operation, operator = "+", "<"
+
+    # Create a temporary table with the start value.
+    table_name = UtilFuncs._generate_temp_table_name(prefix="tdml_range_df",
+                                                     table_type=TeradataConstants.TERADATA_TABLE)
+    execute_sql(f"CREATE MULTISET TABLE {table_name} AS (SELECT {start} AS id) WITH DATA;")
+
+    # Create a DataFrame from the range query.
+    range_query = TableOperatorConstants.RANGE_QUERY.value \
+        .format(table_name, step, end, operation, operator)
+    df = DataFrame.from_query(range_query)
+    return df
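
The descending case flips both the arithmetic and the comparison operator, which is easiest to verify with a pure-Python rendering of the same semantics (a local sketch only, no database involved):

    # Mirrors td_range() semantics: a start-only call means the range [0, start).
    def range_ids(start, end=None, step=1):
        if end is None:
            start, end = 0, start
        # Descending ranges need a negative step, ascending a positive one.
        step = -abs(step) if end < start else abs(step)
        return list(range(start, end, step))

    print(range_ids(5))         # [0, 1, 2, 3, 4]
    print(range_ids(5, 1, -2))  # [5, 3]
    print(range_ids(1, 5))      # [1, 2, 3, 4]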
@@ -149,7 +149,7 @@ def __check_concat_compatibility(df_list, join, sort, ignore_index):
     # Iterate on all DFs to be applied for set operation.
     for df in dfs_to_operate_on:
         # Process each column in the DF of the iteration.
-        for c in df._metaexpr.t.c:
+        for c in df._metaexpr.c:
             col_name = c.name
             # Process the column name if it is not already processed.
             # Processing of set operation is column name based so if the DF in the nth iteration had column 'xyz',
@@ -193,6 +193,8 @@ def __check_concat_compatibility(df_list, join, sort, ignore_index):
                 col_dict[col_name]['col_present'] = col_present_in_dfs
                 # The type to be used for the column is the one of the first DF it is present in.
                 col_dict[col_name]['col_type'] = col_types_in_dfs[0]
+                # Column name stored with quotes if required.
+                col_dict[col_name]['name'] = c.compile()
 
                 # If the type of the column in all DFs is not the same, then the operation is not lazy.
                 if not all(ctype == col_dict[col_name]['col_type']
@@ -217,6 +219,8 @@ def __check_concat_compatibility(df_list, join, sort, ignore_index):
                 col_dict[col_name]['col_present'] = col_present_in_dfs
                 # The type to be used for the column is the one of the first DF it is present in.
                 col_dict[col_name]['col_type'] = non_none_type_to_add
+                # Column name stored with quotes if required.
+                col_dict[col_name]['name'] = c.compile()
 
                 # If the type of the column in all DFs is not the same, then the operation is not lazy.
                 if not all(True if ctype is None else ctype == non_none_type_to_add
@@ -667,15 +671,16 @@ def concat(df_list, join='OUTER', allow_duplicates=True, sort=False, ignore_inde
 
     # Now create the list of columns for each DataFrame to concatenate
     type_compiler = td_type_compiler(td_dialect)
+
     for col_name, value in master_columns_dict.items():
         for i in range(len(col_list)):
+            # Quoting is already done for column names that start with a number
+            # or are reserved keywords. Check again here whether the name is a
+            # Teradata keyword that needs quotes.
+            column_name = UtilFuncs._process_for_teradata_keyword(value['name'])
             if not value['col_present'][i]:
-                col_list[i].append('CAST(NULL as {}) as {}'.format(type_compiler.process(value['col_type']),
-                                                                   UtilFuncs._teradata_quote_arg(col_name, "\"",
-                                                                                                 False)))
+                col_list[i].append('CAST(NULL as {}) as {}'.format(type_compiler.process(value['col_type']), column_name))
             else:
-                col_name = UtilFuncs._process_for_teradata_keyword(col_name)
-                col_list[i].append(col_name)
+                col_list[i].append(column_name)
 
     input_table_columns = []
     for i in range(len(col_list)):
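
The effect shows up when concatenating DataFrames whose column sets differ and include names needing quotes. A hedged sketch (df1 and df2 are placeholders; TYPE is a Teradata reserved word):

    >>> from teradataml.dataframe.setop import concat

    # Assumption: df1 has columns (id, type), df2 only (id). The pre-quoted
    # form of 'type' is reused both when selecting the column and when
    # filling it with a typed NULL for df2.
    >>> concat([df1, df2], join='outer')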