teradataml 20.0.0.4__py3-none-any.whl → 20.0.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of teradataml might be problematic.

Files changed (107)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/README.md +86 -13
  3. teradataml/__init__.py +2 -1
  4. teradataml/_version.py +2 -2
  5. teradataml/analytics/analytic_function_executor.py +7 -12
  6. teradataml/analytics/json_parser/analytic_functions_argument.py +4 -0
  7. teradataml/analytics/sqle/__init__.py +16 -1
  8. teradataml/analytics/utils.py +15 -1
  9. teradataml/automl/__init__.py +290 -106
  10. teradataml/automl/autodataprep/__init__.py +471 -0
  11. teradataml/automl/data_preparation.py +29 -10
  12. teradataml/automl/data_transformation.py +11 -0
  13. teradataml/automl/feature_engineering.py +64 -4
  14. teradataml/automl/feature_exploration.py +639 -25
  15. teradataml/automl/model_training.py +1 -1
  16. teradataml/clients/auth_client.py +2 -2
  17. teradataml/common/constants.py +61 -26
  18. teradataml/common/messagecodes.py +2 -1
  19. teradataml/common/messages.py +5 -4
  20. teradataml/common/utils.py +255 -37
  21. teradataml/context/context.py +225 -87
  22. teradataml/data/apriori_example.json +22 -0
  23. teradataml/data/docs/sqle/docs_17_20/Apriori.py +138 -0
  24. teradataml/data/docs/sqle/docs_17_20/NERExtractor.py +121 -0
  25. teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +3 -3
  26. teradataml/data/docs/sqle/docs_17_20/SMOTE.py +212 -0
  27. teradataml/data/docs/sqle/docs_17_20/TextMorph.py +119 -0
  28. teradataml/data/docs/sqle/docs_17_20/TextParser.py +54 -3
  29. teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -1
  30. teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +2 -2
  31. teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +2 -2
  32. teradataml/data/docs/uaf/docs_17_20/DFFT.py +1 -1
  33. teradataml/data/docs/uaf/docs_17_20/DFFT2.py +1 -1
  34. teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +1 -1
  35. teradataml/data/docs/uaf/docs_17_20/DFFTConv.py +1 -1
  36. teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +4 -4
  37. teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +2 -2
  38. teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +2 -2
  39. teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +6 -6
  40. teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
  41. teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +1 -1
  42. teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +4 -4
  43. teradataml/data/docs/uaf/docs_17_20/MultivarRegr.py +1 -1
  44. teradataml/data/docs/uaf/docs_17_20/PACF.py +1 -1
  45. teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
  46. teradataml/data/docs/uaf/docs_17_20/PowerTransform.py +3 -3
  47. teradataml/data/docs/uaf/docs_17_20/Resample.py +5 -5
  48. teradataml/data/docs/uaf/docs_17_20/SAX.py +3 -3
  49. teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
  50. teradataml/data/docs/uaf/docs_17_20/SimpleExp.py +1 -1
  51. teradataml/data/docs/uaf/docs_17_20/Smoothma.py +3 -3
  52. teradataml/data/docs/uaf/docs_17_20/UNDIFF.py +1 -1
  53. teradataml/data/jsons/sqle/17.20/NGramSplitter.json +6 -6
  54. teradataml/data/jsons/sqle/17.20/TD_Apriori.json +181 -0
  55. teradataml/data/jsons/sqle/17.20/TD_NERExtractor.json +145 -0
  56. teradataml/data/jsons/sqle/17.20/TD_SMOTE.json +267 -0
  57. teradataml/data/jsons/sqle/17.20/TD_TextMorph.json +134 -0
  58. teradataml/data/jsons/sqle/17.20/TD_TextParser.json +114 -9
  59. teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +328 -0
  60. teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +420 -0
  61. teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +343 -0
  62. teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +328 -0
  63. teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +328 -0
  64. teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +328 -0
  65. teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +328 -0
  66. teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +359 -0
  67. teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +360 -0
  68. teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +343 -0
  69. teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +343 -0
  70. teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +2 -2
  71. teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +1 -1
  72. teradataml/data/ner_dict.csv +8 -0
  73. teradataml/data/ner_input_eng.csv +7 -0
  74. teradataml/data/ner_rule.csv +5 -0
  75. teradataml/data/pos_input.csv +40 -0
  76. teradataml/data/tdnerextractor_example.json +14 -0
  77. teradataml/data/teradataml_example.json +13 -0
  78. teradataml/data/textmorph_example.json +5 -0
  79. teradataml/data/to_num_data.csv +4 -0
  80. teradataml/data/tochar_data.csv +5 -0
  81. teradataml/data/trans_dense.csv +16 -0
  82. teradataml/data/trans_sparse.csv +55 -0
  83. teradataml/dataframe/copy_to.py +37 -26
  84. teradataml/dataframe/data_transfer.py +61 -45
  85. teradataml/dataframe/dataframe.py +130 -50
  86. teradataml/dataframe/dataframe_utils.py +15 -2
  87. teradataml/dataframe/functions.py +109 -9
  88. teradataml/dataframe/sql.py +328 -76
  89. teradataml/dbutils/dbutils.py +33 -13
  90. teradataml/dbutils/filemgr.py +14 -10
  91. teradataml/lib/aed_0_1.dll +0 -0
  92. teradataml/opensource/_base.py +6 -157
  93. teradataml/options/configure.py +4 -5
  94. teradataml/scriptmgmt/UserEnv.py +305 -38
  95. teradataml/scriptmgmt/lls_utils.py +376 -130
  96. teradataml/store/__init__.py +1 -1
  97. teradataml/table_operators/Apply.py +16 -1
  98. teradataml/table_operators/Script.py +20 -1
  99. teradataml/table_operators/table_operator_util.py +58 -9
  100. teradataml/utils/dtypes.py +2 -1
  101. teradataml/utils/internal_buffer.py +22 -2
  102. teradataml/utils/validators.py +313 -57
  103. {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.5.dist-info}/METADATA +89 -14
  104. {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.5.dist-info}/RECORD +107 -77
  105. {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.5.dist-info}/WHEEL +0 -0
  106. {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.5.dist-info}/top_level.txt +0 -0
  107. {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.5.dist-info}/zip-safe +0 -0
teradataml/dataframe/dataframe.py

@@ -243,6 +243,11 @@ class DataFrame():
  # Property to determine if table is an ART table or not.
  self._is_art = None

+ # This attribute stores the previous assign arguments in continuous assign calls.
+ self._previous_assign_args = None
+ # This attribute stores the root DataFrame columns.
+ self._root_columns = None
+
  self._datalake = None
  self._database = None
  self._table = None
@@ -2924,9 +2929,8 @@ class DataFrame():
  msg = Messages.get_message(errcode)
  raise TeradataMlException(msg, errcode)

- @argument_deprecation("20.0.0.5", "include", False, None)
  @collect_queryband(queryband="DF_describe")
- def describe(self, percentiles=[.25, .5, .75], include=None, verbose=False, distinct=False, statistics=None,
+ def describe(self, percentiles=[.25, .5, .75], verbose=False, distinct=False, statistics=None,
  columns=None, pivot=False):
  """
  DESCRIPTION:
@@ -2956,18 +2960,6 @@ class DataFrame():
  Default Values: [.25, .5, .75], which returns the 25th, 50th, and 75th percentiles.
  Types: float or List of floats

- include:
- Optional Argument.
- Values can be either None or "all".
- If the value is "all", both numeric and non-numeric columns are included.
- Computes count, mean, std, min, percentiles, and max for numeric columns.
- Computes count and unique for non-numeric columns.
- If the value is None, only numeric columns are used for collecting statistics.
- Note:
- * Value 'all' is not applicable for 'Time Series Aggregate Mode'.
- Default Values: None
- Types: str
-
  verbose:
  Optional Argument.
  Specifies a boolean value to be used for time series aggregation, stating whether to get
@@ -2994,7 +2986,6 @@ class DataFrame():
  Computes count and unique for non-numeric columns.
  Notes:
  1. statistics is not applicable for 'Time Series Aggregate Mode'.
- 2. statistics should not be used with include as 'all'.
  Permitted Values: count, mean, min, max, unique, std, describe, percentile
  Default Values: None
  Types: str or List of str
@@ -3310,7 +3301,6 @@ class DataFrame():
  awu_matrix = []
  awu_matrix.append(["columns", columns, True, (str, list), True])
  awu_matrix.append(["percentiles", percentiles, True, (float, list)])
- awu_matrix.append(["include", include, True, (str), True, [None, "all"]])
  awu_matrix.append(["verbose", verbose, True, (bool)])
  awu_matrix.append(["distinct", distinct, True, (bool)])
  awu_matrix.append(["statistics", statistics, True, (str, list), True,
@@ -3334,22 +3324,11 @@ class DataFrame():
  if statistics:
  statistics = [stats.lower() for stats in UtilFuncs._as_list(statistics)]

- # Argument include and statistics should not be used together
- if include is not None and statistics is not None:
- raise ValueError(Messages.get_message(MessageCodes.CANNOT_USE_TOGETHER_WITH).format(
- 'include', 'statistics'
- ))
-
  # Percentiles must be a list of values between 0 and 1.
  if not isinstance(percentiles, list) or not all(p > 0 and p < 1 for p in percentiles):
  raise ValueError(Messages.get_message(MessageCodes.INVALID_ARG_VALUE, percentiles, "percentiles",
  "percentiles must be a list of values between 0 and 1"))

- # Argument 'include' with value 'all' is not allowed for DataFrameGroupByTime
- if include is not None and include.lower() == "all" and isinstance(self, DataFrameGroupByTime):
- raise ValueError(Messages.get_message(MessageCodes.ARG_VALUE_CLASS_DEPENDENCY).format(
- 'include', 'Aggregation', 'all', 'describe()', 'DataFrame or DataFrameGroupBy'))
-
  # Argument 'statistics' is not allowed for DataFrameGroupByTime
  if statistics is not None and isinstance(self, DataFrameGroupByTime):
  raise ValueError(Messages.get_message(MessageCodes.ARG_VALUE_CLASS_DEPENDENCY).format(
@@ -3383,7 +3362,7 @@ class DataFrame():
  # Construct the aggregate query.
  agg_query = df_utils._construct_describe_query(df=self, columns=columns, metaexpr=self._metaexpr,
  percentiles=percentiles, function_label=function_label,
- groupby_column_list=groupby_column_list, include=include,
+ groupby_column_list=groupby_column_list, include=None,
  is_time_series_aggregate=True, verbose=verbose,
  distinct=distinct,
  timebucket_duration=self._timebucket_duration,
@@ -3414,7 +3393,7 @@ class DataFrame():
  # Construct the aggregate query.
  agg_query = df_utils._construct_describe_query(df=self, columns=columns, metaexpr=self._metaexpr,
  percentiles=percentiles, function_label=function_label,
- groupby_column_list=groupby_column_list, include=include,
+ groupby_column_list=groupby_column_list, include=None,
  is_time_series_aggregate=False, verbose=verbose,
  distinct=distinct, statistics=statistics)
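With "include" removed from describe() in this release, non-numeric summaries are requested through the "statistics" argument instead. A minimal sketch of the replacement call pattern (assuming an established Vantage session and an existing teradataml DataFrame df with both numeric and character columns):

>>> # Custom percentiles for numeric columns, unchanged by this release.
>>> df.describe(percentiles=[0.3, 0.6])
>>> # Roughly what include='all' used to report for non-numeric columns.
>>> df.describe(statistics=['count', 'unique'], columns=['first_name', 'employee_no'])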
@@ -5570,8 +5549,10 @@ class DataFrame():
  Specifies the function(s) to apply on DataFrame columns.

  Valid values for func are:
- 'count', 'sum', 'min', 'max', 'mean', 'std', 'percentile', 'unique',
- 'median', 'var'
+ * 'count', 'sum', 'min', 'max', 'mean', 'std', 'percentile', 'percentile_<floatvalue>', 'unique',
+ 'median', 'var'
+ * Note: In 'percentile_<floatvalue>', <floatvalue> specifies the desired percentile value to
+ calculate aggregate. It should be in the range of 0.0 to 1.0 (both inclusive).

  Acceptable formats for function(s) are
  string, dictionary, list of strings/functions/ColumnExpression or ColumnExpression.
@@ -5605,12 +5586,17 @@ class DataFrame():
  Output column names after the above operation are:
  min_employee_no, sum_employee_no, var_employee_no, min_first_name

- 4. "func" passed as a ColumnExpression built using the aggregate functions.
+ 4. "percentile_<floatvalue>" passed to agg.
+ >>> df.agg({'employee_no' : ['percentile_0.25', 'percentile_0.75', 'min']})
+ >>> df.agg(['percentile_0.25', 'percentile_0.75', 'sum'])
+ >>> df.agg('percentile_0.25')
+
+ 5. "func" passed as a ColumnExpression built using the aggregate functions.
  >>> df.agg(df.first_name.count())
  Output column name after the above operation is:
  count(first_name)

- 5. "func" passed as a list of ColumnExpression built using the aggregate functions.
+ 6. "func" passed as a list of ColumnExpression built using the aggregate functions.
  >>> df.agg([df.employee_no.min(), df.first_name.count()])
  Output column names after the above operation are:
  min(employee_no), count(first_name)
@@ -5698,6 +5684,12 @@ class DataFrame():
  min_employee_no sum_employee_no var_employee_no min_first_name
  0 100 313 44.333333 abcd

+ # Get the minimum, 25 percentile value and variance of employee number, by passing dictionary of
+ # column names to string function/list of string functions as parameter.
+ >>> df.agg({'employee_no' : ['min', 'percentile_0.25', 'var']})
+ min_employee_no percentile_0.25_employee_no var_employee_no
+ 0 100 100 44.333333
+
  # Get the minimum and sum of all the columns in the dataframe,
  # by passing list of string functions as parameter.
  >>> df.agg(['min', 'sum'])
@@ -5743,9 +5735,15 @@ class DataFrame():
  mean_employee_no unique_employee_no unique_first_name mean_joined_date unique_joined_date
  0 104.333333 3 2 60/12/04 2

+ # Get the percentile of each column in the dataframe with default value 0.5.
  >>> df.agg('percentile')
- percentile_employee_no percentile_marks
- 0 101 None
+ percentile_employee_no percentile_marks
+ 0 101 None
+
+ # Get 80 percentile of each column in the datafame.
+ >>> df.agg('percentile_0.8')
+ percentile_0.8_employee_no percentile_0.8_marks
+ 0 107 None

  # Using another table 'sales' (having repeated values) to demonstrate operations
  # 'unique' and 'percentile'.
@@ -5762,9 +5760,11 @@ class DataFrame():
  Blue Inc 90.0 50 95 101 2017-04-01
  Red Inc 200.0 150 140 None 2017-04-01

- >>> df.agg('percentile')
- percentile_Feb percentile_Jan percentile_Mar percentile_Apr
- 0 200.0 150 140 215
+ # Get 80 and 40 percentile values of each column in the dataframe.
+ >>> df1 = df.select(['Feb', 'Jan', 'Mar', 'Apr'])
+ >>> df1.agg(['percentile_0.8', 'percentile_0.4'])
+ percentile_0.8_Feb percentile_0.4_Feb percentile_0.8_Jan percentile_0.4_Jan percentile_0.8_Mar percentile_0.4_Mar percentile_0.8_Apr percentile_0.4_Apr
+ 0 210.0 200.0 170 150 170 140 250 194

  >>> df.agg('unique')
  unique_accounts unique_Feb unique_Jan unique_Mar unique_Apr unique_datetime
@@ -5951,6 +5951,8 @@ class DataFrame():

  except TeradataMlException:
  raise
+ except ValueError:
+ raise
  except Exception as err:
  raise TeradataMlException(Messages.get_message(
  MessageCodes.EXECUTION_FAILED, "perform {} on DataFrame".format(operation), str(err)),
@@ -7760,7 +7762,7 @@ class DataFrame():
  """
  return (type(None), int, float, str, decimal.Decimal, ColumnExpression, ClauseElement)

- def _generate_assign_metaexpr_aed_nodeid(self, drop_columns, **kwargs):
+ def _generate_assign_metaexpr_aed_nodeid(self, drop_columns, node_id, **kwargs):
  """
  DESCRIPTION:
  Function generates the MetaExpression and AED nodeid for DataFrame.assign()
@@ -7773,6 +7775,11 @@ class DataFrame():
  Default Value: False
  Types: bool

+ node_id:
+ Optional Argument.
+ Specifies the input nodeid for the assign operation.
+ Types: str
+
  kwargs:
  keyword, value pairs
  - keywords are the column names.
@@ -7800,7 +7807,7 @@ class DataFrame():

  # Join the expressions in result.
  assign_expression = ', '.join(list(map(lambda x: x[1], result)))
- new_nodeid = self._aed_utils._aed_assign(self._nodeid,
+ new_nodeid = self._aed_utils._aed_assign(node_id,
  assign_expression,
  AEDConstants.AED_ASSIGN_DROP_EXISITING_COLUMNS.value)

@@ -7939,7 +7946,7 @@ class DataFrame():
  env_mapper[env_name] = [colname]
  else:
  env_mapper[env_name] = udf_expr.keys()
-
+ debug = False
  for env_name, cols in env_mapper.items():
  # Create a dictionary of output columns to column type.
  returns = OrderedDict([(column.name, column.type) for column in df._metaexpr.c])
@@ -7950,6 +7957,7 @@ class DataFrame():
  # Create a dictionary of output column name to udf arguments
  function_args = {}
  for colname, col in udf_expr.items():
+ debug |= col._debug
  delimiter = col._delimiter
  quotechar = col._quotechar
  if colname in cols:
@@ -7982,7 +7990,9 @@ class DataFrame():
  columns_definitions=columns_definitions,
  output_type_converters={
  col_name: _Dtypes._teradata_type_to_python_type(col_type)
- for col_name, col_type in returns.items()})
+ for col_name, col_type in returns.items()},
+ debug=debug
+ )

  df = tbl_operators.execute()
  return df
@@ -8624,8 +8634,34 @@ class DataFrame():
  # from udf expression.
  if bool(regular_expr):
  try:
- (new_meta, new_nodeid) = df._generate_assign_metaexpr_aed_nodeid(drop_columns, **regular_expr)
+ root_node_id = None
+ root_df_col = df.columns
+
+ # Get the previous node type, if it is assign and drop_columns is False,
+ # then check if the previous assign arguments exists and are not present
+ # in either the root dataframe columns or the current assign arguments.
+ # if these conditions are met, obtain the root node id (i.e., the first
+ # node of the assign operation) and merge the previous assign arguments with the current ones.
+
+ prev_node_type = df._aed_utils._aed_get_node_query_type(df._nodeid)
+ if not drop_columns and prev_node_type == "assign" and df._previous_assign_args is not None:
+ if not df._root_columns & df._previous_assign_args.keys() and \
+ not df._previous_assign_args.keys() & regular_expr.keys():
+ # Get the root node id and root dataframe columns.
+ root_df_col = df._root_columns
+ root_node_id = df._aed_utils._aed_get_parent_nodeids(df._nodeid)[0]
+ regular_expr = {**df._previous_assign_args, **regular_expr}
+
+ # If root_node_id is None, assign the current node id as root node of assign operation
+ node_id = root_node_id if root_node_id is not None else df._nodeid
+
+ # Generate new meta expression and node id for the new dataframe.
+ (new_meta, new_nodeid) = df._generate_assign_metaexpr_aed_nodeid(
+ drop_columns, node_id = node_id, **regular_expr)
  df = df._create_dataframe_from_node(new_nodeid, new_meta, df._index_label)
+ df._previous_assign_args = regular_expr
+ df._root_columns = root_df_col
+
  except Exception as err:
  errcode = MessageCodes.TDMLDF_INFO_ERROR
  msg = Messages.get_message(MessageCodes.TDMLDF_INFO_ERROR)
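This is the "continuous assign" optimization that the new _previous_assign_args and _root_columns attributes support: back-to-back assign() calls that add independent columns are folded into a single AED assign node rooted at the first assign, instead of growing a chain of nodes. A minimal sketch of the call pattern it targets (reusing the 'sales' DataFrame from the agg examples above):

>>> df = df.assign(feb_double = df.Feb * 2)
>>> df = df.assign(feb_triple = df.Feb * 3)
>>> # As long as neither call drops columns and the new names collide neither with
>>> # the root columns nor with each other, both land in one merged assign node.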
@@ -11569,6 +11605,10 @@ class DataFrame():
  DESCRIPTION:
  Function to apply a user defined function to each row in the
  teradataml DataFrame, leveraging Vantage's Script Table Operator.
+ Notes:
+ 1. The function requires to use same Python version in both Vantage and local environment.
+ 2. Teradata recommends to use "dill" package with same version in both Vantage and
+ local environment.

  PARAMETERS:
  user_function:
@@ -11749,6 +11789,15 @@ class DataFrame():
  Default Value: True
  Types: bool

+ debug:
+ Optional Argument.
+ Specifies whether to display the script file path generated during function execution or not. This
+ argument helps in debugging when there are any failures during function execution. When set
+ to True, function displays the path of the script and does not remove the file from local file system.
+ Otherwise, file is removed from the local file system.
+ Default Value: False
+ Types: bool
+
  RETURNS:
  1. teradataml DataFrame if exec_mode is "IN-DB".
  2. Pandas DataFrame if exec_mode is "LOCAL".
@@ -11901,6 +11950,7 @@ class DataFrame():
  sort_ascending = kwargs.pop('sort_ascending', True)
  auth = kwargs.pop('auth', None)
  charset = kwargs.pop('charset', None)
+ debug = kwargs.pop('debug', False)

  # Check for other extra/unknown arguments.
  unknown_args = list(kwargs.keys())
@@ -11919,7 +11969,7 @@ class DataFrame():
  sort_ascending=sort_ascending,
  returns=returns, delimiter=delimiter,
  quotechar=quotechar, auth=auth,
- charset=charset, num_rows=num_rows)
+ charset=charset, num_rows=num_rows, debug=debug)

  return tbl_op_util.execute()

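The "debug" flag surfaces the script that map_row() generates and leaves it on disk for inspection. A minimal sketch (df and the row function are assumed here, not part of this diff; 'salary' is a hypothetical column):

>>> def increase_salary(row):
...     # Hypothetical row-wise function operating on one row at a time.
...     row['salary'] = row['salary'] * 1.10
...     return row
>>> df.map_row(increase_salary, debug=True)  # prints the generated script path and keeps the file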
@@ -11936,6 +11986,10 @@ class DataFrame():
  DESCRIPTION:
  Function to apply a user defined function to a group or partition of rows
  in the teradataml DataFrame, leveraging Vantage's Script Table Operator.
+ Notes:
+ 1. The function requires to use same Python version in both Vantage and local environment.
+ 2. Teradata recommends to use "dill" package with same version in both Vantage and
+ local environment.

  PARAMETERS:
  user_function:
@@ -12146,6 +12200,15 @@ class DataFrame():
  Default Value: True
  Types: bool

+ debug:
+ Optional Argument.
+ Specifies whether to display the script file path generated during function execution or not. This
+ argument helps in debugging when there are any failures during function execution. When set
+ to True, function displays the path of the script and does not remove the file from local file system.
+ Otherwise, file is removed from the local file system.
+ Default Value: False
+ Types: bool
+
  RETURNS:
  1. teradataml DataFrame if exec_mode is "IN-DB".
  2. Pandas DataFrame if exec_mode is "LOCAL".
@@ -12311,6 +12374,7 @@ class DataFrame():
  sort_ascending = kwargs.pop('sort_ascending', True)
  auth = kwargs.pop('auth', None)
  charset = kwargs.pop('charset', None)
+ debug = kwargs.pop('debug', False)

  # Check for other extra/unknown arguments.
  unknown_args = list(kwargs.keys())
@@ -12329,7 +12393,7 @@ class DataFrame():
  sort_ascending=sort_ascending,
  returns=returns, delimiter=delimiter,
  quotechar=quotechar, auth=auth,
- charset=charset, num_rows=num_rows)
+ charset=charset, num_rows=num_rows, debug=debug)

  return tbl_op_util.execute()

@@ -12346,9 +12410,9 @@ class DataFrame():
  teradataml DataFrame, leveraging Apply Table Operator of Open
  Analytics Framework.
  Notes:
- 1. The function requires dill package with same version in both remote environment
- and local environment.
- 2. Teradata recommends to use same Python version in both remote and local environment.
+ 1. The function requires to use same Python version in both remote environment and local environment.
+ 2. Teradata recommends to use "dill" package with same version in both remote environment and
+ local environment.

  PARAMETERS:
  user_function:
@@ -12531,6 +12595,15 @@ class DataFrame():
  Default value: "csv"
  Types: str

+ debug:
+ Optional Argument.
+ Specifies whether to display the script file path generated during function execution or not. This
+ argument helps in debugging when there are any failures during function execution. When set
+ to True, function displays the path of the script and does not remove the file from local file system.
+ Otherwise, file is removed from the local file system.
+ Default Value: False
+ Types: bool
+
  RETURNS:
  teradataml DataFrame.

@@ -12707,6 +12780,7 @@ class DataFrame():
  is_local_order = kwargs.pop('is_local_order', False)
  nulls_first = kwargs.pop('nulls_first', True)
  sort_ascending = kwargs.pop('sort_ascending', True)
+ debug = kwargs.pop('debug', False)

  # Check for other extra/unknown arguments.
  unknown_args = list(kwargs.keys())
@@ -12729,7 +12803,8 @@ class DataFrame():
  charset=None,
  num_rows=num_rows,
  env_name=env_name,
- style=style)
+ style=style,
+ debug=debug)

  return tbl_op_util.execute()

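apply() picks up the same flag for the Open Analytics Framework path. A sketch only (the environment name and row function are assumed placeholders):

>>> df.apply(increase_salary, env_name='demo_env', debug=True)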
@@ -15446,7 +15521,7 @@ class DataFrameGroupBy(DataFrame):
  from sqlalchemy.sql.functions import Function
  return (type(None), int, float, str, decimal.Decimal, Function, ColumnExpression, ClauseElement)

- def _generate_assign_metaexpr_aed_nodeid(self, drop_columns, **kwargs):
+ def _generate_assign_metaexpr_aed_nodeid(self, drop_columns, node_id, **kwargs):
  """
  DESCRIPTION:
  Function generates the MetaExpression and AED nodeid for DataFrameGroupBy.assign()
@@ -15459,6 +15534,11 @@ class DataFrameGroupBy(DataFrame):
  and grouping columns are returned. This is unused argument.
  Types: bool

+ node_id:
+ Optional Argument.
+ Specifies the input nodeid for the assign operation. This is unused argument.
+ Types: str
+
  kwargs:
  keyword, value pairs
  - keywords are the column names.
teradataml/dataframe/dataframe_utils.py

@@ -652,7 +652,7 @@ class DataFrameUtils():
  all_operations = list(set(all_operations))
  invalid_aggregates = []
  for operation in all_operations:
- if operation not in valid_aggregate_operations \
+ if operation not in valid_aggregate_operations and not operation.startswith('percentile_') \
  and operation not in UtilFuncs._get_valid_time_series_aggregate_operations():
  invalid_aggregates.append(operation)
  if len(invalid_aggregates) > 0: # If any of the aggregate operations specified is not valid
@@ -735,7 +735,20 @@ class DataFrameUtils():
  quoted_columns = UtilFuncs._process_for_teradata_keyword(kwargs[key_to_process])
  kwargs[key_to_process] = quoted_columns

- func_expression = getattr(df[column], operation)(describe_op=describe_op, **kwargs)
+ if operation.startswith('percentile_'):
+ try:
+ _operation_value = operation.split('_')
+ _floatvalue = float(_operation_value[1])
+ if _floatvalue < 0.0 or _floatvalue > 1.0 or len(_operation_value)>2:
+ raise ValueError
+ except ValueError:
+ mssg = "Invalid aggregate operation '{}' requested on TeradataML DataFrame." \
+ " Valid operation should be in format 'percentile_<floatvalue>' and <floatvalue> " \
+ "should be in range [0.0, 1.0].".format(operation)
+ raise ValueError(mssg) from None
+ func_expression = getattr(df[column], 'percentile')(percentile=_floatvalue)
+ else:
+ func_expression = getattr(df[column], operation)(describe_op=describe_op, **kwargs)
  new_column_name = column if describe_op else "{1}_{0}".format(column, operation)
  # column_supported, new_column_name, new_column_type, column_aggr_expr, invalid_column_str
  return True, new_column_name, NUMBER() if describe_op else func_expression.type, \
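This is where the 'percentile_<floatvalue>' spelling is parsed: the suffix is converted to a float, validated against [0.0, 1.0], and dispatched to the column's percentile aggregate. So the string form and the ColumnExpression form below should produce equivalent aggregations, though the output column names differ (df is assumed to be the 'sales' DataFrame from earlier):

>>> df.agg('percentile_0.8')
>>> df.agg(df.Feb.percentile(percentile=0.8))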
teradataml/dataframe/functions.py

@@ -1,25 +1,24 @@
  import pandas as pd
  from inspect import getsource
  import re
- from types import FunctionType
+ from teradataml.dataframe.copy_to import copy_to_sql
+ from teradataml.dataframe.dataframe import DataFrame
  from teradataml.dbutils.filemgr import install_file, list_files, remove_file
- from teradataml.options.configure import configure
+ from teradataml.utils.utils import execute_sql
  import teradatasqlalchemy as tdsqlalchemy
  from teradataml.utils.validators import _Validators
  from teradataml.dataframe.sql import _SQLColumnExpression
  from teradatasqlalchemy import VARCHAR, CLOB, CHAR
- from teradataml.common.constants import TeradataTypes
+ from teradataml.common.constants import TableOperatorConstants, TeradataConstants, TeradataTypes
  from teradataml.common.utils import UtilFuncs
- from teradataml.utils.dtypes import _Dtypes
  from teradataml.dataframe.sql_interfaces import ColumnExpression
  from teradataml.table_operators.table_operator_util import _TableOperatorUtils
- from teradataml.utils.internal_buffer import _InternalBuffer
  from teradataml.common.exceptions import TeradataMlException
  from teradataml.common.messages import Messages
  from teradataml.common.messagecodes import MessageCodes
  from teradataml.scriptmgmt.lls_utils import get_env

- def udf(user_function=None, returns=VARCHAR(1024), env_name = None, delimiter=',', quotechar=None):
+ def udf(user_function=None, returns=VARCHAR(1024), env_name = None, delimiter=',', quotechar=None, debug=False):
  """
  DESCRIPTION:
  Creates a user defined function (UDF).
@@ -85,6 +84,15 @@ def udf(user_function=None, returns=VARCHAR(1024), env_name = None, delimiter=',
  * This argument cannot be same as "delimiter" argument.
  * This argument cannot be a newline character.

+ debug:
+ Optional Argument.
+ Specifies whether to display the script file path generated during function execution or not. This
+ argument helps in debugging when there are any failures during function execution. When set
+ to True, function displays the path of the script and does not remove the file from local file system.
+ Otherwise, file is removed from the local file system.
+ Default Value: False
+ Types: bool
+
  RETURNS:
  ColumnExpression

@@ -324,14 +332,14 @@ def udf(user_function=None, returns=VARCHAR(1024), env_name = None, delimiter=',
  def wrapper(f):
  def func_(*args):
  return _SQLColumnExpression(expression=None, udf=f, udf_type=returns, udf_args=args,\
- env_name=env_name, delimiter=delimiter, quotechar=quotechar)
+ env_name=env_name, delimiter=delimiter, quotechar=quotechar, debug=debug)
  return func_
  return wrapper
  # Notation: @udf
  else:
  def func_(*args):
  return _SQLColumnExpression(expression=None, udf=user_function, udf_type=returns, udf_args=args,\
- env_name=env_name, delimiter=delimiter, quotechar=quotechar)
+ env_name=env_name, delimiter=delimiter, quotechar=quotechar, debug=debug)
  return func_

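As the wrapper shows, the debug flag rides along on the _SQLColumnExpression built for each UDF call and is later OR-ed together per environment in the assign path (the "debug |= col._debug" line above). A minimal decorator-style sketch (the function to_upper and the column name are assumed, not part of this diff):

>>> from teradataml.dataframe.functions import udf
>>> from teradatasqlalchemy import VARCHAR
>>> @udf(returns=VARCHAR(64), debug=True)
... def to_upper(s):
...     return s.upper()
>>> df.assign(upper_name=to_upper('first_name'))  # keeps the generated script for inspection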
@@ -879,4 +887,96 @@ def _create_return_type(returns):
  return_str = str(returns)
  # Replace the space with underscore in the return type.
  return_str = return_str.replace(" ", "_")
- return return_str
+ return return_str
+
+ def td_range(start, end=None, step=1):
+ """
+ DESCRIPTION:
+ Creates a DataFrame with a specified range of numbers.
+
+ Notes:
+ 1. The range is inclusive of the start and exclusive of the end.
+ 2. If only start is provided, then end is set to start and start is set to 0.
+
+ PARAMETERS:
+ start:
+ Required Argument.
+ Specifies the starting number of the range.
+ Types: int
+
+ end:
+ Optional Argument.
+ Specifies the end number of the range(exclusive).
+ Default Value: None
+ Types: int
+
+ step:
+ Optional Argument.
+ Specifies the step size of the range.
+ Default Value: 1
+ Types: int
+
+ RETURNS:
+ teradataml DataFrame
+
+ RAISES:
+ TeradataMlException
+
+ EXAMPLES:
+ # Example 1: Create a DataFrame with a range of numbers from 0 to 5.
+ >>> from teradataml.dataframe.functions import td_range
+ >>> df = td_range(5)
+ >>> df.sort('id')
+ id
+ 0 0
+ 1 1
+ 2 2
+ 3 3
+ 4 4
+
+ # Example 2: Create a DataFrame with a range of numbers from 5 to 1 with step size of -2.
+ >>> from teradataml.dataframe.functions import td_range
+ >>> td_range(5, 1, -2)
+ id
+ 0 3
+ 1 5
+
+ >>> Example 3: Create a DataFrame with a range of numbers from 1 to 5 with default step size of 1.
+ >>> from teradataml.dataframe.functions import td_range
+ >>> td_range(1, 5)
+ id
+ 0 3
+ 1 4
+ 2 2
+ 3 1
+
+ """
+ # Validate the arguments.
+ arg_matrix = []
+ arg_matrix.append(["start", start, False, int])
+ arg_matrix.append(["end", end, True, int])
+ arg_matrix.append(["step", step, True, int])
+ _Validators._validate_function_arguments(arg_matrix)
+
+ # If only start is provided, then set end to start and start to 0.
+ if end is None:
+ end = start
+ start = 0
+
+ # If start is greater than end, then set the operation to "-" and operator to ">".
+ # If end is less than start, then set the operation to "+" and operator to "<".
+ if end < start:
+ operation, operator, step = "-", ">", -step
+ else:
+ operation, operator = "+", "<"
+
+ # Create a temporary table with the start value.
+ table_name = UtilFuncs._generate_temp_table_name(prefix="tdml_range_df",
+ table_type=TeradataConstants.TERADATA_TABLE)
+ execute_sql(f"CREATE MULTISET TABLE {table_name} AS (SELECT {start} AS id) WITH DATA;")
+
+ # Create a DataFrame from the range query.
+ range_query = TableOperatorConstants.RANGE_QUERY.value \
+ .format(table_name, step, end, operation, operator)
+ df = DataFrame.from_query(range_query)
+ return df
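td_range seeds a one-row temporary table with the start value and then hands off to TableOperatorConstants.RANGE_QUERY, which lives in teradataml/common/constants.py (also changed in this release, but not shown in this diff). Judging only by the .format(table_name, step, end, operation, operator) call above, a recursive query of roughly the following shape would fit; this is a sketch under that assumption, not the library's actual constant:

# Hypothetical reconstruction of RANGE_QUERY -- the placeholders mirror the
# .format(table_name, step, end, operation, operator) call in td_range.
ASSUMED_RANGE_QUERY = (
    "WITH RECURSIVE range_cte (id) AS ("
    " SELECT id FROM {0}"                 # seed row holding the start value
    " UNION ALL"
    " SELECT id {3} {1} FROM range_cte"   # id + step (or id - step)
    " WHERE id {3} {1} {4} {2}"           # keep stepping while < end (or > end)
    ") SELECT id FROM range_cte"
)

# td_range(5, 1, -2) would format this to a seed of 5 with "id - 2 ... WHERE id - 2 > 1",
# yielding the rows 5 and 3 -- consistent with Example 2 in the docstring above.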