teradataml 20.0.0.4__py3-none-any.whl → 20.0.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of teradataml might be problematic. Click here for more details.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/README.md +86 -13
- teradataml/__init__.py +2 -1
- teradataml/_version.py +2 -2
- teradataml/analytics/analytic_function_executor.py +7 -12
- teradataml/analytics/json_parser/analytic_functions_argument.py +4 -0
- teradataml/analytics/sqle/__init__.py +16 -1
- teradataml/analytics/utils.py +15 -1
- teradataml/automl/__init__.py +290 -106
- teradataml/automl/autodataprep/__init__.py +471 -0
- teradataml/automl/data_preparation.py +29 -10
- teradataml/automl/data_transformation.py +11 -0
- teradataml/automl/feature_engineering.py +64 -4
- teradataml/automl/feature_exploration.py +639 -25
- teradataml/automl/model_training.py +1 -1
- teradataml/clients/auth_client.py +2 -2
- teradataml/common/constants.py +61 -26
- teradataml/common/messagecodes.py +2 -1
- teradataml/common/messages.py +5 -4
- teradataml/common/utils.py +255 -37
- teradataml/context/context.py +225 -87
- teradataml/data/apriori_example.json +22 -0
- teradataml/data/docs/sqle/docs_17_20/Apriori.py +138 -0
- teradataml/data/docs/sqle/docs_17_20/NERExtractor.py +121 -0
- teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +3 -3
- teradataml/data/docs/sqle/docs_17_20/SMOTE.py +212 -0
- teradataml/data/docs/sqle/docs_17_20/TextMorph.py +119 -0
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +54 -3
- teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/DFFT.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFT2.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFTConv.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +6 -6
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/MultivarRegr.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/PACF.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/PowerTransform.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/Resample.py +5 -5
- teradataml/data/docs/uaf/docs_17_20/SAX.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/SimpleExp.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Smoothma.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/UNDIFF.py +1 -1
- teradataml/data/jsons/sqle/17.20/NGramSplitter.json +6 -6
- teradataml/data/jsons/sqle/17.20/TD_Apriori.json +181 -0
- teradataml/data/jsons/sqle/17.20/TD_NERExtractor.json +145 -0
- teradataml/data/jsons/sqle/17.20/TD_SMOTE.json +267 -0
- teradataml/data/jsons/sqle/17.20/TD_TextMorph.json +134 -0
- teradataml/data/jsons/sqle/17.20/TD_TextParser.json +114 -9
- teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +420 -0
- teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +343 -0
- teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +359 -0
- teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +360 -0
- teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +343 -0
- teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +343 -0
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +2 -2
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +1 -1
- teradataml/data/ner_dict.csv +8 -0
- teradataml/data/ner_input_eng.csv +7 -0
- teradataml/data/ner_rule.csv +5 -0
- teradataml/data/pos_input.csv +40 -0
- teradataml/data/tdnerextractor_example.json +14 -0
- teradataml/data/teradataml_example.json +13 -0
- teradataml/data/textmorph_example.json +5 -0
- teradataml/data/to_num_data.csv +4 -0
- teradataml/data/tochar_data.csv +5 -0
- teradataml/data/trans_dense.csv +16 -0
- teradataml/data/trans_sparse.csv +55 -0
- teradataml/dataframe/copy_to.py +37 -26
- teradataml/dataframe/data_transfer.py +61 -45
- teradataml/dataframe/dataframe.py +130 -50
- teradataml/dataframe/dataframe_utils.py +15 -2
- teradataml/dataframe/functions.py +109 -9
- teradataml/dataframe/sql.py +328 -76
- teradataml/dbutils/dbutils.py +33 -13
- teradataml/dbutils/filemgr.py +14 -10
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/_base.py +6 -157
- teradataml/options/configure.py +4 -5
- teradataml/scriptmgmt/UserEnv.py +305 -38
- teradataml/scriptmgmt/lls_utils.py +376 -130
- teradataml/store/__init__.py +1 -1
- teradataml/table_operators/Apply.py +16 -1
- teradataml/table_operators/Script.py +20 -1
- teradataml/table_operators/table_operator_util.py +58 -9
- teradataml/utils/dtypes.py +2 -1
- teradataml/utils/internal_buffer.py +22 -2
- teradataml/utils/validators.py +313 -57
- {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.5.dist-info}/METADATA +89 -14
- {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.5.dist-info}/RECORD +107 -77
- {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.5.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.5.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.5.dist-info}/zip-safe +0 -0
|
@@ -243,6 +243,11 @@ class DataFrame():
|
|
|
243
243
|
# Property to determine if table is an ART table or not.
|
|
244
244
|
self._is_art = None
|
|
245
245
|
|
|
246
|
+
# This attribute stores the previous assign arguments in continuous assign calls.
|
|
247
|
+
self._previous_assign_args = None
|
|
248
|
+
# This attribute stores the root DataFrame columns.
|
|
249
|
+
self._root_columns = None
|
|
250
|
+
|
|
246
251
|
self._datalake = None
|
|
247
252
|
self._database = None
|
|
248
253
|
self._table = None
|
|
@@ -2924,9 +2929,8 @@ class DataFrame():
|
|
|
2924
2929
|
msg = Messages.get_message(errcode)
|
|
2925
2930
|
raise TeradataMlException(msg, errcode)
|
|
2926
2931
|
|
|
2927
|
-
@argument_deprecation("20.0.0.5", "include", False, None)
|
|
2928
2932
|
@collect_queryband(queryband="DF_describe")
|
|
2929
|
-
def describe(self, percentiles=[.25, .5, .75],
|
|
2933
|
+
def describe(self, percentiles=[.25, .5, .75], verbose=False, distinct=False, statistics=None,
|
|
2930
2934
|
columns=None, pivot=False):
|
|
2931
2935
|
"""
|
|
2932
2936
|
DESCRIPTION:
|
|
@@ -2956,18 +2960,6 @@ class DataFrame():
|
|
|
2956
2960
|
Default Values: [.25, .5, .75], which returns the 25th, 50th, and 75th percentiles.
|
|
2957
2961
|
Types: float or List of floats
|
|
2958
2962
|
|
|
2959
|
-
include:
|
|
2960
|
-
Optional Argument.
|
|
2961
|
-
Values can be either None or "all".
|
|
2962
|
-
If the value is "all", both numeric and non-numeric columns are included.
|
|
2963
|
-
Computes count, mean, std, min, percentiles, and max for numeric columns.
|
|
2964
|
-
Computes count and unique for non-numeric columns.
|
|
2965
|
-
If the value is None, only numeric columns are used for collecting statistics.
|
|
2966
|
-
Note:
|
|
2967
|
-
* Value 'all' is not applicable for 'Time Series Aggregate Mode'.
|
|
2968
|
-
Default Values: None
|
|
2969
|
-
Types: str
|
|
2970
|
-
|
|
2971
2963
|
verbose:
|
|
2972
2964
|
Optional Argument.
|
|
2973
2965
|
Specifies a boolean value to be used for time series aggregation, stating whether to get
|
|
@@ -2994,7 +2986,6 @@ class DataFrame():
|
|
|
2994
2986
|
Computes count and unique for non-numeric columns.
|
|
2995
2987
|
Notes:
|
|
2996
2988
|
1. statistics is not applicable for 'Time Series Aggregate Mode'.
|
|
2997
|
-
2. statistics should not be used with include as 'all'.
|
|
2998
2989
|
Permitted Values: count, mean, min, max, unique, std, describe, percentile
|
|
2999
2990
|
Default Values: None
|
|
3000
2991
|
Types: str or List of str
|
|
@@ -3310,7 +3301,6 @@ class DataFrame():
|
|
|
3310
3301
|
awu_matrix = []
|
|
3311
3302
|
awu_matrix.append(["columns", columns, True, (str, list), True])
|
|
3312
3303
|
awu_matrix.append(["percentiles", percentiles, True, (float, list)])
|
|
3313
|
-
awu_matrix.append(["include", include, True, (str), True, [None, "all"]])
|
|
3314
3304
|
awu_matrix.append(["verbose", verbose, True, (bool)])
|
|
3315
3305
|
awu_matrix.append(["distinct", distinct, True, (bool)])
|
|
3316
3306
|
awu_matrix.append(["statistics", statistics, True, (str, list), True,
|
|
@@ -3334,22 +3324,11 @@ class DataFrame():
|
|
|
3334
3324
|
if statistics:
|
|
3335
3325
|
statistics = [stats.lower() for stats in UtilFuncs._as_list(statistics)]
|
|
3336
3326
|
|
|
3337
|
-
# Argument include and statistics should not be used together
|
|
3338
|
-
if include is not None and statistics is not None:
|
|
3339
|
-
raise ValueError(Messages.get_message(MessageCodes.CANNOT_USE_TOGETHER_WITH).format(
|
|
3340
|
-
'include', 'statistics'
|
|
3341
|
-
))
|
|
3342
|
-
|
|
3343
3327
|
# Percentiles must be a list of values between 0 and 1.
|
|
3344
3328
|
if not isinstance(percentiles, list) or not all(p > 0 and p < 1 for p in percentiles):
|
|
3345
3329
|
raise ValueError(Messages.get_message(MessageCodes.INVALID_ARG_VALUE, percentiles, "percentiles",
|
|
3346
3330
|
"percentiles must be a list of values between 0 and 1"))
|
|
3347
3331
|
|
|
3348
|
-
# Argument 'include' with value 'all' is not allowed for DataFrameGroupByTime
|
|
3349
|
-
if include is not None and include.lower() == "all" and isinstance(self, DataFrameGroupByTime):
|
|
3350
|
-
raise ValueError(Messages.get_message(MessageCodes.ARG_VALUE_CLASS_DEPENDENCY).format(
|
|
3351
|
-
'include', 'Aggregation', 'all', 'describe()', 'DataFrame or DataFrameGroupBy'))
|
|
3352
|
-
|
|
3353
3332
|
# Argument 'statistics' is not allowed for DataFrameGroupByTime
|
|
3354
3333
|
if statistics is not None and isinstance(self, DataFrameGroupByTime):
|
|
3355
3334
|
raise ValueError(Messages.get_message(MessageCodes.ARG_VALUE_CLASS_DEPENDENCY).format(
|
|
@@ -3383,7 +3362,7 @@ class DataFrame():
|
|
|
3383
3362
|
# Construct the aggregate query.
|
|
3384
3363
|
agg_query = df_utils._construct_describe_query(df=self, columns=columns, metaexpr=self._metaexpr,
|
|
3385
3364
|
percentiles=percentiles, function_label=function_label,
|
|
3386
|
-
groupby_column_list=groupby_column_list, include=
|
|
3365
|
+
groupby_column_list=groupby_column_list, include=None,
|
|
3387
3366
|
is_time_series_aggregate=True, verbose=verbose,
|
|
3388
3367
|
distinct=distinct,
|
|
3389
3368
|
timebucket_duration=self._timebucket_duration,
|
|
@@ -3414,7 +3393,7 @@ class DataFrame():
|
|
|
3414
3393
|
# Construct the aggregate query.
|
|
3415
3394
|
agg_query = df_utils._construct_describe_query(df=self, columns=columns, metaexpr=self._metaexpr,
|
|
3416
3395
|
percentiles=percentiles, function_label=function_label,
|
|
3417
|
-
groupby_column_list=groupby_column_list, include=
|
|
3396
|
+
groupby_column_list=groupby_column_list, include=None,
|
|
3418
3397
|
is_time_series_aggregate=False, verbose=verbose,
|
|
3419
3398
|
distinct=distinct, statistics=statistics)
|
|
3420
3399
|
|
|
@@ -5570,8 +5549,10 @@ class DataFrame():
|
|
|
5570
5549
|
Specifies the function(s) to apply on DataFrame columns.
|
|
5571
5550
|
|
|
5572
5551
|
Valid values for func are:
|
|
5573
|
-
'count', 'sum', 'min', 'max', 'mean', 'std', 'percentile', 'unique',
|
|
5574
|
-
|
|
5552
|
+
* 'count', 'sum', 'min', 'max', 'mean', 'std', 'percentile', 'percentile_<floatvalue>', 'unique',
|
|
5553
|
+
'median', 'var'
|
|
5554
|
+
* Note: In 'percentile_<floatvalue>', <floatvalue> specifies the desired percentile value to
|
|
5555
|
+
calculate aggregate. It should be in the range of 0.0 to 1.0 (both inclusive).
|
|
5575
5556
|
|
|
5576
5557
|
Acceptable formats for function(s) are
|
|
5577
5558
|
string, dictionary, list of strings/functions/ColumnExpression or ColumnExpression.
|
|
@@ -5605,12 +5586,17 @@ class DataFrame():
|
|
|
5605
5586
|
Output column names after the above operation are:
|
|
5606
5587
|
min_employee_no, sum_employee_no, var_employee_no, min_first_name
|
|
5607
5588
|
|
|
5608
|
-
4. "
|
|
5589
|
+
4. "percentile_<floatvalue>" passed to agg.
|
|
5590
|
+
>>> df.agg({'employee_no' : ['percentile_0.25', 'percentile_0.75', 'min']})
|
|
5591
|
+
>>> df.agg(['percentile_0.25', 'percentile_0.75', 'sum'])
|
|
5592
|
+
>>> df.agg('percentile_0.25')
|
|
5593
|
+
|
|
5594
|
+
5. "func" passed as a ColumnExpression built using the aggregate functions.
|
|
5609
5595
|
>>> df.agg(df.first_name.count())
|
|
5610
5596
|
Output column name after the above operation is:
|
|
5611
5597
|
count(first_name)
|
|
5612
5598
|
|
|
5613
|
-
|
|
5599
|
+
6. "func" passed as a list of ColumnExpression built using the aggregate functions.
|
|
5614
5600
|
>>> df.agg([df.employee_no.min(), df.first_name.count()])
|
|
5615
5601
|
Output column names after the above operation are:
|
|
5616
5602
|
min(employee_no), count(first_name)
|
|
@@ -5698,6 +5684,12 @@ class DataFrame():
|
|
|
5698
5684
|
min_employee_no sum_employee_no var_employee_no min_first_name
|
|
5699
5685
|
0 100 313 44.333333 abcd
|
|
5700
5686
|
|
|
5687
|
+
# Get the minimum, 25 percentile value and variance of employee number, by passing dictionary of
|
|
5688
|
+
# column names to string function/list of string functions as parameter.
|
|
5689
|
+
>>> df.agg({'employee_no' : ['min', 'percentile_0.25', 'var']})
|
|
5690
|
+
min_employee_no percentile_0.25_employee_no var_employee_no
|
|
5691
|
+
0 100 100 44.333333
|
|
5692
|
+
|
|
5701
5693
|
# Get the minimum and sum of all the columns in the dataframe,
|
|
5702
5694
|
# by passing list of string functions as parameter.
|
|
5703
5695
|
>>> df.agg(['min', 'sum'])
|
|
@@ -5743,9 +5735,15 @@ class DataFrame():
|
|
|
5743
5735
|
mean_employee_no unique_employee_no unique_first_name mean_joined_date unique_joined_date
|
|
5744
5736
|
0 104.333333 3 2 60/12/04 2
|
|
5745
5737
|
|
|
5738
|
+
# Get the percentile of each column in the dataframe with default value 0.5.
|
|
5746
5739
|
>>> df.agg('percentile')
|
|
5747
|
-
|
|
5748
|
-
|
|
5740
|
+
percentile_employee_no percentile_marks
|
|
5741
|
+
0 101 None
|
|
5742
|
+
|
|
5743
|
+
# Get 80 percentile of each column in the datafame.
|
|
5744
|
+
>>> df.agg('percentile_0.8')
|
|
5745
|
+
percentile_0.8_employee_no percentile_0.8_marks
|
|
5746
|
+
0 107 None
|
|
5749
5747
|
|
|
5750
5748
|
# Using another table 'sales' (having repeated values) to demonstrate operations
|
|
5751
5749
|
# 'unique' and 'percentile'.
|
|
@@ -5762,9 +5760,11 @@ class DataFrame():
|
|
|
5762
5760
|
Blue Inc 90.0 50 95 101 2017-04-01
|
|
5763
5761
|
Red Inc 200.0 150 140 None 2017-04-01
|
|
5764
5762
|
|
|
5765
|
-
|
|
5766
|
-
|
|
5767
|
-
|
|
5763
|
+
# Get 80 and 40 percentile values of each column in the dataframe.
|
|
5764
|
+
>>> df1 = df.select(['Feb', 'Jan', 'Mar', 'Apr'])
|
|
5765
|
+
>>> df1.agg(['percentile_0.8', 'percentile_0.4'])
|
|
5766
|
+
percentile_0.8_Feb percentile_0.4_Feb percentile_0.8_Jan percentile_0.4_Jan percentile_0.8_Mar percentile_0.4_Mar percentile_0.8_Apr percentile_0.4_Apr
|
|
5767
|
+
0 210.0 200.0 170 150 170 140 250 194
|
|
5768
5768
|
|
|
5769
5769
|
>>> df.agg('unique')
|
|
5770
5770
|
unique_accounts unique_Feb unique_Jan unique_Mar unique_Apr unique_datetime
|
|
@@ -5951,6 +5951,8 @@ class DataFrame():
|
|
|
5951
5951
|
|
|
5952
5952
|
except TeradataMlException:
|
|
5953
5953
|
raise
|
|
5954
|
+
except ValueError:
|
|
5955
|
+
raise
|
|
5954
5956
|
except Exception as err:
|
|
5955
5957
|
raise TeradataMlException(Messages.get_message(
|
|
5956
5958
|
MessageCodes.EXECUTION_FAILED, "perform {} on DataFrame".format(operation), str(err)),
|
|
@@ -7760,7 +7762,7 @@ class DataFrame():
|
|
|
7760
7762
|
"""
|
|
7761
7763
|
return (type(None), int, float, str, decimal.Decimal, ColumnExpression, ClauseElement)
|
|
7762
7764
|
|
|
7763
|
-
def _generate_assign_metaexpr_aed_nodeid(self, drop_columns, **kwargs):
|
|
7765
|
+
def _generate_assign_metaexpr_aed_nodeid(self, drop_columns, node_id, **kwargs):
|
|
7764
7766
|
"""
|
|
7765
7767
|
DESCRIPTION:
|
|
7766
7768
|
Function generates the MetaExpression and AED nodeid for DataFrame.assign()
|
|
@@ -7773,6 +7775,11 @@ class DataFrame():
|
|
|
7773
7775
|
Default Value: False
|
|
7774
7776
|
Types: bool
|
|
7775
7777
|
|
|
7778
|
+
node_id:
|
|
7779
|
+
Optional Argument.
|
|
7780
|
+
Specifies the input nodeid for the assign operation.
|
|
7781
|
+
Types: str
|
|
7782
|
+
|
|
7776
7783
|
kwargs:
|
|
7777
7784
|
keyword, value pairs
|
|
7778
7785
|
- keywords are the column names.
|
|
@@ -7800,7 +7807,7 @@ class DataFrame():
|
|
|
7800
7807
|
|
|
7801
7808
|
# Join the expressions in result.
|
|
7802
7809
|
assign_expression = ', '.join(list(map(lambda x: x[1], result)))
|
|
7803
|
-
new_nodeid = self._aed_utils._aed_assign(
|
|
7810
|
+
new_nodeid = self._aed_utils._aed_assign(node_id,
|
|
7804
7811
|
assign_expression,
|
|
7805
7812
|
AEDConstants.AED_ASSIGN_DROP_EXISITING_COLUMNS.value)
|
|
7806
7813
|
|
|
@@ -7939,7 +7946,7 @@ class DataFrame():
|
|
|
7939
7946
|
env_mapper[env_name] = [colname]
|
|
7940
7947
|
else:
|
|
7941
7948
|
env_mapper[env_name] = udf_expr.keys()
|
|
7942
|
-
|
|
7949
|
+
debug = False
|
|
7943
7950
|
for env_name, cols in env_mapper.items():
|
|
7944
7951
|
# Create a dictionary of output columns to column type.
|
|
7945
7952
|
returns = OrderedDict([(column.name, column.type) for column in df._metaexpr.c])
|
|
@@ -7950,6 +7957,7 @@ class DataFrame():
|
|
|
7950
7957
|
# Create a dictionary of output column name to udf arguments
|
|
7951
7958
|
function_args = {}
|
|
7952
7959
|
for colname, col in udf_expr.items():
|
|
7960
|
+
debug |= col._debug
|
|
7953
7961
|
delimiter = col._delimiter
|
|
7954
7962
|
quotechar = col._quotechar
|
|
7955
7963
|
if colname in cols:
|
|
@@ -7982,7 +7990,9 @@ class DataFrame():
|
|
|
7982
7990
|
columns_definitions=columns_definitions,
|
|
7983
7991
|
output_type_converters={
|
|
7984
7992
|
col_name: _Dtypes._teradata_type_to_python_type(col_type)
|
|
7985
|
-
for col_name, col_type in returns.items()}
|
|
7993
|
+
for col_name, col_type in returns.items()},
|
|
7994
|
+
debug=debug
|
|
7995
|
+
)
|
|
7986
7996
|
|
|
7987
7997
|
df = tbl_operators.execute()
|
|
7988
7998
|
return df
|
|
@@ -8624,8 +8634,34 @@ class DataFrame():
|
|
|
8624
8634
|
# from udf expression.
|
|
8625
8635
|
if bool(regular_expr):
|
|
8626
8636
|
try:
|
|
8627
|
-
|
|
8637
|
+
root_node_id = None
|
|
8638
|
+
root_df_col = df.columns
|
|
8639
|
+
|
|
8640
|
+
# Get the previous node type, if it is assign and drop_columns is False,
|
|
8641
|
+
# then check if the previous assign arguments exists and are not present
|
|
8642
|
+
# in either the root dataframe columns or the current assign arguments.
|
|
8643
|
+
# if these conditions are met, obtain the root node id (i.e., the first
|
|
8644
|
+
# node of the assign operation) and merge the previous assign arguments with the current ones.
|
|
8645
|
+
|
|
8646
|
+
prev_node_type = df._aed_utils._aed_get_node_query_type(df._nodeid)
|
|
8647
|
+
if not drop_columns and prev_node_type == "assign" and df._previous_assign_args is not None:
|
|
8648
|
+
if not df._root_columns & df._previous_assign_args.keys() and \
|
|
8649
|
+
not df._previous_assign_args.keys() & regular_expr.keys():
|
|
8650
|
+
# Get the root node id and root dataframe columns.
|
|
8651
|
+
root_df_col = df._root_columns
|
|
8652
|
+
root_node_id = df._aed_utils._aed_get_parent_nodeids(df._nodeid)[0]
|
|
8653
|
+
regular_expr = {**df._previous_assign_args, **regular_expr}
|
|
8654
|
+
|
|
8655
|
+
# If root_node_id is None, assign the current node id as root node of assign operation
|
|
8656
|
+
node_id = root_node_id if root_node_id is not None else df._nodeid
|
|
8657
|
+
|
|
8658
|
+
# Generate new meta expression and node id for the new dataframe.
|
|
8659
|
+
(new_meta, new_nodeid) = df._generate_assign_metaexpr_aed_nodeid(
|
|
8660
|
+
drop_columns, node_id = node_id, **regular_expr)
|
|
8628
8661
|
df = df._create_dataframe_from_node(new_nodeid, new_meta, df._index_label)
|
|
8662
|
+
df._previous_assign_args = regular_expr
|
|
8663
|
+
df._root_columns = root_df_col
|
|
8664
|
+
|
|
8629
8665
|
except Exception as err:
|
|
8630
8666
|
errcode = MessageCodes.TDMLDF_INFO_ERROR
|
|
8631
8667
|
msg = Messages.get_message(MessageCodes.TDMLDF_INFO_ERROR)
|
|
@@ -11569,6 +11605,10 @@ class DataFrame():
|
|
|
11569
11605
|
DESCRIPTION:
|
|
11570
11606
|
Function to apply a user defined function to each row in the
|
|
11571
11607
|
teradataml DataFrame, leveraging Vantage's Script Table Operator.
|
|
11608
|
+
Notes:
|
|
11609
|
+
1. The function requires to use same Python version in both Vantage and local environment.
|
|
11610
|
+
2. Teradata recommends to use "dill" package with same version in both Vantage and
|
|
11611
|
+
local environment.
|
|
11572
11612
|
|
|
11573
11613
|
PARAMETERS:
|
|
11574
11614
|
user_function:
|
|
@@ -11749,6 +11789,15 @@ class DataFrame():
|
|
|
11749
11789
|
Default Value: True
|
|
11750
11790
|
Types: bool
|
|
11751
11791
|
|
|
11792
|
+
debug:
|
|
11793
|
+
Optional Argument.
|
|
11794
|
+
Specifies whether to display the script file path generated during function execution or not. This
|
|
11795
|
+
argument helps in debugging when there are any failures during function execution. When set
|
|
11796
|
+
to True, function displays the path of the script and does not remove the file from local file system.
|
|
11797
|
+
Otherwise, file is removed from the local file system.
|
|
11798
|
+
Default Value: False
|
|
11799
|
+
Types: bool
|
|
11800
|
+
|
|
11752
11801
|
RETURNS:
|
|
11753
11802
|
1. teradataml DataFrame if exec_mode is "IN-DB".
|
|
11754
11803
|
2. Pandas DataFrame if exec_mode is "LOCAL".
|
|
@@ -11901,6 +11950,7 @@ class DataFrame():
|
|
|
11901
11950
|
sort_ascending = kwargs.pop('sort_ascending', True)
|
|
11902
11951
|
auth = kwargs.pop('auth', None)
|
|
11903
11952
|
charset = kwargs.pop('charset', None)
|
|
11953
|
+
debug = kwargs.pop('debug', False)
|
|
11904
11954
|
|
|
11905
11955
|
# Check for other extra/unknown arguments.
|
|
11906
11956
|
unknown_args = list(kwargs.keys())
|
|
@@ -11919,7 +11969,7 @@ class DataFrame():
|
|
|
11919
11969
|
sort_ascending=sort_ascending,
|
|
11920
11970
|
returns=returns, delimiter=delimiter,
|
|
11921
11971
|
quotechar=quotechar, auth=auth,
|
|
11922
|
-
charset=charset, num_rows=num_rows)
|
|
11972
|
+
charset=charset, num_rows=num_rows, debug=debug)
|
|
11923
11973
|
|
|
11924
11974
|
return tbl_op_util.execute()
|
|
11925
11975
|
|
|
@@ -11936,6 +11986,10 @@ class DataFrame():
|
|
|
11936
11986
|
DESCRIPTION:
|
|
11937
11987
|
Function to apply a user defined function to a group or partition of rows
|
|
11938
11988
|
in the teradataml DataFrame, leveraging Vantage's Script Table Operator.
|
|
11989
|
+
Notes:
|
|
11990
|
+
1. The function requires to use same Python version in both Vantage and local environment.
|
|
11991
|
+
2. Teradata recommends to use "dill" package with same version in both Vantage and
|
|
11992
|
+
local environment.
|
|
11939
11993
|
|
|
11940
11994
|
PARAMETERS:
|
|
11941
11995
|
user_function:
|
|
@@ -12146,6 +12200,15 @@ class DataFrame():
|
|
|
12146
12200
|
Default Value: True
|
|
12147
12201
|
Types: bool
|
|
12148
12202
|
|
|
12203
|
+
debug:
|
|
12204
|
+
Optional Argument.
|
|
12205
|
+
Specifies whether to display the script file path generated during function execution or not. This
|
|
12206
|
+
argument helps in debugging when there are any failures during function execution. When set
|
|
12207
|
+
to True, function displays the path of the script and does not remove the file from local file system.
|
|
12208
|
+
Otherwise, file is removed from the local file system.
|
|
12209
|
+
Default Value: False
|
|
12210
|
+
Types: bool
|
|
12211
|
+
|
|
12149
12212
|
RETURNS:
|
|
12150
12213
|
1. teradataml DataFrame if exec_mode is "IN-DB".
|
|
12151
12214
|
2. Pandas DataFrame if exec_mode is "LOCAL".
|
|
@@ -12311,6 +12374,7 @@ class DataFrame():
|
|
|
12311
12374
|
sort_ascending = kwargs.pop('sort_ascending', True)
|
|
12312
12375
|
auth = kwargs.pop('auth', None)
|
|
12313
12376
|
charset = kwargs.pop('charset', None)
|
|
12377
|
+
debug = kwargs.pop('debug', False)
|
|
12314
12378
|
|
|
12315
12379
|
# Check for other extra/unknown arguments.
|
|
12316
12380
|
unknown_args = list(kwargs.keys())
|
|
@@ -12329,7 +12393,7 @@ class DataFrame():
|
|
|
12329
12393
|
sort_ascending=sort_ascending,
|
|
12330
12394
|
returns=returns, delimiter=delimiter,
|
|
12331
12395
|
quotechar=quotechar, auth=auth,
|
|
12332
|
-
charset=charset, num_rows=num_rows)
|
|
12396
|
+
charset=charset, num_rows=num_rows, debug=debug)
|
|
12333
12397
|
|
|
12334
12398
|
return tbl_op_util.execute()
|
|
12335
12399
|
|
|
@@ -12346,9 +12410,9 @@ class DataFrame():
|
|
|
12346
12410
|
teradataml DataFrame, leveraging Apply Table Operator of Open
|
|
12347
12411
|
Analytics Framework.
|
|
12348
12412
|
Notes:
|
|
12349
|
-
|
|
12350
|
-
|
|
12351
|
-
|
|
12413
|
+
1. The function requires to use same Python version in both remote environment and local environment.
|
|
12414
|
+
2. Teradata recommends to use "dill" package with same version in both remote environment and
|
|
12415
|
+
local environment.
|
|
12352
12416
|
|
|
12353
12417
|
PARAMETERS:
|
|
12354
12418
|
user_function:
|
|
@@ -12531,6 +12595,15 @@ class DataFrame():
|
|
|
12531
12595
|
Default value: "csv"
|
|
12532
12596
|
Types: str
|
|
12533
12597
|
|
|
12598
|
+
debug:
|
|
12599
|
+
Optional Argument.
|
|
12600
|
+
Specifies whether to display the script file path generated during function execution or not. This
|
|
12601
|
+
argument helps in debugging when there are any failures during function execution. When set
|
|
12602
|
+
to True, function displays the path of the script and does not remove the file from local file system.
|
|
12603
|
+
Otherwise, file is removed from the local file system.
|
|
12604
|
+
Default Value: False
|
|
12605
|
+
Types: bool
|
|
12606
|
+
|
|
12534
12607
|
RETURNS:
|
|
12535
12608
|
teradataml DataFrame.
|
|
12536
12609
|
|
|
@@ -12707,6 +12780,7 @@ class DataFrame():
|
|
|
12707
12780
|
is_local_order = kwargs.pop('is_local_order', False)
|
|
12708
12781
|
nulls_first = kwargs.pop('nulls_first', True)
|
|
12709
12782
|
sort_ascending = kwargs.pop('sort_ascending', True)
|
|
12783
|
+
debug = kwargs.pop('debug', False)
|
|
12710
12784
|
|
|
12711
12785
|
# Check for other extra/unknown arguments.
|
|
12712
12786
|
unknown_args = list(kwargs.keys())
|
|
@@ -12729,7 +12803,8 @@ class DataFrame():
|
|
|
12729
12803
|
charset=None,
|
|
12730
12804
|
num_rows=num_rows,
|
|
12731
12805
|
env_name=env_name,
|
|
12732
|
-
style=style
|
|
12806
|
+
style=style,
|
|
12807
|
+
debug=debug)
|
|
12733
12808
|
|
|
12734
12809
|
return tbl_op_util.execute()
|
|
12735
12810
|
|
|
@@ -15446,7 +15521,7 @@ class DataFrameGroupBy(DataFrame):
|
|
|
15446
15521
|
from sqlalchemy.sql.functions import Function
|
|
15447
15522
|
return (type(None), int, float, str, decimal.Decimal, Function, ColumnExpression, ClauseElement)
|
|
15448
15523
|
|
|
15449
|
-
def _generate_assign_metaexpr_aed_nodeid(self, drop_columns, **kwargs):
|
|
15524
|
+
def _generate_assign_metaexpr_aed_nodeid(self, drop_columns, node_id, **kwargs):
|
|
15450
15525
|
"""
|
|
15451
15526
|
DESCRIPTION:
|
|
15452
15527
|
Function generates the MetaExpression and AED nodeid for DataFrameGroupBy.assign()
|
|
@@ -15459,6 +15534,11 @@ class DataFrameGroupBy(DataFrame):
|
|
|
15459
15534
|
and grouping columns are returned. This is unused argument.
|
|
15460
15535
|
Types: bool
|
|
15461
15536
|
|
|
15537
|
+
node_id:
|
|
15538
|
+
Optional Argument.
|
|
15539
|
+
Specifies the input nodeid for the assign operation. This is unused argument.
|
|
15540
|
+
Types: str
|
|
15541
|
+
|
|
15462
15542
|
kwargs:
|
|
15463
15543
|
keyword, value pairs
|
|
15464
15544
|
- keywords are the column names.
|
|
@@ -652,7 +652,7 @@ class DataFrameUtils():
|
|
|
652
652
|
all_operations = list(set(all_operations))
|
|
653
653
|
invalid_aggregates = []
|
|
654
654
|
for operation in all_operations:
|
|
655
|
-
if operation not in valid_aggregate_operations \
|
|
655
|
+
if operation not in valid_aggregate_operations and not operation.startswith('percentile_') \
|
|
656
656
|
and operation not in UtilFuncs._get_valid_time_series_aggregate_operations():
|
|
657
657
|
invalid_aggregates.append(operation)
|
|
658
658
|
if len(invalid_aggregates) > 0: # If any of the aggregate operations specified is not valid
|
|
@@ -735,7 +735,20 @@ class DataFrameUtils():
|
|
|
735
735
|
quoted_columns = UtilFuncs._process_for_teradata_keyword(kwargs[key_to_process])
|
|
736
736
|
kwargs[key_to_process] = quoted_columns
|
|
737
737
|
|
|
738
|
-
|
|
738
|
+
if operation.startswith('percentile_'):
|
|
739
|
+
try:
|
|
740
|
+
_operation_value = operation.split('_')
|
|
741
|
+
_floatvalue = float(_operation_value[1])
|
|
742
|
+
if _floatvalue < 0.0 or _floatvalue > 1.0 or len(_operation_value)>2:
|
|
743
|
+
raise ValueError
|
|
744
|
+
except ValueError:
|
|
745
|
+
mssg = "Invalid aggregate operation '{}' requested on TeradataML DataFrame." \
|
|
746
|
+
" Valid operation should be in format 'percentile_<floatvalue>' and <floatvalue> " \
|
|
747
|
+
"should be in range [0.0, 1.0].".format(operation)
|
|
748
|
+
raise ValueError(mssg) from None
|
|
749
|
+
func_expression = getattr(df[column], 'percentile')(percentile=_floatvalue)
|
|
750
|
+
else:
|
|
751
|
+
func_expression = getattr(df[column], operation)(describe_op=describe_op, **kwargs)
|
|
739
752
|
new_column_name = column if describe_op else "{1}_{0}".format(column, operation)
|
|
740
753
|
# column_supported, new_column_name, new_column_type, column_aggr_expr, invalid_column_str
|
|
741
754
|
return True, new_column_name, NUMBER() if describe_op else func_expression.type, \
|
|
@@ -1,25 +1,24 @@
|
|
|
1
1
|
import pandas as pd
|
|
2
2
|
from inspect import getsource
|
|
3
3
|
import re
|
|
4
|
-
from
|
|
4
|
+
from teradataml.dataframe.copy_to import copy_to_sql
|
|
5
|
+
from teradataml.dataframe.dataframe import DataFrame
|
|
5
6
|
from teradataml.dbutils.filemgr import install_file, list_files, remove_file
|
|
6
|
-
from teradataml.
|
|
7
|
+
from teradataml.utils.utils import execute_sql
|
|
7
8
|
import teradatasqlalchemy as tdsqlalchemy
|
|
8
9
|
from teradataml.utils.validators import _Validators
|
|
9
10
|
from teradataml.dataframe.sql import _SQLColumnExpression
|
|
10
11
|
from teradatasqlalchemy import VARCHAR, CLOB, CHAR
|
|
11
|
-
from teradataml.common.constants import TeradataTypes
|
|
12
|
+
from teradataml.common.constants import TableOperatorConstants, TeradataConstants, TeradataTypes
|
|
12
13
|
from teradataml.common.utils import UtilFuncs
|
|
13
|
-
from teradataml.utils.dtypes import _Dtypes
|
|
14
14
|
from teradataml.dataframe.sql_interfaces import ColumnExpression
|
|
15
15
|
from teradataml.table_operators.table_operator_util import _TableOperatorUtils
|
|
16
|
-
from teradataml.utils.internal_buffer import _InternalBuffer
|
|
17
16
|
from teradataml.common.exceptions import TeradataMlException
|
|
18
17
|
from teradataml.common.messages import Messages
|
|
19
18
|
from teradataml.common.messagecodes import MessageCodes
|
|
20
19
|
from teradataml.scriptmgmt.lls_utils import get_env
|
|
21
20
|
|
|
22
|
-
def udf(user_function=None, returns=VARCHAR(1024), env_name = None, delimiter=',', quotechar=None):
|
|
21
|
+
def udf(user_function=None, returns=VARCHAR(1024), env_name = None, delimiter=',', quotechar=None, debug=False):
|
|
23
22
|
"""
|
|
24
23
|
DESCRIPTION:
|
|
25
24
|
Creates a user defined function (UDF).
|
|
@@ -85,6 +84,15 @@ def udf(user_function=None, returns=VARCHAR(1024), env_name = None, delimiter=',
|
|
|
85
84
|
* This argument cannot be same as "delimiter" argument.
|
|
86
85
|
* This argument cannot be a newline character.
|
|
87
86
|
|
|
87
|
+
debug:
|
|
88
|
+
Optional Argument.
|
|
89
|
+
Specifies whether to display the script file path generated during function execution or not. This
|
|
90
|
+
argument helps in debugging when there are any failures during function execution. When set
|
|
91
|
+
to True, function displays the path of the script and does not remove the file from local file system.
|
|
92
|
+
Otherwise, file is removed from the local file system.
|
|
93
|
+
Default Value: False
|
|
94
|
+
Types: bool
|
|
95
|
+
|
|
88
96
|
RETURNS:
|
|
89
97
|
ColumnExpression
|
|
90
98
|
|
|
@@ -324,14 +332,14 @@ def udf(user_function=None, returns=VARCHAR(1024), env_name = None, delimiter=',
|
|
|
324
332
|
def wrapper(f):
|
|
325
333
|
def func_(*args):
|
|
326
334
|
return _SQLColumnExpression(expression=None, udf=f, udf_type=returns, udf_args=args,\
|
|
327
|
-
env_name=env_name, delimiter=delimiter, quotechar=quotechar)
|
|
335
|
+
env_name=env_name, delimiter=delimiter, quotechar=quotechar, debug=debug)
|
|
328
336
|
return func_
|
|
329
337
|
return wrapper
|
|
330
338
|
# Notation: @udf
|
|
331
339
|
else:
|
|
332
340
|
def func_(*args):
|
|
333
341
|
return _SQLColumnExpression(expression=None, udf=user_function, udf_type=returns, udf_args=args,\
|
|
334
|
-
env_name=env_name, delimiter=delimiter, quotechar=quotechar)
|
|
342
|
+
env_name=env_name, delimiter=delimiter, quotechar=quotechar, debug=debug)
|
|
335
343
|
return func_
|
|
336
344
|
|
|
337
345
|
|
|
@@ -879,4 +887,96 @@ def _create_return_type(returns):
|
|
|
879
887
|
return_str = str(returns)
|
|
880
888
|
# Replace the space with underscore in the return type.
|
|
881
889
|
return_str = return_str.replace(" ", "_")
|
|
882
|
-
return return_str
|
|
890
|
+
return return_str
|
|
891
|
+
|
|
892
|
+
def td_range(start, end=None, step=1):
    """
    DESCRIPTION:
        Creates a teradataml DataFrame with a specified range of numbers
        in a single column named 'id'.

    Notes:
        1. The range is inclusive of the start and exclusive of the end.
        2. If only start is provided, then end is set to start and start is set to 0.
        3. Only the magnitude of "step" is used; the direction of iteration
           is derived from the relative order of "start" and "end".

    PARAMETERS:
        start:
            Required Argument.
            Specifies the starting number of the range.
            Types: int

        end:
            Optional Argument.
            Specifies the end number of the range(exclusive).
            Default Value: None
            Types: int

        step:
            Optional Argument.
            Specifies the step size of the range. Must be a non-zero integer.
            Default Value: 1
            Types: int

    RETURNS:
        teradataml DataFrame

    RAISES:
        TeradataMlException, ValueError

    EXAMPLES:
        # Example 1: Create a DataFrame with a range of numbers from 0 to 5.
        >>> from teradataml.dataframe.functions import td_range
        >>> df = td_range(5)
        >>> df.sort('id')
           id
        0   0
        1   1
        2   2
        3   3
        4   4

        # Example 2: Create a DataFrame with a range of numbers from 5 to 1 with step size of -2.
        >>> from teradataml.dataframe.functions import td_range
        >>> td_range(5, 1, -2)
           id
        0   3
        1   5

        # Example 3: Create a DataFrame with a range of numbers from 1 to 5 with default step size of 1.
        >>> from teradataml.dataframe.functions import td_range
        >>> td_range(1, 5)
           id
        0   3
        1   4
        2   2
        3   1

    """
    # Validate the argument types.
    arg_matrix = []
    arg_matrix.append(["start", start, False, int])
    arg_matrix.append(["end", end, True, int])
    arg_matrix.append(["step", step, True, int])
    _Validators._validate_function_arguments(arg_matrix)

    # A zero step can never advance the generated id values toward "end",
    # so the generated query would not terminate meaningfully; reject early.
    if step == 0:
        raise ValueError("Invalid value 0 for argument 'step'. "
                         "'step' must be a non-zero integer.")

    # If only start is provided, then set end to start and start to 0.
    if end is None:
        end = start
        start = 0

    # The iteration direction is decided below from the start/end ordering,
    # so only the magnitude of the user-supplied step matters. Taking the
    # absolute value keeps the generated SQL identical for correctly signed
    # calls (e.g. td_range(5, 1, -2)) and also fixes calls where the sign of
    # "step" disagrees with the direction (e.g. td_range(5, 1, 2)), which
    # previously produced an increment moving away from "end".
    step = abs(step)

    # Descending range: subtract the step while the value stays above "end".
    # Ascending range: add the step while the value stays below "end".
    if end < start:
        operation, operator = "-", ">"
    else:
        operation, operator = "+", "<"

    # Create a temporary table seeded with a single row holding the start
    # value; the range query expands it using the operation/operator above.
    # NOTE(review): the table is not dropped here — presumably cleaned up by
    # teradataml's temp-object garbage collection; confirm.
    table_name = UtilFuncs._generate_temp_table_name(prefix="tdml_range_df",
                                                     table_type=TeradataConstants.TERADATA_TABLE)
    execute_sql(f"CREATE MULTISET TABLE {table_name} AS (SELECT {start} AS id) WITH DATA;")

    # Create a DataFrame from the range query.
    range_query = TableOperatorConstants.RANGE_QUERY.value \
        .format(table_name, step, end, operation, operator)
    df = DataFrame.from_query(range_query)
    return df
|