teradataml 20.0.0.4__py3-none-any.whl → 20.0.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of teradataml might be problematic. Click here for more details.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/README.md +86 -13
- teradataml/__init__.py +2 -1
- teradataml/_version.py +2 -2
- teradataml/analytics/analytic_function_executor.py +7 -12
- teradataml/analytics/json_parser/analytic_functions_argument.py +4 -0
- teradataml/analytics/sqle/__init__.py +16 -1
- teradataml/analytics/utils.py +15 -1
- teradataml/automl/__init__.py +290 -106
- teradataml/automl/autodataprep/__init__.py +471 -0
- teradataml/automl/data_preparation.py +29 -10
- teradataml/automl/data_transformation.py +11 -0
- teradataml/automl/feature_engineering.py +64 -4
- teradataml/automl/feature_exploration.py +639 -25
- teradataml/automl/model_training.py +1 -1
- teradataml/clients/auth_client.py +2 -2
- teradataml/common/constants.py +61 -26
- teradataml/common/messagecodes.py +2 -1
- teradataml/common/messages.py +5 -4
- teradataml/common/utils.py +255 -37
- teradataml/context/context.py +225 -87
- teradataml/data/apriori_example.json +22 -0
- teradataml/data/docs/sqle/docs_17_20/Apriori.py +138 -0
- teradataml/data/docs/sqle/docs_17_20/NERExtractor.py +121 -0
- teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +3 -3
- teradataml/data/docs/sqle/docs_17_20/SMOTE.py +212 -0
- teradataml/data/docs/sqle/docs_17_20/TextMorph.py +119 -0
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +54 -3
- teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/DFFT.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFT2.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFTConv.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +6 -6
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/MultivarRegr.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/PACF.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/PowerTransform.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/Resample.py +5 -5
- teradataml/data/docs/uaf/docs_17_20/SAX.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/SimpleExp.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Smoothma.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/UNDIFF.py +1 -1
- teradataml/data/jsons/sqle/17.20/NGramSplitter.json +6 -6
- teradataml/data/jsons/sqle/17.20/TD_Apriori.json +181 -0
- teradataml/data/jsons/sqle/17.20/TD_NERExtractor.json +145 -0
- teradataml/data/jsons/sqle/17.20/TD_SMOTE.json +267 -0
- teradataml/data/jsons/sqle/17.20/TD_TextMorph.json +134 -0
- teradataml/data/jsons/sqle/17.20/TD_TextParser.json +114 -9
- teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +420 -0
- teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +343 -0
- teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +359 -0
- teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +360 -0
- teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +343 -0
- teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +343 -0
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +2 -2
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +1 -1
- teradataml/data/ner_dict.csv +8 -0
- teradataml/data/ner_input_eng.csv +7 -0
- teradataml/data/ner_rule.csv +5 -0
- teradataml/data/pos_input.csv +40 -0
- teradataml/data/tdnerextractor_example.json +14 -0
- teradataml/data/teradataml_example.json +13 -0
- teradataml/data/textmorph_example.json +5 -0
- teradataml/data/to_num_data.csv +4 -0
- teradataml/data/tochar_data.csv +5 -0
- teradataml/data/trans_dense.csv +16 -0
- teradataml/data/trans_sparse.csv +55 -0
- teradataml/dataframe/copy_to.py +37 -26
- teradataml/dataframe/data_transfer.py +61 -45
- teradataml/dataframe/dataframe.py +130 -50
- teradataml/dataframe/dataframe_utils.py +15 -2
- teradataml/dataframe/functions.py +109 -9
- teradataml/dataframe/sql.py +328 -76
- teradataml/dbutils/dbutils.py +33 -13
- teradataml/dbutils/filemgr.py +14 -10
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/_base.py +6 -157
- teradataml/options/configure.py +4 -5
- teradataml/scriptmgmt/UserEnv.py +305 -38
- teradataml/scriptmgmt/lls_utils.py +376 -130
- teradataml/store/__init__.py +1 -1
- teradataml/table_operators/Apply.py +16 -1
- teradataml/table_operators/Script.py +20 -1
- teradataml/table_operators/table_operator_util.py +58 -9
- teradataml/utils/dtypes.py +2 -1
- teradataml/utils/internal_buffer.py +22 -2
- teradataml/utils/validators.py +313 -57
- {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.5.dist-info}/METADATA +89 -14
- {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.5.dist-info}/RECORD +107 -77
- {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.5.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.5.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.5.dist-info}/zip-safe +0 -0
|
@@ -243,6 +243,11 @@ class DataFrame():
|
|
|
243
243
|
# Property to determine if table is an ART table or not.
|
|
244
244
|
self._is_art = None
|
|
245
245
|
|
|
246
|
+
# This attribute stores the previous assign arguments in continuous assign calls.
|
|
247
|
+
self._previous_assign_args = None
|
|
248
|
+
# This attribute stores the root DataFrame columns.
|
|
249
|
+
self._root_columns = None
|
|
250
|
+
|
|
246
251
|
self._datalake = None
|
|
247
252
|
self._database = None
|
|
248
253
|
self._table = None
|
|
@@ -2924,9 +2929,8 @@ class DataFrame():
|
|
|
2924
2929
|
msg = Messages.get_message(errcode)
|
|
2925
2930
|
raise TeradataMlException(msg, errcode)
|
|
2926
2931
|
|
|
2927
|
-
@argument_deprecation("20.0.0.5", "include", False, None)
|
|
2928
2932
|
@collect_queryband(queryband="DF_describe")
|
|
2929
|
-
def describe(self, percentiles=[.25, .5, .75],
|
|
2933
|
+
def describe(self, percentiles=[.25, .5, .75], verbose=False, distinct=False, statistics=None,
|
|
2930
2934
|
columns=None, pivot=False):
|
|
2931
2935
|
"""
|
|
2932
2936
|
DESCRIPTION:
|
|
@@ -2956,18 +2960,6 @@ class DataFrame():
|
|
|
2956
2960
|
Default Values: [.25, .5, .75], which returns the 25th, 50th, and 75th percentiles.
|
|
2957
2961
|
Types: float or List of floats
|
|
2958
2962
|
|
|
2959
|
-
include:
|
|
2960
|
-
Optional Argument.
|
|
2961
|
-
Values can be either None or "all".
|
|
2962
|
-
If the value is "all", both numeric and non-numeric columns are included.
|
|
2963
|
-
Computes count, mean, std, min, percentiles, and max for numeric columns.
|
|
2964
|
-
Computes count and unique for non-numeric columns.
|
|
2965
|
-
If the value is None, only numeric columns are used for collecting statistics.
|
|
2966
|
-
Note:
|
|
2967
|
-
* Value 'all' is not applicable for 'Time Series Aggregate Mode'.
|
|
2968
|
-
Default Values: None
|
|
2969
|
-
Types: str
|
|
2970
|
-
|
|
2971
2963
|
verbose:
|
|
2972
2964
|
Optional Argument.
|
|
2973
2965
|
Specifies a boolean value to be used for time series aggregation, stating whether to get
|
|
@@ -2994,7 +2986,6 @@ class DataFrame():
|
|
|
2994
2986
|
Computes count and unique for non-numeric columns.
|
|
2995
2987
|
Notes:
|
|
2996
2988
|
1. statistics is not applicable for 'Time Series Aggregate Mode'.
|
|
2997
|
-
2. statistics should not be used with include as 'all'.
|
|
2998
2989
|
Permitted Values: count, mean, min, max, unique, std, describe, percentile
|
|
2999
2990
|
Default Values: None
|
|
3000
2991
|
Types: str or List of str
|
|
@@ -3310,7 +3301,6 @@ class DataFrame():
|
|
|
3310
3301
|
awu_matrix = []
|
|
3311
3302
|
awu_matrix.append(["columns", columns, True, (str, list), True])
|
|
3312
3303
|
awu_matrix.append(["percentiles", percentiles, True, (float, list)])
|
|
3313
|
-
awu_matrix.append(["include", include, True, (str), True, [None, "all"]])
|
|
3314
3304
|
awu_matrix.append(["verbose", verbose, True, (bool)])
|
|
3315
3305
|
awu_matrix.append(["distinct", distinct, True, (bool)])
|
|
3316
3306
|
awu_matrix.append(["statistics", statistics, True, (str, list), True,
|
|
@@ -3334,22 +3324,11 @@ class DataFrame():
|
|
|
3334
3324
|
if statistics:
|
|
3335
3325
|
statistics = [stats.lower() for stats in UtilFuncs._as_list(statistics)]
|
|
3336
3326
|
|
|
3337
|
-
# Argument include and statistics should not be used together
|
|
3338
|
-
if include is not None and statistics is not None:
|
|
3339
|
-
raise ValueError(Messages.get_message(MessageCodes.CANNOT_USE_TOGETHER_WITH).format(
|
|
3340
|
-
'include', 'statistics'
|
|
3341
|
-
))
|
|
3342
|
-
|
|
3343
3327
|
# Percentiles must be a list of values between 0 and 1.
|
|
3344
3328
|
if not isinstance(percentiles, list) or not all(p > 0 and p < 1 for p in percentiles):
|
|
3345
3329
|
raise ValueError(Messages.get_message(MessageCodes.INVALID_ARG_VALUE, percentiles, "percentiles",
|
|
3346
3330
|
"percentiles must be a list of values between 0 and 1"))
|
|
3347
3331
|
|
|
3348
|
-
# Argument 'include' with value 'all' is not allowed for DataFrameGroupByTime
|
|
3349
|
-
if include is not None and include.lower() == "all" and isinstance(self, DataFrameGroupByTime):
|
|
3350
|
-
raise ValueError(Messages.get_message(MessageCodes.ARG_VALUE_CLASS_DEPENDENCY).format(
|
|
3351
|
-
'include', 'Aggregation', 'all', 'describe()', 'DataFrame or DataFrameGroupBy'))
|
|
3352
|
-
|
|
3353
3332
|
# Argument 'statistics' is not allowed for DataFrameGroupByTime
|
|
3354
3333
|
if statistics is not None and isinstance(self, DataFrameGroupByTime):
|
|
3355
3334
|
raise ValueError(Messages.get_message(MessageCodes.ARG_VALUE_CLASS_DEPENDENCY).format(
|
|
@@ -3383,7 +3362,7 @@ class DataFrame():
|
|
|
3383
3362
|
# Construct the aggregate query.
|
|
3384
3363
|
agg_query = df_utils._construct_describe_query(df=self, columns=columns, metaexpr=self._metaexpr,
|
|
3385
3364
|
percentiles=percentiles, function_label=function_label,
|
|
3386
|
-
groupby_column_list=groupby_column_list, include=
|
|
3365
|
+
groupby_column_list=groupby_column_list, include=None,
|
|
3387
3366
|
is_time_series_aggregate=True, verbose=verbose,
|
|
3388
3367
|
distinct=distinct,
|
|
3389
3368
|
timebucket_duration=self._timebucket_duration,
|
|
@@ -3414,7 +3393,7 @@ class DataFrame():
|
|
|
3414
3393
|
# Construct the aggregate query.
|
|
3415
3394
|
agg_query = df_utils._construct_describe_query(df=self, columns=columns, metaexpr=self._metaexpr,
|
|
3416
3395
|
percentiles=percentiles, function_label=function_label,
|
|
3417
|
-
groupby_column_list=groupby_column_list, include=
|
|
3396
|
+
groupby_column_list=groupby_column_list, include=None,
|
|
3418
3397
|
is_time_series_aggregate=False, verbose=verbose,
|
|
3419
3398
|
distinct=distinct, statistics=statistics)
|
|
3420
3399
|
|
|
@@ -5570,8 +5549,10 @@ class DataFrame():
|
|
|
5570
5549
|
Specifies the function(s) to apply on DataFrame columns.
|
|
5571
5550
|
|
|
5572
5551
|
Valid values for func are:
|
|
5573
|
-
'count', 'sum', 'min', 'max', 'mean', 'std', 'percentile', 'unique',
|
|
5574
|
-
|
|
5552
|
+
* 'count', 'sum', 'min', 'max', 'mean', 'std', 'percentile', 'percentile_<floatvalue>', 'unique',
|
|
5553
|
+
'median', 'var'
|
|
5554
|
+
* Note: In 'percentile_<floatvalue>', <floatvalue> specifies the desired percentile value to
|
|
5555
|
+
calculate aggregate. It should be in the range of 0.0 to 1.0 (both inclusive).
|
|
5575
5556
|
|
|
5576
5557
|
Acceptable formats for function(s) are
|
|
5577
5558
|
string, dictionary, list of strings/functions/ColumnExpression or ColumnExpression.
|
|
@@ -5605,12 +5586,17 @@ class DataFrame():
|
|
|
5605
5586
|
Output column names after the above operation are:
|
|
5606
5587
|
min_employee_no, sum_employee_no, var_employee_no, min_first_name
|
|
5607
5588
|
|
|
5608
|
-
4. "
|
|
5589
|
+
4. "percentile_<floatvalue>" passed to agg.
|
|
5590
|
+
>>> df.agg({'employee_no' : ['percentile_0.25', 'percentile_0.75', 'min']})
|
|
5591
|
+
>>> df.agg(['percentile_0.25', 'percentile_0.75', 'sum'])
|
|
5592
|
+
>>> df.agg('percentile_0.25')
|
|
5593
|
+
|
|
5594
|
+
5. "func" passed as a ColumnExpression built using the aggregate functions.
|
|
5609
5595
|
>>> df.agg(df.first_name.count())
|
|
5610
5596
|
Output column name after the above operation is:
|
|
5611
5597
|
count(first_name)
|
|
5612
5598
|
|
|
5613
|
-
|
|
5599
|
+
6. "func" passed as a list of ColumnExpression built using the aggregate functions.
|
|
5614
5600
|
>>> df.agg([df.employee_no.min(), df.first_name.count()])
|
|
5615
5601
|
Output column names after the above operation are:
|
|
5616
5602
|
min(employee_no), count(first_name)
|
|
@@ -5698,6 +5684,12 @@ class DataFrame():
|
|
|
5698
5684
|
min_employee_no sum_employee_no var_employee_no min_first_name
|
|
5699
5685
|
0 100 313 44.333333 abcd
|
|
5700
5686
|
|
|
5687
|
+
# Get the minimum, 25 percentile value and variance of employee number, by passing dictionary of
|
|
5688
|
+
# column names to string function/list of string functions as parameter.
|
|
5689
|
+
>>> df.agg({'employee_no' : ['min', 'percentile_0.25', 'var']})
|
|
5690
|
+
min_employee_no percentile_0.25_employee_no var_employee_no
|
|
5691
|
+
0 100 100 44.333333
|
|
5692
|
+
|
|
5701
5693
|
# Get the minimum and sum of all the columns in the dataframe,
|
|
5702
5694
|
# by passing list of string functions as parameter.
|
|
5703
5695
|
>>> df.agg(['min', 'sum'])
|
|
@@ -5743,9 +5735,15 @@ class DataFrame():
|
|
|
5743
5735
|
mean_employee_no unique_employee_no unique_first_name mean_joined_date unique_joined_date
|
|
5744
5736
|
0 104.333333 3 2 60/12/04 2
|
|
5745
5737
|
|
|
5738
|
+
# Get the percentile of each column in the dataframe with default value 0.5.
|
|
5746
5739
|
>>> df.agg('percentile')
|
|
5747
|
-
|
|
5748
|
-
|
|
5740
|
+
percentile_employee_no percentile_marks
|
|
5741
|
+
0 101 None
|
|
5742
|
+
|
|
5743
|
+
# Get 80 percentile of each column in the datafame.
|
|
5744
|
+
>>> df.agg('percentile_0.8')
|
|
5745
|
+
percentile_0.8_employee_no percentile_0.8_marks
|
|
5746
|
+
0 107 None
|
|
5749
5747
|
|
|
5750
5748
|
# Using another table 'sales' (having repeated values) to demonstrate operations
|
|
5751
5749
|
# 'unique' and 'percentile'.
|
|
@@ -5762,9 +5760,11 @@ class DataFrame():
|
|
|
5762
5760
|
Blue Inc 90.0 50 95 101 2017-04-01
|
|
5763
5761
|
Red Inc 200.0 150 140 None 2017-04-01
|
|
5764
5762
|
|
|
5765
|
-
|
|
5766
|
-
|
|
5767
|
-
|
|
5763
|
+
# Get 80 and 40 percentile values of each column in the dataframe.
|
|
5764
|
+
>>> df1 = df.select(['Feb', 'Jan', 'Mar', 'Apr'])
|
|
5765
|
+
>>> df1.agg(['percentile_0.8', 'percentile_0.4'])
|
|
5766
|
+
percentile_0.8_Feb percentile_0.4_Feb percentile_0.8_Jan percentile_0.4_Jan percentile_0.8_Mar percentile_0.4_Mar percentile_0.8_Apr percentile_0.4_Apr
|
|
5767
|
+
0 210.0 200.0 170 150 170 140 250 194
|
|
5768
5768
|
|
|
5769
5769
|
>>> df.agg('unique')
|
|
5770
5770
|
unique_accounts unique_Feb unique_Jan unique_Mar unique_Apr unique_datetime
|
|
@@ -5951,6 +5951,8 @@ class DataFrame():
|
|
|
5951
5951
|
|
|
5952
5952
|
except TeradataMlException:
|
|
5953
5953
|
raise
|
|
5954
|
+
except ValueError:
|
|
5955
|
+
raise
|
|
5954
5956
|
except Exception as err:
|
|
5955
5957
|
raise TeradataMlException(Messages.get_message(
|
|
5956
5958
|
MessageCodes.EXECUTION_FAILED, "perform {} on DataFrame".format(operation), str(err)),
|
|
@@ -7760,7 +7762,7 @@ class DataFrame():
|
|
|
7760
7762
|
"""
|
|
7761
7763
|
return (type(None), int, float, str, decimal.Decimal, ColumnExpression, ClauseElement)
|
|
7762
7764
|
|
|
7763
|
-
def _generate_assign_metaexpr_aed_nodeid(self, drop_columns, **kwargs):
|
|
7765
|
+
def _generate_assign_metaexpr_aed_nodeid(self, drop_columns, node_id, **kwargs):
|
|
7764
7766
|
"""
|
|
7765
7767
|
DESCRIPTION:
|
|
7766
7768
|
Function generates the MetaExpression and AED nodeid for DataFrame.assign()
|
|
@@ -7773,6 +7775,11 @@ class DataFrame():
|
|
|
7773
7775
|
Default Value: False
|
|
7774
7776
|
Types: bool
|
|
7775
7777
|
|
|
7778
|
+
node_id:
|
|
7779
|
+
Optional Argument.
|
|
7780
|
+
Specifies the input nodeid for the assign operation.
|
|
7781
|
+
Types: str
|
|
7782
|
+
|
|
7776
7783
|
kwargs:
|
|
7777
7784
|
keyword, value pairs
|
|
7778
7785
|
- keywords are the column names.
|
|
@@ -7800,7 +7807,7 @@ class DataFrame():
|
|
|
7800
7807
|
|
|
7801
7808
|
# Join the expressions in result.
|
|
7802
7809
|
assign_expression = ', '.join(list(map(lambda x: x[1], result)))
|
|
7803
|
-
new_nodeid = self._aed_utils._aed_assign(
|
|
7810
|
+
new_nodeid = self._aed_utils._aed_assign(node_id,
|
|
7804
7811
|
assign_expression,
|
|
7805
7812
|
AEDConstants.AED_ASSIGN_DROP_EXISITING_COLUMNS.value)
|
|
7806
7813
|
|
|
@@ -7939,7 +7946,7 @@ class DataFrame():
|
|
|
7939
7946
|
env_mapper[env_name] = [colname]
|
|
7940
7947
|
else:
|
|
7941
7948
|
env_mapper[env_name] = udf_expr.keys()
|
|
7942
|
-
|
|
7949
|
+
debug = False
|
|
7943
7950
|
for env_name, cols in env_mapper.items():
|
|
7944
7951
|
# Create a dictionary of output columns to column type.
|
|
7945
7952
|
returns = OrderedDict([(column.name, column.type) for column in df._metaexpr.c])
|
|
@@ -7950,6 +7957,7 @@ class DataFrame():
|
|
|
7950
7957
|
# Create a dictionary of output column name to udf arguments
|
|
7951
7958
|
function_args = {}
|
|
7952
7959
|
for colname, col in udf_expr.items():
|
|
7960
|
+
debug |= col._debug
|
|
7953
7961
|
delimiter = col._delimiter
|
|
7954
7962
|
quotechar = col._quotechar
|
|
7955
7963
|
if colname in cols:
|
|
@@ -7982,7 +7990,9 @@ class DataFrame():
|
|
|
7982
7990
|
columns_definitions=columns_definitions,
|
|
7983
7991
|
output_type_converters={
|
|
7984
7992
|
col_name: _Dtypes._teradata_type_to_python_type(col_type)
|
|
7985
|
-
for col_name, col_type in returns.items()}
|
|
7993
|
+
for col_name, col_type in returns.items()},
|
|
7994
|
+
debug=debug
|
|
7995
|
+
)
|
|
7986
7996
|
|
|
7987
7997
|
df = tbl_operators.execute()
|
|
7988
7998
|
return df
|
|
@@ -8624,8 +8634,34 @@ class DataFrame():
|
|
|
8624
8634
|
# from udf expression.
|
|
8625
8635
|
if bool(regular_expr):
|
|
8626
8636
|
try:
|
|
8627
|
-
|
|
8637
|
+
root_node_id = None
|
|
8638
|
+
root_df_col = df.columns
|
|
8639
|
+
|
|
8640
|
+
# Get the previous node type, if it is assign and drop_columns is False,
|
|
8641
|
+
# then check if the previous assign arguments exists and are not present
|
|
8642
|
+
# in either the root dataframe columns or the current assign arguments.
|
|
8643
|
+
# if these conditions are met, obtain the root node id (i.e., the first
|
|
8644
|
+
# node of the assign operation) and merge the previous assign arguments with the current ones.
|
|
8645
|
+
|
|
8646
|
+
prev_node_type = df._aed_utils._aed_get_node_query_type(df._nodeid)
|
|
8647
|
+
if not drop_columns and prev_node_type == "assign" and df._previous_assign_args is not None:
|
|
8648
|
+
if not df._root_columns & df._previous_assign_args.keys() and \
|
|
8649
|
+
not df._previous_assign_args.keys() & regular_expr.keys():
|
|
8650
|
+
# Get the root node id and root dataframe columns.
|
|
8651
|
+
root_df_col = df._root_columns
|
|
8652
|
+
root_node_id = df._aed_utils._aed_get_parent_nodeids(df._nodeid)[0]
|
|
8653
|
+
regular_expr = {**df._previous_assign_args, **regular_expr}
|
|
8654
|
+
|
|
8655
|
+
# If root_node_id is None, assign the current node id as root node of assign operation
|
|
8656
|
+
node_id = root_node_id if root_node_id is not None else df._nodeid
|
|
8657
|
+
|
|
8658
|
+
# Generate new meta expression and node id for the new dataframe.
|
|
8659
|
+
(new_meta, new_nodeid) = df._generate_assign_metaexpr_aed_nodeid(
|
|
8660
|
+
drop_columns, node_id = node_id, **regular_expr)
|
|
8628
8661
|
df = df._create_dataframe_from_node(new_nodeid, new_meta, df._index_label)
|
|
8662
|
+
df._previous_assign_args = regular_expr
|
|
8663
|
+
df._root_columns = root_df_col
|
|
8664
|
+
|
|
8629
8665
|
except Exception as err:
|
|
8630
8666
|
errcode = MessageCodes.TDMLDF_INFO_ERROR
|
|
8631
8667
|
msg = Messages.get_message(MessageCodes.TDMLDF_INFO_ERROR)
|
|
@@ -11569,6 +11605,10 @@ class DataFrame():
|
|
|
11569
11605
|
DESCRIPTION:
|
|
11570
11606
|
Function to apply a user defined function to each row in the
|
|
11571
11607
|
teradataml DataFrame, leveraging Vantage's Script Table Operator.
|
|
11608
|
+
Notes:
|
|
11609
|
+
1. The function requires to use same Python version in both Vantage and local environment.
|
|
11610
|
+
2. Teradata recommends to use "dill" package with same version in both Vantage and
|
|
11611
|
+
local environment.
|
|
11572
11612
|
|
|
11573
11613
|
PARAMETERS:
|
|
11574
11614
|
user_function:
|
|
@@ -11749,6 +11789,15 @@ class DataFrame():
|
|
|
11749
11789
|
Default Value: True
|
|
11750
11790
|
Types: bool
|
|
11751
11791
|
|
|
11792
|
+
debug:
|
|
11793
|
+
Optional Argument.
|
|
11794
|
+
Specifies whether to display the script file path generated during function execution or not. This
|
|
11795
|
+
argument helps in debugging when there are any failures during function execution. When set
|
|
11796
|
+
to True, function displays the path of the script and does not remove the file from local file system.
|
|
11797
|
+
Otherwise, file is removed from the local file system.
|
|
11798
|
+
Default Value: False
|
|
11799
|
+
Types: bool
|
|
11800
|
+
|
|
11752
11801
|
RETURNS:
|
|
11753
11802
|
1. teradataml DataFrame if exec_mode is "IN-DB".
|
|
11754
11803
|
2. Pandas DataFrame if exec_mode is "LOCAL".
|
|
@@ -11901,6 +11950,7 @@ class DataFrame():
|
|
|
11901
11950
|
sort_ascending = kwargs.pop('sort_ascending', True)
|
|
11902
11951
|
auth = kwargs.pop('auth', None)
|
|
11903
11952
|
charset = kwargs.pop('charset', None)
|
|
11953
|
+
debug = kwargs.pop('debug', False)
|
|
11904
11954
|
|
|
11905
11955
|
# Check for other extra/unknown arguments.
|
|
11906
11956
|
unknown_args = list(kwargs.keys())
|
|
@@ -11919,7 +11969,7 @@ class DataFrame():
|
|
|
11919
11969
|
sort_ascending=sort_ascending,
|
|
11920
11970
|
returns=returns, delimiter=delimiter,
|
|
11921
11971
|
quotechar=quotechar, auth=auth,
|
|
11922
|
-
charset=charset, num_rows=num_rows)
|
|
11972
|
+
charset=charset, num_rows=num_rows, debug=debug)
|
|
11923
11973
|
|
|
11924
11974
|
return tbl_op_util.execute()
|
|
11925
11975
|
|
|
@@ -11936,6 +11986,10 @@ class DataFrame():
|
|
|
11936
11986
|
DESCRIPTION:
|
|
11937
11987
|
Function to apply a user defined function to a group or partition of rows
|
|
11938
11988
|
in the teradataml DataFrame, leveraging Vantage's Script Table Operator.
|
|
11989
|
+
Notes:
|
|
11990
|
+
1. The function requires to use same Python version in both Vantage and local environment.
|
|
11991
|
+
2. Teradata recommends to use "dill" package with same version in both Vantage and
|
|
11992
|
+
local environment.
|
|
11939
11993
|
|
|
11940
11994
|
PARAMETERS:
|
|
11941
11995
|
user_function:
|
|
@@ -12146,6 +12200,15 @@ class DataFrame():
|
|
|
12146
12200
|
Default Value: True
|
|
12147
12201
|
Types: bool
|
|
12148
12202
|
|
|
12203
|
+
debug:
|
|
12204
|
+
Optional Argument.
|
|
12205
|
+
Specifies whether to display the script file path generated during function execution or not. This
|
|
12206
|
+
argument helps in debugging when there are any failures during function execution. When set
|
|
12207
|
+
to True, function displays the path of the script and does not remove the file from local file system.
|
|
12208
|
+
Otherwise, file is removed from the local file system.
|
|
12209
|
+
Default Value: False
|
|
12210
|
+
Types: bool
|
|
12211
|
+
|
|
12149
12212
|
RETURNS:
|
|
12150
12213
|
1. teradataml DataFrame if exec_mode is "IN-DB".
|
|
12151
12214
|
2. Pandas DataFrame if exec_mode is "LOCAL".
|
|
@@ -12311,6 +12374,7 @@ class DataFrame():
|
|
|
12311
12374
|
sort_ascending = kwargs.pop('sort_ascending', True)
|
|
12312
12375
|
auth = kwargs.pop('auth', None)
|
|
12313
12376
|
charset = kwargs.pop('charset', None)
|
|
12377
|
+
debug = kwargs.pop('debug', False)
|
|
12314
12378
|
|
|
12315
12379
|
# Check for other extra/unknown arguments.
|
|
12316
12380
|
unknown_args = list(kwargs.keys())
|
|
@@ -12329,7 +12393,7 @@ class DataFrame():
|
|
|
12329
12393
|
sort_ascending=sort_ascending,
|
|
12330
12394
|
returns=returns, delimiter=delimiter,
|
|
12331
12395
|
quotechar=quotechar, auth=auth,
|
|
12332
|
-
charset=charset, num_rows=num_rows)
|
|
12396
|
+
charset=charset, num_rows=num_rows, debug=debug)
|
|
12333
12397
|
|
|
12334
12398
|
return tbl_op_util.execute()
|
|
12335
12399
|
|
|
@@ -12346,9 +12410,9 @@ class DataFrame():
|
|
|
12346
12410
|
teradataml DataFrame, leveraging Apply Table Operator of Open
|
|
12347
12411
|
Analytics Framework.
|
|
12348
12412
|
Notes:
|
|
12349
|
-
|
|
12350
|
-
|
|
12351
|
-
|
|
12413
|
+
1. The function requires to use same Python version in both remote environment and local environment.
|
|
12414
|
+
2. Teradata recommends to use "dill" package with same version in both remote environment and
|
|
12415
|
+
local environment.
|
|
12352
12416
|
|
|
12353
12417
|
PARAMETERS:
|
|
12354
12418
|
user_function:
|
|
@@ -12531,6 +12595,15 @@ class DataFrame():
|
|
|
12531
12595
|
Default value: "csv"
|
|
12532
12596
|
Types: str
|
|
12533
12597
|
|
|
12598
|
+
debug:
|
|
12599
|
+
Optional Argument.
|
|
12600
|
+
Specifies whether to display the script file path generated during function execution or not. This
|
|
12601
|
+
argument helps in debugging when there are any failures during function execution. When set
|
|
12602
|
+
to True, function displays the path of the script and does not remove the file from local file system.
|
|
12603
|
+
Otherwise, file is removed from the local file system.
|
|
12604
|
+
Default Value: False
|
|
12605
|
+
Types: bool
|
|
12606
|
+
|
|
12534
12607
|
RETURNS:
|
|
12535
12608
|
teradataml DataFrame.
|
|
12536
12609
|
|
|
@@ -12707,6 +12780,7 @@ class DataFrame():
|
|
|
12707
12780
|
is_local_order = kwargs.pop('is_local_order', False)
|
|
12708
12781
|
nulls_first = kwargs.pop('nulls_first', True)
|
|
12709
12782
|
sort_ascending = kwargs.pop('sort_ascending', True)
|
|
12783
|
+
debug = kwargs.pop('debug', False)
|
|
12710
12784
|
|
|
12711
12785
|
# Check for other extra/unknown arguments.
|
|
12712
12786
|
unknown_args = list(kwargs.keys())
|
|
@@ -12729,7 +12803,8 @@ class DataFrame():
|
|
|
12729
12803
|
charset=None,
|
|
12730
12804
|
num_rows=num_rows,
|
|
12731
12805
|
env_name=env_name,
|
|
12732
|
-
style=style
|
|
12806
|
+
style=style,
|
|
12807
|
+
debug=debug)
|
|
12733
12808
|
|
|
12734
12809
|
return tbl_op_util.execute()
|
|
12735
12810
|
|
|
@@ -15446,7 +15521,7 @@ class DataFrameGroupBy(DataFrame):
|
|
|
15446
15521
|
from sqlalchemy.sql.functions import Function
|
|
15447
15522
|
return (type(None), int, float, str, decimal.Decimal, Function, ColumnExpression, ClauseElement)
|
|
15448
15523
|
|
|
15449
|
-
def _generate_assign_metaexpr_aed_nodeid(self, drop_columns, **kwargs):
|
|
15524
|
+
def _generate_assign_metaexpr_aed_nodeid(self, drop_columns, node_id, **kwargs):
|
|
15450
15525
|
"""
|
|
15451
15526
|
DESCRIPTION:
|
|
15452
15527
|
Function generates the MetaExpression and AED nodeid for DataFrameGroupBy.assign()
|
|
@@ -15459,6 +15534,11 @@ class DataFrameGroupBy(DataFrame):
|
|
|
15459
15534
|
and grouping columns are returned. This is unused argument.
|
|
15460
15535
|
Types: bool
|
|
15461
15536
|
|
|
15537
|
+
node_id:
|
|
15538
|
+
Optional Argument.
|
|
15539
|
+
Specifies the input nodeid for the assign operation. This is unused argument.
|
|
15540
|
+
Types: str
|
|
15541
|
+
|
|
15462
15542
|
kwargs:
|
|
15463
15543
|
keyword, value pairs
|
|
15464
15544
|
- keywords are the column names.
|
|
@@ -652,7 +652,7 @@ class DataFrameUtils():
|
|
|
652
652
|
all_operations = list(set(all_operations))
|
|
653
653
|
invalid_aggregates = []
|
|
654
654
|
for operation in all_operations:
|
|
655
|
-
if operation not in valid_aggregate_operations \
|
|
655
|
+
if operation not in valid_aggregate_operations and not operation.startswith('percentile_') \
|
|
656
656
|
and operation not in UtilFuncs._get_valid_time_series_aggregate_operations():
|
|
657
657
|
invalid_aggregates.append(operation)
|
|
658
658
|
if len(invalid_aggregates) > 0: # If any of the aggregate operations specified is not valid
|
|
@@ -735,7 +735,20 @@ class DataFrameUtils():
|
|
|
735
735
|
quoted_columns = UtilFuncs._process_for_teradata_keyword(kwargs[key_to_process])
|
|
736
736
|
kwargs[key_to_process] = quoted_columns
|
|
737
737
|
|
|
738
|
-
|
|
738
|
+
if operation.startswith('percentile_'):
|
|
739
|
+
try:
|
|
740
|
+
_operation_value = operation.split('_')
|
|
741
|
+
_floatvalue = float(_operation_value[1])
|
|
742
|
+
if _floatvalue < 0.0 or _floatvalue > 1.0 or len(_operation_value)>2:
|
|
743
|
+
raise ValueError
|
|
744
|
+
except ValueError:
|
|
745
|
+
mssg = "Invalid aggregate operation '{}' requested on TeradataML DataFrame." \
|
|
746
|
+
" Valid operation should be in format 'percentile_<floatvalue>' and <floatvalue> " \
|
|
747
|
+
"should be in range [0.0, 1.0].".format(operation)
|
|
748
|
+
raise ValueError(mssg) from None
|
|
749
|
+
func_expression = getattr(df[column], 'percentile')(percentile=_floatvalue)
|
|
750
|
+
else:
|
|
751
|
+
func_expression = getattr(df[column], operation)(describe_op=describe_op, **kwargs)
|
|
739
752
|
new_column_name = column if describe_op else "{1}_{0}".format(column, operation)
|
|
740
753
|
# column_supported, new_column_name, new_column_type, column_aggr_expr, invalid_column_str
|
|
741
754
|
return True, new_column_name, NUMBER() if describe_op else func_expression.type, \
|
|
@@ -1,25 +1,24 @@
|
|
|
1
1
|
import pandas as pd
|
|
2
2
|
from inspect import getsource
|
|
3
3
|
import re
|
|
4
|
-
from
|
|
4
|
+
from teradataml.dataframe.copy_to import copy_to_sql
|
|
5
|
+
from teradataml.dataframe.dataframe import DataFrame
|
|
5
6
|
from teradataml.dbutils.filemgr import install_file, list_files, remove_file
|
|
6
|
-
from teradataml.
|
|
7
|
+
from teradataml.utils.utils import execute_sql
|
|
7
8
|
import teradatasqlalchemy as tdsqlalchemy
|
|
8
9
|
from teradataml.utils.validators import _Validators
|
|
9
10
|
from teradataml.dataframe.sql import _SQLColumnExpression
|
|
10
11
|
from teradatasqlalchemy import VARCHAR, CLOB, CHAR
|
|
11
|
-
from teradataml.common.constants import TeradataTypes
|
|
12
|
+
from teradataml.common.constants import TableOperatorConstants, TeradataConstants, TeradataTypes
|
|
12
13
|
from teradataml.common.utils import UtilFuncs
|
|
13
|
-
from teradataml.utils.dtypes import _Dtypes
|
|
14
14
|
from teradataml.dataframe.sql_interfaces import ColumnExpression
|
|
15
15
|
from teradataml.table_operators.table_operator_util import _TableOperatorUtils
|
|
16
|
-
from teradataml.utils.internal_buffer import _InternalBuffer
|
|
17
16
|
from teradataml.common.exceptions import TeradataMlException
|
|
18
17
|
from teradataml.common.messages import Messages
|
|
19
18
|
from teradataml.common.messagecodes import MessageCodes
|
|
20
19
|
from teradataml.scriptmgmt.lls_utils import get_env
|
|
21
20
|
|
|
22
|
-
def udf(user_function=None, returns=VARCHAR(1024), env_name = None, delimiter=',', quotechar=None):
|
|
21
|
+
def udf(user_function=None, returns=VARCHAR(1024), env_name = None, delimiter=',', quotechar=None, debug=False):
|
|
23
22
|
"""
|
|
24
23
|
DESCRIPTION:
|
|
25
24
|
Creates a user defined function (UDF).
|
|
@@ -85,6 +84,15 @@ def udf(user_function=None, returns=VARCHAR(1024), env_name = None, delimiter=',
|
|
|
85
84
|
* This argument cannot be same as "delimiter" argument.
|
|
86
85
|
* This argument cannot be a newline character.
|
|
87
86
|
|
|
87
|
+
debug:
|
|
88
|
+
Optional Argument.
|
|
89
|
+
Specifies whether to display the script file path generated during function execution or not. This
|
|
90
|
+
argument helps in debugging when there are any failures during function execution. When set
|
|
91
|
+
to True, function displays the path of the script and does not remove the file from local file system.
|
|
92
|
+
Otherwise, file is removed from the local file system.
|
|
93
|
+
Default Value: False
|
|
94
|
+
Types: bool
|
|
95
|
+
|
|
88
96
|
RETURNS:
|
|
89
97
|
ColumnExpression
|
|
90
98
|
|
|
@@ -324,14 +332,14 @@ def udf(user_function=None, returns=VARCHAR(1024), env_name = None, delimiter=',
|
|
|
324
332
|
def wrapper(f):
|
|
325
333
|
def func_(*args):
|
|
326
334
|
return _SQLColumnExpression(expression=None, udf=f, udf_type=returns, udf_args=args,\
|
|
327
|
-
env_name=env_name, delimiter=delimiter, quotechar=quotechar)
|
|
335
|
+
env_name=env_name, delimiter=delimiter, quotechar=quotechar, debug=debug)
|
|
328
336
|
return func_
|
|
329
337
|
return wrapper
|
|
330
338
|
# Notation: @udf
|
|
331
339
|
else:
|
|
332
340
|
def func_(*args):
|
|
333
341
|
return _SQLColumnExpression(expression=None, udf=user_function, udf_type=returns, udf_args=args,\
|
|
334
|
-
env_name=env_name, delimiter=delimiter, quotechar=quotechar)
|
|
342
|
+
env_name=env_name, delimiter=delimiter, quotechar=quotechar, debug=debug)
|
|
335
343
|
return func_
|
|
336
344
|
|
|
337
345
|
|
|
@@ -879,4 +887,96 @@ def _create_return_type(returns):
|
|
|
879
887
|
return_str = str(returns)
|
|
880
888
|
# Replace the space with underscore in the return type.
|
|
881
889
|
return_str = return_str.replace(" ", "_")
|
|
882
|
-
return return_str
|
|
890
|
+
return return_str
|
|
891
|
+
|
|
892
|
+
def td_range(start, end=None, step=1):
    """
    DESCRIPTION:
        Creates a teradataml DataFrame with a specified range of numbers
        in a single column named 'id'.

    Notes:
        1. The range is inclusive of the start and exclusive of the end.
        2. If only start is provided, then end is set to start and start is set to 0.
        3. Only the magnitude of "step" is used; the direction of iteration
           is derived from the relative order of "start" and "end".

    PARAMETERS:
        start:
            Required Argument.
            Specifies the starting number of the range.
            Types: int

        end:
            Optional Argument.
            Specifies the end number of the range(exclusive).
            Default Value: None
            Types: int

        step:
            Optional Argument.
            Specifies the step size of the range. Must be a non-zero integer.
            Default Value: 1
            Types: int

    RETURNS:
        teradataml DataFrame

    RAISES:
        TeradataMlException, ValueError

    EXAMPLES:
        # Example 1: Create a DataFrame with a range of numbers from 0 to 5.
        >>> from teradataml.dataframe.functions import td_range
        >>> df = td_range(5)
        >>> df.sort('id')
           id
        0   0
        1   1
        2   2
        3   3
        4   4

        # Example 2: Create a DataFrame with a range of numbers from 5 to 1 with step size of -2.
        >>> from teradataml.dataframe.functions import td_range
        >>> td_range(5, 1, -2)
           id
        0   3
        1   5

        # Example 3: Create a DataFrame with a range of numbers from 1 to 5 with default step size of 1.
        >>> from teradataml.dataframe.functions import td_range
        >>> td_range(1, 5)
           id
        0   3
        1   4
        2   2
        3   1

    """
    # Validate the argument types.
    arg_matrix = []
    arg_matrix.append(["start", start, False, int])
    arg_matrix.append(["end", end, True, int])
    arg_matrix.append(["step", step, True, int])
    _Validators._validate_function_arguments(arg_matrix)

    # A zero step can never advance the generated id values toward "end",
    # so the generated query would not terminate meaningfully; reject early.
    if step == 0:
        raise ValueError("Invalid value 0 for argument 'step'. "
                         "'step' must be a non-zero integer.")

    # If only start is provided, then set end to start and start to 0.
    if end is None:
        end = start
        start = 0

    # The iteration direction is decided below from the start/end ordering,
    # so only the magnitude of the user-supplied step matters. Taking the
    # absolute value keeps the generated SQL identical for correctly signed
    # calls (e.g. td_range(5, 1, -2)) and also fixes calls where the sign of
    # "step" disagrees with the direction (e.g. td_range(5, 1, 2)), which
    # previously produced an increment moving away from "end".
    step = abs(step)

    # Descending range: subtract the step while the value stays above "end".
    # Ascending range: add the step while the value stays below "end".
    if end < start:
        operation, operator = "-", ">"
    else:
        operation, operator = "+", "<"

    # Create a temporary table seeded with a single row holding the start
    # value; the range query expands it using the operation/operator above.
    # NOTE(review): the table is not dropped here — presumably cleaned up by
    # teradataml's temp-object garbage collection; confirm.
    table_name = UtilFuncs._generate_temp_table_name(prefix="tdml_range_df",
                                                     table_type=TeradataConstants.TERADATA_TABLE)
    execute_sql(f"CREATE MULTISET TABLE {table_name} AS (SELECT {start} AS id) WITH DATA;")

    # Create a DataFrame from the range query.
    range_query = TableOperatorConstants.RANGE_QUERY.value \
        .format(table_name, step, end, operation, operator)
    df = DataFrame.from_query(range_query)
    return df
|