PyPI - pointblank - Versions diffs - 0.8.4__py3-none-any.whl → 0.8.6__py3-none-any.whl - Mend

pointblank 0.8.4-py3-none-any.whl → 0.8.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

pointblank/__init__.py +2 -0
pointblank/_constants.py +13 -0
pointblank/_constants_translations.py +216 -0
pointblank/_interrogation.py +182 -0
pointblank/_utils.py +2 -0
pointblank/column.py +352 -4
pointblank/data/api-docs.txt +270 -4
pointblank/validate.py +462 -5
pointblank-0.8.6.dist-info/METADATA +312 -0
{pointblank-0.8.4.dist-info → pointblank-0.8.6.dist-info}/RECORD +13 -13
pointblank-0.8.4.dist-info/METADATA +0 -269
{pointblank-0.8.4.dist-info → pointblank-0.8.6.dist-info}/WHEEL +0 -0
{pointblank-0.8.4.dist-info → pointblank-0.8.6.dist-info}/licenses/LICENSE +0 -0
{pointblank-0.8.4.dist-info → pointblank-0.8.6.dist-info}/top_level.txt +0 -0

pointblank/validate.py CHANGED Viewed

@@ -52,6 +52,7 @@ from pointblank._interrogation import (
     ColValsCompareTwo,
     ColValsExpr,
     ColValsRegex,
+    ConjointlyValidation,
     NumberOfTestUnits,
     RowCountMatch,
     RowsDistinct,
@@ -6462,6 +6463,250 @@ class Validate:
         return self
+    def conjointly(
+        self,
+        *exprs: Callable,
+        pre: Callable | None = None,
+        thresholds: int | float | bool | tuple | dict | Thresholds | None = None,
+        actions: Actions | None = None,
+        brief: str | bool | None = None,
+        active: bool = True,
+    ) -> Validate:
+        """
+        Perform multiple row-wise validations for joint validity.
+        The `conjointly()` validation method checks whether each row in the table passes multiple
+        validation conditions simultaneously. This enables compound validation logic where a test
+        unit (typically a row) must satisfy all specified conditions to pass the validation.
+        This method accepts multiple validation expressions as callables, which should return
+        boolean expressions when applied to the data. You can use lambdas that incorporate
+        Polars/Pandas/Ibis expressions (based on the target table type) or create more complex
+        validation functions. The validation will operate over the number of test units that is
+        equal to the number of rows in the table (determined after any `pre=` mutation has been
+        applied).
+        Parameters
+        ----------
+        *exprs
+            Multiple validation expressions provided as callable functions. Each callable should
+            accept a table as its single argument and return a boolean expression or Series/Column
+            that evaluates to boolean values for each row. At least one expression is required.
+        pre
+            An optional preprocessing function or lambda to apply to the data table during
+            interrogation. This function should take a table as input and return a modified table.
+            Have a look at the *Preprocessing* section for more information on how to use this
+            argument.
+        thresholds
+            Set threshold failure levels for reporting and reacting to exceedances of the levels.
+            The thresholds are set at the step level and will override any global thresholds set in
+            `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will
+            be set locally and global thresholds (if any) will take effect. Look at the *Thresholds*
+            section for information on how to set threshold levels.
+        actions
+            Optional actions to take when the validation step meets or exceeds any set threshold
+            levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to
+            define the actions.
+        brief
+            An optional brief description of the validation step that will be displayed in the
+            reporting table. You can use the templating elements like `"{step}"` to insert
+            the step number, or `"{auto}"` to include an automatically generated brief. If `True`
+            the entire brief will be automatically generated. If `None` (the default) then there
+            won't be a brief.
+        active
+            A boolean value indicating whether the validation step should be active. Using `False`
+            will make the validation step inactive (still reporting its presence and keeping indexes
+            for the steps unchanged).
+        Returns
+        -------
+        Validate
+            The `Validate` object with the added validation step.
+        Raises
+        ------
+        ValueError
+            If no validation expressions are supplied in `*exprs`.
+        Preprocessing
+        -------------
+        The `pre=` argument allows for a preprocessing function or lambda to be applied to the data
+        table during interrogation. This function should take a table as input and return a modified
+        table. This is useful for performing any necessary transformations or filtering on the data
+        before the validation step is applied.
+        The preprocessing function can be any callable that takes a table as input and returns a
+        modified table. For example, you could use a lambda function to filter the table based on
+        certain criteria or to apply a transformation to the data. Regarding the lifetime of the
+        transformed table, it only exists during the validation step and is not stored in the
+        `Validate` object or used in subsequent validation steps.
+        Thresholds
+        ----------
+        The `thresholds=` parameter is used to set the failure-condition levels for the validation
+        step. If they are set here at the step level, these thresholds will override any thresholds
+        set at the global level in `Validate(thresholds=...)`.
+        There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values
+        can either be set as a proportion failing of all test units (a value between `0` and `1`),
+        or, the absolute number of failing test units (as an integer that's `1` or greater).
+        Thresholds can be defined using one of these input schemes:
+        1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create
+        thresholds)
+        2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is
+        the 'error' level, and position `2` is the 'critical' level
+        3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and
+        'critical'
+        4. a single integer/float value denoting absolute number or fraction of failing test units
+        for the 'warning' level only
+        If the number of failing test units exceeds set thresholds, the validation step will be
+        marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be
+        set; you're free to set any combination of them.
+        Aside from reporting failure conditions, thresholds can be used to determine the actions to
+        take for each level of failure (using the `actions=` parameter).
+        Examples
+        --------
+        ```{python}
+        #| echo: false
+        #| output: false
+        import pointblank as pb
+        pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
+        ```
+        For the examples here, we'll use a simple Polars DataFrame with three numeric columns (`a`,
+        `b`, and `c`). The table is shown below:
+        ```{python}
+        import pointblank as pb
+        import polars as pl
+        tbl = pl.DataFrame(
+            {
+                "a": [5, 7, 1, 3, 9, 4],
+                "b": [6, 3, 0, 5, 8, 2],
+                "c": [10, 4, 8, 9, 10, 5],
+            }
+        )
+        pb.preview(tbl)
+        ```
+        Let's validate that the values in each row satisfy multiple conditions simultaneously:
+        1. Column `a` should be greater than 2
+        2. Column `b` should be less than 7
+        3. The sum of `a` and `b` should be less than the value in column `c`
+        We'll use `conjointly()` to check all these conditions together:
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .conjointly(
+                lambda df: pl.col("a") > 2,
+                lambda df: pl.col("b") < 7,
+                lambda df: pl.col("a") + pl.col("b") < pl.col("c")
+            )
+            .interrogate()
+        )
+        validation
+        ```
+        The validation table shows that not all rows satisfy all three conditions together. For a
+        row to pass the conjoint validation, all three conditions must be true for that row.
+        We can also use preprocessing to filter the data before applying the conjoint validation:
+        ```{python}
+        validation = (
+            pb.Validate(data=tbl)
+            .conjointly(
+                lambda df: pl.col("a") > 2,
+                lambda df: pl.col("b") < 7,
+                lambda df: pl.col("a") + pl.col("b") < pl.col("c"),
+                pre=lambda df: df.filter(pl.col("c") > 5)
+            )
+            .interrogate()
+        )
+        validation
+        ```
+        This allows for more complex validation scenarios where the data is first prepared and then
+        validated against multiple conditions simultaneously.
+        Or, you can use the backend-agnostic column expression helper
+        [`expr_col()`](`pointblank.expr_col`) to write expressions that work across different table
+        backends:
+        ```{python}
+        tbl = pl.DataFrame(
+            {
+                "a": [5, 7, 1, 3, 9, 4],
+                "b": [6, 3, 0, 5, 8, 2],
+                "c": [10, 4, 8, 9, 10, 5],
+            }
+        )
+        # Using backend-agnostic syntax with expr_col()
+        validation = (
+            pb.Validate(data=tbl)
+            .conjointly(
+                lambda df: pb.expr_col("a") > 2,
+                lambda df: pb.expr_col("b") < 7,
+                lambda df: pb.expr_col("a") + pb.expr_col("b") < pb.expr_col("c")
+            )
+            .interrogate()
+        )
+        validation
+        ```
+        Using [`expr_col()`](`pointblank.expr_col`) allows your validation code to work consistently
+        across Pandas, Polars, and Ibis table backends without changes, making your validation
+        pipelines more portable.
+        See Also
+        --------
+        Look at the documentation of the [`expr_col()`](`pointblank.expr_col`) function for more
+        information on how to use it with different table backends.
+        """
+        # NOTE(review): `_get_fn_name()` presumably derives the assertion type from the name of
+        # the calling method, so it likely has to be invoked directly in this body -- confirm
+        assertion_type = _get_fn_name()
+        # A conjoint validation with no conditions is meaningless, so reject it up front
+        if not exprs:
+            raise ValueError("At least one validation expression must be provided")
+        _check_pre(pre=pre)
+        _check_thresholds(thresholds=thresholds)
+        _check_boolean_input(param=active, param_name="active")
+        # Determine threshold to use (global or local) and normalize a local `thresholds=` value
+        thresholds = (
+            self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
+        )
+        # Determine brief to use (global or local) and transform any shorthands of `brief=`
+        brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
+        # Package the validation expressions for later evaluation
+        values = {"expressions": exprs}
+        val_info = _ValidationInfo(
+            assertion_type=assertion_type,
+            column=None,  # This is a row-wise validation, not specific to any column
+            values=values,
+            pre=pre,
+            thresholds=thresholds,
+            actions=actions,
+            brief=brief,
+            active=active,
+        )
+        self._add_validation(validation_info=val_info)
+        return self
     def interrogate(
         self,
         collect_extracts: bool = True,
@@ -6841,6 +7086,14 @@ class Validate:
                 results_tbl = None
+            if assertion_category == "CONJOINTLY":
+                results_tbl = ConjointlyValidation(
+                    data_tbl=data_tbl_step,
+                    expressions=value["expressions"],
+                    threshold=threshold,
+                    tbl_type=tbl_type,
+                ).get_test_results()
             if assertion_category not in [
                 "COL_EXISTS_HAS_TYPE",
                 "COL_SCHEMA_MATCH",
@@ -6849,9 +7102,18 @@ class Validate:
             ]:
                 # Extract the `pb_is_good_` column from the table as a results list
                 if tbl_type in IBIS_BACKENDS:
-                    results_list = (
-                        results_tbl.select("pb_is_good_").to_pandas()["pb_is_good_"].to_list()
-                    )
+                    # Select the DataFrame library to use for getting the results list
+                    df_lib = _select_df_lib(preference="polars")
+                    df_lib_name = df_lib.__name__
+                    if df_lib_name == "pandas":
+                        results_list = (
+                            results_tbl.select("pb_is_good_").to_pandas()["pb_is_good_"].to_list()
+                        )
+                    else:
+                        results_list = (
+                            results_tbl.select("pb_is_good_").to_polars()["pb_is_good_"].to_list()
+                        )
                 else:
                     results_list = nw.from_native(results_tbl)["pb_is_good_"].to_list()
@@ -7012,7 +7274,7 @@ class Validate:
             # TODO: Add support for extraction of rows for Ibis backends
             if (
                 collect_extracts
-                and assertion_type in ROW_BASED_VALIDATION_TYPES
+                and assertion_type in ROW_BASED_VALIDATION_TYPES + ["rows_distinct"]
                 and tbl_type not in IBIS_BACKENDS
             ):
                 # Add row numbers to the results table
@@ -7038,6 +7300,32 @@ class Validate:
                 if len(validation_extract_nw) > extract_limit:
                     validation_extract_nw = validation_extract_nw.head(extract_limit)
+                # If a 'rows_distinct' validation step, then the extract should have the
+                # duplicate rows arranged together
+                if assertion_type == "rows_distinct":
+                    # Get the list of column names in the extract, excluding the `_row_num_` column
+                    column_names = validation_extract_nw.columns
+                    column_names.remove("_row_num_")
+                    # Only include the columns that were defined in `rows_distinct(columns_subset=)`
+                    # (stored here in `column`), if supplied
+                    if column is not None:
+                        column_names = column
+                        column_names_subset = ["_row_num_"] + column
+                        validation_extract_nw = validation_extract_nw.select(column_names_subset)
+                    validation_extract_nw = (
+                        validation_extract_nw.with_columns(
+                            group_min_row=nw.min("_row_num_").over(*column_names)
+                        )
+                        # First sort by the columns to group duplicates and by row numbers
+                        # within groups; this type of sorting will preserve the original order in a
+                        # single operation
+                        .sort(by=["group_min_row"] + column_names + ["_row_num_"])
+                        .drop("group_min_row")
+                    )
+                # Ensure that the extract is set to its native format
                 validation.extract = nw.to_native(validation_extract_nw)
             # Get the end time for this step
@@ -7976,6 +8264,7 @@ class Validate:
         - [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
         - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
         - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
+        - [`rows_distinct()`](`pointblank.Validate.rows_distinct`)
         An extracted row means that a test unit failed for that row in the validation step. The
         extracted rows are a subset of the original table and are useful for further analysis or for
@@ -8357,6 +8646,7 @@ class Validate:
         # Do we have a DataFrame library to work with?
         _check_any_df_lib(method_used="get_tabular_report")
+        # Select the DataFrame library
         df_lib = _select_df_lib(preference="polars")
         # Get information on the input data table
@@ -8586,6 +8876,9 @@ class Validate:
                 else:
                     # With a column subset list, format with commas between the column names
                     columns_upd.append(", ".join(column))
+            elif assertion_type[i] in ["conjointly"]:
+                columns_upd.append("")
             else:
                 columns_upd.append(str(column))
@@ -8657,6 +8950,9 @@ class Validate:
                 values_upd.append(str(count))
+            elif assertion_type[i] in ["conjointly"]:
+                values_upd.append("COLUMN EXPR")
             # If the assertion type is not recognized, add the value as a string
             else:
                 values_upd.append(str(value))
@@ -9330,6 +9626,24 @@ class Validate:
                 lang=lang,
             )
+        elif assertion_type == "rows_distinct":
+            extract = self.get_data_extracts(i=i, frame=True)
+            step_report = _step_report_rows_distinct(
+                i=i,
+                column=column,
+                column_position=column_position,
+                columns_subset=columns_subset,
+                n=n,
+                n_failed=n_failed,
+                all_passed=all_passed,
+                extract=extract,
+                tbl_preview=tbl_preview,
+                header=header,
+                limit=limit,
+                lang=lang,
+            )
         elif assertion_type == "col_schema_match":
             # Get the parameters for column-schema matching
             values_dict = validation_step["values"]
@@ -9925,6 +10239,9 @@ def _create_autobrief_or_failure_text(
             for_failure=for_failure,
         )
+    if assertion_type == "conjointly":
+        return _create_text_conjointly(lang=lang, for_failure=for_failure)
     return None  # pragma: no cover
@@ -10099,6 +10416,12 @@ def _create_text_col_count_match(lang: str, value: int, for_failure: bool = Fals
     return EXPECT_FAIL_TEXT[f"col_count_match_n_{type_}_text"][lang].format(values_text=values_text)
+def _create_text_conjointly(lang: str, for_failure: bool = False) -> str:
+    """
+    Look up the localized text for a `conjointly` validation step.
+
+    The lookup key is built from the value returned by `_expect_failure_type()` for
+    `for_failure`, and the entry for `lang` is returned from `EXPECT_FAIL_TEXT` as-is (this
+    text takes no formatting arguments here).
+    """
+    text_kind = _expect_failure_type(for_failure=for_failure)
+    text_key = f"conjointly_{text_kind}_text"
+    localized_texts = EXPECT_FAIL_TEXT[text_key]
+    return localized_texts[lang]
 def _prep_column_text(column: str | list[str]) -> str:
     if isinstance(column, list):
         return "`" + str(column[0]) + "`"
@@ -10672,7 +10995,7 @@ def _step_report_row_based(
     header: str,
     limit: int | None,
     lang: str,
-):
+) -> GT:
     # Get the length of the extracted data for the step
     extract_length = get_row_count(extract)
@@ -10889,6 +11212,140 @@ def _step_report_row_based(
     return step_report
+def _step_report_rows_distinct(
+    i: int,
+    column: list[str],
+    column_position: list[int],
+    columns_subset: list[str] | None,
+    n: int,
+    n_failed: int,
+    all_passed: bool,
+    extract: any,
+    tbl_preview: GT,
+    header: str,
+    limit: int | None,
+    lang: str,
+) -> GT:
+    """
+    Build the step report table for a `rows_distinct` validation step.
+
+    When `all_passed` is true the report is the supplied `tbl_preview`; otherwise it is a display
+    table generated from `extract`, showing at most `limit` rows (all rows of the extract when
+    `limit` is `None`). If `header` is `None` the table is returned without a header. Otherwise
+    the header text (with `":default:"` standing for the `"{title}{details}"` template) is
+    converted with commonmark, has `{title}` and `{details}` filled in, and is attached to the
+    table via `tab_header()`.
+    """
+    # Number of rows available in the extract for this step
+    n_extract_rows = get_row_count(extract)
+    # Right-to-left languages get an explicit text direction in the header styling
+    rtl_style = " direction: rtl;" if lang in RTL_LANGUAGES else ""
+    # Describe what was checked: a subset of columns (held in `column`) or all of them
+    if column is not None:
+        text = STEP_REPORT_TEXT["rows_distinct_subset"][lang].format(
+            columns_subset=", ".join(column)
+        )
+    else:
+        text = STEP_REPORT_TEXT["rows_distinct_all"][lang].format(column=column)
+    if all_passed:
+        step_report = tbl_preview
+        # Without a header the table preview is handed back untouched
+        if header is None:
+            return step_report
+        title = STEP_REPORT_TEXT["report_for_step_i"][lang].format(i=i) + " " + CHECK_MARK_SPAN
+        outcome_stmt = STEP_REPORT_TEXT["success_statement_no_column"][lang].format(
+            n=n,
+            column_position=column_position,
+        )
+        closing_text = STEP_REPORT_TEXT["preview_statement"][lang]
+    else:
+        # With no explicit limit, show every row of the extract
+        row_limit = n_extract_rows if limit is None else limit
+        # Create a preview of the extracted data
+        step_report = _generate_display_table(
+            data=extract,
+            columns_subset=columns_subset,
+            n_head=row_limit,
+            n_tail=0,
+            limit=row_limit,
+            min_tbl_width=600,
+            incl_header=False,
+            mark_missing_values=False,
+        )
+        title = STEP_REPORT_TEXT["report_for_step_i"][lang].format(i=i)
+        outcome_stmt = STEP_REPORT_TEXT["failure_rate_summary_rows_distinct"][lang].format(
+            failure_rate=f"<strong>{n_failed}</strong> / <strong>{n}</strong>",
+            column_position=column_position,
+        )
+        if row_limit < n_extract_rows:  # pragma: no cover
+            closing_text = STEP_REPORT_TEXT["extract_text_first_rows_distinct"][lang].format(
+                extract_length_resolved=row_limit
+            )
+        else:
+            closing_text = STEP_REPORT_TEXT["extract_text_all_rows_distinct"][lang].format(
+                extract_length_resolved=n_extract_rows
+            )
+        # If `header` is None then don't add a header and just return the step report
+        if header is None:
+            return step_report
+    # Both outcomes share the same header layout: description, outcome statement, closing text
+    details = (
+        f"<div style='font-size: 13.6px; {rtl_style}'>"
+        "<div style='padding-top: 7px;'>"
+        f"{text}"
+        "</div>"
+        "<div style='padding-top: 7px;'>"
+        f"{outcome_stmt}"
+        "</div>"
+        f"{closing_text}"
+        "</div>"
+    )
+    # `":default:"` stands for the default header template
+    template = "{title}{details}" if header == ":default:" else header
+    # Convert the header text to HTML with commonmark, then place the templated text in it
+    rendered_header = commonmark.commonmark(template).format(title=title, details=details)
+    return step_report.tab_header(title=md(rendered_header))
 def _step_report_schema_in_order(
     step: int, schema_info: dict, header: str, lang: str, debug_return_df: bool = False
 ) -> GT | any:

pointblank 0.8.4__py3-none-any.whl → 0.8.6__py3-none-any.whl

pointblank 0.8.4-py3-none-any.whl → 0.8.6-py3-none-any.whl