valediction-1.0.3-py3-none-any.whl → valediction-1.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- valediction/convenience.py +7 -12
- valediction/datasets/datasets.py +17 -17
- valediction/dictionary/generation.py +5 -5
- valediction/dictionary/helpers.py +0 -7
- valediction/dictionary/importing.py +43 -20
- valediction/dictionary/model.py +108 -36
- valediction/integrity.py +67 -13
- valediction/io/csv_readers.py +3 -3
- valediction/support.py +5 -1
- valediction/validation/helpers.py +30 -33
- valediction/validation/issues.py +37 -25
- valediction/validation/validation.py +102 -53
- {valediction-1.0.3.dist-info → valediction-1.2.0.dist-info}/METADATA +1 -1
- {valediction-1.0.3.dist-info → valediction-1.2.0.dist-info}/RECORD +15 -15
- {valediction-1.0.3.dist-info → valediction-1.2.0.dist-info}/WHEEL +0 -0
valediction/integrity.py
CHANGED
@@ -1,6 +1,10 @@
+from __future__ import annotations
+
 import re
+from copy import deepcopy
 from pathlib import Path
 from re import Pattern
+from typing import Any
 
 from valediction.data_types.data_types import DataType
 from valediction.support import list_as_bullets
@@ -12,13 +16,58 @@ TEMPLATE_DATA_DICTIONARY_PATH = (
 )
 
 
+externally_injected_variables: dict[
+    str, Any
+] = {}  # External injection store for package wrapping (any keys, always included)
+
+
+def reset_injected_config_variables() -> None:
+    global externally_injected_variables
+    externally_injected_variables = {}
+
+
+def inject_config_variables(variables: dict[str, Any]) -> None:
+    """Injects variables into the Valediction Config, which will always be incorporated
+    as overrides, regardless of Config calling method (default, session-scoped, or
+    contextual).
+
+    Args:
+        variables (dict[str, Any]): Dictionary of config variables.
+    """
+    global externally_injected_variables, session_config
+
+    # check type allows
+    if not isinstance(variables, dict):
+        raise TypeError(
+            f"Config injection variables must be a dictionary, not {type(variables)}"
+        )
+    problematic_keys = []
+    for variable_name in variables.keys():
+        if not isinstance(variable_name, str):
+            problematic_keys.append(variable_name)
+
+    if problematic_keys:
+        raise TypeError("Config injection variables accepts only string keys.")
+
+    externally_injected_variables = dict(variables or {})
+
+    # Apply immediately to the current session config (if it exists)
+    if session_config is not None:
+        _apply_external_injections(session_config)
+
+
+def _apply_external_injections(config: Config) -> None:
+    for variable_name, variable_value in externally_injected_variables.items():
+        setattr(config, variable_name, deepcopy(variable_value))
+
+
 class Config:
     def __init__(self):
         self.template_data_dictionary_path: Path = TEMPLATE_DATA_DICTIONARY_PATH
         self.max_table_name_length: int = 63
         self.max_column_name_length: int = 30
         self.max_primary_keys: int = 7
-        self.invalid_name_pattern: str | Pattern = re.compile(r"[^A-
+        self.invalid_name_pattern: str | Pattern = re.compile(r"[^A-Za-z0-9_]")
         self.null_values: list[str] = ["", "null", "none"]
         self.forbidden_characters: list[str] = []
         self.date_formats: dict[str, DataType] = {
@@ -42,6 +91,7 @@ class Config:
         }
         self.enforce_no_null_columns: bool = True
         self.enforce_primary_keys: bool = True
+        _apply_external_injections(self)
 
     def __repr__(self):
         date_list = list_as_bullets(
@@ -65,33 +115,37 @@ class Config:
 
     # Context Wrapper With Reset
     def __enter__(self):
-        global
-
+        global session_config
+
+        _apply_external_injections(self)
+
+        session_config = self
         return self
 
     def __exit__(self, exc_type, exc_value, traceback):
-        global
-
+        global session_config
+        session_config = Config()
 
 
-
+session_config: Config = None
 
 
 def get_config() -> Config:
-    """Gets the current `
-    globally.
+    """Gets the current `session_config` instance. Changing attributes will set them
+    globally for the python session. Use `reset_default_config()` to reset to original
+    defaults.
 
     Returns:
-        Config: The current
+        Config: The current session configuration.
     """
-    global
-    return
+    global session_config
+    return session_config
 
 
 def reset_default_config() -> None:
     """Resets `default_config` settings globally to original defaults."""
-    global
-
+    global session_config
+    session_config = Config()
 
 
 reset_default_config()
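
A minimal usage sketch of the new injection API, based only on the definitions visible in the diff above (not on package documentation):

from valediction.integrity import (
    Config,
    get_config,
    inject_config_variables,
    reset_injected_config_variables,
)

# Injected overrides are re-applied whenever a Config is built: in __init__,
# on __enter__, and immediately to the live session config.
inject_config_variables({"max_primary_keys": 3})
assert get_config().max_primary_keys == 3

with Config() as config:  # context-scoped config receives the same override
    assert config.max_primary_keys == 3

reset_injected_config_variables()  # empties the store; already-built configs keep their values

Since __exit__ replaces the session config with a fresh Config(), and Config.__init__ now ends with _apply_external_injections(self), injected values survive context resets until the store itself is cleared.
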
valediction/io/csv_readers.py
CHANGED
@@ -11,7 +11,7 @@ import pandas as pd
 from pandas import DataFrame
 from pandas.errors import ParserError
 
-from valediction.support import
+from valediction.support import _strip
 
 
 class FrameChunk(NamedTuple):
@@ -34,7 +34,7 @@ class FrameChunk(NamedTuple):
     total_chunks_seen: int | None
 
     def estimate_chunk_count(self) -> int:
-        # Buffers (accounting for CSV tails/bytes
+        # Buffers (accounting for CSV tails/bytes inaccuracy)
        EPS_ABS = 4096  # Fixed
        EPS_REL = 0.05  # 5% tail buffer
 
@@ -93,7 +93,7 @@ def _post_read_processing(df: DataFrame, cfg: CsvReadConfig) -> DataFrame:
     """Apply header normalisation and vectorised value stripping after reading."""
     cfg = cfg or CsvReadConfig()
     if cfg.normalise_headers:
-        df = df.rename(columns={c:
+        df = df.rename(columns={c: _strip(c) for c in df.columns})
     if cfg.strip_values:
         str_cols = df.select_dtypes(include=["string"]).columns
         if len(str_cols) > 0:
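
The substantive change here is that header normalisation now maps each label through the new _strip helper. A self-contained sketch of the same post-read pattern in plain pandas, using plain str.strip() in place of the helper and toy data:

import pandas as pd

df = pd.DataFrame({" id ": [1, 2], "name ": pd.array(["a ", " b"], dtype="string")})

# Header normalisation: strip whitespace from labels, as in _post_read_processing
df = df.rename(columns={c: str(c).strip() for c in df.columns})

# Vectorised value stripping for string-dtype columns (the cfg.strip_values path)
for col in df.select_dtypes(include=["string"]).columns:
    df[col] = df[col].str.strip()

print(list(df.columns))     # ['id', 'name']
print(df["name"].tolist())  # ['a', 'b']
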
valediction/support.py
CHANGED
@@ -35,10 +35,14 @@ def list_as_bullets(elements: list, bullet: str = "\n - ") -> str:
     return bullet + bullet.join(elements)
 
 
-def
+def _normalise(name: str) -> str:
     return name.strip().upper()
 
 
+def _strip(name: str) -> str:
+    return name.strip()
+
+
 def _get_runtime_string(runtime: timedelta) -> str:
     total_seconds = runtime.total_seconds()
     hours = trunc(total_seconds / 3600)
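
These two tiny helpers underpin most of this release: _strip for whitespace-insensitive keys in issues.py, _normalise wherever dictionary names are compared to file headers. Their behaviour, restated with the definitions shown above:

assert "  Patient_ID ".strip().upper() == "PATIENT_ID"  # _normalise
assert "  Patient_ID ".strip() == "Patient_ID"          # _strip

# Two labels count as "the same column" when their normalised forms match:
assert " patient_id".strip().upper() == "Patient_ID".strip().upper()
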
valediction/validation/helpers.py
CHANGED
@@ -10,6 +10,7 @@ from pandas.util import hash_pandas_object
 from valediction.data_types.data_types import DataType
 from valediction.dictionary.model import Table
 from valediction.integrity import get_config
+from valediction.support import _normalise
 from valediction.validation.issues import Range
 
 
@@ -17,11 +18,14 @@ from valediction.validation.issues import Range
 def _set_nulls(df: DataFrame) -> DataFrame:
     null_values = get_config().null_values
     token_set = {str(t).strip().casefold() for t in null_values}
-    columns = df.select_dtypes(include=["string", "object"]).columns
+    columns = df.select_dtypes(include=["string", "object", "category"]).columns
     for column in columns:
         series = df[column]
-
-
+
+        s_txt = series.astype("string", copy=False)  # dtype safe
+        mask = s_txt.notna() & s_txt.str.strip().str.casefold().isin(token_set)
+        if mask.any():
+            df[column] = series.mask(mask, NA)
 
     return df
 
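
_set_nulls now also scans category columns and casts each series to the pandas string dtype before comparing, so mixed object columns cannot break the token match. A self-contained sketch of the masking step, using the default null tokens from integrity.py and toy data:

import pandas as pd
from pandas import NA

token_set = {t.strip().casefold() for t in ["", "null", "none"]}

df = pd.DataFrame({"col": ["A", " NULL ", "None", "b"]})

series = df["col"]
s_txt = series.astype("string", copy=False)  # dtype-safe view for the comparison
mask = s_txt.notna() & s_txt.str.strip().str.casefold().isin(token_set)
if mask.any():
    df["col"] = series.mask(mask, NA)

print(df["col"].isna().tolist())  # [False, True, True, False]
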
@@ -68,37 +72,24 @@ def create_pk_hashes(
     Returns:
         Series: Pandas Series with hashes or Nulls.
     """
-
+    HASH_COL_NAME = "PK_HASH"
     if df_primaries.empty or df_primaries.shape[1] == 0:
-        return Series([], dtype=object, name=
+        return Series([], dtype=object, name=HASH_COL_NAME)
 
-    #
+    # Check Nulls
     null_rows = df_primaries.isna().any(axis=1)
 
-    #
-    hash_1 = hash_pandas_object(df_primaries, index=False)
-
-    # Second Hash (rows backwards if single row, else salt)
-    if df_primaries.shape[1] > 1:
-        df_primaries_backwards = df_primaries.iloc[:, ::-1]
-    else:
-        s = df_primaries.iloc[:, 0]
-        salt = Series(["§"] * len(s), index=s.index, dtype="string")
-        df_primaries_backwards = DataFrame(
-            {
-                "_a": s,
-                "_b": s.str.cat(salt),
-            }
-        )
-
-    hash_2 = hash_pandas_object(df_primaries_backwards, index=False)  # uint64
+    # Two independent 64-bit hashes with 16 byte keys
+    hash_1 = hash_pandas_object(df_primaries, index=False, hash_key="valediction_pk1!")
+    hash_2 = hash_pandas_object(df_primaries, index=False, hash_key="valediction_pk2!")
 
+    # Combine into 128-bit integer keys
     a1 = hash_1.to_numpy(dtype="uint64", copy=False).astype(object)
     a2 = hash_2.to_numpy(dtype="uint64", copy=False).astype(object)
-
     combined = (a1 << 64) | a2
+
     hashes = Series(
-        combined, index=df_primaries.index, name=
+        combined, index=df_primaries.index, name=HASH_COL_NAME, dtype=object
     )
     hashes[null_rows] = None
     return hashes
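
The replaced implementation derived its second hash from a reversed (or salted) copy of the key frame; the new one simply keys hash_pandas_object twice. The hash_key argument must be exactly 16 characters, which both "valediction_pk1!" and "valediction_pk2!" are, and packing two independent 64-bit hashes into one 128-bit integer makes accidental collisions across chunks far less likely. The combination step on toy data:

import pandas as pd
from pandas.util import hash_pandas_object

df_keys = pd.DataFrame({"mrn": ["001", "002"], "visit": ["A", "B"]})

# Two independent 64-bit row hashes under different 16-character keys
h1 = hash_pandas_object(df_keys, index=False, hash_key="valediction_pk1!")
h2 = hash_pandas_object(df_keys, index=False, hash_key="valediction_pk2!")

# Widen to Python ints (object dtype) so the shift cannot wrap at 64 bits,
# then pack both hashes into a single 128-bit integer per row
a1 = h1.to_numpy(dtype="uint64", copy=False).astype(object)
a2 = h2.to_numpy(dtype="uint64", copy=False).astype(object)
combined = (a1 << 64) | a2

print([hex(v) for v in combined])  # one 128-bit value per row
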
@@ -167,8 +158,9 @@ def pk_contains_whitespace_mask(df_primaries: DataFrame) -> Series:
     if df_primaries.empty or df_primaries.shape[1] == 0:
         return Series(False, index=df_primaries.index)
 
-    col_masks = df_primaries.apply(
-
+    col_masks = df_primaries.apply(
+        lambda s: s.astype("string", copy=False).str.contains(r"\s", na=False)
+    )
     return col_masks.any(axis=1)
 
 
@@ -261,7 +253,9 @@ def invalid_mask_text_too_long(column: Series, max_len: int) -> Series:
         return Series(False, index=column.index)
 
     notnull = column.notna()
-
+    s_txt = column.astype("string", copy=False)
+    lens = s_txt.str.len()
+
     return notnull & (lens > max_len)
 
 
@@ -270,20 +264,23 @@ def invalid_mask_text_forbidden_characters(column: Series) -> Series:
     if not forbidden:
         return column.notna() & False
 
-    pattern = "[" + re.escape("".join(forbidden)) + "]"
+    pattern = "[" + re.escape("".join([str(s) for s in forbidden])) + "]"
     notnull = column.notna()
-
+
+    s_txt = column.astype("string", copy=False)
+    has_forbidden = s_txt.str.contains(pattern, regex=True, na=False)
+
     return notnull & has_forbidden
 
 
 # Apply Data Types #
 def apply_data_types(df: DataFrame, table_dictionary: Table) -> DataFrame:
     # name -> column object
-    column_dictionary = {column.name: column for column in table_dictionary}
+    column_dictionary = {_normalise(column.name): column for column in table_dictionary}
 
     for col in df.columns:
-        data_type = column_dictionary.get(col).data_type
-        datetime_format = column_dictionary.get(col).datetime_format
+        data_type = column_dictionary.get(_normalise(col)).data_type
+        datetime_format = column_dictionary.get(_normalise(col)).datetime_format
 
         if data_type in (DataType.TEXT, DataType.FILE):
            df[col] = df[col].astype("string")
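
apply_data_types now keys its lookup by _normalise(column.name), so a dictionary entry matches a CSV header regardless of case or surrounding whitespace. The lookup pattern, with a hypothetical stand-in for the dictionary's column model:

from dataclasses import dataclass

def _normalise(name: str) -> str:  # as defined in valediction/support.py
    return name.strip().upper()

@dataclass
class Column:  # hypothetical stand-in for the dictionary column object
    name: str
    data_type: str

columns = [Column("Patient_ID", "TEXT"), Column("Admit_Date", "DATE")]
column_dictionary = {_normalise(c.name): c for c in columns}

for header in ["PATIENT_ID ", "admit_date"]:
    match = column_dictionary.get(_normalise(header))
    print(header, "->", match.data_type if match else "unknown")

Note that .get() still returns None for headers absent from the dictionary, so the unguarded .data_type access in apply_data_types presumably relies on the extra-column check having already run.
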
valediction/validation/issues.py
CHANGED
@@ -8,7 +8,7 @@ from pandas import DataFrame, concat
 
 from valediction.datasets.datasets_helpers import DatasetItemLike
 from valediction.io.csv_readers import CsvReadConfig, read_csv_ranges
-from valediction.support import
+from valediction.support import _strip, list_as_bullets
 
 
 class IssueType(Enum):
@@ -107,6 +107,7 @@ class Issue:
             merged.append(cur)
         self.ranges = merged
 
+    # Inspect
     def inspect(
         self,
         additional_columns: bool | str | list[str] | None = None,
@@ -132,9 +133,9 @@ class Issue:
             ValueError: if the issue has no parent DatasetItem
         """
         # Guard
-
-            raise ValueError("Issue has no parent DatasetItem")
+        self.__guard_parent()
         header = self.__repr__() if print_header else ""
+
         # Not applicable
         if self.type in APPLIES_WHOLE_COLUMN:
             print(f"{header}: applies to whole column")
@@ -143,22 +144,8 @@ class Issue:
         # Column Inclusion
         if print_header:
             print(f"{header}:")
-
-
-        else:
-            additional_columns = (
-                [additional_columns]
-                if isinstance(additional_columns, str)
-                else additional_columns
-            )
-            base = (
-                set(self.parent.primary_keys)
-                if self.type in PRIMARY_KEY_ISSUES
-                else {self.column}
-            )
-            base |= set(additional_columns or [])
-            base.discard(None)
-            columns = list(base) if base else None
+
+        columns = self.__select_columns(additional_columns)
 
         if not self.ranges:
             return DataFrame(columns=columns) if columns else DataFrame()
@@ -194,6 +181,31 @@ class Issue:
 
         return out if columns is None else out.loc[:, columns]
 
+    # Inspect Helpers
+    def __guard_parent(self):
+        if not self.parent:
+            raise ValueError("Issue has no parent DatasetItem")
+
+    def __select_columns(self, additional_columns: bool | str | list[str]) -> list:
+        if additional_columns is True:
+            columns = None
+        else:
+            additional_columns = (
+                [additional_columns]
+                if isinstance(additional_columns, str)
+                else additional_columns
+            )
+            base = (
+                set(self.parent.primary_keys)
+                if self.type in PRIMARY_KEY_ISSUES
+                else {self.column}
+            )
+            base |= set(additional_columns or [])
+            base.discard(None)
+            columns = list(base) if base else None
+
+        return columns
+
 
 @dataclass
 class Issues:
@@ -235,8 +247,8 @@ class Issues:
         parent: DatasetItemLike | None = None,
     ) -> Issue:
         key = (
-
-
+            _strip(table),
+            _strip(column) if column is not None else None,
             issue_type,
         )
         issue = self._index.get(key)
@@ -255,8 +267,8 @@ class Issues:
         issue_type: IssueType | None = None,
     ) -> list[Issue]:
         """Case-insensitive filter; any arg can be None to act as a wildcard."""
-        table =
-        column =
+        table = _strip(table)
+        column = _strip(column) if column is not None else None
         output: list[Issue] = []
         if issue_type is not None:
             # direct index lookup where possible
@@ -268,9 +280,9 @@ class Issues:
 
         # otherwise scan (still cheap; we maintain a compact list)
         for item in self._items:
-            if
+            if _strip(item.table) != table:
                 continue
-            if column is not None and (
+            if column is not None and (_strip(item.column) or "") != column:
                 continue
             output.append(item)
         return output
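
The inspect refactor moves the column-selection rules into __select_columns: additional_columns=True keeps every column (None signals "all"), primary-key issues start from the parent's primary keys, and anything else starts from the issue's own column. The same rules in standalone form, with hypothetical inputs:

def select_columns(issue_column, primary_keys, is_pk_issue, additional_columns):
    # Standalone restatement of Issue.__select_columns from the diff above
    if additional_columns is True:
        return None  # None means "keep all columns"
    if isinstance(additional_columns, str):
        additional_columns = [additional_columns]
    base = set(primary_keys) if is_pk_issue else {issue_column}
    base |= set(additional_columns or [])
    base.discard(None)
    return list(base) if base else None

print(sorted(select_columns("dob", ["mrn"], False, "name")))       # ['dob', 'name']
print(sorted(select_columns(None, ["mrn", "visit"], True, None)))  # ['mrn', 'visit']
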
valediction/validation/validation.py
CHANGED
@@ -20,7 +20,7 @@ from valediction.io.csv_readers import (
     iter_csv_chunks,
 )
 from valediction.progress import Progress
-from valediction.support import _get_runtime_string, calculate_runtime
+from valediction.support import _get_runtime_string, _normalise, calculate_runtime
 from valediction.validation.helpers import (
     _column_has_values,
     _set_nulls,
@@ -62,7 +62,7 @@ class Validator:
         dataset_item: DatasetItemLike,
         table_dictionary: Table,
         feedback: bool = True,
-        chunk_size: int = 10_000_000,
+        chunk_size: int | None = 10_000_000,
         _padding: int = 0,
     ):
         # User Variables
@@ -86,7 +86,9 @@ class Validator:
         self._dt_needs_infer: set[str] = set()
 
         # Helpers
-        self._column_names: set =
+        self._column_names: set[str] = {
+            _normalise(n) for n in self.table_dictionary.get_column_names()
+        }
 
         # Progress Tracking
         self.progress: Progress | None = None
@@ -155,6 +157,20 @@ class Validator:
         if not datetime_format:
             self._dt_needs_infer.add(name)
 
+    # Column Scanning
+    def _resolve_df_col(self, df: DataFrame, name: str) -> str | None:
+        """Return the actual df column label matching name case-insensitively."""
+        target = _normalise(name)
+        return next((c for c in df.columns if _normalise(str(c)) == target), None)
+
+    def _resolve_df_cols(self, df: DataFrame, names: list[str]) -> list[str]:
+        resolved: list[str] = []
+        for n in names:
+            c = self._resolve_df_col(df, n)
+            if c is not None:
+                resolved.append(c)
+        return resolved
+
     # Validate
     def validate(self):
         """
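
_resolve_df_col gives the Validator one place to translate a dictionary name into the DataFrame's actual label, so downstream indexing always uses the spelling the file really contains. The pattern in standalone form, on toy columns:

import pandas as pd

def _normalise(name: str) -> str:  # as defined in valediction/support.py
    return name.strip().upper()

def resolve_df_col(df: pd.DataFrame, name: str) -> str | None:
    # Same pattern as Validator._resolve_df_col above
    target = _normalise(name)
    return next((c for c in df.columns if _normalise(str(c)) == target), None)

df = pd.DataFrame(columns=["Patient_ID ", "ADMIT_DATE"])
print(resolve_df_col(df, "patient_id"))  # 'Patient_ID ' (the label as it appears)
print(resolve_df_col(df, "missing"))     # None
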
@@ -272,28 +288,45 @@ class Validator:
     # Validation: Start Helpers
     def _check_for_missing_columns(self, df: DataFrame):
         self.__begin_step(step="Checking for missing columns")
-
-
-
-
-
-
-
-
-
+
+        dict_names = self.table_dictionary.get_column_names()
+        dict_keys = {_normalise(name) for name in dict_names}
+
+        df_keys = {_normalise(str(column)) for column in df.columns}
+
+        missing_keys = dict_keys - df_keys
+        if missing_keys:
+            for name in dict_names:
+                if _normalise(name) in missing_keys:
+                    self.issues.add(
+                        issue_type=IssueType.MISSING_COLUMN,
+                        table=self.table_name,
+                        column=name,
+                        parent=self.dataset_item,
+                    )
+
         self.__complete_step()
 
     def _check_for_extra_columns(self, df: DataFrame):
         self.__begin_step(step="Checking for extra columns")
-
-
-        for
-
-
-
-
-
-
+
+        dict_keys = {
+            _normalise(name) for name in self.table_dictionary.get_column_names()
+        }
+        df_cols = [str(column) for column in df.columns]
+        df_keys = {_normalise(column) for column in df_cols}
+
+        extra_keys = df_keys - dict_keys
+        if extra_keys:
+            for col in df_cols:
+                if _normalise(col) in extra_keys:
+                    self.issues.add(
+                        issue_type=IssueType.EXTRA_COLUMN,
+                        table=self.table_name,
+                        column=col,  # report the actual df label
+                        parent=self.dataset_item,
+                    )
+
         self.__complete_step()
 
     # Validation: Chunk Helpers
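
Both checks now reduce to set differences over normalised names, then map back to original spellings for reporting: missing columns are reported with the dictionary's spelling, extra columns with the file's actual label. The core of it, stripped of the issue bookkeeping and run on toy names:

def _normalise(name: str) -> str:  # as defined in valediction/support.py
    return name.strip().upper()

dict_names = ["Patient_ID", "Admit_Date", "Ward"]
df_cols = ["PATIENT_ID", "ward ", "Extra_Col"]

dict_keys = {_normalise(n) for n in dict_names}
df_keys = {_normalise(c) for c in df_cols}

missing = [n for n in dict_names if _normalise(n) in dict_keys - df_keys]
extra = [c for c in df_cols if _normalise(c) in df_keys - dict_keys]

print(missing)  # ['Admit_Date']  (dictionary spelling)
print(extra)    # ['Extra_Col']   (the file's actual label)
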
@@ -319,13 +352,16 @@ class Validator:
 
         # Check for whitespace (text cols only)
         self.__begin_step(step="Checking for primary key whitespace")
-
-
-
-
+        pk_keys = {_normalise(p) for p in pk_cols}
+        pk_cols_text = [
+            column.name
+            for column in self.table_dictionary
+            if _normalise(column.name) in pk_keys and column.data_type is DataType.TEXT
+        ]
 
         if pk_cols_text:
-
+            pk_cols_text_df = self._resolve_df_cols(df, pk_cols_text)
+            space_mask = pk_contains_whitespace_mask(df[pk_cols_text_df])
             if space_mask.any():
                 self.issues.add(
                     issue_type=IssueType.PK_WHITESPACE,
@@ -343,7 +379,9 @@ class Validator:
 
         # Create primary key hashes
         self.__begin_step(step="Creating primary key hashes")
-
+        pk_cols_df = self._resolve_df_cols(df, pk_cols)
+        pk_hashes = create_pk_hashes(df[pk_cols_df])
+
         self.__complete_step()
 
         # Primary Key Nulls
@@ -437,44 +475,51 @@ class Validator:
             self.__complete_step()
             return
 
-
-
+        cols = [
+            (dict_col, df_col)
+            for dict_col in self._dt_needs_infer
+            if (df_col := self._resolve_df_col(df, dict_col)) is not None
+        ]
+        if not cols:
             self.__complete_step()
             return
 
-
-
-
+        from valediction.validation.helpers import _allowed_formats_for
+
+        for dict_col, df_col in cols:
+            unique = (
+                df[df_col].astype("string", copy=False).str.strip().dropna().unique()
+            )
             if len(unique) == 0:
                 continue
 
             try:
-
+                fmt = infer_datetime_format(Series(unique, dtype="string"))
             except ValueError:
-                # ambiguous - try again in later chunk
                 continue
 
-            if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                pass
+            if not fmt or fmt is False:
+                continue
+
+            col_dtype = self._find_data_type(dict_col)  # case-insensitive getter
+            if fmt not in _allowed_formats_for(col_dtype):
+                continue
+
+            self._dt_format_cache[dict_col] = fmt
+            self._dt_needs_infer.discard(dict_col)
+
+            try:
+                self.table_dictionary.get_column(dict_col).datetime_format = fmt
+            except Exception:
+                pass
+
         self.__complete_step()
 
     def _check_column_types(self, df: DataFrame, start_row: int) -> None:
         self.__begin_step(step="Checking column types")
-        present = [
+        present = [
+            col for col in df.columns if _normalise(str(col)) in self._column_names
+        ]
         for col in present:
             dtype = self._find_data_type(col)
             if dtype == DataType.TEXT:
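
The datetime inference is incremental across chunks: a column stays in _dt_needs_infer until some chunk yields a format that is both unambiguous and allowed for the column's declared type, at which point it is cached and written back to the dictionary. A toy sketch of that control flow; infer_format and the two-candidate list are illustrative stand-ins, not the package's infer_datetime_format or _allowed_formats_for:

from datetime import datetime

CANDIDATES = ["%d/%m/%Y", "%m/%d/%Y"]  # illustrative candidate formats (assumption)

def _parses(value: str, fmt: str) -> bool:
    try:
        datetime.strptime(value, fmt)
        return True
    except ValueError:
        return False

def infer_format(values: list[str]) -> str | None:
    """Toy inference: the single candidate that parses every value."""
    matches = [f for f in CANDIDATES if all(_parses(v, f) for v in values)]
    if len(matches) > 1:
        raise ValueError("ambiguous")  # mirrors the except ValueError: continue path
    return matches[0] if matches else None

needs_infer = {"admit_date"}
format_cache: dict[str, str] = {}

for chunk in (["01/02/2024"], ["13/02/2024", "14/02/2024"]):
    for col in list(needs_infer):
        try:
            fmt = infer_format(chunk)
        except ValueError:
            continue  # ambiguous in this chunk - try again with later data
        if fmt:
            format_cache[col] = fmt
            needs_infer.discard(col)  # settled; later chunks skip this column

print(format_cache)  # {'admit_date': '%d/%m/%Y'}

The first chunk parses under both candidates, so inference defers; the second chunk's day 13 rules one out, and the format is cached for all remaining chunks.
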
@@ -506,7 +551,9 @@ class Validator:
 
     def _check_text_lengths(self, df: DataFrame, start_row: int) -> None:
         self.__begin_step(step="Checking text lengths")
-        present = [
+        present = [
+            col for col in df.columns if _normalise(str(col)) in self._column_names
+        ]
         for col in present:
             if self._find_data_type(col) != DataType.TEXT:
                 continue
@@ -524,7 +571,9 @@ class Validator:
 
     def _check_text_forbidden_chars(self, df: DataFrame, start_row: int) -> None:
         self.__begin_step(step="Checking for forbidden characters")
-        present = [
+        present = [
+            col for col in df.columns if _normalise(str(col)) in self._column_names
+        ]
         for col in present:
             if self._find_data_type(col) != DataType.TEXT:
                 continue

{valediction-1.0.3.dist-info → valediction-1.2.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: valediction
-Version: 1.0
+Version: 1.2.0
 Summary: Valediction is a convenience data validation package that allows generation, import, and constraint enforcement of user-defined data dictionaries against datasets.
 Author-email: Cai Davis <Cai.Davis@uhs.nhs.uk>
 Requires-Python: <4.0,>=3.11