valediction-1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- valediction/__init__.py +8 -0
- valediction/convenience.py +50 -0
- valediction/data_types/__init__.py +0 -0
- valediction/data_types/data_type_helpers.py +75 -0
- valediction/data_types/data_types.py +58 -0
- valediction/data_types/type_inference.py +541 -0
- valediction/datasets/__init__.py +0 -0
- valediction/datasets/datasets.py +870 -0
- valediction/datasets/datasets_helpers.py +46 -0
- valediction/demo/DEMO - Data Dictionary.xlsx +0 -0
- valediction/demo/DEMOGRAPHICS.csv +101 -0
- valediction/demo/DIAGNOSES.csv +650 -0
- valediction/demo/LAB_TESTS.csv +1001 -0
- valediction/demo/VITALS.csv +1001 -0
- valediction/demo/__init__.py +6 -0
- valediction/demo/demo_dictionary.py +129 -0
- valediction/dictionary/__init__.py +0 -0
- valediction/dictionary/exporting.py +501 -0
- valediction/dictionary/exporting_helpers.py +371 -0
- valediction/dictionary/generation.py +357 -0
- valediction/dictionary/helpers.py +174 -0
- valediction/dictionary/importing.py +494 -0
- valediction/dictionary/integrity.py +37 -0
- valediction/dictionary/model.py +582 -0
- valediction/dictionary/template/PROJECT - Data Dictionary.xltx +0 -0
- valediction/exceptions.py +22 -0
- valediction/integrity.py +97 -0
- valediction/io/__init__.py +0 -0
- valediction/io/csv_readers.py +307 -0
- valediction/progress.py +206 -0
- valediction/support.py +72 -0
- valediction/validation/__init__.py +0 -0
- valediction/validation/helpers.py +315 -0
- valediction/validation/issues.py +280 -0
- valediction/validation/validation.py +598 -0
- valediction-1.0.0.dist-info/METADATA +15 -0
- valediction-1.0.0.dist-info/RECORD +38 -0
- valediction-1.0.0.dist-info/WHEEL +4 -0

valediction/validation/helpers.py (new file, hunk `@@ -0,0 +1,315 @@`):

```python
from __future__ import annotations

import re
from typing import List

from numpy import flatnonzero, round
from pandas import NA, DataFrame, Series, to_datetime, to_numeric
from pandas.util import hash_pandas_object

from valediction.data_types.data_types import DataType
from valediction.dictionary.model import Table
from valediction.integrity import get_config
from valediction.validation.issues import Range


# Remove Nulls
def _set_nulls(df: DataFrame) -> DataFrame:
    """Replace configured null tokens (case-insensitive) with NA in string/object
    columns."""
    null_values = get_config().null_values
    token_set = {str(t).strip().casefold() for t in null_values}
    columns = df.select_dtypes(include=["string", "object"]).columns
    for column in columns:
        series = df[column]
        mask = series.notna() & series.str.casefold().isin(token_set)
        df[column] = series.mask(mask, NA)

    return df


# Check for Nulls
def _column_has_values(column: Series):
    return column.notna().any()


# Range Setting
def mask_to_ranges(mask: Series, start_row: int) -> list[Range]:
    """Convert a boolean mask (over the current chunk) into 0-based contiguous
    ranges."""
    idx = flatnonzero(mask.to_numpy())
    if idx.size == 0:
        return []
    ranges: List[Range] = []
    run_start = idx[0]
    prev = idx[0]
    for i in idx[1:]:
        if i == prev + 1:
            prev = i
            continue
        ranges.append(Range(start=start_row + run_start, end=start_row + prev))
        run_start = prev = i
    ranges.append(Range(start=start_row + run_start, end=start_row + prev))
    return ranges


# PK Hashes
def create_pk_hashes(
    df_primaries: DataFrame,
) -> Series:
    """For PK hash collision assessment, compute a deterministic 128-bit hash per row
    over the provided PK columns. This is created by computing two 64-bit hashes,
    forwards and backwards, and then combining them. Rows with any NA across PK
    components are returned as None - flagging these for NULL violations.

    Args:
        df_primaries (DataFrame): DataFrame containing only the primary-key columns.

    Returns:
        Series: Pandas Series with hashes or Nulls.
    """
    hash_col_name = "PK_HASH"
    if df_primaries.empty or df_primaries.shape[1] == 0:
        return Series([], dtype=object, name=hash_col_name)

    # Any NA in row => invalid PK -> None
    null_rows = df_primaries.isna().any(axis=1)

    # First Hash: columns in their given order
    hash_1 = hash_pandas_object(df_primaries, index=False)  # uint64

    # Second Hash: columns reversed if multi-column, else salted
    if df_primaries.shape[1] > 1:
        df_primaries_backwards = df_primaries.iloc[:, ::-1]
    else:
        s = df_primaries.iloc[:, 0]
        salt = Series(["§"] * len(s), index=s.index, dtype="string")
        df_primaries_backwards = DataFrame(
            {
                "_a": s,
                "_b": s.str.cat(salt),
            }
        )

    hash_2 = hash_pandas_object(df_primaries_backwards, index=False)  # uint64

    a1 = hash_1.to_numpy(dtype="uint64", copy=False).astype(object)
    a2 = hash_2.to_numpy(dtype="uint64", copy=False).astype(object)

    combined = (a1 << 64) | a2  # Python ints: 128-bit result
    hashes = Series(
        combined, index=df_primaries.index, name=hash_col_name, dtype=object
    )
    hashes[null_rows] = None
    return hashes


def compute_pk_masks(pk_hashes: Series, seen_hashes: set[int]) -> dict[str, Series]:
    """Compute masks for PK hashes that are either null or have been seen before.

    Args:
        pk_hashes (Series): Series of PK hashes.
        seen_hashes (set[int]): Set of hashes that have been seen before.

    Returns:
        dict[str, Series]: Dictionary of boolean masks:
            - null: rows where PK is None / NA
            - in_chunk_collision: rows that are part of a within-chunk duplicate group
            - cross_chunk_collision: rows whose hash was seen in previous chunks
              (excluding in-chunk duplicates)
            - first_appearance: rows that are the first occurrence of a hash
    """
    s = pk_hashes
    null = s.isna()
    valid = ~null
    if not valid.any():
        # empty/default masks
        return {
            "null": null,
            "in_chunk_collision": null,
            "cross_chunk_collision": null,
            "first_appearance": null,
        }

    s_valid = s[valid]

    # Within-chunk duplicate membership (mark *all* members)
    dup_local = s_valid.duplicated(keep=False)

    # Across-chunk duplicates (exclude those already in a local dup group)
    seen_local = s_valid.isin(seen_hashes)
    cross_local = seen_local & ~dup_local

    # New first occurrences in this chunk (first time we see the hash here,
    # and not seen before)
    first_local = ~s_valid.duplicated(keep="first")
    new_first_local = first_local & ~seen_local

    # Lift back to full-length masks
    in_chunk_collision = valid.copy()
    in_chunk_collision.loc[valid] = dup_local

    cross_chunk_collision = valid.copy()
    cross_chunk_collision.loc[valid] = cross_local

    first_appearance = valid.copy()
    first_appearance.loc[valid] = new_first_local

    return {
        "null": null,
        "in_chunk_collision": in_chunk_collision,
        "cross_chunk_collision": cross_chunk_collision,
        "first_appearance": first_appearance,
    }


# PK Whitespace
def pk_contains_whitespace_mask(df_primaries: DataFrame) -> Series:
    """True for rows where any primary-key component contains whitespace."""
    if df_primaries.empty or df_primaries.shape[1] == 0:
        return Series(False, index=df_primaries.index)

    col_masks = df_primaries.apply(lambda s: s.str.contains(r"\s", na=False))

    return col_masks.any(axis=1)


# Data Type Checks: Numeric
def invalid_mask_integer(column: Series, *, tolerance: float = 1e-12) -> Series:
    """True where a non-null value cannot be treated as an integer without losing a
    non-zero remainder.

    Accepts scientific notation (e.g. '1e2').
    """
    notnull = column.notna()
    numeric = to_numeric(column, errors="coerce")
    invalid = notnull & numeric.isna()

    conversion_mask = notnull & numeric.notna()
    if conversion_mask.any():
        vals = numeric[conversion_mask].astype("float64")
        frac = (vals - round(vals)).abs()
        invalid_conv = frac > tolerance
        invalid = invalid.copy()
        invalid.loc[conversion_mask] = invalid_conv.values
    return invalid


def invalid_mask_float(column: Series) -> Series:
    """True where a non-null value is not convertible to a number."""
    notnull = column.notna()
    num = to_numeric(column, errors="coerce")
    return notnull & num.isna()


# Data Type Checks: Date
def _allowed_formats_for(dtype: DataType) -> list[str]:
    """Return the list of formats from Config.date_formats allowed for the given
    DataType."""
    config = get_config()
    return [fmt for fmt, data_type in config.date_formats.items() if data_type == dtype]


def _parse_ok_any(column: Series, formats: list[str]) -> Series:
    """Vectorised check: True for values that parse under at least one of
    `formats`."""
    if not formats:
        return Series(False, index=column.index)
    ok_any = Series(False, index=column.index)
    for fmt in formats:
        parsed = to_datetime(column, format=fmt, errors="coerce", utc=False)
        ok_any = ok_any | parsed.notna()
    return ok_any


def invalid_mask_date(column: Series, fmt: str | None) -> Series:
    """True where a non-null value does not parse as a date, or parses with a
    non-zero time component."""
    notnull = column.notna()

    if fmt:
        parsed = to_datetime(column, format=fmt, errors="coerce", utc=False)
        ok = parsed.notna()
        has_time = ok & (
            (parsed.dt.hour != 0)
            | (parsed.dt.minute != 0)
            | (parsed.dt.second != 0)
            | (parsed.dt.microsecond != 0)
        )
        return notnull & (~ok | has_time)

    allowed = _allowed_formats_for(DataType.DATE)
    ok_any = _parse_ok_any(column, allowed)
    return notnull & (~ok_any)


def invalid_mask_datetime(column: Series, fmt: str | None) -> Series:
    """True where a non-null value does not parse under `fmt`, or under any
    configured DATETIME format when `fmt` is None."""
    notnull = column.notna()

    if fmt:
        parsed = to_datetime(column, format=fmt, errors="coerce", utc=False)
        ok = parsed.notna()
        return notnull & (~ok)

    allowed = _allowed_formats_for(DataType.DATETIME)
    ok_any = _parse_ok_any(column, allowed)
    return notnull & (~ok_any)


# Other Text Checks
def invalid_mask_text_too_long(column: Series, max_len: int | None) -> Series:
    if max_len is None or max_len <= 0:
        # treat as unlimited length
        return Series(False, index=column.index)

    notnull = column.notna()
    lens = column.str.len()
    return notnull & (lens > max_len)


def invalid_mask_text_forbidden_characters(column: Series) -> Series:
    forbidden = get_config().forbidden_characters
    if not forbidden:
        return Series(False, index=column.index)

    pattern = "[" + re.escape("".join(forbidden)) + "]"
    notnull = column.notna()
    has_forbidden = column.str.contains(pattern, regex=True, na=False)
    return notnull & has_forbidden


# Apply Data Types
def apply_data_types(df: DataFrame, table_dictionary: Table) -> DataFrame:
    """Cast a validated DataFrame's columns to the pandas dtypes implied by the
    table's dictionary entry."""
    # name -> column object
    column_dictionary = {column.name: column for column in table_dictionary}

    for col in df.columns:
        data_type = column_dictionary.get(col).data_type
        datetime_format = column_dictionary.get(col).datetime_format

        if data_type in (DataType.TEXT, DataType.FILE):
            df[col] = df[col].astype("string")

        elif data_type == DataType.INTEGER:
            # Accepts '12', '12.0', '1e2' etc.; validation guarantees integer-equivalent
            nums = to_numeric(df[col], errors="raise")
            df[col] = nums.round().astype("Int64")

        elif data_type == DataType.FLOAT:
            nums = to_numeric(df[col], errors="raise")
            df[col] = nums.astype("Float64")

        elif data_type == DataType.DATE:
            dtv = to_datetime(
                df[col], format=datetime_format, errors="raise", utc=False
            )
            df[col] = dtv.dt.normalize()  # midnight

        elif data_type == DataType.DATETIME:
            df[col] = to_datetime(
                df[col], format=datetime_format, errors="raise", utc=False
            )

        else:
            # Fallback: keep as string
            df[col] = df[col].astype("string")

    return df
```
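To make the chunked primary-key flow concrete, here is a minimal sketch of how `create_pk_hashes`, `compute_pk_masks`, and `mask_to_ranges` appear to compose. It assumes the wheel is installed and that these helpers are importable from `valediction.validation.helpers` as the file list suggests; the toy column names and the public-API status of these functions are assumptions, not confirmed by this diff.

```python
from pandas import DataFrame

# Hypothetical usage; import path follows the package layout shown above
from valediction.validation.helpers import (
    compute_pk_masks,
    create_pk_hashes,
    mask_to_ranges,
)

# A toy chunk with a composite primary key: one duplicate pair, one NULL
chunk = DataFrame(
    {
        "PATIENT_ID": ["p1", "p2", "p2", None],
        "VISIT_ID": ["v1", "v2", "v2", "v4"],
    }
)

# Deterministic 128-bit hash per PK row; the row with a NULL component -> None
hashes = create_pk_hashes(chunk[["PATIENT_ID", "VISIT_ID"]])

# Classify rows against hashes remembered from earlier chunks (none yet)
masks = compute_pk_masks(hashes, seen_hashes=set())
print(masks["null"].tolist())                # [False, False, False, True]
print(masks["in_chunk_collision"].tolist())  # [False, True, True, False]

# Turn a boolean chunk mask into absolute row Ranges via the chunk's start row
ranges = mask_to_ranges(masks["in_chunk_collision"], start_row=1000)
print([(r.start, r.end) for r in ranges])    # [(1001, 1002)]
```

The `start_row` offset is what lets per-chunk masks be reported as absolute row ranges when a CSV is streamed chunk by chunk.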
valediction/validation/issues.py (new file, hunk `@@ -0,0 +1,280 @@`):

```python
from __future__ import annotations

from dataclasses import dataclass, field
from enum import Enum
from typing import Iterable, Iterator, Optional

from pandas import DataFrame, concat

from valediction.datasets.datasets_helpers import DatasetItemLike
from valediction.io.csv_readers import CsvReadConfig, read_csv_ranges
from valediction.support import _normalise_name, list_as_bullets


class IssueType(Enum):
    # Column / schema
    MISSING_COLUMN = "MissingColumn"
    EXTRA_COLUMN = "ExtraColumn"
    FULLY_NULL_COLUMN = "FullyNullColumn"

    # Keys
    PK_NULL = "PrimaryKeyNull"
    PK_COLLISION = "PrimaryKeyCollision"
    PK_WHITESPACE = "PrimaryKeyContainsWhitespace"

    # Types / content
    TYPE_MISMATCH = "TypeMismatch"
    TEXT_TOO_LONG = "TextTooLong"
    FORBIDDEN_CHARACTER = "ForbiddenCharacter"


# Settings
APPLIES_WHOLE_COLUMN = {
    IssueType.MISSING_COLUMN,
    IssueType.EXTRA_COLUMN,
    IssueType.FULLY_NULL_COLUMN,
}

PRIMARY_KEY_ISSUES = {
    IssueType.PK_NULL,
    IssueType.PK_COLLISION,
    IssueType.PK_WHITESPACE,
}


@dataclass
class Range:
    start: int
    end: int

    def __init__(self, start: int, end: int):
        self.start: int = int(start)
        self.end: int = int(end)


@dataclass
class Issue:
    """
    Summary:
        Dataclass representing an issue in the dataset.

    Attributes:
        type (IssueType): type of issue
        table (str): name of the table where the issue was detected
        column (str | None): name of the column where the issue was detected,
            or None if not applicable
        ranges (list[Range]): list of contiguous ranges of rows where the issue
            was detected
        parent (DatasetItemLike | None): parent dataset item, or None if not applicable
    """

    type: IssueType
    table: str
    column: str | None
    ranges: list[Range] = field(default_factory=list)
    parent: DatasetItemLike | None = None

    # Magic
    def __repr__(self) -> str:
        column_part = f", column={self.column!r}" if self.column is not None else ""
        sum_ranges = sum(r.end - r.start + 1 for r in self.ranges)
        sum_range_part = f", total={sum_ranges}" if sum_ranges else ""
        return f"Issue(type={self.type.value!r}, table={self.table!r}{column_part}{sum_range_part})"

    # Methods
    def add_ranges(self, new_ranges: Iterable[Range]) -> None:
        """
        Summary:
            Merge new contiguous/overlapping ranges into self.ranges (kept sorted).

        Arguments:
            new_ranges (Iterable[Range]): new contiguous/overlapping ranges to be
                merged into self.ranges
        """
        all_ranges = self.ranges + list(new_ranges)
        if not all_ranges:
            self.ranges = []
            return
        all_ranges.sort(key=lambda r: (r.start, r.end))
        merged: list[Range] = []
        cur = all_ranges[0]
        for r in all_ranges[1:]:
            if r.start <= cur.end + 1:  # contiguous/overlap
                cur.end = max(cur.end, r.end)
            else:
                merged.append(cur)
                cur = r
        merged.append(cur)
        self.ranges = merged

    def inspect(
        self,
        additional_columns: bool | str | list[str] | None = None,
        chunk_size: int = 1_000_000,
        print_header: bool = True,
    ) -> DataFrame | None:
        """
        Summary:
            Inspect an issue in the dataset by returning a DataFrame containing
            the relevant values.

        Arguments:
            additional_columns (bool | str | list[str] | None): which additional
                columns to include in the DataFrame
                - if True, include all columns
                - if str or list[str], include only the specified columns
                - if None, do not include any additional columns
            chunk_size (int): the number of rows to read at a time when the
                source is a CSV file
            print_header (bool): whether to print the issue details as a header

        Returns:
            DataFrame | None: a DataFrame containing the relevant rows of the
                dataset, or None for whole-column issues

        Raises:
            ValueError: if the issue has no parent DatasetItem
        """
        # Guard
        if not self.parent:
            raise ValueError("Issue has no parent DatasetItem")
        header = self.__repr__() if print_header else ""
        # Not applicable
        if self.type in APPLIES_WHOLE_COLUMN:
            print(f"{header}: applies to whole column")
            return None

        # Column Inclusion
        if print_header:
            print(f"{header}:")
        if additional_columns is True:
            columns = None
        else:
            additional_columns = (
                [additional_columns]
                if isinstance(additional_columns, str)
                else additional_columns
            )
            base = (
                set(self.parent.primary_keys)
                if self.type in PRIMARY_KEY_ISSUES
                else {self.column}
            )
            base |= set(additional_columns or [])
            base.discard(None)
            columns = list(base) if base else None

        if not self.ranges:
            return DataFrame(columns=columns) if columns else DataFrame()

        spans: list[tuple[int, int]] = [(r.start, r.end) for r in self.ranges]

        # DataFrame source: slice directly
        if self.parent.is_dataframe:
            df: DataFrame = self.parent.data
            n = len(df)
            if n == 0:
                return DataFrame(columns=columns) if columns else DataFrame()

            # Clamp spans to df length; build parts
            parts: list[DataFrame] = []
            for s, e in spans:
                if s > e or s >= n or e < 0:
                    continue
                lo = max(0, s)
                hi = min(n - 1, e)
                part: DataFrame = df.iloc[lo : hi + 1]
                parts.append(part if columns is None else part.loc[:, columns])

            if not parts:
                return DataFrame(columns=columns) if columns else DataFrame()
            return concat(parts, axis=0, ignore_index=False)

        # CSV source: delegate reading to csv_readers
        if self.parent.is_path:
            path = self.parent.data
            cfg = CsvReadConfig(usecols=columns)
            out = read_csv_ranges(path, spans, cfg=cfg, chunk_size=chunk_size)

            return out if columns is None else out.loc[:, columns]


@dataclass
class Issues:
    """List-like container holding Issues with case-insensitive get and range
    merging."""

    # Magic
    def __init__(self) -> None:
        self._items: list[Issue] = []
        self._index: dict[
            tuple[str, Optional[str], IssueType], Issue
        ] = {}  # (table, column, issue_type)

    def __iter__(self) -> Iterator[Issue]:
        return iter(self._items)

    def __len__(self) -> int:
        return len(self._items)

    def __bool__(self) -> bool:
        return bool(self._items)

    def __getitem__(self, idx) -> Issue | list[Issue]:
        return self._items[idx]

    def __repr__(self) -> str:
        if not self._items:
            return "Issues([])"
        issues = list_as_bullets(elements=[repr(item) for item in self._items])
        return f"Issues({issues}\n)"

    # Methods
    def add(
        self,
        issue_type: IssueType,
        table: str,
        column: str | None = None,
        ranges: Iterable[Range] | None = None,
        parent: DatasetItemLike | None = None,
    ) -> Issue:
        key = (
            _normalise_name(table),
            _normalise_name(column) if column is not None else None,
            issue_type,
        )
        issue = self._index.get(key)
        if issue is None:
            issue = Issue(type=issue_type, table=table, column=column, parent=parent)
            self._items.append(issue)
            self._index[key] = issue
        if ranges:
            issue.add_ranges(ranges)
        return issue

    def get(
        self,
        table: str,
        column: str | None = None,
        issue_type: IssueType | None = None,
    ) -> list[Issue]:
        """Case-insensitive filter; `column` and `issue_type` may be None to act
        as wildcards."""
        table = _normalise_name(table)
        column = _normalise_name(column) if column is not None else None
        output: list[Issue] = []
        if issue_type is not None:
            # direct index lookup where possible
            key = (table, column, issue_type)
            hit = self._index.get(key)
            if hit:
                output.append(hit)
            return output

        # otherwise scan (still cheap; we maintain a compact list)
        for item in self._items:
            if _normalise_name(item.table) != table:
                continue
            if column is not None and (_normalise_name(item.column) or "") != column:
                continue
            output.append(item)
        return output

    def extend(self, issues: Issues) -> None:
        for issue in issues:
            self.add(issue.type, issue.table, issue.column, issue.ranges, issue.parent)