PyPI - dfguard - Versions diffs - 0.1.0__py3-none-any.whl - Mend

dfguard 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

dfguard/__init__.py +8 -0
dfguard/py.typed +0 -0
dfguard/pyspark/__init__.py +114 -0
dfguard/pyspark/_enforcement.py +250 -0
dfguard/pyspark/_inference.py +77 -0
dfguard/pyspark/_nullable.py +49 -0
dfguard/pyspark/coercion.py +203 -0
dfguard/pyspark/dataset.py +696 -0
dfguard/pyspark/decorators.py +86 -0
dfguard/pyspark/exceptions.py +55 -0
dfguard/pyspark/history.py +139 -0
dfguard/pyspark/schema.py +418 -0
dfguard/pyspark/types.py +107 -0
dfguard-0.1.0.dist-info/METADATA +415 -0
dfguard-0.1.0.dist-info/RECORD +17 -0
dfguard-0.1.0.dist-info/WHEEL +4 -0
dfguard-0.1.0.dist-info/licenses/LICENSE +147 -0

dfguard/__init__.py ADDED Viewed

@@ -0,0 +1,8 @@
+"""
+dfguard: Type-safe dataframe operations for PySpark, Pandas, and Polars.
+Backends are imported lazily so installing dfguard does not require
+any dataframe library to be present.
+"""
+# Package version string; matches the 0.1.0 wheel this module ships in.
+__version__ = "0.1.0"

dfguard/py.typed ADDED Viewed

File without changes

dfguard/pyspark/__init__.py ADDED Viewed

@@ -0,0 +1,114 @@
+"""
+dfguard.pyspark: runtime schema enforcement for PySpark DataFrames.
+Two-line setup for packages (Kedro, Airflow, any importable module)
+-------------------------------------------------------------------
+::
+    from dfguard.pyspark import schema_of, arm
+    RawSchema = schema_of(raw_df)
+    def enrich(df: RawSchema): ...
+    def clean(df: RawSchema): ...
+    arm()                              # enforces every annotated function above
+For scripts and notebooks use ``@enforce`` per function::
+    from dfguard.pyspark import schema_of, enforce
+    RawSchema = schema_of(raw_df)
+    @enforce
+    def enrich(df: RawSchema): ...
+Declaring schemas upfront (no live DataFrame required)
+------------------------------------------------------
+::
+    from pyspark.sql import types as T
+    from dfguard.pyspark import SparkSchema, Optional, enforce
+    class OrderSchema(SparkSchema):
+        order_id: T.LongType()
+        amount:   T.DoubleType()
+        tags:     T.ArrayType(T.StringType())
+        address:  AddressSchema            # nested struct
+        zip:      Optional[T.StringType()] # nullable
+    @enforce
+    def process(df: OrderSchema): ...      # subset matching: df must have these fields
+Public API
+----------
+``schema_of(df)``
+    Capture ``df``'s schema as a type.  Exact match required.
+``dataset(df)``
+    Wrap ``df`` in a tracked instance.  Every ``withColumn``, ``drop``,
+    ``select``, etc. is recorded in ``schema_history``.
+``arm()``
+    Apply schema enforcement to every public, schema-annotated function in
+    the calling package and its submodules.  Call after all function
+    definitions.
+``disarm()``
+    Turn off all enforcement globally. Call ``arm()`` to re-enable.
+``enforce``
+    Per-function decorator.  Only checks schema-annotated args.
+``SparkSchema``
+    Declare a schema as a class using real PySpark types.
+    Subset matching: df must have at least the declared fields.
+``check_schema(schema)`` / ``typed_transform(input_schema, output_schema)``
+    Function decorators for explicit input/output validation.
+"""
+# Import guard: this backend cannot work without PySpark, so a missing
+# installation is re-raised as an ImportError carrying an install hint.
+# ``from _e`` keeps the original error attached as the cause.
+try:
+    import pyspark  # noqa: F401
+except ImportError as _e:
+    raise ImportError(
+        "dfguard's PySpark integration requires PySpark. "
+        "Install it with: pip install 'dfguard[pyspark]'"
+    ) from _e
+from dfguard.pyspark._enforcement import arm, disarm, enforce
+from dfguard.pyspark._inference import infer_schema
+from dfguard.pyspark._nullable import Optional
+from dfguard.pyspark.coercion import result_type
+from dfguard.pyspark.dataset import TypedGroupedData, _TypedDatasetBase, schema_of
+from dfguard.pyspark.dataset import _make_dataset as dataset
+from dfguard.pyspark.decorators import check_schema, typed_transform
+from dfguard.pyspark.exceptions import (
+    ColumnNotFoundError,
+    DfTypesError,
+    SchemaValidationError,
+    TypeAnnotationError,
+)
+from dfguard.pyspark.history import SchemaChange, SchemaHistory
+from dfguard.pyspark.schema import SparkSchema  # noqa: E402
+# Names exported by ``from dfguard.pyspark import *``.  The list deliberately
+# includes the underscore-prefixed ``_TypedDatasetBase``, which a star-import
+# would skip if it were not named here.
+__all__ = [
+    "schema_of",
+    "dataset",
+    "enforce",
+    "arm",
+    "disarm",
+    "_TypedDatasetBase",
+    "TypedGroupedData",
+    "SparkSchema",
+    "typed_transform",
+    "check_schema",
+    "SchemaChange",
+    "SchemaHistory",
+    "DfTypesError",
+    "SchemaValidationError",
+    "TypeAnnotationError",
+    "ColumnNotFoundError",
+    "infer_schema",
+    "result_type",
+    "Optional",
+]

dfguard/pyspark/_enforcement.py ADDED Viewed

@@ -0,0 +1,250 @@
+"""Schema enforcement without touching non-schema arguments."""
+from __future__ import annotations
+import functools
+import importlib
+import inspect
+import pkgutil
+import types
+import warnings
+from collections.abc import Callable
+from typing import Any, TypeVar, overload
+# Callable type variable: lets ``enforce`` be typed as returning the same
+# callable type it was given.
+F = TypeVar("F", bound=Callable[..., Any])
+# Module-level switches shared by arm(), disarm() and every enforcing wrapper.
+_ENABLED = True   # checked on each wrapped call; disarm() clears it, arm() sets it
+_ARMED   = False  # set by the first arm() call so later calls skip the module walk
+_SUBSET  = True   # global subset default set by arm(subset=...); enforce(subset=...) overrides it
+_UNSET = object()  # sentinel: "no function-level override, use the global _SUBSET"
+def _is_schema_type(annotation: Any) -> bool:
+    """
+    Return True when *annotation* participates in dfguard enforcement.
+    Any class that exposes a ``_fg_check(value, subset) -> bool`` classmethod
+    is treated as a schema type. This is the extension point: new DataFrame
+    backends (pandas, polars, …) just need to add ``_fg_check`` to their
+    schema class; no changes to enforcement code required.
+    """
+    return isinstance(annotation, type) and callable(getattr(annotation, "_fg_check", None))
+def _schema_matches(value: Any, annotation: type, subset: bool) -> bool:
+    """
+    Check whether *value* satisfies *annotation*.
+    Delegates to ``annotation._fg_check(value, subset)`` when available.
+    The meaning of *subset* is left to each schema type:
+    - ``schema_of`` types (``_TypedDatasetBase``) ignore *subset*: always exact.
+    - ``SparkSchema`` types respect *subset*:
+        - ``True``: extra columns in *value* are fine.
+        - ``False``: *value* must have exactly the declared columns, nothing extra.
+    """
+    checker = getattr(annotation, "_fg_check", None)
+    if callable(checker):
+        return bool(checker(value, subset))
+    return isinstance(value, annotation)
+def _arm_module_dict(module_dict: dict[str, Any], *, subset: Any) -> None:
+    """Patch all public functions in a module's __dict__ with enforce()."""
+    for name, obj in list(module_dict.items()):
+        if name.startswith("_"):
+            continue
+        if isinstance(obj, types.FunctionType):
+            # Pass _UNSET so each wrapped function reads _SUBSET at call-time,
+            # unless overridden at decoration time by the caller.
+            wrapped = enforce(subset=subset)(obj)
+            if wrapped is not obj:
+                module_dict[name] = wrapped
+def arm(
+    module: Any = None,
+    *,
+    package: str | None = None,
+    subset: bool = True,
+) -> None:
+    """
+    Arm the entire calling package and set the global subset default.
+    Call once from your entry point, ``__init__.py``, or ``settings.py`` (Kedro)::
+        import dfguard.pyspark as dfg
+        dfg.arm()                # subset=True (default): extra columns are fine
+        dfg.arm(subset=False)    # exact match: no extra columns allowed anywhere
+    The ``subset`` value becomes the global default. Individual functions decorated
+    with ``@dfg.enforce(subset=...)`` override it for that function only.
+    If called when already armed, re-enables enforcement (sets ``_ENABLED = True``)
+    without re-walking the package.
+    **Specific module object**::
+        dfg.arm(my_module)
+    **Explicit package name**::
+        dfg.arm(package="my_pipeline.nodes")
+    """
+    global _SUBSET, _ENABLED, _ARMED
+    _SUBSET = subset
+    _ENABLED = True
+    if _ARMED:
+        # Already armed: just re-enable and update subset, no re-walking needed.
+        return
+    _ARMED = True
+    if isinstance(module, types.ModuleType):
+        _arm_module_dict(vars(module), subset=_UNSET)
+        return
+    if package is None:
+        frame = inspect.currentframe()
+        if frame is None or frame.f_back is None:
+            return
+        caller_globals = frame.f_back.f_globals
+        package = caller_globals.get("__package__") or caller_globals.get("__name__", "")
+    if not package or package == "__main__":
+        warnings.warn(
+            "dfguard.pyspark.arm() called from __main__. "
+            "Use @dfguard.pyspark.enforce on individual functions instead.",
+            stacklevel=2,
+        )
+        return
+    pkg = importlib.import_module(package)
+    _arm_module_dict(vars(pkg), subset=_UNSET)
+    pkg_path = getattr(pkg, "__path__", None)
+    if pkg_path is not None:
+        for _, mod_name, _ in pkgutil.walk_packages(pkg_path, prefix=package + "."):
+            try:
+                mod = importlib.import_module(mod_name)
+                _arm_module_dict(vars(mod), subset=_UNSET)
+            except Exception as exc:
+                warnings.warn(
+                    f"dfguard: skipped arming module '{mod_name}': {exc}",
+                    stacklevel=2,
+                )
+def disarm() -> None:
+    """Turn off all enforcement globally. Call arm() to re-enable."""
+    global _ENABLED
+    _ENABLED = False
+@overload
+def enforce(func: F) -> F: ...
+@overload
+def enforce(func: None = None, *, subset: bool = ...) -> Callable[[F], F]: ...
+def enforce(
+    func: F | None = None,
+    *,
+    subset: Any = _UNSET,
+) -> F | Callable[[F], F]:
+    """
+    Validate schema annotations on DataFrame arguments.
+    Only intercepts parameters annotated with a ``dfg.schema_of`` type or a
+    ``dfg.SparkSchema`` subclass. All other arguments are left completely alone.
+    **Default**: inherits the global ``subset`` set by ``dfg.arm()``:
+        @dfg.enforce
+        def process(df: OrderSchema, label: str): ...
+    **subset=True**: extra columns in the DataFrame are fine (overrides global)::
+        @dfg.enforce(subset=True)
+        def process(df: OrderSchema): ...
+    **subset=False**: DataFrame must match the schema exactly (overrides global)::
+        @dfg.enforce(subset=False)
+        def process(df: OrderSchema): ...
+    """
+    # Capture the function-level subset at decoration time.
+    # If _UNSET, the wrapper reads _SUBSET at call-time (respects dfg.arm changes).
+    subset_override = subset
+    def decorator(f: F) -> F:
+        params = inspect.signature(f).parameters
+        schema_params = [
+            (name, param.annotation)
+            for name, param in params.items()
+            if param.annotation is not inspect.Parameter.empty
+            and _is_schema_type(param.annotation)
+        ]
+        if not schema_params:
+            return f  # nothing schema-typed, zero overhead
+        sig = inspect.signature(f)
+        @functools.wraps(f)
+        def wrapper(*args: Any, **kwargs: Any) -> Any:
+            if not _ENABLED:
+                return f(*args, **kwargs)
+            # Function-level subset wins; fall back to global if not set.
+            effective_subset = _SUBSET if subset_override is _UNSET else subset_override
+            bound = sig.bind(*args, **kwargs)
+            bound.apply_defaults()
+            for param_name, annotation in schema_params:
+                if param_name not in bound.arguments:
+                    continue
+                value = bound.arguments[param_name]
+                if not _schema_matches(value, annotation, subset=effective_subset):
+                    _raise_schema_mismatch(f.__name__, param_name, annotation, value)
+            return f(*args, **kwargs)
+        return wrapper  # type: ignore[return-value]
+    if func is not None:
+        return decorator(func)
+    return decorator
+def _raise_schema_mismatch(
+    func_name: str,
+    param_name: str,
+    annotation: type,
+    value: Any,
+) -> None:
+    actual_schema = getattr(value, "schema", None)
+    if actual_schema is not None:
+        actual_str = ", ".join(
+            f"{f.name}:{f.dataType.simpleString()}" for f in actual_schema.fields
+        )
+    else:
+        actual_str = type(value).__name__
+    expected_schema = getattr(annotation, "_expected_schema", None)
+    if expected_schema is not None:
+        expected_str = ", ".join(
+            f"{f.name}:{f.dataType.simpleString()}" for f in expected_schema.fields
+        )
+    else:
+        expected_str = getattr(annotation, "__name__", repr(annotation))
+    raise TypeError(
+        f"Schema mismatch in {func_name}() argument '{param_name}':\n"
+        f"  expected: {expected_str}\n"
+        f"  received: {actual_str}"
+    )

dfguard/pyspark/_inference.py ADDED Viewed

@@ -0,0 +1,77 @@
+"""
+dfguard.pyspark._inference
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+``infer_schema(df)``: inspect a live DataFrame and generate a SparkSchema
+subclass with the correct types, including deeply nested structs.
+The generated class can immediately be used for validation::
+    schema = infer_schema(df, name="OrderSchema")
+    print(schema.to_code())    # copy-paste into your codebase
+    ds.validate(schema)
+Nested StructTypes are emitted as separate named inner classes.
+"""
+from __future__ import annotations
+from typing import Any
+def infer_schema(df: Any, name: str = "InferredSchema") -> type:
+    """
+    Inspect ``df`` (a pyspark.sql.DataFrame or dataset wrapper) and return
+    a SparkSchema subclass that exactly matches its current schema.
+    Also prints the Python code so developers can copy it into their codebase.
+    Parameters
+    ----------
+    df:
+        A live ``pyspark.sql.DataFrame`` or ``dataset`` wrapper.
+    name:
+        Name to give the generated class.
+    Returns
+    -------
+    type[SparkSchema]
+        A fully usable SparkSchema subclass.
+    """
+    from dfguard.pyspark.schema import SparkSchema
+    struct = df.schema
+    schema_class = SparkSchema.from_struct(struct, name=name)
+    print(_render_code(schema_class, name))
+    return schema_class
+def _render_code(schema_class: Any, name: str) -> str:
+    """Recursively render a SparkSchema (and any nested schemas) as Python source."""
+    lines: list[str] = []
+    nested_lines: list[str] = []
+    # Collect any nested SparkSchema classes stored as class attributes
+    from dfguard.pyspark.schema import SparkSchema
+    for attr_name, attr_val in vars(schema_class).items():
+        if (
+            isinstance(attr_val, type)
+            and issubclass(attr_val, SparkSchema)
+            and attr_val is not SparkSchema
+        ):
+            nested_lines.append(_render_code(attr_val, attr_val.__name__))
+            nested_lines.append("")
+    if nested_lines:
+        lines.extend(nested_lines)
+    lines.append(f"class {name}(SparkSchema):")
+    if not schema_class._schema_fields:
+        lines.append("    pass")
+    else:
+        from dfguard.pyspark.schema import _annotation_to_str
+        for col_name, annotation in schema_class._schema_fields.items():
+            ann_str = _annotation_to_str(annotation)
+            lines.append(f"    {col_name}: {ann_str}")
+    return "\n".join(lines)

dfguard/pyspark/_nullable.py ADDED Viewed

@@ -0,0 +1,49 @@
+"""
+Nullable field marker for SparkSchema.
+``typing.Optional[T.StringType()]`` raises TypeError on Python 3.10 because
+PySpark DataType instances are not callable classes, and Python's typing module
+validates this. Python 3.11+ relaxed the check.
+This module provides a drop-in replacement that works on Python 3.10+.
+"""
+from __future__ import annotations
+from typing import Any
+class _NullableAnnotation:
+    """Wraps a DataType annotation to mark the field as nullable."""
+    __slots__ = ("inner",)
+    def __init__(self, inner: Any) -> None:
+        self.inner = inner
+    def __class_getitem__(cls, item: Any) -> _NullableAnnotation:
+        return cls(item)
+    def __repr__(self) -> str:
+        return f"Optional[{self.inner!r}]"
+    def __eq__(self, other: object) -> bool:
+        return isinstance(other, _NullableAnnotation) and self.inner == other.inner
+    def __hash__(self) -> int:
+        try:
+            return hash(("_NullableAnnotation", self.inner))
+        except TypeError:
+            return id(self)
+#: Drop-in for ``typing.Optional`` that works on Python 3.10+ with PySpark DataType fields.
+#:
+#: Usage::
+#:
+#:     from pyspark.sql import types as T
+#:     from dfguard.pyspark import Optional, SparkSchema
+#:
+#:     class OrderSchema(SparkSchema):
+#:         order_id: T.LongType()
+#:         zip:      Optional[T.StringType()]  # nullable field
+Optional = _NullableAnnotation

dfguard/pyspark/coercion.py ADDED Viewed

@@ -0,0 +1,203 @@
+"""
+Type coercion rules for PySpark, implemented in pure Python.
+This mirrors Spark's Catalyst TypeCoercion and DecimalPrecision rules so that
+dfguard can resolve derived column types without a running Spark session.
+Rules source: org.apache.spark.sql.catalyst.analysis.TypeCoercion
+              org.apache.spark.sql.catalyst.analysis.DecimalPrecision
+"""
+from __future__ import annotations
+from typing import Any
+# ---------------------------------------------------------------------------
+# Integer types expressed as Decimal equivalents (precision, scale=0)
+# ---------------------------------------------------------------------------
+# (precision, scale) used when an integral type takes part in Decimal
+# arithmetic; keyed by DataType class name.
+_INT_AS_DECIMAL = {
+    "ByteType":    (3,  0),
+    "ShortType":   (5,  0),
+    "IntegerType": (10, 0),
+    "LongType":    (20, 0),
+}
+# Numeric widening rank (higher = wider), keyed by DataType class name.
+# FloatType and DoubleType rank above every integral type, so an integral
+# operand never outranks a fractional one.  DecimalType is not ranked here.
+_NUMERIC_RANK: dict[str, int] = {
+    "ByteType":    1,
+    "ShortType":   2,
+    "IntegerType": 3,
+    "LongType":    4,
+    "FloatType":   5,
+    "DoubleType":  6,
+}
+def _type_name(dt: Any) -> str:
+    return type(dt).__name__
+def _is_integral(dt: Any) -> bool:
+    return _type_name(dt) in _INT_AS_DECIMAL
+def _is_fractional(dt: Any) -> bool:
+    return _type_name(dt) in ("FloatType", "DoubleType")
+def _is_numeric(dt: Any) -> bool:
+    return _is_integral(dt) or _is_fractional(dt) or _type_name(dt) == "DecimalType"
+# ---------------------------------------------------------------------------
+# Decimal precision arithmetic  (mirrors DecimalPrecision.scala)
+# ---------------------------------------------------------------------------
+def _decimal_for_integral(dt: Any) -> tuple[int, int]:
+    """Return (precision, scale) for an integer type treated as Decimal."""
+    return _INT_AS_DECIMAL[_type_name(dt)]
+def _decimal_add(p1: int, s1: int, p2: int, s2: int) -> tuple[int, int]:
+    """Decimal addition/subtraction result precision and scale."""
+    scale = max(s1, s2)
+    precision = max(p1 - s1, p2 - s2) + scale + 1
+    return precision, scale
+def _decimal_mul(p1: int, s1: int, p2: int, s2: int) -> tuple[int, int]:
+    """Decimal multiplication result precision and scale."""
+    scale = s1 + s2
+    precision = p1 + p2 + 1
+    return precision, scale
+def _decimal_div(p1: int, s1: int, p2: int, s2: int) -> tuple[int, int]:
+    """Decimal division result precision and scale."""
+    scale = max(6, s1 + p2 + 1)
+    precision = p1 - s1 + s2 + scale
+    return precision, scale
+def _make_decimal(p: int, s: int) -> Any:
+    from pyspark.sql import types as T
+    # Spark caps precision at 38
+    p = min(p, 38)
+    return T.DecimalType(p, s)
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+def coerce_add(left: Any, right: Any) -> Any:
+    """Return the result type of ``left + right`` (or ``left - right``)."""
+    return _coerce_binary(left, right, "add")
+def coerce_mul(left: Any, right: Any) -> Any:
+    """Return the result type of ``left * right``."""
+    return _coerce_binary(left, right, "mul")
+def coerce_div(left: Any, right: Any) -> Any:
+    """Return the result type of ``left / right``. Always Double in Spark."""
+    from pyspark.sql import types as T
+    if not (_is_numeric(left) and _is_numeric(right)):
+        raise TypeError(f"Cannot divide {_type_name(left)} by {_type_name(right)}")
+    return T.DoubleType()
+def coerce_mod(left: Any, right: Any) -> Any:
+    """Return the result type of ``left % right`` (modulo)."""
+    return _coerce_binary(left, right, "add")   # same widening rules as add
+def _coerce_binary(left: Any, right: Any, op: str) -> Any:
+    from pyspark.sql import types as T
+    ln, rn = _type_name(left), _type_name(right)
+    # Both are simple numeric (no Decimal involved)
+    if ln in _NUMERIC_RANK and rn in _NUMERIC_RANK:
+        # Float/Double always promotes to Double (Spark rule)
+        if _is_fractional(left) or _is_fractional(right):
+            return T.DoubleType()
+        # Pure integer widening
+        winner = left if _NUMERIC_RANK[ln] >= _NUMERIC_RANK[rn] else right
+        return type(winner)()
+    # At least one side is Decimal
+    if ln == "DecimalType" or rn == "DecimalType":
+        # Float or Double wins over Decimal
+        if _is_fractional(left) or _is_fractional(right):
+            return T.DoubleType()
+        # Normalise both sides to (precision, scale)
+        if ln == "DecimalType":
+            p1, s1 = left.precision, left.scale
+        else:
+            p1, s1 = _decimal_for_integral(left)
+        if rn == "DecimalType":
+            p2, s2 = right.precision, right.scale
+        else:
+            p2, s2 = _decimal_for_integral(right)
+        if op == "mul":
+            p, s = _decimal_mul(p1, s1, p2, s2)
+        else:
+            p, s = _decimal_add(p1, s1, p2, s2)
+        return _make_decimal(p, s)
+    raise TypeError(f"Cannot coerce {ln} and {rn} for operation '{op}'")
+def coerce_comparison(left: Any, right: Any) -> Any:
+    """Comparison (==, <, >, <=, >=) always returns BooleanType."""
+    from pyspark.sql import types as T
+    return T.BooleanType()
+def coerce_cast(target: Any) -> Any:
+    """Explicit cast: the target type is the result."""
+    return target
+def result_type(left: Any, right: Any, op: str) -> Any:
+    """
+    Resolve the output DataType for a binary operation between two typed columns.
+    Parameters
+    ----------
+    left, right : DataType instances
+    op : one of '+', '-', '*', '/', '%', '==', '!=', '<', '<=', '>', '>='
+    Returns
+    -------
+    DataType instance representing the result type.
+    Examples
+    --------
+    >>> from pyspark.sql import types as T
+    >>> result_type(T.IntegerType(), T.DecimalType(10, 2), '+')
+    DecimalType(13,2)
+    >>> result_type(T.LongType(), T.DoubleType(), '*')
+    DoubleType()
+    >>> result_type(T.IntegerType(), T.IntegerType(), '/')
+    DoubleType()
+    """
+    if op in ("+", "-"):
+        return coerce_add(left, right)
+    if op == "*":
+        return coerce_mul(left, right)
+    if op == "/":
+        return coerce_div(left, right)
+    if op == "%":
+        return coerce_mod(left, right)
+    if op in ("==", "!=", "<", "<=", ">", ">="):
+        return coerce_comparison(left, right)
+    raise ValueError(f"Unknown operator: {op!r}")