PyPI - conformare - Versions diffs - 0.1.0__py3-none-any.whl - Mend

conformare 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

conformare/__init__.py +201 -0
conformare/adapters/__init__.py +1 -0
conformare/adapters/io.py +327 -0
conformare/adapters/narwhals.py +152 -0
conformare/adapters/pandas.py +264 -0
conformare/adapters/spark.py +221 -0
conformare/bootstrap.py +186 -0
conformare/core/__init__.py +1 -0
conformare/core/checklist.py +253 -0
conformare/core/context.py +441 -0
conformare/core/diagram.py +110 -0
conformare/core/groups.py +109 -0
conformare/core/lineage.py +164 -0
conformare/core/names.py +137 -0
conformare/core/recording.py +90 -0
conformare/core/report.py +2003 -0
conformare/core/risks.py +210 -0
conformare/core/sensitivity.py +285 -0
conformare/core/steps.py +340 -0
conformare/core/suppress.py +82 -0
conformare/profilers/__init__.py +1 -0
conformare/profilers/backend.py +29 -0
conformare/profilers/backend_narwhals.py +69 -0
conformare/profilers/backend_pandas.py +53 -0
conformare/profilers/backend_spark.py +108 -0
conformare/profilers/base.py +46 -0
conformare/profilers/builtins.py +696 -0
conformare/profilers/conditions.py +42 -0
conformare/profilers/engine.py +232 -0
conformare-0.1.0.dist-info/METADATA +300 -0
conformare-0.1.0.dist-info/RECORD +33 -0
conformare-0.1.0.dist-info/WHEEL +4 -0
conformare-0.1.0.dist-info/licenses/LICENSE +77 -0

conformare/__init__.py ADDED Viewed

@@ -0,0 +1,201 @@
+"""conformare -- capture the authored dataframe pipeline and profile each step.
+One shared lineage + profiling + diagram core behind pluggable interception
+adapters: ``trackNarwhals()`` (future, Narwhals), ``trackSpark()`` (existing
+PySpark, zero code change) and ``trackPandas()`` (native pandas, off by default
+in ``trackAll()``). See the design doc for the full architecture.
+"""
+from __future__ import annotations
+from importlib.metadata import PackageNotFoundError
+from importlib.metadata import version as _pkg_version
+try:
+    # Single source of truth: the version declared in pyproject.toml, read from the
+    # installed package metadata. Conformare follows Semantic Versioning.
+    __version__ = _pkg_version("conformare")
+except PackageNotFoundError:  # running from a source tree that isn't installed
+    __version__ = "0.0.0+unknown"
+from .adapters.narwhals import trackNarwhals
+from .adapters.pandas import trackPandas
+from .adapters.spark import trackSpark
+from .core.context import (
+    describe,
+    describe_process,
+    groups_registry,
+    reset_context,
+    risk,
+)
+from .core.diagram import to_json as _to_json
+from .core.diagram import to_mermaid as _to_mermaid
+from .core.lineage import store
+from .core.checklist import to_risk_checklist as _to_risk_checklist
+from .core.report import build_model
+from .core.report import to_html as _to_html
+from .core.risks import all_risks, catalog_by_category, get_risk, register_risk
+from .core.sensitivity import (
+    classify_column,
+    mark_sensitive,
+    reset_marks,
+    scan_columns,
+    unmark_sensitive,
+)
+from .core.steps import opaque, track_functions, track_step
+from .core.suppress import opaque_module, opaque_modules, set_opaque_modules
+from .profilers.builtins import (
+    columnCount,
+    dataSize,
+    greatExpectations,
+    histogram,
+    iqrOutliers,
+    nullFraction,
+    rowCount,
+    whylogs,
+)
+from .profilers.conditions import contains_columns, min_rows, schema_has
+from .profilers.engine import (
+    force_profile,
+    get_profiles,
+    profile,
+    profile_sources,
+    release_cache,
+    run_profilers,
+    set_profiles,
+)
+# Imported last: bootstrap pulls in report/context/sensitivity/etc., all loaded above.
+from .bootstrap import bootstrap, decorate, doc, documented
+__all__ = [
+    "trackNarwhals",
+    "trackSpark",
+    "trackPandas",
+    "trackAll",
+    "restore",
+    "set_profiles",
+    "get_profiles",
+    "profile",
+    "force_profile",
+    "profile_sources",
+    "release_cache",
+    "run_profilers",
+    "rowCount",
+    "columnCount",
+    "histogram",
+    "nullFraction",
+    "dataSize",
+    "iqrOutliers",
+    "whylogs",
+    "greatExpectations",
+    "contains_columns",
+    "min_rows",
+    "schema_has",
+    "track_step",
+    "track_functions",
+    "opaque",
+    "opaque_module",
+    "opaque_modules",
+    "set_opaque_modules",
+    "bootstrap",
+    "doc",
+    "decorate",
+    "documented",
+    "describe",
+    "describe_process",
+    "risk",
+    "register_risk",
+    "get_risk",
+    "all_risks",
+    "catalog_by_category",
+    "groups_registry",
+    "reset_context",
+    "mark_sensitive",
+    "unmark_sensitive",
+    "classify_column",
+    "scan_columns",
+    "reset_marks",
+    "to_mermaid",
+    "to_json",
+    "to_html",
+    "to_risk_checklist",
+    "build_model",
+    "lineage",
+    "store",
+]
+def trackAll(
+    *, narwhals: bool = True, spark: bool = True, pandas: bool = False, functions: bool = True
+) -> None:
+    """Switch on the selected adapters and automatic function tracking.
+    Each keyword flag gates one toggle, called with ``True`` in the order
+    narwhals, spark, pandas, functions. ``pandas`` (native pandas) defaults to off:
+    it patches ``pd.read_*`` the same way the Narwhals path does, so enable only one
+    of ``narwhals``/``pandas`` to avoid double-hooking reads."""
+    switches = (
+        (narwhals, trackNarwhals),
+        (spark, trackSpark),
+        (pandas, trackPandas),
+        (functions, track_functions),
+    )
+    for wanted, toggle in switches:
+        if wanted:
+            toggle(True)
+def restore() -> None:
+    """Undo every patch and the profile hook, then release frames pinned by
+    force_profile(cache=True). Captured lineage is left intact."""
+    # Same order as the toggles are enabled in trackAll().
+    for toggle in (trackNarwhals, trackSpark, trackPandas, track_functions):
+        toggle(False)
+    release_cache()
+def to_html(path: str | None = None, title: str = "conformare lineage report") -> str:
+    """Render the captured lineage to a self-contained interactive HTML report.
+    Forwards the shared ``store`` plus *path* and *title* to ``core.report.to_html``
+    and returns its result."""
+    html = _to_html(store, path, title)
+    return html
+def lineage():
+    """Return the lineage events held by the shared store (``store.events`` itself,
+    not a copy)."""
+    events = store.events
+    return events
+def to_mermaid(expanded: bool = True) -> str:
+    """Render the captured lineage to a Mermaid flowchart.
+    *expanded* is passed straight through to ``core.diagram.to_mermaid``."""
+    chart = _to_mermaid(store, expanded)
+    return chart
+def to_json() -> dict:
+    """Export the captured lineage (the shared ``store``) as JSON-serialisable data."""
+    payload = _to_json(store)
+    return payload
+def to_risk_checklist(
+    path: str | None = None,
+    *,
+    title: str = "Formal Risk Review Checklist",
+    process: str | None = None,
+    date: str | None = None,
+    reviewers: list[str] | None = None,
+    signoff_rows: int = 3,
+) -> str:
+    """Export the risk register as a formal, sign-off-ready Markdown checklist.
+    A business-aligned governance artefact: conformare fills in each declared risk
+    (severity, where it occurs, declared mitigation, owner, governance concern) and
+    leaves blank columns plus a sign-off block for the governance team to complete.
+    Returns the Markdown; also writes it to *path* if given. ``process`` names the
+    pipeline in the header; ``date`` defaults to today; ``reviewers`` pre-populates
+    the sign-off rows. All options are forwarded unchanged to
+    ``core.checklist.to_risk_checklist`` together with the shared ``store``."""
+    options = {
+        "title": title,
+        "process": process,
+        "date": date,
+        "reviewers": reviewers,
+        "signoff_rows": signoff_rows,
+    }
+    return _to_risk_checklist(store, path, **options)

conformare/adapters/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """conformare interception adapters: narwhals (chokepoint) + spark (in place)."""

conformare/adapters/io.py ADDED Viewed

@@ -0,0 +1,327 @@
+"""Read/write (source/sink) capture.
+Reads aren't exposed through Narwhals, so we hook them directly:
+* **Spark** -- patch ``DataFrameReader`` terminal methods, ``SparkSession.table``
+  and ``SparkSession.sql`` to stamp load provenance onto the resulting frame's
+  source node; patch ``DataFrameWriter`` terminal methods to emit a sink node.
+* **pandas** (the Narwhals path) -- patch ``pd.read_*`` to record the location in
+  the frame's ``attrs``; ``nw.from_native`` reads it back and attaches it to the
+  source node. Narwhals' own ``write_csv``/``write_parquet`` are handled by the
+  Narwhals adapter (they flow through ``TrackedDataFrame``).
+Loads enrich the existing **source** node (``store.sources``); writes get their
+own **sink** node (``store.sinks`` + a ``kind="sink"`` edge).
+"""
+from __future__ import annotations
+import functools
+import os
+import threading
+from ..core.lineage import new_id, node_id_of, store
+from ..core.recording import record_source_profile
+_io = threading.local()  # reentrancy guard so only the outermost IO call records
+def _loc(*args, **kwargs):
+    """Best-effort guess at the location (path / table name) of an IO call.
+    Positional arguments are scanned first, in order: a ``str`` is returned as is,
+    ``bytes`` are decoded as UTF-8 (undecodable bytes replaced), an ``os.PathLike``
+    goes through ``os.fspath`` and a non-empty list/tuple whose first item is a
+    ``str`` is joined with ``", "``. The well-known location keywords are then
+    tried the same way. If nothing matches, the first positional argument is
+    returned unchanged (callers ``str()`` it), or ``"(unknown)"`` when there is none.
+    """
+    def as_location(value):
+        # Normalise one candidate to text; None means "not a location".
+        if isinstance(value, str):
+            return value
+        if isinstance(value, bytes):
+            return value.decode("utf-8", "replace")
+        if isinstance(value, os.PathLike):
+            return os.fspath(value)
+        # list of paths (spark parquet(*paths) / load([...]))
+        if isinstance(value, (list, tuple)) and value and isinstance(value[0], str):
+            return ", ".join(map(str, value))
+        return None
+    # Path lists are considered in argument order, so in
+    # ``load(["a", "b"], "parquet")`` the paths win over the trailing format string.
+    for a in args:
+        found = as_location(a)
+        if found is not None:
+            return found
+    for k in ("path", "tableName", "name", "filepath_or_buffer", "path_or_buf", "file"):
+        found = as_location(kwargs.get(k))
+        if found is not None:
+            return found
+    return args[0] if args else "(unknown)"
+def short_location(loc: str) -> str:
+    """Return the last path segment of *loc*, capped at 40 characters.
+    Both ``/`` and ``\\`` count as separators. Trailing separators are ignored, so a
+    directory-style location such as ``/data/out/`` yields ``out`` instead of an
+    empty label; a location made up only of separators is returned unchanged.
+    """
+    s = str(loc)
+    trimmed = s.rstrip("\\/")
+    if not trimmed:
+        # Nothing but separators (or empty): there is no segment to extract.
+        return s[:40]
+    s = trimmed
+    for sep in ("\\", "/"):
+        if sep in s:
+            s = s.rsplit(sep, 1)[-1]
+    return s[:40]
+def source_name(loc: str) -> str:
+    """Derive a readable table name for a source node from its load location.
+    Takes the short location (last path segment) and drops the text after its final
+    dot unless the segment starts with a dot -- e.g. ``.../customers.csv`` ->
+    ``customers`` -- so source nodes show a name instead of a raw ``<df1>`` id."""
+    segment = short_location(loc)
+    stem = segment
+    if not segment.startswith(".") and "." in segment:
+        stem = segment.rsplit(".", 1)[0]
+    return stem if stem else short_location(loc)
+# --- provenance handoff for the pandas -> from_native path ----------------
+def stamp_read(frame, info: dict) -> None:
+    """Store *info* under ``"_ft_source"`` in ``frame.attrs`` (pandas-native
+    metadata). Any failure -- e.g. the object has no ``attrs`` -- is ignored."""
+    try:
+        metadata = frame.attrs
+        metadata["_ft_source"] = info
+    except Exception:
+        return
+def read_info(frame):
+    """Return the provenance dict left by ``stamp_read`` on *frame*, or ``None``.
+    ``None`` is also returned when *frame* has no dict-valued ``attrs`` or when
+    reading it raises."""
+    try:
+        metadata = getattr(frame, "attrs", None)
+        return metadata.get("_ft_source") if isinstance(metadata, dict) else None
+    except Exception:
+        return None
+# --- shared recording helpers --------------------------------------------
+def record_source(node_id, location, fmt, reader, columns=None):
+    """Attach load provenance to the source node *node_id* in the lineage store.
+    Records *columns* when given, then the location (as ``str``), format and reader.
+    """
+    if columns is not None:
+        store.set_columns(node_id, columns)
+    store.set_source(node_id, location=str(location), format=fmt, reader=reader)
+    # Name the source node after its location when nothing else named it (Spark
+    # reads have no assignment target to capture), so it shows a real name.
+    existing_name = store.names.get(node_id)
+    if existing_name:
+        return
+    store.name(node_id, source_name(location))
+def record_sink(parent_id, location, fmt, writer, backend):
+    """Create a sink node for a write and link it to *parent_id*.
+    A fresh node id is registered as a sink, named after the short location, and
+    connected by a ``write.<fmt>`` edge of ``kind="sink"``. Returns the sink id."""
+    sink_id = new_id()
+    store.set_sink(sink_id, location=str(location), format=fmt, writer=writer)
+    label = short_location(location)
+    store.name(sink_id, label)
+    store.add(
+        op=f"write.{fmt}",
+        backend=backend,
+        parents=[parent_id],
+        child=sink_id,
+        kind="sink",
+    )
+    return sink_id
+# --- pandas reads --------------------------------------------------------
+# pandas reader function name -> format label stamped into the frame's provenance.
+_PD_READERS = {
+    "read_csv": "csv",
+    "read_parquet": "parquet",
+    "read_json": "json",
+    "read_table": "table",
+    "read_sql": "sql",
+    "read_excel": "excel",
+    "read_feather": "feather",
+    "read_orc": "orc",
+}
+# Original ``pd.read_*`` functions, keyed by name, saved while the patch is active.
+_pd_orig: dict = {}
+def _enable_pandas_reads():
+    """Replace the ``pd.read_*`` functions listed in ``_PD_READERS`` with wrappers
+    that stamp the load location, format and reader name onto the result.
+    Does nothing when pandas cannot be imported; readers that are missing from
+    pandas or already patched are skipped. Originals are saved in ``_pd_orig``."""
+    try:
+        import pandas as pd
+    except Exception:
+        return
+    def stamping(original, reader_name, fmt):
+        @functools.wraps(original)
+        def wrapper(*a, **k):
+            frame = original(*a, **k)
+            try:
+                provenance = {
+                    "location": str(_loc(*a, **k)),
+                    "format": fmt,
+                    "reader": f"pd.{reader_name}",
+                }
+                stamp_read(frame, provenance)
+            except Exception:
+                # Provenance is best-effort; never disturb the user's read.
+                pass
+            return frame
+        return wrapper
+    for reader_name, fmt in _PD_READERS.items():
+        original = getattr(pd, reader_name, None)
+        if original is not None and reader_name not in _pd_orig:
+            _pd_orig[reader_name] = original
+            setattr(pd, reader_name, stamping(original, reader_name, fmt))
+def _disable_pandas_reads():
+    """Put back every saved ``pd.read_*`` original and empty ``_pd_orig``.
+    Does nothing when pandas cannot be imported."""
+    try:
+        import pandas as pd
+    except Exception:
+        return
+    for reader_name in _pd_orig:
+        setattr(pd, reader_name, _pd_orig[reader_name])
+    _pd_orig.clear()
+# --- spark reads / writes / session --------------------------------------
+_spark_orig: dict = {}  # (class, method) -> original
+# Reader methods wrapped on each reader class to record a source.
+_READER_METHODS = ["load", "parquet", "csv", "json", "orc", "text", "table"]
+# Writer methods wrapped on each writer class to record a sink.
+_WRITER_METHODS = ["save", "saveAsTable", "parquet", "csv", "json", "orc", "insertInto"]
+def _reader_classes():
+    """Return the ``DataFrameReader`` classes that can be imported (classic first,
+    then Spark Connect); classes whose import fails are left out."""
+    found = []
+    try:
+        from pyspark.sql.readwriter import DataFrameReader
+    except Exception:
+        pass
+    else:
+        found.append(DataFrameReader)
+    try:
+        from pyspark.sql.connect.readwriter import DataFrameReader as CR
+    except Exception:
+        pass
+    else:
+        found.append(CR)
+    return found
+def _writer_classes():
+    """Return the ``DataFrameWriter`` classes that can be imported (classic first,
+    then Spark Connect); classes whose import fails are left out."""
+    found = []
+    try:
+        from pyspark.sql.readwriter import DataFrameWriter
+    except Exception:
+        pass
+    else:
+        found.append(DataFrameWriter)
+    try:
+        from pyspark.sql.connect.readwriter import DataFrameWriter as CW
+    except Exception:
+        pass
+    else:
+        found.append(CW)
+    return found
+def _session_classes():
+    """Return the ``SparkSession`` classes that can be imported (classic first,
+    then Spark Connect); classes whose import fails are left out."""
+    found = []
+    try:
+        from pyspark.sql import SparkSession
+    except Exception:
+        pass
+    else:
+        found.append(SparkSession)
+    try:
+        from pyspark.sql.connect.session import SparkSession as CS
+    except Exception:
+        pass
+    else:
+        found.append(CS)
+    return found
+def _is_spark_frame(o):
+    """Ask the Spark adapter's backend whether *o* is a frame."""
+    # Imported at call time -- presumably to avoid a circular import with the
+    # spark adapter; confirm before hoisting.
+    from .spark import _BACKEND as spark_backend
+    return spark_backend.is_frame(o)
+def _make_reader(method, original):
+    """Build the replacement for reader method *method* that records a source.
+    The wrapper calls *original*; when the result is a Spark frame it registers the
+    frame's node as a source (location from the call arguments, *method* as the
+    format, ``"spark.read"`` as the reader, plus the frame's columns) and records a
+    source profile. The call's result is returned unchanged."""
+    @functools.wraps(original)
+    def wrapper(self, *args, **kwargs):
+        # Already inside a tracked IO call on this thread: just delegate, so only
+        # the outermost call records.
+        if getattr(_io, "depth", 0):
+            return original(self, *args, **kwargs)
+        _io.depth = 1
+        try:
+            result = original(self, *args, **kwargs)
+            if _is_spark_frame(result):
+                nid = node_id_of(result, create=True)
+                record_source(
+                    nid, _loc(*args, **kwargs), method, "spark.read", columns=list(result.columns)
+                )
+                from .spark import _BACKEND
+                record_source_profile(nid, result, _BACKEND)
+            return result
+        finally:
+            # Always clear the guard, including when the read or the recording raised.
+            _io.depth = 0
+    return wrapper
+def _make_session_source(method, original, reader, fmt):
+    """Build the replacement for session method *method* that records a source.
+    Same shape as the reader wrapper, but the reader label and format are supplied
+    by the caller, and for ``method == "sql"`` the location is the fixed text
+    ``"SQL query"`` instead of being derived from the call arguments."""
+    @functools.wraps(original)
+    def wrapper(self, *args, **kwargs):
+        # Nested inside another tracked IO call on this thread: delegate only.
+        if getattr(_io, "depth", 0):
+            return original(self, *args, **kwargs)
+        _io.depth = 1
+        try:
+            result = original(self, *args, **kwargs)
+            if _is_spark_frame(result):
+                nid = node_id_of(result, create=True)
+                loc = _loc(*args, **kwargs) if method != "sql" else "SQL query"
+                record_source(nid, loc, fmt, reader, columns=list(result.columns))
+                from .spark import _BACKEND
+                record_source_profile(nid, result, _BACKEND)
+            return result
+        finally:
+            # Always clear the guard, including when the call or the recording raised.
+            _io.depth = 0
+    return wrapper
+def _make_writer(method, original):
+    """Build the replacement for writer method *method* that records a sink.
+    The wrapper performs the write first, then -- if the writer exposes the frame it
+    writes as ``_df`` -- records a sink node whose parent is that frame's node, with
+    *method* as the format. The write's result is returned unchanged."""
+    @functools.wraps(original)
+    def wrapper(self, *args, **kwargs):
+        # Nested inside another tracked IO call on this thread: delegate only.
+        if getattr(_io, "depth", 0):
+            return original(self, *args, **kwargs)
+        _io.depth = 1
+        try:
+            result = original(self, *args, **kwargs)
+            # NOTE(review): relies on the writer's private ``_df`` attribute; when it
+            # is absent no sink is recorded -- confirm every patched writer class has it.
+            df = getattr(self, "_df", None)
+            if df is not None:
+                parent = node_id_of(df, create=True)
+                record_sink(parent, _loc(*args, **kwargs), method, "spark", "spark")
+            return result
+        finally:
+            # Always clear the guard, including when the write or the recording raised.
+            _io.depth = 0
+    return wrapper
+def _patch(cls, method, factory):
+    """Replace ``cls.method`` with ``factory(method, original)`` once.
+    Skipped when the attribute is missing / not callable or already recorded in
+    ``_spark_orig``; otherwise the original is saved there for later restore."""
+    key = (cls, method)
+    current = getattr(cls, method, None)
+    if not callable(current) or key in _spark_orig:
+        return
+    _spark_orig[key] = current
+    setattr(cls, method, factory(method, current))
+def _enable_spark_io():
+    """Patch Spark reader, writer and session entry points to record sources/sinks.
+    Does nothing if any patch is already recorded in ``_spark_orig``."""
+    if _spark_orig:
+        return
+    for reader_cls in _reader_classes():
+        for name in _READER_METHODS:
+            _patch(reader_cls, name, _make_reader)
+    for writer_cls in _writer_classes():
+        for name in _WRITER_METHODS:
+            _patch(writer_cls, name, _make_writer)
+    # Session-level sources: (method name, format label).
+    session_sources = (("table", "table"), ("sql", "sql"))
+    for session_cls in _session_classes():
+        for name, fmt in session_sources:
+            current = getattr(session_cls, name, None)
+            if not callable(current) or (session_cls, name) in _spark_orig:
+                continue
+            _spark_orig[(session_cls, name)] = current
+            setattr(session_cls, name, _make_session_source(name, current, f"spark.{name}", fmt))
+def _disable_spark_io():
+    """Reinstall every original saved in ``_spark_orig`` and empty the registry."""
+    for key, original in _spark_orig.items():
+        owner, method_name = key
+        setattr(owner, method_name, original)
+    _spark_orig.clear()
+# --- public toggles (called by the adapters) -----------------------------
+def enable_pandas_io():
+    """Public toggle: patch the pandas read functions."""
+    _enable_pandas_reads()
+def disable_pandas_io():
+    """Public toggle: restore the pandas read functions."""
+    _disable_pandas_reads()
+def enable_spark_io():
+    """Public toggle: patch the Spark reader/writer/session methods."""
+    _enable_spark_io()
+def disable_spark_io():
+    """Public toggle: restore the Spark reader/writer/session methods."""
+    _disable_spark_io()

conformare/adapters/narwhals.py ADDED Viewed

@@ -0,0 +1,152 @@
+"""Narwhals adapter: ``trackNarwhals()`` + ``TrackedDataFrame``.
+Patches the single ingestion chokepoint ``nw.from_native`` so every frame is
+wrapped. The wrapper intercepts all methods generically via ``__getattr__``,
+records each transformation as a ``LineageEvent``, runs profilers, and re-wraps
+frame results so the chain stays tracked. See design Section 6.
+"""
+from __future__ import annotations
+import narwhals as nw
+from ..core.groups import TrackedGroupBy
+from ..core.lineage import new_id, store
+from ..core.names import name_at_caller, op_logic_at_caller
+from ..core.recording import record_op, record_source_profile
+from ..core.steps import current_step
+from ..core.suppress import suppressed
+from ..profilers.backend_narwhals import NarwhalsBackend
+from . import io as _io
+_BACKEND = NarwhalsBackend()
+# Narwhals frame methods that write to a sink (return None / a string, not a frame).
+WRITE_METHODS = {"write_csv", "write_parquet"}
+def _is_frame(o):
+    """Ask the Narwhals backend whether *o* is a frame."""
+    verdict = _BACKEND.is_frame(o)
+    return verdict
+def _unwrap(o):
+    """Return the wrapped frame of a ``TrackedDataFrame``; anything else as is."""
+    if isinstance(o, TrackedDataFrame):
+        return o._df
+    return o
+def _capture_columns(node_id, frame):
+    """Record *frame*'s columns for *node_id* in the store; failures are ignored."""
+    try:
+        column_names = _BACKEND.columns(frame)
+        store.set_columns(node_id, column_names)
+    except Exception:
+        return
+class TrackedDataFrame:
+    """Wrapper that pairs a frame (``_df``) with a lineage node id (``node_id``).
+    Attribute access falls through to the wrapped frame; callable attributes are
+    wrapped so that each call can record lineage (sink, group-by, derived frame or
+    ``to_native`` boundary) and frame results come back wrapped again."""
+    def __init__(self, df, node_id=None, name=None):
+        self._df = df
+        # Reuse the given id, otherwise mint a fresh one.
+        self.node_id = node_id or new_id()
+        # Inside an opaque() block, don't register this frame -- the chain still
+        # works (the wrapper holds the frame), but it never reaches the lineage.
+        if not suppressed():
+            store.name(self.node_id, name)
+            _capture_columns(self.node_id, df)
+    def _wrap_result(self, result, name):
+        """Wrap *result* in a new tracked frame; return ``(wrapper, node_id)``.
+        Passed to ``TrackedGroupBy`` as its wrapping callback."""
+        child = TrackedDataFrame(result, name=name)
+        return child, child.node_id
+    def __getattr__(self, attr_name):
+        """Forward attribute lookups to the wrapped frame.
+        Non-callable attributes are returned directly; callables are returned as a
+        ``tracked`` closure that unwraps tracked arguments, calls the real method
+        and records what happened."""
+        # Never resolve the wrapper's own fields through the wrapped frame --
+        # presumably guards against recursion when they are not set yet; confirm.
+        if attr_name in ("_df", "node_id"):
+            raise AttributeError(attr_name)
+        attr = getattr(self._df, attr_name)
+        if not callable(attr):
+            return attr
+        def tracked(*args, **kwargs):
+            # NOTE: both caller lookups depend on the stack layout below; do not
+            # insert extra call frames between the user code and these calls.
+            name = name_at_caller()  # 0:name_at_caller 1:tracked 2:user
+            logic = op_logic_at_caller()
+            # Only top-level TrackedDataFrame arguments are treated as extra parents
+            # and unwrapped; frames nested inside containers are left untouched.
+            frame_args = [a for a in (*args, *kwargs.values()) if isinstance(a, TrackedDataFrame)]
+            u_args = tuple(_unwrap(a) for a in args)
+            u_kwargs = {k: _unwrap(v) for k, v in kwargs.items()}
+            before = self._df
+            # write_csv / write_parquet -> a sink node, not a tracked frame.
+            if attr_name in WRITE_METHODS:
+                result = attr(*u_args, **u_kwargs)
+                _io.record_sink(
+                    self.node_id,
+                    _io._loc(*u_args, **u_kwargs),
+                    attr_name.replace("write_", ""),
+                    "narwhals",
+                    "narwhals",
+                )
+                return result
+            result = attr(*u_args, **u_kwargs)
+            # group_by returns a GroupBy/LazyGroupBy, not a frame -- Section 8.3.
+            if _BACKEND.is_group(result):
+                return TrackedGroupBy(
+                    result,
+                    self.node_id,
+                    _BACKEND,
+                    self._wrap_result,
+                    group_keys=(logic or {}).get("args", []),
+                )
+            if _is_frame(result):
+                child = TrackedDataFrame(result, name=name)
+                # This frame comes first, then any tracked frames passed as arguments.
+                parents = [(self.node_id, self._df)] + [(a.node_id, a._df) for a in frame_args]
+                record_op(
+                    attr_name,
+                    _BACKEND,
+                    parents,
+                    child.node_id,
+                    before,
+                    result,
+                    name=name,
+                    logic=logic,
+                )
+                return child
+            # to_native drops to the bare frame -- record an explicit boundary
+            # (decision 3) and hand the native frame back untracked.
+            if attr_name == "to_native":
+                store.add(
+                    op="to_native",
+                    backend="narwhals",
+                    parents=[self.node_id],
+                    child=None,
+                    kind="sink",
+                    step=current_step(),
+                )
+            # Anything else (scalars, series, ...) is returned as the method gave it.
+            return result
+        return tracked
+    def __repr__(self):
+        return f"TrackedDataFrame({store.label(self.node_id)})"
+_orig_from_native = nw.from_native
+_patched = False
+def trackNarwhals(enable: bool = True) -> None:
+    """Patch (or restore) ``nw.from_native`` to mint tracked frames, and hook the
+    pandas read functions so source frames carry their load location.
+    Idempotent in both directions: enabling twice or disabling while not patched
+    does nothing."""
+    global _patched
+    if enable and not _patched:
+        def from_native(*a, **k):
+            # name_at_caller() must be called from this frame (stack-depth based).
+            tdf = TrackedDataFrame(_orig_from_native(*a, **k), name=name_at_caller())
+            # The native frame may arrive positionally or as ``native_object=...``;
+            # look for read provenance in both places.
+            native = a[0] if a else k.get("native_object")
+            info = _io.read_info(native) if native is not None else None
+            if info:
+                store.set_source(tdf.node_id, **info)
+                record_source_profile(tdf.node_id, tdf._df, _BACKEND)
+            return tdf
+        nw.from_native = from_native
+        _io.enable_pandas_io()
+        _patched = True
+    elif not enable and _patched:
+        nw.from_native = _orig_from_native
+        _io.disable_pandas_io()
+        _patched = False