conformare 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
conformare/__init__.py ADDED
@@ -0,0 +1,201 @@
1
+ """conformare -- capture the authored dataframe pipeline and profile each step.
2
+
3
+ One shared lineage + profiling + diagram core, two pluggable interception
4
+ adapters: ``trackNarwhals()`` (future, Narwhals) and ``trackSpark()`` (existing
5
+ PySpark, zero code change). See the design doc for the full architecture.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from importlib.metadata import PackageNotFoundError
11
+ from importlib.metadata import version as _pkg_version
12
+
13
+ try:
14
+ # Single source of truth: the version declared in pyproject.toml, read from the
15
+ # installed package metadata. Conformare follows Semantic Versioning.
16
+ __version__ = _pkg_version("conformare")
17
+ except PackageNotFoundError: # running from a source tree that isn't installed
18
+ __version__ = "0.0.0+unknown"
19
+
20
+ from .adapters.narwhals import trackNarwhals
21
+ from .adapters.pandas import trackPandas
22
+ from .adapters.spark import trackSpark
23
+ from .core.context import (
24
+ describe,
25
+ describe_process,
26
+ groups_registry,
27
+ reset_context,
28
+ risk,
29
+ )
30
+ from .core.diagram import to_json as _to_json
31
+ from .core.diagram import to_mermaid as _to_mermaid
32
+ from .core.lineage import store
33
+ from .core.checklist import to_risk_checklist as _to_risk_checklist
34
+ from .core.report import build_model
35
+ from .core.report import to_html as _to_html
36
+ from .core.risks import all_risks, catalog_by_category, get_risk, register_risk
37
+ from .core.sensitivity import (
38
+ classify_column,
39
+ mark_sensitive,
40
+ reset_marks,
41
+ scan_columns,
42
+ unmark_sensitive,
43
+ )
44
+ from .core.steps import opaque, track_functions, track_step
45
+ from .core.suppress import opaque_module, opaque_modules, set_opaque_modules
46
+ from .profilers.builtins import (
47
+ columnCount,
48
+ dataSize,
49
+ greatExpectations,
50
+ histogram,
51
+ iqrOutliers,
52
+ nullFraction,
53
+ rowCount,
54
+ whylogs,
55
+ )
56
+ from .profilers.conditions import contains_columns, min_rows, schema_has
57
+ from .profilers.engine import (
58
+ force_profile,
59
+ get_profiles,
60
+ profile,
61
+ profile_sources,
62
+ release_cache,
63
+ run_profilers,
64
+ set_profiles,
65
+ )
66
+
67
+ # Imported last: bootstrap pulls in report/context/sensitivity/etc., all loaded above.
68
+ from .bootstrap import bootstrap, decorate, doc, documented
69
+
70
# Public API of the package. Every name listed here is either imported above or
# defined further down in this module.
__all__ = [
    # Adapter toggles, plus the module-level enable-all / restore helpers.
    "trackNarwhals",
    "trackSpark",
    "trackPandas",
    "trackAll",
    "restore",
    # Profiling engine (from .profilers.engine).
    "set_profiles",
    "get_profiles",
    "profile",
    "force_profile",
    "profile_sources",
    "release_cache",
    "run_profilers",
    # Built-in profilers (from .profilers.builtins).
    "rowCount",
    "columnCount",
    "histogram",
    "nullFraction",
    "dataSize",
    "iqrOutliers",
    "whylogs",
    "greatExpectations",
    # Profiler conditions (from .profilers.conditions).
    "contains_columns",
    "min_rows",
    "schema_has",
    # Step tracking and opaque regions (from .core.steps / .core.suppress).
    "track_step",
    "track_functions",
    "opaque",
    "opaque_module",
    "opaque_modules",
    "set_opaque_modules",
    # Bootstrap helpers (from .bootstrap).
    "bootstrap",
    "doc",
    "decorate",
    "documented",
    # Context and risk catalogue (from .core.context / .core.risks).
    "describe",
    "describe_process",
    "risk",
    "register_risk",
    "get_risk",
    "all_risks",
    "catalog_by_category",
    "groups_registry",
    "reset_context",
    # Column sensitivity (from .core.sensitivity).
    "mark_sensitive",
    "unmark_sensitive",
    "classify_column",
    "scan_columns",
    "reset_marks",
    # Exports / reports over the shared lineage store.
    "to_mermaid",
    "to_json",
    "to_html",
    "to_risk_checklist",
    "build_model",
    "lineage",
    "store",
]
126
+
127
+
128
def trackAll(
    *, narwhals: bool = True, spark: bool = True, pandas: bool = False, functions: bool = True
) -> None:
    """Switch on the selected adapters and automatic function tracking.

    Meant for codebases that mix backends. Each keyword flag enables its tracker
    when truthy; a false flag is simply skipped (nothing is switched off here).

    ``pandas`` (native pandas) defaults to off: it patches ``pd.read_*`` the same
    way the Narwhals path does, so enable only one of ``narwhals``/``pandas`` to
    avoid double-hooking reads.
    """
    # Order matters only in that it mirrors the documented flag order.
    requested = (
        (narwhals, trackNarwhals),
        (spark, trackSpark),
        (pandas, trackPandas),
        (functions, track_functions),
    )
    for wanted, toggle in requested:
        if wanted:
            toggle(True)
144
+
145
+
146
def restore() -> None:
    """Undo every tracker and drop cached frames, keeping the lineage.

    Calls each adapter/function-tracking toggle with ``False`` (removing the
    patches and the profile hook) and then releases any frames pinned by
    ``force_profile(cache=True)``. The captured lineage is left intact.
    """
    for toggle in (trackNarwhals, trackSpark, trackPandas, track_functions):
        toggle(False)
    release_cache()
154
+
155
+
156
def to_html(path: str | None = None, title: str = "conformare lineage report") -> str:
    """Render the shared lineage store as a self-contained interactive HTML report.

    ``path`` and ``title`` are forwarded unchanged to the report renderer, whose
    result is returned.
    """
    html = _to_html(store, path, title)
    return html
159
+
160
+
161
def lineage():
    """Return the lineage events captured so far (``store.events``)."""
    captured = store.events
    return captured
164
+
165
+
166
def to_mermaid(expanded: bool = True) -> str:
    """Render the shared lineage store as a Mermaid flowchart.

    ``expanded`` is passed straight through to the diagram renderer.
    """
    chart = _to_mermaid(store, expanded)
    return chart
169
+
170
+
171
def to_json() -> dict:
    """Export the shared lineage store as JSON-serialisable data."""
    payload = _to_json(store)
    return payload
174
+
175
+
176
def to_risk_checklist(
    path: str | None = None,
    *,
    title: str = "Formal Risk Review Checklist",
    process: str | None = None,
    date: str | None = None,
    reviewers: list[str] | None = None,
    signoff_rows: int = 3,
) -> str:
    """Export the risk register as a formal, sign-off-ready Markdown checklist.

    A business-aligned governance artefact: conformare fills in each declared
    risk (severity, where it occurs, declared mitigation, owner, governance
    concern) and leaves blank columns plus a sign-off block for the governance
    team to complete.

    Returns the Markdown; it is also written to *path* when one is given.
    ``process`` names the pipeline in the header, ``date`` defaults to today and
    ``reviewers`` pre-populates the sign-off rows.
    """
    # Every keyword-only option is forwarded under the same name.
    options = {
        "title": title,
        "process": process,
        "date": date,
        "reviewers": reviewers,
        "signoff_rows": signoff_rows,
    }
    return _to_risk_checklist(store, path, **options)
@@ -0,0 +1 @@
1
+ """conformare interception adapters: narwhals (chokepoint) + spark (in place)."""
@@ -0,0 +1,327 @@
1
+ """Read/write (source/sink) capture.
2
+
3
+ Reads aren't exposed through Narwhals, so we hook them directly:
4
+
5
+ * **Spark** -- patch ``DataFrameReader`` terminal methods, ``SparkSession.table``
6
+ and ``SparkSession.sql`` to stamp load provenance onto the resulting frame's
7
+ source node; patch ``DataFrameWriter`` terminal methods to emit a sink node.
8
+ * **pandas** (the Narwhals path) -- patch ``pd.read_*`` to record the location in
9
+ the frame's ``attrs``; ``nw.from_native`` reads it back and attaches it to the
10
+ source node. Narwhals' own ``write_csv``/``write_parquet`` are handled by the
11
+ Narwhals adapter (they flow through ``TrackedDataFrame``).
12
+
13
+ Loads enrich the existing **source** node (``store.sources``); writes get their
14
+ own **sink** node (``store.sinks`` + a ``kind="sink"`` edge).
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import functools
20
+ import os
21
+ import threading
22
+
23
+ from ..core.lineage import new_id, node_id_of, store
24
+ from ..core.recording import record_source_profile
25
+
26
+ _io = threading.local() # reentrancy guard so only the outermost IO call records
27
+
28
+
29
+ def _loc(*args, **kwargs):
30
+ for a in args:
31
+ if isinstance(a, (str, bytes)):
32
+ return a if isinstance(a, str) else a.decode("utf-8", "replace")
33
+ if isinstance(a, os.PathLike):
34
+ return os.fspath(a)
35
+ for k in ("path", "tableName", "name", "filepath_or_buffer", "path_or_buf", "file"):
36
+ v = kwargs.get(k)
37
+ if isinstance(v, str):
38
+ return v
39
+ if isinstance(v, os.PathLike):
40
+ return os.fspath(v)
41
+ # list of paths (spark parquet(*paths))
42
+ for a in args:
43
+ if isinstance(a, (list, tuple)) and a and isinstance(a[0], str):
44
+ return ", ".join(map(str, a))
45
+ return args[0] if args else "(unknown)"
46
+
47
+
48
def short_location(loc: str) -> str:
    """Return the last path segment of *loc*, truncated to 40 characters.

    Both ``/`` and ``\\`` count as separators. Trailing separators are ignored so
    that directory-style locations such as ``s3://bucket/table/`` yield
    ``table`` instead of an empty string (which would leave the node unnamed).

    Args:
        loc: A location; it is converted with ``str()`` first.

    Returns:
        The text after the last separator, at most 40 characters long.
    """
    text = str(loc)
    # Drop trailing separators, but keep the original text when it consists of
    # separators only (e.g. "/") so the result matches the previous behaviour.
    text = text.rstrip("\\/") or text
    # rfind() returns -1 when a separator is absent, so the slice below then
    # keeps the whole string.
    cut = max(text.rfind("\\"), text.rfind("/"))
    return text[cut + 1 :][:40]
54
+
55
+
56
def source_name(loc: str) -> str:
    """Derive a readable table name for a source node from its load location.

    Takes the short location (last path segment) and strips the file extension,
    e.g. ``.../customers.csv`` -> ``customers``, so source nodes show a name
    instead of a raw ``<df1>`` id. Segments that start with a dot, or contain
    none, are kept as they are.
    """
    tail = short_location(loc)
    if tail.startswith(".") or "." not in tail:
        stem = tail
    else:
        stem = tail.rsplit(".", 1)[0]
    return stem if stem else short_location(loc)
64
+
65
+
66
+ # --- provenance handoff for the pandas -> from_native path ----------------
67
def stamp_read(frame, info: dict) -> None:
    """Attach read provenance *info* to *frame* under ``attrs["_ft_source"]``.

    Best effort: any failure (for example a frame without ``attrs``) is ignored.
    """
    try:
        metadata = frame.attrs  # pandas-native metadata
        metadata["_ft_source"] = info
    except Exception:
        return
72
+
73
+
74
def read_info(frame):
    """Return the provenance stored by ``stamp_read`` on *frame*, else ``None``.

    ``None`` is also returned when *frame* has no dict-valued ``attrs`` or when
    reading it raises.
    """
    try:
        metadata = getattr(frame, "attrs", None)
        return metadata.get("_ft_source") if isinstance(metadata, dict) else None
    except Exception:
        return None
82
+
83
+
84
+ # --- shared recording helpers --------------------------------------------
85
def record_source(node_id, location, fmt, reader, columns=None):
    """Record load provenance for the source node *node_id* in the store.

    Stores *columns* when given, then the location (stringified), format and
    reader. If the node has no name yet, it is named after the location.
    """
    if columns is not None:
        store.set_columns(node_id, columns)
    store.set_source(node_id, location=str(location), format=fmt, reader=reader)
    # Name the source node after its location when nothing else named it (Spark
    # reads have no assignment target to capture), so it shows a real name.
    if store.names.get(node_id):
        return
    store.name(node_id, source_name(location))
93
+
94
+
95
def record_sink(parent_id, location, fmt, writer, backend):
    """Create a sink node for a write of *parent_id* and return the sink's id.

    The sink gets a fresh id, its location/format/writer, a name taken from the
    short location, and a ``kind="sink"`` edge from *parent_id* whose op is
    ``write.<fmt>``.
    """
    sink_id = new_id()
    target = str(location)
    store.set_sink(sink_id, location=target, format=fmt, writer=writer)
    label = short_location(location)
    store.name(sink_id, label)
    store.add(
        op=f"write.{fmt}",
        backend=backend,
        parents=[parent_id],
        child=sink_id,
        kind="sink",
    )
    return sink_id
101
+
102
+
103
+ # --- pandas reads --------------------------------------------------------
104
+ _PD_READERS = {
105
+ "read_csv": "csv",
106
+ "read_parquet": "parquet",
107
+ "read_json": "json",
108
+ "read_table": "table",
109
+ "read_sql": "sql",
110
+ "read_excel": "excel",
111
+ "read_feather": "feather",
112
+ "read_orc": "orc",
113
+ }
114
+ _pd_orig: dict = {}
115
+
116
+
117
def _enable_pandas_reads():
    """Patch the ``pd.read_*`` functions listed in ``_PD_READERS``.

    Each patched reader calls the original and then stamps the resulting frame
    with its location, format and reader name. Originals are remembered in
    ``_pd_orig``; readers that are missing or already patched are skipped. Does
    nothing when pandas cannot be imported.
    """
    try:
        import pandas as pd
    except Exception:
        return

    def instrument(reader, reader_name, fmt_label):
        @functools.wraps(reader)
        def wrapper(*args, **kwargs):
            frame = reader(*args, **kwargs)
            # Provenance capture must never break the user's read.
            try:
                provenance = {
                    "location": str(_loc(*args, **kwargs)),
                    "format": fmt_label,
                    "reader": f"pd.{reader_name}",
                }
                stamp_read(frame, provenance)
            except Exception:
                pass
            return frame

        return wrapper

    for reader_name, fmt_label in _PD_READERS.items():
        reader = getattr(pd, reader_name, None)
        if reader is None or reader_name in _pd_orig:
            continue
        _pd_orig[reader_name] = reader
        setattr(pd, reader_name, instrument(reader, reader_name, fmt_label))
144
+
145
+
146
def _disable_pandas_reads():
    """Put back every pandas reader saved in ``_pd_orig`` and empty the registry.

    Does nothing when pandas cannot be imported.
    """
    try:
        import pandas as pd
    except Exception:
        return
    for reader_name in list(_pd_orig):
        setattr(pd, reader_name, _pd_orig[reader_name])
    _pd_orig.clear()
154
+
155
+
156
+ # --- spark reads / writes / session --------------------------------------
157
+ _spark_orig: dict = {} # (class, method) -> original
158
+ _READER_METHODS = ["load", "parquet", "csv", "json", "orc", "text", "table"]
159
+ _WRITER_METHODS = ["save", "saveAsTable", "parquet", "csv", "json", "orc", "insertInto"]
160
+
161
+
162
+ def _reader_classes():
163
+ out = []
164
+ try:
165
+ from pyspark.sql.readwriter import DataFrameReader
166
+
167
+ out.append(DataFrameReader)
168
+ except Exception:
169
+ pass
170
+ try:
171
+ from pyspark.sql.connect.readwriter import DataFrameReader as CR
172
+
173
+ out.append(CR)
174
+ except Exception:
175
+ pass
176
+ return out
177
+
178
+
179
+ def _writer_classes():
180
+ out = []
181
+ try:
182
+ from pyspark.sql.readwriter import DataFrameWriter
183
+
184
+ out.append(DataFrameWriter)
185
+ except Exception:
186
+ pass
187
+ try:
188
+ from pyspark.sql.connect.readwriter import DataFrameWriter as CW
189
+
190
+ out.append(CW)
191
+ except Exception:
192
+ pass
193
+ return out
194
+
195
+
196
+ def _session_classes():
197
+ out = []
198
+ try:
199
+ from pyspark.sql import SparkSession
200
+
201
+ out.append(SparkSession)
202
+ except Exception:
203
+ pass
204
+ try:
205
+ from pyspark.sql.connect.session import SparkSession as CS
206
+
207
+ out.append(CS)
208
+ except Exception:
209
+ pass
210
+ return out
211
+
212
+
213
def _is_spark_frame(o):
    """Ask the Spark adapter's backend whether *o* is a frame.

    The backend is imported at call time rather than at module import.
    """
    from .spark import _BACKEND as spark_backend

    return spark_backend.is_frame(o)
217
+
218
+
219
def _make_reader(method, original):
    """Build a replacement for reader method *method* that records a source.

    The wrapper calls *original*; when the result is a Spark frame it records the
    location, format (*method*), reader ``"spark.read"`` and columns on the
    frame's node, then records a source profile. The thread-local ``_io.depth``
    flag makes nested IO calls pass straight through so only the outermost call
    records.
    """

    @functools.wraps(original)
    def wrapper(self, *args, **kwargs):
        already_inside_io = getattr(_io, "depth", 0)
        if already_inside_io:
            return original(self, *args, **kwargs)
        _io.depth = 1
        try:
            frame = original(self, *args, **kwargs)
            if not _is_spark_frame(frame):
                return frame
            node = node_id_of(frame, create=True)
            record_source(
                node,
                _loc(*args, **kwargs),
                method,
                "spark.read",
                columns=list(frame.columns),
            )
            from .spark import _BACKEND

            record_source_profile(node, frame, _BACKEND)
            return frame
        finally:
            # Always drop the guard, even when the read or the recording raised.
            _io.depth = 0

    return wrapper
240
+
241
+
242
def _make_session_source(method, original, reader, fmt):
    """Build a replacement for session method *method* that records a source.

    Works like the reader wrapper, but the reader label and format are supplied
    by the caller, and for ``method == "sql"`` the location is the fixed text
    ``"SQL query"`` instead of one derived from the arguments. Nested IO calls
    (``_io.depth`` set) pass straight through.
    """

    @functools.wraps(original)
    def wrapper(self, *args, **kwargs):
        already_inside_io = getattr(_io, "depth", 0)
        if already_inside_io:
            return original(self, *args, **kwargs)
        _io.depth = 1
        try:
            frame = original(self, *args, **kwargs)
            if not _is_spark_frame(frame):
                return frame
            node = node_id_of(frame, create=True)
            if method == "sql":
                where = "SQL query"
            else:
                where = _loc(*args, **kwargs)
            record_source(node, where, fmt, reader, columns=list(frame.columns))
            from .spark import _BACKEND

            record_source_profile(node, frame, _BACKEND)
            return frame
        finally:
            # Always drop the guard, even when the call or the recording raised.
            _io.depth = 0

    return wrapper
262
+
263
+
264
def _make_writer(method, original):
    """Build a replacement for writer method *method* that records a sink.

    The wrapper performs the write first. Afterwards, if the writer exposes a
    non-``None`` ``_df`` attribute, a sink node is recorded for it with format
    *method*, writer ``"spark"`` and backend ``"spark"``. Nested IO calls
    (``_io.depth`` set) pass straight through.
    """

    @functools.wraps(original)
    def wrapper(self, *args, **kwargs):
        already_inside_io = getattr(_io, "depth", 0)
        if already_inside_io:
            return original(self, *args, **kwargs)
        _io.depth = 1
        try:
            outcome = original(self, *args, **kwargs)
            written = getattr(self, "_df", None)
            if written is None:
                return outcome
            parent = node_id_of(written, create=True)
            record_sink(parent, _loc(*args, **kwargs), method, "spark", "spark")
            return outcome
        finally:
            # Always drop the guard, even when the write or the recording raised.
            _io.depth = 0

    return wrapper
281
+
282
+
283
def _patch(cls, method, factory):
    """Replace ``cls.<method>`` with ``factory(method, original)``, once.

    Nothing happens when the attribute is missing or not callable, or when this
    (class, method) pair is already registered in ``_spark_orig``. The original
    is saved there before the attribute is replaced.
    """
    key = (cls, method)
    current = getattr(cls, method, None)
    if not callable(current) or key in _spark_orig:
        return
    _spark_orig[key] = current
    setattr(cls, method, factory(method, current))
288
+
289
+
290
def _enable_spark_io():
    """Install the Spark reader, writer and session wrappers.

    Returns immediately when ``_spark_orig`` already holds patched methods.
    Otherwise patches the reader methods, then the writer methods, then
    ``table``/``sql`` on every available session class.
    """
    if _spark_orig:
        return

    for reader_cls in _reader_classes():
        for name in _READER_METHODS:
            _patch(reader_cls, name, _make_reader)

    for writer_cls in _writer_classes():
        for name in _WRITER_METHODS:
            _patch(writer_cls, name, _make_writer)

    for session_cls in _session_classes():
        for name, fmt in (("table", "table"), ("sql", "sql")):
            # Adapt the four-argument session factory to the two-argument
            # factory shape _patch expects; fmt is bound as a default so each
            # lambda keeps its own value.
            _patch(
                session_cls,
                name,
                lambda method, orig, fmt=fmt: _make_session_source(
                    method, orig, f"spark.{method}", fmt
                ),
            )
305
+
306
+
307
def _disable_spark_io():
    """Restore every method saved in ``_spark_orig`` and empty the registry."""
    for key in list(_spark_orig):
        owner, method_name = key
        setattr(owner, method_name, _spark_orig[key])
    _spark_orig.clear()
311
+
312
+
313
+ # --- public toggles (called by the adapters) -----------------------------
314
def enable_pandas_io():
    """Public toggle: install the pandas read hooks."""
    return _enable_pandas_reads()
316
+
317
+
318
def disable_pandas_io():
    """Public toggle: remove the pandas read hooks."""
    return _disable_pandas_reads()
320
+
321
+
322
def enable_spark_io():
    """Public toggle: install the Spark read/write/session hooks."""
    return _enable_spark_io()
324
+
325
+
326
def disable_spark_io():
    """Public toggle: remove the Spark read/write/session hooks."""
    return _disable_spark_io()
@@ -0,0 +1,152 @@
1
+ """Narwhals adapter: ``trackNarwhals()`` + ``TrackedDataFrame``.
2
+
3
+ Patches the single ingestion chokepoint ``nw.from_native`` so every frame is
4
+ wrapped. The wrapper intercepts all methods generically via ``__getattr__``,
5
+ records each transformation as a ``LineageEvent``, runs profilers, and re-wraps
6
+ frame results so the chain stays tracked. See design Section 6.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import narwhals as nw
12
+
13
+ from ..core.groups import TrackedGroupBy
14
+ from ..core.lineage import new_id, store
15
+ from ..core.names import name_at_caller, op_logic_at_caller
16
+ from ..core.recording import record_op, record_source_profile
17
+ from ..core.steps import current_step
18
+ from ..core.suppress import suppressed
19
+ from ..profilers.backend_narwhals import NarwhalsBackend
20
+ from . import io as _io
21
+
22
+ _BACKEND = NarwhalsBackend()
23
+
24
+ # Narwhals frame methods that write to a sink (return None / a string, not a frame).
25
+ WRITE_METHODS = {"write_csv", "write_parquet"}
26
+
27
+
28
def _is_frame(o):
    """Ask the Narwhals backend whether *o* is a frame."""
    verdict = _BACKEND.is_frame(o)
    return verdict
30
+
31
+
32
def _unwrap(o):
    """Return the frame held by a ``TrackedDataFrame``; anything else unchanged."""
    if isinstance(o, TrackedDataFrame):
        return o._df
    return o
34
+
35
+
36
def _capture_columns(node_id, frame):
    """Store *frame*'s columns on *node_id*; any failure is ignored."""
    try:
        column_names = _BACKEND.columns(frame)
        store.set_columns(node_id, column_names)
    except Exception:
        return
41
+
42
+
43
class TrackedDataFrame:
    """Proxy around a frame that records lineage for calls made through it.

    Attribute access is delegated to the wrapped frame (``_df``). Callable
    attributes are returned as a ``tracked`` closure that unwraps tracked
    arguments, calls the underlying method, and records the outcome: a sink for
    write methods, a ``TrackedGroupBy`` for group results, a new tracked child
    for frame results, and a boundary event for ``to_native``.
    """

    def __init__(self, df, node_id=None, name=None):
        # df: the wrapped frame; node_id: reuse an id, else mint a new one;
        # name: label registered for the node in the store.
        self._df = df
        self.node_id = node_id or new_id()
        # Inside an opaque() block, don't register this frame -- the chain still
        # works (the wrapper holds the frame), but it never reaches the lineage.
        if not suppressed():
            store.name(self.node_id, name)
            _capture_columns(self.node_id, df)

    def _wrap_result(self, result, name):
        # Wrap a frame result in a new tracked child; returns (child, child id).
        # Passed to TrackedGroupBy as its wrapping callback.
        child = TrackedDataFrame(result, name=name)
        return child, child.node_id

    def __getattr__(self, attr_name):
        # __getattr__ only runs when normal lookup fails. If _df / node_id are
        # themselves missing, raise instead of recursing through self._df below.
        if attr_name in ("_df", "node_id"):
            raise AttributeError(attr_name)
        attr = getattr(self._df, attr_name)
        # Non-callable attributes are handed back untouched.
        if not callable(attr):
            return attr

        def tracked(*args, **kwargs):
            # NOTE: both helpers below are called directly from this closure; the
            # frame-depth comment suggests they depend on that call depth, so do
            # not move them into another function.
            name = name_at_caller()  # 0:name_at_caller 1:tracked 2:user
            logic = op_logic_at_caller()
            # Only top-level TrackedDataFrame arguments count as extra parents.
            frame_args = [a for a in (*args, *kwargs.values()) if isinstance(a, TrackedDataFrame)]
            # The underlying method receives bare frames, never wrappers.
            u_args = tuple(_unwrap(a) for a in args)
            u_kwargs = {k: _unwrap(v) for k, v in kwargs.items()}
            before = self._df

            # write_csv / write_parquet -> a sink node, not a tracked frame.
            if attr_name in WRITE_METHODS:
                result = attr(*u_args, **u_kwargs)
                _io.record_sink(
                    self.node_id,
                    _io._loc(*u_args, **u_kwargs),
                    attr_name.replace("write_", ""),
                    "narwhals",
                    "narwhals",
                )
                return result

            result = attr(*u_args, **u_kwargs)

            # group_by returns a GroupBy/LazyGroupBy, not a frame -- Section 8.3.
            if _BACKEND.is_group(result):
                return TrackedGroupBy(
                    result,
                    self.node_id,
                    _BACKEND,
                    self._wrap_result,
                    group_keys=(logic or {}).get("args", []),
                )
            if _is_frame(result):
                child = TrackedDataFrame(result, name=name)
                # Parents are (node id, frame) pairs: this frame first, then any
                # tracked frames passed as arguments.
                parents = [(self.node_id, self._df)] + [(a.node_id, a._df) for a in frame_args]
                record_op(
                    attr_name,
                    _BACKEND,
                    parents,
                    child.node_id,
                    before,
                    result,
                    name=name,
                    logic=logic,
                )
                return child
            # to_native drops to the bare frame -- record an explicit boundary
            # (decision 3) and hand the native frame back untracked.
            if attr_name == "to_native":
                store.add(
                    op="to_native",
                    backend="narwhals",
                    parents=[self.node_id],
                    child=None,
                    kind="sink",
                    step=current_step(),
                )
            # Any other non-frame result is returned as is.
            return result

        return tracked

    def __repr__(self):
        return f"TrackedDataFrame({store.label(self.node_id)})"
126
+
127
+
128
+ _orig_from_native = nw.from_native
129
+ _patched = False
130
+
131
+
132
def trackNarwhals(enable: bool = True) -> None:
    """Patch (or restore) ``nw.from_native`` to mint tracked frames.

    Enabling replaces ``nw.from_native`` with a wrapper that returns a
    ``TrackedDataFrame`` and hooks the pandas read functions so source frames
    carry their load location. Disabling restores the original function and
    removes the pandas hooks. Repeated calls with the same *enable* value are
    no-ops.

    Args:
        enable: ``True`` to install the patch, ``False`` to remove it.
    """
    global _patched
    if enable and not _patched:

        def from_native(*a, **k):
            # name_at_caller() stays in this function so it is evaluated at the
            # same call depth as before.
            tdf = TrackedDataFrame(_orig_from_native(*a, **k), name=name_at_caller())
            # The native object may arrive positionally or as the
            # ``native_object`` keyword; check both so read provenance is not
            # lost for keyword-style calls.
            native = a[0] if a else k.get("native_object")
            info = _io.read_info(native) if native is not None else None
            if info:
                store.set_source(tdf.node_id, **info)
                record_source_profile(tdf.node_id, tdf._df, _BACKEND)
            return tdf

        nw.from_native = from_native
        _io.enable_pandas_io()
        _patched = True
    elif not enable and _patched:
        nw.from_native = _orig_from_native
        _io.disable_pandas_io()
        _patched = False