goldenanalysis 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,61 @@
1
+ """GoldenAnalysis — read-only cross-cutting analysis/metrics/reporting for the Golden Suite.
2
+
3
+ Public surface (Phase 1): the generic frame path.
4
+
5
+ import goldenanalysis as ga
6
+ report = ga.analyze(df, analyzers=["frame.summary"])
7
+ print(report.to_markdown())
8
+
9
+ Suite adapters, the other analyzers, ``ReportHistory``/regression detection, the
10
+ TypeScript port, and the Rust accelerator land in later phases.
11
+
12
+ Public names are re-exported lazily (PEP 562) so the package imports cleanly even
13
+ while submodules are still being built out.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ from typing import TYPE_CHECKING, Any
19
+
20
__version__ = "0.1.0"

# The public surface of the package; also what ``dir(goldenanalysis)`` reports.
__all__ = [
    "analyze",
    "analyze_match",
    "analyze_pipeline",
    "AnalysisReport",
    "Metric",
    "ReportHistory",
    "RegressionPolicy",
    "__version__",
]

# Static type checkers get real imports; at runtime these names are resolved
# lazily through the module-level ``__getattr__``.
if TYPE_CHECKING:
    from goldenanalysis._api import analyze, analyze_match, analyze_pipeline
    from goldenanalysis.history import ReportHistory
    from goldenanalysis.models import AnalysisReport, Metric, RegressionPolicy

# Exported name -> (submodule, attribute), grouped by the submodule that owns it.
# Every export keeps its own name inside its submodule.
_LAZY: dict[str, tuple[str, str]] = {
    exported: (submodule, exported)
    for submodule, names in (
        ("goldenanalysis._api", ("analyze", "analyze_match", "analyze_pipeline")),
        ("goldenanalysis.models", ("AnalysisReport", "Metric", "RegressionPolicy")),
        ("goldenanalysis.history", ("ReportHistory",)),
    )
    for exported in names
}
48
+
49
+
50
def __getattr__(name: str) -> Any:
    """Resolve a lazily exported public name on first access (PEP 562).

    Looks ``name`` up in ``_LAZY``, imports the owning submodule, and returns
    the attribute. The resolved object is stored in the module namespace, so
    later accesses find it directly and never reach this hook again.

    Raises:
        AttributeError: if ``name`` is not one of the lazily exported names.
    """
    target = _LAZY.get(name)
    if target is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    # Imported here so a plain ``import goldenanalysis`` stays as light as possible.
    import importlib

    module_name, attr = target
    value = getattr(importlib.import_module(module_name), attr)
    # Cache the result: without this every attribute access repeats the lookup.
    globals()[name] = value
    return value
58
+
59
+
60
def __dir__() -> list[str]:
    """Return the public names from ``__all__`` in sorted order."""
    names = list(__all__)
    names.sort()
    return names
goldenanalysis/_api.py ADDED
@@ -0,0 +1,154 @@
1
+ """Top-level analyze entrypoints — resolve analyzers, run them over an artifact,
2
+ assemble a single ``AnalysisReport``.
3
+
4
+ - ``analyze(df, ...)`` — the generic frame path (Phase 1).
5
+ - ``analyze_match(result, ...)`` / ``analyze_pipeline(result)`` — suite paths
6
+ (Phase 2a) over a GoldenMatch ``DedupeResult`` / GoldenPipe ``PipeResult``.
7
+
8
+ Cross-run aggregation + narrative generation land in Phase 2b.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from collections.abc import Sequence
14
+ from datetime import UTC, datetime
15
+ from typing import Any
16
+
17
+ import polars as pl
18
+
19
+ from goldenanalysis.adapters import FrameArtifactAdapter
20
+ from goldenanalysis.models import AnalysisReport, AnalyzerInput
21
+ from goldenanalysis.registry import available_analyzers, load_analyzer
22
+
23
+
24
def _assemble_report(
    inp: AnalyzerInput,
    analyzer_names: Sequence[str],
    *,
    run_id: str | None = None,
    generated_at: datetime | None = None,
) -> AnalysisReport:
    """Execute the named analyzers against ``inp`` and merge their output.

    This is the common back end of all analyze entrypoints. A requested name
    that is not among the discoverable analyzers does not raise; it is listed
    (comma-joined) under ``source["unavailable"]`` instead.

    When ``generated_at`` is falsy the current UTC time is used, and when
    ``run_id`` is falsy it is derived as ``"<generated_at ISO>#<dataset>"``.
    """
    dataset = inp.dataset
    known = set(available_analyzers())

    executed: list[str] = []
    missing: list[str] = []
    all_metrics = []
    all_tables = []
    for analyzer_name in analyzer_names:
        if analyzer_name in known:
            outcome = load_analyzer(analyzer_name).run(inp)
            all_metrics.extend(outcome.metrics)
            all_tables.extend(outcome.tables)
            executed.append(analyzer_name)
        else:
            missing.append(analyzer_name)

    timestamp = generated_at or datetime.now(UTC)
    report_id = run_id or f"{timestamp.isoformat()}#{dataset}"

    # Adapters tag their output with "__producer__"; the bare frame path does not.
    source = {"dataset": dataset, "producer": inp.artifacts.get("__producer__", "frame")}
    if missing:
        source["unavailable"] = ",".join(missing)

    return AnalysisReport(
        run_id=report_id,
        generated_at=timestamp,
        source=source,
        metrics=all_metrics,
        tables=all_tables,
        narrative=None,
        analyzers_run=executed,
    )
67
+
68
+
69
def _frame_compatible_analyzers() -> list[str]:
    """Return the discoverable analyzers that consume a generic ``frame``.

    Loading is best-effort: an analyzer that fails to load (for example
    because an optional suite dependency is missing) is left out of the
    default set instead of breaking the generic path. The failure is logged
    at debug level so a genuinely broken analyzer can still be diagnosed.
    """
    import logging

    log = logging.getLogger(__name__)
    compatible: list[str] = []
    for name in available_analyzers():
        try:
            analyzer = load_analyzer(name)
        except Exception:
            # Deliberately broad: any load failure only excludes this analyzer.
            log.debug("Skipping analyzer %r: it failed to load", name, exc_info=True)
            continue
        if "frame" in analyzer.info.consumes:
            compatible.append(name)
    return compatible
84
+
85
+
86
def _artifact_compatible_analyzers(inp: AnalyzerInput) -> list[str]:
    """Return the discoverable analyzers that can run on ``inp.artifacts``.

    An analyzer qualifies when at least one of its ``consumes`` keys is
    present in ``inp.artifacts``; this is the fan-out selector used by
    ``analyze_pipeline``. Loading is best-effort: an analyzer that fails to
    load is skipped, and the failure is logged at debug level.
    """
    import logging

    log = logging.getLogger(__name__)
    present = set(inp.artifacts)
    compatible: list[str] = []
    for name in available_analyzers():
        try:
            analyzer = load_analyzer(name)
        except Exception:
            # Deliberately broad: any load failure only excludes this analyzer.
            log.debug("Skipping analyzer %r: it failed to load", name, exc_info=True)
            continue
        if not present.isdisjoint(analyzer.info.consumes):
            compatible.append(name)
    return compatible
99
+
100
+
101
def analyze(
    df: pl.DataFrame,
    analyzers: Sequence[str] | None = None,
    *,
    dataset: str | None = None,
    run_id: str | None = None,
    generated_at: datetime | None = None,
) -> AnalysisReport:
    """Run ``analyzers`` over ``df`` and return a single ``AnalysisReport``.

    Args:
        df: the frame to analyze.
        analyzers: analyzer names to run. ``None`` selects every
            frame-compatible analyzer. A single name given as a bare string is
            treated as a one-element list rather than being split into characters.
        dataset: optional dataset label forwarded to the frame adapter.
        run_id: optional explicit run identifier for the report.
        generated_at: optional explicit report timestamp.

    Names that are requested but not discoverable are recorded in
    ``source["unavailable"]`` rather than raising — the report says what it
    could and couldn't compute.
    """
    inp = FrameArtifactAdapter().load(df, dataset=dataset)
    if analyzers is None:
        requested = _frame_compatible_analyzers()
    elif isinstance(analyzers, str):
        # ``str`` is itself a Sequence[str]; list("frame.summary") would yield
        # single characters, every one of which would be reported unavailable.
        requested = [analyzers]
    else:
        requested = list(analyzers)
    return _assemble_report(inp, requested, run_id=run_id, generated_at=generated_at)
118
+
119
+
120
def analyze_match(
    result: Any,
    *,
    dataset: str | None = None,
    certificate: Any = None,
    run_id: str | None = None,
    generated_at: datetime | None = None,
) -> AnalysisReport:
    """Report on a GoldenMatch ``DedupeResult`` using ``match.rates`` and
    ``cluster.distribution``.

    ``certificate`` is an optional recall certificate: either a
    ``{estimate, safe_bound}`` dict or a ``RecallEstimate``/``RecallCertificate``
    object. It is forwarded to the match adapter, which falls back to
    ``result.recall_certificate`` when none is given.
    """
    # Imported on demand so the generic frame path never pulls in this adapter.
    from goldenanalysis.adapters.match import MatchArtifactAdapter

    adapter = MatchArtifactAdapter()
    inp = adapter.load(result, dataset=dataset, certificate=certificate)
    analyzer_names = ["match.rates", "cluster.distribution"]
    return _assemble_report(inp, analyzer_names, run_id=run_id, generated_at=generated_at)
140
+
141
+
142
def analyze_pipeline(
    result: Any,
    *,
    dataset: str | None = None,
    run_id: str | None = None,
    generated_at: datetime | None = None,
) -> AnalysisReport:
    """Analyze a GoldenPipe ``PipeResult``, fanning out to every analyzer that
    has at least one consumed artifact present in ``result.artifacts``.

    Args:
        result: the pipeline result; read duck-typed by the pipe adapter.
        dataset: optional dataset label, matching the other entrypoints. When
            omitted the adapter derives one from ``result.source``.
        run_id: optional explicit run identifier for the report.
        generated_at: optional explicit report timestamp.
    """
    # Imported on demand so the generic frame path never pulls in this adapter.
    from goldenanalysis.adapters.pipe import PipeArtifactAdapter

    inp = PipeArtifactAdapter().load(result, dataset=dataset)
    names = _artifact_compatible_analyzers(inp)
    return _assemble_report(inp, names, run_id=run_id, generated_at=generated_at)
@@ -0,0 +1,81 @@
1
+ """Pure regression decision logic — baseline strategy + direction-aware policy.
2
+
3
+ Backend-free: operates on a list of ``(run_id, value)`` history points + the
4
+ current value. ``ReportHistory`` wires storage around this.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import statistics
10
+ from collections.abc import Sequence
11
+
12
+ from goldenanalysis.models import Direction, RegressionPolicy
13
+
14
+
15
def baseline_value(history: Sequence[float], strategy: str, *, window: int = 7) -> float | None:
    """Return the baseline to compare the current value against.

    - ``"previous"`` / ``"last_known_good"``: the most recent historical value
      (v1: ``last_known_good`` aliases ``previous`` until a health signal exists).
    - ``"rolling_median"``: median of the last ``window`` historical values — immune
      to one noisy night, where ``previous`` would alternately flag and un-flag.
    - any other string is treated as a pinned ``run_id`` and is resolved by the
      caller (which has the run->value map); here it falls through to ``previous``.

    Args:
        history: historical values, oldest first.
        strategy: one of the strategies described above.
        window: number of trailing points for ``"rolling_median"``; ignored
            by every other strategy.

    Returns:
        The baseline as a float, or None when there is no history.

    Raises:
        ValueError: if ``strategy`` is ``"rolling_median"`` and ``window < 1``.
    """
    if not history:
        return None
    if strategy == "rolling_median":
        # history[-0:] is the WHOLE sequence and a negative window drops leading
        # points, so a non-positive window would silently give a wrong baseline.
        if window < 1:
            raise ValueError(f"window must be >= 1 for rolling_median, got {window}")
        return float(statistics.median(history[-window:]))
    # "previous", "last_known_good", or a pinned id resolved upstream.
    return float(history[-1])
34
+
35
+
36
def delta_pct(baseline: float, current: float) -> float:
    """Return the signed percent change from ``baseline`` to ``current``.

    The change is measured against the magnitude of the baseline, so a positive
    result always means ``current`` is higher than ``baseline`` — including
    when the baseline is negative. A zero baseline has no defined percent
    change; 0.0 is returned in that case.
    """
    if baseline == 0:
        return 0.0
    # Dividing by the signed baseline would flip the sign for negative
    # baselines and make a rise look like a drop to direction-aware callers.
    return (current - baseline) / abs(baseline) * 100.0
40
+
41
+
42
+ def is_regression(direction: Direction, baseline: float, current: float, threshold_pct: float) -> bool:
43
+ """Direction-aware: a higher_better metric flags only on a DROP beyond the
44
+ threshold; lower_better only on a RISE; neutral on either direction."""
45
+ d = delta_pct(baseline, current)
46
+ if direction == "higher_better":
47
+ return d <= -threshold_pct
48
+ if direction == "lower_better":
49
+ return d >= threshold_pct
50
+ return abs(d) >= threshold_pct
51
+
52
+
53
def evaluate_metric(
    *,
    key: str,
    direction: Direction,
    history: Sequence[float],
    current: float,
    strategy: str,
    window: int,
    policy: RegressionPolicy,
):
    """Build the ``Regression`` record for a single metric.

    Returns None when ``history`` yields no baseline. Otherwise a record is
    always returned; its ``flagged`` field carries the direction-aware verdict
    against ``policy.threshold_for(key)``, so callers can also surface
    near-misses.
    """
    from goldenanalysis.models import Regression

    reference = baseline_value(history, strategy, window=window)
    if reference is None:
        return None

    limit_pct = policy.threshold_for(key)
    change_pct = delta_pct(reference, current)
    exceeded = is_regression(direction, reference, current, limit_pct)
    return Regression(
        metric=key,
        baseline=reference,
        current=current,
        delta_pct=change_pct,
        flagged=exceeded,
        direction=direction,
    )
@@ -0,0 +1,11 @@
1
+ """Artifact adapters normalize a producer's output into an ``AnalyzerInput``.
2
+
3
+ Only the generic ``frame`` adapter (zero suite deps) is re-exported from this package.
4
+ The suite adapters (match/check/flow/pipe) live in their own submodules and are imported from there when needed.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from goldenanalysis.adapters.frame import FrameArtifactAdapter
10
+
11
+ __all__ = ["FrameArtifactAdapter"]
@@ -0,0 +1,33 @@
1
+ """``check`` adapter — GoldenCheck scan output → ``AnalyzerInput.artifacts``.
2
+
3
+ Two entry points:
4
+ - ``from_scan(findings, profile, ...)`` — pure, no ``goldencheck`` import (the seam
5
+ the unit tests and the pipe adapter use).
6
+ - ``load(df, ...)`` — lazy-imports ``goldencheck`` and runs ``scan_dataframe``.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import Any
12
+
13
+ from goldenanalysis.models import AnalyzerInput
14
+
15
+
16
class CheckArtifactAdapter:
    """Turn GoldenCheck scan output into an ``AnalyzerInput``."""

    def from_scan(self, findings: Any, profile: Any = None, *, dataset: str | None = None) -> AnalyzerInput:
        """Wrap already-computed scan output; never imports ``goldencheck``.

        The dataset label defaults to ``"check"`` when none is given.
        """
        artifacts = {"__producer__": "goldencheck", "findings": findings, "profile": profile}
        return AnalyzerInput(dataset=dataset or "check", artifacts=artifacts)

    def load(self, df: Any, *, dataset: str | None = None, **scan_kwargs: Any) -> AnalyzerInput:
        """Scan ``df`` with ``goldencheck.scan_dataframe`` and wrap the result.

        ``scan_kwargs`` are passed through to ``scan_dataframe`` unchanged.

        Raises:
            RuntimeError: if the optional ``goldencheck`` package is not installed.
        """
        try:
            import goldencheck  # pyright: ignore[reportMissingImports]  # optional [check] extra
        except ImportError as exc:  # pragma: no cover - exercised in CI with the extra
            message = "goldenanalysis[check] requires goldencheck: pip install goldenanalysis[check]"
            raise RuntimeError(message) from exc
        scan_output = goldencheck.scan_dataframe(df, **scan_kwargs)
        findings, profile = scan_output
        return self.from_scan(findings, profile, dataset=dataset)
@@ -0,0 +1,25 @@
1
+ """``flow`` adapter — GoldenFlow ``TransformResult`` → ``AnalyzerInput.artifacts``.
2
+
3
+ Duck-typed: reads ``.df`` and ``.manifest`` off the result; imports nothing from
4
+ ``goldenflow``.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import Any
10
+
11
+ from goldenanalysis.models import AnalyzerInput
12
+
13
+
14
class FlowArtifactAdapter:
    """Turn a GoldenFlow ``TransformResult`` into an ``AnalyzerInput``."""

    def load(self, result: Any, *, dataset: str | None = None) -> AnalyzerInput:
        """Read ``.df`` and ``.manifest`` off ``result``, tolerating their absence.

        Either attribute falls back to None when missing; the dataset label
        defaults to ``"flow"``.
        """
        frame = getattr(result, "df", None)
        manifest = getattr(result, "manifest", None)
        artifacts = {"__producer__": "goldenflow", "manifest": manifest}
        return AnalyzerInput(dataset=dataset or "flow", frame=frame, artifacts=artifacts)
@@ -0,0 +1,18 @@
1
+ """The generic ``frame`` adapter — the always-available, zero-suite-dep path.
2
+
3
+ Imports nothing from other suite packages, so GoldenAnalysis is useful on any
4
+ polars DataFrame even with no other Golden package installed.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import polars as pl
10
+
11
+ from goldenanalysis.models import AnalyzerInput
12
+
13
+
14
class FrameArtifactAdapter:
    """Wrap a raw polars DataFrame in an ``AnalyzerInput``."""

    def load(self, df: pl.DataFrame, *, dataset: str | None = None) -> AnalyzerInput:
        """Return an ``AnalyzerInput`` carrying ``df``; the label defaults to ``"frame"``."""
        label = dataset or "frame"
        return AnalyzerInput(frame=df, dataset=label)
@@ -0,0 +1,59 @@
1
+ """``match`` adapter — GoldenMatch ``DedupeResult`` → ``AnalyzerInput.artifacts``.
2
+
3
+ Duck-typed: reads ``.clusters`` / ``.scored_pairs`` / ``.stats`` off the result,
4
+ so it imports nothing from ``goldenmatch``. The recall certificate is optional —
5
+ passed in by the caller, or read off ``result.recall_certificate`` when the
6
+ producer attached one (``dedupe_df(..., certify=True)``).
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import Any
12
+
13
+ from goldenanalysis.models import AnalyzerInput
14
+
15
+
16
+ def _normalize_cert(cert: Any) -> dict[str, Any] | None:
17
+ """Normalize a recall certificate to ``{estimate, safe_bound}`` (or None)."""
18
+ if cert is None:
19
+ return None
20
+ if isinstance(cert, dict):
21
+ return {
22
+ "estimate": cert.get("estimate", cert.get("recall")),
23
+ "safe_bound": cert.get("safe_bound", cert.get("recall_lower")),
24
+ }
25
+ return {
26
+ "estimate": getattr(cert, "recall", None),
27
+ "safe_bound": getattr(cert, "recall_lower", None),
28
+ }
29
+
30
+
31
+ def _primary_threshold(config: Any) -> float | None:
32
+ """Best-effort: the first matchkey's threshold from the result's config."""
33
+ try:
34
+ matchkeys = config.get_matchkeys() if hasattr(config, "get_matchkeys") else getattr(config, "matchkeys", None)
35
+ for mk in matchkeys or []:
36
+ thr = getattr(mk, "threshold", None)
37
+ if thr is not None:
38
+ return float(thr)
39
+ except Exception:
40
+ return None
41
+ return None
42
+
43
+
44
class MatchArtifactAdapter:
    """Turn a GoldenMatch ``DedupeResult`` into an ``AnalyzerInput``."""

    def load(self, result: Any, *, dataset: str | None = None, certificate: Any = None) -> AnalyzerInput:
        """Collect the match artifacts off ``result`` by attribute name.

        An explicit ``certificate`` takes precedence over
        ``result.recall_certificate``. The ``recall_certificate`` artifact is
        only set when a certificate is available. The dataset label defaults
        to ``"match"``.
        """
        cert = certificate
        if cert is None:
            cert = getattr(result, "recall_certificate", None)

        # Missing or falsy attributes collapse to empty containers.
        clusters = getattr(result, "clusters", {}) or {}
        scored_pairs = getattr(result, "scored_pairs", []) or []
        match_stats = getattr(result, "stats", {}) or {}
        threshold = _primary_threshold(getattr(result, "config", None))

        artifacts: dict[str, Any] = {
            "__producer__": "goldenmatch",
            "clusters": clusters,
            "scored_pairs": scored_pairs,
            "match_stats": match_stats,
            "match_threshold": threshold,
        }
        normalized_cert = _normalize_cert(cert)
        if normalized_cert is not None:
            artifacts["recall_certificate"] = normalized_cert
        return AnalyzerInput(dataset=dataset or "match", artifacts=artifacts)
@@ -0,0 +1,36 @@
1
+ """``pipe`` adapter — GoldenPipe ``PipeResult`` → ``AnalyzerInput.artifacts``.
2
+
3
+ Near-passthrough: ``PipeResult.artifacts`` already carries the per-stage outputs
4
+ (findings / manifest / clusters / scored_pairs / match_stats / recall_certificate
5
+ / ...) under the same keys the analyzers read. Duck-typed; no ``goldenpipe`` import.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from pathlib import Path
11
+ from typing import Any
12
+
13
+ from goldenanalysis.adapters.match import _normalize_cert
14
+ from goldenanalysis.models import AnalyzerInput
15
+
16
+
17
+ def _dataset_from_source(source: Any) -> str:
18
+ if not source or not isinstance(source, str) or source.startswith("<"):
19
+ return "frame"
20
+ return Path(source).stem or "frame"
21
+
22
+
23
class PipeArtifactAdapter:
    """Turn a GoldenPipe ``PipeResult`` into an ``AnalyzerInput``."""

    def load(self, result: Any, *, dataset: str | None = None) -> AnalyzerInput:
        """Copy ``result.artifacts`` and normalize the recall certificate.

        The artifacts are shallow-copied, so the producer's dict is never
        mutated. A ``recall_certificate`` entry is normalized in place, or
        removed when it normalizes to None. Without an explicit ``dataset``
        the label is derived from ``result.source``.
        """
        raw_artifacts = getattr(result, "artifacts", {}) or {}
        artifacts: dict[str, Any] = dict(raw_artifacts)
        artifacts["__producer__"] = "goldenpipe"

        if "recall_certificate" in artifacts:
            cert = _normalize_cert(artifacts["recall_certificate"])
            if cert is not None:
                artifacts["recall_certificate"] = cert
            else:
                del artifacts["recall_certificate"]

        if dataset:
            label = dataset
        else:
            label = _dataset_from_source(getattr(result, "source", None))
        return AnalyzerInput(dataset=label, artifacts=artifacts)
@@ -0,0 +1,7 @@
1
+ """Analyzers: the composable units that compute metrics over an ``AnalyzerInput``."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from goldenanalysis.analyzers.base import Analyzer
6
+
7
+ __all__ = ["Analyzer"]
@@ -0,0 +1,19 @@
1
+ """The ``Analyzer`` protocol.
2
+
3
+ An analyzer is anything with an ``info`` descriptor and a ``run`` method. Concrete
4
+ analyzers are discovered by the registry via the ``goldenanalysis.analyzers``
5
+ entry-point group.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Protocol, runtime_checkable
11
+
12
+ from goldenanalysis.models import AnalyzerInfo, AnalyzerInput, AnalyzerResult
13
+
14
+
15
@runtime_checkable
class Analyzer(Protocol):
    """Structural interface every analyzer satisfies.

    The protocol is runtime-checkable, so ``isinstance(obj, Analyzer)`` can be
    used; that check only verifies that the members exist, not their types.
    """

    # Descriptor of the analyzer (an ``AnalyzerInfo``).
    info: AnalyzerInfo

    def run(self, inp: AnalyzerInput) -> AnalyzerResult:
        """Compute this analyzer's result for ``inp``."""
        ...
@@ -0,0 +1,77 @@
1
+ """``cluster.distribution`` — cluster-size shape from a GoldenMatch result.
2
+
3
+ Reads ``clusters`` (and optionally ``match_stats`` for the record count) from
4
+ ``AnalyzerInput.artifacts``.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from goldenanalysis.core import aggregate as agg
10
+ from goldenanalysis.models import (
11
+ AnalysisTable,
12
+ AnalyzerInfo,
13
+ AnalyzerInput,
14
+ AnalyzerResult,
15
+ Metric,
16
+ )
17
+
18
_PRODUCES = [
    "cluster.count",
    "cluster.record_count",
    "cluster.singleton_ratio",
    "cluster.size_p50",
    "cluster.size_p95",
    "cluster.size_max",
    "cluster.reduction_ratio",
]


def _cluster_size(cluster: object) -> int:
    """Return the number of records in one cluster entry.

    Accepts a cluster dict (its ``size``, else the length of ``members``), a
    plain collection of members, or a bare size value.
    """
    if isinstance(cluster, dict):
        size = cluster.get("size")
        if size is None:
            # Only consult ``members`` when no size is given; tolerate None.
            size = len(cluster.get("members") or [])
        return int(size)
    if isinstance(cluster, (list, tuple, set, frozenset)):
        return len(cluster)
    return int(cluster)


class ClusterDistributionAnalyzer:
    """Cluster count, singleton ratio, size quantiles, reduction ratio, histogram."""

    info = AnalyzerInfo(name="cluster.distribution", consumes=["clusters"], produces=_PRODUCES)

    def run(self, inp: AnalyzerInput) -> AnalyzerResult:
        """Compute the cluster-size metrics and histogram table.

        Reads ``clusters`` (a mapping of cluster id to cluster entry) and,
        optionally, ``match_stats["total_records"]`` from ``inp.artifacts``.
        Returns an empty result when there are no clusters.
        """
        clusters = inp.artifacts.get("clusters")
        if not clusters:
            return AnalyzerResult(metrics=[], tables=[])

        sizes = [_cluster_size(c) for c in clusters.values()]
        count = len(clusters)

        # Prefer the engine's own record total; fall back to summed cluster sizes
        # when it is missing or None.
        stats = inp.artifacts.get("match_stats", {}) or {}
        total_records = stats.get("total_records")
        record_count = int(total_records) if total_records is not None else sum(sizes)

        # Discrete size histogram in a single pass, buckets 1 / 2 / 3 / "4+".
        buckets = [0, 0, 0, 0]
        for size in sizes:
            if size >= 1:
                buckets[min(size, 4) - 1] += 1
        singletons = buckets[0]

        metrics = [
            Metric(key="cluster.count", value=count, unit="clusters", direction="neutral"),
            Metric(key="cluster.record_count", value=record_count, unit="rows", direction="neutral"),
            Metric(
                key="cluster.singleton_ratio",
                value=(singletons / count) if count else 0.0,
                unit="ratio",
                direction="neutral",
            ),
            Metric(key="cluster.size_p50", value=agg.quantile(sizes, 0.5), unit="rows", direction="neutral"),
            Metric(key="cluster.size_p95", value=agg.quantile(sizes, 0.95), unit="rows", direction="neutral"),
            Metric(key="cluster.size_max", value=max(sizes) if sizes else 0, unit="rows", direction="neutral"),
            Metric(
                key="cluster.reduction_ratio",
                value=(1 - count / record_count) if record_count else 0.0,
                unit="ratio",
                direction="neutral",
            ),
        ]

        table = AnalysisTable(
            name="cluster_size_histogram",
            columns=["size", "count"],
            rows=[[1, buckets[0]], [2, buckets[1]], [3, buckets[2]], ["4+", buckets[3]]],
        )

        return AnalyzerResult(metrics=metrics, tables=[table])
@@ -0,0 +1,72 @@
1
+ """``frame.summary`` — generic frame metrics, zero suite deps, always available.
2
+
3
+ Emits the Appendix-A metric set: row/column counts, mean null ratio, exact-
4
+ duplicate-row ratio, estimated in-memory size, plus a ``per_column`` table
5
+ (column, dtype, null_ratio, n_unique).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from goldenanalysis.core import aggregate as agg
11
+ from goldenanalysis.models import (
12
+ AnalysisTable,
13
+ AnalyzerInfo,
14
+ AnalyzerInput,
15
+ AnalyzerResult,
16
+ Metric,
17
+ )
18
+
19
_PRODUCES = [
    "frame.row_count",
    "frame.column_count",
    "frame.null_ratio_mean",
    "frame.duplicate_row_ratio",
    "frame.memory_bytes",
]


class FrameSummaryAnalyzer:
    """Describe a raw frame: shape, null mass, duplication, memory footprint."""

    info = AnalyzerInfo(name="frame.summary", consumes=["frame"], produces=_PRODUCES)

    def run(self, inp: AnalyzerInput) -> AnalyzerResult:
        """Compute the frame metrics and the ``per_column`` table.

        Raises:
            ValueError: if ``inp.frame`` is None.
        """
        frame = inp.frame
        if frame is None:
            raise ValueError("frame.summary requires AnalyzerInput.frame (a polars DataFrame)")

        row_count = frame.height
        column_count = frame.width
        null_by_column = agg.null_ratio_per_column(frame)
        # Mean of the per-column null ratios; a frame with no columns reports 0.0.
        if column_count:
            mean_null_ratio = sum(null_by_column.values()) / column_count
        else:
            mean_null_ratio = 0.0
        duplicate_ratio = agg.duplicate_row_ratio(frame)
        size_bytes = frame.estimated_size()

        metrics = [
            Metric(key="frame.row_count", value=row_count, unit="rows", direction="neutral"),
            Metric(key="frame.column_count", value=column_count, unit="columns", direction="neutral"),
            Metric(key="frame.null_ratio_mean", value=mean_null_ratio, unit="ratio", direction="lower_better"),
            Metric(key="frame.duplicate_row_ratio", value=duplicate_ratio, unit="ratio", direction="lower_better"),
            Metric(key="frame.memory_bytes", value=size_bytes, unit="bytes", direction="neutral"),
        ]

        column_rows = []
        for column_name in frame.columns:
            series = frame[column_name]
            column_rows.append([column_name, str(series.dtype), null_by_column[column_name], series.n_unique()])

        per_column = AnalysisTable(
            name="per_column",
            columns=["column", "dtype", "null_ratio", "n_unique"],
            rows=column_rows,
        )

        return AnalyzerResult(metrics=metrics, tables=[per_column])