goldenanalysis 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- goldenanalysis/__init__.py +61 -0
- goldenanalysis/_api.py +154 -0
- goldenanalysis/_regressions.py +81 -0
- goldenanalysis/adapters/__init__.py +11 -0
- goldenanalysis/adapters/check.py +33 -0
- goldenanalysis/adapters/flow.py +25 -0
- goldenanalysis/adapters/frame.py +18 -0
- goldenanalysis/adapters/match.py +59 -0
- goldenanalysis/adapters/pipe.py +36 -0
- goldenanalysis/analyzers/__init__.py +7 -0
- goldenanalysis/analyzers/base.py +19 -0
- goldenanalysis/analyzers/cluster_dist.py +77 -0
- goldenanalysis/analyzers/frame_summary.py +72 -0
- goldenanalysis/analyzers/match_rates.py +92 -0
- goldenanalysis/analyzers/quality_rollup.py +130 -0
- goldenanalysis/cli/__init__.py +1 -0
- goldenanalysis/cli/main.py +160 -0
- goldenanalysis/core/__init__.py +1 -0
- goldenanalysis/core/_native_loader.py +67 -0
- goldenanalysis/core/aggregate.py +113 -0
- goldenanalysis/history.py +205 -0
- goldenanalysis/mcp/__init__.py +6 -0
- goldenanalysis/mcp/server.py +260 -0
- goldenanalysis/models/__init__.py +31 -0
- goldenanalysis/models/analyzer.py +43 -0
- goldenanalysis/models/policy.py +51 -0
- goldenanalysis/models/report.py +102 -0
- goldenanalysis/narrative.py +61 -0
- goldenanalysis/py.typed +0 -0
- goldenanalysis/registry.py +56 -0
- goldenanalysis/render.py +67 -0
- goldenanalysis-0.1.0.dist-info/METADATA +188 -0
- goldenanalysis-0.1.0.dist-info/RECORD +36 -0
- goldenanalysis-0.1.0.dist-info/WHEEL +4 -0
- goldenanalysis-0.1.0.dist-info/entry_points.txt +8 -0
- goldenanalysis-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""GoldenAnalysis — read-only cross-cutting analysis/metrics/reporting for the Golden Suite.
|
|
2
|
+
|
|
3
|
+
Public surface (Phase 1): the generic frame path.
|
|
4
|
+
|
|
5
|
+
import goldenanalysis as ga
|
|
6
|
+
report = ga.analyze(df, analyzers=["frame.summary"])
|
|
7
|
+
print(report.to_markdown())
|
|
8
|
+
|
|
9
|
+
Suite adapters, the other analyzers, ``ReportHistory``/regression detection, the
|
|
10
|
+
TypeScript port, and the Rust accelerator land in later phases.
|
|
11
|
+
|
|
12
|
+
Public names are re-exported lazily (PEP 562) so the package imports cleanly even
|
|
13
|
+
while submodules are still being built out.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
from typing import TYPE_CHECKING, Any
|
|
19
|
+
|
|
20
|
+
__version__ = "0.1.0"
|
|
21
|
+
|
|
22
|
+
__all__ = [
|
|
23
|
+
"analyze",
|
|
24
|
+
"analyze_match",
|
|
25
|
+
"analyze_pipeline",
|
|
26
|
+
"AnalysisReport",
|
|
27
|
+
"Metric",
|
|
28
|
+
"ReportHistory",
|
|
29
|
+
"RegressionPolicy",
|
|
30
|
+
"__version__",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
if TYPE_CHECKING:
|
|
34
|
+
from goldenanalysis._api import analyze, analyze_match, analyze_pipeline
|
|
35
|
+
from goldenanalysis.history import ReportHistory
|
|
36
|
+
from goldenanalysis.models import AnalysisReport, Metric, RegressionPolicy
|
|
37
|
+
|
|
38
|
+
# Map exported name -> (submodule, attribute). Looked up by the module-level
# ``__getattr__`` below, which imports the submodule and fetches the attribute
# whenever one of these names is requested from the package (nothing is cached
# in the package namespace, so the lookup repeats on every access).
_LAZY: dict[str, tuple[str, str]] = {
    "analyze": ("goldenanalysis._api", "analyze"),
    "analyze_match": ("goldenanalysis._api", "analyze_match"),
    "analyze_pipeline": ("goldenanalysis._api", "analyze_pipeline"),
    "AnalysisReport": ("goldenanalysis.models", "AnalysisReport"),
    "Metric": ("goldenanalysis.models", "Metric"),
    "ReportHistory": ("goldenanalysis.history", "ReportHistory"),
    "RegressionPolicy": ("goldenanalysis.models", "RegressionPolicy"),
}
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def __getattr__(name: str) -> Any:
    """Lazily resolve a public name (PEP 562 module-level hook).

    Looks ``name`` up in ``_LAZY``, imports the submodule that owns it and
    returns the requested attribute from that submodule.

    Raises:
        AttributeError: if ``name`` is not one of the lazily exported names.
    """
    if name not in _LAZY:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    # Imported here rather than at module top so a failed lookup costs nothing extra.
    from importlib import import_module

    module_path, attr_name = _LAZY[name]
    owner = import_module(module_path)
    return getattr(owner, attr_name)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def __dir__() -> list[str]:
    """Return the names listed in ``__all__``, sorted, as the package's ``dir()`` listing."""
    public_names = list(__all__)
    public_names.sort()
    return public_names
|
goldenanalysis/_api.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
"""Top-level analyze entrypoints — resolve analyzers, run them over an artifact,
|
|
2
|
+
assemble a single ``AnalysisReport``.
|
|
3
|
+
|
|
4
|
+
- ``analyze(df, ...)`` — the generic frame path (Phase 1).
|
|
5
|
+
- ``analyze_match(result, ...)`` / ``analyze_pipeline(result)`` — suite paths
|
|
6
|
+
(Phase 2a) over a GoldenMatch ``DedupeResult`` / GoldenPipe ``PipeResult``.
|
|
7
|
+
|
|
8
|
+
Cross-run aggregation + narrative generation land in Phase 2b.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from collections.abc import Sequence
|
|
14
|
+
from datetime import UTC, datetime
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
import polars as pl
|
|
18
|
+
|
|
19
|
+
from goldenanalysis.adapters import FrameArtifactAdapter
|
|
20
|
+
from goldenanalysis.models import AnalysisReport, AnalyzerInput
|
|
21
|
+
from goldenanalysis.registry import available_analyzers, load_analyzer
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _assemble_report(
    inp: AnalyzerInput,
    analyzer_names: Sequence[str],
    *,
    run_id: str | None = None,
    generated_at: datetime | None = None,
) -> AnalysisReport:
    """Execute the requested analyzers against ``inp`` and build one ``AnalysisReport``.

    Every analyze entrypoint funnels through here. A requested name that the
    registry does not list is not an error: it is skipped and reported under
    ``source["unavailable"]`` (comma-joined, in request order).

    Args:
        inp: The normalized analyzer input.
        analyzer_names: Analyzer names to run, in order.
        run_id: Report id; when falsy, one is derived from the timestamp and dataset.
        generated_at: Report timestamp; when falsy, the current UTC time is used.
    """
    dataset_name = inp.dataset
    known = set(available_analyzers())

    executed: list[str] = []
    missing: list[str] = []
    collected_metrics = []
    collected_tables = []
    for analyzer_name in analyzer_names:
        if analyzer_name in known:
            outcome = load_analyzer(analyzer_name).run(inp)
            collected_metrics.extend(outcome.metrics)
            collected_tables.extend(outcome.tables)
            executed.append(analyzer_name)
        else:
            missing.append(analyzer_name)

    timestamp = generated_at or datetime.now(UTC)
    report_id = run_id or f"{timestamp.isoformat()}#{dataset_name}"

    # "__producer__" is set by the suite adapters; inputs without it count as "frame".
    source = {
        "dataset": dataset_name,
        "producer": inp.artifacts.get("__producer__", "frame"),
    }
    if missing:
        source["unavailable"] = ",".join(missing)

    return AnalysisReport(
        run_id=report_id,
        generated_at=timestamp,
        source=source,
        metrics=collected_metrics,
        tables=collected_tables,
        narrative=None,
        analyzers_run=executed,
    )
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _frame_compatible_analyzers() -> list[str]:
    """List the discoverable analyzers that load cleanly and consume ``"frame"``.

    This is the default analyzer set for the generic frame path. An analyzer
    whose load raises (for example because an optional suite dependency is not
    installed) is left out instead of failing the whole call.
    """
    compatible: list[str] = []
    for candidate in available_analyzers():
        try:
            loaded = load_analyzer(candidate)
        except Exception:
            # Deliberately broad: any load failure just removes the candidate.
            continue
        if "frame" not in loaded.info.consumes:
            continue
        compatible.append(candidate)
    return compatible
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _artifact_compatible_analyzers(inp: AnalyzerInput) -> list[str]:
    """Select analyzers for ``analyze_pipeline`` based on the artifacts that are present.

    A discoverable analyzer is selected when it loads without error and at least
    one key in its ``info.consumes`` is also a key of ``inp.artifacts``.
    """
    available_keys = set(inp.artifacts)
    selected: list[str] = []
    for candidate in available_analyzers():
        try:
            loaded = load_analyzer(candidate)
        except Exception:
            # Deliberately broad: an analyzer that cannot load is simply not selected.
            continue
        for consumed_key in loaded.info.consumes:
            if consumed_key in available_keys:
                selected.append(candidate)
                break
    return selected
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def analyze(
    df: pl.DataFrame,
    analyzers: Sequence[str] | None = None,
    *,
    dataset: str | None = None,
    run_id: str | None = None,
    generated_at: datetime | None = None,
) -> AnalysisReport:
    """Analyze a polars DataFrame and return one ``AnalysisReport``.

    Args:
        df: The frame to analyze.
        analyzers: Analyzer names to run. ``None`` selects every frame-compatible
            analyzer; an explicit sequence (even an empty one) is used as given.
        dataset: Optional dataset label forwarded to the frame adapter.
        run_id: Optional report id override.
        generated_at: Optional report timestamp override.

    Requested names that are not discoverable do not raise; they are listed in
    the report's ``source["unavailable"]``.
    """
    frame_input = FrameArtifactAdapter().load(df, dataset=dataset)
    if analyzers is None:
        names = _frame_compatible_analyzers()
    else:
        names = list(analyzers)
    return _assemble_report(frame_input, names, run_id=run_id, generated_at=generated_at)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def analyze_match(
    result: Any,
    *,
    dataset: str | None = None,
    certificate: Any = None,
    run_id: str | None = None,
    generated_at: datetime | None = None,
) -> AnalysisReport:
    """Analyze a GoldenMatch ``DedupeResult`` with ``match.rates`` and ``cluster.distribution``.

    Args:
        result: The dedupe result; it is handed to ``MatchArtifactAdapter`` for normalization.
        dataset: Optional dataset label.
        certificate: Optional recall certificate - a ``{estimate, safe_bound}`` dict
            or a ``RecallEstimate``/``RecallCertificate``. Without one, the recall
            metrics are omitted (graceful degradation).
        run_id: Optional report id override.
        generated_at: Optional report timestamp override.
    """
    # Imported at call time, so importing this module does not load the match adapter.
    from goldenanalysis.adapters.match import MatchArtifactAdapter

    adapter = MatchArtifactAdapter()
    match_input = adapter.load(result, dataset=dataset, certificate=certificate)
    suite_analyzers = ["match.rates", "cluster.distribution"]
    return _assemble_report(match_input, suite_analyzers, run_id=run_id, generated_at=generated_at)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def analyze_pipeline(
    result: Any,
    *,
    run_id: str | None = None,
    generated_at: datetime | None = None,
) -> AnalysisReport:
    """Analyze a GoldenPipe ``PipeResult``.

    The result is normalized by ``PipeArtifactAdapter``; every discoverable
    analyzer that consumes at least one of the resulting artifact keys is run.
    """
    # Imported at call time, so importing this module does not load the pipe adapter.
    from goldenanalysis.adapters.pipe import PipeArtifactAdapter

    pipe_input = PipeArtifactAdapter().load(result)
    selected = _artifact_compatible_analyzers(pipe_input)
    return _assemble_report(pipe_input, selected, run_id=run_id, generated_at=generated_at)
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""Pure regression decision logic — baseline strategy + direction-aware policy.
|
|
2
|
+
|
|
3
|
+
Backend-free: operates on a list of ``(run_id, value)`` history points + the
|
|
4
|
+
current value. ``ReportHistory`` wires storage around this.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import statistics
|
|
10
|
+
from collections.abc import Sequence
|
|
11
|
+
|
|
12
|
+
from goldenanalysis.models import Direction, RegressionPolicy
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def baseline_value(history: Sequence[float], strategy: str, *, window: int = 7) -> float | None:
|
|
16
|
+
"""The baseline to compare the current value against.
|
|
17
|
+
|
|
18
|
+
- ``"previous"`` / ``"last_known_good"``: the most recent historical value
|
|
19
|
+
(v1: ``last_known_good`` aliases ``previous`` until a health signal exists).
|
|
20
|
+
- ``"rolling_median"``: median of the last ``window`` historical values — immune
|
|
21
|
+
to one noisy night, where ``previous`` would alternately flag and un-flag.
|
|
22
|
+
- any other string is treated as a pinned ``run_id`` and is resolved by the
|
|
23
|
+
caller (which has the run->value map); here it falls through to ``previous``.
|
|
24
|
+
|
|
25
|
+
Returns None when there's no history to compare against.
|
|
26
|
+
"""
|
|
27
|
+
if not history:
|
|
28
|
+
return None
|
|
29
|
+
if strategy == "rolling_median":
|
|
30
|
+
tail = list(history[-window:])
|
|
31
|
+
return float(statistics.median(tail))
|
|
32
|
+
# "previous", "last_known_good", or a pinned id resolved upstream.
|
|
33
|
+
return float(history[-1])
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def delta_pct(baseline: float, current: float) -> float:
    """Return the signed percentage change from ``baseline`` to ``current``.

    Positive means ``current`` is numerically higher than ``baseline``, negative
    means lower. The change is scaled by ``abs(baseline)`` so that the sign keeps
    tracking the direction of movement when the baseline itself is negative
    (dividing by a signed baseline would report a rise from -10 to -5 as -50%).

    A zero baseline has no defined percentage change; ``0.0`` is returned.
    """
    if baseline == 0:
        return 0.0
    return (current - baseline) / abs(baseline) * 100.0
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def is_regression(direction: Direction, baseline: float, current: float, threshold_pct: float) -> bool:
    """Decide whether moving from ``baseline`` to ``current`` is a regression.

    Direction-aware: a ``"higher_better"`` metric flags only on a DROP beyond the
    threshold; ``"lower_better"`` only on a RISE; any other direction (neutral)
    flags a move of at least the threshold in either direction.

    Zero baseline: a percentage change from zero is undefined and ``delta_pct``
    reports it as ``0.0``, which would hide every move away from zero. When the
    baseline is zero and the value changed, the decision is therefore made on the
    sign of the move alone: any worsening move is flagged regardless of
    ``threshold_pct``, and an improving move is not.

    Args:
        direction: ``"higher_better"``, ``"lower_better"``, or neutral.
        baseline: The reference value.
        current: The value being evaluated.
        threshold_pct: Minimum percentage move that counts as a regression.
    """
    if baseline == 0 and current != 0:
        if direction == "higher_better":
            return current < 0
        if direction == "lower_better":
            return current > 0
        # Neutral: any move off a zero baseline counts.
        return True

    d = delta_pct(baseline, current)
    if direction == "higher_better":
        return d <= -threshold_pct
    if direction == "lower_better":
        return d >= threshold_pct
    return abs(d) >= threshold_pct
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def evaluate_metric(
    *,
    key: str,
    direction: Direction,
    history: Sequence[float],
    current: float,
    strategy: str,
    window: int,
    policy: RegressionPolicy,
):
    """Build the ``Regression`` record for a single metric.

    Returns None when ``baseline_value`` finds no baseline. Otherwise a record is
    always returned; its ``flagged`` field carries the direction-aware decision
    made against ``policy.threshold_for(key)``, so callers can still show
    near-misses that were not flagged.
    """
    from goldenanalysis.models import Regression

    reference = baseline_value(history, strategy, window=window)
    if reference is None:
        return None

    limit_pct = policy.threshold_for(key)
    change_pct = delta_pct(reference, current)
    flagged = is_regression(direction, reference, current, limit_pct)
    return Regression(
        metric=key,
        baseline=reference,
        current=current,
        delta_pct=change_pct,
        flagged=flagged,
        direction=direction,
    )
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""Artifact adapters normalize a producer's output into an ``AnalyzerInput``.
|
|
2
|
+
|
|
3
|
+
Phase 1 ships only the generic ``frame`` adapter (zero suite deps). Suite adapters
|
|
4
|
+
(match/check/flow/pipe) are not re-exported here; import them from their own submodules.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from goldenanalysis.adapters.frame import FrameArtifactAdapter
|
|
10
|
+
|
|
11
|
+
__all__ = ["FrameArtifactAdapter"]
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""``check`` adapter — GoldenCheck scan output → ``AnalyzerInput.artifacts``.
|
|
2
|
+
|
|
3
|
+
Two entry points:
|
|
4
|
+
- ``from_scan(findings, profile, ...)`` — pure, no ``goldencheck`` import (the seam
|
|
5
|
+
the unit tests and the pipe adapter use).
|
|
6
|
+
- ``load(df, ...)`` — lazy-imports ``goldencheck`` and runs ``scan_dataframe``.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from goldenanalysis.models import AnalyzerInput
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class CheckArtifactAdapter:
    """Turn GoldenCheck scan output into an ``AnalyzerInput``."""

    def from_scan(self, findings: Any, profile: Any = None, *, dataset: str | None = None) -> AnalyzerInput:
        """Wrap already-computed scan output; this path never imports ``goldencheck``.

        ``findings`` and ``profile`` are stored as-is under the artifact keys of
        the same names. The dataset label defaults to ``"check"``.
        """
        payload = {
            "__producer__": "goldencheck",
            "findings": findings,
            "profile": profile,
        }
        label = dataset if dataset else "check"
        return AnalyzerInput(dataset=label, artifacts=payload)

    def load(self, df: Any, *, dataset: str | None = None, **scan_kwargs: Any) -> AnalyzerInput:
        """Run ``goldencheck.scan_dataframe(df, **scan_kwargs)`` and wrap its output.

        Raises:
            RuntimeError: if ``goldencheck`` is not importable.
        """
        try:
            import goldencheck  # pyright: ignore[reportMissingImports]  # optional [check] extra
        except ImportError as exc:  # pragma: no cover - exercised in CI with the extra
            raise RuntimeError(
                "goldenanalysis[check] requires goldencheck: pip install goldenanalysis[check]"
            ) from exc

        scan_output = goldencheck.scan_dataframe(df, **scan_kwargs)
        findings, profile = scan_output
        return self.from_scan(findings, profile, dataset=dataset)
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""``flow`` adapter — GoldenFlow ``TransformResult`` → ``AnalyzerInput.artifacts``.
|
|
2
|
+
|
|
3
|
+
Duck-typed: reads ``.df`` and ``.manifest`` off the result; imports nothing from
|
|
4
|
+
``goldenflow``.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from goldenanalysis.models import AnalyzerInput
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class FlowArtifactAdapter:
    """Turn a GoldenFlow ``TransformResult`` into an ``AnalyzerInput``."""

    def load(self, result: Any, *, dataset: str | None = None) -> AnalyzerInput:
        """Read ``.df`` and ``.manifest`` off ``result`` (each None when absent).

        The frame becomes ``AnalyzerInput.frame`` and the manifest is stored under
        the ``"manifest"`` artifact key. The dataset label defaults to ``"flow"``.
        """
        label = dataset if dataset else "flow"
        frame = getattr(result, "df", None)
        manifest = getattr(result, "manifest", None)
        artifacts = {"__producer__": "goldenflow", "manifest": manifest}
        return AnalyzerInput(dataset=label, frame=frame, artifacts=artifacts)
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""The generic ``frame`` adapter — the always-available, zero-suite-dep path.
|
|
2
|
+
|
|
3
|
+
Imports nothing from other suite packages, so GoldenAnalysis is useful on any
|
|
4
|
+
polars DataFrame even with no other Golden package installed.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import polars as pl
|
|
10
|
+
|
|
11
|
+
from goldenanalysis.models import AnalyzerInput
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class FrameArtifactAdapter:
    """Wrap a raw polars DataFrame in an ``AnalyzerInput``."""

    def load(self, df: pl.DataFrame, *, dataset: str | None = None) -> AnalyzerInput:
        """Return an ``AnalyzerInput`` holding ``df``; the dataset label defaults to ``"frame"``."""
        if dataset:
            label = dataset
        else:
            label = "frame"
        return AnalyzerInput(frame=df, dataset=label)
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""``match`` adapter — GoldenMatch ``DedupeResult`` → ``AnalyzerInput.artifacts``.
|
|
2
|
+
|
|
3
|
+
Duck-typed: reads ``.clusters`` / ``.scored_pairs`` / ``.stats`` off the result,
|
|
4
|
+
so it imports nothing from ``goldenmatch``. The recall certificate is optional —
|
|
5
|
+
passed in by the caller, or read off ``result.recall_certificate`` when the
|
|
6
|
+
producer attached one (``dedupe_df(..., certify=True)``).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from goldenanalysis.models import AnalyzerInput
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _normalize_cert(cert: Any) -> dict[str, Any] | None:
|
|
17
|
+
"""Normalize a recall certificate to ``{estimate, safe_bound}`` (or None)."""
|
|
18
|
+
if cert is None:
|
|
19
|
+
return None
|
|
20
|
+
if isinstance(cert, dict):
|
|
21
|
+
return {
|
|
22
|
+
"estimate": cert.get("estimate", cert.get("recall")),
|
|
23
|
+
"safe_bound": cert.get("safe_bound", cert.get("recall_lower")),
|
|
24
|
+
}
|
|
25
|
+
return {
|
|
26
|
+
"estimate": getattr(cert, "recall", None),
|
|
27
|
+
"safe_bound": getattr(cert, "recall_lower", None),
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _primary_threshold(config: Any) -> float | None:
|
|
32
|
+
"""Best-effort: the first matchkey's threshold from the result's config."""
|
|
33
|
+
try:
|
|
34
|
+
matchkeys = config.get_matchkeys() if hasattr(config, "get_matchkeys") else getattr(config, "matchkeys", None)
|
|
35
|
+
for mk in matchkeys or []:
|
|
36
|
+
thr = getattr(mk, "threshold", None)
|
|
37
|
+
if thr is not None:
|
|
38
|
+
return float(thr)
|
|
39
|
+
except Exception:
|
|
40
|
+
return None
|
|
41
|
+
return None
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class MatchArtifactAdapter:
    """Turn a GoldenMatch ``DedupeResult`` into an ``AnalyzerInput``."""

    def load(self, result: Any, *, dataset: str | None = None, certificate: Any = None) -> AnalyzerInput:
        """Collect the match artifacts from ``result`` by attribute access.

        ``clusters``, ``scored_pairs`` and ``stats`` are read off ``result``, with
        empty containers substituted when missing or falsy. An explicit
        ``certificate`` wins over ``result.recall_certificate``; the
        ``"recall_certificate"`` key is only added when one is available. The
        dataset label defaults to ``"match"``.
        """
        if certificate is None:
            raw_cert = getattr(result, "recall_certificate", None)
        else:
            raw_cert = certificate

        clusters = getattr(result, "clusters", {}) or {}
        scored_pairs = getattr(result, "scored_pairs", []) or []
        match_stats = getattr(result, "stats", {}) or {}
        threshold = _primary_threshold(getattr(result, "config", None))

        artifacts: dict[str, Any] = {
            "__producer__": "goldenmatch",
            "clusters": clusters,
            "scored_pairs": scored_pairs,
            "match_stats": match_stats,
            "match_threshold": threshold,
        }

        cert = _normalize_cert(raw_cert)
        if cert is not None:
            artifacts["recall_certificate"] = cert

        label = dataset if dataset else "match"
        return AnalyzerInput(dataset=label, artifacts=artifacts)
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""``pipe`` adapter — GoldenPipe ``PipeResult`` → ``AnalyzerInput.artifacts``.
|
|
2
|
+
|
|
3
|
+
Near-passthrough: ``PipeResult.artifacts`` already carries the per-stage outputs
|
|
4
|
+
(findings / manifest / clusters / scored_pairs / match_stats / recall_certificate
|
|
5
|
+
/ ...) under the same keys the analyzers read. Duck-typed; no ``goldenpipe`` import.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from goldenanalysis.adapters.match import _normalize_cert
|
|
14
|
+
from goldenanalysis.models import AnalyzerInput
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _dataset_from_source(source: Any) -> str:
|
|
18
|
+
if not source or not isinstance(source, str) or source.startswith("<"):
|
|
19
|
+
return "frame"
|
|
20
|
+
return Path(source).stem or "frame"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class PipeArtifactAdapter:
    """Turn a GoldenPipe ``PipeResult`` into an ``AnalyzerInput``."""

    def load(self, result: Any, *, dataset: str | None = None) -> AnalyzerInput:
        """Copy ``result.artifacts``, tag the producer, and normalize the recall certificate.

        A ``"recall_certificate"`` entry is replaced by its normalized form, or
        removed when it normalizes to None. Without an explicit ``dataset``, the
        label is derived from ``result.source``.
        """
        raw_artifacts = getattr(result, "artifacts", {}) or {}
        artifacts: dict[str, Any] = dict(raw_artifacts)
        artifacts["__producer__"] = "goldenpipe"

        if "recall_certificate" in artifacts:
            cert = _normalize_cert(artifacts["recall_certificate"])
            if cert is not None:
                artifacts["recall_certificate"] = cert
            else:
                artifacts.pop("recall_certificate", None)

        if dataset:
            label = dataset
        else:
            label = _dataset_from_source(getattr(result, "source", None))
        return AnalyzerInput(dataset=label, artifacts=artifacts)
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""The ``Analyzer`` protocol.
|
|
2
|
+
|
|
3
|
+
An analyzer is anything with an ``info`` descriptor and a ``run`` method. Concrete
|
|
4
|
+
analyzers are discovered by the registry via the ``goldenanalysis.analyzers``
|
|
5
|
+
entry-point group.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Protocol, runtime_checkable
|
|
11
|
+
|
|
12
|
+
from goldenanalysis.models import AnalyzerInfo, AnalyzerInput, AnalyzerResult
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Structural interface for analyzers: anything exposing an ``info`` descriptor and
# a ``run`` method conforms, without inheriting from this class. The
# ``runtime_checkable`` decorator additionally permits ``isinstance`` checks
# against the protocol.
@runtime_checkable
class Analyzer(Protocol):
    # Descriptor for the analyzer (an ``AnalyzerInfo``).
    info: AnalyzerInfo

    # Run the analyzer over one normalized input and return its result.
    def run(self, inp: AnalyzerInput) -> AnalyzerResult: ...
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""``cluster.distribution`` — cluster-size shape from a GoldenMatch result.
|
|
2
|
+
|
|
3
|
+
Reads ``clusters`` (and optionally ``match_stats`` for the record count) from
|
|
4
|
+
``AnalyzerInput.artifacts``.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from goldenanalysis.core import aggregate as agg
|
|
10
|
+
from goldenanalysis.models import (
|
|
11
|
+
AnalysisTable,
|
|
12
|
+
AnalyzerInfo,
|
|
13
|
+
AnalyzerInput,
|
|
14
|
+
AnalyzerResult,
|
|
15
|
+
Metric,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
# Metric keys declared as ``produces`` in the ``AnalyzerInfo`` of
# ``ClusterDistributionAnalyzer``; its ``run`` emits one ``Metric`` per key
# whenever the input carries clusters.
_PRODUCES = [
    "cluster.count",
    "cluster.record_count",
    "cluster.singleton_ratio",
    "cluster.size_p50",
    "cluster.size_p95",
    "cluster.size_max",
    "cluster.reduction_ratio",
]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class ClusterDistributionAnalyzer:
    """Cluster count, singleton ratio, size quantiles, reduction ratio, histogram."""

    info = AnalyzerInfo(name="cluster.distribution", consumes=["clusters"], produces=_PRODUCES)

    def run(self, inp: AnalyzerInput) -> AnalyzerResult:
        """Compute cluster-shape metrics and a size histogram from ``inp.artifacts``.

        Reads the ``"clusters"`` artifact (a mapping; only its values are used) and,
        optionally, ``"match_stats"["total_records"]`` for the record total. A
        cluster given as a dict contributes its ``"size"`` (or, failing that, the
        length of its ``"members"``); any other value is converted with ``int``.

        Returns:
            An empty result when there are no clusters; otherwise seven metrics
            plus a ``cluster_size_histogram`` table with buckets 1 / 2 / 3 / "4+".
        """
        clusters = inp.artifacts.get("clusters")
        if not clusters:
            return AnalyzerResult(metrics=[], tables=[])

        sizes = [
            int(c.get("size", len(c.get("members", []))) if isinstance(c, dict) else c)
            for c in clusters.values()
        ]
        count = len(clusters)

        # Prefer the engine's own record total; fall back to summed cluster sizes
        # when the stat is missing *or* explicitly None (int(None) would raise).
        stats = inp.artifacts.get("match_stats", {}) or {}
        total_records = stats.get("total_records")
        record_count = int(total_records) if total_records is not None else sum(sizes)

        # One pass over the sizes fills every histogram bucket; the singleton count
        # used by the ratio metric is simply the first bucket.
        buckets = [0, 0, 0, 0]  # sizes 1, 2, 3, 4+
        for size in sizes:
            if size >= 4:
                buckets[3] += 1
            elif size >= 1:
                buckets[size - 1] += 1
        singletons = buckets[0]

        metrics = [
            Metric(key="cluster.count", value=count, unit="clusters", direction="neutral"),
            Metric(key="cluster.record_count", value=record_count, unit="rows", direction="neutral"),
            Metric(
                key="cluster.singleton_ratio",
                value=(singletons / count) if count else 0.0,
                unit="ratio",
                direction="neutral",
            ),
            Metric(key="cluster.size_p50", value=agg.quantile(sizes, 0.5), unit="rows", direction="neutral"),
            Metric(key="cluster.size_p95", value=agg.quantile(sizes, 0.95), unit="rows", direction="neutral"),
            Metric(key="cluster.size_max", value=max(sizes) if sizes else 0, unit="rows", direction="neutral"),
            Metric(
                key="cluster.reduction_ratio",
                value=(1 - count / record_count) if record_count else 0.0,
                unit="ratio",
                direction="neutral",
            ),
        ]

        # Discrete size histogram, buckets 1 / 2 / 3 / "4+".
        table = AnalysisTable(
            name="cluster_size_histogram",
            columns=["size", "count"],
            rows=[[1, buckets[0]], [2, buckets[1]], [3, buckets[2]], ["4+", buckets[3]]],
        )

        return AnalyzerResult(metrics=metrics, tables=[table])
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""``frame.summary`` — generic frame metrics, zero suite deps, always available.
|
|
2
|
+
|
|
3
|
+
Emits the Appendix-A metric set: row/column counts, mean null ratio, exact-
|
|
4
|
+
duplicate-row ratio, estimated in-memory size, plus a ``per_column`` table
|
|
5
|
+
(column, dtype, null_ratio, n_unique).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from goldenanalysis.core import aggregate as agg
|
|
11
|
+
from goldenanalysis.models import (
|
|
12
|
+
AnalysisTable,
|
|
13
|
+
AnalyzerInfo,
|
|
14
|
+
AnalyzerInput,
|
|
15
|
+
AnalyzerResult,
|
|
16
|
+
Metric,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
# Metric keys declared as ``produces`` in the ``AnalyzerInfo`` of
# ``FrameSummaryAnalyzer``; its ``run`` emits one ``Metric`` per key.
_PRODUCES = [
    "frame.row_count",
    "frame.column_count",
    "frame.null_ratio_mean",
    "frame.duplicate_row_ratio",
    "frame.memory_bytes",
]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class FrameSummaryAnalyzer:
    """Shape, null mass, duplication and memory footprint of a raw frame."""

    info = AnalyzerInfo(name="frame.summary", consumes=["frame"], produces=_PRODUCES)

    def run(self, inp: AnalyzerInput) -> AnalyzerResult:
        """Summarize ``inp.frame`` into five metrics and a ``per_column`` table.

        Raises:
            ValueError: if ``inp.frame`` is None.
        """
        frame = inp.frame
        if frame is None:
            raise ValueError("frame.summary requires AnalyzerInput.frame (a polars DataFrame)")

        row_total = frame.height
        column_total = frame.width
        null_by_column = agg.null_ratio_per_column(frame)
        # Mean of the per-column null ratios; a frame with no columns reports 0.0.
        if column_total:
            mean_null_ratio = sum(null_by_column.values()) / column_total
        else:
            mean_null_ratio = 0.0
        duplicate_ratio = agg.duplicate_row_ratio(frame)
        memory_estimate = frame.estimated_size()

        metrics = [
            Metric(key="frame.row_count", value=row_total, unit="rows", direction="neutral"),
            Metric(key="frame.column_count", value=column_total, unit="columns", direction="neutral"),
            Metric(key="frame.null_ratio_mean", value=mean_null_ratio, unit="ratio", direction="lower_better"),
            Metric(key="frame.duplicate_row_ratio", value=duplicate_ratio, unit="ratio", direction="lower_better"),
            Metric(key="frame.memory_bytes", value=memory_estimate, unit="bytes", direction="neutral"),
        ]

        column_rows = []
        for column in frame.columns:
            series = frame[column]
            column_rows.append([column, str(series.dtype), null_by_column[column], series.n_unique()])
        per_column = AnalysisTable(
            name="per_column",
            columns=["column", "dtype", "null_ratio", "n_unique"],
            rows=column_rows,
        )

        return AnalyzerResult(metrics=metrics, tables=[per_column])
|