invarlock 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- invarlock/__init__.py +2 -2
- invarlock/_data/runtime/tiers.yaml +57 -30
- invarlock/adapters/__init__.py +11 -15
- invarlock/adapters/auto.py +35 -40
- invarlock/adapters/capabilities.py +2 -2
- invarlock/adapters/hf_causal.py +418 -0
- invarlock/adapters/{hf_onnx.py → hf_causal_onnx.py} +3 -3
- invarlock/adapters/hf_mixin.py +25 -4
- invarlock/adapters/{hf_bert.py → hf_mlm.py} +4 -11
- invarlock/adapters/{hf_t5.py → hf_seq2seq.py} +9 -9
- invarlock/calibration/spectral_null.py +15 -10
- invarlock/calibration/variance_ve.py +0 -2
- invarlock/cli/adapter_auto.py +31 -21
- invarlock/cli/app.py +73 -2
- invarlock/cli/commands/calibrate.py +6 -2
- invarlock/cli/commands/certify.py +651 -91
- invarlock/cli/commands/doctor.py +11 -11
- invarlock/cli/commands/explain_gates.py +57 -8
- invarlock/cli/commands/plugins.py +13 -9
- invarlock/cli/commands/report.py +233 -69
- invarlock/cli/commands/run.py +1066 -244
- invarlock/cli/commands/verify.py +154 -15
- invarlock/cli/config.py +22 -6
- invarlock/cli/doctor_helpers.py +4 -5
- invarlock/cli/output.py +193 -0
- invarlock/cli/provenance.py +1 -1
- invarlock/core/api.py +45 -5
- invarlock/core/auto_tuning.py +65 -20
- invarlock/core/bootstrap.py +1 -1
- invarlock/core/contracts.py +7 -1
- invarlock/core/registry.py +11 -13
- invarlock/core/runner.py +425 -75
- invarlock/edits/quant_rtn.py +65 -37
- invarlock/eval/bench.py +3 -16
- invarlock/eval/data.py +82 -51
- invarlock/eval/metrics.py +63 -2
- invarlock/eval/primary_metric.py +23 -0
- invarlock/eval/tail_stats.py +230 -0
- invarlock/eval/tasks/__init__.py +12 -0
- invarlock/eval/tasks/classification.py +48 -0
- invarlock/eval/tasks/qa.py +36 -0
- invarlock/eval/tasks/text_generation.py +102 -0
- invarlock/guards/_estimators.py +154 -0
- invarlock/guards/invariants.py +19 -10
- invarlock/guards/policies.py +16 -6
- invarlock/guards/rmt.py +627 -546
- invarlock/guards/spectral.py +348 -110
- invarlock/guards/tier_config.py +32 -30
- invarlock/guards/variance.py +7 -31
- invarlock/guards_ref/rmt_ref.py +23 -23
- invarlock/model_profile.py +90 -42
- invarlock/observability/health.py +6 -6
- invarlock/observability/metrics.py +108 -0
- invarlock/reporting/certificate.py +384 -55
- invarlock/reporting/certificate_schema.py +3 -2
- invarlock/reporting/dataset_hashing.py +15 -2
- invarlock/reporting/guards_analysis.py +350 -277
- invarlock/reporting/html.py +55 -5
- invarlock/reporting/normalizer.py +13 -0
- invarlock/reporting/policy_utils.py +38 -36
- invarlock/reporting/primary_metric_utils.py +71 -17
- invarlock/reporting/render.py +852 -431
- invarlock/reporting/report.py +40 -4
- invarlock/reporting/report_types.py +11 -3
- invarlock/reporting/telemetry.py +86 -0
- invarlock/reporting/validate.py +1 -18
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/METADATA +27 -13
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/RECORD +72 -65
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/WHEEL +1 -1
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/entry_points.txt +5 -3
- invarlock/adapters/hf_gpt2.py +0 -404
- invarlock/adapters/hf_llama.py +0 -487
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/licenses/LICENSE +0 -0
- {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/top_level.txt +0 -0
invarlock/reporting/html.py
CHANGED
|
@@ -12,19 +12,69 @@ from typing import Any
|
|
|
12
12
|
|
|
13
13
|
from .render import render_certificate_markdown
|
|
14
14
|
|
|
15
|
+
markdown_module: Any | None = None
|
|
16
|
+
try:
|
|
17
|
+
import markdown as _markdown # type: ignore[import-untyped]
|
|
18
|
+
except Exception: # pragma: no cover - optional dependency
|
|
19
|
+
_markdown = None
|
|
20
|
+
else:
|
|
21
|
+
markdown_module = _markdown
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
_STATUS_BADGES = {
|
|
25
|
+
"\u2705 PASS": '<span class="badge pass">PASS</span>',
|
|
26
|
+
"\u2705 OK": '<span class="badge pass">OK</span>',
|
|
27
|
+
"\u274c FAIL": '<span class="badge fail">FAIL</span>',
|
|
28
|
+
"\u26a0\ufe0f WARN": '<span class="badge warn">WARN</span>',
|
|
29
|
+
"\u26a0 WARN": '<span class="badge warn">WARN</span>',
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _apply_status_badges(html_body: str) -> str:
|
|
34
|
+
updated = html_body
|
|
35
|
+
for token, replacement in _STATUS_BADGES.items():
|
|
36
|
+
updated = updated.replace(token, replacement)
|
|
37
|
+
return updated
|
|
38
|
+
|
|
15
39
|
|
|
16
40
|
def render_certificate_html(certificate: dict[str, Any]) -> str:
|
|
17
41
|
"""Render a certificate as a simple HTML document.
|
|
18
42
|
|
|
19
|
-
Uses the Markdown renderer and
|
|
20
|
-
|
|
43
|
+
Uses the Markdown renderer and converts to HTML when available, falling back
|
|
44
|
+
to a <pre> block when the markdown dependency is missing.
|
|
21
45
|
"""
|
|
22
46
|
md = render_certificate_markdown(certificate)
|
|
23
|
-
|
|
47
|
+
if markdown_module is None:
|
|
48
|
+
body = f'<pre class="invarlock-md">{escape(md)}</pre>'
|
|
49
|
+
else:
|
|
50
|
+
html_body = markdown_module.markdown(md, extensions=["tables", "fenced_code"])
|
|
51
|
+
html_body = _apply_status_badges(html_body)
|
|
52
|
+
body = f'<div class="invarlock-md">{html_body}</div>'
|
|
24
53
|
return (
|
|
25
54
|
'<!DOCTYPE html><html><head><meta charset="utf-8">'
|
|
26
|
-
"<title>InvarLock
|
|
27
|
-
"<style>
|
|
55
|
+
"<title>InvarLock Evaluation Certificate</title>"
|
|
56
|
+
"<style>"
|
|
57
|
+
":root{--pass:#2da44e;--fail:#cf222e;--warn:#bf8700;--ink:#1f2328;"
|
|
58
|
+
"--muted:#57606a;--panel:#f6f8fa;--border:#d0d7de}"
|
|
59
|
+
"body{font-family:ui-sans-serif,system-ui,-apple-system,Segoe UI,sans-serif;"
|
|
60
|
+
"color:var(--ink);background:linear-gradient(180deg,#fff, #f6f8fa);"
|
|
61
|
+
"margin:0;padding:32px}"
|
|
62
|
+
".invarlock-md{max-width:960px;margin:0 auto;padding:24px;background:#fff;"
|
|
63
|
+
"border:1px solid var(--border);border-radius:16px;box-shadow:0 10px 30px rgba(0,0,0,0.05)}"
|
|
64
|
+
"h1,h2,h3{margin-top:1.4em}h1{margin-top:0}"
|
|
65
|
+
"table{border-collapse:collapse;width:100%;margin:12px 0}"
|
|
66
|
+
"th,td{border:1px solid var(--border);padding:6px 8px;text-align:left}"
|
|
67
|
+
"code,pre{background:var(--panel);border-radius:8px}"
|
|
68
|
+
"pre{padding:12px;overflow:auto}"
|
|
69
|
+
".badge{display:inline-block;padding:2px 8px;border-radius:999px;"
|
|
70
|
+
"font-size:0.75rem;font-weight:700;letter-spacing:0.02em;color:#fff}"
|
|
71
|
+
".badge.pass{background:var(--pass)}"
|
|
72
|
+
".badge.fail{background:var(--fail)}"
|
|
73
|
+
".badge.warn{background:var(--warn)}"
|
|
74
|
+
"@media print{body{background:#fff;padding:0}.invarlock-md{box-shadow:none;"
|
|
75
|
+
"border:0}a{color:inherit;text-decoration:none}.badge{color:#000;"
|
|
76
|
+
"border:1px solid #000;background:transparent}}"
|
|
77
|
+
"</style>"
|
|
28
78
|
"</head><body>" + body + "</body></html>"
|
|
29
79
|
)
|
|
30
80
|
|
|
@@ -55,6 +55,8 @@ def normalize_run_report(report: Mapping[str, Any] | RunReport) -> RunReport:
|
|
|
55
55
|
}
|
|
56
56
|
# Preserve additional provenance knobs used by certificate/digests.
|
|
57
57
|
for key in (
|
|
58
|
+
"pm_acceptance_range",
|
|
59
|
+
"pm_drift_band",
|
|
58
60
|
"policy_overrides",
|
|
59
61
|
"overrides",
|
|
60
62
|
"plugins",
|
|
@@ -179,10 +181,16 @@ def normalize_run_report(report: Mapping[str, Any] | RunReport) -> RunReport:
|
|
|
179
181
|
"latency_ms_p50",
|
|
180
182
|
"latency_ms_p95",
|
|
181
183
|
"memory_mb_peak",
|
|
184
|
+
"gpu_memory_mb_peak",
|
|
185
|
+
"gpu_memory_reserved_mb_peak",
|
|
186
|
+
"timings",
|
|
187
|
+
"guard_timings",
|
|
188
|
+
"memory_snapshots",
|
|
182
189
|
"throughput_sps",
|
|
183
190
|
"spectral",
|
|
184
191
|
"rmt",
|
|
185
192
|
"invariants",
|
|
193
|
+
"primary_metric_tail",
|
|
186
194
|
"logloss_delta_ci",
|
|
187
195
|
"bootstrap",
|
|
188
196
|
"reduction",
|
|
@@ -237,6 +245,11 @@ def normalize_run_report(report: Mapping[str, Any] | RunReport) -> RunReport:
|
|
|
237
245
|
flags=flags,
|
|
238
246
|
)
|
|
239
247
|
|
|
248
|
+
# keep context when provided (profile/assurance provenance)
|
|
249
|
+
ctx = src.get("context")
|
|
250
|
+
if isinstance(ctx, Mapping):
|
|
251
|
+
out["context"] = dict(ctx)
|
|
252
|
+
|
|
240
253
|
# keep evaluation_windows if provided (for deeper pairing-based features)
|
|
241
254
|
ew = src.get("evaluation_windows")
|
|
242
255
|
if isinstance(ew, dict):
|
|
@@ -48,6 +48,10 @@ def _compute_thresholds_payload(
|
|
|
48
48
|
if not isinstance(pm_policy, dict):
|
|
49
49
|
pm_policy = {}
|
|
50
50
|
|
|
51
|
+
pm_tail_policy = metrics_policy.get("pm_tail", {})
|
|
52
|
+
if not isinstance(pm_tail_policy, dict):
|
|
53
|
+
pm_tail_policy = {}
|
|
54
|
+
|
|
51
55
|
acc_policy = metrics_policy.get("accuracy", {})
|
|
52
56
|
if not isinstance(acc_policy, dict):
|
|
53
57
|
acc_policy = {}
|
|
@@ -76,6 +80,12 @@ def _compute_thresholds_payload(
|
|
|
76
80
|
resolved_policy.get("variance", {}) if isinstance(resolved_policy, dict) else {}
|
|
77
81
|
)
|
|
78
82
|
|
|
83
|
+
def _safe_float_any(value: Any, default: float) -> float:
|
|
84
|
+
try:
|
|
85
|
+
return float(value)
|
|
86
|
+
except Exception:
|
|
87
|
+
return float(default)
|
|
88
|
+
|
|
79
89
|
payload = {
|
|
80
90
|
"tier": tier_lc,
|
|
81
91
|
"pm_ratio": {
|
|
@@ -86,6 +96,22 @@ def _compute_thresholds_payload(
|
|
|
86
96
|
),
|
|
87
97
|
"hysteresis_ratio": float(pm_policy.get("hysteresis_ratio", 0.0) or 0.0),
|
|
88
98
|
},
|
|
99
|
+
"pm_tail": {
|
|
100
|
+
"mode": str(pm_tail_policy.get("mode", "warn") or "warn").strip().lower(),
|
|
101
|
+
"min_windows": int(pm_tail_policy.get("min_windows", 0) or 0),
|
|
102
|
+
"quantile": _safe_float_any(pm_tail_policy.get("quantile", 0.95), 0.95),
|
|
103
|
+
"quantile_max": (
|
|
104
|
+
float(pm_tail_policy.get("quantile_max"))
|
|
105
|
+
if isinstance(pm_tail_policy.get("quantile_max"), int | float)
|
|
106
|
+
else None
|
|
107
|
+
),
|
|
108
|
+
"epsilon": _safe_float_any(pm_tail_policy.get("epsilon", 0.0), 0.0),
|
|
109
|
+
"mass_max": (
|
|
110
|
+
float(pm_tail_policy.get("mass_max"))
|
|
111
|
+
if isinstance(pm_tail_policy.get("mass_max"), int | float)
|
|
112
|
+
else None
|
|
113
|
+
),
|
|
114
|
+
},
|
|
89
115
|
"accuracy": {
|
|
90
116
|
"delta_min_pp": float(acc_policy.get("delta_min_pp", -1.0) or -1.0),
|
|
91
117
|
"min_examples": int(acc_policy.get("min_examples", 200) or 200),
|
|
@@ -110,16 +136,6 @@ def _compute_thresholds_hash(payload: dict[str, Any]) -> str:
|
|
|
110
136
|
return hashlib.sha256(canonical.encode("utf-8")).hexdigest()[:16]
|
|
111
137
|
|
|
112
138
|
|
|
113
|
-
def _promote_legacy_multiple_testing_key(payload: dict[str, Any]) -> None:
|
|
114
|
-
"""Promote legacy 'multipletesting' to 'multiple_testing' in-place if present."""
|
|
115
|
-
try:
|
|
116
|
-
legacy_mt = payload.pop("multipletesting", None)
|
|
117
|
-
if legacy_mt is not None and "multiple_testing" not in payload:
|
|
118
|
-
payload["multiple_testing"] = legacy_mt
|
|
119
|
-
except Exception:
|
|
120
|
-
pass
|
|
121
|
-
|
|
122
|
-
|
|
123
139
|
def _resolve_policy_tier(report: RunReport) -> str:
|
|
124
140
|
"""Resolve the policy tier from report metadata or context."""
|
|
125
141
|
tier: Any = None
|
|
@@ -218,15 +234,9 @@ def _build_resolved_policies(
|
|
|
218
234
|
from .policy_utils import _format_family_caps as _ffc # self import safe
|
|
219
235
|
|
|
220
236
|
spectral_resolved["family_caps"] = _ffc(spectral_caps)
|
|
221
|
-
# Prefer observed policy sigma_quantile (accepting legacy aliases), then fallback
|
|
222
237
|
pol_sq = None
|
|
223
238
|
try:
|
|
224
239
|
pol_sq = (spectral.get("policy", {}) or {}).get("sigma_quantile")
|
|
225
|
-
if pol_sq is None:
|
|
226
|
-
# Legacy aliases
|
|
227
|
-
pol_sq = (spectral.get("policy", {}) or {}).get("contraction") or (
|
|
228
|
-
spectral.get("policy", {}) or {}
|
|
229
|
-
).get("kappa")
|
|
230
240
|
except Exception:
|
|
231
241
|
pol_sq = None
|
|
232
242
|
spectral_resolved["sigma_quantile"] = _safe_float(
|
|
@@ -276,6 +286,9 @@ def _build_resolved_policies(
|
|
|
276
286
|
spectral_resolved["max_spectral_norm"] = spectral.get("policy", {}).get(
|
|
277
287
|
"max_spectral_norm", spectral_resolved.get("max_spectral_norm")
|
|
278
288
|
)
|
|
289
|
+
mc = spectral.get("measurement_contract")
|
|
290
|
+
if isinstance(mc, dict) and mc:
|
|
291
|
+
spectral_resolved["measurement_contract"] = copy.deepcopy(mc)
|
|
279
292
|
resolved["spectral"] = spectral_resolved
|
|
280
293
|
|
|
281
294
|
# RMT guard
|
|
@@ -295,15 +308,16 @@ def _build_resolved_policies(
|
|
|
295
308
|
rmt_resolved["epsilon_default"] = _safe_float(epsilon_default_val, 0.1)
|
|
296
309
|
from .policy_utils import _format_epsilon_map as _fem
|
|
297
310
|
|
|
298
|
-
epsilon_map = _fem(
|
|
311
|
+
epsilon_map = _fem(
|
|
312
|
+
rmt.get("epsilon_by_family") or rmt_resolved.get("epsilon_by_family") or {}
|
|
313
|
+
)
|
|
299
314
|
if epsilon_map:
|
|
300
315
|
rmt_resolved["epsilon_by_family"] = epsilon_map
|
|
301
|
-
else:
|
|
302
|
-
rmt_resolved.pop("epsilon", None)
|
|
303
|
-
if "epsilon" in rmt_resolved:
|
|
304
|
-
rmt_resolved.pop("epsilon", None)
|
|
305
316
|
if "correct" in rmt_resolved:
|
|
306
317
|
rmt_resolved["correct"] = bool(rmt_resolved["correct"])
|
|
318
|
+
mc = rmt.get("measurement_contract")
|
|
319
|
+
if isinstance(mc, dict) and mc:
|
|
320
|
+
rmt_resolved["measurement_contract"] = copy.deepcopy(mc)
|
|
307
321
|
resolved["rmt"] = rmt_resolved
|
|
308
322
|
|
|
309
323
|
# Variance guard
|
|
@@ -441,13 +455,9 @@ def _extract_effective_policies(report: RunReport) -> dict[str, Any]:
|
|
|
441
455
|
elif guard_name == "spectral":
|
|
442
456
|
sigma_quantile = guard_metrics.get(
|
|
443
457
|
"sigma_quantile",
|
|
444
|
-
|
|
445
|
-
)
|
|
446
|
-
multiple_testing = guard_metrics.get("multiple_testing") or (
|
|
447
|
-
guard_metrics.get("multipletesting")
|
|
448
|
-
if isinstance(guard_metrics.get("multipletesting"), dict)
|
|
449
|
-
else None
|
|
458
|
+
0.95,
|
|
450
459
|
)
|
|
460
|
+
multiple_testing = guard_metrics.get("multiple_testing")
|
|
451
461
|
guard_policy = {
|
|
452
462
|
"max_spectral_norm": guard_metrics.get("max_spectral_norm"),
|
|
453
463
|
"stability_score": guard_metrics.get("stability_score", 0.95),
|
|
@@ -473,20 +483,13 @@ def _extract_effective_policies(report: RunReport) -> dict[str, Any]:
|
|
|
473
483
|
|
|
474
484
|
if guard_policy:
|
|
475
485
|
if guard_name == "spectral":
|
|
476
|
-
sigma_quantile = guard_policy.get("sigma_quantile")
|
|
477
|
-
if sigma_quantile is None:
|
|
478
|
-
sigma_quantile = guard_policy.get("contraction")
|
|
479
|
-
if sigma_quantile is None and "kappa" in guard_policy:
|
|
480
|
-
sigma_quantile = guard_policy["kappa"]
|
|
481
486
|
sanitized_policy = dict(guard_policy)
|
|
487
|
+
sigma_quantile = sanitized_policy.get("sigma_quantile")
|
|
482
488
|
if sigma_quantile is not None:
|
|
483
489
|
try:
|
|
484
490
|
sanitized_policy["sigma_quantile"] = float(sigma_quantile)
|
|
485
491
|
except (TypeError, ValueError):
|
|
486
492
|
pass
|
|
487
|
-
_promote_legacy_multiple_testing_key(sanitized_policy)
|
|
488
|
-
sanitized_policy.pop("contraction", None)
|
|
489
|
-
sanitized_policy.pop("kappa", None)
|
|
490
493
|
if sanitized_policy.get("max_spectral_norm") in (None, 0):
|
|
491
494
|
sanitized_policy["max_spectral_norm"] = None
|
|
492
495
|
guard_policy = sanitized_policy
|
|
@@ -587,7 +590,6 @@ __all__ = [
|
|
|
587
590
|
"_compute_variance_policy_digest",
|
|
588
591
|
"_compute_thresholds_payload",
|
|
589
592
|
"_compute_thresholds_hash",
|
|
590
|
-
"_promote_legacy_multiple_testing_key",
|
|
591
593
|
"_resolve_policy_tier",
|
|
592
594
|
"_build_resolved_policies",
|
|
593
595
|
"_extract_effective_policies",
|
|
@@ -30,6 +30,38 @@ def attach_primary_metric(
|
|
|
30
30
|
pm = m.get("primary_metric") if isinstance(m, dict) else None
|
|
31
31
|
if isinstance(pm, dict) and pm:
|
|
32
32
|
pm_copy = copy.deepcopy(pm)
|
|
33
|
+
pm_copy.setdefault("invalid", bool(pm_copy.get("invalid", False)))
|
|
34
|
+
degraded_reason = pm_copy.get("degraded_reason")
|
|
35
|
+
preview_val = pm_copy.get("preview")
|
|
36
|
+
final_val = pm_copy.get("final")
|
|
37
|
+
ratio_val = pm_copy.get("ratio_vs_baseline")
|
|
38
|
+
baseline_final = (
|
|
39
|
+
baseline_ref.get("primary_metric", {}).get("final")
|
|
40
|
+
if isinstance(baseline_ref, dict)
|
|
41
|
+
else None
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
def _is_finite(value: Any) -> bool:
|
|
45
|
+
return isinstance(value, (int, float)) and math.isfinite(float(value))
|
|
46
|
+
|
|
47
|
+
baseline_has_reference = _is_finite(baseline_final)
|
|
48
|
+
needs_pm_fallback = not (_is_finite(preview_val) and _is_finite(final_val))
|
|
49
|
+
needs_ratio_fallback = baseline_has_reference and not _is_finite(ratio_val)
|
|
50
|
+
|
|
51
|
+
if degraded_reason is None:
|
|
52
|
+
if needs_pm_fallback:
|
|
53
|
+
degraded_reason = "non_finite_pm"
|
|
54
|
+
elif needs_ratio_fallback:
|
|
55
|
+
degraded_reason = "non_finite_delta"
|
|
56
|
+
elif pm_copy.get("invalid"):
|
|
57
|
+
degraded_reason = "primary_metric_invalid"
|
|
58
|
+
|
|
59
|
+
pm_copy["degraded"] = bool(
|
|
60
|
+
pm_copy.get("degraded") or pm_copy.get("invalid") or degraded_reason
|
|
61
|
+
)
|
|
62
|
+
if pm_copy["degraded"] and degraded_reason:
|
|
63
|
+
pm_copy.setdefault("degraded_reason", degraded_reason)
|
|
64
|
+
|
|
33
65
|
# Propagate instability hint from ppl_analysis
|
|
34
66
|
try:
|
|
35
67
|
if isinstance(ppl_analysis, dict) and bool(
|
|
@@ -75,33 +107,52 @@ def attach_primary_metric(
|
|
|
75
107
|
pm_copy["analysis_point_final"] = float(mean_fin)
|
|
76
108
|
# Attach analysis-basis CIs for preview/final in log space from report metrics
|
|
77
109
|
try:
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
110
|
+
dlci_source: tuple[float, float] | list[float] | None = None
|
|
111
|
+
pairing_source = None
|
|
112
|
+
if isinstance(ppl_analysis, dict):
|
|
113
|
+
stats = ppl_analysis.get("stats") or {}
|
|
114
|
+
if isinstance(stats, dict):
|
|
115
|
+
pairing_source = stats.get("pairing")
|
|
116
|
+
if pairing_source == "paired_baseline":
|
|
117
|
+
dlci_source = _coerce_interval(
|
|
118
|
+
ppl_analysis.get("logloss_delta_ci")
|
|
119
|
+
)
|
|
120
|
+
if dlci_source is None:
|
|
121
|
+
dlci_source = (
|
|
122
|
+
_coerce_interval(m.get("logloss_delta_ci"))
|
|
123
|
+
if isinstance(m, dict)
|
|
124
|
+
else (math.nan, math.nan)
|
|
125
|
+
)
|
|
126
|
+
if (
|
|
127
|
+
isinstance(dlci_source, tuple | list)
|
|
128
|
+
and len(dlci_source) == 2
|
|
129
|
+
):
|
|
130
|
+
lo_raw, hi_raw = dlci_source[0], dlci_source[1]
|
|
131
|
+
if isinstance(lo_raw, (int, float)) and isinstance(
|
|
132
|
+
hi_raw, (int, float)
|
|
133
|
+
):
|
|
134
|
+
lo, hi = float(lo_raw), float(hi_raw)
|
|
135
|
+
if math.isfinite(lo) and math.isfinite(hi):
|
|
136
|
+
pm_copy.setdefault("ci", (lo, hi))
|
|
87
137
|
except Exception:
|
|
88
138
|
pass
|
|
89
139
|
except Exception:
|
|
90
140
|
pass
|
|
91
141
|
# Ensure ratio_vs_baseline present and consistent
|
|
92
142
|
try:
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
143
|
+
fin = pm_copy.get("final")
|
|
144
|
+
baseline_final_val = (
|
|
145
|
+
float(baseline_final)
|
|
146
|
+
if isinstance(baseline_final, (int, float))
|
|
147
|
+
and _is_finite(baseline_final)
|
|
96
148
|
else None
|
|
97
149
|
)
|
|
98
|
-
fin = pm_copy.get("final")
|
|
99
150
|
if (
|
|
100
|
-
isinstance(fin, int
|
|
101
|
-
and
|
|
102
|
-
and
|
|
151
|
+
isinstance(fin, (int, float))
|
|
152
|
+
and baseline_final_val is not None
|
|
153
|
+
and baseline_final_val > 0
|
|
103
154
|
):
|
|
104
|
-
pm_copy["ratio_vs_baseline"] = float(fin) /
|
|
155
|
+
pm_copy["ratio_vs_baseline"] = float(fin) / baseline_final_val
|
|
105
156
|
# Ensure display_ci aligns with log-space CI for ppl-like metrics
|
|
106
157
|
try:
|
|
107
158
|
kind = str(pm_copy.get("kind", "")).lower()
|
|
@@ -277,6 +328,9 @@ def attach_primary_metric(
|
|
|
277
328
|
if isinstance(point, float):
|
|
278
329
|
pm["display_ci"] = [point, point]
|
|
279
330
|
else:
|
|
331
|
+
# As last resort, emit a degenerate [1.0, 1.0] to satisfy schema invariants
|
|
280
332
|
pm["display_ci"] = [1.0, 1.0]
|
|
333
|
+
pm.setdefault("estimated", True)
|
|
334
|
+
|
|
281
335
|
except Exception:
|
|
282
336
|
pass
|