invarlock-0.3.6-py3-none-any.whl → invarlock-0.3.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. invarlock/__init__.py +4 -4
  2. invarlock/adapters/__init__.py +10 -14
  3. invarlock/adapters/auto.py +37 -50
  4. invarlock/adapters/capabilities.py +2 -2
  5. invarlock/adapters/hf_causal.py +418 -0
  6. invarlock/adapters/{hf_onnx.py → hf_causal_onnx.py} +3 -3
  7. invarlock/adapters/hf_loading.py +7 -7
  8. invarlock/adapters/hf_mixin.py +53 -9
  9. invarlock/adapters/{hf_bert.py → hf_mlm.py} +4 -11
  10. invarlock/adapters/{hf_t5.py → hf_seq2seq.py} +9 -9
  11. invarlock/assurance/__init__.py +15 -23
  12. invarlock/cli/adapter_auto.py +32 -26
  13. invarlock/cli/app.py +128 -27
  14. invarlock/cli/commands/__init__.py +2 -2
  15. invarlock/cli/commands/calibrate.py +48 -4
  16. invarlock/cli/commands/doctor.py +8 -10
  17. invarlock/cli/commands/evaluate.py +986 -0
  18. invarlock/cli/commands/explain_gates.py +25 -17
  19. invarlock/cli/commands/export_html.py +11 -9
  20. invarlock/cli/commands/plugins.py +13 -9
  21. invarlock/cli/commands/report.py +326 -92
  22. invarlock/cli/commands/run.py +1160 -228
  23. invarlock/cli/commands/verify.py +157 -97
  24. invarlock/cli/config.py +1 -1
  25. invarlock/cli/determinism.py +1 -1
  26. invarlock/cli/doctor_helpers.py +4 -5
  27. invarlock/cli/output.py +193 -0
  28. invarlock/cli/provenance.py +4 -4
  29. invarlock/core/bootstrap.py +1 -1
  30. invarlock/core/registry.py +9 -11
  31. invarlock/core/retry.py +14 -14
  32. invarlock/core/runner.py +112 -26
  33. invarlock/edits/noop.py +2 -2
  34. invarlock/edits/quant_rtn.py +67 -39
  35. invarlock/eval/__init__.py +1 -1
  36. invarlock/eval/bench.py +14 -10
  37. invarlock/eval/data.py +68 -23
  38. invarlock/eval/metrics.py +59 -1
  39. invarlock/eval/primary_metric.py +1 -1
  40. invarlock/eval/tasks/__init__.py +12 -0
  41. invarlock/eval/tasks/classification.py +48 -0
  42. invarlock/eval/tasks/qa.py +36 -0
  43. invarlock/eval/tasks/text_generation.py +102 -0
  44. invarlock/guards/invariants.py +19 -10
  45. invarlock/guards/rmt.py +2 -2
  46. invarlock/guards/spectral.py +1 -1
  47. invarlock/guards/variance.py +2 -2
  48. invarlock/model_profile.py +64 -62
  49. invarlock/observability/health.py +6 -6
  50. invarlock/observability/metrics.py +108 -0
  51. invarlock/plugins/hf_bnb_adapter.py +32 -21
  52. invarlock/reporting/__init__.py +18 -4
  53. invarlock/reporting/guards_analysis.py +154 -4
  54. invarlock/reporting/html.py +61 -11
  55. invarlock/reporting/normalizer.py +9 -2
  56. invarlock/reporting/policy_utils.py +1 -1
  57. invarlock/reporting/primary_metric_utils.py +11 -11
  58. invarlock/reporting/render.py +876 -510
  59. invarlock/reporting/report.py +72 -30
  60. invarlock/reporting/{certificate.py → report_builder.py} +252 -99
  61. invarlock/reporting/{certificate_schema.py → report_schema.py} +22 -22
  62. invarlock/reporting/report_types.py +6 -1
  63. invarlock/reporting/telemetry.py +86 -0
  64. invarlock-0.3.8.dist-info/METADATA +283 -0
  65. {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/RECORD +69 -64
  66. {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/WHEEL +1 -1
  67. {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/entry_points.txt +5 -3
  68. invarlock/adapters/hf_gpt2.py +0 -404
  69. invarlock/adapters/hf_llama.py +0 -487
  70. invarlock/cli/commands/certify.py +0 -422
  71. invarlock-0.3.6.dist-info/METADATA +0 -588
  72. {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/licenses/LICENSE +0 -0
  73. {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/top_level.txt +0 -0
invarlock/plugins/hf_bnb_adapter.py

@@ -7,7 +7,7 @@ through Transformers. Requires GPU for practical use.
 Install with the `gpu` extra on supported platforms.
 
 This adapter handles both:
-1. Fresh quantization of FP16 models (load_in_8bit/load_in_4bit)
+1. Fresh quantization of FP16 models (via quantization_config)
 2. Loading pre-quantized BNB checkpoints (auto-detected via quantization_config)
 """
 
@@ -55,16 +55,17 @@ def _detect_pre_quantized_bnb(model_id: str) -> tuple[bool, int]:
         if not quant_cfg:
             return False, 0
 
-        # Check for BNB quantization
-        quant_method = quant_cfg.get("quant_method", "").lower()
-        if quant_method == "bitsandbytes" or "load_in_8bit" in quant_cfg:
+        # Check for BNB quantization. Prefer explicit bits, then legacy flags.
+        quant_method = str(quant_cfg.get("quant_method", "")).lower()
+        if "bitsandbytes" in quant_method or "bnb" in quant_method:
+            bits = quant_cfg.get("bits")
+            if isinstance(bits, int) and bits in {4, 8}:
+                return True, bits
             if quant_cfg.get("load_in_8bit"):
                 return True, 8
             if quant_cfg.get("load_in_4bit"):
                 return True, 4
-            # Fallback to bits field
-            bits = quant_cfg.get("bits", 8)
-            return True, bits
+            return True, 8
 
     except Exception:
         pass
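
Note: a minimal sketch of the new detection order, runnable on its own; the sample configs below are hypothetical, not taken from any real checkpoint:

    def detect(quant_cfg: dict) -> tuple[bool, int]:
        # Mirrors the diff: an explicit "bits" field wins, then the legacy
        # load_in_* flags, then a default of 8-bit when only the method
        # name is present.
        quant_method = str(quant_cfg.get("quant_method", "")).lower()
        if "bitsandbytes" in quant_method or "bnb" in quant_method:
            bits = quant_cfg.get("bits")
            if isinstance(bits, int) and bits in {4, 8}:
                return True, bits
            if quant_cfg.get("load_in_8bit"):
                return True, 8
            if quant_cfg.get("load_in_4bit"):
                return True, 4
            return True, 8
        return False, 0

    assert detect({"quant_method": "bitsandbytes", "bits": 4, "load_in_8bit": True}) == (True, 4)
    assert detect({"quant_method": "bnb_nf4"}) == (True, 8)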
@@ -82,11 +83,17 @@ class HF_BNB_Adapter(HFAdapterMixin, ModelAdapter):
             "DEPENDENCY-MISSING: transformers",
             lambda e: {"dependency": "transformers"},
         ):
-            from transformers import AutoModelForCausalLM
+            from transformers import AutoModelForCausalLM, BitsAndBytesConfig
 
         # Check if this is a pre-quantized checkpoint
         is_pre_quantized, pre_quant_bits = _detect_pre_quantized_bnb(model_id)
 
+        if "load_in_8bit" in kwargs or "load_in_4bit" in kwargs:
+            raise ValueError(
+                "hf_bnb adapter: load_in_8bit/load_in_4bit are not supported. "
+                "Use model.quantization_config instead."
+            )
+
         if is_pre_quantized:
             # Load pre-quantized checkpoint WITHOUT re-applying quantization
             with wrap_errors(
@@ -99,20 +106,25 @@ class HF_BNB_Adapter(HFAdapterMixin, ModelAdapter):
                     model_id,
                     device_map="auto",
                     trust_remote_code=True,
-                    # Do NOT pass load_in_8bit/load_in_4bit for pre-quantized
-                    **{
-                        k: v
-                        for k, v in kwargs.items()
-                        if k not in ("load_in_8bit", "load_in_4bit")
-                    },
+                    **kwargs,
                 )
         else:
             # Fresh quantization of FP16 model
-            load_in_8bit = bool(kwargs.pop("load_in_8bit", True))
-            load_in_4bit = bool(kwargs.pop("load_in_4bit", False))
-
-            if load_in_4bit:
-                load_in_8bit = False
+            quantization_config = kwargs.pop("quantization_config", None)
+            if quantization_config is None:
+                quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+            elif isinstance(quantization_config, dict):
+                qdict = dict(quantization_config)
+                bits = qdict.pop("bits", None)
+                qdict.pop("quant_method", None)
+                if isinstance(bits, int):
+                    if bits == 4:
+                        qdict.setdefault("load_in_4bit", True)
+                        qdict.setdefault("load_in_8bit", False)
+                    elif bits == 8:
+                        qdict.setdefault("load_in_8bit", True)
+                        qdict.setdefault("load_in_4bit", False)
+                quantization_config = BitsAndBytesConfig(**qdict)
 
             with wrap_errors(
                 ModelLoadError,
@@ -123,9 +135,8 @@ class HF_BNB_Adapter(HFAdapterMixin, ModelAdapter):
                 model = AutoModelForCausalLM.from_pretrained(
                     model_id,
                     device_map="auto",
-                    load_in_8bit=load_in_8bit,
-                    load_in_4bit=load_in_4bit,
                     trust_remote_code=True,
+                    quantization_config=quantization_config,
                     **kwargs,
                 )
 
invarlock/reporting/__init__.py

@@ -1,7 +1,21 @@
 """
-Reporting package (aliasing evaluation reporting modules).
+Evaluation report tooling (`invarlock.reporting`).
 
-Provides a stable unified import path for report schema, builders,
-and certificate generation while keeping backward compatibility with
-`invarlock.eval.*` imports.
+Provides the evaluation report schema, builder, and renderers.
 """
+
+from __future__ import annotations
+
+from .html import render_report_html
+from .render import render_report_markdown
+from .report_builder import make_report, validate_report
+from .report_schema import REPORT_JSON_SCHEMA, REPORT_SCHEMA_VERSION
+
+__all__ = [
+    "REPORT_SCHEMA_VERSION",
+    "REPORT_JSON_SCHEMA",
+    "make_report",
+    "render_report_markdown",
+    "render_report_html",
+    "validate_report",
+]
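
Note: with these re-exports the package exposes one stable import surface; a usage sketch:

    from invarlock.reporting import (
        REPORT_SCHEMA_VERSION,
        make_report,
        render_report_html,
        render_report_markdown,
        validate_report,
    )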
invarlock/reporting/guards_analysis.py

@@ -23,7 +23,9 @@ def _measurement_contract_digest(contract: Any) -> str | None:
 
 
 @no_type_check
-def _extract_invariants(report: RunReport) -> dict[str, Any]:
+def _extract_invariants(
+    report: RunReport, baseline: RunReport | None = None
+) -> dict[str, Any]:
     """Extract invariant check results (matches the shape used in tests)."""
     invariants_data = (report.get("metrics", {}) or {}).get("invariants", {})
     failures: list[dict[str, Any]] = []
@@ -81,6 +83,108 @@ def _extract_invariants(report: RunReport) -> dict[str, Any]:
             guard_entry = guard
             break
 
+    baseline_guard_entry = None
+    if baseline is not None:
+        for guard in baseline.get("guards", []) or []:
+            if str(guard.get("name", "")).lower() == "invariants":
+                baseline_guard_entry = guard
+                break
+
+    def _coerce_checks(value: Any) -> dict[str, Any] | None:
+        return value if isinstance(value, dict) else None
+
+    def _extract_guard_checks(
+        entry: Any,
+    ) -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
+        if not isinstance(entry, dict):
+            return None, None
+        details = entry.get("details")
+        if not isinstance(details, dict):
+            return None, None
+        return _coerce_checks(details.get("baseline_checks")), _coerce_checks(
+            details.get("current_checks")
+        )
+
+    def _compare_invariants(
+        baseline_checks: dict[str, Any],
+        current_checks: dict[str, Any],
+    ) -> tuple[list[dict[str, Any]], int, int]:
+        violations: list[dict[str, Any]] = []
+
+        # LayerNorm coverage check
+        baseline_layer_norms = set(baseline_checks.get("layer_norm_paths", ()))
+        current_layer_norms = set(current_checks.get("layer_norm_paths", ()))
+        missing_layer_norms = sorted(baseline_layer_norms - current_layer_norms)
+        if missing_layer_norms:
+            violations.append(
+                {
+                    "type": "layer_norm_missing",
+                    "missing": missing_layer_norms,
+                    "message": "Expected LayerNorm modules are missing vs baseline",
+                }
+            )
+
+        # Tokenizer / vocab alignment
+        baseline_vocab_sizes = baseline_checks.get("embedding_vocab_sizes")
+        current_vocab_sizes = current_checks.get("embedding_vocab_sizes")
+        if isinstance(baseline_vocab_sizes, dict):
+            for module_name, baseline_size in baseline_vocab_sizes.items():
+                current_size = None
+                if isinstance(current_vocab_sizes, dict):
+                    current_size = current_vocab_sizes.get(module_name)
+                if current_size is None or int(current_size) != int(baseline_size):
+                    mismatch = {
+                        "module": module_name,
+                        "baseline": int(baseline_size),
+                        "current": None if current_size is None else int(current_size),
+                    }
+                    violations.append(
+                        {
+                            "type": "tokenizer_mismatch",
+                            "message": "Embedding vocabulary size changed vs baseline",
+                            **mismatch,
+                        }
+                    )
+
+        handled_keys = {
+            "layer_norm_paths",
+            "embedding_vocab_sizes",
+            "config_vocab_size",
+        }
+        for check_name, baseline_value in baseline_checks.items():
+            if check_name in handled_keys:
+                continue
+            current_value = current_checks.get(check_name)
+            if current_value != baseline_value:
+                violations.append(
+                    {
+                        "type": "invariant_violation",
+                        "check": check_name,
+                        "baseline": baseline_value,
+                        "current": current_value,
+                        "message": (
+                            f"Invariant {check_name} changed from {baseline_value} to {current_value}"
+                        ),
+                    }
+                )
+
+        fatal_violation_types = {"tokenizer_mismatch"}
+        fatal_count = 0
+        warning_count = 0
+        annotated: list[dict[str, Any]] = []
+        for violation in violations:
+            violation_type = str(violation.get("type") or "")
+            severity = "fatal" if violation_type in fatal_violation_types else "warning"
+            annotated_violation = dict(violation)
+            annotated_violation.setdefault("severity", severity)
+            annotated.append(annotated_violation)
+            if severity == "fatal":
+                fatal_count += 1
+            else:
+                warning_count += 1
+
+        return annotated, fatal_count, warning_count
+
     severity_status = "pass"
     if guard_entry:
         gm = guard_entry.get("metrics", {}) or {}
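
Note: a worked example of what _compare_invariants flags; the snapshot dicts are hypothetical, with shapes matching baseline_checks/current_checks above:

    baseline_checks = {
        "layer_norm_paths": ["h.0.ln_1", "h.0.ln_2"],
        "embedding_vocab_sizes": {"wte": 50257},
        "tied_embeddings": True,
    }
    current_checks = {
        "layer_norm_paths": ["h.0.ln_1"],         # ln_2 gone -> layer_norm_missing (warning)
        "embedding_vocab_sizes": {"wte": 50304},  # size drift -> tokenizer_mismatch (fatal)
        "tied_embeddings": False,                 # generic drift -> invariant_violation (warning)
    }
    # Expected: three annotated violations with fatal_count == 1 and
    # warning_count == 2; the lone fatal tokenizer_mismatch is what later
    # drives the overall status to "fail".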
@@ -108,9 +212,51 @@ def _extract_invariants(report: RunReport) -> dict[str, Any]:
             if detail:
                 row["detail"] = detail
             failures.append(row)
-    if fatal_count > 0:
+    base_fatal = 0
+    base_warn = 0
+    baseline_failures: list[dict[str, Any]] = []
+    if baseline_guard_entry is not None:
+        baseline_pre, baseline_post = _extract_guard_checks(baseline_guard_entry)
+        current_pre, current_post = _extract_guard_checks(guard_entry)
+        baseline_snapshot = baseline_pre or baseline_post
+        current_snapshot = current_post or current_pre
+        if isinstance(baseline_snapshot, dict) and isinstance(
+            current_snapshot, dict
+        ):
+            baseline_failures, base_fatal, base_warn = _compare_invariants(
+                baseline_snapshot, current_snapshot
+            )
+        for violation in baseline_failures:
+            check_name = violation.get("check")
+            if not check_name:
+                check_name = (
+                    violation.get("module")
+                    or violation.get("type")
+                    or "invariant"
+                )
+            row = {
+                "check": str(check_name),
+                "type": str(violation.get("type") or "violation"),
+                "severity": str(violation.get("severity") or "warning"),
+            }
+            detail = {k: v for k, v in violation.items() if k not in row}
+            if detail:
+                detail.setdefault("source", "baseline_compare")
+                row["detail"] = detail
+            failures.append(row)
+
+    fatal_total = fatal_count + base_fatal
+    warn_total = warning_count + base_warn
+    try:
+        summary["fatal_violations"] = fatal_total
+        summary["warning_violations"] = warn_total
+        summary["violations_found"] = fatal_total + warn_total
+    except Exception:
+        pass
+
+    if fatal_total > 0:
         severity_status = "fail"
-    elif warning_count > 0 or violations:
+    elif warn_total > 0 or violations:
         severity_status = "warn"
 
     # If any error-severity entry exists among failures, escalate to fail
@@ -130,12 +276,16 @@ def _extract_invariants(report: RunReport) -> dict[str, Any]:
         "warning_violations": len(failures),
     }
 
+    details_out = invariants_data
+    if not details_out and guard_entry and isinstance(guard_entry.get("details"), dict):
+        details_out = guard_entry.get("details", {})
+
     return {
         "pre": "pass",
         "post": status,
         "status": status,
         "summary": summary,
-        "details": invariants_data,
+        "details": details_out,
         "failures": failures,
     }
 
invarlock/reporting/html.py

@@ -1,5 +1,5 @@
 """
-Minimal HTML exporter for certificates.
+Minimal HTML exporter for reports.
 
 This implementation wraps the Markdown rendering in a simple HTML template so
 that the numbers and core content remain identical across formats.
@@ -10,23 +10,73 @@ from __future__ import annotations
 from html import escape
 from typing import Any
 
-from .render import render_certificate_markdown
+from .render import render_report_markdown
 
+markdown_module: Any | None = None
+try:
+    import markdown as _markdown  # type: ignore[import-untyped]
+except Exception:  # pragma: no cover - optional dependency
+    _markdown = None
+else:
+    markdown_module = _markdown
 
-def render_certificate_html(certificate: dict[str, Any]) -> str:
-    """Render a certificate as a simple HTML document.
 
-    Uses the Markdown renderer and embeds the content in a <pre> block to ensure
-    stable parity for snapshot tests without extra dependencies.
+_STATUS_BADGES = {
+    "\u2705 PASS": '<span class="badge pass">PASS</span>',
+    "\u2705 OK": '<span class="badge pass">OK</span>',
+    "\u274c FAIL": '<span class="badge fail">FAIL</span>',
+    "\u26a0\ufe0f WARN": '<span class="badge warn">WARN</span>',
+    "\u26a0 WARN": '<span class="badge warn">WARN</span>',
+}
+
+
+def _apply_status_badges(html_body: str) -> str:
+    updated = html_body
+    for token, replacement in _STATUS_BADGES.items():
+        updated = updated.replace(token, replacement)
+    return updated
+
+
+def render_report_html(evaluation_report: dict[str, Any]) -> str:
+    """Render an evaluation report as a simple HTML document.
+
+    Uses the Markdown renderer and converts to HTML when available, falling back
+    to a <pre> block when the markdown dependency is missing.
     """
-    md = render_certificate_markdown(certificate)
-    body = f'<pre class="invarlock-md">{escape(md)}</pre>'
+    md = render_report_markdown(evaluation_report)
+    if markdown_module is None:
+        body = f'<pre class="invarlock-md">{escape(md)}</pre>'
+    else:
+        html_body = markdown_module.markdown(md, extensions=["tables", "fenced_code"])
+        html_body = _apply_status_badges(html_body)
+        body = f'<div class="invarlock-md">{html_body}</div>'
     return (
         '<!DOCTYPE html><html><head><meta charset="utf-8">'
-        "<title>InvarLock Safety Certificate</title>"
-        "<style>body{font-family:ui-monospace,Menlo,monospace;white-space:pre-wrap}</style>"
+        "<title>InvarLock Evaluation Report</title>"
+        "<style>"
+        ":root{--pass:#2da44e;--fail:#cf222e;--warn:#bf8700;--ink:#1f2328;"
+        "--muted:#57606a;--panel:#f6f8fa;--border:#d0d7de}"
+        "body{font-family:ui-sans-serif,system-ui,-apple-system,Segoe UI,sans-serif;"
+        "color:var(--ink);background:linear-gradient(180deg,#fff, #f6f8fa);"
+        "margin:0;padding:32px}"
+        ".invarlock-md{max-width:960px;margin:0 auto;padding:24px;background:#fff;"
+        "border:1px solid var(--border);border-radius:16px;box-shadow:0 10px 30px rgba(0,0,0,0.05)}"
+        "h1,h2,h3{margin-top:1.4em}h1{margin-top:0}"
+        "table{border-collapse:collapse;width:100%;margin:12px 0}"
+        "th,td{border:1px solid var(--border);padding:6px 8px;text-align:left}"
+        "code,pre{background:var(--panel);border-radius:8px}"
+        "pre{padding:12px;overflow:auto}"
+        ".badge{display:inline-block;padding:2px 8px;border-radius:999px;"
+        "font-size:0.75rem;font-weight:700;letter-spacing:0.02em;color:#fff}"
+        ".badge.pass{background:var(--pass)}"
+        ".badge.fail{background:var(--fail)}"
+        ".badge.warn{background:var(--warn)}"
+        "@media print{body{background:#fff;padding:0}.invarlock-md{box-shadow:none;"
+        "border:0}a{color:inherit;text-decoration:none}.badge{color:#000;"
+        "border:1px solid #000;background:transparent}}"
+        "</style>"
        "</head><body>" + body + "</body></html>"
     )
 
 
-__all__ = ["render_certificate_html"]
+__all__ = ["render_report_html"]
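
Note: the optional `markdown` dependency is handled with a standard import-or-fallback pattern; a standalone sketch of the same idea:

    from html import escape
    from typing import Any

    markdown_module: Any | None = None
    try:
        import markdown as _markdown
    except Exception:
        _markdown = None
    else:
        markdown_module = _markdown

    def to_html(md_text: str) -> str:
        # Real HTML when python-markdown is installed, an escaped <pre>
        # block otherwise, so rendering never hard-fails on a missing extra.
        if markdown_module is None:
            return f'<pre class="invarlock-md">{escape(md_text)}</pre>'
        return markdown_module.markdown(md_text, extensions=["tables", "fenced_code"])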
invarlock/reporting/normalizer.py

@@ -33,7 +33,7 @@ def normalize_run_report(report: Mapping[str, Any] | RunReport) -> RunReport:
     """Coerce an arbitrary report-like mapping into a canonical RunReport.
 
     This is the single entry point for converting pre-canonical or loosely-typed
-    data into the strict PM-only RunReport shape used by certificate/report.
+    data into the strict PM-only RunReport shape used by evaluation report building.
     """
     src = _as_mapping(report)
 
@@ -53,8 +53,10 @@ def normalize_run_report(report: Mapping[str, Any] | RunReport) -> RunReport:
         "ts": ts,
         "auto": meta_in.get("auto") if isinstance(meta_in.get("auto"), dict) else None,
     }
-    # Preserve additional provenance knobs used by certificate/digests.
+    # Preserve additional provenance knobs used by evaluation report digests.
     for key in (
+        "pm_acceptance_range",
+        "pm_drift_band",
         "policy_overrides",
         "overrides",
         "plugins",
@@ -179,6 +181,11 @@ def normalize_run_report(report: Mapping[str, Any] | RunReport) -> RunReport:
         "latency_ms_p50",
         "latency_ms_p95",
         "memory_mb_peak",
+        "gpu_memory_mb_peak",
+        "gpu_memory_reserved_mb_peak",
+        "timings",
+        "guard_timings",
+        "memory_snapshots",
         "throughput_sps",
         "spectral",
         "rmt",
invarlock/reporting/policy_utils.py

@@ -35,7 +35,7 @@ def _compute_thresholds_payload(
     tier: str, resolved_policy: dict[str, Any]
 ) -> dict[str, Any]:
     """Build canonical thresholds payload for digest stability."""
-    from .certificate import TIER_RATIO_LIMITS  # local to avoid cycles
+    from .report_builder import TIER_RATIO_LIMITS  # local to avoid cycles
 
     tier_lc = (tier or "balanced").lower()
     metrics_policy = (
invarlock/reporting/primary_metric_utils.py

@@ -8,21 +8,21 @@ from .utils import _coerce_interval, _weighted_mean
 
 
 def attach_primary_metric(
-    certificate: dict[str, Any],
+    evaluation_report: dict[str, Any],
     report: dict[str, Any],
     baseline_raw: dict[str, Any] | None,
     baseline_ref: dict[str, Any] | None,
     ppl_analysis: dict[str, Any] | None,
 ) -> None:
-    """Attach/normalize the primary_metric block on the certificate.
+    """Attach/normalize the primary_metric block on the evaluation report.
 
-    Behavior mirrors historical logic in certificate.py and preserves structure:
+    Behavior mirrors historical logic in report_builder.py and preserves structure:
     - Prefer explicit metrics.primary_metric if present
     - Compute missing ratio_vs_baseline, degenerate display_ci
     - ppl window-based analysis info (mean logloss) added when available
     - Fallbacks for classification metrics and eval-window-derived ppl
     - Ensure display_ci always present for schema invariants
-    Mutates the certificate in-place.
+    Mutates the evaluation report in-place.
     """
     # Attach primary metric snapshot when provided in report
     try:
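
Note: a usage sketch of the renamed entry point; the dict shapes are illustrative stubs, not schema-complete reports:

    evaluation_report: dict = {}
    run_report = {"metrics": {"primary_metric": {"kind": "ppl", "value": 12.3}}}

    attach_primary_metric(
        evaluation_report,
        run_report,
        baseline_raw=None,
        baseline_ref=None,
        ppl_analysis=None,
    )
    # The report dict is mutated in place; per the docstring, a primary_metric
    # block with display_ci is ensured when one can be derived.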
@@ -180,12 +180,12 @@ def attach_primary_metric(
             ]
         except Exception:
             pass
-        certificate["primary_metric"] = pm_copy
+        evaluation_report["primary_metric"] = pm_copy
     except Exception:
         pass
 
     def _attach_from_windows() -> None:
-        if isinstance(certificate.get("primary_metric"), dict):
+        if isinstance(evaluation_report.get("primary_metric"), dict):
             return
         try:
             m = (
@@ -212,7 +212,7 @@ def attach_primary_metric(
                 baseline=baseline_raw if isinstance(baseline_raw, dict) else None,
             )
             if isinstance(pm_block, dict) and pm_block:
-                certificate["primary_metric"] = pm_block
+                evaluation_report["primary_metric"] = pm_block
         except Exception:
             pass
 
@@ -220,7 +220,7 @@ def attach_primary_metric(
     _attach_from_windows()
 
     # Minimal fallback for classification-only reports without explicit primary_metric
-    if not isinstance(certificate.get("primary_metric"), dict):
+    if not isinstance(evaluation_report.get("primary_metric"), dict):
         try:
             metrics_map = report.get("metrics", {}) if isinstance(report, dict) else {}
             clf = (
@@ -298,7 +298,7 @@ def attach_primary_metric(
                     acc_pm["ratio_vs_baseline"] = delta_pp
                 except Exception:
                     pass
-                certificate["primary_metric"] = acc_pm
+                evaluation_report["primary_metric"] = acc_pm
         except Exception:
             pass
 
@@ -308,8 +308,8 @@ def attach_primary_metric(
     # Ensure primary_metric has display_ci populated for schema invariants
     try:
         pm = (
-            certificate.get("primary_metric", {})
-            if isinstance(certificate.get("primary_metric"), dict)
+            evaluation_report.get("primary_metric", {})
+            if isinstance(evaluation_report.get("primary_metric"), dict)
             else None
         )
         if isinstance(pm, dict) and pm: