invarlock-0.3.7-py3-none-any.whl → invarlock-0.3.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. invarlock/__init__.py +3 -3
  2. invarlock/adapters/auto.py +2 -10
  3. invarlock/adapters/hf_loading.py +7 -7
  4. invarlock/adapters/hf_mixin.py +28 -5
  5. invarlock/assurance/__init__.py +15 -23
  6. invarlock/calibration/spectral_null.py +1 -1
  7. invarlock/cli/adapter_auto.py +1 -5
  8. invarlock/cli/app.py +57 -27
  9. invarlock/cli/commands/__init__.py +2 -2
  10. invarlock/cli/commands/calibrate.py +48 -4
  11. invarlock/cli/commands/{certify.py → evaluate.py} +69 -46
  12. invarlock/cli/commands/explain_gates.py +94 -51
  13. invarlock/cli/commands/export_html.py +11 -9
  14. invarlock/cli/commands/report.py +121 -47
  15. invarlock/cli/commands/run.py +274 -66
  16. invarlock/cli/commands/verify.py +84 -89
  17. invarlock/cli/determinism.py +1 -1
  18. invarlock/cli/provenance.py +3 -3
  19. invarlock/core/bootstrap.py +1 -1
  20. invarlock/core/retry.py +14 -14
  21. invarlock/core/runner.py +1 -1
  22. invarlock/edits/noop.py +2 -2
  23. invarlock/edits/quant_rtn.py +2 -2
  24. invarlock/eval/__init__.py +1 -1
  25. invarlock/eval/bench.py +11 -7
  26. invarlock/eval/primary_metric.py +1 -1
  27. invarlock/guards/spectral.py +2 -2
  28. invarlock/guards_ref/spectral_ref.py +1 -1
  29. invarlock/model_profile.py +16 -35
  30. invarlock/observability/health.py +38 -20
  31. invarlock/plugins/hf_bnb_adapter.py +32 -21
  32. invarlock/reporting/__init__.py +18 -4
  33. invarlock/reporting/html.py +7 -7
  34. invarlock/reporting/normalizer.py +2 -2
  35. invarlock/reporting/policy_utils.py +1 -1
  36. invarlock/reporting/primary_metric_utils.py +11 -11
  37. invarlock/reporting/render.py +126 -120
  38. invarlock/reporting/report.py +43 -37
  39. invarlock/reporting/{certificate.py → report_builder.py} +103 -99
  40. invarlock/reporting/{certificate_schema.py → report_schema.py} +22 -22
  41. invarlock-0.3.9.dist-info/METADATA +303 -0
  42. {invarlock-0.3.7.dist-info → invarlock-0.3.9.dist-info}/RECORD +46 -46
  43. {invarlock-0.3.7.dist-info → invarlock-0.3.9.dist-info}/WHEEL +1 -1
  44. invarlock-0.3.7.dist-info/METADATA +0 -602
  45. {invarlock-0.3.7.dist-info → invarlock-0.3.9.dist-info}/entry_points.txt +0 -0
  46. {invarlock-0.3.7.dist-info → invarlock-0.3.9.dist-info}/licenses/LICENSE +0 -0
  47. {invarlock-0.3.7.dist-info → invarlock-0.3.9.dist-info}/top_level.txt +0 -0
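The diff below covers file 39, invarlock/reporting/{certificate.py → report_builder.py}, which carries the central change of this release: the "certificate" API is renamed to the "report" API (make_certificate → make_report, validate_certificate → validate_report, and the CERTIFICATE_* schema constants → REPORT_*). As a hedged illustration only — the import paths follow the renames shown in this diff, but downstream code should verify them against the installed version — a caller supporting both versions might write:

```python
# Hypothetical downstream shim, not part of either wheel: prefer the 0.3.9
# "report" names and fall back to the 0.3.7 "certificate" names.
try:
    from invarlock.reporting.report_builder import (  # invarlock >= 0.3.9
        make_report,
        validate_report,
    )
except ImportError:  # invarlock <= 0.3.7
    from invarlock.reporting.certificate import (
        make_certificate as make_report,
        validate_certificate as validate_report,
    )
```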
@@ -1,16 +1,16 @@
 """
-InvarLock Evaluation Certificate Generation
-==========================================
+InvarLock Evaluation Report Generation
+=====================================
 
-Generate standardized evaluation certificates from RunReport and baseline
+Generate standardized evaluation reports from RunReport and baseline
 comparison.
-Certificates are standalone, portable verification artifacts that can be used
-for CI/CD gates and regulatory compliance.
+Evaluation reports are standalone, portable artifacts that record statistical
+gates and evidence for CI/CD checks and audits (not formal verification).
 """
 
 from __future__ import annotations
 
-## Core certificate generation and analysis orchestration lives here.
+## Core evaluation report building and analysis orchestration lives here.
 # mypy: ignore-errors
 import copy
 import hashlib
@@ -39,11 +39,7 @@ from invarlock.eval.primary_metric import compute_primary_metric_from_report, ge
 from invarlock.eval.tail_stats import evaluate_metric_tail
 from invarlock.utils.digest import hash_json
 
-from . import certificate_schema as _cert_schema
-from .certificate_schema import (
-    CERTIFICATE_JSON_SCHEMA,
-    CERTIFICATE_SCHEMA_VERSION,
-)
+from . import report_schema as _report_schema
 from .dataset_hashing import (
     _extract_dataset_info,
 )
@@ -53,10 +49,15 @@ from .guards_analysis import (
     _extract_spectral_analysis,
     _extract_variance_analysis,
 )
-from .report_types import RunReport, validate_report
+from .report_schema import (
+    REPORT_JSON_SCHEMA,
+    REPORT_SCHEMA_VERSION,
+)
+from .report_types import RunReport
+from .report_types import validate_report as validate_run_report
 
 # Expose compute_window_hash for tests that monkeypatch it
-# compute_window_hash used to be exposed via certificate; tests now patch
+# compute_window_hash used to be exposed via the evaluation report builder; tests now patch
 # dataset_hashing.compute_window_hash directly, so this import is no longer needed.
 from .utils import (
     _coerce_int,
@@ -79,6 +80,9 @@ TIER_RATIO_LIMITS: dict[str, float] = {
     "none": 1.10,
 }
 
+# Canonical preview→final drift band used when not explicitly configured.
+PM_DRIFT_BAND_DEFAULT: tuple[float, float] = (0.95, 1.05)
+
 
 def _is_ppl_kind(name: Any) -> bool:
     """Return True if a primary_metric kind denotes a ppl-like metric.
@@ -103,7 +107,7 @@ def _is_ppl_kind(name: Any) -> bool:
 
 
 ## NOTE: Deprecated helper `_get_ppl_final` was removed; callers should
-## use the normalized primary_metric block directly via make_certificate or
+## use the normalized primary_metric block directly via make_report or
 ## report processing utilities.
 
 
@@ -131,8 +135,8 @@ def _compute_edit_digest(report: dict) -> dict:
     return {"family": family, "impl_hash": impl_hash, "version": 1}
 
 
-def _compute_confidence_label(certificate: dict[str, Any]) -> dict[str, Any]:
-    """Compute certificate confidence label based on stability and CI width.
+def _compute_confidence_label(evaluation_report: dict[str, Any]) -> dict[str, Any]:
+    """Compute evaluation report confidence label based on stability and CI width.
 
     Heuristics:
     - High: ppl_acceptable=True, unstable=False, width <= 0.03 (ratio) or <= 1.0 pp for accuracy
@@ -140,7 +144,7 @@ def _compute_confidence_label(certificate: dict[str, Any]) -> dict[str, Any]:
     - Low: otherwise (floors unmet, failure, or missing bounds)
     Returns a dict with label, basis, width and threshold for transparency.
     """
-    validation = certificate.get("validation", {}) or {}
+    validation = evaluation_report.get("validation", {}) or {}
    pm_ok = bool(validation.get("primary_metric_acceptable", False))
    # Basis label shown in confidence block:
    # - For ppl-like metrics, use 'ppl_ratio' to reflect ratio width threshold
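The confidence label is non-gating metadata; per the docstring above, "High" requires an acceptable primary metric, no instability flag, and a tight display CI (width ≤ 0.03 for ratio-based metrics, ≤ 1.0 percentage point for accuracy). A standalone sketch of just the width test, assuming accuracy CIs are expressed in percentage points:

```python
# Sketch of the CI-width test behind the label. Assumptions: ppl-like CIs are
# ratio bounds; accuracy CIs are already in percentage points.
def ci_width_ok(kind: str, lo: float, hi: float) -> bool:
    width = hi - lo
    if kind.lower().startswith("acc"):
        return width <= 1.0   # accuracy: at most 1.0 pp wide
    return width <= 0.03      # ppl-like ratio: at most 3% wide

print(ci_width_ok("ppl", 0.99, 1.01))        # True: width 0.02
print(ci_width_ok("accuracy", 71.0, 73.5))   # False: 2.5 pp wide
```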
@@ -149,7 +153,7 @@ def _compute_confidence_label(certificate: dict[str, Any]) -> dict[str, Any]:
     basis = "primary_metric"
     lo = hi = float("nan")
     try:
-        pm = certificate.get("primary_metric", {}) or {}
+        pm = evaluation_report.get("primary_metric", {}) or {}
         kind = str(pm.get("kind", "") or "").lower()
         if isinstance(pm, dict) and pm and pm.get("display_ci"):
             dci = pm.get("display_ci")
@@ -170,7 +174,7 @@ def _compute_confidence_label(certificate: dict[str, Any]) -> dict[str, Any]:
     thr_ratio = 0.03  # 3% width for ratio
     thr_pp = 1.0  # 1.0 percentage point for accuracy kinds
     try:
-        pol = certificate.get("resolved_policy")
+        pol = evaluation_report.get("resolved_policy")
         if isinstance(pol, dict):
             conf_pol = pol.get("confidence")
             if isinstance(conf_pol, dict):
@@ -187,7 +191,7 @@ def _compute_confidence_label(certificate: dict[str, Any]) -> dict[str, Any]:
 
     # Unstable hint from primary metric (if provided)
     try:
-        unstable = bool((certificate.get("primary_metric") or {}).get("unstable"))
+        unstable = bool((evaluation_report.get("primary_metric") or {}).get("unstable"))
     except Exception:  # pragma: no cover
         unstable = False
 
@@ -213,39 +217,39 @@ def _compute_confidence_label(certificate: dict[str, Any]) -> dict[str, Any]:
     }
 
 
-# Minimal JSON Schema describing the canonical shape of a certificate.
+# Minimal JSON Schema describing the canonical shape of an evaluation report.
 # This focuses on structural validity; numerical thresholds are validated
 # separately in metric-specific logic.
-# JSON Schema is provided by certificate_schema; no duplication here.
+# JSON Schema is provided by report_schema; no duplication here.
 
 
 # Mirror jsonschema and structural validator for test monkeypatching compatibility.
-jsonschema = getattr(_cert_schema, "jsonschema", None)
+jsonschema = getattr(_report_schema, "jsonschema", None)
 
 
-def _validate_with_jsonschema(certificate: dict[str, Any]) -> bool:
+def _validate_with_jsonschema(evaluation_report: dict[str, Any]) -> bool:
     if jsonschema is None:
         return True
     try:
-        jsonschema.validate(instance=certificate, schema=CERTIFICATE_JSON_SCHEMA)
+        jsonschema.validate(instance=evaluation_report, schema=REPORT_JSON_SCHEMA)
         return True
     except Exception:  # pragma: no cover
         return False
 
 
-def validate_certificate(certificate: dict[str, Any]) -> bool:
-    """Validate that a certificate has all required fields and valid data."""
+def validate_report(evaluation_report: dict[str, Any]) -> bool:
+    """Validate that an evaluation report has all required fields and valid data."""
     try:
-        if certificate.get("schema_version") != CERTIFICATE_SCHEMA_VERSION:
+        if evaluation_report.get("schema_version") != REPORT_SCHEMA_VERSION:
             return False
         # Prefer JSON Schema structural validation; if unavailable or too strict,
         # fall back to a lenient minimal check used by unit tests.
-        if not _validate_with_jsonschema(certificate):
+        if not _validate_with_jsonschema(evaluation_report):
             # Minimal fallback: require schema version + run_id + primary_metric
-            run_id_ok = isinstance(certificate.get("run_id"), str) and bool(
-                certificate.get("run_id")
+            run_id_ok = isinstance(evaluation_report.get("run_id"), str) and bool(
+                evaluation_report.get("run_id")
             )
-            pm = certificate.get("primary_metric")
+            pm = evaluation_report.get("primary_metric")
             pm_ok = isinstance(pm, dict) and (
                 isinstance(pm.get("final"), int | float)
                 or (isinstance(pm.get("kind"), str) and bool(pm.get("kind")))
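validate_report prefers full JSON Schema validation and degrades to a minimal key check when jsonschema is unavailable or rejects the document. A self-contained sketch of the same pattern against a toy schema (TOY_SCHEMA is illustrative, not the package's REPORT_JSON_SCHEMA):

```python
# Validate-with-fallback pattern, sketched against a toy schema.
from typing import Any

try:
    import jsonschema  # optional dependency, as in the module above
except ImportError:
    jsonschema = None

TOY_SCHEMA = {"type": "object", "required": ["schema_version", "run_id", "primary_metric"]}

def validate_toy_report(doc: dict[str, Any]) -> bool:
    if jsonschema is not None:
        try:
            jsonschema.validate(instance=doc, schema=TOY_SCHEMA)
            return True
        except jsonschema.ValidationError:
            pass  # fall through to the lenient check
    # Lenient fallback mirroring the minimal check in the diff:
    return bool(doc.get("run_id")) and isinstance(doc.get("primary_metric"), dict)
```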
@@ -253,7 +257,7 @@ def validate_certificate(certificate: dict[str, Any]) -> bool:
             if not (run_id_ok and pm_ok):
                 return False
 
-        validation = certificate.get("validation", {})
+        validation = evaluation_report.get("validation", {})
         for flag in [
             "preview_final_drift_acceptable",
             "primary_metric_acceptable",
@@ -428,8 +432,8 @@ def _load_validation_allowlist() -> set[str]:
 # disallow unknown validation keys at schema level.
 try:
     _vkeys = _load_validation_allowlist()
-    if isinstance(CERTIFICATE_JSON_SCHEMA.get("properties"), dict):
-        vspec = CERTIFICATE_JSON_SCHEMA["properties"].get("validation")
+    if isinstance(REPORT_JSON_SCHEMA.get("properties"), dict):
+        vspec = REPORT_JSON_SCHEMA["properties"].get("validation")
         if isinstance(vspec, dict):
             vspec["properties"] = {k: {"type": "boolean"} for k in _vkeys}
             vspec["additionalProperties"] = False
@@ -446,7 +450,7 @@ except Exception: # pragma: no cover
 def _normalize_and_validate_report(report: RunReport | dict[str, Any]) -> RunReport:
     """Normalize a possibly-minimal report and validate its structure.
 
-    Uses the local normalizer when available, then checks `validate_report`.
+    Uses the local normalizer when available, then checks `validate_run_report`.
     Raises ValueError on invalid input. Returns the normalized RunReport.
     """
     try:
@@ -456,13 +460,13 @@ def _normalize_and_validate_report(report: RunReport | dict[str, Any]) -> RunRep
         report = _norm(report)
     except Exception:  # pragma: no cover
         pass
-    if not validate_report(report):
+    if not validate_run_report(report):
         raise ValueError("Invalid RunReport structure")
     return report
 
 
-def _extract_certificate_meta(report: RunReport) -> dict[str, Any]:
-    """Extract the certificate metadata block with a full seed bundle."""
+def _extract_report_meta(report: RunReport) -> dict[str, Any]:
+    """Extract the evaluation report metadata block with a full seed bundle."""
     meta_section = (
         report.get("meta", {}) if isinstance(report.get("meta"), dict) else {}
     )
@@ -739,22 +743,22 @@ def _fallback_paired_windows(
     return paired_windows
 
 
-def make_certificate(
+def make_report(
     report: RunReport,
     baseline: RunReport | dict[str, Any],
 ) -> dict[str, Any]:
     """
-    Generate an evaluation certificate from a RunReport and baseline comparison.
+    Generate an evaluation report from a RunReport and baseline comparison.
 
-    The certificate is a standalone, portable artifact that contains all
-    essential metrics and comparisons needed for safety verification.
+    The evaluation report is a standalone, portable artifact that contains all
+    essential paired metrics and comparisons used by InvarLock gates.
 
     Args:
-        report: The guarded run report to certify
+        report: The guarded run report to evaluate
         baseline: Step-0 baseline RunReport or baseline metrics dict
 
     Returns:
-        Certificate dictionary with all required fields
+        Evaluation report dictionary with all required fields
 
     Raises:
         ValueError: If inputs are invalid or required data is missing
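Per the signature and docstring above, a usage sketch — guarded_run and baseline_run are placeholders for RunReport dicts produced by an invarlock run, and the field names are the ones visible in this diff:

```python
# Hypothetical caller of the renamed builder.
from invarlock.reporting.report_builder import make_report, validate_report

evaluation_report = make_report(report=guarded_run, baseline=baseline_run)
assert validate_report(evaluation_report)

gates = evaluation_report.get("validation", {})
if not gates.get("primary_metric_acceptable", False):
    raise SystemExit("primary metric gate failed")
```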
@@ -778,11 +782,11 @@ def make_certificate(
         baseline_report = None
 
     # Extract core metadata with full seed bundle
-    meta = _extract_certificate_meta(report)
+    meta = _extract_report_meta(report)
 
     # Propagate environment flags captured in the RunReport (e.g., deterministic algos,
     # TF32 controls, MPS/CUDA availability). This is useful for auditability and
-    # reproducibility of certification runs.
+    # reproducibility of evaluation runs.
     try:
         env_flags = (
             report.get("meta", {}).get("env_flags")
@@ -1602,7 +1606,7 @@ def make_certificate(
     if device_name:
         telemetry.setdefault("device", device_name)
 
-    # Build the certificate
+    # Build the evaluation report
     window_capacity_ctx = (
         report.get("metrics", {}).get("window_capacity")
         if isinstance(report.get("metrics"), dict)
@@ -1920,8 +1924,8 @@ def make_certificate(
         k: bool(v) for k, v in validation_flags.items() if k in _allowed_validation
     }
 
-    certificate = {
-        "schema_version": CERTIFICATE_SCHEMA_VERSION,
+    evaluation_report = {
+        "schema_version": REPORT_SCHEMA_VERSION,
         "run_id": current_run_id,
         "meta": meta,
         "auto": auto,
@@ -1964,8 +1968,8 @@ def make_certificate(
         _tiny_relax_env = False
     if _tiny_relax_env:
         try:
-            certificate.setdefault("auto", {})["tiny_relax"] = True
-            prov = certificate.setdefault("provenance", {})
+            evaluation_report.setdefault("auto", {})["tiny_relax"] = True
+            prov = evaluation_report.setdefault("provenance", {})
             flags = prov.setdefault("flags", [])
             if "tiny_relax" not in flags:
                 flags.append("tiny_relax")
@@ -1991,12 +1995,12 @@ def make_certificate(
             and "value" in qo
             and math.isfinite(float(qo.get("value", float("nan"))))
         ):
-            certificate["quality_overhead"] = qo
+            evaluation_report["quality_overhead"] = qo
     except Exception:  # pragma: no cover
         pass
 
     try:
-        _propagate_pairing_stats(certificate, ppl_analysis)
+        _propagate_pairing_stats(evaluation_report, ppl_analysis)
     except Exception:  # pragma: no cover
         pass
 
@@ -2057,7 +2061,7 @@ def make_certificate(
         (resolved_policy.get("variance") or {}).get("min_effect_lognll", 0.0) or 0.0
     )
 
-    certificate["policy_digest"] = {
+    evaluation_report["policy_digest"] = {
         "policy_version": POLICY_VERSION,
         "tier_policy_name": cur_tier,
         "thresholds_hash": thresholds_hash,
@@ -2088,7 +2092,7 @@ def make_certificate(
                     payload[key] = item[key]
             sanitized.append(payload)
         if sanitized:
-            certificate["secondary_metrics"] = sanitized
+            evaluation_report["secondary_metrics"] = sanitized
     except Exception:  # pragma: no cover
         pass
 
@@ -2136,7 +2140,7 @@ def make_certificate(
             except Exception:  # pragma: no cover
                 continue
         if out:
-            certificate["classification"] = {"subgroups": out}
+            evaluation_report["classification"] = {"subgroups": out}
     except Exception:  # pragma: no cover
         pass
 
@@ -2152,7 +2156,7 @@ def make_certificate(
             if isinstance(container.get("metrics"), dict)
             else {}
         )
-        # Edited report case: also check certificate telemetry keys
+        # Edited report case: also check evaluation_report telemetry keys
         telem = telemetry if isinstance(telemetry, dict) else {}
         # Prefer explicit p50/p95 throughput keys if present
         for key in ("latency_ms_p50", "latency_ms_p95", "throughput_sps"):
@@ -2193,24 +2197,24 @@ def make_certificate(
                 entry["ratio"] = float("nan")
             system_overhead[metric_key] = entry
         if system_overhead:
-            certificate["system_overhead"] = system_overhead
+            evaluation_report["system_overhead"] = system_overhead
     except Exception:  # pragma: no cover
         pass
 
     # Attach/normalize primary metric block (moved to helper)
     from .primary_metric_utils import attach_primary_metric as _attach_pm
 
-    _attach_pm(certificate, report, baseline_raw, baseline_ref, ppl_analysis)
+    _attach_pm(evaluation_report, report, baseline_raw, baseline_ref, ppl_analysis)
     try:
         if isinstance(pm_drift_band, dict) and pm_drift_band:
-            pm_block = certificate.get("primary_metric")
+            pm_block = evaluation_report.get("primary_metric")
             if isinstance(pm_block, dict):
                 pm_block.setdefault("drift_band", dict(pm_drift_band))
     except Exception:  # pragma: no cover
         pass
     _enforce_display_ci_alignment(
         ratio_ci_source,
-        certificate.get("primary_metric"),
+        evaluation_report.get("primary_metric"),
         logloss_delta_ci,
         window_plan_profile,
     )
@@ -2218,8 +2222,8 @@ def make_certificate(
     # Ensure primary_metric has display_ci populated for schema invariants
     try:
         pm = (
-            certificate.get("primary_metric", {})
-            if isinstance(certificate.get("primary_metric"), dict)
+            evaluation_report.get("primary_metric", {})
+            if isinstance(evaluation_report.get("primary_metric"), dict)
             else None
         )
         if isinstance(pm, dict) and pm:
@@ -2259,8 +2263,8 @@ def make_certificate(
         if not kind:
             kind = "ppl"
         windows_cfg = (
-            certificate.get("dataset", {}).get("windows", {})
-            if isinstance(certificate.get("dataset"), dict)
+            evaluation_report.get("dataset", {}).get("windows", {})
+            if isinstance(evaluation_report.get("dataset"), dict)
             else {}
         )
         n_prev = windows_cfg.get("preview")
@@ -2268,7 +2272,7 @@ def make_certificate(
         tokens_total = None
         try:
             tokens_total = (
-                certificate.get("dataset", {}).get("hash", {}).get("total_tokens")
+                evaluation_report.get("dataset", {}).get("hash", {}).get("total_tokens")
             )
         except Exception:  # pragma: no cover
             tokens_total = None
@@ -2276,7 +2280,7 @@ def make_certificate(
         ci_lo = None
         ci_hi = None
         ratio = None
-        pmc = certificate.get("primary_metric", {})
+        pmc = evaluation_report.get("primary_metric", {})
         rci = pmc.get("display_ci") or pmc.get("ci")
         if isinstance(rci, tuple | list) and len(rci) == 2:
             ci_lo, ci_hi = rci[0], rci[1]
@@ -2288,7 +2292,7 @@ def make_certificate(
         except Exception:  # pragma: no cover
             ci_w = None
         # Gate outcome
-        val = certificate.get("validation", {})
+        val = evaluation_report.get("validation", {})
         gate_ok = None
         try:
             gate_ok = bool(val.get("primary_metric_acceptable"))
@@ -2303,10 +2307,10 @@ def make_certificate(
             f"tokens={tokens_total}",
         ]
        try:
-            split = (certificate.get("provenance", {}) or {}).get("dataset_split")
+            split = (evaluation_report.get("provenance", {}) or {}).get("dataset_split")
            if not split:
                split = (report.get("provenance", {}) or {}).get("dataset_split")
-            sf = (certificate.get("provenance", {}) or {}).get("split_fallback")
+            sf = (evaluation_report.get("provenance", {}) or {}).get("split_fallback")
            if sf is None:
                sf = (report.get("provenance", {}) or {}).get("split_fallback")
            if split:
@@ -2322,7 +2326,7 @@ def make_certificate(
         if isinstance(gate_ok, bool):
             parts.append(f"gate={'pass' if gate_ok else 'fail'}")
         summary_line = "INVARLOCK_TELEMETRY " + " ".join(parts)
-        certificate.setdefault("telemetry", {})["summary_line"] = summary_line
+        evaluation_report.setdefault("telemetry", {})["summary_line"] = summary_line
         if str(os.environ.get("INVARLOCK_TELEMETRY", "")).strip().lower() in {
             "1",
             "true",
@@ -2335,17 +2339,17 @@ def make_certificate(
 
     # Attach confidence label (non-gating)
     try:
-        certificate["confidence"] = _compute_confidence_label(certificate)
+        evaluation_report["confidence"] = _compute_confidence_label(evaluation_report)
     except Exception:  # pragma: no cover
         pass
 
-    return certificate
+    return evaluation_report
 
 
 # Console Validation Block helpers have moved to invarlock.reporting.render.
 
 
-## NOTE: render_certificate_markdown has been moved to invarlock.reporting.render.
+## NOTE: render_report_markdown has been moved to invarlock.reporting.render.
 ## It is re-exported at the bottom of this module to preserve the public API.
 ## Private helper functions
 
@@ -2623,7 +2627,7 @@ def _extract_structural_deltas(report: RunReport) -> dict[str, Any]:
 def _extract_edit_metadata(
     report: RunReport, plugin_provenance: dict[str, Any]
 ) -> dict[str, Any]:
-    """Extract edit-level provenance and configuration metadata for the certificate."""
+    """Extract edit-level provenance and configuration metadata for the evaluation report."""
 
     edit_section = _get_mapping(report, "edit")
     if not edit_section:
@@ -3020,12 +3024,12 @@ def _compute_quality_overhead_from_guard(
 
 
 def _propagate_pairing_stats(
-    certificate: dict[str, Any], ppl_analysis: dict[str, Any] | None
+    evaluation_report: dict[str, Any], ppl_analysis: dict[str, Any] | None
 ) -> None:
-    """Surface pairing statistics inside certificate.dataset.windows.stats."""
-    if not isinstance(certificate, dict):
+    """Surface pairing statistics inside evaluation_report.dataset.windows.stats."""
+    if not isinstance(evaluation_report, dict):
         return
-    ds = certificate.get("dataset", {})
+    ds = evaluation_report.get("dataset", {})
     if not isinstance(ds, dict):
         return
     windows = ds.get("windows", {})
@@ -3079,7 +3083,7 @@ def _propagate_pairing_stats(
     windows["stats"] = stats
     if windows is not ds.get("windows"):
         ds["windows"] = windows
-    certificate["dataset"] = ds
+    evaluation_report["dataset"] = ds
 
 
 def _build_provenance_block(
@@ -3262,8 +3266,7 @@ def _resolve_pm_drift_band_from_report(
 ) -> dict[str, float]:
     """Resolve preview→final drift band from report context/meta/env."""
 
-    base_min = 0.95
-    base_max = 1.05
+    base_min, base_max = PM_DRIFT_BAND_DEFAULT
 
     def _safe_float(val: Any) -> float | None:
         try:
@@ -3372,7 +3375,7 @@
     pm_drift_band: dict[str, float] | None = None,
     pm_tail: dict[str, Any] | None = None,
 ) -> dict[str, bool]:
-    """Compute validation flags for the certificate including canonical gates."""
+    """Compute validation flags for the evaluation report including canonical gates."""
     tier = (tier or "balanced").lower()
     # Dev-only tiny relax: widen gates and lower floors when explicitly requested
     import os as _os
@@ -3435,8 +3438,7 @@
     # Canonical Gates
     # 1. Drift gate: by default 0.95 ≤ final/preview ≤ 1.05 (configurable)
     drift_ratio = ppl.get("preview_final_ratio", 1.0)
-    drift_min = 0.95
-    drift_max = 1.05
+    drift_min, drift_max = PM_DRIFT_BAND_DEFAULT
     if isinstance(pm_drift_band, dict):
         try:
             cand_min = pm_drift_band.get("min")
@@ -3613,7 +3615,7 @@
     if _tiny_relax and threshold_val < 0.10:
         threshold_val = 0.10
     if not math.isfinite(ratio_val):
-        # In dev/Compare-&-Certify flows we often lack a bare run; treat missing metric as pass
+        # In dev/Compare-&-Evaluate flows we often lack a bare run; treat missing metric as pass
         guard_overhead_pass = True
     else:
         guard_overhead_pass = ratio_val <= (1.0 + max(0.0, threshold_val))
@@ -3769,7 +3771,7 @@ def _generate_run_id(report: RunReport) -> str:
     return hashlib.sha256(base_str.encode()).hexdigest()[:16]
 
 
-## NOTE: _compute_certificate_hash moved to invarlock.reporting.render and is re-exported below.
+## NOTE: _compute_report_hash moved to invarlock.reporting.render and is re-exported below.
 
 
 def _analyze_bitwidth_map(bitwidth_map: dict[str, Any]) -> dict[str, Any]:
@@ -4114,22 +4116,24 @@ def _extract_compression_diagnostics(
 
 # Re-export rendering API from dedicated module to avoid bloat/cycles
 # Rendering helpers live in invarlock.reporting.render; internal code should import there directly.
-# Tests and public API expect render_certificate_markdown to be available from
-# invarlock.reporting.certificate. Import lazily at module end to avoid cycles with
+# Tests and public API expect render_report_markdown to be available from
+# invarlock.reporting.report_builder. Import lazily at module end to avoid cycles with
 # invarlock.reporting.render which imports this module as a namespace.
 try:  # pragma: no cover - simple re-export
     from .render import (
         compute_console_validation_block,  # type: ignore
-        render_certificate_markdown,  # type: ignore
+        render_report_markdown,  # type: ignore
     )
 except Exception:  # pragma: no cover - defensive fallback
 
-    def render_certificate_markdown(certificate: dict[str, Any]) -> str:  # type: ignore
+    def render_report_markdown(evaluation_report: dict[str, Any]) -> str:  # type: ignore
         raise ImportError(
-            "render_certificate_markdown is unavailable; rendering dependencies missing"
+            "render_report_markdown is unavailable; rendering dependencies missing"
         )
 
-    def compute_console_validation_block(certificate: dict[str, Any]) -> dict[str, Any]:  # type: ignore
+    def compute_console_validation_block(
+        evaluation_report: dict[str, Any],
+    ) -> dict[str, Any]:  # type: ignore
         raise ImportError(
             "compute_console_validation_block is unavailable; rendering dependencies missing"
         )
@@ -4137,12 +4141,12 @@ except Exception: # pragma: no cover - defensive fallback
 
 # Export public API
 __all__ = [
-    "make_certificate",
-    "validate_certificate",
+    "make_report",
+    "validate_report",
     "_validate_with_jsonschema",
     "jsonschema",
-    "render_certificate_markdown",
+    "render_report_markdown",
     "compute_console_validation_block",
-    "CERTIFICATE_SCHEMA_VERSION",
-    "CERTIFICATE_JSON_SCHEMA",
+    "REPORT_SCHEMA_VERSION",
+    "REPORT_JSON_SCHEMA",
 ]
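Since make_report embeds REPORT_SCHEMA_VERSION in the artifact (the constant is exported above; its value lives in report_schema.py and is not shown in this diff), a consumer can pin the schema before trusting the gates. A hedged sketch, with a hypothetical artifact path:

```python
# Consumer-side schema pinning (sketch; "evaluation_report.json" is a
# hypothetical artifact path, not a file the package writes by default).
import json

from invarlock.reporting.report_builder import REPORT_SCHEMA_VERSION, validate_report

with open("evaluation_report.json") as fh:
    artifact = json.load(fh)

if artifact.get("schema_version") != REPORT_SCHEMA_VERSION or not validate_report(artifact):
    raise SystemExit("unrecognized or invalid evaluation report artifact")
```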