invarlock-0.3.6-py3-none-any.whl → invarlock-0.3.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. invarlock/__init__.py +4 -4
  2. invarlock/adapters/__init__.py +10 -14
  3. invarlock/adapters/auto.py +37 -50
  4. invarlock/adapters/capabilities.py +2 -2
  5. invarlock/adapters/hf_causal.py +418 -0
  6. invarlock/adapters/{hf_onnx.py → hf_causal_onnx.py} +3 -3
  7. invarlock/adapters/hf_loading.py +7 -7
  8. invarlock/adapters/hf_mixin.py +53 -9
  9. invarlock/adapters/{hf_bert.py → hf_mlm.py} +4 -11
  10. invarlock/adapters/{hf_t5.py → hf_seq2seq.py} +9 -9
  11. invarlock/assurance/__init__.py +15 -23
  12. invarlock/cli/adapter_auto.py +32 -26
  13. invarlock/cli/app.py +128 -27
  14. invarlock/cli/commands/__init__.py +2 -2
  15. invarlock/cli/commands/calibrate.py +48 -4
  16. invarlock/cli/commands/doctor.py +8 -10
  17. invarlock/cli/commands/evaluate.py +986 -0
  18. invarlock/cli/commands/explain_gates.py +25 -17
  19. invarlock/cli/commands/export_html.py +11 -9
  20. invarlock/cli/commands/plugins.py +13 -9
  21. invarlock/cli/commands/report.py +326 -92
  22. invarlock/cli/commands/run.py +1160 -228
  23. invarlock/cli/commands/verify.py +157 -97
  24. invarlock/cli/config.py +1 -1
  25. invarlock/cli/determinism.py +1 -1
  26. invarlock/cli/doctor_helpers.py +4 -5
  27. invarlock/cli/output.py +193 -0
  28. invarlock/cli/provenance.py +4 -4
  29. invarlock/core/bootstrap.py +1 -1
  30. invarlock/core/registry.py +9 -11
  31. invarlock/core/retry.py +14 -14
  32. invarlock/core/runner.py +112 -26
  33. invarlock/edits/noop.py +2 -2
  34. invarlock/edits/quant_rtn.py +67 -39
  35. invarlock/eval/__init__.py +1 -1
  36. invarlock/eval/bench.py +14 -10
  37. invarlock/eval/data.py +68 -23
  38. invarlock/eval/metrics.py +59 -1
  39. invarlock/eval/primary_metric.py +1 -1
  40. invarlock/eval/tasks/__init__.py +12 -0
  41. invarlock/eval/tasks/classification.py +48 -0
  42. invarlock/eval/tasks/qa.py +36 -0
  43. invarlock/eval/tasks/text_generation.py +102 -0
  44. invarlock/guards/invariants.py +19 -10
  45. invarlock/guards/rmt.py +2 -2
  46. invarlock/guards/spectral.py +1 -1
  47. invarlock/guards/variance.py +2 -2
  48. invarlock/model_profile.py +64 -62
  49. invarlock/observability/health.py +6 -6
  50. invarlock/observability/metrics.py +108 -0
  51. invarlock/plugins/hf_bnb_adapter.py +32 -21
  52. invarlock/reporting/__init__.py +18 -4
  53. invarlock/reporting/guards_analysis.py +154 -4
  54. invarlock/reporting/html.py +61 -11
  55. invarlock/reporting/normalizer.py +9 -2
  56. invarlock/reporting/policy_utils.py +1 -1
  57. invarlock/reporting/primary_metric_utils.py +11 -11
  58. invarlock/reporting/render.py +876 -510
  59. invarlock/reporting/report.py +72 -30
  60. invarlock/reporting/{certificate.py → report_builder.py} +252 -99
  61. invarlock/reporting/{certificate_schema.py → report_schema.py} +22 -22
  62. invarlock/reporting/report_types.py +6 -1
  63. invarlock/reporting/telemetry.py +86 -0
  64. invarlock-0.3.8.dist-info/METADATA +283 -0
  65. {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/RECORD +69 -64
  66. {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/WHEEL +1 -1
  67. {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/entry_points.txt +5 -3
  68. invarlock/adapters/hf_gpt2.py +0 -404
  69. invarlock/adapters/hf_llama.py +0 -487
  70. invarlock/cli/commands/certify.py +0 -422
  71. invarlock-0.3.6.dist-info/METADATA +0 -588
  72. {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/licenses/LICENSE +0 -0
  73. {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/top_level.txt +0 -0
@@ -1,15 +1,16 @@
  """
- InvarLock Safety Certificate Generation
- ==================================
+ InvarLock Evaluation Report Generation
+ =====================================

- Generate standardized safety certificates from RunReport and baseline comparison.
- Certificates are standalone, portable verification artifacts that can be used
- for CI/CD gates and regulatory compliance.
+ Generate standardized evaluation reports from RunReport and baseline
+ comparison.
+ Evaluation reports are standalone, portable artifacts that record statistical
+ gates and evidence for CI/CD checks and audits (not formal verification).
  """

  from __future__ import annotations

- ## Core certificate generation and analysis orchestration lives here.
+ ## Core evaluation report building and analysis orchestration lives here.
  # mypy: ignore-errors
  import copy
  import hashlib
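
The docstring change above captures the central rename in 0.3.8: safety certificates become evaluation reports. A minimal sketch of the renamed import surface, based on the __all__ list at the end of this diff (whether invarlock.reporting re-exports the same names is not shown here):

    from invarlock.reporting.report_builder import (
        REPORT_JSON_SCHEMA,     # structural JSON Schema, now provided by report_schema
        REPORT_SCHEMA_VERSION,  # schema version checked by validate_report
        make_report,            # formerly make_certificate
        validate_report,        # formerly validate_certificate
    )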
@@ -38,11 +39,7 @@ from invarlock.eval.primary_metric import compute_primary_metric_from_report, ge
  from invarlock.eval.tail_stats import evaluate_metric_tail
  from invarlock.utils.digest import hash_json

- from . import certificate_schema as _cert_schema
- from .certificate_schema import (
- CERTIFICATE_JSON_SCHEMA,
- CERTIFICATE_SCHEMA_VERSION,
- )
+ from . import report_schema as _report_schema
  from .dataset_hashing import (
  _extract_dataset_info,
  )
@@ -52,10 +49,15 @@ from .guards_analysis import (
  _extract_spectral_analysis,
  _extract_variance_analysis,
  )
- from .report_types import RunReport, validate_report
+ from .report_schema import (
+ REPORT_JSON_SCHEMA,
+ REPORT_SCHEMA_VERSION,
+ )
+ from .report_types import RunReport
+ from .report_types import validate_report as validate_run_report

  # Expose compute_window_hash for tests that monkeypatch it
- # compute_window_hash used to be exposed via certificate; tests now patch
+ # compute_window_hash used to be exposed via the evaluation report builder; tests now patch
  # dataset_hashing.compute_window_hash directly, so this import is no longer needed.
  from .utils import (
  _coerce_int,
@@ -102,7 +104,7 @@ def _is_ppl_kind(name: Any) -> bool:


  ## NOTE: Deprecated helper `_get_ppl_final` was removed; callers should
- ## use the normalized primary_metric block directly via make_certificate or
+ ## use the normalized primary_metric block directly via make_report or
  ## report processing utilities.


@@ -130,8 +132,8 @@ def _compute_edit_digest(report: dict) -> dict:
  return {"family": family, "impl_hash": impl_hash, "version": 1}


- def _compute_confidence_label(certificate: dict[str, Any]) -> dict[str, Any]:
- """Compute certificate confidence label based on stability and CI width.
+ def _compute_confidence_label(evaluation_report: dict[str, Any]) -> dict[str, Any]:
+ """Compute evaluation report confidence label based on stability and CI width.

  Heuristics:
  - High: ppl_acceptable=True, unstable=False, width <= 0.03 (ratio) or <= 1.0 pp for accuracy
@@ -139,7 +141,7 @@ def _compute_confidence_label(certificate: dict[str, Any]) -> dict[str, Any]:
  - Low: otherwise (floors unmet, failure, or missing bounds)
  Returns a dict with label, basis, width and threshold for transparency.
  """
- validation = certificate.get("validation", {}) or {}
+ validation = evaluation_report.get("validation", {}) or {}
  pm_ok = bool(validation.get("primary_metric_acceptable", False))
  # Basis label shown in confidence block:
  # - For ppl-like metrics, use 'ppl_ratio' to reflect ratio width threshold
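
The heuristic documented above keys the label off interval width. A paraphrased sketch of the High bar only (the Medium rule sits on lines that fall outside these hunks, so it is not reproduced, and the function name is illustrative):

    def passes_high_bar(pm_ok: bool, unstable: bool, width: float, kind: str) -> bool:
        # Width threshold depends on the metric kind: ratio-style metrics (ppl)
        # use a 3% interval width, accuracy-style metrics use 1.0 percentage
        # point; resolved_policy.confidence may override both thresholds.
        threshold = 0.03 if kind == "ppl" else 1.0
        return pm_ok and not unstable and width <= threshold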
@@ -148,7 +150,7 @@ def _compute_confidence_label(certificate: dict[str, Any]) -> dict[str, Any]:
  basis = "primary_metric"
  lo = hi = float("nan")
  try:
- pm = certificate.get("primary_metric", {}) or {}
+ pm = evaluation_report.get("primary_metric", {}) or {}
  kind = str(pm.get("kind", "") or "").lower()
  if isinstance(pm, dict) and pm and pm.get("display_ci"):
  dci = pm.get("display_ci")
@@ -169,7 +171,7 @@ def _compute_confidence_label(certificate: dict[str, Any]) -> dict[str, Any]:
  thr_ratio = 0.03 # 3% width for ratio
  thr_pp = 1.0 # 1.0 percentage point for accuracy kinds
  try:
- pol = certificate.get("resolved_policy")
+ pol = evaluation_report.get("resolved_policy")
  if isinstance(pol, dict):
  conf_pol = pol.get("confidence")
  if isinstance(conf_pol, dict):
@@ -186,7 +188,7 @@ def _compute_confidence_label(certificate: dict[str, Any]) -> dict[str, Any]:

  # Unstable hint from primary metric (if provided)
  try:
- unstable = bool((certificate.get("primary_metric") or {}).get("unstable"))
+ unstable = bool((evaluation_report.get("primary_metric") or {}).get("unstable"))
  except Exception: # pragma: no cover
  unstable = False

@@ -212,39 +214,39 @@ def _compute_confidence_label(certificate: dict[str, Any]) -> dict[str, Any]:
  }


- # Minimal JSON Schema describing the canonical shape of a certificate.
+ # Minimal JSON Schema describing the canonical shape of an evaluation report.
  # This focuses on structural validity; numerical thresholds are validated
  # separately in metric-specific logic.
- # JSON Schema is provided by certificate_schema; no duplication here.
+ # JSON Schema is provided by report_schema; no duplication here.


  # Mirror jsonschema and structural validator for test monkeypatching compatibility.
- jsonschema = getattr(_cert_schema, "jsonschema", None)
+ jsonschema = getattr(_report_schema, "jsonschema", None)


- def _validate_with_jsonschema(certificate: dict[str, Any]) -> bool:
+ def _validate_with_jsonschema(evaluation_report: dict[str, Any]) -> bool:
  if jsonschema is None:
  return True
  try:
- jsonschema.validate(instance=certificate, schema=CERTIFICATE_JSON_SCHEMA)
+ jsonschema.validate(instance=evaluation_report, schema=REPORT_JSON_SCHEMA)
  return True
  except Exception: # pragma: no cover
  return False


- def validate_certificate(certificate: dict[str, Any]) -> bool:
- """Validate that a certificate has all required fields and valid data."""
+ def validate_report(evaluation_report: dict[str, Any]) -> bool:
+ """Validate that an evaluation report has all required fields and valid data."""
  try:
- if certificate.get("schema_version") != CERTIFICATE_SCHEMA_VERSION:
+ if evaluation_report.get("schema_version") != REPORT_SCHEMA_VERSION:
  return False
  # Prefer JSON Schema structural validation; if unavailable or too strict,
  # fall back to a lenient minimal check used by unit tests.
- if not _validate_with_jsonschema(certificate):
+ if not _validate_with_jsonschema(evaluation_report):
  # Minimal fallback: require schema version + run_id + primary_metric
- run_id_ok = isinstance(certificate.get("run_id"), str) and bool(
- certificate.get("run_id")
+ run_id_ok = isinstance(evaluation_report.get("run_id"), str) and bool(
+ evaluation_report.get("run_id")
  )
- pm = certificate.get("primary_metric")
+ pm = evaluation_report.get("primary_metric")
  pm_ok = isinstance(pm, dict) and (
  isinstance(pm.get("final"), int | float)
  or (isinstance(pm.get("kind"), str) and bool(pm.get("kind")))
@@ -252,7 +254,7 @@ def validate_certificate(certificate: dict[str, Any]) -> bool:
  if not (run_id_ok and pm_ok):
  return False

- validation = certificate.get("validation", {})
+ validation = evaluation_report.get("validation", {})
  for flag in [
  "preview_final_drift_acceptable",
  "primary_metric_acceptable",
@@ -427,8 +429,8 @@ def _load_validation_allowlist() -> set[str]:
  # disallow unknown validation keys at schema level.
  try:
  _vkeys = _load_validation_allowlist()
- if isinstance(CERTIFICATE_JSON_SCHEMA.get("properties"), dict):
- vspec = CERTIFICATE_JSON_SCHEMA["properties"].get("validation")
+ if isinstance(REPORT_JSON_SCHEMA.get("properties"), dict):
+ vspec = REPORT_JSON_SCHEMA["properties"].get("validation")
  if isinstance(vspec, dict):
  vspec["properties"] = {k: {"type": "boolean"} for k in _vkeys}
  vspec["additionalProperties"] = False
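
The renamed validate_report and the allowlist patch above define the structural contract. A sketch of the smallest payload the lenient fallback accepts, with placeholder values (the validation-flag checks that continue beyond the shown hunk may still reject a report):

    from invarlock.reporting.report_builder import REPORT_SCHEMA_VERSION, validate_report

    minimal = {
        "schema_version": REPORT_SCHEMA_VERSION,           # must match exactly
        "run_id": "demo-run",                              # non-empty string
        "primary_metric": {"kind": "ppl", "final": 12.3},  # numeric final or non-empty kind
    }
    ok = validate_report(minimal)  # downstream flag checks still apply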
@@ -445,7 +447,7 @@ except Exception: # pragma: no cover
  def _normalize_and_validate_report(report: RunReport | dict[str, Any]) -> RunReport:
  """Normalize a possibly-minimal report and validate its structure.

- Uses the local normalizer when available, then checks `validate_report`.
+ Uses the local normalizer when available, then checks `validate_run_report`.
  Raises ValueError on invalid input. Returns the normalized RunReport.
  """
  try:
@@ -455,13 +457,13 @@ def _normalize_and_validate_report(report: RunReport | dict[str, Any]) -> RunRep
  report = _norm(report)
  except Exception: # pragma: no cover
  pass
- if not validate_report(report):
+ if not validate_run_report(report):
  raise ValueError("Invalid RunReport structure")
  return report


- def _extract_certificate_meta(report: RunReport) -> dict[str, Any]:
- """Extract the certificate metadata block with a full seed bundle."""
+ def _extract_report_meta(report: RunReport) -> dict[str, Any]:
+ """Extract the evaluation report metadata block with a full seed bundle."""
  meta_section = (
  report.get("meta", {}) if isinstance(report.get("meta"), dict) else {}
  )
@@ -738,22 +740,22 @@ def _fallback_paired_windows(
  return paired_windows


- def make_certificate(
+ def make_report(
  report: RunReport,
  baseline: RunReport | dict[str, Any],
  ) -> dict[str, Any]:
  """
- Generate a safety certificate from a RunReport and baseline comparison.
+ Generate an evaluation report from a RunReport and baseline comparison.

- The certificate is a standalone, portable artifact that contains all
- essential metrics and comparisons needed for safety verification.
+ The evaluation report is a standalone, portable artifact that contains all
+ essential paired metrics and comparisons used by InvarLock gates.

  Args:
- report: The guarded run report to certify
+ report: The guarded run report to evaluate
  baseline: Step-0 baseline RunReport or baseline metrics dict

  Returns:
- Certificate dictionary with all required fields
+ Evaluation report dictionary with all required fields

  Raises:
  ValueError: If inputs are invalid or required data is missing
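
A usage sketch for the renamed entry point; the file names are placeholders and error handling is minimal:

    import json
    from invarlock.reporting.report_builder import make_report

    with open("guarded_run.json") as fh:     # RunReport from the guarded run
        run_report = json.load(fh)
    with open("baseline_run.json") as fh:    # step-0 baseline RunReport or metrics dict
        baseline = json.load(fh)

    try:
        evaluation_report = make_report(run_report, baseline)
    except ValueError as exc:                # raised when inputs are invalid or incomplete
        raise SystemExit(f"report build failed: {exc}")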
@@ -764,13 +766,24 @@ def make_certificate(
  # Normalize baseline input
  baseline_raw = baseline
  baseline_normalized = _normalize_baseline(baseline_raw)
+ baseline_report: RunReport | None = None
+ try:
+ if (
+ isinstance(baseline_raw, dict)
+ and "meta" in baseline_raw
+ and "metrics" in baseline_raw
+ and "edit" in baseline_raw
+ ):
+ baseline_report = _normalize_and_validate_report(baseline_raw)
+ except Exception: # pragma: no cover - baseline compare is best-effort
+ baseline_report = None

  # Extract core metadata with full seed bundle
- meta = _extract_certificate_meta(report)
+ meta = _extract_report_meta(report)

  # Propagate environment flags captured in the RunReport (e.g., deterministic algos,
  # TF32 controls, MPS/CUDA availability). This is useful for auditability and
- # reproducibility of certification runs.
+ # reproducibility of evaluation runs.
  try:
  env_flags = (
  report.get("meta", {}).get("env_flags")
@@ -1440,7 +1453,7 @@ def make_certificate(
  ppl_analysis["window_plan"] = window_plan_ctx

  # Extract invariant status
- invariants = _extract_invariants(report)
+ invariants = _extract_invariants(report, baseline=baseline_report)

  # Extract spectral analysis
  spectral = _extract_spectral_analysis(report, baseline_normalized)
@@ -1558,7 +1571,13 @@ def make_certificate(
  telemetry: dict[str, Any] = {}
  metrics_section = report.get("metrics", {})
  if isinstance(metrics_section, dict):
- for key in ("latency_ms_per_tok", "memory_mb_peak", "throughput_tok_per_s"):
+ for key in (
+ "latency_ms_per_tok",
+ "memory_mb_peak",
+ "gpu_memory_mb_peak",
+ "gpu_memory_reserved_mb_peak",
+ "throughput_tok_per_s",
+ ):
  value = metrics_section.get(key)
  if isinstance(value, int | float) and math.isfinite(value):
  telemetry[key] = float(value)
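
The widened key tuple above also pulls GPU memory peaks into telemetry. A condensed sketch of the filter it applies (helper name is illustrative):

    import math
    from typing import Any

    TELEMETRY_KEYS = (
        "latency_ms_per_tok",
        "memory_mb_peak",
        "gpu_memory_mb_peak",            # new in 0.3.8
        "gpu_memory_reserved_mb_peak",   # new in 0.3.8
        "throughput_tok_per_s",
    )

    def finite_telemetry(metrics: dict[str, Any]) -> dict[str, float]:
        # Keep only finite numeric values, mirroring the loop above.
        out: dict[str, float] = {}
        for key in TELEMETRY_KEYS:
            value = metrics.get(key)
            if isinstance(value, (int, float)) and math.isfinite(value):
                out[key] = float(value)
        return out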
@@ -1584,7 +1603,7 @@ def make_certificate(
  if device_name:
  telemetry.setdefault("device", device_name)

- # Build the certificate
+ # Build the evaluation report
  window_capacity_ctx = (
  report.get("metrics", {}).get("window_capacity")
  if isinstance(report.get("metrics"), dict)
@@ -1755,6 +1774,7 @@ def make_certificate(
  capacity_examples = None

  pm_acceptance_range = _resolve_pm_acceptance_range_from_report(report)
+ pm_drift_band = _resolve_pm_drift_band_from_report(report)

  # Primary metric tail evidence and gate evaluation (ΔlogNLL vs baseline, per-window).
  pm_tail_result: dict[str, Any] = {}
@@ -1881,6 +1901,12 @@ def make_certificate(
  except Exception: # pragma: no cover - defensive against patched functions
  validation_kwargs["pm_acceptance_range"] = pm_acceptance_range

+ try:
+ if "pm_drift_band" in inspect.signature(_compute_validation_flags).parameters:
+ validation_kwargs["pm_drift_band"] = pm_drift_band
+ except Exception: # pragma: no cover - defensive against patched functions
+ validation_kwargs["pm_drift_band"] = pm_drift_band
+
  try:
  if "pm_tail" in inspect.signature(_compute_validation_flags).parameters:
  validation_kwargs["pm_tail"] = pm_tail_result
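
The try/except pairs above are a compatibility shim: tests monkeypatch _compute_validation_flags, so new keyword arguments are only forwarded when the (possibly patched) callable declares them, and are forwarded unconditionally if introspection fails. A condensed sketch of the pattern, with generic names:

    import inspect
    from typing import Any, Callable

    def forward_if_accepted(
        target: Callable[..., Any], kwargs: dict[str, Any], name: str, value: Any
    ) -> None:
        # Forward `name` only when the target's signature declares it;
        # fall back to forwarding it anyway if introspection raises.
        try:
            if name in inspect.signature(target).parameters:
                kwargs[name] = value
        except Exception:
            kwargs[name] = value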
@@ -1895,8 +1921,8 @@ def make_certificate(
  k: bool(v) for k, v in validation_flags.items() if k in _allowed_validation
  }

- certificate = {
- "schema_version": CERTIFICATE_SCHEMA_VERSION,
+ evaluation_report = {
+ "schema_version": REPORT_SCHEMA_VERSION,
  "run_id": current_run_id,
  "meta": meta,
  "auto": auto,
@@ -1939,8 +1965,8 @@ def make_certificate(
  _tiny_relax_env = False
  if _tiny_relax_env:
  try:
- certificate.setdefault("auto", {})["tiny_relax"] = True
- prov = certificate.setdefault("provenance", {})
+ evaluation_report.setdefault("auto", {})["tiny_relax"] = True
+ prov = evaluation_report.setdefault("provenance", {})
  flags = prov.setdefault("flags", [])
  if "tiny_relax" not in flags:
  flags.append("tiny_relax")
@@ -1966,12 +1992,12 @@ def make_certificate(
  and "value" in qo
  and math.isfinite(float(qo.get("value", float("nan"))))
  ):
- certificate["quality_overhead"] = qo
+ evaluation_report["quality_overhead"] = qo
  except Exception: # pragma: no cover
  pass

  try:
- _propagate_pairing_stats(certificate, ppl_analysis)
+ _propagate_pairing_stats(evaluation_report, ppl_analysis)
  except Exception: # pragma: no cover
  pass

@@ -2032,7 +2058,7 @@ def make_certificate(
  (resolved_policy.get("variance") or {}).get("min_effect_lognll", 0.0) or 0.0
  )

- certificate["policy_digest"] = {
+ evaluation_report["policy_digest"] = {
  "policy_version": POLICY_VERSION,
  "tier_policy_name": cur_tier,
  "thresholds_hash": thresholds_hash,
@@ -2063,7 +2089,7 @@ def make_certificate(
  payload[key] = item[key]
  sanitized.append(payload)
  if sanitized:
- certificate["secondary_metrics"] = sanitized
+ evaluation_report["secondary_metrics"] = sanitized
  except Exception: # pragma: no cover
  pass

@@ -2111,7 +2137,7 @@ def make_certificate(
  except Exception: # pragma: no cover
  continue
  if out:
- certificate["classification"] = {"subgroups": out}
+ evaluation_report["classification"] = {"subgroups": out}
  except Exception: # pragma: no cover
  pass

@@ -2127,7 +2153,7 @@ def make_certificate(
  if isinstance(container.get("metrics"), dict)
  else {}
  )
- # Edited report case: also check certificate telemetry keys
+ # Edited report case: also check evaluation_report telemetry keys
  telem = telemetry if isinstance(telemetry, dict) else {}
  # Prefer explicit p50/p95 throughput keys if present
  for key in ("latency_ms_p50", "latency_ms_p95", "throughput_sps"):
@@ -2168,17 +2194,24 @@ def make_certificate(
  entry["ratio"] = float("nan")
  system_overhead[metric_key] = entry
  if system_overhead:
- certificate["system_overhead"] = system_overhead
+ evaluation_report["system_overhead"] = system_overhead
  except Exception: # pragma: no cover
  pass

  # Attach/normalize primary metric block (moved to helper)
  from .primary_metric_utils import attach_primary_metric as _attach_pm

- _attach_pm(certificate, report, baseline_raw, baseline_ref, ppl_analysis)
+ _attach_pm(evaluation_report, report, baseline_raw, baseline_ref, ppl_analysis)
+ try:
+ if isinstance(pm_drift_band, dict) and pm_drift_band:
+ pm_block = evaluation_report.get("primary_metric")
+ if isinstance(pm_block, dict):
+ pm_block.setdefault("drift_band", dict(pm_drift_band))
+ except Exception: # pragma: no cover
+ pass
  _enforce_display_ci_alignment(
  ratio_ci_source,
- certificate.get("primary_metric"),
+ evaluation_report.get("primary_metric"),
  logloss_delta_ci,
  window_plan_profile,
  )
@@ -2186,8 +2219,8 @@ def make_certificate(
  # Ensure primary_metric has display_ci populated for schema invariants
  try:
  pm = (
- certificate.get("primary_metric", {})
- if isinstance(certificate.get("primary_metric"), dict)
+ evaluation_report.get("primary_metric", {})
+ if isinstance(evaluation_report.get("primary_metric"), dict)
  else None
  )
  if isinstance(pm, dict) and pm:
@@ -2227,8 +2260,8 @@ def make_certificate(
  if not kind:
  kind = "ppl"
  windows_cfg = (
- certificate.get("dataset", {}).get("windows", {})
- if isinstance(certificate.get("dataset"), dict)
+ evaluation_report.get("dataset", {}).get("windows", {})
+ if isinstance(evaluation_report.get("dataset"), dict)
  else {}
  )
  n_prev = windows_cfg.get("preview")
@@ -2236,7 +2269,7 @@ def make_certificate(
  tokens_total = None
  try:
  tokens_total = (
- certificate.get("dataset", {}).get("hash", {}).get("total_tokens")
+ evaluation_report.get("dataset", {}).get("hash", {}).get("total_tokens")
  )
  except Exception: # pragma: no cover
  tokens_total = None
@@ -2244,7 +2277,7 @@ def make_certificate(
  ci_lo = None
  ci_hi = None
  ratio = None
- pmc = certificate.get("primary_metric", {})
+ pmc = evaluation_report.get("primary_metric", {})
  rci = pmc.get("display_ci") or pmc.get("ci")
  if isinstance(rci, tuple | list) and len(rci) == 2:
  ci_lo, ci_hi = rci[0], rci[1]
@@ -2256,7 +2289,7 @@ def make_certificate(
  except Exception: # pragma: no cover
  ci_w = None
  # Gate outcome
- val = certificate.get("validation", {})
+ val = evaluation_report.get("validation", {})
  gate_ok = None
  try:
  gate_ok = bool(val.get("primary_metric_acceptable"))
@@ -2271,10 +2304,10 @@ def make_certificate(
  f"tokens={tokens_total}",
  ]
  try:
- split = (certificate.get("provenance", {}) or {}).get("dataset_split")
+ split = (evaluation_report.get("provenance", {}) or {}).get("dataset_split")
  if not split:
  split = (report.get("provenance", {}) or {}).get("dataset_split")
- sf = (certificate.get("provenance", {}) or {}).get("split_fallback")
+ sf = (evaluation_report.get("provenance", {}) or {}).get("split_fallback")
  if sf is None:
  sf = (report.get("provenance", {}) or {}).get("split_fallback")
  if split:
@@ -2290,7 +2323,7 @@ def make_certificate(
  if isinstance(gate_ok, bool):
  parts.append(f"gate={'pass' if gate_ok else 'fail'}")
  summary_line = "INVARLOCK_TELEMETRY " + " ".join(parts)
- certificate.setdefault("telemetry", {})["summary_line"] = summary_line
+ evaluation_report.setdefault("telemetry", {})["summary_line"] = summary_line
  if str(os.environ.get("INVARLOCK_TELEMETRY", "")).strip().lower() in {
  "1",
  "true",
@@ -2303,17 +2336,17 @@ def make_certificate(

  # Attach confidence label (non-gating)
  try:
- certificate["confidence"] = _compute_confidence_label(certificate)
+ evaluation_report["confidence"] = _compute_confidence_label(evaluation_report)
  except Exception: # pragma: no cover
  pass

- return certificate
+ return evaluation_report


  # Console Validation Block helpers have moved to invarlock.reporting.render.


- ## NOTE: render_certificate_markdown has been moved to invarlock.reporting.render.
+ ## NOTE: render_report_markdown has been moved to invarlock.reporting.render.
  ## It is re-exported at the bottom of this module to preserve the public API.
  ## Private helper functions

@@ -2591,7 +2624,7 @@ def _extract_structural_deltas(report: RunReport) -> dict[str, Any]:
  def _extract_edit_metadata(
  report: RunReport, plugin_provenance: dict[str, Any]
  ) -> dict[str, Any]:
- """Extract edit-level provenance and configuration metadata for the certificate."""
+ """Extract edit-level provenance and configuration metadata for the evaluation report."""

  edit_section = _get_mapping(report, "edit")
  if not edit_section:
@@ -2614,7 +2647,7 @@ def _extract_edit_metadata(
  alg_lower = str(algorithm).strip().lower()
  except Exception: # pragma: no cover
  alg_lower = ""
- allowed_algorithms = {"quant_rtn", "noop"}
+ allowed_algorithms = {"quant_rtn", "noop", "custom"}
  if alg_lower not in allowed_algorithms:
  algorithm = ""

@@ -2988,12 +3021,12 @@ def _compute_quality_overhead_from_guard(


  def _propagate_pairing_stats(
- certificate: dict[str, Any], ppl_analysis: dict[str, Any] | None
+ evaluation_report: dict[str, Any], ppl_analysis: dict[str, Any] | None
  ) -> None:
- """Surface pairing statistics inside certificate.dataset.windows.stats."""
- if not isinstance(certificate, dict):
+ """Surface pairing statistics inside evaluation_report.dataset.windows.stats."""
+ if not isinstance(evaluation_report, dict):
  return
- ds = certificate.get("dataset", {})
+ ds = evaluation_report.get("dataset", {})
  if not isinstance(ds, dict):
  return
  windows = ds.get("windows", {})
@@ -3047,7 +3080,7 @@ def _propagate_pairing_stats(
  windows["stats"] = stats
  if windows is not ds.get("windows"):
  ds["windows"] = windows
- certificate["dataset"] = ds
+ evaluation_report["dataset"] = ds


  def _build_provenance_block(
@@ -3225,6 +3258,105 @@ def _resolve_pm_acceptance_range_from_report(
  return {"min": float(min_val), "max": float(max_val)}


+ def _resolve_pm_drift_band_from_report(
+ report: dict[str, Any] | None,
+ ) -> dict[str, float]:
+ """Resolve preview→final drift band from report context/meta/env."""
+
+ base_min = 0.95
+ base_max = 1.05
+
+ def _safe_float(val: Any) -> float | None:
+ try:
+ if val is None:
+ return None
+ out = float(val)
+ except Exception:
+ return None
+ return out if math.isfinite(out) else None
+
+ cfg_min = None
+ cfg_max = None
+
+ ctx = report.get("context") if isinstance(report, dict) else None
+ if isinstance(ctx, dict):
+ pm_ctx = ctx.get("primary_metric")
+ if isinstance(pm_ctx, dict):
+ band = pm_ctx.get("drift_band")
+ if isinstance(band, dict):
+ cfg_min = _safe_float(band.get("min"))
+ cfg_max = _safe_float(band.get("max"))
+ elif isinstance(band, list | tuple) and len(band) == 2:
+ cfg_min = _safe_float(band[0])
+ cfg_max = _safe_float(band[1])
+ if cfg_min is None or cfg_max is None:
+ alt = ctx.get("pm_drift_band")
+ if isinstance(alt, dict):
+ cfg_min = (
+ cfg_min if cfg_min is not None else _safe_float(alt.get("min"))
+ )
+ cfg_max = (
+ cfg_max if cfg_max is not None else _safe_float(alt.get("max"))
+ )
+
+ if (cfg_min is None or cfg_max is None) and isinstance(report, dict):
+ meta = report.get("meta")
+ if isinstance(meta, dict):
+ meta_band = meta.get("pm_drift_band")
+ if isinstance(meta_band, dict):
+ cfg_min = (
+ cfg_min
+ if cfg_min is not None
+ else _safe_float(meta_band.get("min"))
+ )
+ cfg_max = (
+ cfg_max
+ if cfg_max is not None
+ else _safe_float(meta_band.get("max"))
+ )
+
+ def _parse_env(name: str) -> float | None:
+ try:
+ raw = os.environ.get(name, "")
+ if raw is None or str(raw).strip() == "":
+ return None
+ return float(raw)
+ except Exception:
+ return None
+
+ env_min = _parse_env("INVARLOCK_PM_DRIFT_MIN")
+ env_max = _parse_env("INVARLOCK_PM_DRIFT_MAX")
+
+ has_explicit = any(v is not None for v in (cfg_min, cfg_max, env_min, env_max))
+ if not has_explicit:
+ return {}
+
+ min_val = (
+ env_min if env_min is not None else cfg_min if cfg_min is not None else base_min
+ )
+ max_val = (
+ env_max if env_max is not None else cfg_max if cfg_max is not None else base_max
+ )
+
+ try:
+ if min_val is not None and min_val <= 0:
+ min_val = base_min
+ except Exception:
+ min_val = base_min
+ try:
+ if max_val is not None and max_val <= 0:
+ max_val = base_max
+ except Exception:
+ max_val = base_max
+ try:
+ if min_val is not None and max_val is not None and min_val >= max_val:
+ min_val, max_val = base_min, base_max
+ except Exception:
+ min_val, max_val = base_min, base_max
+
+ return {"min": float(min_val), "max": float(max_val)}
+
+
  def _compute_validation_flags(
  ppl: dict[str, Any],
  spectral: dict[str, Any],
@@ -3238,9 +3370,10 @@ def _compute_validation_flags(
  moe: dict[str, Any] | None = None,
  dataset_capacity: dict[str, Any] | None = None,
  pm_acceptance_range: dict[str, float] | None = None,
+ pm_drift_band: dict[str, float] | None = None,
  pm_tail: dict[str, Any] | None = None,
  ) -> dict[str, bool]:
- """Compute validation flags for the certificate including canonical gates."""
+ """Compute validation flags for the evaluation report including canonical gates."""
  tier = (tier or "balanced").lower()
  # Dev-only tiny relax: widen gates and lower floors when explicitly requested
  import os as _os
@@ -3301,9 +3434,27 @@ def _compute_validation_flags(
  ratio_limit = min(ratio_limit, float(target_ratio))

  # Canonical Gates
- # 1. Drift gate: 0.95 ≤ final/preview ≤ 1.05
+ # 1. Drift gate: by default 0.95 ≤ final/preview ≤ 1.05 (configurable)
  drift_ratio = ppl.get("preview_final_ratio", 1.0)
- preview_final_drift_acceptable = 0.95 <= drift_ratio <= 1.05
+ drift_min = 0.95
+ drift_max = 1.05
+ if isinstance(pm_drift_band, dict):
+ try:
+ cand_min = pm_drift_band.get("min")
+ cand_max = pm_drift_band.get("max")
+ if isinstance(cand_min, int | float) and isinstance(cand_max, int | float):
+ cand_min_f = float(cand_min)
+ cand_max_f = float(cand_max)
+ if (
+ math.isfinite(cand_min_f)
+ and math.isfinite(cand_max_f)
+ and 0 < cand_min_f < cand_max_f
+ ):
+ drift_min = cand_min_f
+ drift_max = cand_max_f
+ except Exception: # pragma: no cover
+ pass
+ preview_final_drift_acceptable = drift_min <= drift_ratio <= drift_max
  if _tiny_relax:
  # Treat drift identity as informational in tiny dev demos
  preview_final_drift_acceptable = True
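
Read together with _resolve_pm_drift_band_from_report above, the drift gate's 0.95–1.05 band is now configurable. A sketch of the override precedence (environment beats report context, which beats the defaults), using only names that appear in this diff:

    import os

    # Environment override, parsed by _resolve_pm_drift_band_from_report:
    os.environ["INVARLOCK_PM_DRIFT_MIN"] = "0.90"
    os.environ["INVARLOCK_PM_DRIFT_MAX"] = "1.10"

    # Equivalent report-level configuration (context.primary_metric.drift_band);
    # meta.pm_drift_band is consulted as a further fallback:
    context = {"primary_metric": {"drift_band": {"min": 0.90, "max": 1.10}}}

    # With neither set, the helper returns {} and the gate keeps 0.95 <= ratio <= 1.05.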
@@ -3463,7 +3614,7 @@ def _compute_validation_flags(
  if _tiny_relax and threshold_val < 0.10:
  threshold_val = 0.10
  if not math.isfinite(ratio_val):
- # In dev/Compare-&-Certify flows we often lack a bare run; treat missing metric as pass
+ # In dev/Compare-&-Evaluate flows we often lack a bare run; treat missing metric as pass
  guard_overhead_pass = True
  else:
  guard_overhead_pass = ratio_val <= (1.0 + max(0.0, threshold_val))
@@ -3619,7 +3770,7 @@ def _generate_run_id(report: RunReport) -> str:
  return hashlib.sha256(base_str.encode()).hexdigest()[:16]


- ## NOTE: _compute_certificate_hash moved to invarlock.reporting.render and is re-exported below.
+ ## NOTE: _compute_report_hash moved to invarlock.reporting.render and is re-exported below.


  def _analyze_bitwidth_map(bitwidth_map: dict[str, Any]) -> dict[str, Any]:
@@ -3964,22 +4115,24 @@ def _extract_compression_diagnostics(

  # Re-export rendering API from dedicated module to avoid bloat/cycles
  # Rendering helpers live in invarlock.reporting.render; internal code should import there directly.
- # Tests and public API expect render_certificate_markdown to be available from
- # invarlock.reporting.certificate. Import lazily at module end to avoid cycles with
+ # Tests and public API expect render_report_markdown to be available from
+ # invarlock.reporting.report_builder. Import lazily at module end to avoid cycles with
  # invarlock.reporting.render which imports this module as a namespace.
  try: # pragma: no cover - simple re-export
  from .render import (
  compute_console_validation_block, # type: ignore
- render_certificate_markdown, # type: ignore
+ render_report_markdown, # type: ignore
  )
  except Exception: # pragma: no cover - defensive fallback

- def render_certificate_markdown(certificate: dict[str, Any]) -> str: # type: ignore
+ def render_report_markdown(evaluation_report: dict[str, Any]) -> str: # type: ignore
  raise ImportError(
- "render_certificate_markdown is unavailable; rendering dependencies missing"
+ "render_report_markdown is unavailable; rendering dependencies missing"
  )

- def compute_console_validation_block(certificate: dict[str, Any]) -> dict[str, Any]: # type: ignore
+ def compute_console_validation_block(
+ evaluation_report: dict[str, Any],
+ ) -> dict[str, Any]: # type: ignore
  raise ImportError(
  "compute_console_validation_block is unavailable; rendering dependencies missing"
  )
@@ -3987,12 +4140,12 @@ except Exception: # pragma: no cover - defensive fallback

  # Export public API
  __all__ = [
- "make_certificate",
- "validate_certificate",
+ "make_report",
+ "validate_report",
  "_validate_with_jsonschema",
  "jsonschema",
- "render_certificate_markdown",
+ "render_report_markdown",
  "compute_console_validation_block",
- "CERTIFICATE_SCHEMA_VERSION",
- "CERTIFICATE_JSON_SCHEMA",
+ "REPORT_SCHEMA_VERSION",
+ "REPORT_JSON_SCHEMA",
  ]