invarlock 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. invarlock/__init__.py +2 -2
  2. invarlock/_data/runtime/tiers.yaml +57 -30
  3. invarlock/adapters/__init__.py +11 -15
  4. invarlock/adapters/auto.py +35 -40
  5. invarlock/adapters/capabilities.py +2 -2
  6. invarlock/adapters/hf_causal.py +418 -0
  7. invarlock/adapters/{hf_onnx.py → hf_causal_onnx.py} +3 -3
  8. invarlock/adapters/hf_mixin.py +25 -4
  9. invarlock/adapters/{hf_bert.py → hf_mlm.py} +4 -11
  10. invarlock/adapters/{hf_t5.py → hf_seq2seq.py} +9 -9
  11. invarlock/calibration/spectral_null.py +15 -10
  12. invarlock/calibration/variance_ve.py +0 -2
  13. invarlock/cli/adapter_auto.py +31 -21
  14. invarlock/cli/app.py +73 -2
  15. invarlock/cli/commands/calibrate.py +6 -2
  16. invarlock/cli/commands/certify.py +651 -91
  17. invarlock/cli/commands/doctor.py +11 -11
  18. invarlock/cli/commands/explain_gates.py +57 -8
  19. invarlock/cli/commands/plugins.py +13 -9
  20. invarlock/cli/commands/report.py +233 -69
  21. invarlock/cli/commands/run.py +1066 -244
  22. invarlock/cli/commands/verify.py +154 -15
  23. invarlock/cli/config.py +22 -6
  24. invarlock/cli/doctor_helpers.py +4 -5
  25. invarlock/cli/output.py +193 -0
  26. invarlock/cli/provenance.py +1 -1
  27. invarlock/core/api.py +45 -5
  28. invarlock/core/auto_tuning.py +65 -20
  29. invarlock/core/bootstrap.py +1 -1
  30. invarlock/core/contracts.py +7 -1
  31. invarlock/core/registry.py +11 -13
  32. invarlock/core/runner.py +425 -75
  33. invarlock/edits/quant_rtn.py +65 -37
  34. invarlock/eval/bench.py +3 -16
  35. invarlock/eval/data.py +82 -51
  36. invarlock/eval/metrics.py +63 -2
  37. invarlock/eval/primary_metric.py +23 -0
  38. invarlock/eval/tail_stats.py +230 -0
  39. invarlock/eval/tasks/__init__.py +12 -0
  40. invarlock/eval/tasks/classification.py +48 -0
  41. invarlock/eval/tasks/qa.py +36 -0
  42. invarlock/eval/tasks/text_generation.py +102 -0
  43. invarlock/guards/_estimators.py +154 -0
  44. invarlock/guards/invariants.py +19 -10
  45. invarlock/guards/policies.py +16 -6
  46. invarlock/guards/rmt.py +627 -546
  47. invarlock/guards/spectral.py +348 -110
  48. invarlock/guards/tier_config.py +32 -30
  49. invarlock/guards/variance.py +7 -31
  50. invarlock/guards_ref/rmt_ref.py +23 -23
  51. invarlock/model_profile.py +90 -42
  52. invarlock/observability/health.py +6 -6
  53. invarlock/observability/metrics.py +108 -0
  54. invarlock/reporting/certificate.py +384 -55
  55. invarlock/reporting/certificate_schema.py +3 -2
  56. invarlock/reporting/dataset_hashing.py +15 -2
  57. invarlock/reporting/guards_analysis.py +350 -277
  58. invarlock/reporting/html.py +55 -5
  59. invarlock/reporting/normalizer.py +13 -0
  60. invarlock/reporting/policy_utils.py +38 -36
  61. invarlock/reporting/primary_metric_utils.py +71 -17
  62. invarlock/reporting/render.py +852 -431
  63. invarlock/reporting/report.py +40 -4
  64. invarlock/reporting/report_types.py +11 -3
  65. invarlock/reporting/telemetry.py +86 -0
  66. invarlock/reporting/validate.py +1 -18
  67. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/METADATA +27 -13
  68. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/RECORD +72 -65
  69. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/WHEEL +1 -1
  70. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/entry_points.txt +5 -3
  71. invarlock/adapters/hf_gpt2.py +0 -404
  72. invarlock/adapters/hf_llama.py +0 -487
  73. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/licenses/LICENSE +0 -0
  74. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/top_level.txt +0 -0
@@ -12,19 +12,69 @@ from typing import Any
12
12
 
13
13
  from .render import render_certificate_markdown
14
14
 
15
+ markdown_module: Any | None = None
16
+ try:
17
+ import markdown as _markdown # type: ignore[import-untyped]
18
+ except Exception: # pragma: no cover - optional dependency
19
+ _markdown = None
20
+ else:
21
+ markdown_module = _markdown
22
+
23
+
24
+ _STATUS_BADGES = {
25
+ "\u2705 PASS": '<span class="badge pass">PASS</span>',
26
+ "\u2705 OK": '<span class="badge pass">OK</span>',
27
+ "\u274c FAIL": '<span class="badge fail">FAIL</span>',
28
+ "\u26a0\ufe0f WARN": '<span class="badge warn">WARN</span>',
29
+ "\u26a0 WARN": '<span class="badge warn">WARN</span>',
30
+ }
31
+
32
+
33
+ def _apply_status_badges(html_body: str) -> str:
34
+ updated = html_body
35
+ for token, replacement in _STATUS_BADGES.items():
36
+ updated = updated.replace(token, replacement)
37
+ return updated
38
+
15
39
 
16
40
  def render_certificate_html(certificate: dict[str, Any]) -> str:
17
41
  """Render a certificate as a simple HTML document.
18
42
 
19
- Uses the Markdown renderer and embeds the content in a <pre> block to ensure
20
- stable parity for snapshot tests without extra dependencies.
43
+ Uses the Markdown renderer and converts to HTML when available, falling back
44
+ to a <pre> block when the markdown dependency is missing.
21
45
  """
22
46
  md = render_certificate_markdown(certificate)
23
- body = f'<pre class="invarlock-md">{escape(md)}</pre>'
47
+ if markdown_module is None:
48
+ body = f'<pre class="invarlock-md">{escape(md)}</pre>'
49
+ else:
50
+ html_body = markdown_module.markdown(md, extensions=["tables", "fenced_code"])
51
+ html_body = _apply_status_badges(html_body)
52
+ body = f'<div class="invarlock-md">{html_body}</div>'
24
53
  return (
25
54
  '<!DOCTYPE html><html><head><meta charset="utf-8">'
26
- "<title>InvarLock Safety Certificate</title>"
27
- "<style>body{font-family:ui-monospace,Menlo,monospace;white-space:pre-wrap}</style>"
55
+ "<title>InvarLock Evaluation Certificate</title>"
56
+ "<style>"
57
+ ":root{--pass:#2da44e;--fail:#cf222e;--warn:#bf8700;--ink:#1f2328;"
58
+ "--muted:#57606a;--panel:#f6f8fa;--border:#d0d7de}"
59
+ "body{font-family:ui-sans-serif,system-ui,-apple-system,Segoe UI,sans-serif;"
60
+ "color:var(--ink);background:linear-gradient(180deg,#fff, #f6f8fa);"
61
+ "margin:0;padding:32px}"
62
+ ".invarlock-md{max-width:960px;margin:0 auto;padding:24px;background:#fff;"
63
+ "border:1px solid var(--border);border-radius:16px;box-shadow:0 10px 30px rgba(0,0,0,0.05)}"
64
+ "h1,h2,h3{margin-top:1.4em}h1{margin-top:0}"
65
+ "table{border-collapse:collapse;width:100%;margin:12px 0}"
66
+ "th,td{border:1px solid var(--border);padding:6px 8px;text-align:left}"
67
+ "code,pre{background:var(--panel);border-radius:8px}"
68
+ "pre{padding:12px;overflow:auto}"
69
+ ".badge{display:inline-block;padding:2px 8px;border-radius:999px;"
70
+ "font-size:0.75rem;font-weight:700;letter-spacing:0.02em;color:#fff}"
71
+ ".badge.pass{background:var(--pass)}"
72
+ ".badge.fail{background:var(--fail)}"
73
+ ".badge.warn{background:var(--warn)}"
74
+ "@media print{body{background:#fff;padding:0}.invarlock-md{box-shadow:none;"
75
+ "border:0}a{color:inherit;text-decoration:none}.badge{color:#000;"
76
+ "border:1px solid #000;background:transparent}}"
77
+ "</style>"
28
78
  "</head><body>" + body + "</body></html>"
29
79
  )
30
80
 
@@ -55,6 +55,8 @@ def normalize_run_report(report: Mapping[str, Any] | RunReport) -> RunReport:
55
55
  }
56
56
  # Preserve additional provenance knobs used by certificate/digests.
57
57
  for key in (
58
+ "pm_acceptance_range",
59
+ "pm_drift_band",
58
60
  "policy_overrides",
59
61
  "overrides",
60
62
  "plugins",
@@ -179,10 +181,16 @@ def normalize_run_report(report: Mapping[str, Any] | RunReport) -> RunReport:
179
181
  "latency_ms_p50",
180
182
  "latency_ms_p95",
181
183
  "memory_mb_peak",
184
+ "gpu_memory_mb_peak",
185
+ "gpu_memory_reserved_mb_peak",
186
+ "timings",
187
+ "guard_timings",
188
+ "memory_snapshots",
182
189
  "throughput_sps",
183
190
  "spectral",
184
191
  "rmt",
185
192
  "invariants",
193
+ "primary_metric_tail",
186
194
  "logloss_delta_ci",
187
195
  "bootstrap",
188
196
  "reduction",
@@ -237,6 +245,11 @@ def normalize_run_report(report: Mapping[str, Any] | RunReport) -> RunReport:
237
245
  flags=flags,
238
246
  )
239
247
 
248
+ # keep context when provided (profile/assurance provenance)
249
+ ctx = src.get("context")
250
+ if isinstance(ctx, Mapping):
251
+ out["context"] = dict(ctx)
252
+
240
253
  # keep evaluation_windows if provided (for deeper pairing-based features)
241
254
  ew = src.get("evaluation_windows")
242
255
  if isinstance(ew, dict):
@@ -48,6 +48,10 @@ def _compute_thresholds_payload(
48
48
  if not isinstance(pm_policy, dict):
49
49
  pm_policy = {}
50
50
 
51
+ pm_tail_policy = metrics_policy.get("pm_tail", {})
52
+ if not isinstance(pm_tail_policy, dict):
53
+ pm_tail_policy = {}
54
+
51
55
  acc_policy = metrics_policy.get("accuracy", {})
52
56
  if not isinstance(acc_policy, dict):
53
57
  acc_policy = {}
@@ -76,6 +80,12 @@ def _compute_thresholds_payload(
76
80
  resolved_policy.get("variance", {}) if isinstance(resolved_policy, dict) else {}
77
81
  )
78
82
 
83
+ def _safe_float_any(value: Any, default: float) -> float:
84
+ try:
85
+ return float(value)
86
+ except Exception:
87
+ return float(default)
88
+
79
89
  payload = {
80
90
  "tier": tier_lc,
81
91
  "pm_ratio": {
@@ -86,6 +96,22 @@ def _compute_thresholds_payload(
86
96
  ),
87
97
  "hysteresis_ratio": float(pm_policy.get("hysteresis_ratio", 0.0) or 0.0),
88
98
  },
99
+ "pm_tail": {
100
+ "mode": str(pm_tail_policy.get("mode", "warn") or "warn").strip().lower(),
101
+ "min_windows": int(pm_tail_policy.get("min_windows", 0) or 0),
102
+ "quantile": _safe_float_any(pm_tail_policy.get("quantile", 0.95), 0.95),
103
+ "quantile_max": (
104
+ float(pm_tail_policy.get("quantile_max"))
105
+ if isinstance(pm_tail_policy.get("quantile_max"), int | float)
106
+ else None
107
+ ),
108
+ "epsilon": _safe_float_any(pm_tail_policy.get("epsilon", 0.0), 0.0),
109
+ "mass_max": (
110
+ float(pm_tail_policy.get("mass_max"))
111
+ if isinstance(pm_tail_policy.get("mass_max"), int | float)
112
+ else None
113
+ ),
114
+ },
89
115
  "accuracy": {
90
116
  "delta_min_pp": float(acc_policy.get("delta_min_pp", -1.0) or -1.0),
91
117
  "min_examples": int(acc_policy.get("min_examples", 200) or 200),
@@ -110,16 +136,6 @@ def _compute_thresholds_hash(payload: dict[str, Any]) -> str:
110
136
  return hashlib.sha256(canonical.encode("utf-8")).hexdigest()[:16]
111
137
 
112
138
 
113
- def _promote_legacy_multiple_testing_key(payload: dict[str, Any]) -> None:
114
- """Promote legacy 'multipletesting' to 'multiple_testing' in-place if present."""
115
- try:
116
- legacy_mt = payload.pop("multipletesting", None)
117
- if legacy_mt is not None and "multiple_testing" not in payload:
118
- payload["multiple_testing"] = legacy_mt
119
- except Exception:
120
- pass
121
-
122
-
123
139
  def _resolve_policy_tier(report: RunReport) -> str:
124
140
  """Resolve the policy tier from report metadata or context."""
125
141
  tier: Any = None
@@ -218,15 +234,9 @@ def _build_resolved_policies(
218
234
  from .policy_utils import _format_family_caps as _ffc # self import safe
219
235
 
220
236
  spectral_resolved["family_caps"] = _ffc(spectral_caps)
221
- # Prefer observed policy sigma_quantile (accepting legacy aliases), then fallback
222
237
  pol_sq = None
223
238
  try:
224
239
  pol_sq = (spectral.get("policy", {}) or {}).get("sigma_quantile")
225
- if pol_sq is None:
226
- # Legacy aliases
227
- pol_sq = (spectral.get("policy", {}) or {}).get("contraction") or (
228
- spectral.get("policy", {}) or {}
229
- ).get("kappa")
230
240
  except Exception:
231
241
  pol_sq = None
232
242
  spectral_resolved["sigma_quantile"] = _safe_float(
@@ -276,6 +286,9 @@ def _build_resolved_policies(
276
286
  spectral_resolved["max_spectral_norm"] = spectral.get("policy", {}).get(
277
287
  "max_spectral_norm", spectral_resolved.get("max_spectral_norm")
278
288
  )
289
+ mc = spectral.get("measurement_contract")
290
+ if isinstance(mc, dict) and mc:
291
+ spectral_resolved["measurement_contract"] = copy.deepcopy(mc)
279
292
  resolved["spectral"] = spectral_resolved
280
293
 
281
294
  # RMT guard
@@ -295,15 +308,16 @@ def _build_resolved_policies(
295
308
  rmt_resolved["epsilon_default"] = _safe_float(epsilon_default_val, 0.1)
296
309
  from .policy_utils import _format_epsilon_map as _fem
297
310
 
298
- epsilon_map = _fem(rmt.get("epsilon_by_family") or rmt_resolved.pop("epsilon", {}))
311
+ epsilon_map = _fem(
312
+ rmt.get("epsilon_by_family") or rmt_resolved.get("epsilon_by_family") or {}
313
+ )
299
314
  if epsilon_map:
300
315
  rmt_resolved["epsilon_by_family"] = epsilon_map
301
- else:
302
- rmt_resolved.pop("epsilon", None)
303
- if "epsilon" in rmt_resolved:
304
- rmt_resolved.pop("epsilon", None)
305
316
  if "correct" in rmt_resolved:
306
317
  rmt_resolved["correct"] = bool(rmt_resolved["correct"])
318
+ mc = rmt.get("measurement_contract")
319
+ if isinstance(mc, dict) and mc:
320
+ rmt_resolved["measurement_contract"] = copy.deepcopy(mc)
307
321
  resolved["rmt"] = rmt_resolved
308
322
 
309
323
  # Variance guard
@@ -441,13 +455,9 @@ def _extract_effective_policies(report: RunReport) -> dict[str, Any]:
441
455
  elif guard_name == "spectral":
442
456
  sigma_quantile = guard_metrics.get(
443
457
  "sigma_quantile",
444
- guard_metrics.get("contraction", guard_metrics.get("kappa", 0.95)),
445
- )
446
- multiple_testing = guard_metrics.get("multiple_testing") or (
447
- guard_metrics.get("multipletesting")
448
- if isinstance(guard_metrics.get("multipletesting"), dict)
449
- else None
458
+ 0.95,
450
459
  )
460
+ multiple_testing = guard_metrics.get("multiple_testing")
451
461
  guard_policy = {
452
462
  "max_spectral_norm": guard_metrics.get("max_spectral_norm"),
453
463
  "stability_score": guard_metrics.get("stability_score", 0.95),
@@ -473,20 +483,13 @@ def _extract_effective_policies(report: RunReport) -> dict[str, Any]:
473
483
 
474
484
  if guard_policy:
475
485
  if guard_name == "spectral":
476
- sigma_quantile = guard_policy.get("sigma_quantile")
477
- if sigma_quantile is None:
478
- sigma_quantile = guard_policy.get("contraction")
479
- if sigma_quantile is None and "kappa" in guard_policy:
480
- sigma_quantile = guard_policy["kappa"]
481
486
  sanitized_policy = dict(guard_policy)
487
+ sigma_quantile = sanitized_policy.get("sigma_quantile")
482
488
  if sigma_quantile is not None:
483
489
  try:
484
490
  sanitized_policy["sigma_quantile"] = float(sigma_quantile)
485
491
  except (TypeError, ValueError):
486
492
  pass
487
- _promote_legacy_multiple_testing_key(sanitized_policy)
488
- sanitized_policy.pop("contraction", None)
489
- sanitized_policy.pop("kappa", None)
490
493
  if sanitized_policy.get("max_spectral_norm") in (None, 0):
491
494
  sanitized_policy["max_spectral_norm"] = None
492
495
  guard_policy = sanitized_policy
@@ -587,7 +590,6 @@ __all__ = [
587
590
  "_compute_variance_policy_digest",
588
591
  "_compute_thresholds_payload",
589
592
  "_compute_thresholds_hash",
590
- "_promote_legacy_multiple_testing_key",
591
593
  "_resolve_policy_tier",
592
594
  "_build_resolved_policies",
593
595
  "_extract_effective_policies",
@@ -30,6 +30,38 @@ def attach_primary_metric(
30
30
  pm = m.get("primary_metric") if isinstance(m, dict) else None
31
31
  if isinstance(pm, dict) and pm:
32
32
  pm_copy = copy.deepcopy(pm)
33
+ pm_copy.setdefault("invalid", bool(pm_copy.get("invalid", False)))
34
+ degraded_reason = pm_copy.get("degraded_reason")
35
+ preview_val = pm_copy.get("preview")
36
+ final_val = pm_copy.get("final")
37
+ ratio_val = pm_copy.get("ratio_vs_baseline")
38
+ baseline_final = (
39
+ baseline_ref.get("primary_metric", {}).get("final")
40
+ if isinstance(baseline_ref, dict)
41
+ else None
42
+ )
43
+
44
+ def _is_finite(value: Any) -> bool:
45
+ return isinstance(value, (int, float)) and math.isfinite(float(value))
46
+
47
+ baseline_has_reference = _is_finite(baseline_final)
48
+ needs_pm_fallback = not (_is_finite(preview_val) and _is_finite(final_val))
49
+ needs_ratio_fallback = baseline_has_reference and not _is_finite(ratio_val)
50
+
51
+ if degraded_reason is None:
52
+ if needs_pm_fallback:
53
+ degraded_reason = "non_finite_pm"
54
+ elif needs_ratio_fallback:
55
+ degraded_reason = "non_finite_delta"
56
+ elif pm_copy.get("invalid"):
57
+ degraded_reason = "primary_metric_invalid"
58
+
59
+ pm_copy["degraded"] = bool(
60
+ pm_copy.get("degraded") or pm_copy.get("invalid") or degraded_reason
61
+ )
62
+ if pm_copy["degraded"] and degraded_reason:
63
+ pm_copy.setdefault("degraded_reason", degraded_reason)
64
+
33
65
  # Propagate instability hint from ppl_analysis
34
66
  try:
35
67
  if isinstance(ppl_analysis, dict) and bool(
@@ -75,33 +107,52 @@ def attach_primary_metric(
75
107
  pm_copy["analysis_point_final"] = float(mean_fin)
76
108
  # Attach analysis-basis CIs for preview/final in log space from report metrics
77
109
  try:
78
- dlci = (
79
- _coerce_interval(m.get("logloss_delta_ci"))
80
- if isinstance(m, dict)
81
- else (math.nan, math.nan)
82
- )
83
- if isinstance(dlci, tuple | list) and len(dlci) == 2:
84
- lo, hi = float(dlci[0]), float(dlci[1])
85
- if math.isfinite(lo) and math.isfinite(hi):
86
- pm_copy.setdefault("ci", (lo, hi))
110
+ dlci_source: tuple[float, float] | list[float] | None = None
111
+ pairing_source = None
112
+ if isinstance(ppl_analysis, dict):
113
+ stats = ppl_analysis.get("stats") or {}
114
+ if isinstance(stats, dict):
115
+ pairing_source = stats.get("pairing")
116
+ if pairing_source == "paired_baseline":
117
+ dlci_source = _coerce_interval(
118
+ ppl_analysis.get("logloss_delta_ci")
119
+ )
120
+ if dlci_source is None:
121
+ dlci_source = (
122
+ _coerce_interval(m.get("logloss_delta_ci"))
123
+ if isinstance(m, dict)
124
+ else (math.nan, math.nan)
125
+ )
126
+ if (
127
+ isinstance(dlci_source, tuple | list)
128
+ and len(dlci_source) == 2
129
+ ):
130
+ lo_raw, hi_raw = dlci_source[0], dlci_source[1]
131
+ if isinstance(lo_raw, (int, float)) and isinstance(
132
+ hi_raw, (int, float)
133
+ ):
134
+ lo, hi = float(lo_raw), float(hi_raw)
135
+ if math.isfinite(lo) and math.isfinite(hi):
136
+ pm_copy.setdefault("ci", (lo, hi))
87
137
  except Exception:
88
138
  pass
89
139
  except Exception:
90
140
  pass
91
141
  # Ensure ratio_vs_baseline present and consistent
92
142
  try:
93
- base_final = (
94
- baseline_ref.get("primary_metric", {}).get("final")
95
- if isinstance(baseline_ref, dict)
143
+ fin = pm_copy.get("final")
144
+ baseline_final_val = (
145
+ float(baseline_final)
146
+ if isinstance(baseline_final, (int, float))
147
+ and _is_finite(baseline_final)
96
148
  else None
97
149
  )
98
- fin = pm_copy.get("final")
99
150
  if (
100
- isinstance(fin, int | float)
101
- and isinstance(base_final, int | float)
102
- and float(base_final) > 0
151
+ isinstance(fin, (int, float))
152
+ and baseline_final_val is not None
153
+ and baseline_final_val > 0
103
154
  ):
104
- pm_copy["ratio_vs_baseline"] = float(fin) / float(base_final)
155
+ pm_copy["ratio_vs_baseline"] = float(fin) / baseline_final_val
105
156
  # Ensure display_ci aligns with log-space CI for ppl-like metrics
106
157
  try:
107
158
  kind = str(pm_copy.get("kind", "")).lower()
@@ -277,6 +328,9 @@ def attach_primary_metric(
277
328
  if isinstance(point, float):
278
329
  pm["display_ci"] = [point, point]
279
330
  else:
331
+ # As last resort, emit a degenerate [1.0, 1.0] to satisfy schema invariants
280
332
  pm["display_ci"] = [1.0, 1.0]
333
+ pm.setdefault("estimated", True)
334
+
281
335
  except Exception:
282
336
  pass