invarlock 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. invarlock/__init__.py +2 -2
  2. invarlock/_data/runtime/tiers.yaml +57 -30
  3. invarlock/adapters/__init__.py +11 -15
  4. invarlock/adapters/auto.py +35 -40
  5. invarlock/adapters/capabilities.py +2 -2
  6. invarlock/adapters/hf_causal.py +418 -0
  7. invarlock/adapters/{hf_onnx.py → hf_causal_onnx.py} +3 -3
  8. invarlock/adapters/hf_mixin.py +25 -4
  9. invarlock/adapters/{hf_bert.py → hf_mlm.py} +4 -11
  10. invarlock/adapters/{hf_t5.py → hf_seq2seq.py} +9 -9
  11. invarlock/calibration/spectral_null.py +15 -10
  12. invarlock/calibration/variance_ve.py +0 -2
  13. invarlock/cli/adapter_auto.py +31 -21
  14. invarlock/cli/app.py +73 -2
  15. invarlock/cli/commands/calibrate.py +6 -2
  16. invarlock/cli/commands/certify.py +651 -91
  17. invarlock/cli/commands/doctor.py +11 -11
  18. invarlock/cli/commands/explain_gates.py +57 -8
  19. invarlock/cli/commands/plugins.py +13 -9
  20. invarlock/cli/commands/report.py +233 -69
  21. invarlock/cli/commands/run.py +1066 -244
  22. invarlock/cli/commands/verify.py +154 -15
  23. invarlock/cli/config.py +22 -6
  24. invarlock/cli/doctor_helpers.py +4 -5
  25. invarlock/cli/output.py +193 -0
  26. invarlock/cli/provenance.py +1 -1
  27. invarlock/core/api.py +45 -5
  28. invarlock/core/auto_tuning.py +65 -20
  29. invarlock/core/bootstrap.py +1 -1
  30. invarlock/core/contracts.py +7 -1
  31. invarlock/core/registry.py +11 -13
  32. invarlock/core/runner.py +425 -75
  33. invarlock/edits/quant_rtn.py +65 -37
  34. invarlock/eval/bench.py +3 -16
  35. invarlock/eval/data.py +82 -51
  36. invarlock/eval/metrics.py +63 -2
  37. invarlock/eval/primary_metric.py +23 -0
  38. invarlock/eval/tail_stats.py +230 -0
  39. invarlock/eval/tasks/__init__.py +12 -0
  40. invarlock/eval/tasks/classification.py +48 -0
  41. invarlock/eval/tasks/qa.py +36 -0
  42. invarlock/eval/tasks/text_generation.py +102 -0
  43. invarlock/guards/_estimators.py +154 -0
  44. invarlock/guards/invariants.py +19 -10
  45. invarlock/guards/policies.py +16 -6
  46. invarlock/guards/rmt.py +627 -546
  47. invarlock/guards/spectral.py +348 -110
  48. invarlock/guards/tier_config.py +32 -30
  49. invarlock/guards/variance.py +7 -31
  50. invarlock/guards_ref/rmt_ref.py +23 -23
  51. invarlock/model_profile.py +90 -42
  52. invarlock/observability/health.py +6 -6
  53. invarlock/observability/metrics.py +108 -0
  54. invarlock/reporting/certificate.py +384 -55
  55. invarlock/reporting/certificate_schema.py +3 -2
  56. invarlock/reporting/dataset_hashing.py +15 -2
  57. invarlock/reporting/guards_analysis.py +350 -277
  58. invarlock/reporting/html.py +55 -5
  59. invarlock/reporting/normalizer.py +13 -0
  60. invarlock/reporting/policy_utils.py +38 -36
  61. invarlock/reporting/primary_metric_utils.py +71 -17
  62. invarlock/reporting/render.py +852 -431
  63. invarlock/reporting/report.py +40 -4
  64. invarlock/reporting/report_types.py +11 -3
  65. invarlock/reporting/telemetry.py +86 -0
  66. invarlock/reporting/validate.py +1 -18
  67. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/METADATA +27 -13
  68. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/RECORD +72 -65
  69. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/WHEEL +1 -1
  70. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/entry_points.txt +5 -3
  71. invarlock/adapters/hf_gpt2.py +0 -404
  72. invarlock/adapters/hf_llama.py +0 -487
  73. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/licenses/LICENSE +0 -0
  74. {invarlock-0.3.5.dist-info → invarlock-0.3.7.dist-info}/top_level.txt +0 -0
@@ -1,18 +1,32 @@
1
1
  # mypy: ignore-errors
2
2
  from __future__ import annotations
3
3
 
4
+ import hashlib
5
+ import json
4
6
  import math
5
7
  from typing import Any, no_type_check
6
8
 
7
9
  from invarlock.core.auto_tuning import get_tier_policies
8
10
 
9
- from .policy_utils import _promote_legacy_multiple_testing_key, _resolve_policy_tier
11
+ from .policy_utils import _resolve_policy_tier
10
12
  from .report_types import RunReport
11
13
 
12
14
 
15
+ def _measurement_contract_digest(contract: Any) -> str | None:
16
+ if not isinstance(contract, dict) or not contract:
17
+ return None
18
+ try:
19
+ canonical = json.dumps(contract, sort_keys=True, default=str)
20
+ except Exception:
21
+ return None
22
+ return hashlib.sha256(canonical.encode()).hexdigest()[:16]
23
+
24
+
13
25
  @no_type_check
14
- def _extract_invariants(report: RunReport) -> dict[str, Any]:
15
- """Extract invariant check results (matches legacy shape used in tests)."""
26
+ def _extract_invariants(
27
+ report: RunReport, baseline: RunReport | None = None
28
+ ) -> dict[str, Any]:
29
+ """Extract invariant check results (matches the shape used in tests)."""
16
30
  invariants_data = (report.get("metrics", {}) or {}).get("invariants", {})
17
31
  failures: list[dict[str, Any]] = []
18
32
  summary: dict[str, Any] = {}
@@ -69,6 +83,108 @@ def _extract_invariants(report: RunReport) -> dict[str, Any]:
69
83
  guard_entry = guard
70
84
  break
71
85
 
86
+ baseline_guard_entry = None
87
+ if baseline is not None:
88
+ for guard in baseline.get("guards", []) or []:
89
+ if str(guard.get("name", "")).lower() == "invariants":
90
+ baseline_guard_entry = guard
91
+ break
92
+
93
+ def _coerce_checks(value: Any) -> dict[str, Any] | None:
94
+ return value if isinstance(value, dict) else None
95
+
96
+ def _extract_guard_checks(
97
+ entry: Any,
98
+ ) -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
99
+ if not isinstance(entry, dict):
100
+ return None, None
101
+ details = entry.get("details")
102
+ if not isinstance(details, dict):
103
+ return None, None
104
+ return _coerce_checks(details.get("baseline_checks")), _coerce_checks(
105
+ details.get("current_checks")
106
+ )
107
+
108
+ def _compare_invariants(
109
+ baseline_checks: dict[str, Any],
110
+ current_checks: dict[str, Any],
111
+ ) -> tuple[list[dict[str, Any]], int, int]:
112
+ violations: list[dict[str, Any]] = []
113
+
114
+ # LayerNorm coverage check
115
+ baseline_layer_norms = set(baseline_checks.get("layer_norm_paths", ()))
116
+ current_layer_norms = set(current_checks.get("layer_norm_paths", ()))
117
+ missing_layer_norms = sorted(baseline_layer_norms - current_layer_norms)
118
+ if missing_layer_norms:
119
+ violations.append(
120
+ {
121
+ "type": "layer_norm_missing",
122
+ "missing": missing_layer_norms,
123
+ "message": "Expected LayerNorm modules are missing vs baseline",
124
+ }
125
+ )
126
+
127
+ # Tokenizer / vocab alignment
128
+ baseline_vocab_sizes = baseline_checks.get("embedding_vocab_sizes")
129
+ current_vocab_sizes = current_checks.get("embedding_vocab_sizes")
130
+ if isinstance(baseline_vocab_sizes, dict):
131
+ for module_name, baseline_size in baseline_vocab_sizes.items():
132
+ current_size = None
133
+ if isinstance(current_vocab_sizes, dict):
134
+ current_size = current_vocab_sizes.get(module_name)
135
+ if current_size is None or int(current_size) != int(baseline_size):
136
+ mismatch = {
137
+ "module": module_name,
138
+ "baseline": int(baseline_size),
139
+ "current": None if current_size is None else int(current_size),
140
+ }
141
+ violations.append(
142
+ {
143
+ "type": "tokenizer_mismatch",
144
+ "message": "Embedding vocabulary size changed vs baseline",
145
+ **mismatch,
146
+ }
147
+ )
148
+
149
+ handled_keys = {
150
+ "layer_norm_paths",
151
+ "embedding_vocab_sizes",
152
+ "config_vocab_size",
153
+ }
154
+ for check_name, baseline_value in baseline_checks.items():
155
+ if check_name in handled_keys:
156
+ continue
157
+ current_value = current_checks.get(check_name)
158
+ if current_value != baseline_value:
159
+ violations.append(
160
+ {
161
+ "type": "invariant_violation",
162
+ "check": check_name,
163
+ "baseline": baseline_value,
164
+ "current": current_value,
165
+ "message": (
166
+ f"Invariant {check_name} changed from {baseline_value} to {current_value}"
167
+ ),
168
+ }
169
+ )
170
+
171
+ fatal_violation_types = {"tokenizer_mismatch"}
172
+ fatal_count = 0
173
+ warning_count = 0
174
+ annotated: list[dict[str, Any]] = []
175
+ for violation in violations:
176
+ violation_type = str(violation.get("type") or "")
177
+ severity = "fatal" if violation_type in fatal_violation_types else "warning"
178
+ annotated_violation = dict(violation)
179
+ annotated_violation.setdefault("severity", severity)
180
+ annotated.append(annotated_violation)
181
+ if severity == "fatal":
182
+ fatal_count += 1
183
+ else:
184
+ warning_count += 1
185
+
186
+ return annotated, fatal_count, warning_count
187
+
72
188
  severity_status = "pass"
73
189
  if guard_entry:
74
190
  gm = guard_entry.get("metrics", {}) or {}
@@ -96,9 +212,51 @@ def _extract_invariants(report: RunReport) -> dict[str, Any]:
96
212
  if detail:
97
213
  row["detail"] = detail
98
214
  failures.append(row)
99
- if fatal_count > 0:
215
+ base_fatal = 0
216
+ base_warn = 0
217
+ baseline_failures: list[dict[str, Any]] = []
218
+ if baseline_guard_entry is not None:
219
+ baseline_pre, baseline_post = _extract_guard_checks(baseline_guard_entry)
220
+ current_pre, current_post = _extract_guard_checks(guard_entry)
221
+ baseline_snapshot = baseline_pre or baseline_post
222
+ current_snapshot = current_post or current_pre
223
+ if isinstance(baseline_snapshot, dict) and isinstance(
224
+ current_snapshot, dict
225
+ ):
226
+ baseline_failures, base_fatal, base_warn = _compare_invariants(
227
+ baseline_snapshot, current_snapshot
228
+ )
229
+ for violation in baseline_failures:
230
+ check_name = violation.get("check")
231
+ if not check_name:
232
+ check_name = (
233
+ violation.get("module")
234
+ or violation.get("type")
235
+ or "invariant"
236
+ )
237
+ row = {
238
+ "check": str(check_name),
239
+ "type": str(violation.get("type") or "violation"),
240
+ "severity": str(violation.get("severity") or "warning"),
241
+ }
242
+ detail = {k: v for k, v in violation.items() if k not in row}
243
+ if detail:
244
+ detail.setdefault("source", "baseline_compare")
245
+ row["detail"] = detail
246
+ failures.append(row)
247
+
248
+ fatal_total = fatal_count + base_fatal
249
+ warn_total = warning_count + base_warn
250
+ try:
251
+ summary["fatal_violations"] = fatal_total
252
+ summary["warning_violations"] = warn_total
253
+ summary["violations_found"] = fatal_total + warn_total
254
+ except Exception:
255
+ pass
256
+
257
+ if fatal_total > 0:
100
258
  severity_status = "fail"
101
- elif warning_count > 0 or violations:
259
+ elif warn_total > 0 or violations:
102
260
  severity_status = "warn"
103
261
 
104
262
  # If any error-severity entry exists among failures, escalate to fail
@@ -118,12 +276,16 @@ def _extract_invariants(report: RunReport) -> dict[str, Any]:
118
276
  "warning_violations": len(failures),
119
277
  }
120
278
 
279
+ details_out = invariants_data
280
+ if not details_out and guard_entry and isinstance(guard_entry.get("details"), dict):
281
+ details_out = guard_entry.get("details", {})
282
+
121
283
  return {
122
284
  "pre": "pass",
123
285
  "post": status,
124
286
  "status": status,
125
287
  "summary": summary,
126
- "details": invariants_data,
288
+ "details": details_out,
127
289
  "failures": failures,
128
290
  }
129
291
 
@@ -299,10 +461,10 @@ def _extract_spectral_analysis(
299
461
  families: dict[str, dict[str, Any]] = {}
300
462
  family_caps: dict[str, dict[str, float]] = {}
301
463
  top_z_scores: dict[str, list[dict[str, Any]]] = {}
464
+ deadband_used: float | None = None
302
465
 
303
466
  if isinstance(guard_metrics, dict):
304
467
  # Resolve deadband from policy/metrics/defaults
305
- deadband_used: float | None = None
306
468
  try:
307
469
  db_raw = guard_policy.get("deadband") if guard_policy else None
308
470
  if db_raw is None and isinstance(guard_metrics, dict):
@@ -314,16 +476,12 @@ def _extract_spectral_analysis(
314
476
  except Exception:
315
477
  deadband_used = None
316
478
 
317
- # Resolve sigma_quantile for summary (policy aliases supported)
479
+ # Resolve sigma_quantile for summary
318
480
  sigma_q_used: float | None = None
319
481
  try:
320
482
  pol_sq = None
321
483
  if isinstance(guard_policy, dict):
322
- pol_sq = (
323
- guard_policy.get("sigma_quantile")
324
- or guard_policy.get("contraction")
325
- or guard_policy.get("kappa")
326
- )
484
+ pol_sq = guard_policy.get("sigma_quantile")
327
485
  if pol_sq is None:
328
486
  pol_sq = default_sigma_quantile
329
487
  if pol_sq is not None:
@@ -371,7 +529,7 @@ def _extract_spectral_analysis(
371
529
  else {}
372
530
  )
373
531
  if not families:
374
- # Prefer z-summary when available; accept legacy 'family_stats' too
532
+ # Prefer z-summary when available; accept 'family_stats' too
375
533
  fzs = guard_metrics.get("family_z_summary")
376
534
  if not isinstance(fzs, dict) or not fzs:
377
535
  fzs = guard_metrics.get("family_stats")
@@ -493,7 +651,7 @@ def _extract_spectral_analysis(
493
651
  for source in sources:
494
652
  if not isinstance(source, dict):
495
653
  continue
496
- candidate = source.get("multiple_testing") or source.get("multipletesting")
654
+ candidate = source.get("multiple_testing")
497
655
  if isinstance(candidate, dict) and candidate:
498
656
  return candidate
499
657
  return None
@@ -505,20 +663,13 @@ def _extract_spectral_analysis(
505
663
  policy_out: dict[str, Any] | None = None
506
664
  if isinstance(guard_policy, dict) and guard_policy:
507
665
  policy_out = dict(guard_policy)
508
- _promote_legacy_multiple_testing_key(policy_out)
509
666
  if default_sigma_quantile is not None:
510
- sq = (
511
- policy_out.get("sigma_quantile")
512
- or policy_out.get("contraction")
513
- or policy_out.get("kappa")
514
- )
667
+ sq = policy_out.get("sigma_quantile")
515
668
  if sq is not None:
516
669
  try:
517
670
  policy_out["sigma_quantile"] = float(sq)
518
671
  except Exception:
519
672
  pass
520
- policy_out.pop("contraction", None)
521
- policy_out.pop("kappa", None)
522
673
  if tier == "balanced":
523
674
  policy_out["correction_enabled"] = False
524
675
  policy_out["max_spectral_norm"] = None
@@ -532,7 +683,7 @@ def _extract_spectral_analysis(
532
683
  "families": families,
533
684
  "family_caps": family_caps,
534
685
  }
535
- # Attach status to summary for backward-compatibility in tests
686
+ # Surface a stable/capped status on the summary for schema parity.
536
687
  try:
537
688
  summary["status"] = "stable" if int(caps_applied) == 0 else "capped"
538
689
  except Exception:
@@ -594,6 +745,40 @@ def _extract_spectral_analysis(
594
745
  result["top_violations"] = top_violations
595
746
  if family_quantiles:
596
747
  result["family_z_quantiles"] = family_quantiles
748
+ result["evaluated"] = bool(spectral_guard)
749
+
750
+ measurement_contract = None
751
+ try:
752
+ mc = (
753
+ guard_metrics.get("measurement_contract")
754
+ if isinstance(guard_metrics, dict)
755
+ else None
756
+ )
757
+ if isinstance(mc, dict) and mc:
758
+ measurement_contract = mc
759
+ except Exception:
760
+ measurement_contract = None
761
+ baseline_contract = None
762
+ try:
763
+ bc = (
764
+ baseline_spectral.get("measurement_contract")
765
+ if isinstance(baseline_spectral, dict)
766
+ else None
767
+ )
768
+ if isinstance(bc, dict) and bc:
769
+ baseline_contract = bc
770
+ except Exception:
771
+ baseline_contract = None
772
+ mc_hash = _measurement_contract_digest(measurement_contract)
773
+ baseline_hash = _measurement_contract_digest(baseline_contract)
774
+ if measurement_contract is not None:
775
+ result["measurement_contract"] = measurement_contract
776
+ if mc_hash:
777
+ result["measurement_contract_hash"] = mc_hash
778
+ if baseline_hash:
779
+ result["baseline_measurement_contract_hash"] = baseline_hash
780
+ if mc_hash and baseline_hash:
781
+ result["measurement_contract_match"] = bool(mc_hash == baseline_hash)
597
782
  result["caps_exceeded"] = bool(caps_exceeded)
598
783
  try:
599
784
  summary["caps_exceeded"] = bool(caps_exceeded)
@@ -624,24 +809,22 @@ def _extract_spectral_analysis(
624
809
  def _extract_rmt_analysis(
625
810
  report: RunReport, baseline: dict[str, Any]
626
811
  ) -> dict[str, Any]:
812
+ """Extract RMT analysis using activation edge-risk ε-band semantics."""
627
813
  tier = _resolve_policy_tier(report)
628
814
  tier_policies = get_tier_policies()
629
815
  tier_defaults = tier_policies.get(tier, tier_policies.get("balanced", {}))
816
+
630
817
  default_epsilon_map = (
631
818
  tier_defaults.get("rmt", {}).get("epsilon_by_family")
632
819
  if isinstance(tier_defaults, dict)
633
820
  else {}
634
821
  )
635
- if not default_epsilon_map and isinstance(tier_defaults, dict):
636
- default_epsilon_map = (tier_defaults.get("rmt", {}) or {}).get("epsilon", {})
637
822
  default_epsilon_map = {
638
823
  str(family): float(value)
639
824
  for family, value in (default_epsilon_map or {}).items()
640
- if isinstance(value, int | float)
825
+ if isinstance(value, int | float) and math.isfinite(float(value))
641
826
  }
642
827
 
643
- outliers_guarded = 0
644
- outliers_bare = 0
645
828
  epsilon_default = 0.1
646
829
  try:
647
830
  eps_def = (
@@ -653,278 +836,168 @@ def _extract_rmt_analysis(
653
836
  epsilon_default = float(eps_def)
654
837
  except Exception:
655
838
  pass
656
- stable = True
657
- explicit_stability = False
658
- max_ratio = 0.0
659
- max_deviation_ratio = 1.0
660
- mean_deviation_ratio = 1.0
661
- epsilon_map: dict[str, float] = {}
662
- baseline_outliers_per_family: dict[str, int] = {}
663
- outliers_per_family: dict[str, int] = {}
664
- epsilon_violations: list[Any] = []
665
- margin_used = None
666
- deadband_used = None
667
- policy_out: dict[str, Any] | None = None
668
839
 
840
+ baseline_rmt = baseline.get("rmt", {}) if isinstance(baseline, dict) else {}
841
+ baseline_edge_by_family: dict[str, float] = {}
842
+ baseline_contract = None
843
+ if isinstance(baseline_rmt, dict) and baseline_rmt:
844
+ bc = baseline_rmt.get("measurement_contract")
845
+ if isinstance(bc, dict) and bc:
846
+ baseline_contract = bc
847
+ base = baseline_rmt.get("edge_risk_by_family") or baseline_rmt.get(
848
+ "edge_risk_by_family_base"
849
+ )
850
+ if isinstance(base, dict):
851
+ for k, v in base.items():
852
+ if isinstance(v, int | float) and math.isfinite(float(v)):
853
+ baseline_edge_by_family[str(k)] = float(v)
854
+
855
+ rmt_guard = None
856
+ guard_metrics: dict[str, Any] = {}
857
+ guard_policy: dict[str, Any] = {}
669
858
  for guard in report.get("guards", []) or []:
670
859
  if str(guard.get("name", "")).lower() == "rmt":
860
+ rmt_guard = guard
671
861
  guard_metrics = guard.get("metrics", {}) or {}
672
862
  guard_policy = guard.get("policy", {}) or {}
673
- if isinstance(guard_policy, dict) and guard_policy:
674
- policy_out = dict(guard_policy)
675
- if "epsilon_by_family" not in policy_out and isinstance(
676
- policy_out.get("epsilon"), dict
677
- ):
678
- policy_out["epsilon_by_family"] = dict(policy_out["epsilon"])
679
- if isinstance(policy_out.get("margin"), int | float) and math.isfinite(
680
- float(policy_out.get("margin"))
681
- ):
682
- margin_used = float(policy_out.get("margin"))
683
- if isinstance(
684
- policy_out.get("deadband"), int | float
685
- ) and math.isfinite(float(policy_out.get("deadband"))):
686
- deadband_used = float(policy_out.get("deadband"))
687
- if isinstance(
688
- policy_out.get("epsilon_default"), int | float
689
- ) and math.isfinite(float(policy_out.get("epsilon_default"))):
690
- epsilon_default = float(policy_out.get("epsilon_default"))
691
- if isinstance(
692
- guard_metrics.get("epsilon_default"), int | float
693
- ) and math.isfinite(float(guard_metrics.get("epsilon_default"))):
694
- epsilon_default = float(guard_metrics.get("epsilon_default"))
695
- outliers_guarded = guard_metrics.get(
696
- "rmt_outliers", guard_metrics.get("layers_flagged", outliers_guarded)
697
- )
698
- max_ratio = guard_metrics.get("max_ratio", 0.0)
699
- epsilon_map = guard_metrics.get("epsilon_by_family", {}) or epsilon_map
700
- if not epsilon_map and isinstance(guard_policy, dict):
701
- eps_src = guard_policy.get("epsilon_by_family") or guard_policy.get(
702
- "epsilon"
703
- )
704
- if isinstance(eps_src, dict):
705
- try:
706
- epsilon_map = {
707
- str(k): float(v)
708
- for k, v in eps_src.items()
709
- if isinstance(v, int | float) and math.isfinite(float(v))
710
- }
711
- except Exception:
712
- pass
713
- baseline_outliers_per_family = (
714
- guard_metrics.get("baseline_outliers_per_family", {})
715
- or baseline_outliers_per_family
716
- )
717
- outliers_per_family = (
718
- guard_metrics.get("outliers_per_family", {}) or outliers_per_family
719
- )
720
- epsilon_violations = guard_metrics.get(
721
- "epsilon_violations", epsilon_violations
722
- )
723
- if outliers_per_family:
724
- outliers_guarded = sum(
725
- int(v)
726
- for v in outliers_per_family.values()
727
- if isinstance(v, int | float)
728
- )
729
- if baseline_outliers_per_family:
730
- outliers_bare = sum(
731
- int(v)
732
- for v in baseline_outliers_per_family.values()
733
- if isinstance(v, int | float)
734
- )
735
- flagged_rate = guard_metrics.get("flagged_rate", 0.0)
736
- stable = flagged_rate <= 0.5
737
- max_mp_ratio = guard_metrics.get("max_mp_ratio_final", 0.0)
738
- mean_mp_ratio = guard_metrics.get("mean_mp_ratio_final", 0.0)
739
-
740
- baseline_max = None
741
- baseline_mean = None
742
- baseline_rmt = baseline.get("rmt", {}) if isinstance(baseline, dict) else {}
743
- if baseline_rmt:
744
- baseline_max = baseline_rmt.get(
745
- "max_mp_ratio", baseline_rmt.get("max_mp_ratio_final")
746
- )
747
- baseline_mean = baseline_rmt.get(
748
- "mean_mp_ratio", baseline_rmt.get("mean_mp_ratio_final")
749
- )
750
- outliers_bare = baseline_rmt.get(
751
- "outliers", baseline_rmt.get("rmt_outliers", 0)
752
- )
753
- if baseline_max is None:
754
- baseline_metrics = (
755
- baseline.get("metrics", {}) if isinstance(baseline, dict) else {}
756
- )
757
- if "rmt" in baseline_metrics:
758
- baseline_rmt_metrics = baseline_metrics["rmt"]
759
- baseline_max = baseline_rmt_metrics.get("max_mp_ratio_final")
760
- baseline_mean = baseline_rmt_metrics.get("mean_mp_ratio_final")
761
- if baseline_max is None and isinstance(guard.get("baseline_metrics"), dict):
762
- gb = guard.get("baseline_metrics")
763
- baseline_max = gb.get("max_mp_ratio")
764
- baseline_mean = gb.get("mean_mp_ratio")
765
- if baseline_max is not None and baseline_max > 0:
766
- max_deviation_ratio = max_mp_ratio / baseline_max
767
- else:
768
- max_deviation_ratio = 1.0
769
- if baseline_mean is not None and baseline_mean > 0:
770
- mean_deviation_ratio = mean_mp_ratio / baseline_mean
771
- else:
772
- mean_deviation_ratio = 1.0
773
- if isinstance(guard_metrics.get("stable"), bool):
774
- stable = bool(guard_metrics.get("stable"))
775
- explicit_stability = True
776
863
  break
777
864
 
778
- # Fallback: use metrics.rmt and/or top-level rmt section when guard is absent
779
- if outliers_guarded == 0:
780
- rmt_metrics = (report.get("metrics", {}) or {}).get("rmt", {})
781
- if isinstance(rmt_metrics, dict):
782
- try:
783
- outliers_guarded = int(rmt_metrics.get("outliers", 0) or 0)
784
- except Exception:
785
- outliers_guarded = 0
786
- if isinstance(rmt_metrics.get("stable"), bool):
787
- stable = bool(rmt_metrics.get("stable"))
788
- explicit_stability = True
789
- rmt_top = report.get("rmt", {}) if isinstance(report.get("rmt"), dict) else {}
790
- if isinstance(rmt_top, dict):
791
- fams = rmt_top.get("families", {})
792
- if isinstance(fams, dict) and fams:
793
- for fam, rec in fams.items():
794
- if not isinstance(rec, dict):
795
- continue
796
- try:
797
- outliers_per_family[str(fam)] = int(
798
- rec.get("outliers_guarded", 0) or 0
799
- )
800
- baseline_outliers_per_family[str(fam)] = int(
801
- rec.get("outliers_bare", 0) or 0
802
- )
803
- if rec.get("epsilon") is not None:
804
- try:
805
- epsilon_map[str(fam)] = float(rec.get("epsilon"))
806
- except Exception:
807
- pass
808
- except Exception:
809
- continue
810
- try:
811
- if outliers_bare == 0:
812
- outliers_bare = int(rmt_top.get("outliers", 0) or 0)
813
- except Exception:
814
- pass
865
+ policy_out: dict[str, Any] | None = None
866
+ if isinstance(guard_policy, dict) and guard_policy:
867
+ policy_out = dict(guard_policy)
868
+ if isinstance(policy_out.get("epsilon_default"), int | float) and math.isfinite(
869
+ float(policy_out.get("epsilon_default"))
870
+ ):
871
+ epsilon_default = float(policy_out.get("epsilon_default"))
815
872
 
816
- # If stability not explicitly provided, derive from outlier behavior
817
- if not explicit_stability:
818
- try:
819
- if outliers_guarded == 0 and outliers_bare == 0:
820
- stable = True
821
- elif outliers_guarded <= outliers_bare:
822
- stable = True
823
- else:
824
- stable = (outliers_guarded - outliers_bare) / max(
825
- outliers_bare, 1
826
- ) <= 0.5
827
- except Exception:
828
- pass
873
+ if isinstance(guard_metrics.get("epsilon_default"), int | float) and math.isfinite(
874
+ float(guard_metrics.get("epsilon_default"))
875
+ ):
876
+ epsilon_default = float(guard_metrics.get("epsilon_default"))
829
877
 
830
- delta_per_family = {
831
- k: int(outliers_per_family.get(k, 0))
832
- - int(baseline_outliers_per_family.get(k, 0))
833
- for k in set(outliers_per_family) | set(baseline_outliers_per_family)
834
- }
835
- delta_total = int(outliers_guarded) - int(outliers_bare)
836
- # Conservative baseline fallback when not available
837
- if outliers_bare == 0 and outliers_guarded > 0:
838
- # Assume baseline had fewer outliers to make acceptance harder
839
- outliers_bare = max(0, outliers_guarded - 1)
840
-
841
- # Recompute stability from epsilon rule when not explicitly provided
842
- if not explicit_stability:
843
- try:
844
- if outliers_per_family and baseline_outliers_per_family:
845
- families_union = set(outliers_per_family) | set(
846
- baseline_outliers_per_family
878
+ edge_base: dict[str, float] = {}
879
+ edge_cur: dict[str, float] = {}
880
+ if isinstance(guard_metrics, dict) and guard_metrics:
881
+ base = guard_metrics.get("edge_risk_by_family_base") or {}
882
+ cur = guard_metrics.get("edge_risk_by_family") or {}
883
+ if isinstance(base, dict):
884
+ for k, v in base.items():
885
+ if isinstance(v, int | float) and math.isfinite(float(v)):
886
+ edge_base[str(k)] = float(v)
887
+ if isinstance(cur, dict):
888
+ for k, v in cur.items():
889
+ if isinstance(v, int | float) and math.isfinite(float(v)):
890
+ edge_cur[str(k)] = float(v)
891
+ if not edge_base and baseline_edge_by_family:
892
+ edge_base = dict(baseline_edge_by_family)
893
+
894
+ epsilon_map: dict[str, float] = {}
895
+ eps_src = guard_metrics.get("epsilon_by_family") or {}
896
+ if not eps_src and isinstance(guard_policy, dict):
897
+ eps_src = guard_policy.get("epsilon_by_family") or {}
898
+ if isinstance(eps_src, dict):
899
+ for k, v in eps_src.items():
900
+ if isinstance(v, int | float) and math.isfinite(float(v)):
901
+ epsilon_map[str(k)] = float(v)
902
+
903
+ epsilon_violations = guard_metrics.get("epsilon_violations") or []
904
+ if not (isinstance(epsilon_violations, list) and epsilon_violations):
905
+ epsilon_violations = []
906
+ families = set(edge_cur) | set(edge_base)
907
+ for family in families:
908
+ base = float(edge_base.get(family, 0.0) or 0.0)
909
+ cur = float(edge_cur.get(family, 0.0) or 0.0)
910
+ if base <= 0.0:
911
+ continue
912
+ eps = float(
913
+ epsilon_map.get(
914
+ family, default_epsilon_map.get(family, epsilon_default)
847
915
  )
848
- checks: list[bool] = []
849
- for fam in families_union:
850
- guarded = int(outliers_per_family.get(fam, 0) or 0)
851
- bare = int(baseline_outliers_per_family.get(fam, 0) or 0)
852
- eps_val = float(epsilon_map.get(fam, epsilon_default))
853
- allowed = math.ceil(bare * (1.0 + eps_val))
854
- checks.append(guarded <= allowed)
855
- if checks:
856
- stable = all(checks)
857
- elif outliers_bare > 0:
858
- stable = outliers_guarded <= (
859
- outliers_bare * (1.0 + float(epsilon_default))
916
+ )
917
+ allowed = (1.0 + eps) * base
918
+ if cur > allowed:
919
+ delta = (cur / base) - 1.0 if base > 0 else float("inf")
920
+ epsilon_violations.append(
921
+ {
922
+ "family": family,
923
+ "edge_base": base,
924
+ "edge_cur": cur,
925
+ "delta": float(delta),
926
+ "allowed": allowed,
927
+ "epsilon": eps,
928
+ }
860
929
  )
861
- except Exception:
862
- pass
863
-
864
- # Compute epsilon scalar (fallback) and detailed family breakdown
865
- if epsilon_map:
866
- epsilon_scalar = max(float(v) for v in epsilon_map.values())
867
- elif default_epsilon_map:
868
- try:
869
- epsilon_scalar = max(float(v) for v in default_epsilon_map.values())
870
- except Exception:
871
- epsilon_scalar = float(epsilon_default)
872
- else:
873
- epsilon_scalar = float(epsilon_default)
874
- try:
875
- epsilon_scalar = round(float(epsilon_scalar), 3)
876
- except Exception:
877
- epsilon_scalar = float(epsilon_default)
878
930
 
879
- def _to_int(v: Any) -> int:
880
- try:
881
- return int(v)
882
- except (TypeError, ValueError):
883
- return 0
931
+ stable = bool(guard_metrics.get("stable", not epsilon_violations))
884
932
 
885
- families = (
886
- set(outliers_per_family) | set(baseline_outliers_per_family) | set(epsilon_map)
933
+ families_all = sorted(
934
+ set(edge_base) | set(edge_cur) | set(epsilon_map) | set(default_epsilon_map)
887
935
  )
888
- family_breakdown = {
889
- family: {
890
- "bare": _to_int(baseline_outliers_per_family.get(family, 0)),
891
- "guarded": _to_int(outliers_per_family.get(family, 0)),
892
- "epsilon": float(epsilon_map.get(family, epsilon_scalar)),
936
+ family_breakdown: dict[str, dict[str, Any]] = {}
937
+ ratios: list[float] = []
938
+ deltas: list[float] = []
939
+ for family in families_all:
940
+ base = float(edge_base.get(family, 0.0) or 0.0)
941
+ cur = float(edge_cur.get(family, 0.0) or 0.0)
942
+ eps = float(
943
+ epsilon_map.get(family, default_epsilon_map.get(family, epsilon_default))
944
+ )
945
+ allowed = (1.0 + eps) * base if base > 0.0 else None
946
+ ratio = (cur / base) if base > 0.0 else None
947
+ delta = ((cur / base) - 1.0) if base > 0.0 else None
948
+ if isinstance(ratio, float) and math.isfinite(ratio):
949
+ ratios.append(ratio)
950
+ if isinstance(delta, float) and math.isfinite(delta):
951
+ deltas.append(delta)
952
+ family_breakdown[family] = {
953
+ "edge_base": base,
954
+ "edge_cur": cur,
955
+ "epsilon": eps,
956
+ "allowed": allowed,
957
+ "ratio": ratio,
958
+ "delta": delta,
893
959
  }
894
- for family in sorted(families)
895
- }
896
960
 
897
- # Stringify per-family dict keys for stability
898
- outliers_per_family = {str(k): _to_int(v) for k, v in outliers_per_family.items()}
899
- baseline_outliers_per_family = {
900
- str(k): _to_int(v) for k, v in baseline_outliers_per_family.items()
901
- }
902
- delta_per_family = {str(k): _to_int(v) for k, v in delta_per_family.items()}
961
+ measurement_contract = None
962
+ try:
963
+ mc = (
964
+ guard_metrics.get("measurement_contract")
965
+ if isinstance(guard_metrics, dict)
966
+ else None
967
+ )
968
+ if isinstance(mc, dict) and mc:
969
+ measurement_contract = mc
970
+ except Exception:
971
+ measurement_contract = None
972
+
973
+ mc_hash = _measurement_contract_digest(measurement_contract)
974
+ baseline_hash = _measurement_contract_digest(baseline_contract)
903
975
 
904
- result = {
905
- "outliers_bare": outliers_bare,
906
- "outliers_guarded": outliers_guarded,
907
- "epsilon": epsilon_scalar,
976
+ result: dict[str, Any] = {
977
+ "tier": tier,
978
+ "edge_risk_by_family_base": dict(edge_base),
979
+ "edge_risk_by_family": dict(edge_cur),
908
980
  "epsilon_default": float(epsilon_default),
909
- "epsilon_by_family": epsilon_map,
910
- "outliers_per_family": outliers_per_family,
911
- "baseline_outliers_per_family": baseline_outliers_per_family,
912
- "delta_per_family": delta_per_family,
913
- "delta_total": delta_total,
914
- "epsilon_violations": epsilon_violations,
981
+ "epsilon_by_family": dict(epsilon_map),
982
+ "epsilon_violations": list(epsilon_violations),
915
983
  "stable": stable,
916
984
  "status": "stable" if stable else "unstable",
917
- "max_ratio": max_ratio,
918
- "max_deviation_ratio": max_deviation_ratio,
919
- "mean_deviation_ratio": mean_deviation_ratio,
985
+ "max_edge_ratio": max(ratios) if ratios else None,
986
+ "max_edge_delta": max(deltas) if deltas else None,
987
+ "mean_edge_delta": (sum(deltas) / len(deltas)) if deltas else None,
920
988
  "families": family_breakdown,
989
+ "evaluated": bool(rmt_guard),
921
990
  }
922
- if margin_used is not None:
923
- result["margin"] = float(margin_used)
924
- if deadband_used is not None:
925
- result["deadband"] = float(deadband_used)
926
991
  if policy_out:
927
992
  result["policy"] = policy_out
993
+ if measurement_contract is not None:
994
+ result["measurement_contract"] = measurement_contract
995
+ if mc_hash:
996
+ result["measurement_contract_hash"] = mc_hash
997
+ if baseline_hash:
998
+ result["baseline_measurement_contract_hash"] = baseline_hash
999
+ if mc_hash and baseline_hash:
1000
+ result["measurement_contract_match"] = bool(mc_hash == baseline_hash)
928
1001
  return result
929
1002
 
930
1003