invarlock-0.3.6-py3-none-any.whl → invarlock-0.3.7-py3-none-any.whl
This diff compares the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- invarlock/__init__.py +2 -2
- invarlock/adapters/__init__.py +10 -14
- invarlock/adapters/auto.py +35 -40
- invarlock/adapters/capabilities.py +2 -2
- invarlock/adapters/hf_causal.py +418 -0
- invarlock/adapters/{hf_onnx.py → hf_causal_onnx.py} +3 -3
- invarlock/adapters/hf_mixin.py +25 -4
- invarlock/adapters/{hf_bert.py → hf_mlm.py} +4 -11
- invarlock/adapters/{hf_t5.py → hf_seq2seq.py} +9 -9
- invarlock/cli/adapter_auto.py +31 -21
- invarlock/cli/app.py +73 -2
- invarlock/cli/commands/certify.py +600 -59
- invarlock/cli/commands/doctor.py +8 -10
- invarlock/cli/commands/plugins.py +13 -9
- invarlock/cli/commands/report.py +233 -69
- invarlock/cli/commands/run.py +907 -183
- invarlock/cli/commands/verify.py +76 -11
- invarlock/cli/config.py +1 -1
- invarlock/cli/doctor_helpers.py +4 -5
- invarlock/cli/output.py +193 -0
- invarlock/cli/provenance.py +1 -1
- invarlock/core/bootstrap.py +1 -1
- invarlock/core/registry.py +9 -11
- invarlock/core/runner.py +111 -25
- invarlock/edits/quant_rtn.py +65 -37
- invarlock/eval/bench.py +3 -3
- invarlock/eval/data.py +68 -23
- invarlock/eval/metrics.py +59 -1
- invarlock/eval/tasks/__init__.py +12 -0
- invarlock/eval/tasks/classification.py +48 -0
- invarlock/eval/tasks/qa.py +36 -0
- invarlock/eval/tasks/text_generation.py +102 -0
- invarlock/guards/invariants.py +19 -10
- invarlock/guards/rmt.py +2 -2
- invarlock/guards/variance.py +2 -2
- invarlock/model_profile.py +48 -27
- invarlock/observability/health.py +6 -6
- invarlock/observability/metrics.py +108 -0
- invarlock/reporting/certificate.py +159 -9
- invarlock/reporting/certificate_schema.py +1 -1
- invarlock/reporting/guards_analysis.py +154 -4
- invarlock/reporting/html.py +55 -5
- invarlock/reporting/normalizer.py +7 -0
- invarlock/reporting/render.py +791 -431
- invarlock/reporting/report.py +39 -3
- invarlock/reporting/report_types.py +6 -1
- invarlock/reporting/telemetry.py +86 -0
- {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/METADATA +23 -9
- {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/RECORD +53 -48
- {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/WHEEL +1 -1
- {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/entry_points.txt +5 -3
- invarlock/adapters/hf_gpt2.py +0 -404
- invarlock/adapters/hf_llama.py +0 -487
- {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/licenses/LICENSE +0 -0
- {invarlock-0.3.6.dist-info → invarlock-0.3.7.dist-info}/top_level.txt +0 -0
invarlock/reporting/certificate.py
CHANGED

@@ -1,8 +1,9 @@
 """
-InvarLock
-
+InvarLock Evaluation Certificate Generation
+==========================================
 
-Generate standardized
+Generate standardized evaluation certificates from RunReport and baseline
+comparison.
 Certificates are standalone, portable verification artifacts that can be used
 for CI/CD gates and regulatory compliance.
 """

@@ -743,7 +744,7 @@ def make_certificate(
     baseline: RunReport | dict[str, Any],
 ) -> dict[str, Any]:
     """
-    Generate
+    Generate an evaluation certificate from a RunReport and baseline comparison.
 
     The certificate is a standalone, portable artifact that contains all
     essential metrics and comparisons needed for safety verification.

@@ -764,6 +765,17 @@ def make_certificate(
     # Normalize baseline input
     baseline_raw = baseline
     baseline_normalized = _normalize_baseline(baseline_raw)
+    baseline_report: RunReport | None = None
+    try:
+        if (
+            isinstance(baseline_raw, dict)
+            and "meta" in baseline_raw
+            and "metrics" in baseline_raw
+            and "edit" in baseline_raw
+        ):
+            baseline_report = _normalize_and_validate_report(baseline_raw)
+    except Exception:  # pragma: no cover - baseline compare is best-effort
+        baseline_report = None
 
     # Extract core metadata with full seed bundle
     meta = _extract_certificate_meta(report)

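The shape check above is deliberately conservative: a baseline dict is only promoted to a validated RunReport when all three top-level sections are present, and any failure falls back silently to `None`. A minimal standalone sketch of the same gate (the helper name is hypothetical):

```python
from typing import Any

def looks_like_run_report(obj: Any) -> bool:
    # Hypothetical mirror of the check above: require the three sections
    # before attempting full normalization/validation.
    return isinstance(obj, dict) and all(
        key in obj for key in ("meta", "metrics", "edit")
    )

print(looks_like_run_report({"meta": {}, "metrics": {}, "edit": {}}))  # True
print(looks_like_run_report({"metrics": {}}))  # False -> baseline compare skipped
```
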
@@ -1440,7 +1452,7 @@ def make_certificate(
         ppl_analysis["window_plan"] = window_plan_ctx
 
     # Extract invariant status
-    invariants = _extract_invariants(report)
+    invariants = _extract_invariants(report, baseline=baseline_report)
 
     # Extract spectral analysis
     spectral = _extract_spectral_analysis(report, baseline_normalized)

@@ -1558,7 +1570,13 @@ def make_certificate(
     telemetry: dict[str, Any] = {}
     metrics_section = report.get("metrics", {})
     if isinstance(metrics_section, dict):
-        for key in (
+        for key in (
+            "latency_ms_per_tok",
+            "memory_mb_peak",
+            "gpu_memory_mb_peak",
+            "gpu_memory_reserved_mb_peak",
+            "throughput_tok_per_s",
+        ):
             value = metrics_section.get(key)
             if isinstance(value, int | float) and math.isfinite(value):
                 telemetry[key] = float(value)

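The telemetry block is populated defensively: only whitelisted keys carrying finite numeric values survive. A standalone sketch of that filter with toy metric values:

```python
import math

TELEMETRY_KEYS = (
    "latency_ms_per_tok",
    "memory_mb_peak",
    "gpu_memory_mb_peak",
    "gpu_memory_reserved_mb_peak",
    "throughput_tok_per_s",
)

metrics = {"latency_ms_per_tok": 12.5, "memory_mb_peak": float("nan"), "other": 1}
telemetry: dict[str, float] = {}
for key in TELEMETRY_KEYS:
    value = metrics.get(key)
    # NaN/inf and non-numeric values are dropped, as in the hunk above.
    if isinstance(value, (int, float)) and math.isfinite(value):
        telemetry[key] = float(value)
print(telemetry)  # {'latency_ms_per_tok': 12.5}
```
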
@@ -1755,6 +1773,7 @@ def make_certificate(
     capacity_examples = None
 
     pm_acceptance_range = _resolve_pm_acceptance_range_from_report(report)
+    pm_drift_band = _resolve_pm_drift_band_from_report(report)
 
     # Primary metric tail evidence and gate evaluation (ΔlogNLL vs baseline, per-window).
     pm_tail_result: dict[str, Any] = {}

@@ -1881,6 +1900,12 @@ def make_certificate(
     except Exception:  # pragma: no cover - defensive against patched functions
         validation_kwargs["pm_acceptance_range"] = pm_acceptance_range
 
+    try:
+        if "pm_drift_band" in inspect.signature(_compute_validation_flags).parameters:
+            validation_kwargs["pm_drift_band"] = pm_drift_band
+    except Exception:  # pragma: no cover - defensive against patched functions
+        validation_kwargs["pm_drift_band"] = pm_drift_band
+
     try:
         if "pm_tail" in inspect.signature(_compute_validation_flags).parameters:
             validation_kwargs["pm_tail"] = pm_tail_result

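The `inspect.signature` probe is a compatibility shim: the new keyword is forwarded only when the target function (which tests may monkeypatch) actually declares it, and the `except` branch forwards it anyway when the signature cannot be introspected. The same pattern in isolation:

```python
import inspect
from typing import Any

def target(ppl: dict, *, pm_drift_band: dict | None = None) -> bool:
    # Stand-in for _compute_validation_flags; accepts the new keyword.
    return pm_drift_band is not None

kwargs: dict[str, Any] = {}
pm_drift_band = {"min": 0.95, "max": 1.05}
try:
    if "pm_drift_band" in inspect.signature(target).parameters:
        kwargs["pm_drift_band"] = pm_drift_band
except Exception:  # e.g. C callables or mocks without an introspectable signature
    kwargs["pm_drift_band"] = pm_drift_band

print(target({}, **kwargs))  # True
```
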
@@ -2176,6 +2201,13 @@ def make_certificate(
     from .primary_metric_utils import attach_primary_metric as _attach_pm
 
     _attach_pm(certificate, report, baseline_raw, baseline_ref, ppl_analysis)
+    try:
+        if isinstance(pm_drift_band, dict) and pm_drift_band:
+            pm_block = certificate.get("primary_metric")
+            if isinstance(pm_block, dict):
+                pm_block.setdefault("drift_band", dict(pm_drift_band))
+    except Exception:  # pragma: no cover
+        pass
     _enforce_display_ci_alignment(
         ratio_ci_source,
         certificate.get("primary_metric"),

@@ -2614,7 +2646,7 @@ def _extract_edit_metadata(
         alg_lower = str(algorithm).strip().lower()
     except Exception:  # pragma: no cover
         alg_lower = ""
-    allowed_algorithms = {"quant_rtn", "noop"}
+    allowed_algorithms = {"quant_rtn", "noop", "custom"}
     if alg_lower not in allowed_algorithms:
         algorithm = ""
 

@@ -3225,6 +3257,105 @@ def _resolve_pm_acceptance_range_from_report(
     return {"min": float(min_val), "max": float(max_val)}
 
 
+def _resolve_pm_drift_band_from_report(
+    report: dict[str, Any] | None,
+) -> dict[str, float]:
+    """Resolve preview→final drift band from report context/meta/env."""
+
+    base_min = 0.95
+    base_max = 1.05
+
+    def _safe_float(val: Any) -> float | None:
+        try:
+            if val is None:
+                return None
+            out = float(val)
+        except Exception:
+            return None
+        return out if math.isfinite(out) else None
+
+    cfg_min = None
+    cfg_max = None
+
+    ctx = report.get("context") if isinstance(report, dict) else None
+    if isinstance(ctx, dict):
+        pm_ctx = ctx.get("primary_metric")
+        if isinstance(pm_ctx, dict):
+            band = pm_ctx.get("drift_band")
+            if isinstance(band, dict):
+                cfg_min = _safe_float(band.get("min"))
+                cfg_max = _safe_float(band.get("max"))
+            elif isinstance(band, list | tuple) and len(band) == 2:
+                cfg_min = _safe_float(band[0])
+                cfg_max = _safe_float(band[1])
+        if cfg_min is None or cfg_max is None:
+            alt = ctx.get("pm_drift_band")
+            if isinstance(alt, dict):
+                cfg_min = (
+                    cfg_min if cfg_min is not None else _safe_float(alt.get("min"))
+                )
+                cfg_max = (
+                    cfg_max if cfg_max is not None else _safe_float(alt.get("max"))
+                )
+
+    if (cfg_min is None or cfg_max is None) and isinstance(report, dict):
+        meta = report.get("meta")
+        if isinstance(meta, dict):
+            meta_band = meta.get("pm_drift_band")
+            if isinstance(meta_band, dict):
+                cfg_min = (
+                    cfg_min
+                    if cfg_min is not None
+                    else _safe_float(meta_band.get("min"))
+                )
+                cfg_max = (
+                    cfg_max
+                    if cfg_max is not None
+                    else _safe_float(meta_band.get("max"))
+                )
+
+    def _parse_env(name: str) -> float | None:
+        try:
+            raw = os.environ.get(name, "")
+            if raw is None or str(raw).strip() == "":
+                return None
+            return float(raw)
+        except Exception:
+            return None
+
+    env_min = _parse_env("INVARLOCK_PM_DRIFT_MIN")
+    env_max = _parse_env("INVARLOCK_PM_DRIFT_MAX")
+
+    has_explicit = any(v is not None for v in (cfg_min, cfg_max, env_min, env_max))
+    if not has_explicit:
+        return {}
+
+    min_val = (
+        env_min if env_min is not None else cfg_min if cfg_min is not None else base_min
+    )
+    max_val = (
+        env_max if env_max is not None else cfg_max if cfg_max is not None else base_max
+    )
+
+    try:
+        if min_val is not None and min_val <= 0:
+            min_val = base_min
+    except Exception:
+        min_val = base_min
+    try:
+        if max_val is not None and max_val <= 0:
+            max_val = base_max
+    except Exception:
+        max_val = base_max
+    try:
+        if min_val is not None and max_val is not None and min_val >= max_val:
+            min_val, max_val = base_min, base_max
+    except Exception:
+        min_val, max_val = base_min, base_max
+
+    return {"min": float(min_val), "max": float(max_val)}
+
+
 def _compute_validation_flags(
     ppl: dict[str, Any],
     spectral: dict[str, Any],

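The helper's precedence is: the `INVARLOCK_PM_DRIFT_MIN`/`INVARLOCK_PM_DRIFT_MAX` environment variables override the report's context/meta band, which overrides the 0.95-1.05 defaults, and an empty dict is returned when nothing explicit is configured. A condensed, standalone sketch of that resolution order:

```python
import os

BASE_MIN, BASE_MAX = 0.95, 1.05

def resolve_band(cfg_min: float | None = None, cfg_max: float | None = None) -> dict:
    def _env(name: str) -> float | None:
        raw = os.environ.get(name, "").strip()
        try:
            return float(raw) if raw else None
        except ValueError:
            return None

    env_min, env_max = _env("INVARLOCK_PM_DRIFT_MIN"), _env("INVARLOCK_PM_DRIFT_MAX")
    if all(v is None for v in (cfg_min, cfg_max, env_min, env_max)):
        return {}  # nothing explicit: caller keeps the built-in gate
    min_val = env_min if env_min is not None else (cfg_min if cfg_min is not None else BASE_MIN)
    max_val = env_max if env_max is not None else (cfg_max if cfg_max is not None else BASE_MAX)
    # Degenerate bands fall back per bound, then as a pair.
    if min_val <= 0:
        min_val = BASE_MIN
    if max_val <= 0:
        max_val = BASE_MAX
    if min_val >= max_val:
        min_val, max_val = BASE_MIN, BASE_MAX
    return {"min": float(min_val), "max": float(max_val)}

os.environ["INVARLOCK_PM_DRIFT_MAX"] = "1.10"
print(resolve_band(cfg_min=0.90))  # {'min': 0.9, 'max': 1.1}: env beats config for max
```
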
@@ -3238,6 +3369,7 @@ def _compute_validation_flags(
     moe: dict[str, Any] | None = None,
     dataset_capacity: dict[str, Any] | None = None,
     pm_acceptance_range: dict[str, float] | None = None,
+    pm_drift_band: dict[str, float] | None = None,
     pm_tail: dict[str, Any] | None = None,
 ) -> dict[str, bool]:
     """Compute validation flags for the certificate including canonical gates."""

@@ -3301,9 +3433,27 @@
         ratio_limit = min(ratio_limit, float(target_ratio))
 
     # Canonical Gates
-    # 1. Drift gate: 0.95 ≤ final/preview ≤ 1.05
+    # 1. Drift gate: by default 0.95 ≤ final/preview ≤ 1.05 (configurable)
     drift_ratio = ppl.get("preview_final_ratio", 1.0)
-
+    drift_min = 0.95
+    drift_max = 1.05
+    if isinstance(pm_drift_band, dict):
+        try:
+            cand_min = pm_drift_band.get("min")
+            cand_max = pm_drift_band.get("max")
+            if isinstance(cand_min, int | float) and isinstance(cand_max, int | float):
+                cand_min_f = float(cand_min)
+                cand_max_f = float(cand_max)
+                if (
+                    math.isfinite(cand_min_f)
+                    and math.isfinite(cand_max_f)
+                    and 0 < cand_min_f < cand_max_f
+                ):
+                    drift_min = cand_min_f
+                    drift_max = cand_max_f
+        except Exception:  # pragma: no cover
+            pass
+    preview_final_drift_acceptable = drift_min <= drift_ratio <= drift_max
     if _tiny_relax:
         # Treat drift identity as informational in tiny dev demos
         preview_final_drift_acceptable = True

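With the band resolved, the gate itself stays a plain interval test on the preview→final ratio; for example:

```python
# Candidate bands are only honored when finite and 0 < min < max,
# otherwise the 0.95-1.05 defaults stay in force (as in the hunk above).
drift_min, drift_max = 0.95, 1.05
band = {"min": 0.90, "max": 1.10}  # e.g. resolved from config or env
if 0 < band["min"] < band["max"]:
    drift_min, drift_max = band["min"], band["max"]

for drift_ratio in (0.97, 1.08, 1.12):
    ok = drift_min <= drift_ratio <= drift_max
    print(drift_ratio, "PASS" if ok else "FAIL")  # 1.08 passes only with the wider band
```
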
invarlock/reporting/certificate_schema.py
CHANGED

@@ -20,7 +20,7 @@ CERTIFICATE_SCHEMA_VERSION = "v1"
 # separately in metric-specific logic.
 CERTIFICATE_JSON_SCHEMA: dict[str, Any] = {
     "$schema": "https://json-schema.org/draft/2020-12/schema",
-    "title": "InvarLock
+    "title": "InvarLock Evaluation Certificate",
     "type": "object",
     "required": [
         "schema_version",

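Because the schema is a standard JSON Schema (draft 2020-12) document, certificates can be checked with any off-the-shelf validator. A sketch, assuming the third-party `jsonschema` package and that the constant is importable from `invarlock.reporting.certificate_schema`:

```python
from jsonschema import Draft202012Validator  # third-party; pip install jsonschema

from invarlock.reporting.certificate_schema import CERTIFICATE_JSON_SCHEMA

validator = Draft202012Validator(CERTIFICATE_JSON_SCHEMA)
certificate = {"schema_version": "v1"}  # toy payload; real certificates carry more
for error in validator.iter_errors(certificate):
    print(error.message)  # reports any missing required fields
```
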
invarlock/reporting/guards_analysis.py
CHANGED

@@ -23,7 +23,9 @@ def _measurement_contract_digest(contract: Any) -> str | None:
 
 
 @no_type_check
-def _extract_invariants(
+def _extract_invariants(
+    report: RunReport, baseline: RunReport | None = None
+) -> dict[str, Any]:
     """Extract invariant check results (matches the shape used in tests)."""
     invariants_data = (report.get("metrics", {}) or {}).get("invariants", {})
     failures: list[dict[str, Any]] = []

@@ -81,6 +83,108 @@ def _extract_invariants(report: RunReport) -> dict[str, Any]:
             guard_entry = guard
             break
 
+    baseline_guard_entry = None
+    if baseline is not None:
+        for guard in baseline.get("guards", []) or []:
+            if str(guard.get("name", "")).lower() == "invariants":
+                baseline_guard_entry = guard
+                break
+
+    def _coerce_checks(value: Any) -> dict[str, Any] | None:
+        return value if isinstance(value, dict) else None
+
+    def _extract_guard_checks(
+        entry: Any,
+    ) -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
+        if not isinstance(entry, dict):
+            return None, None
+        details = entry.get("details")
+        if not isinstance(details, dict):
+            return None, None
+        return _coerce_checks(details.get("baseline_checks")), _coerce_checks(
+            details.get("current_checks")
+        )
+
+    def _compare_invariants(
+        baseline_checks: dict[str, Any],
+        current_checks: dict[str, Any],
+    ) -> tuple[list[dict[str, Any]], int, int]:
+        violations: list[dict[str, Any]] = []
+
+        # LayerNorm coverage check
+        baseline_layer_norms = set(baseline_checks.get("layer_norm_paths", ()))
+        current_layer_norms = set(current_checks.get("layer_norm_paths", ()))
+        missing_layer_norms = sorted(baseline_layer_norms - current_layer_norms)
+        if missing_layer_norms:
+            violations.append(
+                {
+                    "type": "layer_norm_missing",
+                    "missing": missing_layer_norms,
+                    "message": "Expected LayerNorm modules are missing vs baseline",
+                }
+            )
+
+        # Tokenizer / vocab alignment
+        baseline_vocab_sizes = baseline_checks.get("embedding_vocab_sizes")
+        current_vocab_sizes = current_checks.get("embedding_vocab_sizes")
+        if isinstance(baseline_vocab_sizes, dict):
+            for module_name, baseline_size in baseline_vocab_sizes.items():
+                current_size = None
+                if isinstance(current_vocab_sizes, dict):
+                    current_size = current_vocab_sizes.get(module_name)
+                if current_size is None or int(current_size) != int(baseline_size):
+                    mismatch = {
+                        "module": module_name,
+                        "baseline": int(baseline_size),
+                        "current": None if current_size is None else int(current_size),
+                    }
+                    violations.append(
+                        {
+                            "type": "tokenizer_mismatch",
+                            "message": "Embedding vocabulary size changed vs baseline",
+                            **mismatch,
+                        }
+                    )
+
+        handled_keys = {
+            "layer_norm_paths",
+            "embedding_vocab_sizes",
+            "config_vocab_size",
+        }
+        for check_name, baseline_value in baseline_checks.items():
+            if check_name in handled_keys:
+                continue
+            current_value = current_checks.get(check_name)
+            if current_value != baseline_value:
+                violations.append(
+                    {
+                        "type": "invariant_violation",
+                        "check": check_name,
+                        "baseline": baseline_value,
+                        "current": current_value,
+                        "message": (
+                            f"Invariant {check_name} changed from {baseline_value} to {current_value}"
+                        ),
+                    }
+                )
+
+        fatal_violation_types = {"tokenizer_mismatch"}
+        fatal_count = 0
+        warning_count = 0
+        annotated: list[dict[str, Any]] = []
+        for violation in violations:
+            violation_type = str(violation.get("type") or "")
+            severity = "fatal" if violation_type in fatal_violation_types else "warning"
+            annotated_violation = dict(violation)
+            annotated_violation.setdefault("severity", severity)
+            annotated.append(annotated_violation)
+            if severity == "fatal":
+                fatal_count += 1
+            else:
+                warning_count += 1
+
+        return annotated, fatal_count, warning_count
+
     severity_status = "pass"
     if guard_entry:
         gm = guard_entry.get("metrics", {}) or {}

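To illustrate the comparison rules above (vocab-size changes are fatal, everything else warns), here is a toy pair of snapshots and the violations `_compare_invariants` would report; the check keys are illustrative:

```python
baseline_checks = {
    "layer_norm_paths": ["h.0.ln_1", "h.0.ln_2"],
    "embedding_vocab_sizes": {"wte": 50257},
    "tied_embeddings": True,  # illustrative extra check key
}
current_checks = {
    "layer_norm_paths": ["h.0.ln_1"],         # ln_2 disappeared
    "embedding_vocab_sizes": {"wte": 50304},  # vocab resized
    "tied_embeddings": False,                 # generic drift
}
# _compare_invariants(baseline_checks, current_checks) would yield:
#   layer_norm_missing   severity=warning  missing=['h.0.ln_2']
#   tokenizer_mismatch   severity=fatal    baseline=50257 current=50304
#   invariant_violation  severity=warning  check='tied_embeddings'
# i.e. fatal_count == 1 and warning_count == 2, so the invariants
# status escalates to "fail" in the wiring below.
```
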
@@ -108,9 +212,51 @@ def _extract_invariants(report: RunReport) -> dict[str, Any]:
             if detail:
                 row["detail"] = detail
             failures.append(row)
-
+    base_fatal = 0
+    base_warn = 0
+    baseline_failures: list[dict[str, Any]] = []
+    if baseline_guard_entry is not None:
+        baseline_pre, baseline_post = _extract_guard_checks(baseline_guard_entry)
+        current_pre, current_post = _extract_guard_checks(guard_entry)
+        baseline_snapshot = baseline_pre or baseline_post
+        current_snapshot = current_post or current_pre
+        if isinstance(baseline_snapshot, dict) and isinstance(
+            current_snapshot, dict
+        ):
+            baseline_failures, base_fatal, base_warn = _compare_invariants(
+                baseline_snapshot, current_snapshot
+            )
+            for violation in baseline_failures:
+                check_name = violation.get("check")
+                if not check_name:
+                    check_name = (
+                        violation.get("module")
+                        or violation.get("type")
+                        or "invariant"
+                    )
+                row = {
+                    "check": str(check_name),
+                    "type": str(violation.get("type") or "violation"),
+                    "severity": str(violation.get("severity") or "warning"),
+                }
+                detail = {k: v for k, v in violation.items() if k not in row}
+                if detail:
+                    detail.setdefault("source", "baseline_compare")
+                    row["detail"] = detail
+                failures.append(row)
+
+    fatal_total = fatal_count + base_fatal
+    warn_total = warning_count + base_warn
+    try:
+        summary["fatal_violations"] = fatal_total
+        summary["warning_violations"] = warn_total
+        summary["violations_found"] = fatal_total + warn_total
+    except Exception:
+        pass
+
+    if fatal_total > 0:
         severity_status = "fail"
-    elif
+    elif warn_total > 0 or violations:
         severity_status = "warn"
 
     # If any error-severity entry exists among failures, escalate to fail

@@ -130,12 +276,16 @@ def _extract_invariants(report: RunReport) -> dict[str, Any]:
         "warning_violations": len(failures),
     }
 
+    details_out = invariants_data
+    if not details_out and guard_entry and isinstance(guard_entry.get("details"), dict):
+        details_out = guard_entry.get("details", {})
+
     return {
         "pre": "pass",
         "post": status,
         "status": status,
         "summary": summary,
-        "details":
+        "details": details_out,
         "failures": failures,
     }

invarlock/reporting/html.py
CHANGED

@@ -12,19 +12,69 @@ from typing import Any
 
 from .render import render_certificate_markdown
 
+markdown_module: Any | None = None
+try:
+    import markdown as _markdown  # type: ignore[import-untyped]
+except Exception:  # pragma: no cover - optional dependency
+    _markdown = None
+else:
+    markdown_module = _markdown
+
+
+_STATUS_BADGES = {
+    "\u2705 PASS": '<span class="badge pass">PASS</span>',
+    "\u2705 OK": '<span class="badge pass">OK</span>',
+    "\u274c FAIL": '<span class="badge fail">FAIL</span>',
+    "\u26a0\ufe0f WARN": '<span class="badge warn">WARN</span>',
+    "\u26a0 WARN": '<span class="badge warn">WARN</span>',
+}
+
+
+def _apply_status_badges(html_body: str) -> str:
+    updated = html_body
+    for token, replacement in _STATUS_BADGES.items():
+        updated = updated.replace(token, replacement)
+    return updated
+
 
 def render_certificate_html(certificate: dict[str, Any]) -> str:
     """Render a certificate as a simple HTML document.
 
-    Uses the Markdown renderer and
-
+    Uses the Markdown renderer and converts to HTML when available, falling back
+    to a <pre> block when the markdown dependency is missing.
     """
     md = render_certificate_markdown(certificate)
-
+    if markdown_module is None:
+        body = f'<pre class="invarlock-md">{escape(md)}</pre>'
+    else:
+        html_body = markdown_module.markdown(md, extensions=["tables", "fenced_code"])
+        html_body = _apply_status_badges(html_body)
+        body = f'<div class="invarlock-md">{html_body}</div>'
     return (
         '<!DOCTYPE html><html><head><meta charset="utf-8">'
-        "<title>InvarLock
-        "<style>
+        "<title>InvarLock Evaluation Certificate</title>"
+        "<style>"
+        ":root{--pass:#2da44e;--fail:#cf222e;--warn:#bf8700;--ink:#1f2328;"
+        "--muted:#57606a;--panel:#f6f8fa;--border:#d0d7de}"
+        "body{font-family:ui-sans-serif,system-ui,-apple-system,Segoe UI,sans-serif;"
+        "color:var(--ink);background:linear-gradient(180deg,#fff, #f6f8fa);"
+        "margin:0;padding:32px}"
+        ".invarlock-md{max-width:960px;margin:0 auto;padding:24px;background:#fff;"
+        "border:1px solid var(--border);border-radius:16px;box-shadow:0 10px 30px rgba(0,0,0,0.05)}"
+        "h1,h2,h3{margin-top:1.4em}h1{margin-top:0}"
+        "table{border-collapse:collapse;width:100%;margin:12px 0}"
+        "th,td{border:1px solid var(--border);padding:6px 8px;text-align:left}"
+        "code,pre{background:var(--panel);border-radius:8px}"
+        "pre{padding:12px;overflow:auto}"
+        ".badge{display:inline-block;padding:2px 8px;border-radius:999px;"
+        "font-size:0.75rem;font-weight:700;letter-spacing:0.02em;color:#fff}"
+        ".badge.pass{background:var(--pass)}"
+        ".badge.fail{background:var(--fail)}"
+        ".badge.warn{background:var(--warn)}"
+        "@media print{body{background:#fff;padding:0}.invarlock-md{box-shadow:none;"
+        "border:0}a{color:inherit;text-decoration:none}.badge{color:#000;"
+        "border:1px solid #000;background:transparent}}"
+        "</style>"
         "</head><body>" + body + "</body></html>"
     )

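Call-site usage is unchanged; a short sketch that exercises both paths of the new optional-dependency handling:

```python
from invarlock.reporting.html import render_certificate_html

certificate = {"schema_version": "v1"}  # toy payload
html = render_certificate_html(certificate)
# With `markdown` installed: tables and fenced code render, and status strings
# such as "✅ PASS" become <span class="badge pass">PASS</span> pills.
# Without it: the Markdown source is escaped into a <pre class="invarlock-md"> block.
with open("certificate.html", "w", encoding="utf-8") as fh:
    fh.write(html)
```
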
invarlock/reporting/normalizer.py
CHANGED

@@ -55,6 +55,8 @@ def normalize_run_report(report: Mapping[str, Any] | RunReport) -> RunReport:
     }
     # Preserve additional provenance knobs used by certificate/digests.
     for key in (
+        "pm_acceptance_range",
+        "pm_drift_band",
         "policy_overrides",
         "overrides",
         "plugins",

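The preserve loop is a plain copy-through; a sketch of its effect, with `src` standing in for the incoming mapping and `out` for the normalized report (a simplification of what normalize_run_report does internally):

```python
PRESERVED_KEYS = (
    "pm_acceptance_range",
    "pm_drift_band",
    "policy_overrides",
    "overrides",
    "plugins",
)

src = {"pm_drift_band": {"min": 0.9, "max": 1.1}, "unrelated": 1}
out: dict = {}
for key in PRESERVED_KEYS:
    if key in src:  # assumed guard; the real loop may differ in detail
        out[key] = src[key]
print(out)  # {'pm_drift_band': {'min': 0.9, 'max': 1.1}}
```
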
@@ -179,6 +181,11 @@ def normalize_run_report(report: Mapping[str, Any] | RunReport) -> RunReport:
         "latency_ms_p50",
         "latency_ms_p95",
         "memory_mb_peak",
+        "gpu_memory_mb_peak",
+        "gpu_memory_reserved_mb_peak",
+        "timings",
+        "guard_timings",
+        "memory_snapshots",
         "throughput_sps",
         "spectral",
         "rmt",