invarlock-0.3.6-py3-none-any.whl → invarlock-0.3.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- invarlock/__init__.py +4 -4
- invarlock/adapters/__init__.py +10 -14
- invarlock/adapters/auto.py +37 -50
- invarlock/adapters/capabilities.py +2 -2
- invarlock/adapters/hf_causal.py +418 -0
- invarlock/adapters/{hf_onnx.py → hf_causal_onnx.py} +3 -3
- invarlock/adapters/hf_loading.py +7 -7
- invarlock/adapters/hf_mixin.py +53 -9
- invarlock/adapters/{hf_bert.py → hf_mlm.py} +4 -11
- invarlock/adapters/{hf_t5.py → hf_seq2seq.py} +9 -9
- invarlock/assurance/__init__.py +15 -23
- invarlock/cli/adapter_auto.py +32 -26
- invarlock/cli/app.py +128 -27
- invarlock/cli/commands/__init__.py +2 -2
- invarlock/cli/commands/calibrate.py +48 -4
- invarlock/cli/commands/doctor.py +8 -10
- invarlock/cli/commands/evaluate.py +986 -0
- invarlock/cli/commands/explain_gates.py +25 -17
- invarlock/cli/commands/export_html.py +11 -9
- invarlock/cli/commands/plugins.py +13 -9
- invarlock/cli/commands/report.py +326 -92
- invarlock/cli/commands/run.py +1160 -228
- invarlock/cli/commands/verify.py +157 -97
- invarlock/cli/config.py +1 -1
- invarlock/cli/determinism.py +1 -1
- invarlock/cli/doctor_helpers.py +4 -5
- invarlock/cli/output.py +193 -0
- invarlock/cli/provenance.py +4 -4
- invarlock/core/bootstrap.py +1 -1
- invarlock/core/registry.py +9 -11
- invarlock/core/retry.py +14 -14
- invarlock/core/runner.py +112 -26
- invarlock/edits/noop.py +2 -2
- invarlock/edits/quant_rtn.py +67 -39
- invarlock/eval/__init__.py +1 -1
- invarlock/eval/bench.py +14 -10
- invarlock/eval/data.py +68 -23
- invarlock/eval/metrics.py +59 -1
- invarlock/eval/primary_metric.py +1 -1
- invarlock/eval/tasks/__init__.py +12 -0
- invarlock/eval/tasks/classification.py +48 -0
- invarlock/eval/tasks/qa.py +36 -0
- invarlock/eval/tasks/text_generation.py +102 -0
- invarlock/guards/invariants.py +19 -10
- invarlock/guards/rmt.py +2 -2
- invarlock/guards/spectral.py +1 -1
- invarlock/guards/variance.py +2 -2
- invarlock/model_profile.py +64 -62
- invarlock/observability/health.py +6 -6
- invarlock/observability/metrics.py +108 -0
- invarlock/plugins/hf_bnb_adapter.py +32 -21
- invarlock/reporting/__init__.py +18 -4
- invarlock/reporting/guards_analysis.py +154 -4
- invarlock/reporting/html.py +61 -11
- invarlock/reporting/normalizer.py +9 -2
- invarlock/reporting/policy_utils.py +1 -1
- invarlock/reporting/primary_metric_utils.py +11 -11
- invarlock/reporting/render.py +876 -510
- invarlock/reporting/report.py +72 -30
- invarlock/reporting/{certificate.py → report_builder.py} +252 -99
- invarlock/reporting/{certificate_schema.py → report_schema.py} +22 -22
- invarlock/reporting/report_types.py +6 -1
- invarlock/reporting/telemetry.py +86 -0
- invarlock-0.3.8.dist-info/METADATA +283 -0
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/RECORD +69 -64
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/WHEEL +1 -1
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/entry_points.txt +5 -3
- invarlock/adapters/hf_gpt2.py +0 -404
- invarlock/adapters/hf_llama.py +0 -487
- invarlock/cli/commands/certify.py +0 -422
- invarlock-0.3.6.dist-info/METADATA +0 -588
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/licenses/LICENSE +0 -0
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/top_level.txt +0 -0

invarlock/plugins/hf_bnb_adapter.py
CHANGED

```diff
@@ -7,7 +7,7 @@ through Transformers. Requires GPU for practical use.
 Install with the `gpu` extra on supported platforms.
 
 This adapter handles both:
-1. Fresh quantization of FP16 models (
+1. Fresh quantization of FP16 models (via quantization_config)
 2. Loading pre-quantized BNB checkpoints (auto-detected via quantization_config)
 """
 
@@ -55,16 +55,17 @@ def _detect_pre_quantized_bnb(model_id: str) -> tuple[bool, int]:
         if not quant_cfg:
             return False, 0
 
-        # Check for BNB quantization
-        quant_method = quant_cfg.get("quant_method", "").lower()
-        if
+        # Check for BNB quantization. Prefer explicit bits, then legacy flags.
+        quant_method = str(quant_cfg.get("quant_method", "")).lower()
+        if "bitsandbytes" in quant_method or "bnb" in quant_method:
+            bits = quant_cfg.get("bits")
+            if isinstance(bits, int) and bits in {4, 8}:
+                return True, bits
             if quant_cfg.get("load_in_8bit"):
                 return True, 8
             if quant_cfg.get("load_in_4bit"):
                 return True, 4
-
-            bits = quant_cfg.get("bits", 8)
-            return True, bits
+            return True, 8
 
     except Exception:
         pass
@@ -82,11 +83,17 @@ class HF_BNB_Adapter(HFAdapterMixin, ModelAdapter):
             "DEPENDENCY-MISSING: transformers",
             lambda e: {"dependency": "transformers"},
         ):
-            from transformers import AutoModelForCausalLM
+            from transformers import AutoModelForCausalLM, BitsAndBytesConfig
 
         # Check if this is a pre-quantized checkpoint
         is_pre_quantized, pre_quant_bits = _detect_pre_quantized_bnb(model_id)
 
+        if "load_in_8bit" in kwargs or "load_in_4bit" in kwargs:
+            raise ValueError(
+                "hf_bnb adapter: load_in_8bit/load_in_4bit are not supported. "
+                "Use model.quantization_config instead."
+            )
+
         if is_pre_quantized:
             # Load pre-quantized checkpoint WITHOUT re-applying quantization
             with wrap_errors(
@@ -99,20 +106,25 @@ class HF_BNB_Adapter(HFAdapterMixin, ModelAdapter):
                     model_id,
                     device_map="auto",
                     trust_remote_code=True,
-
-                    **{
-                        k: v
-                        for k, v in kwargs.items()
-                        if k not in ("load_in_8bit", "load_in_4bit")
-                    },
+                    **kwargs,
                 )
         else:
             # Fresh quantization of FP16 model
-
-
-
-
-
+            quantization_config = kwargs.pop("quantization_config", None)
+            if quantization_config is None:
+                quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+            elif isinstance(quantization_config, dict):
+                qdict = dict(quantization_config)
+                bits = qdict.pop("bits", None)
+                qdict.pop("quant_method", None)
+                if isinstance(bits, int):
+                    if bits == 4:
+                        qdict.setdefault("load_in_4bit", True)
+                        qdict.setdefault("load_in_8bit", False)
+                    elif bits == 8:
+                        qdict.setdefault("load_in_8bit", True)
+                        qdict.setdefault("load_in_4bit", False)
+                quantization_config = BitsAndBytesConfig(**qdict)
 
             with wrap_errors(
                 ModelLoadError,
@@ -123,9 +135,8 @@ class HF_BNB_Adapter(HFAdapterMixin, ModelAdapter):
                 model = AutoModelForCausalLM.from_pretrained(
                     model_id,
                     device_map="auto",
-                    load_in_8bit=load_in_8bit,
-                    load_in_4bit=load_in_4bit,
                     trust_remote_code=True,
+                    quantization_config=quantization_config,
                     **kwargs,
                 )
 
```
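
The fresh-quantization path now routes everything through `quantization_config` instead of the removed `load_in_8bit`/`load_in_4bit` kwargs. A minimal sketch of the equivalent dict-to-`BitsAndBytesConfig` coercion, assuming `transformers` is installed (adapter plumbing and error wrapping omitted):

```python
from transformers import BitsAndBytesConfig


def coerce_quant_config(cfg: dict | None) -> BitsAndBytesConfig:
    """Sketch of the adapter's dict -> BitsAndBytesConfig coercion."""
    if cfg is None:
        # Default to 8-bit quantization when nothing is specified.
        return BitsAndBytesConfig(load_in_8bit=True)
    qdict = dict(cfg)
    bits = qdict.pop("bits", None)   # map "bits" onto the load_in_* flags
    qdict.pop("quant_method", None)  # dropped before building the config
    if bits == 4:
        qdict.setdefault("load_in_4bit", True)
        qdict.setdefault("load_in_8bit", False)
    elif bits == 8:
        qdict.setdefault("load_in_8bit", True)
        qdict.setdefault("load_in_4bit", False)
    return BitsAndBytesConfig(**qdict)


# Example: a 4-bit NF4 configuration expressed as a plain dict.
config = coerce_quant_config({"bits": 4, "bnb_4bit_quant_type": "nf4"})
```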

invarlock/reporting/__init__.py
CHANGED

```diff
@@ -1,7 +1,21 @@
 """
-
+Evaluation report tooling (`invarlock.reporting`).
 
-Provides
-and certificate generation while keeping backward compatibility with
-`invarlock.eval.*` imports.
+Provides the evaluation report schema, builder, and renderers.
 """
+
+from __future__ import annotations
+
+from .html import render_report_html
+from .render import render_report_markdown
+from .report_builder import make_report, validate_report
+from .report_schema import REPORT_JSON_SCHEMA, REPORT_SCHEMA_VERSION
+
+__all__ = [
+    "REPORT_SCHEMA_VERSION",
+    "REPORT_JSON_SCHEMA",
+    "make_report",
+    "render_report_markdown",
+    "render_report_html",
+    "validate_report",
+]
```
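
With `certificate.py` renamed to `report_builder.py` and `certificate_schema.py` to `report_schema.py`, the package-level surface is now the report toolchain re-exported above. A minimal import sketch of the new names (only the names come from the diff; any call signatures beyond them are assumptions):

```python
from invarlock.reporting import (
    REPORT_JSON_SCHEMA,
    REPORT_SCHEMA_VERSION,
    make_report,
    render_report_html,
    render_report_markdown,
    validate_report,
)

# The schema constants can be inspected directly; the builder and renderer
# signatures are not shown in this diff and may take additional arguments.
print(REPORT_SCHEMA_VERSION)
```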

invarlock/reporting/guards_analysis.py
CHANGED

```diff
@@ -23,7 +23,9 @@ def _measurement_contract_digest(contract: Any) -> str | None:
 
 
 @no_type_check
-def _extract_invariants(report: RunReport) -> dict[str, Any]:
+def _extract_invariants(
+    report: RunReport, baseline: RunReport | None = None
+) -> dict[str, Any]:
     """Extract invariant check results (matches the shape used in tests)."""
     invariants_data = (report.get("metrics", {}) or {}).get("invariants", {})
     failures: list[dict[str, Any]] = []
@@ -81,6 +83,108 @@ def _extract_invariants(report: RunReport) -> dict[str, Any]:
             guard_entry = guard
             break
 
+    baseline_guard_entry = None
+    if baseline is not None:
+        for guard in baseline.get("guards", []) or []:
+            if str(guard.get("name", "")).lower() == "invariants":
+                baseline_guard_entry = guard
+                break
+
+    def _coerce_checks(value: Any) -> dict[str, Any] | None:
+        return value if isinstance(value, dict) else None
+
+    def _extract_guard_checks(
+        entry: Any,
+    ) -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
+        if not isinstance(entry, dict):
+            return None, None
+        details = entry.get("details")
+        if not isinstance(details, dict):
+            return None, None
+        return _coerce_checks(details.get("baseline_checks")), _coerce_checks(
+            details.get("current_checks")
+        )
+
+    def _compare_invariants(
+        baseline_checks: dict[str, Any],
+        current_checks: dict[str, Any],
+    ) -> tuple[list[dict[str, Any]], int, int]:
+        violations: list[dict[str, Any]] = []
+
+        # LayerNorm coverage check
+        baseline_layer_norms = set(baseline_checks.get("layer_norm_paths", ()))
+        current_layer_norms = set(current_checks.get("layer_norm_paths", ()))
+        missing_layer_norms = sorted(baseline_layer_norms - current_layer_norms)
+        if missing_layer_norms:
+            violations.append(
+                {
+                    "type": "layer_norm_missing",
+                    "missing": missing_layer_norms,
+                    "message": "Expected LayerNorm modules are missing vs baseline",
+                }
+            )
+
+        # Tokenizer / vocab alignment
+        baseline_vocab_sizes = baseline_checks.get("embedding_vocab_sizes")
+        current_vocab_sizes = current_checks.get("embedding_vocab_sizes")
+        if isinstance(baseline_vocab_sizes, dict):
+            for module_name, baseline_size in baseline_vocab_sizes.items():
+                current_size = None
+                if isinstance(current_vocab_sizes, dict):
+                    current_size = current_vocab_sizes.get(module_name)
+                if current_size is None or int(current_size) != int(baseline_size):
+                    mismatch = {
+                        "module": module_name,
+                        "baseline": int(baseline_size),
+                        "current": None if current_size is None else int(current_size),
+                    }
+                    violations.append(
+                        {
+                            "type": "tokenizer_mismatch",
+                            "message": "Embedding vocabulary size changed vs baseline",
+                            **mismatch,
+                        }
+                    )
+
+        handled_keys = {
+            "layer_norm_paths",
+            "embedding_vocab_sizes",
+            "config_vocab_size",
+        }
+        for check_name, baseline_value in baseline_checks.items():
+            if check_name in handled_keys:
+                continue
+            current_value = current_checks.get(check_name)
+            if current_value != baseline_value:
+                violations.append(
+                    {
+                        "type": "invariant_violation",
+                        "check": check_name,
+                        "baseline": baseline_value,
+                        "current": current_value,
+                        "message": (
+                            f"Invariant {check_name} changed from {baseline_value} to {current_value}"
+                        ),
+                    }
+                )
+
+        fatal_violation_types = {"tokenizer_mismatch"}
+        fatal_count = 0
+        warning_count = 0
+        annotated: list[dict[str, Any]] = []
+        for violation in violations:
+            violation_type = str(violation.get("type") or "")
+            severity = "fatal" if violation_type in fatal_violation_types else "warning"
+            annotated_violation = dict(violation)
+            annotated_violation.setdefault("severity", severity)
+            annotated.append(annotated_violation)
+            if severity == "fatal":
+                fatal_count += 1
+            else:
+                warning_count += 1
+
+        return annotated, fatal_count, warning_count
+
     severity_status = "pass"
     if guard_entry:
         gm = guard_entry.get("metrics", {}) or {}
@@ -108,9 +212,51 @@ def _extract_invariants(report: RunReport) -> dict[str, Any]:
         if detail:
             row["detail"] = detail
         failures.append(row)
-
+    base_fatal = 0
+    base_warn = 0
+    baseline_failures: list[dict[str, Any]] = []
+    if baseline_guard_entry is not None:
+        baseline_pre, baseline_post = _extract_guard_checks(baseline_guard_entry)
+        current_pre, current_post = _extract_guard_checks(guard_entry)
+        baseline_snapshot = baseline_pre or baseline_post
+        current_snapshot = current_post or current_pre
+        if isinstance(baseline_snapshot, dict) and isinstance(
+            current_snapshot, dict
+        ):
+            baseline_failures, base_fatal, base_warn = _compare_invariants(
+                baseline_snapshot, current_snapshot
+            )
+        for violation in baseline_failures:
+            check_name = violation.get("check")
+            if not check_name:
+                check_name = (
+                    violation.get("module")
+                    or violation.get("type")
+                    or "invariant"
+                )
+            row = {
+                "check": str(check_name),
+                "type": str(violation.get("type") or "violation"),
+                "severity": str(violation.get("severity") or "warning"),
+            }
+            detail = {k: v for k, v in violation.items() if k not in row}
+            if detail:
+                detail.setdefault("source", "baseline_compare")
+                row["detail"] = detail
+            failures.append(row)
+
+    fatal_total = fatal_count + base_fatal
+    warn_total = warning_count + base_warn
+    try:
+        summary["fatal_violations"] = fatal_total
+        summary["warning_violations"] = warn_total
+        summary["violations_found"] = fatal_total + warn_total
+    except Exception:
+        pass
+
+    if fatal_total > 0:
         severity_status = "fail"
-    elif
+    elif warn_total > 0 or violations:
         severity_status = "warn"
 
     # If any error-severity entry exists among failures, escalate to fail
@@ -130,12 +276,16 @@ def _extract_invariants(report: RunReport) -> dict[str, Any]:
         "warning_violations": len(failures),
     }
 
+    details_out = invariants_data
+    if not details_out and guard_entry and isinstance(guard_entry.get("details"), dict):
+        details_out = guard_entry.get("details", {})
+
     return {
         "pre": "pass",
        "post": status,
        "status": status,
        "summary": summary,
-        "details":
+        "details": details_out,
        "failures": failures,
    }
 
```
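
The new baseline comparison reads the `baseline_checks` / `current_checks` snapshots stored under the invariants guard's `details` block. Illustrative snapshot shapes (key names are taken from the diff; module names and values are hypothetical):

```python
# Hypothetical guard "details" snapshots as compared by _compare_invariants.
baseline_checks = {
    "layer_norm_paths": ["transformer.h.0.ln_1", "transformer.h.0.ln_2"],
    "embedding_vocab_sizes": {"transformer.wte": 50257},
    "config_vocab_size": 50257,
}
current_checks = {
    # A missing LayerNorm produces a "layer_norm_missing" warning.
    "layer_norm_paths": ["transformer.h.0.ln_1"],
    # A changed vocab size produces a fatal "tokenizer_mismatch" violation.
    "embedding_vocab_sizes": {"transformer.wte": 50260},
    "config_vocab_size": 50257,
}
```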

invarlock/reporting/html.py
CHANGED

```diff
@@ -1,5 +1,5 @@
 """
-Minimal HTML exporter for
+Minimal HTML exporter for reports.
 
 This implementation wraps the Markdown rendering in a simple HTML template so
 that the numbers and core content remain identical across formats.
@@ -10,23 +10,73 @@ from __future__ import annotations
 from html import escape
 from typing import Any
 
-from .render import
+from .render import render_report_markdown
 
+markdown_module: Any | None = None
+try:
+    import markdown as _markdown  # type: ignore[import-untyped]
+except Exception:  # pragma: no cover - optional dependency
+    _markdown = None
+else:
+    markdown_module = _markdown
 
-def render_certificate_html(certificate: dict[str, Any]) -> str:
-    """Render a certificate as a simple HTML document.
 
-
-
+_STATUS_BADGES = {
+    "\u2705 PASS": '<span class="badge pass">PASS</span>',
+    "\u2705 OK": '<span class="badge pass">OK</span>',
+    "\u274c FAIL": '<span class="badge fail">FAIL</span>',
+    "\u26a0\ufe0f WARN": '<span class="badge warn">WARN</span>',
+    "\u26a0 WARN": '<span class="badge warn">WARN</span>',
+}
+
+
+def _apply_status_badges(html_body: str) -> str:
+    updated = html_body
+    for token, replacement in _STATUS_BADGES.items():
+        updated = updated.replace(token, replacement)
+    return updated
+
+
+def render_report_html(evaluation_report: dict[str, Any]) -> str:
+    """Render an evaluation report as a simple HTML document.
+
+    Uses the Markdown renderer and converts to HTML when available, falling back
+    to a <pre> block when the markdown dependency is missing.
     """
-    md =
-
+    md = render_report_markdown(evaluation_report)
+    if markdown_module is None:
+        body = f'<pre class="invarlock-md">{escape(md)}</pre>'
+    else:
+        html_body = markdown_module.markdown(md, extensions=["tables", "fenced_code"])
+        html_body = _apply_status_badges(html_body)
+        body = f'<div class="invarlock-md">{html_body}</div>'
     return (
         '<!DOCTYPE html><html><head><meta charset="utf-8">'
-        "<title>InvarLock
-        "<style>
+        "<title>InvarLock Evaluation Report</title>"
+        "<style>"
+        ":root{--pass:#2da44e;--fail:#cf222e;--warn:#bf8700;--ink:#1f2328;"
+        "--muted:#57606a;--panel:#f6f8fa;--border:#d0d7de}"
+        "body{font-family:ui-sans-serif,system-ui,-apple-system,Segoe UI,sans-serif;"
+        "color:var(--ink);background:linear-gradient(180deg,#fff, #f6f8fa);"
+        "margin:0;padding:32px}"
+        ".invarlock-md{max-width:960px;margin:0 auto;padding:24px;background:#fff;"
+        "border:1px solid var(--border);border-radius:16px;box-shadow:0 10px 30px rgba(0,0,0,0.05)}"
+        "h1,h2,h3{margin-top:1.4em}h1{margin-top:0}"
+        "table{border-collapse:collapse;width:100%;margin:12px 0}"
+        "th,td{border:1px solid var(--border);padding:6px 8px;text-align:left}"
+        "code,pre{background:var(--panel);border-radius:8px}"
+        "pre{padding:12px;overflow:auto}"
+        ".badge{display:inline-block;padding:2px 8px;border-radius:999px;"
+        "font-size:0.75rem;font-weight:700;letter-spacing:0.02em;color:#fff}"
+        ".badge.pass{background:var(--pass)}"
+        ".badge.fail{background:var(--fail)}"
+        ".badge.warn{background:var(--warn)}"
+        "@media print{body{background:#fff;padding:0}.invarlock-md{box-shadow:none;"
+        "border:0}a{color:inherit;text-decoration:none}.badge{color:#000;"
+        "border:1px solid #000;background:transparent}}"
+        "</style>"
         "</head><body>" + body + "</body></html>"
     )
 
 
-__all__ = ["
+__all__ = ["render_report_html"]
```
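
`render_report_html` keeps working without the optional `markdown` package by escaping the Markdown into a `<pre>` block. The same optional-import pattern in isolation (a sketch; `to_html` is not part of the package):

```python
from html import escape
from typing import Any

markdown_module: Any | None = None
try:
    import markdown  # optional dependency
except Exception:
    markdown_module = None
else:
    markdown_module = markdown


def to_html(md_text: str) -> str:
    """Convert Markdown to HTML, falling back to an escaped <pre> block."""
    if markdown_module is None:
        return f"<pre>{escape(md_text)}</pre>"
    return markdown_module.markdown(md_text, extensions=["tables", "fenced_code"])


print(to_html("| status |\n| --- |\n| \u2705 PASS |"))
```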

invarlock/reporting/normalizer.py
CHANGED

```diff
@@ -33,7 +33,7 @@ def normalize_run_report(report: Mapping[str, Any] | RunReport) -> RunReport:
     """Coerce an arbitrary report-like mapping into a canonical RunReport.
 
     This is the single entry point for converting pre-canonical or loosely-typed
-    data into the strict PM-only RunReport shape used by
+    data into the strict PM-only RunReport shape used by evaluation report building.
     """
     src = _as_mapping(report)
 
@@ -53,8 +53,10 @@ def normalize_run_report(report: Mapping[str, Any] | RunReport) -> RunReport:
         "ts": ts,
         "auto": meta_in.get("auto") if isinstance(meta_in.get("auto"), dict) else None,
     }
-    # Preserve additional provenance knobs used by
+    # Preserve additional provenance knobs used by evaluation report digests.
     for key in (
+        "pm_acceptance_range",
+        "pm_drift_band",
         "policy_overrides",
         "overrides",
         "plugins",
@@ -179,6 +181,11 @@ def normalize_run_report(report: Mapping[str, Any] | RunReport) -> RunReport:
         "latency_ms_p50",
         "latency_ms_p95",
         "memory_mb_peak",
+        "gpu_memory_mb_peak",
+        "gpu_memory_reserved_mb_peak",
+        "timings",
+        "guard_timings",
+        "memory_snapshots",
         "throughput_sps",
         "spectral",
         "rmt",
```

invarlock/reporting/policy_utils.py
CHANGED

```diff
@@ -35,7 +35,7 @@ def _compute_thresholds_payload(
     tier: str, resolved_policy: dict[str, Any]
 ) -> dict[str, Any]:
     """Build canonical thresholds payload for digest stability."""
-    from .
+    from .report_builder import TIER_RATIO_LIMITS  # local to avoid cycles
 
     tier_lc = (tier or "balanced").lower()
     metrics_policy = (
```

invarlock/reporting/primary_metric_utils.py
CHANGED

```diff
@@ -8,21 +8,21 @@ from .utils import _coerce_interval, _weighted_mean
 
 
 def attach_primary_metric(
-
+    evaluation_report: dict[str, Any],
     report: dict[str, Any],
     baseline_raw: dict[str, Any] | None,
     baseline_ref: dict[str, Any] | None,
     ppl_analysis: dict[str, Any] | None,
 ) -> None:
-    """Attach/normalize the primary_metric block on the
+    """Attach/normalize the primary_metric block on the evaluation report.
 
-    Behavior mirrors historical logic in
+    Behavior mirrors historical logic in report_builder.py and preserves structure:
     - Prefer explicit metrics.primary_metric if present
     - Compute missing ratio_vs_baseline, degenerate display_ci
     - ppl window-based analysis info (mean logloss) added when available
     - Fallbacks for classification metrics and eval-window-derived ppl
     - Ensure display_ci always present for schema invariants
-    Mutates the
+    Mutates the evaluation report in-place.
     """
     # Attach primary metric snapshot when provided in report
     try:
@@ -180,12 +180,12 @@ def attach_primary_metric(
             ]
         except Exception:
             pass
-
+        evaluation_report["primary_metric"] = pm_copy
     except Exception:
         pass
 
     def _attach_from_windows() -> None:
-        if isinstance(
+        if isinstance(evaluation_report.get("primary_metric"), dict):
             return
         try:
             m = (
@@ -212,7 +212,7 @@ def attach_primary_metric(
                 baseline=baseline_raw if isinstance(baseline_raw, dict) else None,
             )
             if isinstance(pm_block, dict) and pm_block:
-
+                evaluation_report["primary_metric"] = pm_block
         except Exception:
             pass
 
@@ -220,7 +220,7 @@ def attach_primary_metric(
     _attach_from_windows()
 
     # Minimal fallback for classification-only reports without explicit primary_metric
-    if not isinstance(
+    if not isinstance(evaluation_report.get("primary_metric"), dict):
         try:
             metrics_map = report.get("metrics", {}) if isinstance(report, dict) else {}
             clf = (
@@ -298,7 +298,7 @@ def attach_primary_metric(
                     acc_pm["ratio_vs_baseline"] = delta_pp
                 except Exception:
                     pass
-
+                evaluation_report["primary_metric"] = acc_pm
         except Exception:
             pass
 
@@ -308,8 +308,8 @@ def attach_primary_metric(
     # Ensure primary_metric has display_ci populated for schema invariants
     try:
         pm = (
-
-            if isinstance(
+            evaluation_report.get("primary_metric", {})
+            if isinstance(evaluation_report.get("primary_metric"), dict)
             else None
         )
         if isinstance(pm, dict) and pm:
```