invarlock-0.3.7-py3-none-any.whl → invarlock-0.3.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- invarlock/__init__.py +3 -3
- invarlock/adapters/auto.py +2 -10
- invarlock/adapters/hf_loading.py +7 -7
- invarlock/adapters/hf_mixin.py +28 -5
- invarlock/assurance/__init__.py +15 -23
- invarlock/cli/adapter_auto.py +1 -5
- invarlock/cli/app.py +57 -27
- invarlock/cli/commands/__init__.py +2 -2
- invarlock/cli/commands/calibrate.py +48 -4
- invarlock/cli/commands/{certify.py → evaluate.py} +69 -46
- invarlock/cli/commands/explain_gates.py +25 -17
- invarlock/cli/commands/export_html.py +11 -9
- invarlock/cli/commands/report.py +116 -46
- invarlock/cli/commands/run.py +274 -66
- invarlock/cli/commands/verify.py +84 -89
- invarlock/cli/determinism.py +1 -1
- invarlock/cli/provenance.py +3 -3
- invarlock/core/bootstrap.py +1 -1
- invarlock/core/retry.py +14 -14
- invarlock/core/runner.py +1 -1
- invarlock/edits/noop.py +2 -2
- invarlock/edits/quant_rtn.py +2 -2
- invarlock/eval/__init__.py +1 -1
- invarlock/eval/bench.py +11 -7
- invarlock/eval/primary_metric.py +1 -1
- invarlock/guards/spectral.py +1 -1
- invarlock/model_profile.py +16 -35
- invarlock/plugins/hf_bnb_adapter.py +32 -21
- invarlock/reporting/__init__.py +18 -4
- invarlock/reporting/html.py +7 -7
- invarlock/reporting/normalizer.py +2 -2
- invarlock/reporting/policy_utils.py +1 -1
- invarlock/reporting/primary_metric_utils.py +11 -11
- invarlock/reporting/render.py +126 -120
- invarlock/reporting/report.py +43 -37
- invarlock/reporting/{certificate.py → report_builder.py} +98 -95
- invarlock/reporting/{certificate_schema.py → report_schema.py} +22 -22
- invarlock-0.3.8.dist-info/METADATA +283 -0
- {invarlock-0.3.7.dist-info → invarlock-0.3.8.dist-info}/RECORD +43 -43
- {invarlock-0.3.7.dist-info → invarlock-0.3.8.dist-info}/WHEEL +1 -1
- invarlock-0.3.7.dist-info/METADATA +0 -602
- {invarlock-0.3.7.dist-info → invarlock-0.3.8.dist-info}/entry_points.txt +0 -0
- {invarlock-0.3.7.dist-info → invarlock-0.3.8.dist-info}/licenses/LICENSE +0 -0
- {invarlock-0.3.7.dist-info → invarlock-0.3.8.dist-info}/top_level.txt +0 -0
invarlock/core/runner.py
CHANGED
@@ -521,7 +521,7 @@ class CoreRunner:
                     f"Guard '{guard.name}' prepare failed: {e}"
                 ) from e

-        # Store resolved policies in report for …
+        # Store resolved policies in report for evaluation report generation
         report.meta["tier_policies"] = tier_policies

         self._log_event(
invarlock/edits/noop.py
CHANGED
@@ -1,7 +1,7 @@
-"""Built-in no-op edit used for baseline and Compare & …
+"""Built-in no-op edit used for baseline and Compare & Evaluate (BYOE).

 This edit does not modify the model and reports zero deltas. It exists to
-support baseline runs and Compare & …
+support baseline runs and Compare & Evaluate workflows where the subject
 checkpoint is produced outside of InvarLock.
 """

invarlock/edits/quant_rtn.py
CHANGED
@@ -406,7 +406,7 @@ class RTNQuantEdit(ModelEdit):
             if layer_name not in modified_layers:
                 modified_layers.append(layer_name)

-        # Store edit plan for …
+        # Store edit plan for evaluation report generation
         modules_quantized = [r["module_name"] for r in quantization_results]

         edit_plan = {
@@ -424,7 +424,7 @@ class RTNQuantEdit(ModelEdit):
         return {
             "name": self.name,
             "plan_digest": f"rtn_quantization_{bitwidth}bit_{scope}",
-            "plan": edit_plan,  # Include the plan for …
+            "plan": edit_plan,  # Include the plan for evaluation report generation
             "deltas": {
                 "params_changed": total_params_quantized,
                 "sparsity": None,  # Quantization doesn't create sparsity
invarlock/eval/__init__.py
CHANGED
@@ -1,7 +1,7 @@
 """Evaluation utilities (`invarlock.eval`).

 This package now focuses on evaluation helpers (metrics, data, probes).
-Reporting (report building …
+Reporting (evaluation report building and schema validation) has moved to
 `invarlock.reporting`.
 """

invarlock/eval/bench.py
CHANGED
@@ -976,17 +976,21 @@ def execute_scenario(
         except Exception:
             pass

-        # Generate …
+        # Generate evaluation report artifact when both runs produced reports
         try:
             if bare_result.success and guarded_result.success:
-                from invarlock.reporting. …
+                from invarlock.reporting.report_builder import make_report

-                …
-                …
-                …
-                …
+                evaluation_report = make_report(guarded_result.report, bare_result.report)
+                report_path = scenario_dir / "evaluation.report.json"
+                report_path.write_text(
+                    json.dumps(evaluation_report, indent=2), encoding="utf-8"
+                )
+                artifacts["evaluation_report"] = str(report_path)
         except Exception as exc:
-            logger.warning( …
+            logger.warning(
+                f"Evaluation report generation failed for {scenario_slug}: {exc}"
+            )

         # Resolve epsilon from runtime or use config
         epsilon_used = config.epsilon
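
Note: the bench flow above writes the new evaluation report artifact next to the scenario outputs. A minimal sketch of the same flow, assuming `guarded_report` and `bare_report` are the run-report dicts from two successful runs (both names are hypothetical stand-ins for this example):

    import json
    from pathlib import Path

    from invarlock.reporting.report_builder import make_report

    scenario_dir = Path("runs/demo-scenario")  # hypothetical output directory
    # Guarded run first, baseline (bare) run second, matching the diff above.
    evaluation_report = make_report(guarded_report, bare_report)
    report_path = scenario_dir / "evaluation.report.json"
    report_path.write_text(json.dumps(evaluation_report, indent=2), encoding="utf-8")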
invarlock/eval/primary_metric.py
CHANGED
@@ -2,7 +2,7 @@
 Primary metric abstraction and minimal ppl_causal implementation (Phase 1).

 This module introduces a light-weight, task-agnostic metric interface and a
-registry so the runner …
+registry so the runner and evaluation report builder can evolve beyond causal-LM perplexity.

 Phase 1 goal: provide a ppl_causal metric and a helper that can compute point
 estimates directly from evaluation window aggregates already present in run
invarlock/guards/spectral.py
CHANGED
@@ -344,7 +344,7 @@ class SpectralGuard(Guard):
         self.ignore_preview_inflation = bool(policy["ignore_preview_inflation"])
         self.config["ignore_preview_inflation"] = self.ignore_preview_inflation

-        # Optional hydration of baseline stats from policy (e.g., baseline …
+        # Optional hydration of baseline stats from policy (e.g., baseline evaluation report)
         if "baseline_family_stats" in policy and isinstance(
             policy["baseline_family_stats"], dict
         ):
invarlock/model_profile.py
CHANGED
@@ -6,11 +6,9 @@ from dataclasses import dataclass, field
 from typing import Any

 AutoTokenizer: Any | None = None
-GPT2Tokenizer: Any | None = None

 try:
     from transformers import AutoTokenizer as _AutoTokenizer
-    from transformers import GPT2Tokenizer as _GPT2Tokenizer
     from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 except Exception:  # pragma: no cover - exercised only when transformers is absent

@@ -26,7 +24,6 @@ except Exception:  # pragma: no cover - exercised only when transformers is absent

 else:  # pragma: no cover - transformers optional
     AutoTokenizer = _AutoTokenizer
-    GPT2Tokenizer = _GPT2Tokenizer


 TokenizerFactory = Callable[[], tuple[PreTrainedTokenizerBase, str]]
@@ -177,12 +174,12 @@ def _make_bert_tokenizer(model_id: str):

 def _make_gpt2_tokenizer(model_id: str):
     def factory() -> tuple[PreTrainedTokenizerBase, str]:
-        if GPT2Tokenizer is None:
+        if AutoTokenizer is None:
             raise RuntimeError(
                 "GPT-2 tokenizers require the 'transformers' extra. "
                 "Install it with: pip install 'invarlock[adapters]'."
             )
-        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+        tokenizer = AutoTokenizer.from_pretrained("gpt2")
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
         hash_value = _hash_tokenizer(tokenizer)
@@ -193,7 +190,7 @@ def _make_gpt2_tokenizer(model_id: str):

 def _make_causal_auto_tokenizer(model_id: str):
     def factory() -> tuple[PreTrainedTokenizerBase, str]:
-        if AutoTokenizer is None …
+        if AutoTokenizer is None:
             raise RuntimeError(
                 "Causal tokenizers require the 'transformers' extra. "
                 "Install it with: pip install 'invarlock[adapters]'."
@@ -202,23 +199,15 @@ def _make_causal_auto_tokenizer(model_id: str):
         # local GPT-2 tokenizer if the model assets are not cached or network
         # access is denied.
         tokenizer = None
-        …
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(model_id, local_files_only=True)
+        except Exception:
             try:
-                tokenizer = AutoTokenizer.from_pretrained(
-                    model_id, local_files_only=True
-                )
+                tokenizer = AutoTokenizer.from_pretrained(model_id)
             except Exception:
-                …
-                tokenizer = AutoTokenizer.from_pretrained(model_id)
-            except Exception:
-                tokenizer = None
+                tokenizer = None
         if tokenizer is None:
-            …
-            raise RuntimeError(
-                "Tokenization requires the 'transformers' extra. "
-                "Install it with: pip install 'invarlock[adapters]'."
-            )
-            tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+            tokenizer = AutoTokenizer.from_pretrained("gpt2")
         # Ensure padding/bos tokens are configured so downstream encoding
         # yields stable non-zero ids and a valid attention mask regardless of
         # environment defaults or tokenizer variants.
@@ -246,30 +235,22 @@ def _make_causal_auto_tokenizer(model_id: str):

 def _make_unknown_tokenizer(model_id: str):
     def factory() -> tuple[PreTrainedTokenizerBase, str]:
-        if AutoTokenizer is None …
+        if AutoTokenizer is None:
             raise RuntimeError(
                 "Text tokenization requires the 'transformers' extra. "
                 "Install it with: pip install 'invarlock[adapters]'."
             )
         # Unknown families: try local-only first, then remote, then degrade to GPT-2
         tokenizer = None
-        …
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(model_id, local_files_only=True)
+        except Exception:
             try:
-                tokenizer = AutoTokenizer.from_pretrained(
-                    model_id, local_files_only=True
-                )
+                tokenizer = AutoTokenizer.from_pretrained(model_id)
             except Exception:
-                …
-                tokenizer = AutoTokenizer.from_pretrained(model_id)
-            except Exception:
-                tokenizer = None
+                tokenizer = None
         if tokenizer is None:
-            …
-            raise RuntimeError(
-                "Text tokenization requires the 'transformers' extra. "
-                "Install it with: pip install 'invarlock[adapters]'."
-            )
-            tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+            tokenizer = AutoTokenizer.from_pretrained("gpt2")
         if getattr(tokenizer, "pad_token", None) is None:
             eos_token = getattr(tokenizer, "eos_token", None)
             if eos_token is not None:
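
Note: both tokenizer factories now share the same fallback order (local cache, then remote fetch, then the stock GPT-2 tokenizer) and no longer import GPT2Tokenizer. A sketch of that order using the public transformers API, assuming transformers is installed:

    from transformers import AutoTokenizer

    def load_tokenizer_with_fallback(model_id: str):
        # Prefer already-cached assets so offline runs never hit the network.
        try:
            return AutoTokenizer.from_pretrained(model_id, local_files_only=True)
        except Exception:
            pass
        # Fall back to a remote fetch, then degrade to GPT-2 as a last resort.
        try:
            return AutoTokenizer.from_pretrained(model_id)
        except Exception:
            return AutoTokenizer.from_pretrained("gpt2")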
invarlock/plugins/hf_bnb_adapter.py
CHANGED
@@ -7,7 +7,7 @@ through Transformers. Requires GPU for practical use.
 Install with the `gpu` extra on supported platforms.

 This adapter handles both:
-1. Fresh quantization of FP16 models (…
+1. Fresh quantization of FP16 models (via quantization_config)
 2. Loading pre-quantized BNB checkpoints (auto-detected via quantization_config)
 """

@@ -55,16 +55,17 @@ def _detect_pre_quantized_bnb(model_id: str) -> tuple[bool, int]:
         if not quant_cfg:
             return False, 0

-        # Check for BNB quantization …
-        quant_method = quant_cfg.get("quant_method", "").lower()
-        if …
+        # Check for BNB quantization. Prefer explicit bits, then legacy flags.
+        quant_method = str(quant_cfg.get("quant_method", "")).lower()
+        if "bitsandbytes" in quant_method or "bnb" in quant_method:
+            bits = quant_cfg.get("bits")
+            if isinstance(bits, int) and bits in {4, 8}:
+                return True, bits
         if quant_cfg.get("load_in_8bit"):
             return True, 8
         if quant_cfg.get("load_in_4bit"):
             return True, 4
-
-        bits = quant_cfg.get("bits", 8)
-        return True, bits
+        return True, 8

     except Exception:
         pass
@@ -82,11 +83,17 @@ class HF_BNB_Adapter(HFAdapterMixin, ModelAdapter):
             "DEPENDENCY-MISSING: transformers",
             lambda e: {"dependency": "transformers"},
         ):
-            from transformers import AutoModelForCausalLM
+            from transformers import AutoModelForCausalLM, BitsAndBytesConfig

         # Check if this is a pre-quantized checkpoint
         is_pre_quantized, pre_quant_bits = _detect_pre_quantized_bnb(model_id)

+        if "load_in_8bit" in kwargs or "load_in_4bit" in kwargs:
+            raise ValueError(
+                "hf_bnb adapter: load_in_8bit/load_in_4bit are not supported. "
+                "Use model.quantization_config instead."
+            )
+
         if is_pre_quantized:
             # Load pre-quantized checkpoint WITHOUT re-applying quantization
             with wrap_errors(
@@ -99,20 +106,25 @@ class HF_BNB_Adapter(HFAdapterMixin, ModelAdapter):
                     model_id,
                     device_map="auto",
                     trust_remote_code=True,
-                    …
-                    **{
-                        k: v
-                        for k, v in kwargs.items()
-                        if k not in ("load_in_8bit", "load_in_4bit")
-                    },
+                    **kwargs,
                 )
         else:
             # Fresh quantization of FP16 model
-            …
-            …
-            …
-            …
-            …
+            quantization_config = kwargs.pop("quantization_config", None)
+            if quantization_config is None:
+                quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+            elif isinstance(quantization_config, dict):
+                qdict = dict(quantization_config)
+                bits = qdict.pop("bits", None)
+                qdict.pop("quant_method", None)
+                if isinstance(bits, int):
+                    if bits == 4:
+                        qdict.setdefault("load_in_4bit", True)
+                        qdict.setdefault("load_in_8bit", False)
+                    elif bits == 8:
+                        qdict.setdefault("load_in_8bit", True)
+                        qdict.setdefault("load_in_4bit", False)
+                quantization_config = BitsAndBytesConfig(**qdict)

             with wrap_errors(
                 ModelLoadError,
@@ -123,9 +135,8 @@ class HF_BNB_Adapter(HFAdapterMixin, ModelAdapter):
                 model = AutoModelForCausalLM.from_pretrained(
                     model_id,
                     device_map="auto",
-                    load_in_8bit=load_in_8bit,
-                    load_in_4bit=load_in_4bit,
                     trust_remote_code=True,
+                    quantization_config=quantization_config,
                     **kwargs,
                 )
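
Note: with load_in_8bit/load_in_4bit now rejected by the adapter, callers express quantization through a quantization_config. A sketch using the real transformers/bitsandbytes API (the model id is a hypothetical example):

    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    config = BitsAndBytesConfig(load_in_8bit=True)  # or load_in_4bit=True
    model = AutoModelForCausalLM.from_pretrained(
        "facebook/opt-125m",  # hypothetical model id
        device_map="auto",
        quantization_config=config,
    )

The adapter also accepts the same thing as a plain dict (e.g. {"bits": 8}) and normalizes it into a BitsAndBytesConfig, as the diff above shows.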
invarlock/reporting/__init__.py
CHANGED
@@ -1,7 +1,21 @@
 """
-…
+Evaluation report tooling (`invarlock.reporting`).

-Provides …
-and certificate generation while keeping backward compatibility with
-`invarlock.eval.*` imports.
+Provides the evaluation report schema, builder, and renderers.
 """
+
+from __future__ import annotations
+
+from .html import render_report_html
+from .render import render_report_markdown
+from .report_builder import make_report, validate_report
+from .report_schema import REPORT_JSON_SCHEMA, REPORT_SCHEMA_VERSION
+
+__all__ = [
+    "REPORT_SCHEMA_VERSION",
+    "REPORT_JSON_SCHEMA",
+    "make_report",
+    "render_report_markdown",
+    "render_report_html",
+    "validate_report",
+]
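
Note: the package root now re-exports the report toolchain, so downstream code can import everything from one place. A sketch, assuming `guarded_report` and `baseline_report` are run-report dicts (hypothetical names) and that validate_report flags schema violations (assumed behavior):

    from invarlock.reporting import (
        make_report,
        render_report_markdown,
        validate_report,
    )

    evaluation_report = make_report(guarded_report, baseline_report)
    validate_report(evaluation_report)  # assumed to flag schema violations
    print(render_report_markdown(evaluation_report))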
invarlock/reporting/html.py
CHANGED
@@ -1,5 +1,5 @@
 """
-Minimal HTML exporter for …
+Minimal HTML exporter for reports.

 This implementation wraps the Markdown rendering in a simple HTML template so
 that the numbers and core content remain identical across formats.
@@ -10,7 +10,7 @@ from __future__ import annotations
 from html import escape
 from typing import Any

-from .render import …
+from .render import render_report_markdown

 markdown_module: Any | None = None
 try:
@@ -37,13 +37,13 @@ def _apply_status_badges(html_body: str) -> str:
     return updated


-def render_certificate_html(certificate: dict[str, Any]) -> str:
-    """Render …
+def render_report_html(evaluation_report: dict[str, Any]) -> str:
+    """Render an evaluation report as a simple HTML document.

     Uses the Markdown renderer and converts to HTML when available, falling back
     to a <pre> block when the markdown dependency is missing.
     """
-    md = …
+    md = render_report_markdown(evaluation_report)
     if markdown_module is None:
         body = f'<pre class="invarlock-md">{escape(md)}</pre>'
     else:
@@ -52,7 +52,7 @@ def render_certificate_html(certificate: dict[str, Any]) -> str:
     body = f'<div class="invarlock-md">{html_body}</div>'
     return (
         '<!DOCTYPE html><html><head><meta charset="utf-8">'
-        "<title>InvarLock Evaluation …
+        "<title>InvarLock Evaluation Report</title>"
         "<style>"
         ":root{--pass:#2da44e;--fail:#cf222e;--warn:#bf8700;--ink:#1f2328;"
         "--muted:#57606a;--panel:#f6f8fa;--border:#d0d7de}"
@@ -79,4 +79,4 @@ def render_certificate_html(certificate: dict[str, Any]) -> str:
     )


-__all__ = ["render_certificate_html"]
+__all__ = ["render_report_html"]
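
Note: render_report_html replaces the old certificate renderer. A sketch of writing an HTML artifact from an existing report dict (`evaluation_report` is a hypothetical variable holding a dict built by make_report):

    from pathlib import Path

    from invarlock.reporting import render_report_html

    html = render_report_html(evaluation_report)
    Path("evaluation.report.html").write_text(html, encoding="utf-8")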
invarlock/reporting/normalizer.py
CHANGED
@@ -33,7 +33,7 @@ def normalize_run_report(report: Mapping[str, Any] | RunReport) -> RunReport:
     """Coerce an arbitrary report-like mapping into a canonical RunReport.

     This is the single entry point for converting pre-canonical or loosely-typed
-    data into the strict PM-only RunReport shape used by …
+    data into the strict PM-only RunReport shape used by evaluation report building.
     """
     src = _as_mapping(report)

@@ -53,7 +53,7 @@ def normalize_run_report(report: Mapping[str, Any] | RunReport) -> RunReport:
         "ts": ts,
         "auto": meta_in.get("auto") if isinstance(meta_in.get("auto"), dict) else None,
     }
-    # Preserve additional provenance knobs used by …
+    # Preserve additional provenance knobs used by evaluation report digests.
     for key in (
         "pm_acceptance_range",
         "pm_drift_band",
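
Note: normalize_run_report stays the single coercion entry point for loosely-typed inputs. A sketch (the input dict is a hypothetical minimal example; real run reports carry more fields):

    from invarlock.reporting.normalizer import normalize_run_report

    raw = {"meta": {"ts": "2024-01-01T00:00:00Z"}, "metrics": {}}  # hypothetical input
    run_report = normalize_run_report(raw)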
invarlock/reporting/policy_utils.py
CHANGED
@@ -35,7 +35,7 @@ def _compute_thresholds_payload(
     tier: str, resolved_policy: dict[str, Any]
 ) -> dict[str, Any]:
     """Build canonical thresholds payload for digest stability."""
-    from . …
+    from .report_builder import TIER_RATIO_LIMITS  # local to avoid cycles

     tier_lc = (tier or "balanced").lower()
     metrics_policy = (
invarlock/reporting/primary_metric_utils.py
CHANGED
@@ -8,21 +8,21 @@ from .utils import _coerce_interval, _weighted_mean


 def attach_primary_metric(
-    certificate: dict[str, Any],
+    evaluation_report: dict[str, Any],
     report: dict[str, Any],
     baseline_raw: dict[str, Any] | None,
     baseline_ref: dict[str, Any] | None,
     ppl_analysis: dict[str, Any] | None,
 ) -> None:
-    """Attach/normalize the primary_metric block on the …
+    """Attach/normalize the primary_metric block on the evaluation report.

-    Behavior mirrors historical logic in …
+    Behavior mirrors historical logic in report_builder.py and preserves structure:
     - Prefer explicit metrics.primary_metric if present
     - Compute missing ratio_vs_baseline, degenerate display_ci
     - ppl window-based analysis info (mean logloss) added when available
     - Fallbacks for classification metrics and eval-window-derived ppl
     - Ensure display_ci always present for schema invariants
-    Mutates the …
+    Mutates the evaluation report in-place.
     """
     # Attach primary metric snapshot when provided in report
     try:
@@ -180,12 +180,12 @@ def attach_primary_metric(
             ]
         except Exception:
             pass
-        certificate["primary_metric"] = pm_copy
+        evaluation_report["primary_metric"] = pm_copy
     except Exception:
         pass

     def _attach_from_windows() -> None:
-        if isinstance(certificate.get("primary_metric"), dict):
+        if isinstance(evaluation_report.get("primary_metric"), dict):
             return
         try:
             m = (
@@ -212,7 +212,7 @@ def attach_primary_metric(
                 baseline=baseline_raw if isinstance(baseline_raw, dict) else None,
             )
             if isinstance(pm_block, dict) and pm_block:
-                certificate["primary_metric"] = pm_block
+                evaluation_report["primary_metric"] = pm_block
         except Exception:
             pass

@@ -220,7 +220,7 @@ def attach_primary_metric(
     _attach_from_windows()

     # Minimal fallback for classification-only reports without explicit primary_metric
-    if not isinstance(certificate.get("primary_metric"), dict):
+    if not isinstance(evaluation_report.get("primary_metric"), dict):
         try:
             metrics_map = report.get("metrics", {}) if isinstance(report, dict) else {}
             clf = (
@@ -298,7 +298,7 @@ def attach_primary_metric(
                 acc_pm["ratio_vs_baseline"] = delta_pp
             except Exception:
                 pass
-            certificate["primary_metric"] = acc_pm
+            evaluation_report["primary_metric"] = acc_pm
         except Exception:
             pass

@@ -308,8 +308,8 @@ def attach_primary_metric(
     # Ensure primary_metric has display_ci populated for schema invariants
     try:
         pm = (
-            certificate.get("primary_metric", {})
-            if isinstance(certificate.get("primary_metric"), dict)
+            evaluation_report.get("primary_metric", {})
+            if isinstance(evaluation_report.get("primary_metric"), dict)
             else None
         )
         if isinstance(pm, dict) and pm: