invarlock 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- invarlock/__init__.py +33 -0
- invarlock/__main__.py +10 -0
- invarlock/_data/runtime/profiles/ci_cpu.yaml +15 -0
- invarlock/_data/runtime/profiles/release.yaml +23 -0
- invarlock/_data/runtime/tiers.yaml +76 -0
- invarlock/adapters/__init__.py +102 -0
- invarlock/adapters/_capabilities.py +45 -0
- invarlock/adapters/auto.py +99 -0
- invarlock/adapters/base.py +530 -0
- invarlock/adapters/base_types.py +85 -0
- invarlock/adapters/hf_bert.py +852 -0
- invarlock/adapters/hf_gpt2.py +403 -0
- invarlock/adapters/hf_llama.py +485 -0
- invarlock/adapters/hf_mixin.py +383 -0
- invarlock/adapters/hf_onnx.py +112 -0
- invarlock/adapters/hf_t5.py +137 -0
- invarlock/adapters/py.typed +1 -0
- invarlock/assurance/__init__.py +43 -0
- invarlock/cli/__init__.py +8 -0
- invarlock/cli/__main__.py +8 -0
- invarlock/cli/_evidence.py +25 -0
- invarlock/cli/_json.py +75 -0
- invarlock/cli/adapter_auto.py +162 -0
- invarlock/cli/app.py +287 -0
- invarlock/cli/commands/__init__.py +26 -0
- invarlock/cli/commands/certify.py +403 -0
- invarlock/cli/commands/doctor.py +1358 -0
- invarlock/cli/commands/explain_gates.py +151 -0
- invarlock/cli/commands/export_html.py +100 -0
- invarlock/cli/commands/plugins.py +1331 -0
- invarlock/cli/commands/report.py +354 -0
- invarlock/cli/commands/run.py +4146 -0
- invarlock/cli/commands/verify.py +1040 -0
- invarlock/cli/config.py +396 -0
- invarlock/cli/constants.py +68 -0
- invarlock/cli/device.py +92 -0
- invarlock/cli/doctor_helpers.py +74 -0
- invarlock/cli/errors.py +6 -0
- invarlock/cli/overhead_utils.py +60 -0
- invarlock/cli/provenance.py +66 -0
- invarlock/cli/utils.py +41 -0
- invarlock/config.py +56 -0
- invarlock/core/__init__.py +62 -0
- invarlock/core/abi.py +15 -0
- invarlock/core/api.py +274 -0
- invarlock/core/auto_tuning.py +317 -0
- invarlock/core/bootstrap.py +226 -0
- invarlock/core/checkpoint.py +221 -0
- invarlock/core/contracts.py +73 -0
- invarlock/core/error_utils.py +64 -0
- invarlock/core/events.py +298 -0
- invarlock/core/exceptions.py +95 -0
- invarlock/core/registry.py +481 -0
- invarlock/core/retry.py +146 -0
- invarlock/core/runner.py +2041 -0
- invarlock/core/types.py +154 -0
- invarlock/edits/__init__.py +12 -0
- invarlock/edits/_edit_utils.py +249 -0
- invarlock/edits/_external_utils.py +268 -0
- invarlock/edits/noop.py +47 -0
- invarlock/edits/py.typed +1 -0
- invarlock/edits/quant_rtn.py +801 -0
- invarlock/edits/registry.py +166 -0
- invarlock/eval/__init__.py +23 -0
- invarlock/eval/bench.py +1207 -0
- invarlock/eval/bootstrap.py +50 -0
- invarlock/eval/data.py +2052 -0
- invarlock/eval/metrics.py +2167 -0
- invarlock/eval/primary_metric.py +767 -0
- invarlock/eval/probes/__init__.py +24 -0
- invarlock/eval/probes/fft.py +139 -0
- invarlock/eval/probes/mi.py +213 -0
- invarlock/eval/probes/post_attention.py +323 -0
- invarlock/eval/providers/base.py +67 -0
- invarlock/eval/providers/seq2seq.py +111 -0
- invarlock/eval/providers/text_lm.py +113 -0
- invarlock/eval/providers/vision_text.py +93 -0
- invarlock/eval/py.typed +1 -0
- invarlock/guards/__init__.py +18 -0
- invarlock/guards/_contracts.py +9 -0
- invarlock/guards/invariants.py +640 -0
- invarlock/guards/policies.py +805 -0
- invarlock/guards/py.typed +1 -0
- invarlock/guards/rmt.py +2097 -0
- invarlock/guards/spectral.py +1419 -0
- invarlock/guards/tier_config.py +354 -0
- invarlock/guards/variance.py +3298 -0
- invarlock/guards_ref/__init__.py +15 -0
- invarlock/guards_ref/rmt_ref.py +40 -0
- invarlock/guards_ref/spectral_ref.py +135 -0
- invarlock/guards_ref/variance_ref.py +60 -0
- invarlock/model_profile.py +353 -0
- invarlock/model_utils.py +221 -0
- invarlock/observability/__init__.py +10 -0
- invarlock/observability/alerting.py +535 -0
- invarlock/observability/core.py +546 -0
- invarlock/observability/exporters.py +565 -0
- invarlock/observability/health.py +588 -0
- invarlock/observability/metrics.py +457 -0
- invarlock/observability/py.typed +1 -0
- invarlock/observability/utils.py +553 -0
- invarlock/plugins/__init__.py +12 -0
- invarlock/plugins/hello_guard.py +33 -0
- invarlock/plugins/hf_awq_adapter.py +82 -0
- invarlock/plugins/hf_bnb_adapter.py +79 -0
- invarlock/plugins/hf_gptq_adapter.py +78 -0
- invarlock/plugins/py.typed +1 -0
- invarlock/py.typed +1 -0
- invarlock/reporting/__init__.py +7 -0
- invarlock/reporting/certificate.py +3221 -0
- invarlock/reporting/certificate_schema.py +244 -0
- invarlock/reporting/dataset_hashing.py +215 -0
- invarlock/reporting/guards_analysis.py +948 -0
- invarlock/reporting/html.py +32 -0
- invarlock/reporting/normalizer.py +235 -0
- invarlock/reporting/policy_utils.py +517 -0
- invarlock/reporting/primary_metric_utils.py +265 -0
- invarlock/reporting/render.py +1442 -0
- invarlock/reporting/report.py +903 -0
- invarlock/reporting/report_types.py +278 -0
- invarlock/reporting/utils.py +175 -0
- invarlock/reporting/validate.py +631 -0
- invarlock/security.py +176 -0
- invarlock/sparsity_utils.py +323 -0
- invarlock/utils/__init__.py +150 -0
- invarlock/utils/digest.py +45 -0
- invarlock-0.2.0.dist-info/METADATA +586 -0
- invarlock-0.2.0.dist-info/RECORD +132 -0
- invarlock-0.2.0.dist-info/WHEEL +5 -0
- invarlock-0.2.0.dist-info/entry_points.txt +20 -0
- invarlock-0.2.0.dist-info/licenses/LICENSE +201 -0
- invarlock-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
# Optional JSON Schema validation support (best-effort)
|
|
8
|
+
try: # pragma: no cover - exercised in integration
|
|
9
|
+
import jsonschema
|
|
10
|
+
except Exception: # pragma: no cover
|
|
11
|
+
jsonschema = None
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Certificate schema version (PM-first canonical).
# Used both as the `const` for `schema_version` in the JSON Schema below and
# for the explicit version comparison at the top of validate_certificate().
CERTIFICATE_SCHEMA_VERSION = "v1"


# Minimal JSON Schema describing the canonical shape of a certificate.
# This focuses on structural validity; numerical thresholds are validated
# separately in metric-specific logic.
#
# NOTE: this dict is mutated in place by validate_certificate(), which rewrites
# the `validation` sub-schema (see the comment on that entry below).
CERTIFICATE_JSON_SCHEMA: dict[str, Any] = {
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "title": "InvarLock Safety Certificate",
    "type": "object",
    # Top-level sections every certificate must carry.
    "required": [
        "schema_version",
        "run_id",
        "artifacts",
        "plugins",
        "meta",
        "dataset",
    ],
    "properties": {
        "schema_version": {"const": CERTIFICATE_SCHEMA_VERSION},
        "run_id": {"type": "string", "minLength": 4},
        "edit_name": {"type": "string"},
        "policy_digest": {
            "type": "object",
            "properties": {
                "policy_version": {"type": "string"},
                "tier_policy_name": {"type": "string"},
                "thresholds_hash": {"type": "string"},
                "hysteresis": {"type": "object"},
                "min_effective": {"type": "number"},
                "changed": {"type": "boolean"},
            },
            "additionalProperties": True,
        },
        "plugins": {
            "type": "object",
            "properties": {
                "adapters": {"type": "array"},
                "edits": {"type": "array"},
                "guards": {"type": "array"},
            },
            "additionalProperties": True,
        },
        "meta": {"type": "object"},
        "dataset": {
            "type": "object",
            "required": ["provider", "seq_len", "windows"],
            "properties": {
                "provider": {"type": "string"},
                # seq_len must be a positive integer (0 is rejected).
                "seq_len": {"type": "integer", "minimum": 1},
                "windows": {
                    "type": "object",
                    "required": ["preview", "final"],
                    "properties": {
                        "preview": {"type": "integer", "minimum": 0},
                        "final": {"type": "integer", "minimum": 0},
                        # Optional, but when present it must be an integer
                        # (a null seed does not satisfy this).
                        "seed": {"type": "integer"},
                    },
                },
            },
            "additionalProperties": True,
        },
        # ppl_* block removed from required schema; may appear for ppl-like tasks but is optional
        "primary_metric": {
            "type": "object",
            "properties": {
                "kind": {"type": "string"},
                "unit": {"type": "string"},
                "direction": {"type": "string"},
                "aggregation_scope": {"type": "string"},
                "paired": {"type": "boolean"},
                "gating_basis": {"type": "string"},
                "preview": {"type": "number"},
                "final": {"type": "number"},
                "ratio_vs_baseline": {"type": "number"},
                "reps": {"type": "number"},
                "ci_level": {"type": "number"},
                "counts_source": {"enum": ["measured", "pseudo_config"]},
                "estimated": {"type": "boolean"},
                # Confidence intervals are exactly two numbers.
                "ci": {
                    "type": "array",
                    "minItems": 2,
                    "maxItems": 2,
                    "items": {"type": "number"},
                },
                "display_ci": {
                    "type": "array",
                    "minItems": 2,
                    "maxItems": 2,
                    "items": {"type": "number"},
                },
            },
            "additionalProperties": True,
        },
        "system_overhead": {
            "type": "object",
            # Numeric keys must match these patterns when present; allow flexibility otherwise
            "patternProperties": {
                "^latency_ms_(p50|p95)$": {"type": "number"},
                "^throughput_.*$": {"type": "number"},
            },
            "additionalProperties": True,
        },
        "validation": {
            "type": "object",
            # As declared here the block is permissive: any key is accepted as
            # long as its value is a boolean. validate_certificate() attempts,
            # on each call, to replace `properties` with one boolean entry per
            # allow-listed key and to set `additionalProperties` to False, so
            # after a call unknown validation keys are rejected by the schema.
            "properties": {},
            "additionalProperties": {"type": "boolean"},
        },
        "artifacts": {"type": "object"},
        "provenance": {"type": "object"},
        "resolved_policy": {"type": "object"},
        "policy_provenance": {"type": "object"},
        "structure": {"type": "object"},
        "confidence": {
            "type": "object",
            "properties": {
                "label": {"enum": ["High", "Medium", "Low"]},
                "basis": {"type": "string"},
                "width": {"type": "number"},
                "threshold": {"type": "number"},
                "unstable": {"type": "boolean"},
            },
            "required": ["label", "basis"],
            "additionalProperties": True,
        },
    },
    # Unknown top-level sections are tolerated.
    "additionalProperties": True,
}
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
# Built-in allow-list of keys permitted inside a certificate's `validation`
# block. _load_validation_allowlist() returns a copy of this set whenever
# contracts/validation_keys.json cannot be found or parsed.
_VALIDATION_ALLOWLIST_DEFAULT = {
    "primary_metric_acceptable",
    "preview_final_drift_acceptable",
    "guard_overhead_acceptable",
    "invariants_pass",
    "spectral_stable",
    "rmt_stable",
    # Compatibility keys were removed; PM-only surface
    "hysteresis_applied",
    "moe_observed",
    "moe_identity_ok",
}
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _load_validation_allowlist() -> set[str]:
    """Return the set of keys allowed in a certificate's ``validation`` block.

    Looks for ``contracts/validation_keys.json`` three directory levels above
    this module's directory. When that file exists and holds a JSON list, its
    entries (stringified) are returned. In every other situation — file absent
    (e.g., installed wheel), payload not a list, or any error while resolving,
    reading or parsing — a fresh copy of the built-in default set is returned.
    """
    loaded: set[str] | None = None
    try:
        repo_root = Path(__file__).resolve().parents[3]
        keys_file = repo_root / "contracts" / "validation_keys.json"
        if keys_file.exists():
            payload = json.loads(keys_file.read_text(encoding="utf-8"))
            if isinstance(payload, list):
                loaded = {str(entry) for entry in payload}
    except Exception:
        # Best-effort lookup: any failure means "use the defaults".
        loaded = None

    if loaded is not None:
        return loaded
    return set(_VALIDATION_ALLOWLIST_DEFAULT)
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _validate_with_jsonschema(certificate: dict[str, Any]) -> bool:
    """Check ``certificate`` against CERTIFICATE_JSON_SCHEMA when possible.

    Returns True when the module-level ``jsonschema`` name is None (the
    optional import did not succeed), so callers rely on their own checks.
    Otherwise returns True if ``jsonschema.validate`` accepts the certificate
    and False if it raises anything.
    """
    if jsonschema is not None:
        try:
            jsonschema.validate(instance=certificate, schema=CERTIFICATE_JSON_SCHEMA)
        except Exception:
            # Any validation (or schema) error counts as "not valid".
            return False
    # Either validation passed or the schema library is unavailable.
    return True
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def validate_certificate(certificate: dict[str, Any]) -> bool:
    """Validate certificate structure and essential flags.

    The checks run in this order:

    1. ``certificate`` must be a dict whose ``schema_version`` equals
       ``CERTIFICATE_SCHEMA_VERSION``.
    2. The ``validation`` sub-schema of ``CERTIFICATE_JSON_SCHEMA`` is
       tightened in place (best-effort): its ``properties`` become one boolean
       entry per allow-listed key and ``additionalProperties`` becomes False.
       This mutates the module-level schema on every call.
    3. JSON Schema validation is attempted. If it fails, a lenient minimal
       check is applied instead: ``run_id`` must be a non-empty string and
       ``primary_metric`` must be a dict with a numeric ``final`` or a
       non-empty string ``kind``.
    4. When present, ``validation`` must be a dict, and each of the known
       gate flags found in it must be a real ``bool``.

    Args:
        certificate: Candidate certificate mapping.

    Returns:
        True when all checks pass, False otherwise. Non-dict input and a
        non-dict ``validation`` block yield False rather than an exception.
    """
    # `.get` below would raise AttributeError on non-mapping input, which the
    # except clause at the bottom does not cover; reject such input up front.
    if not isinstance(certificate, dict):
        return False

    try:
        if certificate.get("schema_version") != CERTIFICATE_SCHEMA_VERSION:
            return False

        # Prefer JSON Schema structural validation; if unavailable or too strict,
        # fall back to a lenient minimal check used by unit tests.
        # Tighten JSON Schema: populate validation.properties from allow-list and
        # disallow unknown validation keys at schema level.
        try:
            vkeys = _load_validation_allowlist()
            if isinstance(CERTIFICATE_JSON_SCHEMA.get("properties"), dict):
                vspec = CERTIFICATE_JSON_SCHEMA["properties"].get("validation")
                if isinstance(vspec, dict):
                    vspec["properties"] = {k: {"type": "boolean"} for k in vkeys}
                    vspec["additionalProperties"] = False
        except Exception:
            # Tightening is optional; validation continues with the schema as-is.
            pass

        if not _validate_with_jsonschema(certificate):
            # Minimal fallback: require schema version + run_id + primary_metric
            run_id = certificate.get("run_id")
            run_id_ok = isinstance(run_id, str) and bool(run_id)
            pm = certificate.get("primary_metric")
            pm_ok = isinstance(pm, dict) and (
                isinstance(pm.get("final"), int | float)
                or (isinstance(pm.get("kind"), str) and bool(pm.get("kind")))
            )
            if not (run_id_ok and pm_ok):
                return False

        validation = certificate.get("validation", {})
        # A list/str `validation` could satisfy `flag in validation` and then
        # fail on `.get`; anything that is not a mapping is simply invalid.
        if not isinstance(validation, dict):
            return False
        for flag in (
            "preview_final_drift_acceptable",
            "primary_metric_acceptable",
            "invariants_pass",
            "spectral_stable",
            "rmt_stable",
            "guard_overhead_acceptable",
        ):
            # If present, must be boolean; tolerate missing opt-in flags
            if flag in validation and not isinstance(validation[flag], bool):
                return False

        return True
    except (KeyError, TypeError, ValueError):
        return False
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
# Public surface of this module: the schema version, the schema dict and the
# validator. The allow-list loader and the jsonschema wrapper stay private.
__all__ = [
    "CERTIFICATE_SCHEMA_VERSION",
    "CERTIFICATE_JSON_SCHEMA",
    "validate_certificate",
]
|
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from ..eval.data import EvaluationWindow, compute_window_hash
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def compute_window_hashes(
    preview_window: EvaluationWindow, final_window: EvaluationWindow
) -> dict[str, str | int]:
    """Hash the preview and final evaluation windows for dataset integrity.

    Each window is passed to ``compute_window_hash`` with ``include_data=True``
    and the result is reported under a ``sha256:`` prefix.

    Returns:
        A mapping with ``"preview"`` and ``"final"`` hash strings plus
        ``"total_tokens"``, the summed length of every entry in both
        windows' ``input_ids``.
    """
    preview_digest = compute_window_hash(preview_window, include_data=True)
    final_digest = compute_window_hash(final_window, include_data=True)

    # Count tokens across the concatenation of both windows' sequences.
    all_sequences = preview_window.input_ids + final_window.input_ids
    token_total = sum(len(sequence) for sequence in all_sequences)

    hashes: dict[str, str | int] = {
        "preview": f"sha256:{preview_digest}",
        "final": f"sha256:{final_digest}",
        "total_tokens": token_total,
    }
    return hashes
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _compute_actual_window_hashes(report: dict[str, Any]) -> dict[str, Any]:
|
|
27
|
+
"""Compute actual window hashes from explicit token IDs when available.
|
|
28
|
+
|
|
29
|
+
Expects `report["evaluation_windows"]["preview"|"final"]["input_ids"]` to be present.
|
|
30
|
+
Returns dict with per-split sha256 hashes and token counts when computable; otherwise
|
|
31
|
+
returns an empty dict to signal fallback to config-based hashing.
|
|
32
|
+
"""
|
|
33
|
+
try:
|
|
34
|
+
# Prefer explicit preview/final hashes included in the report data
|
|
35
|
+
data_config = report.get("data", {}) if isinstance(report, dict) else {}
|
|
36
|
+
if (
|
|
37
|
+
isinstance(data_config, dict)
|
|
38
|
+
and data_config.get("preview_hash")
|
|
39
|
+
and data_config.get("final_hash")
|
|
40
|
+
):
|
|
41
|
+
preview_tokens_ct = data_config.get("preview_total_tokens")
|
|
42
|
+
final_tokens_ct = data_config.get("final_total_tokens")
|
|
43
|
+
total_tokens = 0
|
|
44
|
+
if isinstance(preview_tokens_ct, int):
|
|
45
|
+
total_tokens += preview_tokens_ct
|
|
46
|
+
if isinstance(final_tokens_ct, int):
|
|
47
|
+
total_tokens += final_tokens_ct
|
|
48
|
+
return {
|
|
49
|
+
"preview": f"blake2s:{data_config.get('preview_hash')}",
|
|
50
|
+
"final": f"blake2s:{data_config.get('final_hash')}",
|
|
51
|
+
"dataset": data_config.get("dataset_hash"),
|
|
52
|
+
"total_tokens": total_tokens,
|
|
53
|
+
"preview_tokens": preview_tokens_ct,
|
|
54
|
+
"final_tokens": final_tokens_ct,
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
windows = report.get("evaluation_windows", {})
|
|
58
|
+
if not isinstance(windows, dict):
|
|
59
|
+
windows = {}
|
|
60
|
+
preview_ids = (
|
|
61
|
+
windows.get("preview", {}).get("input_ids")
|
|
62
|
+
if windows.get("preview")
|
|
63
|
+
else None
|
|
64
|
+
)
|
|
65
|
+
final_ids = (
|
|
66
|
+
windows.get("final", {}).get("input_ids") if windows.get("final") else None
|
|
67
|
+
)
|
|
68
|
+
if not (isinstance(preview_ids, list) and isinstance(final_ids, list)):
|
|
69
|
+
# Config-based fallback (stable sha256 of config tuple)
|
|
70
|
+
import hashlib as _hashlib
|
|
71
|
+
|
|
72
|
+
data_config = report.get("data", {}) if isinstance(report, dict) else {}
|
|
73
|
+
dataset = str(data_config.get("dataset", "unknown"))
|
|
74
|
+
split = str(
|
|
75
|
+
data_config.get("split", data_config.get("dataset_split", "val"))
|
|
76
|
+
)
|
|
77
|
+
seed = (
|
|
78
|
+
report.get("meta", {}).get("seed", 0)
|
|
79
|
+
if isinstance(report.get("meta"), dict)
|
|
80
|
+
else 0
|
|
81
|
+
)
|
|
82
|
+
preview_n = int(data_config.get("preview_n", 0) or 0)
|
|
83
|
+
final_n = int(data_config.get("final_n", 0) or 0)
|
|
84
|
+
seq_len = int(data_config.get("seq_len", 0) or 0)
|
|
85
|
+
config_str = f"{dataset}{split}{seq_len}{preview_n}{final_n}{seed}"
|
|
86
|
+
digest = _hashlib.sha256(config_str.encode()).hexdigest()
|
|
87
|
+
preview_tokens = preview_n * seq_len if preview_n and seq_len else 0
|
|
88
|
+
final_tokens = final_n * seq_len if final_n and seq_len else 0
|
|
89
|
+
return {
|
|
90
|
+
"preview": f"sha256:{digest[:32]}",
|
|
91
|
+
"final": f"sha256:{digest[32:64] if len(digest) >= 64 else digest[:32]}",
|
|
92
|
+
"dataset": data_config.get("provider_hash")
|
|
93
|
+
or data_config.get("tokenizer_hash"),
|
|
94
|
+
"preview_tokens": preview_tokens,
|
|
95
|
+
"final_tokens": final_tokens,
|
|
96
|
+
"total_tokens": preview_tokens + final_tokens,
|
|
97
|
+
}
|
|
98
|
+
# Compute hashes directly from token ID sequences for robustness
|
|
99
|
+
import hashlib as _hashlib
|
|
100
|
+
|
|
101
|
+
def _hash_sequences(seqs: list[list[int]]) -> str:
|
|
102
|
+
h = _hashlib.sha256()
|
|
103
|
+
for seq in seqs:
|
|
104
|
+
try:
|
|
105
|
+
h.update(str(list(seq)).encode("utf-8"))
|
|
106
|
+
except Exception:
|
|
107
|
+
continue
|
|
108
|
+
return h.hexdigest()
|
|
109
|
+
|
|
110
|
+
preview_hash = _hash_sequences(preview_ids)
|
|
111
|
+
final_hash = _hash_sequences(final_ids)
|
|
112
|
+
preview_tokens = sum(len(s) for s in preview_ids)
|
|
113
|
+
final_tokens = sum(len(s) for s in final_ids)
|
|
114
|
+
return {
|
|
115
|
+
"preview": f"sha256:{preview_hash}",
|
|
116
|
+
"final": f"sha256:{final_hash}",
|
|
117
|
+
"preview_tokens": preview_tokens,
|
|
118
|
+
"final_tokens": final_tokens,
|
|
119
|
+
"dataset": None,
|
|
120
|
+
"total_tokens": preview_tokens + final_tokens,
|
|
121
|
+
}
|
|
122
|
+
except Exception:
|
|
123
|
+
# Signal caller to use config-based fallback
|
|
124
|
+
return {}
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _extract_dataset_info(report: dict[str, Any]) -> dict[str, Any]:
    """Extract dataset configuration and compute window hashes.

    Tolerates a missing (or non-dict) ``data`` section by falling back to the
    number of ``window_ids`` in ``evaluation_windows`` for the preview/final
    window counts and to placeholder values for non-essential fields.

    Window hashes come from ``_compute_actual_window_hashes(report)``. If that
    returns an empty dict, a config-based SHA-256 digest is produced here
    instead; that digest uses only the counts configured in ``data`` and does
    not alter the window counts reported under ``"windows"``.

    Args:
        report: Run report; ``data``, ``evaluation_windows`` and ``meta`` are
            read when they are dicts.

    Returns:
        A dict with ``provider``, ``split``, ``seq_len``, ``stride``,
        ``windows`` (``preview``/``final``/``seed``), ``hash`` and
        ``tokenizer``.
    """

    def _as_dict(value: Any) -> dict[str, Any]:
        # Anything that is not a mapping is treated as an absent section.
        return value if isinstance(value, dict) else {}

    def _len_ids(sec: dict[str, Any]) -> int:
        ids = sec.get("window_ids")
        return len(ids) if isinstance(ids, list) else 0

    report_map = _as_dict(report)
    data_config = _as_dict(report_map.get("data"))
    eval_windows = _as_dict(report_map.get("evaluation_windows"))
    preview_section = _as_dict(eval_windows.get("preview"))
    final_section = _as_dict(eval_windows.get("final"))

    # Counts as configured in `data`; these alone feed the fallback digest.
    cfg_preview_n = int(data_config.get("preview_n", 0) or 0)
    cfg_final_n = int(data_config.get("final_n", 0) or 0)

    # Reported counts fall back to the number of recorded window IDs.
    preview_n = cfg_preview_n or _len_ids(preview_section)
    final_n = cfg_final_n or _len_ids(final_section)
    seq_len = int(data_config.get("seq_len", 0) or 0)
    stride = int(data_config.get("stride", 0) or 0)

    dataset = str(data_config.get("dataset", "unknown"))
    split = str(data_config.get("split", data_config.get("dataset_split", "val")))

    # Prefer actual window hashes when explicit token IDs are present
    actual_hashes = _compute_actual_window_hashes(report)
    if actual_hashes:
        window_hash = actual_hashes
    else:
        # Config-based fallback: produce a stable sha256 of config tuple
        import hashlib as _hashlib

        meta = report_map.get("meta")
        seed = meta.get("seed", 0) if isinstance(meta, dict) else 0
        config_str = f"{dataset}{split}{seq_len}{cfg_preview_n}{cfg_final_n}{seed}"
        # A SHA-256 hex digest is always 64 characters: two 32-char halves.
        digest = _hashlib.sha256(config_str.encode()).hexdigest()
        preview_tokens = cfg_preview_n * seq_len if cfg_preview_n and seq_len else 0
        final_tokens = cfg_final_n * seq_len if cfg_final_n and seq_len else 0
        window_hash = {
            "preview": f"sha256:{digest[:32]}",
            "final": f"sha256:{digest[32:64]}",
            "dataset": data_config.get("provider_hash")
            or data_config.get("tokenizer_hash"),
            "preview_tokens": preview_tokens,
            "final_tokens": final_tokens,
            "total_tokens": preview_tokens + final_tokens,
        }

    tokenizer_info = {
        "name": data_config.get("tokenizer_name", "unknown"),
        "hash": data_config.get("tokenizer_hash"),
        "vocab_size": data_config.get("vocab_size"),
        "bos_token": data_config.get("bos_token"),
        "eos_token": data_config.get("eos_token"),
        "pad_token": data_config.get("pad_token"),
        "add_prefix_space": data_config.get("add_prefix_space"),
    }

    return {
        "provider": dataset,
        "split": split,
        "seq_len": seq_len,
        "stride": stride,
        "windows": {
            "preview": preview_n,
            "final": final_n,
            "seed": data_config.get("seed"),
        },
        "hash": window_hash,
        "tokenizer": tokenizer_info,
    }
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
# Names exported by `from ... import *`. The two underscore-prefixed helpers
# are listed explicitly, so they are exported despite their private naming.
__all__ = [
    "compute_window_hashes",
    "_compute_actual_window_hashes",
    "_extract_dataset_info",
]
|