invarlock 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. invarlock/__init__.py +33 -0
  2. invarlock/__main__.py +10 -0
  3. invarlock/_data/runtime/profiles/ci_cpu.yaml +15 -0
  4. invarlock/_data/runtime/profiles/release.yaml +23 -0
  5. invarlock/_data/runtime/tiers.yaml +76 -0
  6. invarlock/adapters/__init__.py +102 -0
  7. invarlock/adapters/_capabilities.py +45 -0
  8. invarlock/adapters/auto.py +99 -0
  9. invarlock/adapters/base.py +530 -0
  10. invarlock/adapters/base_types.py +85 -0
  11. invarlock/adapters/hf_bert.py +852 -0
  12. invarlock/adapters/hf_gpt2.py +403 -0
  13. invarlock/adapters/hf_llama.py +485 -0
  14. invarlock/adapters/hf_mixin.py +383 -0
  15. invarlock/adapters/hf_onnx.py +112 -0
  16. invarlock/adapters/hf_t5.py +137 -0
  17. invarlock/adapters/py.typed +1 -0
  18. invarlock/assurance/__init__.py +43 -0
  19. invarlock/cli/__init__.py +8 -0
  20. invarlock/cli/__main__.py +8 -0
  21. invarlock/cli/_evidence.py +25 -0
  22. invarlock/cli/_json.py +75 -0
  23. invarlock/cli/adapter_auto.py +162 -0
  24. invarlock/cli/app.py +287 -0
  25. invarlock/cli/commands/__init__.py +26 -0
  26. invarlock/cli/commands/certify.py +403 -0
  27. invarlock/cli/commands/doctor.py +1358 -0
  28. invarlock/cli/commands/explain_gates.py +151 -0
  29. invarlock/cli/commands/export_html.py +100 -0
  30. invarlock/cli/commands/plugins.py +1331 -0
  31. invarlock/cli/commands/report.py +354 -0
  32. invarlock/cli/commands/run.py +4146 -0
  33. invarlock/cli/commands/verify.py +1040 -0
  34. invarlock/cli/config.py +396 -0
  35. invarlock/cli/constants.py +68 -0
  36. invarlock/cli/device.py +92 -0
  37. invarlock/cli/doctor_helpers.py +74 -0
  38. invarlock/cli/errors.py +6 -0
  39. invarlock/cli/overhead_utils.py +60 -0
  40. invarlock/cli/provenance.py +66 -0
  41. invarlock/cli/utils.py +41 -0
  42. invarlock/config.py +56 -0
  43. invarlock/core/__init__.py +62 -0
  44. invarlock/core/abi.py +15 -0
  45. invarlock/core/api.py +274 -0
  46. invarlock/core/auto_tuning.py +317 -0
  47. invarlock/core/bootstrap.py +226 -0
  48. invarlock/core/checkpoint.py +221 -0
  49. invarlock/core/contracts.py +73 -0
  50. invarlock/core/error_utils.py +64 -0
  51. invarlock/core/events.py +298 -0
  52. invarlock/core/exceptions.py +95 -0
  53. invarlock/core/registry.py +481 -0
  54. invarlock/core/retry.py +146 -0
  55. invarlock/core/runner.py +2041 -0
  56. invarlock/core/types.py +154 -0
  57. invarlock/edits/__init__.py +12 -0
  58. invarlock/edits/_edit_utils.py +249 -0
  59. invarlock/edits/_external_utils.py +268 -0
  60. invarlock/edits/noop.py +47 -0
  61. invarlock/edits/py.typed +1 -0
  62. invarlock/edits/quant_rtn.py +801 -0
  63. invarlock/edits/registry.py +166 -0
  64. invarlock/eval/__init__.py +23 -0
  65. invarlock/eval/bench.py +1207 -0
  66. invarlock/eval/bootstrap.py +50 -0
  67. invarlock/eval/data.py +2052 -0
  68. invarlock/eval/metrics.py +2167 -0
  69. invarlock/eval/primary_metric.py +767 -0
  70. invarlock/eval/probes/__init__.py +24 -0
  71. invarlock/eval/probes/fft.py +139 -0
  72. invarlock/eval/probes/mi.py +213 -0
  73. invarlock/eval/probes/post_attention.py +323 -0
  74. invarlock/eval/providers/base.py +67 -0
  75. invarlock/eval/providers/seq2seq.py +111 -0
  76. invarlock/eval/providers/text_lm.py +113 -0
  77. invarlock/eval/providers/vision_text.py +93 -0
  78. invarlock/eval/py.typed +1 -0
  79. invarlock/guards/__init__.py +18 -0
  80. invarlock/guards/_contracts.py +9 -0
  81. invarlock/guards/invariants.py +640 -0
  82. invarlock/guards/policies.py +805 -0
  83. invarlock/guards/py.typed +1 -0
  84. invarlock/guards/rmt.py +2097 -0
  85. invarlock/guards/spectral.py +1419 -0
  86. invarlock/guards/tier_config.py +354 -0
  87. invarlock/guards/variance.py +3298 -0
  88. invarlock/guards_ref/__init__.py +15 -0
  89. invarlock/guards_ref/rmt_ref.py +40 -0
  90. invarlock/guards_ref/spectral_ref.py +135 -0
  91. invarlock/guards_ref/variance_ref.py +60 -0
  92. invarlock/model_profile.py +353 -0
  93. invarlock/model_utils.py +221 -0
  94. invarlock/observability/__init__.py +10 -0
  95. invarlock/observability/alerting.py +535 -0
  96. invarlock/observability/core.py +546 -0
  97. invarlock/observability/exporters.py +565 -0
  98. invarlock/observability/health.py +588 -0
  99. invarlock/observability/metrics.py +457 -0
  100. invarlock/observability/py.typed +1 -0
  101. invarlock/observability/utils.py +553 -0
  102. invarlock/plugins/__init__.py +12 -0
  103. invarlock/plugins/hello_guard.py +33 -0
  104. invarlock/plugins/hf_awq_adapter.py +82 -0
  105. invarlock/plugins/hf_bnb_adapter.py +79 -0
  106. invarlock/plugins/hf_gptq_adapter.py +78 -0
  107. invarlock/plugins/py.typed +1 -0
  108. invarlock/py.typed +1 -0
  109. invarlock/reporting/__init__.py +7 -0
  110. invarlock/reporting/certificate.py +3221 -0
  111. invarlock/reporting/certificate_schema.py +244 -0
  112. invarlock/reporting/dataset_hashing.py +215 -0
  113. invarlock/reporting/guards_analysis.py +948 -0
  114. invarlock/reporting/html.py +32 -0
  115. invarlock/reporting/normalizer.py +235 -0
  116. invarlock/reporting/policy_utils.py +517 -0
  117. invarlock/reporting/primary_metric_utils.py +265 -0
  118. invarlock/reporting/render.py +1442 -0
  119. invarlock/reporting/report.py +903 -0
  120. invarlock/reporting/report_types.py +278 -0
  121. invarlock/reporting/utils.py +175 -0
  122. invarlock/reporting/validate.py +631 -0
  123. invarlock/security.py +176 -0
  124. invarlock/sparsity_utils.py +323 -0
  125. invarlock/utils/__init__.py +150 -0
  126. invarlock/utils/digest.py +45 -0
  127. invarlock-0.2.0.dist-info/METADATA +586 -0
  128. invarlock-0.2.0.dist-info/RECORD +132 -0
  129. invarlock-0.2.0.dist-info/WHEEL +5 -0
  130. invarlock-0.2.0.dist-info/entry_points.txt +20 -0
  131. invarlock-0.2.0.dist-info/licenses/LICENSE +201 -0
  132. invarlock-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,244 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+ # Optional JSON Schema validation support (best-effort)
8
+ try: # pragma: no cover - exercised in integration
9
+ import jsonschema
10
+ except Exception: # pragma: no cover
11
+ jsonschema = None
12
+
13
+
14
# Certificate schema version (PM-first canonical)
# Used as the `const` for `schema_version` in the schema below and compared
# directly by validate_certificate().
CERTIFICATE_SCHEMA_VERSION = "v1"


# Minimal JSON Schema describing the canonical shape of a certificate.
# This focuses on structural validity; numerical thresholds are validated
# separately in metric-specific logic.
# Every object level sets `additionalProperties: True` except `validation`,
# so unknown keys are tolerated everywhere else.
CERTIFICATE_JSON_SCHEMA: dict[str, Any] = {
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "title": "InvarLock Safety Certificate",
    "type": "object",
    # Top-level keys every certificate must carry.
    "required": [
        "schema_version",
        "run_id",
        "artifacts",
        "plugins",
        "meta",
        "dataset",
    ],
    "properties": {
        "schema_version": {"const": CERTIFICATE_SCHEMA_VERSION},
        "run_id": {"type": "string", "minLength": 4},
        "edit_name": {"type": "string"},
        "policy_digest": {
            "type": "object",
            "properties": {
                "policy_version": {"type": "string"},
                "tier_policy_name": {"type": "string"},
                "thresholds_hash": {"type": "string"},
                "hysteresis": {"type": "object"},
                "min_effective": {"type": "number"},
                "changed": {"type": "boolean"},
            },
            "additionalProperties": True,
        },
        "plugins": {
            "type": "object",
            "properties": {
                "adapters": {"type": "array"},
                "edits": {"type": "array"},
                "guards": {"type": "array"},
            },
            "additionalProperties": True,
        },
        "meta": {"type": "object"},
        "dataset": {
            "type": "object",
            "required": ["provider", "seq_len", "windows"],
            "properties": {
                "provider": {"type": "string"},
                "seq_len": {"type": "integer", "minimum": 1},
                "windows": {
                    "type": "object",
                    # Window counts are mandatory; the seed is optional.
                    "required": ["preview", "final"],
                    "properties": {
                        "preview": {"type": "integer", "minimum": 0},
                        "final": {"type": "integer", "minimum": 0},
                        "seed": {"type": "integer"},
                    },
                },
            },
            "additionalProperties": True,
        },
        # ppl_* block removed from required schema; may appear for ppl-like tasks but is optional
        "primary_metric": {
            "type": "object",
            "properties": {
                "kind": {"type": "string"},
                "unit": {"type": "string"},
                "direction": {"type": "string"},
                "aggregation_scope": {"type": "string"},
                "paired": {"type": "boolean"},
                "gating_basis": {"type": "string"},
                "preview": {"type": "number"},
                "final": {"type": "number"},
                "ratio_vs_baseline": {"type": "number"},
                "reps": {"type": "number"},
                "ci_level": {"type": "number"},
                "counts_source": {"enum": ["measured", "pseudo_config"]},
                "estimated": {"type": "boolean"},
                # Both interval fields are exactly two numbers.
                "ci": {
                    "type": "array",
                    "minItems": 2,
                    "maxItems": 2,
                    "items": {"type": "number"},
                },
                "display_ci": {
                    "type": "array",
                    "minItems": 2,
                    "maxItems": 2,
                    "items": {"type": "number"},
                },
            },
            "additionalProperties": True,
        },
        "system_overhead": {
            "type": "object",
            # Numeric keys must match these patterns when present; allow flexibility otherwise
            "patternProperties": {
                "^latency_ms_(p50|p95)$": {"type": "number"},
                "^throughput_.*$": {"type": "number"},
            },
            "additionalProperties": True,
        },
        "validation": {
            "type": "object",
            # Permissive default: any key is accepted as long as its value is a
            # boolean. validate_certificate() overwrites `properties` with the
            # allow-listed keys and sets `additionalProperties` to False each
            # time it runs, so this dict is mutated at call time (not at import).
            "properties": {},
            "additionalProperties": {"type": "boolean"},
        },
        "artifacts": {"type": "object"},
        "provenance": {"type": "object"},
        "resolved_policy": {"type": "object"},
        "policy_provenance": {"type": "object"},
        "structure": {"type": "object"},
        "confidence": {
            "type": "object",
            "properties": {
                "label": {"enum": ["High", "Medium", "Low"]},
                "basis": {"type": "string"},
                "width": {"type": "number"},
                "threshold": {"type": "number"},
                "unstable": {"type": "boolean"},
            },
            "required": ["label", "basis"],
            "additionalProperties": True,
        },
    },
    "additionalProperties": True,
}
144
+
145
+
146
# Built-in names of the flags permitted inside a certificate's `validation`
# block. _load_validation_allowlist() returns a copy of this set whenever it
# cannot load a list from contracts/validation_keys.json.
_VALIDATION_ALLOWLIST_DEFAULT = {
    "primary_metric_acceptable",
    "preview_final_drift_acceptable",
    "guard_overhead_acceptable",
    "invariants_pass",
    "spectral_stable",
    "rmt_stable",
    # Compatibility keys were removed; PM-only surface
    "hysteresis_applied",
    "moe_observed",
    "moe_identity_ok",
}
158
+
159
+
160
def _load_validation_allowlist() -> set[str]:
    """Return the names allowed as keys of a certificate's ``validation`` block.

    The list is read from ``contracts/validation_keys.json`` located under
    ``Path(__file__).resolve().parents[3]``. The file is used only when it
    exists and its JSON payload is a list; every entry is converted with
    ``str``.

    In every other situation (no such file, a payload that is not a list, or
    any exception while resolving, reading or parsing) a fresh copy of
    ``_VALIDATION_ALLOWLIST_DEFAULT`` is returned.
    """
    try:
        contracts_file = (
            Path(__file__)
            .resolve()
            .parents[3]
            .joinpath("contracts", "validation_keys.json")
        )
        payload = (
            json.loads(contracts_file.read_text(encoding="utf-8"))
            if contracts_file.exists()
            else None
        )
        if isinstance(payload, list):
            return set(map(str, payload))
    except Exception:
        # Best-effort lookup: any failure falls through to the built-in set.
        pass
    return set(_VALIDATION_ALLOWLIST_DEFAULT)
176
+
177
+
178
def _validate_with_jsonschema(certificate: dict[str, Any]) -> bool:
    """Check ``certificate`` against ``CERTIFICATE_JSON_SCHEMA``.

    Returns ``True`` when validation succeeds, and also when the optional
    ``jsonschema`` package could not be imported (the caller then relies on
    its own lighter checks). Returns ``False`` if ``jsonschema.validate``
    raises anything.
    """
    if jsonschema is not None:
        try:
            jsonschema.validate(instance=certificate, schema=CERTIFICATE_JSON_SCHEMA)
        except Exception:
            return False
    # Reached both after a successful validation and when the library is absent.
    return True
187
+
188
+
189
def validate_certificate(certificate: dict[str, Any]) -> bool:
    """Validate certificate structure and essential flags.

    The checks run in this order:

    1. ``certificate`` must be a dict.
    2. ``schema_version`` must equal ``CERTIFICATE_SCHEMA_VERSION``.
    3. The ``validation`` part of the module-level ``CERTIFICATE_JSON_SCHEMA``
       is tightened in place to the allow-listed keys (best-effort; note that
       this mutates the shared schema dict on every call).
    4. JSON Schema validation. When it fails, a lenient fallback still accepts
       the certificate if ``run_id`` is a non-empty string and
       ``primary_metric`` is a dict with a numeric ``final`` or a non-empty
       string ``kind``.
    5. ``validation`` (when present) must be a dict, and each of the six core
       flags that appears in it must be a ``bool``.

    Args:
        certificate: Parsed certificate payload.

    Returns:
        ``True`` when every check passes, ``False`` otherwise. Malformed input
        (including a non-dict ``certificate`` or ``validation``) yields
        ``False`` rather than an exception.
    """
    # A non-mapping can never be a certificate. Checking first avoids the
    # AttributeError that ``.get`` would raise on e.g. ``None`` or a list,
    # which the handler at the bottom does not cover.
    if not isinstance(certificate, dict):
        return False

    try:
        if certificate.get("schema_version") != CERTIFICATE_SCHEMA_VERSION:
            return False

        # Prefer JSON Schema structural validation; if unavailable or too strict,
        # fall back to a lenient minimal check used by unit tests.
        # Tighten JSON Schema: populate validation.properties from allow-list and
        # disallow unknown validation keys at schema level.
        try:
            vkeys = _load_validation_allowlist()
            if isinstance(CERTIFICATE_JSON_SCHEMA.get("properties"), dict):
                vspec = CERTIFICATE_JSON_SCHEMA["properties"].get("validation")
                if isinstance(vspec, dict):
                    vspec["properties"] = {k: {"type": "boolean"} for k in vkeys}
                    vspec["additionalProperties"] = False
        except Exception:
            # Tightening is optional; keep whatever schema is currently in place.
            pass

        if not _validate_with_jsonschema(certificate):
            # Minimal fallback: require schema version + run_id + primary_metric
            run_id = certificate.get("run_id")
            run_id_ok = isinstance(run_id, str) and bool(run_id)
            pm = certificate.get("primary_metric")
            pm_ok = isinstance(pm, dict) and (
                isinstance(pm.get("final"), int | float)
                or (isinstance(pm.get("kind"), str) and bool(pm.get("kind")))
            )
            if not (run_id_ok and pm_ok):
                return False

        validation = certificate.get("validation", {})
        # The schema declares ``validation`` as an object. A list holding a
        # flag name used to reach ``validation.get`` and raise AttributeError.
        if not isinstance(validation, dict):
            return False
        for flag in (
            "preview_final_drift_acceptable",
            "primary_metric_acceptable",
            "invariants_pass",
            "spectral_stable",
            "rmt_stable",
            "guard_overhead_acceptable",
        ):
            # If present, must be boolean; tolerate missing opt-in flags
            if flag in validation and not isinstance(validation[flag], bool):
                return False

        return True
    except (KeyError, TypeError, ValueError):
        return False
238
+
239
+
240
# Names exported by `from ... import *`; the underscore-prefixed helpers in
# this module are intentionally left out.
__all__ = [
    "CERTIFICATE_SCHEMA_VERSION",
    "CERTIFICATE_JSON_SCHEMA",
    "validate_certificate",
]
@@ -0,0 +1,215 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from ..eval.data import EvaluationWindow, compute_window_hash
6
+
7
+
8
def compute_window_hashes(
    preview_window: EvaluationWindow, final_window: EvaluationWindow
) -> dict[str, str | int]:
    """Hash the preview and final evaluation windows for dataset integrity.

    Each window is hashed with ``compute_window_hash(..., include_data=True)``
    and the result is reported with a ``sha256:`` prefix.

    Returns:
        A mapping with ``preview`` and ``final`` hash strings and
        ``total_tokens``, the summed length of every entry in the two
        windows' concatenated ``input_ids``.
    """
    preview_digest = compute_window_hash(preview_window, include_data=True)
    final_digest = compute_window_hash(final_window, include_data=True)

    all_sequences = preview_window.input_ids + final_window.input_ids
    token_total = sum(map(len, all_sequences))

    result: dict[str, str | int] = {}
    result["preview"] = f"sha256:{preview_digest}"
    result["final"] = f"sha256:{final_digest}"
    result["total_tokens"] = token_total
    return result
24
+
25
+
26
def _compute_actual_window_hashes(report: dict[str, Any]) -> dict[str, Any]:
    """Derive preview/final window hashes and token counts from a run report.

    Sources are tried in this order:

    1. ``report["data"]["preview_hash"]`` and ``["final_hash"]`` when both are
       truthy: they are passed through with a ``blake2s:`` prefix, together
       with ``dataset_hash`` and the recorded ``*_total_tokens`` values (only
       integer counts contribute to ``total_tokens``).
    2. ``report["evaluation_windows"]["preview"|"final"]["input_ids"]`` when
       both are lists: each split is hashed with SHA-256 over the string form
       of its token sequences and token counts are summed from the sequences.
    3. Otherwise, a SHA-256 over the concatenated dataset, split, seq_len,
       preview_n, final_n and seed values; its first and second 32 hex
       characters become the preview and final entries.

    Returns:
        A dict with ``preview``, ``final``, ``dataset``, ``preview_tokens``,
        ``final_tokens`` and ``total_tokens``. An empty dict is returned only
        when an exception occurs while building the result.
    """
    import hashlib as _hashlib

    def _digest_sequences(sequences: list[list[int]]) -> str:
        hasher = _hashlib.sha256()
        for tokens in sequences:
            try:
                hasher.update(str(list(tokens)).encode("utf-8"))
            except Exception:
                # Entries that cannot be listed are left out of the digest.
                continue
        return hasher.hexdigest()

    try:
        cfg = report.get("data", {}) if isinstance(report, dict) else {}

        # 1) Hashes already recorded in the report take precedence.
        if (
            isinstance(cfg, dict)
            and cfg.get("preview_hash")
            and cfg.get("final_hash")
        ):
            recorded_preview = cfg.get("preview_total_tokens")
            recorded_final = cfg.get("final_total_tokens")
            return {
                "preview": f"blake2s:{cfg.get('preview_hash')}",
                "final": f"blake2s:{cfg.get('final_hash')}",
                "dataset": cfg.get("dataset_hash"),
                "total_tokens": sum(
                    count
                    for count in (recorded_preview, recorded_final)
                    if isinstance(count, int)
                ),
                "preview_tokens": recorded_preview,
                "final_tokens": recorded_final,
            }

        sections = report.get("evaluation_windows", {})
        if not isinstance(sections, dict):
            sections = {}

        def _ids_for(split_name: str) -> Any:
            section = sections.get(split_name)
            return section.get("input_ids") if section else None

        preview_ids = _ids_for("preview")
        final_ids = _ids_for("final")

        # 2) Explicit token IDs for both splits: hash the sequences directly.
        if isinstance(preview_ids, list) and isinstance(final_ids, list):
            preview_digest = _digest_sequences(preview_ids)
            final_digest = _digest_sequences(final_ids)
            preview_count = sum(len(seq) for seq in preview_ids)
            final_count = sum(len(seq) for seq in final_ids)
            return {
                "preview": f"sha256:{preview_digest}",
                "final": f"sha256:{final_digest}",
                "preview_tokens": preview_count,
                "final_tokens": final_count,
                "dataset": None,
                "total_tokens": preview_count + final_count,
            }

        # 3) Config-based fallback (stable sha256 of config tuple).
        dataset_name = str(cfg.get("dataset", "unknown"))
        split_label = str(cfg.get("split", cfg.get("dataset_split", "val")))
        meta = report.get("meta")
        seed = meta.get("seed", 0) if isinstance(meta, dict) else 0
        n_preview = int(cfg.get("preview_n", 0) or 0)
        n_final = int(cfg.get("final_n", 0) or 0)
        window_len = int(cfg.get("seq_len", 0) or 0)

        fingerprint = (
            f"{dataset_name}{split_label}{window_len}{n_preview}{n_final}{seed}"
        )
        digest = _hashlib.sha256(fingerprint.encode()).hexdigest()
        # Integer products are already zero when either factor is zero.
        preview_count = n_preview * window_len
        final_count = n_final * window_len
        return {
            "preview": f"sha256:{digest[:32]}",
            "final": f"sha256:{digest[32:64]}",
            "dataset": cfg.get("provider_hash") or cfg.get("tokenizer_hash"),
            "preview_tokens": preview_count,
            "final_tokens": final_count,
            "total_tokens": preview_count + final_count,
        }
    except Exception:
        # Nothing could be derived from this report.
        return {}
125
+
126
+
127
def _extract_dataset_info(report: dict[str, Any]) -> dict[str, Any]:
    """Extract dataset configuration and compute window hashes.

    Window counts come from ``report["data"]["preview_n"|"final_n"]`` and fall
    back to the length of ``evaluation_windows[split]["window_ids"]`` when the
    configured count is missing or zero. A ``report``, ``data``,
    ``evaluation_windows`` or per-split section that is not a dict is treated
    as absent instead of raising.

    The ``hash`` entry is whatever ``_compute_actual_window_hashes(report)``
    returns when that is non-empty. Otherwise it is a SHA-256 digest of the
    configured dataset/split/seq_len/preview_n/final_n/seed values.

    Args:
        report: Run report; only ``data``, ``evaluation_windows`` and ``meta``
            are read here.

    Returns:
        A dict with keys ``provider``, ``split``, ``seq_len``, ``stride``,
        ``windows`` (``preview``/``final``/``seed``), ``hash`` and
        ``tokenizer``.

    Raises:
        ValueError, TypeError: If ``preview_n``, ``final_n``, ``seq_len`` or
            ``stride`` holds a truthy value that ``int()`` cannot convert.
    """

    def _as_dict(value: Any) -> dict[str, Any]:
        # Present-but-malformed sections (e.g. ``None``) count as missing.
        return value if isinstance(value, dict) else {}

    def _len_ids(sec: dict[str, Any]) -> int:
        ids = sec.get("window_ids")
        return len(ids) if isinstance(ids, list) else 0

    report_map = _as_dict(report)
    data_config = _as_dict(report_map.get("data"))
    eval_windows = _as_dict(report_map.get("evaluation_windows"))
    preview_section = _as_dict(eval_windows.get("preview"))
    final_section = _as_dict(eval_windows.get("final"))

    # Keep the configured counts separate from the reported ones: the config
    # digest below is built from configuration only, while the returned window
    # counts may be recovered from the evaluation windows.
    cfg_preview_n = int(data_config.get("preview_n", 0) or 0)
    cfg_final_n = int(data_config.get("final_n", 0) or 0)
    preview_n = cfg_preview_n or _len_ids(preview_section)
    final_n = cfg_final_n or _len_ids(final_section)
    seq_len = int(data_config.get("seq_len", 0) or 0)
    stride = int(data_config.get("stride", 0) or 0)

    dataset = str(data_config.get("dataset", "unknown"))
    split = str(data_config.get("split", data_config.get("dataset_split", "val")))

    # Prefer actual window hashes when explicit token IDs are present
    actual_hashes = _compute_actual_window_hashes(report)
    if actual_hashes:
        window_hash = actual_hashes
    else:
        # Config-based fallback: produce a stable sha256 of config tuple
        import hashlib as _hashlib

        meta = report_map.get("meta")
        seed = meta.get("seed", 0) if isinstance(meta, dict) else 0
        config_str = f"{dataset}{split}{seq_len}{cfg_preview_n}{cfg_final_n}{seed}"
        digest = _hashlib.sha256(config_str.encode()).hexdigest()
        # Integer products are already zero when either factor is zero.
        preview_tokens = cfg_preview_n * seq_len
        final_tokens = cfg_final_n * seq_len
        window_hash = {
            "preview": f"sha256:{digest[:32]}",
            "final": f"sha256:{digest[32:64]}",
            "dataset": data_config.get("provider_hash")
            or data_config.get("tokenizer_hash"),
            "preview_tokens": preview_tokens,
            "final_tokens": final_tokens,
            "total_tokens": preview_tokens + final_tokens,
        }

    tokenizer_info = {
        "name": data_config.get("tokenizer_name", "unknown"),
        "hash": data_config.get("tokenizer_hash"),
        "vocab_size": data_config.get("vocab_size"),
        "bos_token": data_config.get("bos_token"),
        "eos_token": data_config.get("eos_token"),
        "pad_token": data_config.get("pad_token"),
        "add_prefix_space": data_config.get("add_prefix_space"),
    }

    return {
        "provider": dataset,
        "split": split,
        "seq_len": seq_len,
        "stride": stride,
        "windows": {
            "preview": preview_n,
            "final": final_n,
            "seed": data_config.get("seed"),
        },
        "hash": window_hash,
        "tokenizer": tokenizer_info,
    }
209
+
210
+
211
# Names exported by `from ... import *`. The two underscore-prefixed helpers
# are listed explicitly, so a star-import exposes them as well.
__all__ = [
    "compute_window_hashes",
    "_compute_actual_window_hashes",
    "_extract_dataset_info",
]