actproof 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
actproof/canonical.py ADDED
@@ -0,0 +1,369 @@
1
+ # SPDX-FileCopyrightText: 2026 Deyan Paroushev
2
+ # SPDX-License-Identifier: MIT
3
+ """
4
+ RFC 8785 JSON Canonicalization Scheme (JCS) with optional compliance discipline.
5
+
6
+ This module is the foundation of every other operation in actproof. A canonical
7
+ manifest is what gets hashed; the hash is what gets anchored on the public ledger
8
+ and timestamped by the QTSP; the receipt that travels outside this library is
9
+ re-verifiable by anyone who recomputes the canonical bytes from the same input
10
+ and gets the same hash.
11
+
12
+ The wrapper pattern
13
+ -------------------
14
+
15
+ We do not implement RFC 8785 from scratch. We wrap the ``rfc8785`` package
16
+ maintained by Trail of Bits, which is audited and broadly used. Our wrapper
17
+ adds three things on top:
18
+
19
+ 1. **Strict mode (default).** Reject inputs that would produce ambiguous,
20
+ non-reproducible, or non-I-JSON canonical bytes. The restrictions are
21
+ carried forward from the production canonicaliser in the Quoruna reference
22
+ implementation (Quoruna-JCS-v1).
23
+
24
+ 2. **Duplicate-key detection on JSON parse.** Python dicts silently swallow
25
+ duplicates. When input arrives as JSON text from an external party, the
26
+ ``canonicalize_from_json`` entry point uses ``object_pairs_hook`` to raise
27
+ on duplicate keys before the dict is constructed.
28
+
29
+ 3. **JSON-Path-style error locations.** Validation errors report where in the
30
+ input the problem occurred (``$.evidence[2].sha256``), which matters when
31
+ manifests have many fields.
32
+
33
+ Strict mode restrictions
34
+ ------------------------
35
+
36
+ When ``strict=True`` (the default), the canonicaliser rejects:
37
+
38
+ * **Floating-point numbers.** Floats have representation-dependent canonical
39
+ forms across platforms. Use scaled integers instead (``*_basis_points``
40
+ for percentages, ``*_minor_units`` for currency, ``*_ppm`` for parts per
41
+ million).
42
+ * **NaN and Infinity.** Not representable in JSON; would produce a
43
+ canonical form that no other implementation could agree on.
44
+ * **Integers outside the I-JSON safe range** ([-(2^53 - 1), 2^53 - 1]).
45
+ RFC 7493 (I-JSON) limits integers to this range because larger values are
46
+ not reliably preserved across JSON implementations. If a larger value
47
+ must be carried, encode it as a string.
48
+ * **Strings that cannot encode to UTF-8** (lone surrogate code points).
49
+
50
+ When ``strict=False``, the canonicaliser delegates directly to ``rfc8785.dumps``
51
+ without pre-validation. This mode produces pure RFC 8785 output and accepts
52
+ anything the underlying library accepts. Use this for general-purpose JCS work
53
+ where the strict restrictions are not appropriate.
54
+
55
+ Quick reference
56
+ ---------------
57
+
58
+ ::
59
+
60
+ from actproof.canonical import canonicalize, hash_canonical_hex
61
+
62
+ manifest = {
63
+ "act_type_id": "op:eu.nis2.art20.management_body_approval.v1",
64
+ "decision_date": "2026-05-14",
65
+ }
66
+
67
+ canonical_bytes = canonicalize(manifest)
68
+ # b'{"act_type_id":"op:eu.nis2.art20.management_body_approval.v1",
69
+ # "decision_date":"2026-05-14"}'
70
+
71
+ manifest_hash = hash_canonical_hex(manifest)
72
+ # "a3f2c1..."
73
+
74
+ API
75
+ ---
76
+
77
+ ``canonicalize(obj, *, strict=True) -> bytes``
78
+ Primary entry point. Returns UTF-8 encoded bytes of the canonical
79
+ representation.
80
+
81
+ ``canonicalize_str(obj, *, strict=True) -> str``
82
+ Same as ``canonicalize`` but returns a Python ``str`` instead of bytes.
83
+
84
+ ``canonicalize_from_json(json_str, *, strict=True) -> bytes``
85
+ Parse JSON text with duplicate-key detection, then canonicalise. Use this
86
+ when input arrives as JSON from an external party.
87
+
88
+ ``hash_canonical(obj, *, strict=True) -> bytes``
89
+ Canonicalize and return the SHA-256 raw digest (32 bytes).
90
+
91
+ ``hash_canonical_hex(obj, *, strict=True) -> str``
92
+ Canonicalize and return the SHA-256 hex digest (64 lowercase hex chars).
93
+
94
+ References
95
+ ----------
96
+
97
+ * RFC 8785: https://datatracker.ietf.org/doc/html/rfc8785
98
+ * RFC 7493 (I-JSON): https://datatracker.ietf.org/doc/html/rfc7493
99
+ * rfc8785 library: https://github.com/trailofbits/rfc8785.py
100
+ """
101
+
102
+ from __future__ import annotations
103
+
104
+ import hashlib
105
+ import json
106
+ import math
107
+ from typing import Any
108
+
109
+ try:
110
+ import rfc8785
111
+ except ImportError as exc: # pragma: no cover
112
+ raise ImportError(
113
+ "rfc8785 is required. Install with: pip install 'rfc8785>=0.1.4'"
114
+ ) from exc
115
+
116
+
117
# Names exported by ``from actproof.canonical import *``: the five
# canonicalisation / hashing entry points, the two I-JSON integer bounds, and
# the exception type raised on strict-mode violations.
__all__ = [
    "canonicalize",
    "canonicalize_str",
    "canonicalize_from_json",
    "hash_canonical",
    "hash_canonical_hex",
    "IJSON_MAX_SAFE_INT",
    "IJSON_MIN_SAFE_INT",
    "CanonicalizationError",
]
127
+
128
+
129
+ # ─────────────────────────────────────────────────────────────────
130
+ # CONSTANTS
131
+ # ─────────────────────────────────────────────────────────────────
132
+
133
# Bounds of the I-JSON safe integer range (RFC 7493 section 2.2). A JSON
# implementation may not preserve an integer beyond these bounds, so such a
# value has to be carried as a string if it must survive a round trip.
IJSON_MAX_SAFE_INT: int = (1 << 53) - 1
IJSON_MIN_SAFE_INT: int = -IJSON_MAX_SAFE_INT
138
+
139
+
140
+ # ─────────────────────────────────────────────────────────────────
141
+ # EXCEPTIONS
142
+ # ─────────────────────────────────────────────────────────────────
143
+
144
class CanonicalizationError(ValueError):
    """Signal that input broke a canonicalisation restriction.

    Deriving from ``ValueError`` gives callers a choice: handle every
    input-validation failure with a single ``except ValueError`` clause, or
    catch ``CanonicalizationError`` itself when canonicalisation problems
    have to be told apart from other value errors.

    Messages produced by the strict-mode validator carry a JSON-Path-style
    location (for example ``$.foo.bar[2]``) identifying the offending node in
    the input.
    """
155
+
156
+
157
+ # ─────────────────────────────────────────────────────────────────
158
+ # PUBLIC API
159
+ # ─────────────────────────────────────────────────────────────────
160
+
161
def canonicalize(obj: Any, *, strict: bool = True) -> bytes:
    """Serialise ``obj`` to RFC 8785 canonical JSON bytes.

    Args:
        obj: The value to canonicalise: a dict, list, str, int, bool or
            ``None``. Floats are rejected while ``strict`` is on.
        strict: When ``True`` (the default) the value is first checked by the
            strict-mode validator, which forbids floats (NaN and Infinity
            included), integers outside the I-JSON safe range, and strings
            that cannot encode to UTF-8. When ``False`` the value is handed
            straight to ``rfc8785.dumps`` with no pre-validation.

    Returns:
        UTF-8 encoded bytes of the canonical JSON representation, exactly as
        produced by ``rfc8785.dumps``.

    Raises:
        CanonicalizationError: ``strict`` is on and the value violates one of
            the strict-mode restrictions.

    Any exception raised by ``rfc8785.dumps`` propagates unchanged.
    NOTE(review): with ``strict=False`` this is expected to include
    ``rfc8785.IntegerDomainError`` for integers outside the I-JSON safe
    range -- confirm the exact exception types against the installed
    ``rfc8785`` release.
    """
    # Non-strict callers get the underlying library's behaviour untouched.
    if not strict:
        return rfc8785.dumps(obj)

    # "$" is the JSON-Path root; the validator extends it while descending.
    _validate_strict(obj, "$")
    return rfc8785.dumps(obj)
185
+
186
+
187
def canonicalize_str(obj: Any, *, strict: bool = True) -> str:
    """Return the RFC 8785 canonical form of ``obj`` as text.

    Thin wrapper around ``canonicalize`` that decodes its bytes as UTF-8.

    Args:
        obj: The value to canonicalise; see ``canonicalize`` for what is
            accepted.
        strict: Forwarded to ``canonicalize``. Defaults to ``True``.

    Returns:
        The canonical representation as a ``str``.
    """
    canonical_bytes = canonicalize(obj, strict=strict)
    return str(canonical_bytes, "utf-8")
198
+
199
+
200
def canonicalize_from_json(json_str: str, *, strict: bool = True) -> bytes:
    """Canonicalise JSON text, refusing objects that repeat a key.

    Intended for JSON received from an external party. A plain
    ``json.loads`` keeps only the last value of a repeated key without any
    warning; here every JSON object is routed through an
    ``object_pairs_hook`` that raises before the dict is built. The
    duplicate-key check runs whether or not ``strict`` is set.

    Args:
        json_str: The JSON text to parse and canonicalise.
        strict: Forwarded to ``canonicalize``. Defaults to ``True``.

    Returns:
        UTF-8 encoded bytes of the canonical JSON representation.

    Raises:
        CanonicalizationError: An object in the text repeats a key, or the
            parsed value violates a strict-mode restriction.
        json.JSONDecodeError: The text is not valid JSON.
    """
    return canonicalize(
        json.loads(json_str, object_pairs_hook=_detect_duplicate_keys),
        strict=strict,
    )
222
+
223
+
224
def hash_canonical(obj: Any, *, strict: bool = True) -> bytes:
    """Return the raw SHA-256 digest of the canonical form of ``obj``.

    Shorthand for canonicalising a manifest and hashing the resulting bytes
    in one call.

    Args:
        obj: The value to canonicalise and hash.
        strict: Forwarded to ``canonicalize``. Defaults to ``True``.

    Returns:
        The 32-byte SHA-256 digest of the canonical bytes.
    """
    hasher = hashlib.sha256()
    hasher.update(canonicalize(obj, strict=strict))
    return hasher.digest()
238
+
239
+
240
def hash_canonical_hex(obj: Any, *, strict: bool = True) -> str:
    """Return the SHA-256 digest of the canonical form of ``obj`` as hex.

    Args:
        obj: The value to canonicalise and hash.
        strict: Forwarded to ``hash_canonical``. Defaults to ``True``.

    Returns:
        The digest as 64 lowercase hexadecimal characters.
    """
    raw_digest = hash_canonical(obj, strict=strict)
    return raw_digest.hex()
251
+
252
+
253
+ # ─────────────────────────────────────────────────────────────────
254
+ # INTERNAL: STRICT-MODE VALIDATION
255
+ # ─────────────────────────────────────────────────────────────────
256
+
257
def _validate_strict(node: Any, path: str) -> None:
    """Check ``node`` and everything beneath it against the strict-mode rules.

    The walk is recursive and depth-first. It stops at the first offending
    value and reports it with a JSON-Path-style location derived from
    ``path``: ``.key`` is appended for dict members and ``[index]`` for list
    items.

    Args:
        node: The value to inspect.
        path: Location of ``node`` within the overall input.

    Raises:
        CanonicalizationError: ``node`` or a descendant is a float (NaN and
            Infinity get their own messages), an integer outside
            ``[IJSON_MIN_SAFE_INT, IJSON_MAX_SAFE_INT]``, a string or dict
            key that cannot encode to UTF-8, a dict key that is not a
            ``str``, or a value of any type other than dict, list, str, int,
            bool or ``None``.
    """
    # None and booleans are always acceptable. The bool test must come before
    # the int test further down because bool is a subclass of int in Python.
    if node is None or isinstance(node, bool):
        return

    if isinstance(node, float):
        if math.isnan(node):
            non_finite = "NaN"
        elif math.isinf(node):
            non_finite = "Infinity"
        else:
            # Finite floats are rejected as well, with guidance on what to
            # use instead.
            raise CanonicalizationError(
                f"Floating-point number {node} at {path}: not allowed in strict "
                "mode. Floats have representation-dependent canonical forms across "
                "platforms. Use scaled integers (e.g. *_basis_points for "
                "percentages, *_minor_units for currency, *_ppm for parts per "
                "million), or pass strict=False if you do not need cross-platform "
                "reproducibility."
            )
        raise CanonicalizationError(
            f"{non_finite} at {path}: not representable in canonical JSON. "
            "Strict mode forbids NaN and Infinity."
        )

    if isinstance(node, int):
        if not (IJSON_MIN_SAFE_INT <= node <= IJSON_MAX_SAFE_INT):
            raise CanonicalizationError(
                f"Integer {node} at {path} exceeds I-JSON safe range "
                f"[{IJSON_MIN_SAFE_INT}, {IJSON_MAX_SAFE_INT}] "
                "(RFC 7493 section 2.2). Integers outside this range are not "
                "reliably preserved across JSON implementations. Encode as a "
                "string if a larger value must be carried, or pass strict=False."
            )
        return

    if isinstance(node, str):
        # The encoded bytes are discarded; only encodability matters here.
        try:
            node.encode("utf-8", errors="strict")
        except UnicodeEncodeError as encode_error:
            raise CanonicalizationError(
                f"String at {path} contains code points that cannot encode to "
                f"UTF-8 (typically lone surrogates): {encode_error}. Strict mode requires "
                "all strings to be valid UTF-8."
            ) from encode_error
        return

    if isinstance(node, dict):
        for key, value in node.items():
            if not isinstance(key, str):
                raise CanonicalizationError(
                    f"Non-string key at {path}: {type(key).__name__} {key!r}. "
                    "JSON requires string keys."
                )
            try:
                key.encode("utf-8", errors="strict")
            except UnicodeEncodeError as encode_error:
                raise CanonicalizationError(
                    f"Key {key!r} at {path} contains code points that cannot "
                    f"encode to UTF-8: {encode_error}."
                ) from encode_error
            _validate_strict(value, f"{path}.{key}")
        return

    # Anything that is not a list by this point has no strict-mode mapping.
    if not isinstance(node, list):
        raise CanonicalizationError(
            f"Unsupported type at {path}: {type(node).__name__}. "
            "actproof canonical accepts dict, list, str, int, bool, None."
        )

    for index, element in enumerate(node):
        _validate_strict(element, f"{path}[{index}]")
338
+
339
+
340
+ # ─────────────────────────────────────────────────────────────────
341
+ # INTERNAL: DUPLICATE KEY DETECTION
342
+ # ─────────────────────────────────────────────────────────────────
343
+
344
def _detect_duplicate_keys(pairs: list[tuple[str, Any]]) -> dict[str, Any]:
    """``object_pairs_hook`` for ``json.loads`` that rejects repeated keys.

    ``json.loads`` calls this once per JSON object with that object's
    ``(key, value)`` pairs in document order. By default ``json.loads``
    silently keeps the last value when a key repeats; silently dropping
    values would let two parties derive different canonical bytes from
    textually identical input, so a repeated key is treated as an error.

    Args:
        pairs: The ``(key, value)`` pairs of one JSON object.

    Returns:
        A dict built from ``pairs`` when every key is unique.

    Raises:
        CanonicalizationError: At least one key occurs more than once. The
            message lists each repeated key once, in sorted order.
    """
    # Fast path: building the dict collapses repeated keys, so an unchanged
    # length proves every key was unique and the dict can be returned as is.
    result = dict(pairs)
    if len(result) == len(pairs):
        return result

    # Slow path: collect the repeated keys. Both trackers are sets so the work
    # stays linear in the number of pairs, which matters because this hook
    # runs on JSON text supplied by external parties.
    seen: set[str] = set()
    repeated: set[str] = set()
    for key, _ in pairs:
        if key in seen:
            repeated.add(key)
        else:
            seen.add(key)

    raise CanonicalizationError(
        "Duplicate keys forbidden by RFC 8785. "
        f"Duplicates found: {sorted(repeated)}"
    )