halo-format 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- halo_format-0.1.0/.gitignore +41 -0
- halo_format-0.1.0/PKG-INFO +15 -0
- halo_format-0.1.0/README.md +5 -0
- halo_format-0.1.0/pyproject.toml +20 -0
- halo_format-0.1.0/src/halo_format/__init__.py +70 -0
- halo_format-0.1.0/src/halo_format/canonical.py +46 -0
- halo_format-0.1.0/src/halo_format/carve.py +41 -0
- halo_format-0.1.0/src/halo_format/encode.py +76 -0
- halo_format-0.1.0/src/halo_format/envelope.py +47 -0
- halo_format-0.1.0/src/halo_format/errors.py +29 -0
- halo_format-0.1.0/src/halo_format/hash.py +29 -0
- halo_format-0.1.0/src/halo_format/navigate.py +169 -0
- halo_format-0.1.0/src/halo_format/node.py +72 -0
- halo_format-0.1.0/src/halo_format/store.py +97 -0
- halo_format-0.1.0/src/halo_format/summarize.py +22 -0
- halo_format-0.1.0/tests/test_canonical.py +33 -0
- halo_format-0.1.0/tests/test_encode.py +160 -0
- halo_format-0.1.0/tests/test_navigate.py +183 -0
- halo_format-0.1.0/tests/test_node.py +51 -0
- halo_format-0.1.0/tests/test_store.py +61 -0
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# Node / TypeScript
|
|
2
|
+
node_modules/
|
|
3
|
+
dist/
|
|
4
|
+
*.tsbuildinfo
|
|
5
|
+
.npm/
|
|
6
|
+
pnpm-debug.log*
|
|
7
|
+
npm-debug.log*
|
|
8
|
+
|
|
9
|
+
# Python
|
|
10
|
+
__pycache__/
|
|
11
|
+
*.py[cod]
|
|
12
|
+
*.egg-info/
|
|
13
|
+
.eggs/
|
|
14
|
+
build/
|
|
15
|
+
*.whl
|
|
16
|
+
.venv/
|
|
17
|
+
venv/
|
|
18
|
+
.uv/
|
|
19
|
+
.pytest_cache/
|
|
20
|
+
.ruff_cache/
|
|
21
|
+
.mypy_cache/
|
|
22
|
+
|
|
23
|
+
# Coverage / test output
|
|
24
|
+
coverage/
|
|
25
|
+
.coverage
|
|
26
|
+
htmlcov/
|
|
27
|
+
|
|
28
|
+
# Release: Python distributions built in CI before the PyPI upload
|
|
29
|
+
dist-python/
|
|
30
|
+
|
|
31
|
+
# Editor / OS
|
|
32
|
+
.DS_Store
|
|
33
|
+
.idea/
|
|
34
|
+
*.swp
|
|
35
|
+
|
|
36
|
+
# Local env
|
|
37
|
+
.env
|
|
38
|
+
.env.local
|
|
39
|
+
|
|
40
|
+
# Internal design docs — never commit
|
|
41
|
+
private/
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: halo-format
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Halo core: encode a value into a content-addressed navigable tree and read it back, verified.
|
|
5
|
+
Project-URL: Homepage, https://github.com/halo-format/halo
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Requires-Dist: rfc8785>=0.1.2
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
|
|
11
|
+
# halo-format
|
|
12
|
+
|
|
13
|
+
Python port of the Halo core. Same interop contract as the TypeScript reference implementation:
|
|
14
|
+
identical input produces identical handles in both ports, guaranteed by the shared conformance
|
|
15
|
+
vectors. See the repo root `README.md` and `private/` for the design.
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
# halo-format
|
|
2
|
+
|
|
3
|
+
Python port of the Halo core. Same interop contract as the TypeScript reference implementation:
|
|
4
|
+
identical input produces identical handles in both ports, guaranteed by the shared conformance
|
|
5
|
+
vectors. See the repo root `README.md` and `private/` for the design.
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "halo-format"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Halo core: encode a value into a content-addressed navigable tree and read it back, verified."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.10"
|
|
7
|
+
license = { text = "MIT" }
|
|
8
|
+
# Core has zero runtime deps beyond stdlib hashlib (sha256) and a vetted JCS canonicalizer
|
|
9
|
+
# (rfc8785), which is the interop oracle for canonical bytes.
|
|
10
|
+
dependencies = ["rfc8785>=0.1.2"]
|
|
11
|
+
|
|
12
|
+
[project.urls]
|
|
13
|
+
Homepage = "https://github.com/halo-format/halo"
|
|
14
|
+
|
|
15
|
+
[build-system]
|
|
16
|
+
requires = ["hatchling"]
|
|
17
|
+
build-backend = "hatchling.build"
|
|
18
|
+
|
|
19
|
+
[tool.hatch.build.targets.wheel]
|
|
20
|
+
packages = ["src/halo_format"]
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Public surface of halo_format.
|
|
2
|
+
|
|
3
|
+
Producer: encode, extend, merge
|
|
4
|
+
Consumer: open_ -> Navigator(walk, fetch, fetch_many, root); free walk/fetch/fetch_many
|
|
5
|
+
Stores: MemoryStore, FileStore
|
|
6
|
+
Errors: HaloError (base), UnknownHandle, HashMismatch, WrongKind, CanonicalizationError, StoreError
|
|
7
|
+
|
|
8
|
+
Re-exports only; implementation lives in the sibling modules.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from .canonical import canonical, canonical_bytes
|
|
12
|
+
from .errors import (
|
|
13
|
+
CanonicalizationError,
|
|
14
|
+
HaloError,
|
|
15
|
+
HashMismatch,
|
|
16
|
+
StoreError,
|
|
17
|
+
UnknownHandle,
|
|
18
|
+
WrongKind,
|
|
19
|
+
)
|
|
20
|
+
from .carve import auto_carve, make_auto_carve
|
|
21
|
+
from .encode import encode, extend, merge
|
|
22
|
+
from .envelope import build_envelope, verify_envelope
|
|
23
|
+
from .hash import handle_of, hash_bytes
|
|
24
|
+
from .navigate import Navigator, fetch, fetch_many, open_, walk
|
|
25
|
+
from .node import (
|
|
26
|
+
branch_node,
|
|
27
|
+
decode,
|
|
28
|
+
is_branch,
|
|
29
|
+
is_leaf,
|
|
30
|
+
leaf_node,
|
|
31
|
+
node_handle,
|
|
32
|
+
serialize,
|
|
33
|
+
)
|
|
34
|
+
from .store import FileStore, MemoryStore
|
|
35
|
+
from .summarize import derive_summary
|
|
36
|
+
|
|
37
|
+
__all__ = [
|
|
38
|
+
"canonical",
|
|
39
|
+
"canonical_bytes",
|
|
40
|
+
"hash_bytes",
|
|
41
|
+
"handle_of",
|
|
42
|
+
"branch_node",
|
|
43
|
+
"leaf_node",
|
|
44
|
+
"is_branch",
|
|
45
|
+
"is_leaf",
|
|
46
|
+
"serialize",
|
|
47
|
+
"node_handle",
|
|
48
|
+
"decode",
|
|
49
|
+
"auto_carve",
|
|
50
|
+
"make_auto_carve",
|
|
51
|
+
"derive_summary",
|
|
52
|
+
"build_envelope",
|
|
53
|
+
"verify_envelope",
|
|
54
|
+
"encode",
|
|
55
|
+
"extend",
|
|
56
|
+
"merge",
|
|
57
|
+
"open_",
|
|
58
|
+
"walk",
|
|
59
|
+
"fetch",
|
|
60
|
+
"fetch_many",
|
|
61
|
+
"Navigator",
|
|
62
|
+
"MemoryStore",
|
|
63
|
+
"FileStore",
|
|
64
|
+
"HaloError",
|
|
65
|
+
"UnknownHandle",
|
|
66
|
+
"HashMismatch",
|
|
67
|
+
"WrongKind",
|
|
68
|
+
"CanonicalizationError",
|
|
69
|
+
"StoreError",
|
|
70
|
+
]
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""value -> canonical bytes (RFC 8785 JCS).
|
|
2
|
+
|
|
3
|
+
The interop crux and a leaf dependency. Keys sorted by UTF-16 code unit, JCS string escaping,
|
|
4
|
+
ECMAScript Number-to-String for numbers. v1 allows non-integer floats and leans on a vetted JCS
|
|
5
|
+
library here. Must be byte-identical to the TypeScript port; defended by
|
|
6
|
+
conformance/vectors/canonical.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import math
|
|
10
|
+
|
|
11
|
+
import rfc8785
|
|
12
|
+
|
|
13
|
+
from .errors import CanonicalizationError
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _assert_canonicalizable(value):
|
|
17
|
+
"""Reject non-finite floats up front so the error matches across ports.
|
|
18
|
+
|
|
19
|
+
bool is a subclass of int in Python; rfc8785 handles it correctly, and we never treat a bool
|
|
20
|
+
as a number here, so no special-casing is needed.
|
|
21
|
+
"""
|
|
22
|
+
if isinstance(value, float):
|
|
23
|
+
if not math.isfinite(value):
|
|
24
|
+
raise CanonicalizationError(f"non-finite number cannot be canonicalized: {value!r}")
|
|
25
|
+
elif isinstance(value, dict):
|
|
26
|
+
for v in value.values():
|
|
27
|
+
_assert_canonicalizable(v)
|
|
28
|
+
elif isinstance(value, (list, tuple)):
|
|
29
|
+
for v in value:
|
|
30
|
+
_assert_canonicalizable(v)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def canonical_bytes(value) -> bytes:
    """Return the RFC 8785 (JCS) encoding of `value` — the exact bytes that get hashed.

    Non-finite floats are rejected before the encoder runs. Any failure raised
    by `rfc8785.dumps` is re-raised as CanonicalizationError with the original
    exception chained as its cause.
    """
    _assert_canonicalizable(value)
    try:
        encoded = rfc8785.dumps(value)
    except CanonicalizationError:
        raise
    except Exception as exc:  # normalise every encoder failure to the package error type
        raise CanonicalizationError(str(exc)) from exc
    return encoded
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def canonical(value) -> str:
    """Return the canonical RFC 8785 (JCS) form of `value` as text.

    This is `canonical_bytes(value)` decoded as UTF-8.
    """
    raw = canonical_bytes(value)
    return raw.decode("utf-8")
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""carve(value, path) -> {"as": "leaf"} | {"as": "branch", "children": {...}, "summary"?: str}.
|
|
2
|
+
|
|
3
|
+
auto_carve default: split a non-empty object into one branch per key; chunk arrays longer than a
|
|
4
|
+
threshold (default 25) into contiguous slices; inline everything below a small byte threshold as a
|
|
5
|
+
leaf. Hosts/skills override with task knowledge — an explicit carve always wins over the default.
|
|
6
|
+
Bounds node size so no single node need be the whole payload. Pure and deterministic: the carve
|
|
7
|
+
shape is part of what gets hashed.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from .canonical import canonical_bytes
|
|
11
|
+
|
|
12
|
+
DEFAULT_ARRAY_THRESHOLD = 25
DEFAULT_INLINE_BYTES = 1024


def make_auto_carve(array_threshold=DEFAULT_ARRAY_THRESHOLD, inline_bytes=DEFAULT_INLINE_BYTES):
    """Build an auto_carve policy with custom thresholds. `auto_carve` is this with the defaults.

    Args:
        array_threshold: longest list kept as a single leaf; longer lists are
            cut into contiguous chunks of this many items. Must be >= 1.
        inline_bytes: a non-empty dict whose canonical encoding is at most this
            many bytes is kept as a single leaf.

    Returns:
        A `carve(value, path)` callable returning either `{"as": "leaf"}` or
        `{"as": "branch", "children": {...}}`.

    Raises:
        ValueError: if `array_threshold` is below 1. A zero threshold would
            divide by zero while chunking, and a negative one would yield a
            branch with no children, silently dropping the array's contents.
    """
    if array_threshold < 1:
        raise ValueError(f"array_threshold must be >= 1, got {array_threshold!r}")

    def auto_carve(value, path):
        # `path` is part of the carve signature but unused by the default policy.
        if isinstance(value, dict):
            if not value:
                return {"as": "leaf"}  # {} is a terminal leaf, not a branch
            if len(canonical_bytes(value)) <= inline_bytes:
                return {"as": "leaf"}  # small object inlines whole
            return {"as": "branch", "children": dict(value)}  # one child per key
        if isinstance(value, list):
            if len(value) <= array_threshold:
                return {"as": "leaf"}  # short array inlines as one leaf
            # Chunk keys are "0", "1", ... in slice order; each chunk holds at most
            # array_threshold items, so it re-carves to a leaf.
            children = {
                str(start // array_threshold): value[start : start + array_threshold]
                for start in range(0, len(value), array_threshold)
            }
            return {"as": "branch", "children": children}
        return {"as": "leaf"}  # primitives (None, bool, number, str) are always leaves

    return auto_carve


# The default carve policy: object-by-key, chunk long arrays, inline small subtrees.
auto_carve = make_auto_carve()
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""Bottom-up Merkle build, plus extend / merge.
|
|
2
|
+
|
|
3
|
+
_build_node recurses children-first so a parent references its children by handle, which gives the
|
|
4
|
+
tree its Merkle structure and free dedup (store.put is idempotent and content-keyed). encode returns
|
|
5
|
+
{"handle", "envelope"}. extend(root, name, value) folds a new branch onto an existing root (last
|
|
6
|
+
write wins on name collision); merge(roots) combines several roots' branch tables. Both reuse every
|
|
7
|
+
unchanged child handle, so the only new node is the root, and because nodes are immutable the prior
|
|
8
|
+
root stays in the store — version history and a change audit for free.
|
|
9
|
+
|
|
10
|
+
Sync to match the Python Store (see store.py's note on the async divergence from TypeScript).
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from .carve import auto_carve
|
|
14
|
+
from .envelope import build_envelope
|
|
15
|
+
from .errors import WrongKind
|
|
16
|
+
from .node import branch_node, decode, is_branch, leaf_node, serialize
|
|
17
|
+
from .summarize import derive_summary
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _build_node(value, path, store, carve, summarize):
    """Store the node for `value` (children first) and return its handle.

    `carve(value, path)` decides the shape. A leaf stores `value` directly. A
    branch first builds every child in `decision["children"]`, extending `path`
    by the child's name, then stores a branch node mapping names to child
    handles. An explicit `decision["summary"]` wins; otherwise
    `summarize(value, branches)` supplies it.
    """
    decision = carve(value, path)
    if decision["as"] != "leaf":
        # Children are stored before the parent so the parent can reference them by handle.
        branches = {
            name: _build_node(child, path + [name], store, carve, summarize)
            for name, child in decision["children"].items()
        }
        summary = decision.get("summary")
        if summary is None:
            summary = summarize(value, branches)
        node = branch_node(summary, branches)
    else:
        node = leaf_node(value)
    return store.put(serialize(node))
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _result_for(root, store, alg):
    """Return `{"handle", "envelope"}` for an already-stored root.

    The root node is read back from `store` and decoded so the envelope can
    inline its content.
    """
    stored = store.get(root)
    envelope = build_envelope(root, decode(stored), alg)
    return {"handle": root, "envelope": envelope}
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def encode(value, store, *, carve=None, summarize=None, alg="sha256"):
    """Encode a JSON value into content-addressed nodes and return its halo.

    Args:
        value: the JSON value to encode.
        store: destination store; every node is written through `store.put`.
        carve: carve policy, `auto_carve` when omitted or falsy.
        summarize: branch summarizer, `derive_summary` when omitted or falsy.
        alg: algorithm name recorded in the returned envelope. Handles themselves
            come from `store.put`.
            NOTE(review): presumably this must match the store's own algorithm — confirm.

    Returns:
        `{"handle": root_handle, "envelope": envelope}`.
    """
    carve_fn = carve if carve else auto_carve
    summarize_fn = summarize if summarize else derive_summary
    root_handle = _build_node(value, [], store, carve_fn, summarize_fn)
    return _result_for(root_handle, store, alg)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _read_branch(root, store):
    """Read and decode the node stored at `root`, requiring it to be a branch.

    Raises WrongKind when the stored node is not a branch. The bytes are decoded
    as read; no hash check is performed here.
    """
    node = decode(store.get(root))
    if is_branch(node):
        return node
    raise WrongKind("extend/merge expect a branch root")
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def extend(root, name, value, store, *, carve=None, summarize=None, alg="sha256"):
    """Fold `value` onto `root` as a branch named `name` and return the new halo.

    The existing root's branch table is copied and `name` is set to the handle
    of the freshly built child, replacing any earlier branch of that name. The
    new root's summary is `summarize(value, branches)`: it receives the added
    value and the combined table. The prior root is left untouched in the store.

    Raises WrongKind when `root` does not refer to a branch node.
    """
    carve_fn = carve if carve else auto_carve
    summarize_fn = summarize if summarize else derive_summary
    base = _read_branch(root, store)
    child = _build_node(value, [name], store, carve_fn, summarize_fn)
    branches = dict(base["branches"])
    branches[name] = child  # last write wins on a name collision
    summary = summarize_fn(value, branches)
    new_root = store.put(serialize(branch_node(summary, branches)))
    return _result_for(new_root, store, alg)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def merge(roots, store, *, summarize=None, alg="sha256"):
    """Combine the branch tables of `roots` into one new root and return its halo.

    Roots are applied in order, so a later root's branch replaces an earlier one
    with the same name. The summarizer is called with `None` as the value,
    because a merged root has no single source value. Raises WrongKind if any
    root is not a branch node.
    """
    summarize_fn = summarize if summarize else derive_summary
    branches = {}
    for root in roots:
        branches.update(_read_branch(root, store)["branches"])
    summary = summarize_fn(None, branches)
    new_root = store.put(serialize(branch_node(summary, branches)))
    return _result_for(new_root, store, alg)
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""Build the root envelope — the only thing that crosses into the model's context.
|
|
2
|
+
|
|
3
|
+
{"halo": "1", "alg": ..., "root": ..., "view": {"summary", "branches"}, "source"?: ...}
|
|
4
|
+
|
|
5
|
+
view inlines the root node's content so a consumer navigates with zero fetches and can verify by
|
|
6
|
+
re-hashing the reconstructed root against root. source is envelope-only identification metadata
|
|
7
|
+
(id, tool, args, ts) and is NEVER hashed. (Full envelope verification and source population land
|
|
8
|
+
with the navigator/adapter; this module is the builder the encode pipeline returns.)
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from .node import branch_node, is_branch, node_handle
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _leaf_summary(value):
|
|
15
|
+
"""A leaf root has no branches; describe its shape instead. Structural + cross-port identical."""
|
|
16
|
+
if value is None:
|
|
17
|
+
return "leaf: null"
|
|
18
|
+
if isinstance(value, bool): # before int/float: bool is an int subclass in Python
|
|
19
|
+
return "leaf: boolean"
|
|
20
|
+
if isinstance(value, (int, float)):
|
|
21
|
+
return "leaf: number"
|
|
22
|
+
if isinstance(value, str):
|
|
23
|
+
return "leaf: string"
|
|
24
|
+
if isinstance(value, list):
|
|
25
|
+
return f"leaf: array of {len(value)} items"
|
|
26
|
+
return f"leaf: object with {len(value)} keys"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def build_envelope(root, root_node, alg="sha256"):
    """Build the envelope dict for a root node that is already stored.

    For a branch root the view copies the node's summary and branch table. For
    a leaf root the view carries a shape description and an empty branch table;
    the leaf's value is not inlined.
    """
    if not is_branch(root_node):
        summary, branches = _leaf_summary(root_node["value"]), {}
    else:
        summary, branches = root_node["summary"], dict(root_node["branches"])
    view = {"summary": summary, "branches": branches}
    return {"halo": "1", "alg": alg, "root": root, "view": view}
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def verify_envelope(envelope):
    """Check an envelope against its own inlined view, without touching a store.

    A branch node is rebuilt from `view.summary` and `view.branches`, hashed
    under the envelope's `alg`, and compared with the claimed `root`. The
    `source` field is never read, so it cannot affect the result.

    Only a branch root can verify this way. For a leaf root the value is not in
    the view, so the rebuilt node does not match and the result is False; such
    envelopes must be verified against the store (open_/fetch).
    """
    view = envelope["view"]
    candidate = branch_node(view["summary"], view["branches"])
    computed = node_handle(candidate, envelope["alg"])
    return computed == envelope["root"]
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Typed errors with shared meaning across ports.
|
|
2
|
+
|
|
3
|
+
UnknownHandle, HashMismatch (the tamper signal, never swallowed), WrongKind,
|
|
4
|
+
CanonicalizationError, StoreError.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class HaloError(Exception):
    """Root of the Halo exception hierarchy; catch this to handle any Halo failure."""


class UnknownHandle(HaloError):
    """The requested handle (or ref) is not present / cannot be resolved."""


class HashMismatch(HaloError):
    """Bytes read for a handle do not hash to that handle — the tamper signal."""


class WrongKind(HaloError):
    """A node had the wrong kind for the operation.

    Examples: walk on a leaf, fetch on a branch, or extend/merge on a
    non-branch root.
    """


class CanonicalizationError(HaloError):
    """The value has no canonical form, for example a non-finite number."""


class StoreError(HaloError):
    """An adapter-level store failure, wrapped in a Halo error type."""
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""canonical bytes -> handle.
|
|
2
|
+
|
|
3
|
+
handle = "h:" + sha256(canonical(node)).hexdigest(), full 64-hex. Algorithm pluggable behind a
|
|
4
|
+
small registry keyed by alg (default sha256); the envelope declares it per-tree. Uses stdlib
|
|
5
|
+
hashlib. Leaf dependency, byte-exact across ports.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import hashlib
|
|
9
|
+
|
|
10
|
+
from .canonical import canonical_bytes
|
|
11
|
+
|
|
12
|
+
# Algorithm registry keyed by alg. A future move (e.g. blake3) registers a function here and the
|
|
13
|
+
# envelope's alg selects it; call sites never hard-code the algorithm.
|
|
14
|
+
_REGISTRY = {
|
|
15
|
+
"sha256": lambda b: hashlib.sha256(b).hexdigest(),
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def hash_bytes(data: bytes, alg: str = "sha256") -> str:
|
|
20
|
+
"""Hash canonical bytes into a handle: "h:" + lowercase hex digest under alg."""
|
|
21
|
+
fn = _REGISTRY.get(alg)
|
|
22
|
+
if fn is None:
|
|
23
|
+
raise ValueError(f"unknown hash alg: {alg}")
|
|
24
|
+
return "h:" + fn(data)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def handle_of(value, alg: str = "sha256") -> str:
    """Return the handle of a JSON value: its canonical bytes hashed under `alg`."""
    payload = canonical_bytes(value)
    return hash_bytes(payload, alg)
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
"""open_(envelope, store) -> Navigator(walk, fetch, fetch_many, root), plus the free
|
|
2
|
+
walk/fetch/fetch_many low-level path.
|
|
3
|
+
|
|
4
|
+
Every read verifies: read bytes, recompute the hash under the bound alg, compare to the requested
|
|
5
|
+
handle, and only then decode. That single check is what makes the store untrusted — substituted or
|
|
6
|
+
corrupted bytes fail at the moment they are read, before their contents are used, and the guarantee
|
|
7
|
+
propagates from the verified root down every branch to every leaf (the trust chain).
|
|
8
|
+
|
|
9
|
+
walk returns a branch's summary + child handles (never leaf data); fetch returns one verified leaf;
|
|
10
|
+
fetch_many verifies several in one call with per-ref results, so one unknown or tampered entry never
|
|
11
|
+
sinks the batch. The Navigator additionally resolves map-scoped refs (e.g. m1.income, or income /
|
|
12
|
+
income.monthly against the opened envelope) to handles before verifying; raw handles always work and
|
|
13
|
+
need no registration.
|
|
14
|
+
|
|
15
|
+
Sync to match the Python Store (see store.py's note on the async divergence from TypeScript).
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from .envelope import verify_envelope
|
|
19
|
+
from .errors import HaloError, HashMismatch, UnknownHandle, WrongKind
|
|
20
|
+
from .hash import hash_bytes
|
|
21
|
+
from .node import decode, is_branch, is_leaf
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _is_raw_handle(ref):
|
|
25
|
+
return ref.startswith("h:")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _read_verified(handle, store, alg):
    """Fetch the bytes for `handle`, check they hash to it, then decode them.

    This is the load-bearing check: nothing is decoded until the hash matches.
    Raises HashMismatch when the recomputed handle differs. Whatever
    `store.get` raises for an absent handle propagates unchanged.
    """
    raw = store.get(handle)
    actual = hash_bytes(raw, alg)
    if actual == handle:
        return decode(raw)
    raise HashMismatch(handle)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _get_many_bytes(store, handles):
|
|
37
|
+
get_many = getattr(store, "get_many", None)
|
|
38
|
+
if get_many is not None:
|
|
39
|
+
return get_many(handles)
|
|
40
|
+
out = []
|
|
41
|
+
for h in handles:
|
|
42
|
+
try:
|
|
43
|
+
out.append(store.get(h))
|
|
44
|
+
except UnknownHandle:
|
|
45
|
+
out.append(None)
|
|
46
|
+
return out
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def walk(handle, store, alg="sha256"):
    """Read and verify a branch node, returning its summary and child handles.

    No leaf data is returned. Raises WrongKind when the handle refers to a
    non-branch node, and HashMismatch when the stored bytes do not verify.
    """
    node = _read_verified(handle, store, alg)
    if is_branch(node):
        return {"summary": node["summary"], "branches": node["branches"]}
    raise WrongKind(f"walk expects a branch: {handle}")
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def fetch(handle, store, alg="sha256"):
    """Read and verify a leaf node, returning its value.

    Raises WrongKind when the handle refers to a non-leaf node, and
    HashMismatch when the stored bytes do not verify.
    """
    node = _read_verified(handle, store, alg)
    if is_leaf(node):
        return node["value"]
    raise WrongKind(f"fetch expects a leaf: {handle}")
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _verify_one(handle, data, alg):
    """Verify one handle's bytes and return its per-entry result dict."""
    if data is None:
        return {"ok": False, "error": "UnknownHandle"}
    if hash_bytes(data, alg) != handle:
        return {"ok": False, "error": "HashMismatch"}
    try:
        node = decode(data)
    except HaloError as e:
        # Bytes that hash correctly can still fail to decode as a node. Report that
        # for this entry (by error class name) instead of failing the whole batch.
        return {"ok": False, "error": type(e).__name__}
    if not is_leaf(node):
        return {"ok": False, "error": "WrongKind"}
    return {"ok": True, "value": node["value"]}


def _verify_batch(present, store, alg):
    """Verify already-resolved (ref, handle) pairs independently of one another.

    All bytes are requested in one `_get_many_bytes` call. Each ref maps to
    either `{"ok": True, "value": ...}` or `{"ok": False, "error": <name>}`:
    missing -> "UnknownHandle", bad hash -> "HashMismatch", non-leaf ->
    "WrongKind", undecodable node -> the HaloError class name. One failing
    entry never affects the others.

    Args:
        present: list of (ref, handle) pairs; results are keyed by ref.
        store: store to read from.
        alg: hash algorithm name used to re-verify each handle.

    Returns:
        dict mapping each ref to its result dict.
    """
    out = {}
    byteses = _get_many_bytes(store, [h for _, h in present])
    for (ref, handle), data in zip(present, byteses):
        out[ref] = _verify_one(handle, data, alg)
    return out
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def fetch_many(refs, store, alg="sha256"):
    """Batch-fetch leaves by raw handle and return per-ref results.

    This is the low-level path: every ref is taken to be a handle as-is, so
    each one serves as both the result key and the handle to verify.
    """
    pairs = [(handle, handle) for handle in refs]
    return _verify_batch(pairs, store, alg)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class Navigator:
    """A reading session bound to one envelope's algorithm, store and root.

    Refs may be raw handles, dotted paths on the opened envelope, or
    `mapId.branch[.child...]` paths into another envelope registered under its
    `source.id`. Every node read goes through the verified read path. The
    primary envelope's `alg` is used for all reads, including those that
    descend from a registered envelope.
    """

    def __init__(self, primary, store):
        self._primary = primary
        self._store = store
        self._alg = primary["alg"]
        self.root = primary["root"]
        self._registry = {}  # source.id -> envelope
        self._register(primary)

    def _register(self, env):
        """Index `env` by its source id; envelopes without one are ignored."""
        source = env.get("source")
        if not source:
            return
        map_id = source.get("id")
        if map_id:
            self._registry[map_id] = env

    def register(self, env):
        """Add another envelope so `mapId.branch` refs resolve; returns self for chaining."""
        self._register(env)
        return self

    def resolve(self, ref):
        """Turn a ref into a handle.

        A raw handle is returned unchanged. If the first dotted segment names a
        registered map and more segments follow, the rest is resolved inside
        that map. Otherwise the whole ref is a path on the opened envelope.
        """
        if _is_raw_handle(ref):
            return ref
        parts = ref.split(".")
        head, tail = parts[0], parts[1:]
        if tail and head in self._registry:
            scoped = self._registry[head]
            return self._resolve_path(scoped["view"]["branches"], tail, ref)
        return self._resolve_path(self._primary["view"]["branches"], parts, ref)

    def _resolve_path(self, branches, segs, ref):
        """Follow `segs` from a branch table down to a handle.

        The first segment is looked up in `branches` without a store read. Each
        further segment reads and verifies the current node. Raises
        UnknownHandle for an empty path or a missing name, and WrongKind when
        the path continues past a leaf.
        """
        if not segs:
            raise UnknownHandle(f"empty ref: {ref}")
        first, deeper = segs[0], segs[1:]
        handle = branches.get(first)
        if handle is None:
            raise UnknownHandle(f"unknown ref: {ref}")
        for seg in deeper:
            node = _read_verified(handle, self._store, self._alg)
            if not is_branch(node):
                raise WrongKind(f"ref descends into a leaf: {ref}")
            handle = node["branches"].get(seg)
            if handle is None:
                raise UnknownHandle(f"unknown ref: {ref}")
        return handle

    def walk(self, ref):
        """Resolve `ref`, then do a verified walk of that branch."""
        handle = self.resolve(ref)
        return walk(handle, self._store, self._alg)

    def fetch(self, ref):
        """Resolve `ref`, then do a verified fetch of that leaf."""
        handle = self.resolve(ref)
        return fetch(handle, self._store, self._alg)

    def fetch_many(self, refs):
        """Resolve and verify several refs, returning one result dict per ref.

        A ref that fails to resolve is reported in its own entry by error class
        name rather than raised. The refs that did resolve are verified
        together in a single batch.
        """
        resolved = []
        results = {}
        for ref in refs:
            try:
                handle = self.resolve(ref)
            except HaloError as err:
                results[ref] = {"ok": False, "error": type(err).__name__}
            else:
                resolved.append((ref, handle))
        results.update(_verify_batch(resolved, self._store, self._alg))
        return results
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def open_(envelope, store):
    """Open an envelope into a Navigator after verifying its root.

    An honest branch root verifies from the inlined view alone, with no store
    read. When that fails, the envelope is either a leaf root (value not
    inlined) or a tampered branch view, so the real root is read and verified
    to tell them apart. A branch there means the view was altered, and
    HashMismatch is raised. A leaf root is accepted; a later fetch of the root
    re-verifies it.
    """
    alg = envelope["alg"]
    if verify_envelope(envelope):
        return Navigator(envelope, store)
    root_handle = envelope["root"]
    node = _read_verified(root_handle, store, alg)
    if is_branch(node):
        raise HashMismatch(f"envelope view does not match root: {root_handle}")
    return Navigator(envelope, store)
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""The node model and its (de)serialization.
|
|
2
|
+
|
|
3
|
+
BranchNode = {"k": "b", "summary": str, "branches": {name: Handle}}
|
|
4
|
+
LeafNode = {"k": "l", "value": JsonValue}
|
|
5
|
+
|
|
6
|
+
The k kind tag is part of the hashed content so a leaf and branch can never collide. Stored form
|
|
7
|
+
is canonical(node) bytes, not the object.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
|
|
12
|
+
from .canonical import canonical_bytes
|
|
13
|
+
from .errors import HaloError
|
|
14
|
+
from .hash import hash_bytes
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def branch_node(summary: str, branches: dict) -> dict:
    """Create a branch node from a summary and a name -> handle table.

    The table is shallow-copied, so later changes to the caller's dict do not
    leak into the node.
    """
    own_branches = dict(branches)
    return {"k": "b", "summary": summary, "branches": own_branches}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def leaf_node(value) -> dict:
    """Create a leaf node holding `value` (stored by reference, not copied)."""
    node = {"k": "l"}
    node["value"] = value
    return node
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def is_branch(node: dict) -> bool:
    """True when the node's kind tag `k` is "b"."""
    kind = node.get("k")
    return kind == "b"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def is_leaf(node: dict) -> bool:
    """True when the node's kind tag `k` is "l"."""
    kind = node.get("k")
    return kind == "l"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def serialize(node: dict) -> bytes:
    """Return the node's canonical bytes — exactly what is hashed and stored."""
    encoded = canonical_bytes(node)
    return encoded
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def node_handle(node: dict, alg: str = "sha256") -> str:
    """Return a node's handle: the hash, under `alg`, of its serialized bytes."""
    payload = serialize(node)
    return hash_bytes(payload, alg)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _reject_json_constant(token):
    """`json.loads` parse_constant hook: refuse NaN / Infinity / -Infinity."""
    raise ValueError(f"non-finite number {token} is not valid JSON")


def decode(data: bytes) -> dict:
    """Decode stored bytes back into a validated node.

    In normal use the bytes were verified against a handle first (navigate
    verifies on read), so decoding verified bytes succeeds. The validation here
    guards against malformed input; it is not a security boundary.

    The tokens NaN, Infinity and -Infinity are rejected. Python's `json.loads`
    accepts them by default, but they are not valid JSON and can never appear
    in canonical bytes, which reject non-finite numbers.

    Args:
        data: UTF-8 encoded JSON bytes of a single node.

    Returns:
        `{"k": "l", "value": ...}` for a leaf, or
        `{"k": "b", "summary": str, "branches": {name: handle}}` for a branch.
        Keys other than these are dropped.

    Raises:
        HaloError: the bytes are not valid UTF-8 JSON, the top level is not an
            object, the kind tag is unknown, or a required field is missing or
            has the wrong type.
    """
    try:
        obj = json.loads(data.decode("utf-8"), parse_constant=_reject_json_constant)
    except (ValueError, UnicodeDecodeError) as e:
        raise HaloError(f"node bytes are not valid JSON: {e}") from e
    if not isinstance(obj, dict):
        raise HaloError("node must be a JSON object")
    kind = obj.get("k")
    if kind == "l":
        if "value" not in obj:  # presence check: a null value is a legitimate leaf
            raise HaloError("leaf node missing `value`")
        return {"k": "l", "value": obj["value"]}
    if kind == "b":
        if not isinstance(obj.get("summary"), str):
            raise HaloError("branch node `summary` must be a string")
        branches = obj.get("branches")
        if not isinstance(branches, dict):
            raise HaloError("branch node `branches` must be an object")
        for name, h in branches.items():
            if not isinstance(h, str):
                raise HaloError(f'branch handle for "{name}" must be a string')
        return {"k": "b", "summary": obj["summary"], "branches": branches}
    raise HaloError(f"unknown node kind: {kind!r}")
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""The Store protocol plus MemoryStore and FileStore.
|
|
2
|
+
|
|
3
|
+
put(bytes) -> Handle (content-addressed, idempotent, stores once)
|
|
4
|
+
get(handle) -> bytes (raises UnknownHandle if absent)
|
|
5
|
+
has(handle) -> bool
|
|
6
|
+
get_many(handles) -> list[bytes | None] (optional multi-get)
|
|
7
|
+
|
|
8
|
+
Untrusted by design: every read verifies, so a buggy/hostile store cannot substitute data
|
|
9
|
+
undetected. MemoryStore = in-process dict; FileStore = one file per handle under a dir.
|
|
10
|
+
|
|
11
|
+
Note on async: the TypeScript Store is async (Promise-returning) per the SDK contract. The Python
|
|
12
|
+
port is synchronous, which is idiomatic for the in-process and filesystem stores; a heavy/remote
|
|
13
|
+
Python adapter can expose an async variant. The navigator built on top stays sync to match.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import os
|
|
17
|
+
|
|
18
|
+
from .errors import StoreError, UnknownHandle
|
|
19
|
+
from .hash import hash_bytes
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _hex_of(handle: str) -> str:
|
|
23
|
+
"""A handle is "h:" + hex; the hex alone is the on-disk filename for FileStore."""
|
|
24
|
+
return handle[2:] if handle.startswith("h:") else handle
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class MemoryStore:
|
|
28
|
+
"""In-process content-addressed store — the light deployment and the default for tests."""
|
|
29
|
+
|
|
30
|
+
def __init__(self, alg: str = "sha256"):
|
|
31
|
+
self._map: dict[str, bytes] = {}
|
|
32
|
+
self._alg = alg
|
|
33
|
+
|
|
34
|
+
def put(self, data: bytes) -> str:
|
|
35
|
+
handle = hash_bytes(data, self._alg)
|
|
36
|
+
# Idempotent + immutable: first writer wins, identical content is a no-op.
|
|
37
|
+
self._map.setdefault(handle, data)
|
|
38
|
+
return handle
|
|
39
|
+
|
|
40
|
+
def get(self, handle: str) -> bytes:
|
|
41
|
+
try:
|
|
42
|
+
return self._map[handle]
|
|
43
|
+
except KeyError:
|
|
44
|
+
raise UnknownHandle(handle) from None
|
|
45
|
+
|
|
46
|
+
def has(self, handle: str) -> bool:
|
|
47
|
+
return handle in self._map
|
|
48
|
+
|
|
49
|
+
def get_many(self, handles: list[str]) -> list[bytes | None]:
|
|
50
|
+
return [self._map.get(h) for h in handles]
|
|
51
|
+
|
|
52
|
+
def __len__(self) -> int:
|
|
53
|
+
return len(self._map)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class FileStore:
    """One file per handle under a directory — cross-step persistence inside a sandbox.

    Each blob lives in a file named after the hex part of its handle. Writes go
    through a temporary file that is renamed into place, so a reader (or a later
    run after a crash) never observes a half-written blob under a real handle.
    """

    def __init__(self, directory: str, alg: str = "sha256"):
        """Create the store directory if needed.

        Raises:
            StoreError: the directory could not be created.
        """
        self._dir = directory
        self._alg = alg
        try:
            os.makedirs(directory, exist_ok=True)
        except OSError as e:
            raise StoreError(f"could not create store dir {directory}: {e}") from e

    def _path_for(self, handle: str) -> str:
        """Map a handle to its file path inside the store directory.

        Raises:
            UnknownHandle: the handle cannot name a blob in this store (empty,
                dot-prefixed, containing NUL, or containing a path component).
        """
        name = _hex_of(handle)
        # A handle may come from an untrusted envelope: refuse anything that
        # would resolve outside the store directory ("../x", absolute paths,
        # drive prefixes) or that names one of our dot-prefixed temp files.
        if (
            not name
            or name.startswith(".")
            or "\0" in name
            or os.path.basename(name) != name
        ):
            raise UnknownHandle(handle)
        return os.path.join(self._dir, name)

    def put(self, data: bytes) -> str:
        """Store ``data`` under its content handle and return that handle.

        Raises:
            StoreError: the blob could not be written.
        """
        handle = hash_bytes(data, self._alg)
        path = self._path_for(handle)
        if os.path.exists(path):  # immutable: never overwrite
            return handle
        # Write to a uniquely named sibling first, then rename. Writing straight
        # to `path` could leave a truncated file behind on a crash, and the
        # exists-check above would then keep that corrupt blob forever.
        tmp_path = os.path.join(self._dir, f".tmp-{os.urandom(8).hex()}")
        try:
            with open(tmp_path, "xb") as f:
                f.write(data)
            os.replace(tmp_path, path)
        except OSError as e:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass  # best-effort cleanup; the original error is what matters
            raise StoreError(f"could not write {handle}: {e}") from e
        return handle

    def get(self, handle: str) -> bytes:
        """Return the bytes stored under ``handle``.

        Raises:
            UnknownHandle: no blob is stored under ``handle``.
            StoreError: the file exists but could not be read.
        """
        path = self._path_for(handle)
        try:
            with open(path, "rb") as f:
                return f.read()
        except FileNotFoundError:
            raise UnknownHandle(handle) from None
        except OSError as e:
            raise StoreError(f"could not read {handle}: {e}") from e

    def has(self, handle: str) -> bool:
        """Report whether a blob is stored under ``handle``."""
        try:
            path = self._path_for(handle)
        except UnknownHandle:
            return False
        return os.path.exists(path)

    def get_many(self, handles: list[str]) -> list[bytes | None]:
        """Read several handles; a missing handle yields None in its slot.

        Read failures other than a missing blob still propagate as StoreError.
        """
        out: list[bytes | None] = []
        for h in handles:
            try:
                out.append(self.get(h))
            except UnknownHandle:
                out.append(None)
        return out
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""summarize(value, branches) -> str, deterministic.
|
|
2
|
+
|
|
3
|
+
The summary is part of the hashed node, so it MUST be a pure function of structural facts and never
|
|
4
|
+
LLM prose, or the same value would yield different handles and content-addressing breaks.
|
|
5
|
+
derive_summary depends only on the branch names (count + sorted names), which makes it identical for
|
|
6
|
+
encode, extend, and merge alike — none of those reconstruct the aggregate value.
|
|
7
|
+
|
|
8
|
+
Names are sorted by UTF-16 code unit (via the UTF-16-BE encoding as the sort key), matching JCS key
|
|
9
|
+
ordering, so the string is byte-identical to the TypeScript port (whose default sort is already by
|
|
10
|
+
UTF-16 code unit). Plain code-point sorting would diverge on astral characters.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def derive_summary(value, branches):
    """Deterministic structural summary of a branch: "<n> branches: <sorted names>".

    Args:
        value: Unused; accepted so the signature matches the summarize contract.
        branches: Mapping of branch name to child handle. Only the names are read.

    Returns:
        "0 branches", "1 branch: <name>", or "<n> branches: <a>, <b>, ..." with
        the names ordered by UTF-16 code unit.
    """
    # The UTF-16-BE bytes of a string compare exactly like its UTF-16 code
    # units. "surrogatepass" lets a name containing a lone surrogate (legal in
    # a Python str, e.g. from json.loads) be ordered by that code unit instead
    # of raising UnicodeEncodeError.
    names = sorted(branches, key=lambda s: s.encode("utf-16-be", "surrogatepass"))
    n = len(names)
    if n == 0:
        return "0 branches"
    if n == 1:
        return f"1 branch: {names[0]}"
    return f"{n} branches: {', '.join(names)}"
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Unit tests for behavior the JSON conformance vectors cannot express.
|
|
2
|
+
|
|
3
|
+
Non-finite numbers are not valid JSON, so they only matter at the encode API boundary; handle
|
|
4
|
+
shape and key-order independence are properties of the primitives themselves. The shared
|
|
5
|
+
vector-driven conformance lives in py/conformance.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import math
|
|
9
|
+
import re
|
|
10
|
+
|
|
11
|
+
import pytest
|
|
12
|
+
|
|
13
|
+
from halo_format import canonical, handle_of
|
|
14
|
+
from halo_format.errors import CanonicalizationError
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@pytest.mark.parametrize("bad", (float("nan"), float("inf"), float("-inf")))
def test_rejects_non_finite(bad):
    """canonical() must refuse NaN and both infinities at the top level."""
    with pytest.raises(CanonicalizationError):
        canonical(bad)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def test_rejects_non_finite_nested():
    """A non-finite number nested inside containers is rejected as well."""
    nested = {"a": 1, "b": [2, math.inf]}
    with pytest.raises(CanonicalizationError):
        canonical(nested)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def test_handle_shape():
    """A handle is "h:" followed by 64 lowercase hex digits."""
    handle = handle_of({"hello": "world"})
    assert re.fullmatch(r"h:[0-9a-f]{64}", handle)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def test_handle_is_key_order_independent():
    """Two dicts with the same items in different insertion order share a handle."""
    forward = handle_of({"a": 1, "b": 2})
    backward = handle_of({"b": 2, "a": 1})
    assert forward == backward
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
"""Unit tests for the encode pipeline: carve, summarize, encode, extend, merge. These defend the
|
|
2
|
+
load-bearing invariants (determinism, content-addressing, Merkle integrity, dedup) that the shared
|
|
3
|
+
whole-tree envelope vectors will later pin cross-port. CROSS_PORT_ROOT is the root handle the
|
|
4
|
+
TypeScript port produces for the same input — asserting it here is a real cross-implementation lock
|
|
5
|
+
without needing a vector file yet."""
|
|
6
|
+
|
|
7
|
+
import pytest
|
|
8
|
+
|
|
9
|
+
from halo_format import (
|
|
10
|
+
MemoryStore,
|
|
11
|
+
WrongKind,
|
|
12
|
+
auto_carve,
|
|
13
|
+
branch_node,
|
|
14
|
+
decode,
|
|
15
|
+
derive_summary,
|
|
16
|
+
encode,
|
|
17
|
+
extend,
|
|
18
|
+
is_branch,
|
|
19
|
+
leaf_node,
|
|
20
|
+
make_auto_carve,
|
|
21
|
+
merge,
|
|
22
|
+
node_handle,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
# Credit-report-shaped input. The 2000-character `note` makes the top-level object large
# enough that it is carved into branches.
FIXTURE = dict(
    income={"monthly": 4200},
    debts={"monthly": 2604},
    inquiries=[1, 2, 3],
    tradelines=list("abcdefg"),
    note="x" * 2000,
)

# Root handle the TypeScript port yields for FIXTURE; pinning it here locks cross-port interop.
CROSS_PORT_ROOT = "h:41d7286f74ae2aeb6ca69823ddf25b9381dccba488602c1a8347a6b5eb163223"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def test_auto_carve_inlines_small_and_primitive():
    """Primitives, empty containers and a small object are all kept as a leaf."""
    # The final entry is the "small object inlines" case.
    for value in (42, "s", None, {}, [], {"a": 1}):
        assert auto_carve(value, []) == {"as": "leaf"}
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def test_auto_carve_carves_large_object():
    """An object with a 2000-character member becomes a branch with one child per key."""
    decision = auto_carve({"a": "x" * 2000, "b": 1}, [])
    assert decision["as"] == "branch"
    child_names = sorted(decision["children"].keys())
    assert child_names == ["a", "b"]
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def test_auto_carve_chunks_long_array():
    """With array_threshold=25, a 60-item list is split into three index-named chunks."""
    items = list(range(60))
    carve = make_auto_carve(array_threshold=25)
    decision = carve(items, [])
    assert decision["as"] == "branch"
    children = decision["children"]
    assert list(children.keys()) == ["0", "1", "2"]
    assert children["0"] == items[:25]
    assert children["2"] == items[50:]
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def test_auto_carve_inline_bytes_zero_forces_branch():
    """With inline_bytes=0 even a tiny object is carved as a branch."""
    carve = make_auto_carve(inline_bytes=0)
    decision = carve({"a": 1}, [])
    assert decision["as"] == "branch"
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def test_derive_summary_deterministic_and_sorted():
    """derive_summary handles zero, one and several branches and sorts the names."""
    cases = [
        ({}, "0 branches"),
        ({"child": "h:x"}, "1 branch: child"),
        ({"income": "h:1", "debts": "h:2"}, "2 branches: debts, income"),
    ]
    for branches, expected in cases:
        assert derive_summary(None, branches) == expected
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def test_encode_deterministic_and_cross_port():
    """Two independent encodes agree with each other and with the pinned cross-port root."""
    first = encode(FIXTURE, MemoryStore())
    second = encode(FIXTURE, MemoryStore())
    root = first["handle"]
    assert root == second["handle"]
    assert root == CROSS_PORT_ROOT
    assert first["envelope"] == second["envelope"]
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def test_encode_envelope_shape():
    """The envelope carries version, algorithm, root handle and the root's view."""
    result = encode(FIXTURE, MemoryStore())
    envelope = result["envelope"]
    assert envelope["halo"] == "1"
    assert envelope["alg"] == "sha256"
    assert envelope["root"] == result["handle"]
    view = envelope["view"]
    assert view["summary"] == "5 branches: debts, income, inquiries, note, tradelines"
    expected_names = ["debts", "income", "inquiries", "note", "tradelines"]
    assert sorted(view["branches"].keys()) == expected_names
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def test_encode_content_addressed_root():
    """Rebuilding the root branch node from the envelope view reproduces the root handle."""
    result = encode(FIXTURE, MemoryStore())
    view = result["envelope"]["view"]
    rebuilt_root = branch_node(view["summary"], view["branches"])
    assert node_handle(rebuilt_root) == result["handle"]
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def test_encode_merkle_integrity():
    """Changing one nested value changes the root handle."""
    altered_input = dict(FIXTURE, income={"monthly": 4201})
    original = encode(FIXTURE, MemoryStore())
    altered = encode(altered_input, MemoryStore())
    assert altered["handle"] != original["handle"]
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def test_encode_dedup_equal_subtrees():
    """Equal subtrees are stored once."""
    store = MemoryStore()
    shared = {"blob": "y" * 2000}
    tree = {"left": shared, "right": shared, "pad": "z" * 2000}
    encode(tree, store)
    # root + 3 children, but left and right share one leaf -> 4 nodes, not 5
    assert len(store) == 4
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def test_encode_leaf_root():
    """A short list encodes to a leaf root whose view has a leaf summary and no branches."""
    view = encode([1, 2, 3], MemoryStore())["envelope"]["view"]
    assert view["summary"] == "leaf: array of 3 items"
    assert view["branches"] == {}
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def test_extend_reuses_children_and_retains_prior_root():
    """extend() adds only the new leaf and a new root, and leaves the old root in the store."""
    store = MemoryStore()
    base = encode({"a": "x" * 2000, "b": "y" * 2000}, store)
    nodes_before = len(store)
    extended = extend(base["handle"], "c", "z" * 2000, store)
    branch_names = sorted(extended["envelope"]["view"]["branches"].keys())
    assert branch_names == ["a", "b", "c"]
    # one new leaf c + one new root
    assert len(store) - nodes_before == 2
    # prior root survives
    assert store.has(base["handle"])
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def test_extend_last_write_wins():
    """Extending with an existing branch name replaces that branch's child."""
    store = MemoryStore()
    base = encode({"a": "x" * 2000, "b": "y" * 2000}, store)
    extended = extend(base["handle"], "a", "new", store)
    replaced = extended["envelope"]["view"]["branches"]["a"]
    stored_node = decode(store.get(replaced))
    assert stored_node == leaf_node("new")
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def test_extend_rejects_non_branch_root():
    """extend() on a leaf root raises WrongKind."""
    store = MemoryStore()
    # A primitive encodes to a leaf root.
    leaf_root = encode(42, store)["handle"]
    with pytest.raises(WrongKind):
        extend(leaf_root, "x", 1, store)
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def test_merge_combines_branch_tables_last_write_wins():
    """merge() unions the branch tables; for a shared name the later root's child wins."""
    store = MemoryStore()
    first = encode({"a": "x" * 2000, "b": "y" * 2000}, store)
    second = encode({"b": "Y" * 2000, "c": "z" * 2000}, store)
    merged = merge([first["handle"], second["handle"]], store)
    merged_branches = merged["envelope"]["view"]["branches"]
    assert sorted(merged_branches.keys()) == ["a", "b", "c"]
    second_root = decode(store.get(second["handle"]))
    assert is_branch(second_root)
    assert merged_branches["b"] == second_root["branches"]["b"]
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
"""Unit tests for the navigator: verified walk/fetch/fetch_many, the trust chain (store-untrusted
|
|
2
|
+
safety), ref resolution (raw handle, branch name, nested path, map-scoped via registered
|
|
3
|
+
envelopes), and the encode -> open_ -> walk -> fetch round trip that is the L0+L1 milestone."""
|
|
4
|
+
|
|
5
|
+
import pytest
|
|
6
|
+
|
|
7
|
+
from halo_format import (
|
|
8
|
+
HashMismatch,
|
|
9
|
+
MemoryStore,
|
|
10
|
+
UnknownHandle,
|
|
11
|
+
WrongKind,
|
|
12
|
+
encode,
|
|
13
|
+
fetch,
|
|
14
|
+
fetch_many,
|
|
15
|
+
open_,
|
|
16
|
+
verify_envelope,
|
|
17
|
+
walk,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
# The 2000-character bio pushes the top object past 1024 bytes so it carves; profile carves as
# well; score and tags inline as leaves.
DATA = dict(
    profile=dict(name="Ada", city="London", bio="x" * 2000),
    score=612,
    tags=["a", "b"],
)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _encoded(value=DATA):
    """Encode ``value`` into a fresh MemoryStore and return (store, root handle, envelope)."""
    store = MemoryStore()
    result = encode(value, store)
    handle, envelope = result["handle"], result["envelope"]
    return store, handle, envelope
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class TamperStore:
    """Store wrapper that lies about exactly one handle.

    Every call is delegated to the wrapped store, except that get() on the target handle
    returns attacker-chosen bytes. This exercises the store-untrusted guarantee. get_many is
    deliberately absent, so reads fall back to get() and hit the override.
    """

    def __init__(self, base, target, evil):
        self._base, self._target, self._evil = base, target, evil

    def put(self, data):
        return self._base.put(data)

    def get(self, handle):
        if handle == self._target:
            return self._evil
        return self._base.get(handle)

    def has(self, handle):
        return self._base.has(handle)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def test_open_verifies_honest_branch_root():
    """open_() accepts an untampered branch-root envelope and exposes its root handle."""
    store, _, envelope = _encoded()
    navigator = open_(envelope, store)
    assert navigator.root == envelope["root"]
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def test_open_leaf_root():
    """open_() also accepts an envelope whose root is a leaf."""
    store, _, envelope = _encoded([1, 2, 3])
    navigator = open_(envelope, store)
    assert navigator.root == envelope["root"]
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def test_open_rejects_tampered_view():
    """An envelope whose view summary was altered fails open_() with HashMismatch."""
    store, _, envelope = _encoded()
    forged_view = dict(envelope["view"], summary="lies")
    forged = dict(envelope, view=forged_view)
    with pytest.raises(HashMismatch):
        open_(forged, store)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def test_verify_envelope_true_and_source_stable():
    """verify_envelope() is True for an honest envelope, whatever `source` is attached."""
    _, _, envelope = _encoded()
    assert verify_envelope(envelope) is True
    for source in ({"id": "m1"}, {"id": "m2", "tool": "t"}):
        with_source = {**envelope, "source": source}
        assert verify_envelope(with_source) is True
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def test_verify_envelope_false_for_tampered_view_and_leaf_root():
    """verify_envelope() is False for a forged view and for a leaf-root envelope."""
    _, _, envelope = _encoded()
    forged_view = dict(envelope["view"], summary="lies")
    forged = dict(envelope, view=forged_view)
    assert verify_envelope(forged) is False
    _, _, leaf_envelope = _encoded(42)
    assert verify_envelope(leaf_envelope) is False
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def test_free_walk_and_fetch_by_handle():
    """The module-level walk() and fetch() work directly on raw handles."""
    store, _, envelope = _encoded()
    profile = walk(envelope["view"]["branches"]["profile"], store)
    assert profile["summary"] == "3 branches: bio, city, name"
    name_handle = profile["branches"]["name"]
    assert fetch(name_handle, store) == "Ada"
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def test_walk_on_leaf_and_fetch_on_branch_raise_wrongkind():
    """walk() on a leaf and fetch() on a branch both raise WrongKind."""
    store, _, envelope = _encoded()
    branches = envelope["view"]["branches"]
    with pytest.raises(WrongKind):
        walk(branches["score"], store)  # score is a leaf
    with pytest.raises(WrongKind):
        fetch(branches["profile"], store)  # profile is a branch
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def test_unknown_handle():
    """fetch() of a handle that is not in the store raises UnknownHandle."""
    store, _, _ = _encoded()
    absent = "h:" + "0" * 64
    with pytest.raises(UnknownHandle):
        fetch(absent, store)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def test_store_untrusted_substituted_leaf_fails():
    """A store that swaps in different bytes for a handle is caught with HashMismatch."""
    store, _, envelope = _encoded()
    target = envelope["view"]["branches"]["score"]
    lying_store = TamperStore(store, target, b'{"k":"l","value":999}')
    with pytest.raises(HashMismatch):
        fetch(target, lying_store)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def test_navigator_resolves_name_nested_and_raw_handle():
    """The navigator resolves branch names, dotted paths and raw handles."""
    store, _, envelope = _encoded()
    navigator = open_(envelope, store)
    assert navigator.fetch("score") == 612
    assert navigator.fetch("profile.name") == "Ada"
    assert navigator.fetch("profile.city") == "London"
    profile = navigator.walk("profile")
    assert sorted(profile["branches"].keys()) == ["bio", "city", "name"]
    # raw handle
    score_handle = envelope["view"]["branches"]["score"]
    assert navigator.fetch(score_handle) == 612
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def test_navigator_rejects_leaf_descent_and_unknown_name():
    """Descending through a leaf raises WrongKind; an unknown name raises UnknownHandle."""
    store, _, envelope = _encoded()
    navigator = open_(envelope, store)
    with pytest.raises(WrongKind):
        navigator.fetch("score.nope")
    with pytest.raises(UnknownHandle):
        navigator.fetch("missing")
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def test_navigator_map_scoped_refs():
    """Refs prefixed with a registered source id resolve against that envelope."""
    store = MemoryStore()
    first = encode({"score": 1, "pad": "x" * 2000}, store)
    second = encode({"score": 2, "pad": "y" * 2000}, store)
    primary = dict(first["envelope"], source={"id": "m1"})
    secondary = dict(second["envelope"], source={"id": "m2"})
    navigator = open_(primary, store).register(secondary)
    assert navigator.fetch("m1.score") == 1
    assert navigator.fetch("m2.score") == 2
    # bare name -> primary (m1)
    assert navigator.fetch("score") == 1
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def test_fetch_many_batches_with_per_ref_results():
    """fetch_many() reports success or the error name separately for each handle."""
    store, _, envelope = _encoded()
    branches = envelope["view"]["branches"]
    score_handle = branches["score"]
    profile_handle = branches["profile"]
    absent = "h:" + "0" * 64
    results = fetch_many([score_handle, profile_handle, absent], store)
    assert results[score_handle] == {"ok": True, "value": 612}
    assert results[profile_handle] == {"ok": False, "error": "WrongKind"}
    assert results[absent] == {"ok": False, "error": "UnknownHandle"}
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def test_navigator_fetch_many_surfaces_hashmismatch_per_entry():
    """One tampered leaf yields a HashMismatch entry without failing the other refs."""
    store, _, envelope = _encoded()
    target = envelope["view"]["branches"]["score"]
    lying_store = TamperStore(store, target, b'{"k":"l","value":999}')
    navigator = open_(envelope, lying_store)
    results = navigator.fetch_many(["score", "profile.name", "missing"])
    assert results["score"] == {"ok": False, "error": "HashMismatch"}
    assert results["profile.name"] == {"ok": True, "value": "Ada"}
    assert results["missing"] == {"ok": False, "error": "UnknownHandle"}
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def test_round_trip_milestone():
    """encode -> open_ -> fetch_many returns the original leaf values."""
    store, _, envelope = _encoded()
    navigator = open_(envelope, store)
    expected = {
        "profile.name": {"ok": True, "value": "Ada"},
        "profile.city": {"ok": True, "value": "London"},
        "score": {"ok": True, "value": 612},
    }
    results = navigator.fetch_many(["profile.name", "profile.city", "score"])
    assert results == expected
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""Unit tests for node decode validation and the leaf/branch kind-tag separation. Happy-path
|
|
2
|
+
serialize/handle/round-trip behavior is covered by the shared node vectors in py/conformance."""
|
|
3
|
+
|
|
4
|
+
import pytest
|
|
5
|
+
|
|
6
|
+
from halo_format import (
|
|
7
|
+
HaloError,
|
|
8
|
+
branch_node,
|
|
9
|
+
decode,
|
|
10
|
+
is_branch,
|
|
11
|
+
is_leaf,
|
|
12
|
+
leaf_node,
|
|
13
|
+
node_handle,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def test_leaf_constructor_and_guards():
    """leaf_node() builds the tagged leaf dict, and the kind guards classify it as a leaf."""
    leaf = leaf_node(42)
    assert leaf == {"k": "l", "value": 42}
    assert is_leaf(leaf) is True
    assert is_branch(leaf) is False
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def test_branch_constructor_defensive_copy():
    """branch_node() copies the branch table, so later caller mutation does not leak in."""
    caller_map = {"a": "h:aa"}
    node = branch_node("one", caller_map)
    # Mutate the caller's map after construction; the node must be unaffected.
    caller_map["a"] = "h:bb"
    assert node["branches"]["a"] == "h:aa"
    assert is_branch(node) is True
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@pytest.mark.parametrize(
    "raw",
    [
        # invalid JSON
        b"{bad",
        # not an object
        b"[1,2]",
        # unknown kind
        b'{"k":"x"}',
        # leaf missing value
        b'{"k":"l"}',
        # non-string summary
        b'{"k":"b","summary":1,"branches":{}}',
        # non-string child handle
        b'{"k":"b","summary":"s","branches":{"a":1}}',
    ],
)
def test_decode_rejects_malformed(raw):
    """decode() raises HaloError for each malformed node encoding."""
    with pytest.raises(HaloError):
        decode(raw)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def test_kind_tag_prevents_collision():
    """A leaf whose value looks like a branch body hashes differently from that branch."""
    lookalike_leaf = leaf_node({"summary": "x", "branches": {}})
    real_branch = branch_node("x", {})
    leaf_handle = node_handle(lookalike_leaf)
    branch_handle = node_handle(real_branch)
    assert leaf_handle != branch_handle
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""Unit tests for the stores: content-addressing, idempotent dedup, immutability, missing
|
|
2
|
+
handles, batch reads, filesystem persistence, and cross-store handle agreement."""
|
|
3
|
+
|
|
4
|
+
import re
|
|
5
|
+
|
|
6
|
+
from halo_format import FileStore, MemoryStore, UnknownHandle
|
|
7
|
+
|
|
8
|
+
# A well-formed handle ("h:" + 64 zeros) that the tests never store.
MISSING = "h:".ljust(66, "0")
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def test_memory_put_dedups_identical_bytes():
    """Putting the same bytes twice returns one handle and stores a single entry."""
    store = MemoryStore()
    first, second = store.put(b"hello"), store.put(b"hello")
    assert first == second
    assert re.fullmatch(r"h:[0-9a-f]{64}", first)
    # stored once
    assert len(store) == 1
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def test_memory_get_and_unknown_handle():
    """get() returns stored bytes and raises UnknownHandle for an absent handle."""
    s = MemoryStore()
    h = s.put(b"x")
    assert s.get(h) == b"x"
    try:
        s.get(MISSING)
    except UnknownHandle:
        pass
    else:
        # Raise explicitly: `assert False` is stripped under `python -O`, which
        # would let this test pass even when nothing was raised.
        raise AssertionError("expected UnknownHandle")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_memory_has():
    """has() is True for a stored handle and False for an absent one."""
    store = MemoryStore()
    stored = store.put(b"y")
    assert store.has(stored) is True
    assert store.has(MISSING) is False
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def test_memory_get_many_missing_in_place():
    """get_many() keeps input order and puts None where a handle is absent."""
    store = MemoryStore()
    stored = store.put(b"present")
    results = store.get_many([stored, MISSING])
    assert results == [b"present", None]
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def test_filestore_roundtrip_and_persistence(tmp_path):
    """A blob written by one FileStore is readable from a new instance on the same directory."""
    a = FileStore(str(tmp_path))
    h = a.put(b"persisted")

    b = FileStore(str(tmp_path))  # fresh instance, same dir
    assert b.has(h) is True
    assert b.get(h) == b"persisted"
    try:
        b.get(MISSING)
    except UnknownHandle:
        pass
    else:
        # Raise explicitly: `assert False` is stripped under `python -O`, which
        # would let this test pass even when nothing was raised.
        raise AssertionError("expected UnknownHandle")
    assert b.get_many([h, MISSING]) == [b"persisted", None]
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def test_stores_agree_on_handle(tmp_path):
    """MemoryStore and FileStore produce the same handle for the same bytes."""
    data = b"content-addressed regardless of store"
    memory_handle = MemoryStore().put(data)
    file_handle = FileStore(str(tmp_path)).put(data)
    assert memory_handle == file_handle
|