fast-hash-utils 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,36 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+
8
+ jobs:
9
+ lint:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+ - uses: actions/setup-python@v5
14
+ with:
15
+ python-version: "3.13"
16
+ - run: pip install ruff
17
+ - run: ruff check .
18
+ - run: ruff format --check .
19
+
20
+ test:
21
+ needs: lint
22
+ strategy:
23
+ matrix:
24
+ python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"]
25
+ os: [ubuntu-latest]
26
+ runs-on: ${{ matrix.os }}
27
+ steps:
28
+ - uses: actions/checkout@v4
29
+ with:
30
+ fetch-depth: 0
31
+ - uses: actions/setup-python@v5
32
+ with:
33
+ python-version: ${{ matrix.python-version }}
34
+ allow-prereleases: true
35
+ - run: pip install -e ".[dev]"
36
+ - run: pytest -v
@@ -0,0 +1,88 @@
1
+ name: Release
2
+
3
+ on:
4
+ push:
5
+ tags: ["v*"]
6
+
7
+ jobs:
8
+ sdist:
9
+ runs-on: ubuntu-latest
10
+ steps:
11
+ - uses: actions/checkout@v4
12
+ with:
13
+ fetch-depth: 0
14
+ - uses: actions/setup-python@v5
15
+ with:
16
+ python-version: "3.13"
17
+ - run: pip install build
18
+ - run: python -m build --sdist
19
+ - uses: actions/upload-artifact@v4
20
+ with:
21
+ name: sdist
22
+ path: dist/*.tar.gz
23
+
24
+ wheels:
25
+ strategy:
26
+ matrix:
27
+ include:
28
+ - os: ubuntu-latest
29
+ arch: x86_64
30
+ skip: "pp* *-musllinux_*"
31
+ name: linux-x86_64-manylinux
32
+ - os: ubuntu-latest
33
+ arch: x86_64
34
+ skip: "pp* *-manylinux_*"
35
+ name: linux-x86_64-musllinux
36
+ - os: ubuntu-latest
37
+ arch: aarch64
38
+ skip: "pp* *-musllinux_*"
39
+ name: linux-aarch64-manylinux
40
+ qemu: true
41
+ - os: ubuntu-latest
42
+ arch: aarch64
43
+ skip: "pp* *-manylinux_*"
44
+ name: linux-aarch64-musllinux
45
+ qemu: true
46
+ - os: macos-latest
47
+ arch: x86_64
48
+ skip: "pp*"
49
+ name: macos-x86_64
50
+ - os: macos-latest
51
+ arch: arm64
52
+ skip: "pp*"
53
+ name: macos-arm64
54
+ - os: windows-latest
55
+ arch: AMD64
56
+ skip: "pp*"
57
+ name: windows-x86_64
58
+ runs-on: ${{ matrix.os }}
59
+ steps:
60
+ - uses: actions/checkout@v4
61
+ with:
62
+ fetch-depth: 0
63
+ - uses: docker/setup-qemu-action@v3
64
+ if: matrix.qemu
65
+ with:
66
+ platforms: arm64
67
+ - uses: pypa/cibuildwheel@v2.23
68
+ env:
69
+ CIBW_ARCHS: ${{ matrix.arch }}
70
+ CIBW_SKIP: ${{ matrix.skip }}
71
+ CIBW_BUILD: cp39-* cp310-* cp311-* cp312-* cp313-* cp314-*
72
+ - uses: actions/upload-artifact@v4
73
+ with:
74
+ name: wheels-${{ matrix.name }}
75
+ path: wheelhouse/*.whl
76
+
77
+ publish:
78
+ needs: [sdist, wheels]
79
+ runs-on: ubuntu-latest
80
+ permissions:
81
+ id-token: write
82
+ environment: pypi
83
+ steps:
84
+ - uses: actions/download-artifact@v4
85
+ with:
86
+ path: dist
87
+ merge-multiple: true
88
+ - uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,11 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.so
4
+ *.pyd
5
+ *.egg-info/
6
+ dist/
7
+ build/
8
+ .venv/
9
+ .pytest_cache/
10
+ .mypy_cache/
11
+ .ruff_cache/
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Toby Mao
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,27 @@
1
+ .PHONY: install install-release build test test-compiled bench style clean
2
+
3
+ install:
4
+ MYPYC_OPT_LEVEL=0 pip install -e ".[dev]"
5
+
6
+ install-release:
7
+ MYPYC_OPT_LEVEL=3 pip install -e ".[dev]"
8
+
9
+ build:
10
+ mypyc hash_utils/_core.py
11
+
12
+ test:
13
+ pytest -v
14
+
15
+ test-compiled: build test
16
+
17
+ bench:
18
+ python benchmarks/bench.py
19
+
20
+ style:
21
+ ruff check --fix .
22
+ ruff format .
23
+
24
+ clean:
25
+ rm -rf build/ dist/ *.egg-info .pytest_cache
26
+ rm -f hash_utils/*.so hash_utils/*.pyd
27
+ find . -type d -name __pycache__ -exec rm -rf {} +
@@ -0,0 +1,87 @@
1
+ Metadata-Version: 2.4
2
+ Name: fast-hash-utils
3
+ Version: 0.2.0
4
+ Summary: Fast deterministic dict hashing via mypyc
5
+ Author: Toby Mao
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/tobymao/hash-utils
8
+ Project-URL: Repository, https://github.com/tobymao/hash-utils
9
+ Project-URL: Issues, https://github.com/tobymao/hash-utils/issues
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.9
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Programming Language :: Python :: 3.14
19
+ Classifier: Typing :: Typed
20
+ Requires-Python: >=3.9
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: mypy-extensions>=1.0
24
+ Provides-Extra: dev
25
+ Requires-Dist: mypy>=1.19; extra == "dev"
26
+ Requires-Dist: pytest>=8.4; extra == "dev"
27
+ Requires-Dist: ruff>=0.15; extra == "dev"
28
+ Dynamic: license-file
29
+
30
+ # hash-utils
31
+
32
+ Fast deterministic dict hashing via mypyc.
33
+
34
+ ## Functions
35
+
36
+ - **`dict_hash(d)`** — deterministic hash of a nested dict's full content (keys + values)
37
+ - **`shape_hash(d)`** — structural hash that ignores string/int/float values, only hashing keys, value types, bools, and container lengths
38
+
39
+ ## Install
40
+
41
+ ```bash
42
+ pip install fast-hash-utils
43
+ ```
44
+
45
+ ## Usage
46
+
47
+ ```python
48
+ from hash_utils import dict_hash, shape_hash
49
+
50
+ d1 = {"name": "alice", "config": {"enabled": True, "tags": []}}
51
+ d2 = {"name": "bob", "config": {"enabled": True, "tags": []}}
52
+
53
+ # Full content hash — different names produce different hashes
54
+ dict_hash(d1) != dict_hash(d2)
55
+
56
+ # Shape hash — same structure produces same hash
57
+ shape_hash(d1) == shape_hash(d2)
58
+ ```
59
+
60
+ ## Why
61
+
62
+ `shape_hash` enables massive deduplication for jsonschema validation. If 13,000 dicts share the same structure but differ only in string values, they collapse to 1 unique shape — skip 12,999 redundant validations.
63
+
64
+ ## Performance
65
+
66
+ Compiled via mypyc to native C. ~400K ops/s for nested dicts on a single core.
67
+
68
+ | Method | ops/s | Deterministic |
69
+ |---|---|---|
70
+ | `shape_hash` (mypyc) | 445K | Yes |
71
+ | `dict_hash` (mypyc) | 405K | Yes |
72
+ | `hash(repr())` | 312K | No |
73
+ | `json.dumps + hash` | 206K | Yes |
74
+
75
+ ## Development
76
+
77
+ ```bash
78
+ python -m venv .venv && source .venv/bin/activate
79
+ make install # editable install with dev deps
80
+ make test # run tests (pure Python or compiled)
81
+ make style      # ruff check --fix + ruff format
82
+ make clean # remove build artifacts
83
+ ```
84
+
85
+ ## License
86
+
87
+ MIT
@@ -0,0 +1,58 @@
1
+ # hash-utils
2
+
3
+ Fast deterministic dict hashing via mypyc.
4
+
5
+ ## Functions
6
+
7
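+ - **`dict_hash(d)`** — deterministic, key-order-independent hash of a nested dict's full content (keys + values). Strings are hashed with Python's built-in `hash()`, so results are stable within one interpreter process but differ between runs unless `PYTHONHASHSEED` is fixed.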
8
+ - **`shape_hash(d)`** — structural hash that ignores string/int/float values, only hashing keys, value types, bools, and container lengths
9
+
10
+ ## Install
11
+
12
+ ```bash
13
+ pip install fast-hash-utils
14
+ ```
15
+
16
+ ## Usage
17
+
18
+ ```python
19
+ from hash_utils import dict_hash, shape_hash
20
+
21
+ d1 = {"name": "alice", "config": {"enabled": True, "tags": []}}
22
+ d2 = {"name": "bob", "config": {"enabled": True, "tags": []}}
23
+
24
+ # Full content hash — different names produce different hashes
25
+ dict_hash(d1) != dict_hash(d2)
26
+
27
+ # Shape hash — same structure produces same hash
28
+ shape_hash(d1) == shape_hash(d2)
29
+ ```
30
+
31
+ ## Why
32
+
33
+ `shape_hash` enables massive deduplication for jsonschema validation. If 13,000 dicts share the same structure but differ only in string values, they collapse to 1 unique shape — skip 12,999 redundant validations.
34
+
35
+ ## Performance
36
+
37
+ Compiled via mypyc to native C. ~400K ops/s for nested dicts on a single core.
38
+
39
+ | Method | ops/s | Deterministic |
40
+ |---|---|---|
41
+ | `shape_hash` (mypyc) | 445K | Yes |
42
+ | `dict_hash` (mypyc) | 405K | Yes |
43
+ | `hash(repr())` | 312K | No |
44
+ | `json.dumps + hash` | 206K | Yes |
45
+
46
+ ## Development
47
+
48
+ ```bash
49
+ python -m venv .venv && source .venv/bin/activate
50
+ make install # editable install with dev deps
51
+ make test # run tests (pure Python or compiled)
52
+ make style      # ruff check --fix + ruff format
53
+ make clean # remove build artifacts
54
+ ```
55
+
56
+ ## License
57
+
58
+ MIT
@@ -0,0 +1,94 @@
1
+ """Benchmarks for dict_hash and shape_hash (pure Python vs mypyc)."""
2
+
3
+ import json
4
+ import time
5
+
6
+ N = 10_000
7
+
8
+
9
+ def make_dicts(n: int) -> list[dict]:
10
+ """Generate n realistic nested dicts."""
11
+ return [
12
+ {
13
+ "name": f"test_node_{i}",
14
+ "unique_id": f"project.model.{i}",
15
+ "description": f"A test node for benchmarking purposes #{i}",
16
+ "config": {
17
+ "enabled": True,
18
+ "severity": "ERROR",
19
+ "warn_if": "!= 0",
20
+ "error_if": "> 10",
21
+ "tags": ["ci", "nightly"],
22
+ "meta": {"owner": "team-data", "priority": i % 5},
23
+ },
24
+ "columns": {
25
+ "id": {"type": "integer", "nullable": False},
26
+ "name": {"type": "string", "nullable": True},
27
+ "value": {"type": "float", "nullable": True},
28
+ },
29
+ }
30
+ for i in range(n)
31
+ ]
32
+
33
+
34
+ def bench(name: str, fn, dicts: list[dict]) -> float:
35
+ # Warmup
36
+ for d in dicts[:100]:
37
+ fn(d)
38
+
39
+ start = time.perf_counter()
40
+ for d in dicts:
41
+ fn(d)
42
+ elapsed = time.perf_counter() - start
43
+
44
+ ops = len(dicts) / elapsed
45
+ print(f"{name:40s} {elapsed:.4f}s {ops:>10,.0f} ops/s")
46
+ return elapsed
47
+
48
+
49
+ def json_hash(d: dict) -> int:
50
+ return hash(json.dumps(d, sort_keys=True))
51
+
52
+
53
+ def main() -> None:
54
+ import hash_utils._core as core_mod
55
+
56
+ is_compiled = hasattr(core_mod, "__loader__") and "mypyc" in str(
57
+ getattr(core_mod, "__file__", "")
58
+ )
59
+ # More reliable: compiled modules are .so/.pyd, not .py
60
+ mod_file = getattr(core_mod, "__file__", "")
61
+ is_compiled = mod_file.endswith((".so", ".pyd"))
62
+
63
+ dicts = make_dicts(N)
64
+ print(f"Benchmarking {N:,} nested dicts")
65
+ print(f"mypyc compiled: {is_compiled}\n")
66
+ print(f"{'Method':40s} {'Time':>7s} {'Throughput':>12s}")
67
+ print("-" * 66)
68
+
69
+ # Pure Python versions (import the source directly)
70
+ import importlib
71
+ import importlib.util
72
+ import pathlib
73
+
74
+ core_py = pathlib.Path(__file__).resolve().parent.parent / "hash_utils" / "_core.py"
75
+ spec = importlib.util.spec_from_file_location("_core_pure", core_py)
76
+ pure = importlib.util.module_from_spec(spec)
77
+ spec.loader.exec_module(pure)
78
+
79
+ bench("dict_hash (pure python)", pure.dict_hash, dicts)
80
+ bench("shape_hash (pure python)", pure.shape_hash, dicts)
81
+
82
+ if is_compiled:
83
+ from hash_utils import dict_hash, shape_hash
84
+
85
+ bench("dict_hash (mypyc)", dict_hash, dicts)
86
+
87
+ bench("hash(json.dumps(sort_keys=True))", json_hash, dicts)
88
+
89
+ if is_compiled:
90
+ bench("shape_hash (mypyc)", shape_hash, dicts)
91
+
92
+
93
+ if __name__ == "__main__":
94
+ main()
@@ -0,0 +1,87 @@
1
+ Metadata-Version: 2.4
2
+ Name: fast-hash-utils
3
+ Version: 0.2.0
4
+ Summary: Fast deterministic dict hashing via mypyc
5
+ Author: Toby Mao
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/tobymao/hash-utils
8
+ Project-URL: Repository, https://github.com/tobymao/hash-utils
9
+ Project-URL: Issues, https://github.com/tobymao/hash-utils/issues
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.9
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Programming Language :: Python :: 3.14
19
+ Classifier: Typing :: Typed
20
+ Requires-Python: >=3.9
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: mypy-extensions>=1.0
24
+ Provides-Extra: dev
25
+ Requires-Dist: mypy>=1.19; extra == "dev"
26
+ Requires-Dist: pytest>=8.4; extra == "dev"
27
+ Requires-Dist: ruff>=0.15; extra == "dev"
28
+ Dynamic: license-file
29
+
30
+ # hash-utils
31
+
32
+ Fast deterministic dict hashing via mypyc.
33
+
34
+ ## Functions
35
+
36
+ - **`dict_hash(d)`** — deterministic hash of a nested dict's full content (keys + values)
37
+ - **`shape_hash(d)`** — structural hash that ignores string/int/float values, only hashing keys, value types, bools, and container lengths
38
+
39
+ ## Install
40
+
41
+ ```bash
42
+ pip install fast-hash-utils
43
+ ```
44
+
45
+ ## Usage
46
+
47
+ ```python
48
+ from hash_utils import dict_hash, shape_hash
49
+
50
+ d1 = {"name": "alice", "config": {"enabled": True, "tags": []}}
51
+ d2 = {"name": "bob", "config": {"enabled": True, "tags": []}}
52
+
53
+ # Full content hash — different names produce different hashes
54
+ dict_hash(d1) != dict_hash(d2)
55
+
56
+ # Shape hash — same structure produces same hash
57
+ shape_hash(d1) == shape_hash(d2)
58
+ ```
59
+
60
+ ## Why
61
+
62
+ `shape_hash` enables massive deduplication for jsonschema validation. If 13,000 dicts share the same structure but differ only in string values, they collapse to 1 unique shape — skip 12,999 redundant validations.
63
+
64
+ ## Performance
65
+
66
+ Compiled via mypyc to native C. ~400K ops/s for nested dicts on a single core.
67
+
68
+ | Method | ops/s | Deterministic |
69
+ |---|---|---|
70
+ | `shape_hash` (mypyc) | 445K | Yes |
71
+ | `dict_hash` (mypyc) | 405K | Yes |
72
+ | `hash(repr())` | 312K | No |
73
+ | `json.dumps + hash` | 206K | Yes |
74
+
75
+ ## Development
76
+
77
+ ```bash
78
+ python -m venv .venv && source .venv/bin/activate
79
+ make install # editable install with dev deps
80
+ make test # run tests (pure Python or compiled)
81
+ make style      # ruff check --fix + ruff format
82
+ make clean # remove build artifacts
83
+ ```
84
+
85
+ ## License
86
+
87
+ MIT
@@ -0,0 +1,17 @@
1
+ .gitignore
2
+ LICENSE
3
+ Makefile
4
+ README.md
5
+ pyproject.toml
6
+ setup.py
7
+ .github/workflows/ci.yml
8
+ .github/workflows/release.yml
9
+ benchmarks/bench.py
10
+ fast_hash_utils.egg-info/PKG-INFO
11
+ fast_hash_utils.egg-info/SOURCES.txt
12
+ fast_hash_utils.egg-info/dependency_links.txt
13
+ fast_hash_utils.egg-info/requires.txt
14
+ fast_hash_utils.egg-info/top_level.txt
15
+ hash_utils/__init__.py
16
+ hash_utils/_core.py
17
+ tests/test_hash_utils.py
@@ -0,0 +1,6 @@
1
+ mypy-extensions>=1.0
2
+
3
+ [dev]
4
+ mypy>=1.19
5
+ pytest>=8.4
6
+ ruff>=0.15
@@ -0,0 +1,8 @@
1
+ """Fast deterministic dict hashing via mypyc."""
2
+
3
+ from importlib.metadata import version
4
+
5
+ from hash_utils._core import dict_hash, shape_hash
6
+
7
+ __version__ = version("fast-hash-utils")
8
+ __all__ = ["dict_hash", "shape_hash"]
@@ -0,0 +1,120 @@
1
+ """Fast deterministic dict hashing for deduplication.
2
+
3
+ Iteratively traverses a nested dict/list/scalar structure using an
4
+ explicit stack. Accumulates a native i64 hash via bit mixing.
5
+ Designed to be compiled with mypyc.
6
+ """
7
+
8
+ from typing import Dict, List
9
+
10
+ from mypy_extensions import i64
11
+
12
+ # Type tags for shape hashing — distinct primes for good mixing
13
+ _TAG_NONE: i64 = 7
14
+ _TAG_BOOL_T: i64 = 11
15
+ _TAG_BOOL_F: i64 = 13
16
+ _TAG_INT: i64 = 17
17
+ _TAG_FLOAT: i64 = 19
18
+ _TAG_STR: i64 = 23
19
+ _TAG_DICT: i64 = 29
20
+ _TAG_LIST: i64 = 31
21
+ _TAG_OTHER: i64 = 37
22
+
23
+
24
+ def _hash_to_i64(obj: object) -> i64:
25
+ """Get the Python hash of an object as a native i64."""
26
+ return i64(hash(obj))
27
+
28
+
29
+ def _mix(h: i64, v: i64) -> i64:
30
+ """Murmur-inspired bit mixing using native i64 arithmetic."""
31
+ h = h ^ v
32
+ h = (h << 13) | ((h >> 51) & 0x1FFF)
33
+ h = h * 0x5BD1E995
34
+ return h
35
+
36
+
37
+ def dict_hash(d: Dict[object, object]) -> int:
38
+ """Deterministic hash of a nested dict — full values."""
39
+ h: i64 = 0
40
+ stack: List[object] = [d]
41
+
42
+ while stack:
43
+ item: object = stack.pop()
44
+
45
+ if item is None:
46
+ h = _mix(h, _TAG_NONE)
47
+ elif isinstance(item, bool):
48
+ h = _mix(h, 1 if item else 2)
49
+ elif isinstance(item, int):
50
+ h = _mix(h, _hash_to_i64(item))
51
+ elif isinstance(item, float):
52
+ h = _mix(h, _hash_to_i64(item))
53
+ elif isinstance(item, str):
54
+ h = _mix(h, _hash_to_i64(item))
55
+ h = _mix(h, i64(len(item)))
56
+ elif isinstance(item, dict):
57
+ h = _mix(h, i64(len(item)))
58
+ keys: List[str] = sorted(item.keys())
59
+ i: i64 = i64(len(keys)) - 1
60
+ while i >= 0:
61
+ k: str = keys[i]
62
+ stack.append(item[k])
63
+ stack.append(k)
64
+ i -= 1
65
+ elif isinstance(item, list):
66
+ h = _mix(h, i64(len(item)))
67
+ i = i64(len(item)) - 1
68
+ while i >= 0:
69
+ stack.append(item[i])
70
+ i -= 1
71
+ else:
72
+ h = _mix(h, _hash_to_i64(item))
73
+
74
+ return int(h)
75
+
76
+
77
+ def shape_hash(d: Dict[object, object]) -> int:
78
+ """Deterministic hash of a nested dict — shape only.
79
+
80
+ Hashes keys and value types but ignores string/int/float content.
81
+ Two dicts with the same structure produce the same jsonschema validation
82
+ result if the schema never constrains str/int/float values; dedup relies on that.
83
+ """
84
+ h: i64 = 0
85
+ stack: List[object] = [d]
86
+
87
+ while stack:
88
+ item: object = stack.pop()
89
+
90
+ if item is None:
91
+ h = _mix(h, _TAG_NONE)
92
+ elif isinstance(item, bool):
93
+ h = _mix(h, _TAG_BOOL_T if item else _TAG_BOOL_F)
94
+ elif isinstance(item, int):
95
+ h = _mix(h, _TAG_INT)
96
+ elif isinstance(item, float):
97
+ h = _mix(h, _TAG_FLOAT)
98
+ elif isinstance(item, str):
99
+ h = _mix(h, _TAG_STR)
100
+ elif isinstance(item, dict):
101
+ h = _mix(h, _TAG_DICT)
102
+ h = _mix(h, i64(len(item)))
103
+ keys: List[str] = sorted(item.keys())
104
+ i: i64 = i64(len(keys)) - 1
105
+ while i >= 0:
106
+ k: str = keys[i]
107
+ h = _mix(h, _hash_to_i64(k))
108
+ stack.append(item[k])
109
+ i -= 1
110
+ elif isinstance(item, list):
111
+ h = _mix(h, _TAG_LIST)
112
+ h = _mix(h, i64(len(item)))
113
+ i = i64(len(item)) - 1
114
+ while i >= 0:
115
+ stack.append(item[i])
116
+ i -= 1
117
+ else:
118
+ h = _mix(h, _TAG_OTHER)
119
+
120
+ return int(h)
@@ -0,0 +1,54 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "setuptools-scm>=8", "mypy>=1.0", "mypy-extensions>=1.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "fast-hash-utils"
7
+ dynamic = ["version"]
8
+ description = "Fast deterministic dict hashing via mypyc"
9
+ readme = "README.md"
10
+ license = "MIT"
11
+ requires-python = ">=3.9"
12
+ authors = [{name = "Toby Mao"}]
13
+ dependencies = ["mypy-extensions>=1.0"]
14
+ classifiers = [
15
+ "Development Status :: 4 - Beta",
16
+ "Intended Audience :: Developers",
17
+ "Programming Language :: Python :: 3",
18
+ "Programming Language :: Python :: 3.9",
19
+ "Programming Language :: Python :: 3.10",
20
+ "Programming Language :: Python :: 3.11",
21
+ "Programming Language :: Python :: 3.12",
22
+ "Programming Language :: Python :: 3.13",
23
+ "Programming Language :: Python :: 3.14",
24
+ "Typing :: Typed",
25
+ ]
26
+
27
+ [project.optional-dependencies]
28
+ dev = [
29
+ "mypy>=1.19",
30
+ "pytest>=8.4",
31
+ "ruff>=0.15",
32
+ ]
33
+
34
+ [project.urls]
35
+ Homepage = "https://github.com/tobymao/hash-utils"
36
+ Repository = "https://github.com/tobymao/hash-utils"
37
+ Issues = "https://github.com/tobymao/hash-utils/issues"
38
+
39
+ [tool.setuptools_scm]
40
+
41
+ [tool.setuptools.packages.find]
42
+ include = ["hash_utils*"]
43
+
44
+ [tool.pytest.ini_options]
45
+ testpaths = ["tests"]
46
+
47
+ [tool.ruff]
48
+ target-version = "py39"
49
+
50
+ [tool.ruff.lint]
51
+ select = ["E", "F", "I", "UP"]
52
+
53
+ [tool.ruff.lint.per-file-ignores]
54
+ "hash_utils/_core.py" = ["UP006", "UP035"] # typing.Dict/List required for mypyc + Python 3.9
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,13 @@
1
+ import os
2
+
3
+ from setuptools import setup
4
+
5
+ try:
6
+ from mypyc.build import mypycify
7
+
8
+ opt_level = os.environ.get("MYPYC_OPT_LEVEL", "3")
9
+ ext_modules = mypycify(["hash_utils/_core.py"], opt_level=opt_level)
10
+ except ImportError:
11
+ ext_modules = []
12
+
13
+ setup(ext_modules=ext_modules)
@@ -0,0 +1,215 @@
1
+ from __future__ import annotations
2
+
3
+ from hash_utils import dict_hash, shape_hash
4
+
5
+
6
+ class TestDictHash:
7
+ """Tests for dict_hash — full content hashing."""
8
+
9
+ def test_deterministic(self):
10
+ d = {"a": 1, "b": "hello", "c": [1, 2, 3]}
11
+ assert dict_hash(d) == dict_hash(d)
12
+
13
+ def test_key_order_independent(self):
14
+ """Sorted keys means insertion order doesn't matter."""
15
+ d1 = {"a": 1, "b": 2}
16
+ d2 = {"b": 2, "a": 1}
17
+ assert dict_hash(d1) == dict_hash(d2)
18
+
19
+ def test_different_values_differ(self):
20
+ d1 = {"a": 1}
21
+ d2 = {"a": 2}
22
+ assert dict_hash(d1) != dict_hash(d2)
23
+
24
+ def test_different_keys_differ(self):
25
+ d1 = {"a": 1}
26
+ d2 = {"b": 1}
27
+ assert dict_hash(d1) != dict_hash(d2)
28
+
29
+ def test_nested_dicts(self):
30
+ d1 = {"a": {"b": {"c": 1}}}
31
+ d2 = {"a": {"b": {"c": 1}}}
32
+ assert dict_hash(d1) == dict_hash(d2)
33
+
34
+ def test_nested_dicts_differ(self):
35
+ d1 = {"a": {"b": {"c": 1}}}
36
+ d2 = {"a": {"b": {"c": 2}}}
37
+ assert dict_hash(d1) != dict_hash(d2)
38
+
39
+ def test_lists(self):
40
+ d1 = {"a": [1, 2, 3]}
41
+ d2 = {"a": [1, 2, 3]}
42
+ assert dict_hash(d1) == dict_hash(d2)
43
+
44
+ def test_list_order_matters(self):
45
+ d1 = {"a": [1, 2]}
46
+ d2 = {"a": [2, 1]}
47
+ assert dict_hash(d1) != dict_hash(d2)
48
+
49
+ def test_empty_dict(self):
50
+ assert dict_hash({}) == dict_hash({})
51
+
52
+ def test_none_values(self):
53
+ d1 = {"a": None}
54
+ d2 = {"a": None}
55
+ assert dict_hash(d1) == dict_hash(d2)
56
+
57
+ def test_none_vs_zero(self):
58
+ d1 = {"a": None}
59
+ d2 = {"a": 0}
60
+ assert dict_hash(d1) != dict_hash(d2)
61
+
62
+ def test_bool_values(self):
63
+ d1 = {"a": True}
64
+ d2 = {"a": False}
65
+ assert dict_hash(d1) != dict_hash(d2)
66
+
67
+ def test_float_values(self):
68
+ d1 = {"a": 1.5}
69
+ d2 = {"a": 1.5}
70
+ assert dict_hash(d1) == dict_hash(d2)
71
+
72
+ def test_mixed_types(self):
73
+ d = {
74
+ "str": "hello",
75
+ "int": 42,
76
+ "float": 3.14,
77
+ "bool": True,
78
+ "none": None,
79
+ "list": [1, "two", 3.0],
80
+ "dict": {"nested": True},
81
+ }
82
+ assert dict_hash(d) == dict_hash(d)
83
+
84
+ def test_string_length_matters(self):
85
+ """Strings with same hash prefix but different length should differ."""
86
+ d1 = {"a": "x"}
87
+ d2 = {"a": "xx"}
88
+ assert dict_hash(d1) != dict_hash(d2)
89
+
90
+ def test_returns_int(self):
91
+ assert isinstance(dict_hash({"a": 1}), int)
92
+
93
+
94
+ class TestShapeHash:
95
+ """Tests for shape_hash — structural hashing."""
96
+
97
+ def test_deterministic(self):
98
+ d = {"a": 1, "b": "hello"}
99
+ assert shape_hash(d) == shape_hash(d)
100
+
101
+ def test_ignores_string_values(self):
102
+ d1 = {"name": "alice", "city": "london"}
103
+ d2 = {"name": "bob", "city": "paris"}
104
+ assert shape_hash(d1) == shape_hash(d2)
105
+
106
+ def test_ignores_int_values(self):
107
+ d1 = {"count": 1, "total": 100}
108
+ d2 = {"count": 999, "total": 0}
109
+ assert shape_hash(d1) == shape_hash(d2)
110
+
111
+ def test_ignores_float_values(self):
112
+ d1 = {"score": 1.5}
113
+ d2 = {"score": 99.9}
114
+ assert shape_hash(d1) == shape_hash(d2)
115
+
116
+ def test_different_keys_differ(self):
117
+ d1 = {"a": 1}
118
+ d2 = {"b": 1}
119
+ assert shape_hash(d1) != shape_hash(d2)
120
+
121
+ def test_bool_values_matter(self):
122
+ d1 = {"enabled": True}
123
+ d2 = {"enabled": False}
124
+ assert shape_hash(d1) != shape_hash(d2)
125
+
126
+ def test_different_value_types_differ(self):
127
+ d1 = {"a": "string"}
128
+ d2 = {"a": 42}
129
+ assert shape_hash(d1) != shape_hash(d2)
130
+
131
+ def test_dict_length_matters(self):
132
+ d1 = {"a": 1, "b": 2}
133
+ d2 = {"a": 1}
134
+ assert shape_hash(d1) != shape_hash(d2)
135
+
136
+ def test_list_length_matters(self):
137
+ d1 = {"items": [1, 2, 3]}
138
+ d2 = {"items": [1]}
139
+ assert shape_hash(d1) != shape_hash(d2)
140
+
141
+ def test_nested_shape_same(self):
142
+ """Two dicts with identical structure but different leaf values."""
143
+ d1 = {
144
+ "name": "test_user",
145
+ "config": {"enabled": True, "severity": "ERROR", "tags": []},
146
+ }
147
+ d2 = {
148
+ "name": "other_user",
149
+ "config": {"enabled": True, "severity": "WARNING", "tags": []},
150
+ }
151
+ assert shape_hash(d1) == shape_hash(d2)
152
+
153
+ def test_nested_shape_differs(self):
154
+ """Different structure should produce different hashes."""
155
+ d1 = {"config": {"enabled": True}}
156
+ d2 = {"config": {"enabled": True, "extra": "field"}}
157
+ assert shape_hash(d1) != shape_hash(d2)
158
+
159
+ def test_empty_dict(self):
160
+ assert shape_hash({}) == shape_hash({})
161
+
162
+ def test_none_tagged(self):
163
+ d1 = {"a": None}
164
+ d2 = {"a": "hello"}
165
+ assert shape_hash(d1) != shape_hash(d2)
166
+
167
+ def test_key_order_independent(self):
168
+ d1 = {"a": 1, "b": 2}
169
+ d2 = {"b": 2, "a": 1}
170
+ assert shape_hash(d1) == shape_hash(d2)
171
+
172
+ def test_returns_int(self):
173
+ assert isinstance(shape_hash({"a": 1}), int)
174
+
175
+ def test_realistic_dedup(self):
176
+ """Simulate the jsonschema validation dedup use case."""
177
+ # Generate 100 dicts that differ only in string/int values
178
+ dicts = []
179
+ for i in range(100):
180
+ d = {
181
+ "name": f"test_node_{i}",
182
+ "unique_id": f"project.model.{i}",
183
+ "config": {
184
+ "enabled": True,
185
+ "severity": "ERROR",
186
+ "tags": [],
187
+ },
188
+ "columns": {},
189
+ }
190
+ dicts.append(d)
191
+
192
+ hashes = {shape_hash(d) for d in dicts}
193
+ assert len(hashes) == 1, f"Expected 1 unique shape, got {len(hashes)}"
194
+
195
+
196
+ class TestDictHashVsShapeHash:
197
+ """Verify the two functions behave differently where expected."""
198
+
199
+ def test_dict_hash_distinguishes_values(self):
200
+ d1 = {"a": "hello"}
201
+ d2 = {"a": "world"}
202
+ assert dict_hash(d1) != dict_hash(d2)
203
+ assert shape_hash(d1) == shape_hash(d2)
204
+
205
+ def test_both_distinguish_keys(self):
206
+ d1 = {"a": 1}
207
+ d2 = {"b": 1}
208
+ assert dict_hash(d1) != dict_hash(d2)
209
+ assert shape_hash(d1) != shape_hash(d2)
210
+
211
+ def test_both_distinguish_structure(self):
212
+ d1 = {"a": [1, 2]}
213
+ d2 = {"a": {"b": 1}}
214
+ assert dict_hash(d1) != dict_hash(d2)
215
+ assert shape_hash(d1) != shape_hash(d2)