fast-hash-utils 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fast_hash_utils-0.2.0/.github/workflows/ci.yml +36 -0
- fast_hash_utils-0.2.0/.github/workflows/release.yml +88 -0
- fast_hash_utils-0.2.0/.gitignore +11 -0
- fast_hash_utils-0.2.0/LICENSE +21 -0
- fast_hash_utils-0.2.0/Makefile +27 -0
- fast_hash_utils-0.2.0/PKG-INFO +87 -0
- fast_hash_utils-0.2.0/README.md +58 -0
- fast_hash_utils-0.2.0/benchmarks/bench.py +94 -0
- fast_hash_utils-0.2.0/fast_hash_utils.egg-info/PKG-INFO +87 -0
- fast_hash_utils-0.2.0/fast_hash_utils.egg-info/SOURCES.txt +17 -0
- fast_hash_utils-0.2.0/fast_hash_utils.egg-info/dependency_links.txt +1 -0
- fast_hash_utils-0.2.0/fast_hash_utils.egg-info/requires.txt +6 -0
- fast_hash_utils-0.2.0/fast_hash_utils.egg-info/top_level.txt +1 -0
- fast_hash_utils-0.2.0/hash_utils/__init__.py +8 -0
- fast_hash_utils-0.2.0/hash_utils/_core.py +120 -0
- fast_hash_utils-0.2.0/pyproject.toml +54 -0
- fast_hash_utils-0.2.0/setup.cfg +4 -0
- fast_hash_utils-0.2.0/setup.py +13 -0
- fast_hash_utils-0.2.0/tests/test_hash_utils.py +215 -0
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
lint:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
steps:
|
|
12
|
+
- uses: actions/checkout@v4
|
|
13
|
+
- uses: actions/setup-python@v5
|
|
14
|
+
with:
|
|
15
|
+
python-version: "3.13"
|
|
16
|
+
- run: pip install ruff
|
|
17
|
+
- run: ruff check .
|
|
18
|
+
- run: ruff format --check .
|
|
19
|
+
|
|
20
|
+
test:
|
|
21
|
+
needs: lint
|
|
22
|
+
strategy:
|
|
23
|
+
matrix:
|
|
24
|
+
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"]
|
|
25
|
+
os: [ubuntu-latest]
|
|
26
|
+
runs-on: ${{ matrix.os }}
|
|
27
|
+
steps:
|
|
28
|
+
- uses: actions/checkout@v4
|
|
29
|
+
with:
|
|
30
|
+
fetch-depth: 0
|
|
31
|
+
- uses: actions/setup-python@v5
|
|
32
|
+
with:
|
|
33
|
+
python-version: ${{ matrix.python-version }}
|
|
34
|
+
allow-prereleases: true
|
|
35
|
+
- run: pip install -e ".[dev]"
|
|
36
|
+
- run: pytest -v
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
name: Release
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags: ["v*"]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
sdist:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
steps:
|
|
11
|
+
- uses: actions/checkout@v4
|
|
12
|
+
with:
|
|
13
|
+
fetch-depth: 0
|
|
14
|
+
- uses: actions/setup-python@v5
|
|
15
|
+
with:
|
|
16
|
+
python-version: "3.13"
|
|
17
|
+
- run: pip install build
|
|
18
|
+
- run: python -m build --sdist
|
|
19
|
+
- uses: actions/upload-artifact@v4
|
|
20
|
+
with:
|
|
21
|
+
name: sdist
|
|
22
|
+
path: dist/*.tar.gz
|
|
23
|
+
|
|
24
|
+
wheels:
|
|
25
|
+
strategy:
|
|
26
|
+
matrix:
|
|
27
|
+
include:
|
|
28
|
+
- os: ubuntu-latest
|
|
29
|
+
arch: x86_64
|
|
30
|
+
skip: "pp* *-musllinux_*"
|
|
31
|
+
name: linux-x86_64-manylinux
|
|
32
|
+
- os: ubuntu-latest
|
|
33
|
+
arch: x86_64
|
|
34
|
+
skip: "pp* *-manylinux_*"
|
|
35
|
+
name: linux-x86_64-musllinux
|
|
36
|
+
- os: ubuntu-latest
|
|
37
|
+
arch: aarch64
|
|
38
|
+
skip: "pp* *-musllinux_*"
|
|
39
|
+
name: linux-aarch64-manylinux
|
|
40
|
+
qemu: true
|
|
41
|
+
- os: ubuntu-latest
|
|
42
|
+
arch: aarch64
|
|
43
|
+
skip: "pp* *-manylinux_*"
|
|
44
|
+
name: linux-aarch64-musllinux
|
|
45
|
+
qemu: true
|
|
46
|
+
- os: macos-latest
|
|
47
|
+
arch: x86_64
|
|
48
|
+
skip: "pp*"
|
|
49
|
+
name: macos-x86_64
|
|
50
|
+
- os: macos-latest
|
|
51
|
+
arch: arm64
|
|
52
|
+
skip: "pp*"
|
|
53
|
+
name: macos-arm64
|
|
54
|
+
- os: windows-latest
|
|
55
|
+
arch: AMD64
|
|
56
|
+
skip: "pp*"
|
|
57
|
+
name: windows-x86_64
|
|
58
|
+
runs-on: ${{ matrix.os }}
|
|
59
|
+
steps:
|
|
60
|
+
- uses: actions/checkout@v4
|
|
61
|
+
with:
|
|
62
|
+
fetch-depth: 0
|
|
63
|
+
- uses: docker/setup-qemu-action@v3
|
|
64
|
+
if: matrix.qemu
|
|
65
|
+
with:
|
|
66
|
+
platforms: arm64
|
|
67
|
+
- uses: pypa/cibuildwheel@v2.23
|
|
68
|
+
env:
|
|
69
|
+
CIBW_ARCHS: ${{ matrix.arch }}
|
|
70
|
+
CIBW_SKIP: ${{ matrix.skip }}
|
|
71
|
+
CIBW_BUILD: cp39-* cp310-* cp311-* cp312-* cp313-* cp314-*
|
|
72
|
+
- uses: actions/upload-artifact@v4
|
|
73
|
+
with:
|
|
74
|
+
name: wheels-${{ matrix.name }}
|
|
75
|
+
path: wheelhouse/*.whl
|
|
76
|
+
|
|
77
|
+
publish:
|
|
78
|
+
needs: [sdist, wheels]
|
|
79
|
+
runs-on: ubuntu-latest
|
|
80
|
+
permissions:
|
|
81
|
+
id-token: write
|
|
82
|
+
environment: pypi
|
|
83
|
+
steps:
|
|
84
|
+
- uses: actions/download-artifact@v4
|
|
85
|
+
with:
|
|
86
|
+
path: dist
|
|
87
|
+
merge-multiple: true
|
|
88
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Toby Mao
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
.PHONY: install install-release build test test-compiled bench style clean
|
|
2
|
+
|
|
3
|
+
install:
|
|
4
|
+
MYPYC_OPT_LEVEL=0 pip install -e ".[dev]"
|
|
5
|
+
|
|
6
|
+
install-release:
|
|
7
|
+
MYPYC_OPT_LEVEL=3 pip install -e ".[dev]"
|
|
8
|
+
|
|
9
|
+
build:
|
|
10
|
+
mypyc hash_utils/_core.py
|
|
11
|
+
|
|
12
|
+
test:
|
|
13
|
+
pytest -v
|
|
14
|
+
|
|
15
|
+
test-compiled: build test
|
|
16
|
+
|
|
17
|
+
bench:
|
|
18
|
+
python benchmarks/bench.py
|
|
19
|
+
|
|
20
|
+
style:
|
|
21
|
+
ruff check --fix .
|
|
22
|
+
ruff format .
|
|
23
|
+
|
|
24
|
+
clean:
|
|
25
|
+
rm -rf build/ dist/ *.egg-info .pytest_cache
|
|
26
|
+
rm -f hash_utils/*.so hash_utils/*.pyd
|
|
27
|
+
find . -type d -name __pycache__ -exec rm -rf {} +
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: fast-hash-utils
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Fast deterministic dict hashing via mypyc
|
|
5
|
+
Author: Toby Mao
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/tobymao/hash-utils
|
|
8
|
+
Project-URL: Repository, https://github.com/tobymao/hash-utils
|
|
9
|
+
Project-URL: Issues, https://github.com/tobymao/hash-utils/issues
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
19
|
+
Classifier: Typing :: Typed
|
|
20
|
+
Requires-Python: >=3.9
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: mypy-extensions>=1.0
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: mypy>=1.19; extra == "dev"
|
|
26
|
+
Requires-Dist: pytest>=8.4; extra == "dev"
|
|
27
|
+
Requires-Dist: ruff>=0.15; extra == "dev"
|
|
28
|
+
Dynamic: license-file
|
|
29
|
+
|
|
30
|
+
# hash-utils
|
|
31
|
+
|
|
32
|
+
Fast deterministic dict hashing via mypyc.
|
|
33
|
+
|
|
34
|
+
## Functions
|
|
35
|
+
|
|
36
|
+
- **`dict_hash(d)`** — deterministic hash of a nested dict's full content (keys + values)
|
|
37
|
+
- **`shape_hash(d)`** — structural hash that ignores string/int/float values, only hashing keys, value types, bools, and container lengths
|
|
38
|
+
|
|
39
|
+
## Install
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install fast-hash-utils
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Usage
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
from hash_utils import dict_hash, shape_hash
|
|
49
|
+
|
|
50
|
+
d1 = {"name": "alice", "config": {"enabled": True, "tags": []}}
|
|
51
|
+
d2 = {"name": "bob", "config": {"enabled": True, "tags": []}}
|
|
52
|
+
|
|
53
|
+
# Full content hash — different names produce different hashes
|
|
54
|
+
dict_hash(d1) != dict_hash(d2)
|
|
55
|
+
|
|
56
|
+
# Shape hash — same structure produces same hash
|
|
57
|
+
shape_hash(d1) == shape_hash(d2)
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Why
|
|
61
|
+
|
|
62
|
+
`shape_hash` enables massive deduplication for jsonschema validation. If 13,000 dicts share the same structure but differ only in string values, they collapse to 1 unique shape — skip 12,999 redundant validations.
|
|
63
|
+
|
|
64
|
+
## Performance
|
|
65
|
+
|
|
66
|
+
Compiled via mypyc to native C. ~400K ops/s for nested dicts on a single core.
|
|
67
|
+
|
|
68
|
+
| Method | ops/s | Deterministic |
|
|
69
|
+
|---|---|---|
|
|
70
|
+
| `shape_hash` (mypyc) | 445K | Yes |
|
|
71
|
+
| `dict_hash` (mypyc) | 405K | Yes |
|
|
72
|
+
| `hash(repr())` | 312K | No |
|
|
73
|
+
| `json.dumps + hash` | 206K | Yes |
|
|
74
|
+
|
|
75
|
+
## Development
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
python -m venv .venv && source .venv/bin/activate
|
|
79
|
+
make install # editable install with dev deps
|
|
80
|
+
make test # run tests (pure Python or compiled)
|
|
81
|
+
make style # ruff check --fix + ruff format
|
|
82
|
+
make clean # remove build artifacts
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## License
|
|
86
|
+
|
|
87
|
+
MIT
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# hash-utils
|
|
2
|
+
|
|
3
|
+
Fast deterministic dict hashing via mypyc.
|
|
4
|
+
|
|
5
|
+
## Functions
|
|
6
|
+
|
|
7
|
+
- **`dict_hash(d)`** — deterministic hash of a nested dict's full content (keys + values)
|
|
8
|
+
- **`shape_hash(d)`** — structural hash that ignores string/int/float values, only hashing keys, value types, bools, and container lengths
|
|
9
|
+
|
|
10
|
+
## Install
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
pip install fast-hash-utils
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## Usage
|
|
17
|
+
|
|
18
|
+
```python
|
|
19
|
+
from hash_utils import dict_hash, shape_hash
|
|
20
|
+
|
|
21
|
+
d1 = {"name": "alice", "config": {"enabled": True, "tags": []}}
|
|
22
|
+
d2 = {"name": "bob", "config": {"enabled": True, "tags": []}}
|
|
23
|
+
|
|
24
|
+
# Full content hash — different names produce different hashes
|
|
25
|
+
dict_hash(d1) != dict_hash(d2)
|
|
26
|
+
|
|
27
|
+
# Shape hash — same structure produces same hash
|
|
28
|
+
shape_hash(d1) == shape_hash(d2)
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Why
|
|
32
|
+
|
|
33
|
+
`shape_hash` enables massive deduplication for jsonschema validation. If 13,000 dicts share the same structure but differ only in string values, they collapse to 1 unique shape — skip 12,999 redundant validations.
|
|
34
|
+
|
|
35
|
+
## Performance
|
|
36
|
+
|
|
37
|
+
Compiled via mypyc to native C. ~400K ops/s for nested dicts on a single core.
|
|
38
|
+
|
|
39
|
+
| Method | ops/s | Deterministic |
|
|
40
|
+
|---|---|---|
|
|
41
|
+
| `shape_hash` (mypyc) | 445K | Yes |
|
|
42
|
+
| `dict_hash` (mypyc) | 405K | Yes |
|
|
43
|
+
| `hash(repr())` | 312K | No |
|
|
44
|
+
| `json.dumps + hash` | 206K | Yes |
|
|
45
|
+
|
|
46
|
+
## Development
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
python -m venv .venv && source .venv/bin/activate
|
|
50
|
+
make install # editable install with dev deps
|
|
51
|
+
make test # run tests (pure Python or compiled)
|
|
52
|
+
make style # ruff check --fix + ruff format
|
|
53
|
+
make clean # remove build artifacts
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## License
|
|
57
|
+
|
|
58
|
+
MIT
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""Benchmarks for dict_hash and shape_hash (pure Python vs mypyc)."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import time
|
|
5
|
+
|
|
6
|
+
N = 10_000
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def make_dicts(n: int) -> list[dict]:
    """Build ``n`` nested benchmark dicts that all share one structure.

    Only the ``name``, ``unique_id`` and ``description`` strings and the
    ``config.meta.priority`` integer (``index % 5``) vary with the index.
    """

    def _node(index: int) -> dict:
        # Fresh nested containers are created on every call, so the returned
        # dicts never share mutable state.
        config = {
            "enabled": True,
            "severity": "ERROR",
            "warn_if": "!= 0",
            "error_if": "> 10",
            "tags": ["ci", "nightly"],
            "meta": {"owner": "team-data", "priority": index % 5},
        }
        columns = {
            "id": {"type": "integer", "nullable": False},
            "name": {"type": "string", "nullable": True},
            "value": {"type": "float", "nullable": True},
        }
        return {
            "name": f"test_node_{index}",
            "unique_id": f"project.model.{index}",
            "description": f"A test node for benchmarking purposes #{index}",
            "config": config,
            "columns": columns,
        }

    return [_node(index) for index in range(n)]
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def bench(name: str, fn, dicts: list[dict]) -> float:
    """Time ``fn`` over every dict in ``dicts`` and print one report row.

    Args:
        name: Label printed in the first column of the row.
        fn: Callable invoked with one dict at a time; its result is discarded.
        dicts: Inputs to feed to ``fn``.

    Returns:
        Wall-clock seconds spent in the timed loop (warm-up excluded).
    """
    # Untimed warm-up pass over the first 100 inputs.
    for sample in dicts[:100]:
        fn(sample)

    began = time.perf_counter()
    for sample in dicts:
        fn(sample)
    finished = time.perf_counter()

    elapsed = finished - began
    ops = len(dicts) / elapsed
    print(f"{name:40s} {elapsed:.4f}s {ops:>10,.0f} ops/s")
    return elapsed
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def json_hash(d: dict) -> int:
    """Baseline hash: built-in ``hash`` of the dict's key-sorted JSON text."""
    canonical = json.dumps(d, sort_keys=True)
    return hash(canonical)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def main() -> None:
    """Run the benchmarks and print one throughput row per method.

    The pure-Python implementations are always measured: ``hash_utils/_core.py``
    is executed from source under the private module name ``_core_pure``. The
    functions exported by the installed ``hash_utils`` package are measured too,
    but only when its ``_core`` module was loaded from a ``.so``/``.pyd`` file.

    Raises:
        ImportError: If ``hash_utils`` cannot be imported, or no import spec
            can be created for the pure-Python source file.
    """
    import importlib.util
    import pathlib

    import hash_utils._core as core_mod

    # Compiled extension modules are loaded from .so/.pyd files, not .py.
    # ``__file__`` may be absent or None, hence the ``or ""`` fallback.
    mod_file = getattr(core_mod, "__file__", None) or ""
    is_compiled = mod_file.endswith((".so", ".pyd"))

    dicts = make_dicts(N)
    print(f"Benchmarking {N:,} nested dicts")
    print(f"mypyc compiled: {is_compiled}\n")
    print(f"{'Method':40s} {'Time':>7s} {'Throughput':>12s}")
    print("-" * 66)

    # Load the source file directly so the pure-Python code is measured even
    # when the installed package resolves to the compiled extension.
    repo_root = pathlib.Path(__file__).resolve().parent.parent
    core_py = repo_root / "hash_utils" / "_core.py"
    spec = importlib.util.spec_from_file_location("_core_pure", core_py)
    if spec is None or spec.loader is None:
        raise ImportError(f"cannot load pure-Python module from {core_py}")
    pure = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(pure)

    bench("dict_hash (pure python)", pure.dict_hash, dicts)
    bench("shape_hash (pure python)", pure.shape_hash, dicts)

    if is_compiled:
        from hash_utils import dict_hash, shape_hash

        bench("dict_hash (mypyc)", dict_hash, dicts)

    # Row order is kept as in the original report: the JSON baseline is
    # printed between the two compiled rows.
    bench("hash(json.dumps(sort_keys=True))", json_hash, dicts)

    if is_compiled:
        bench("shape_hash (mypyc)", shape_hash, dicts)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
if __name__ == "__main__":
|
|
94
|
+
main()
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: fast-hash-utils
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Fast deterministic dict hashing via mypyc
|
|
5
|
+
Author: Toby Mao
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/tobymao/hash-utils
|
|
8
|
+
Project-URL: Repository, https://github.com/tobymao/hash-utils
|
|
9
|
+
Project-URL: Issues, https://github.com/tobymao/hash-utils/issues
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
19
|
+
Classifier: Typing :: Typed
|
|
20
|
+
Requires-Python: >=3.9
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: mypy-extensions>=1.0
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: mypy>=1.19; extra == "dev"
|
|
26
|
+
Requires-Dist: pytest>=8.4; extra == "dev"
|
|
27
|
+
Requires-Dist: ruff>=0.15; extra == "dev"
|
|
28
|
+
Dynamic: license-file
|
|
29
|
+
|
|
30
|
+
# hash-utils
|
|
31
|
+
|
|
32
|
+
Fast deterministic dict hashing via mypyc.
|
|
33
|
+
|
|
34
|
+
## Functions
|
|
35
|
+
|
|
36
|
+
- **`dict_hash(d)`** — deterministic hash of a nested dict's full content (keys + values)
|
|
37
|
+
- **`shape_hash(d)`** — structural hash that ignores string/int/float values, only hashing keys, value types, bools, and container lengths
|
|
38
|
+
|
|
39
|
+
## Install
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install fast-hash-utils
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Usage
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
from hash_utils import dict_hash, shape_hash
|
|
49
|
+
|
|
50
|
+
d1 = {"name": "alice", "config": {"enabled": True, "tags": []}}
|
|
51
|
+
d2 = {"name": "bob", "config": {"enabled": True, "tags": []}}
|
|
52
|
+
|
|
53
|
+
# Full content hash — different names produce different hashes
|
|
54
|
+
dict_hash(d1) != dict_hash(d2)
|
|
55
|
+
|
|
56
|
+
# Shape hash — same structure produces same hash
|
|
57
|
+
shape_hash(d1) == shape_hash(d2)
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Why
|
|
61
|
+
|
|
62
|
+
`shape_hash` enables massive deduplication for jsonschema validation. If 13,000 dicts share the same structure but differ only in string values, they collapse to 1 unique shape — skip 12,999 redundant validations.
|
|
63
|
+
|
|
64
|
+
## Performance
|
|
65
|
+
|
|
66
|
+
Compiled via mypyc to native C. ~400K ops/s for nested dicts on a single core.
|
|
67
|
+
|
|
68
|
+
| Method | ops/s | Deterministic |
|
|
69
|
+
|---|---|---|
|
|
70
|
+
| `shape_hash` (mypyc) | 445K | Yes |
|
|
71
|
+
| `dict_hash` (mypyc) | 405K | Yes |
|
|
72
|
+
| `hash(repr())` | 312K | No |
|
|
73
|
+
| `json.dumps + hash` | 206K | Yes |
|
|
74
|
+
|
|
75
|
+
## Development
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
python -m venv .venv && source .venv/bin/activate
|
|
79
|
+
make install # editable install with dev deps
|
|
80
|
+
make test # run tests (pure Python or compiled)
|
|
81
|
+
make style # ruff check --fix + ruff format
|
|
82
|
+
make clean # remove build artifacts
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## License
|
|
86
|
+
|
|
87
|
+
MIT
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
.gitignore
|
|
2
|
+
LICENSE
|
|
3
|
+
Makefile
|
|
4
|
+
README.md
|
|
5
|
+
pyproject.toml
|
|
6
|
+
setup.py
|
|
7
|
+
.github/workflows/ci.yml
|
|
8
|
+
.github/workflows/release.yml
|
|
9
|
+
benchmarks/bench.py
|
|
10
|
+
fast_hash_utils.egg-info/PKG-INFO
|
|
11
|
+
fast_hash_utils.egg-info/SOURCES.txt
|
|
12
|
+
fast_hash_utils.egg-info/dependency_links.txt
|
|
13
|
+
fast_hash_utils.egg-info/requires.txt
|
|
14
|
+
fast_hash_utils.egg-info/top_level.txt
|
|
15
|
+
hash_utils/__init__.py
|
|
16
|
+
hash_utils/_core.py
|
|
17
|
+
tests/test_hash_utils.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
hash_utils
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
"""Fast deterministic dict hashing for deduplication.
|
|
2
|
+
|
|
3
|
+
Iteratively traverses a nested dict/list/scalar structure using an
|
|
4
|
+
explicit stack. Accumulates a native i64 hash via bit mixing.
|
|
5
|
+
Designed to be compiled with mypyc.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import Dict, List
|
|
9
|
+
|
|
10
|
+
from mypy_extensions import i64
|
|
11
|
+
|
|
12
|
+
# Type tags for shape hashing — distinct primes for good mixing
|
|
13
|
+
_TAG_NONE: i64 = 7
|
|
14
|
+
_TAG_BOOL_T: i64 = 11
|
|
15
|
+
_TAG_BOOL_F: i64 = 13
|
|
16
|
+
_TAG_INT: i64 = 17
|
|
17
|
+
_TAG_FLOAT: i64 = 19
|
|
18
|
+
_TAG_STR: i64 = 23
|
|
19
|
+
_TAG_DICT: i64 = 29
|
|
20
|
+
_TAG_LIST: i64 = 31
|
|
21
|
+
_TAG_OTHER: i64 = 37
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _hash_to_i64(obj: object) -> i64:
    """Return the built-in ``hash(obj)`` converted to a native ``i64``."""
    value: i64 = i64(hash(obj))
    return value
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _mix(h: i64, v: i64) -> i64:
    """Fold ``v`` into the running hash ``h`` and return the new state.

    Three steps: XOR ``v`` into the state, OR a 13-bit left shift together
    with bits 51 and up (masked to 13 bits), then multiply by ``0x5BD1E995``.
    NOTE(review): the shift/mask pair is presumably a 64-bit rotate-left by 13
    when ``i64`` is a native 64-bit integer (mypyc build) - confirm.
    """
    state: i64 = h ^ v
    # The mask keeps only the 13 low bits of the right-shifted value, which
    # discards any sign extension the shift may introduce.
    carried: i64 = (state >> 51) & 0x1FFF
    rotated: i64 = (state << 13) | carried
    return rotated * 0x5BD1E995
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def dict_hash(d: Dict[object, object]) -> int:
    """Return a hash of a nested dict's full content (keys and values).

    The structure is walked iteratively with an explicit stack (no recursion).
    Dict keys are visited in sorted order, so insertion order does not affect
    the result; list order does.

    Every value contributes a type tag before its payload, so values of
    different types never feed the same input sequence to ``_mix``. Without
    the tags ``None`` and ``7``, ``True`` and ``1``, and ``False`` and ``2``
    hashed identically.

    Scalars are hashed through ``_hash_to_i64``.
    NOTE(review): that helper is expected to wrap the built-in ``hash``, whose
    ``str`` results are salted per process unless ``PYTHONHASHSEED`` is fixed;
    results are presumably only comparable within one interpreter run -
    confirm before persisting them.

    Args:
        d: The dict to hash. The keys of every nested dict must be mutually
            sortable (annotated as ``str`` below).

    Returns:
        The accumulated hash as a Python ``int``.

    Raises:
        TypeError: If the keys of a dict cannot be sorted, or a value of an
            unhandled type is unhashable.
    """
    h: i64 = 0
    stack: List[object] = [d]

    while stack:
        item: object = stack.pop()

        if item is None:
            h = _mix(h, _TAG_NONE)
        elif isinstance(item, bool):
            # Checked before int because bool is a subclass of int.
            h = _mix(h, _TAG_BOOL_T if item else _TAG_BOOL_F)
        elif isinstance(item, int):
            h = _mix(h, _TAG_INT)
            if item == -1:
                # CPython's hash() never returns -1: hash(-1) == hash(-2).
                # Mix -1 itself so that -1 and -2 stay distinct.
                h = _mix(h, -1)
            else:
                h = _mix(h, _hash_to_i64(item))
        elif isinstance(item, float):
            h = _mix(h, _TAG_FLOAT)
            if item == -1.0:
                # Same reserved-value issue as for ints: hash(-1.0) == -2.
                h = _mix(h, -1)
            else:
                h = _mix(h, _hash_to_i64(item))
        elif isinstance(item, str):
            h = _mix(h, _TAG_STR)
            h = _mix(h, _hash_to_i64(item))
            h = _mix(h, i64(len(item)))
        elif isinstance(item, dict):
            h = _mix(h, _TAG_DICT)
            h = _mix(h, i64(len(item)))
            keys: List[str] = sorted(item)
            # Push pairs from the last key to the first, value before key, so
            # that pops yield key, value, key, value... in ascending key order.
            i: i64 = i64(len(keys)) - 1
            while i >= 0:
                k: str = keys[i]
                stack.append(item[k])
                stack.append(k)
                i -= 1
        elif isinstance(item, list):
            h = _mix(h, _TAG_LIST)
            h = _mix(h, i64(len(item)))
            # Push in reverse so elements are popped in list order.
            i = i64(len(item)) - 1
            while i >= 0:
                stack.append(item[i])
                i -= 1
        else:
            h = _mix(h, _TAG_OTHER)
            h = _mix(h, _hash_to_i64(item))

    return int(h)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def shape_hash(d: Dict[object, object]) -> int:
    """Return a hash of a nested dict's shape, ignoring most leaf values.

    What is hashed: dict keys, dict and list lengths, one type tag per value,
    and the value of bools (``True`` and ``False`` carry different tags). The
    content of ``str``, ``int`` and ``float`` values is ignored; values of any
    other type all map to ``_TAG_OTHER``. Dict keys are processed in sorted
    order, so insertion order does not matter.

    NOTE(review): intended for deduplicating inputs to jsonschema validation.
    Equal shape hashes presumably only imply equal validation results when the
    schema does not constrain str/int/float values (for example ``enum``,
    ``pattern`` or ``minimum``) - confirm against the schemas in use.

    Args:
        d: The dict to hash. The keys of every nested dict must be mutually
            sortable (annotated as ``str`` below).

    Returns:
        The accumulated hash as a Python ``int``.
    """
    acc: i64 = 0
    pending: List[object] = [d]

    while pending:
        node: object = pending.pop()

        if node is None:
            acc = _mix(acc, _TAG_NONE)
            continue

        # bool is tested before int because bool is a subclass of int.
        if isinstance(node, bool):
            if node:
                acc = _mix(acc, _TAG_BOOL_T)
            else:
                acc = _mix(acc, _TAG_BOOL_F)
            continue

        if isinstance(node, int):
            acc = _mix(acc, _TAG_INT)
            continue

        if isinstance(node, float):
            acc = _mix(acc, _TAG_FLOAT)
            continue

        if isinstance(node, str):
            acc = _mix(acc, _TAG_STR)
            continue

        if isinstance(node, dict):
            acc = _mix(acc, _TAG_DICT)
            acc = _mix(acc, i64(len(node)))
            names: List[str] = sorted(node)
            # Walk the sorted keys from last to first: each key is mixed in
            # immediately and its value is queued, so values are later popped
            # in ascending key order.
            pos: i64 = i64(len(names))
            while pos > 0:
                pos -= 1
                name: str = names[pos]
                acc = _mix(acc, _hash_to_i64(name))
                pending.append(node[name])
            continue

        if isinstance(node, list):
            acc = _mix(acc, _TAG_LIST)
            acc = _mix(acc, i64(len(node)))
            # Queue elements back to front so they are popped in list order.
            pos = i64(len(node))
            while pos > 0:
                pos -= 1
                pending.append(node[pos])
            continue

        acc = _mix(acc, _TAG_OTHER)

    return int(acc)
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "setuptools-scm>=8", "mypy>=1.0", "mypy-extensions>=1.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "fast-hash-utils"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
description = "Fast deterministic dict hashing via mypyc"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.9"
|
|
12
|
+
authors = [{name = "Toby Mao"}]
|
|
13
|
+
dependencies = ["mypy-extensions>=1.0"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 4 - Beta",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"Programming Language :: Python :: 3.9",
|
|
19
|
+
"Programming Language :: Python :: 3.10",
|
|
20
|
+
"Programming Language :: Python :: 3.11",
|
|
21
|
+
"Programming Language :: Python :: 3.12",
|
|
22
|
+
"Programming Language :: Python :: 3.13",
|
|
23
|
+
"Programming Language :: Python :: 3.14",
|
|
24
|
+
"Typing :: Typed",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[project.optional-dependencies]
|
|
28
|
+
dev = [
|
|
29
|
+
"mypy>=1.19",
|
|
30
|
+
"pytest>=8.4",
|
|
31
|
+
"ruff>=0.15",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
[project.urls]
|
|
35
|
+
Homepage = "https://github.com/tobymao/hash-utils"
|
|
36
|
+
Repository = "https://github.com/tobymao/hash-utils"
|
|
37
|
+
Issues = "https://github.com/tobymao/hash-utils/issues"
|
|
38
|
+
|
|
39
|
+
[tool.setuptools_scm]
|
|
40
|
+
|
|
41
|
+
[tool.setuptools.packages.find]
|
|
42
|
+
include = ["hash_utils*"]
|
|
43
|
+
|
|
44
|
+
[tool.pytest.ini_options]
|
|
45
|
+
testpaths = ["tests"]
|
|
46
|
+
|
|
47
|
+
[tool.ruff]
|
|
48
|
+
target-version = "py39"
|
|
49
|
+
|
|
50
|
+
[tool.ruff.lint]
|
|
51
|
+
select = ["E", "F", "I", "UP"]
|
|
52
|
+
|
|
53
|
+
[tool.ruff.lint.per-file-ignores]
|
|
54
|
+
"hash_utils/_core.py" = ["UP006", "UP035"] # typing.Dict/List required for mypyc + Python 3.9
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Build script: compile ``hash_utils/_core.py`` with mypyc when available."""

import os

from setuptools import setup

# Only the import is guarded. An ImportError raised while mypycify() runs is a
# real build failure and must not silently produce a pure-Python build.
try:
    from mypyc.build import mypycify
except ImportError:
    # mypyc is not installed: ship the pure-Python module without extensions.
    ext_modules = []
else:
    # MYPYC_OPT_LEVEL is forwarded to mypycify as ``opt_level`` (default "3").
    opt_level = os.environ.get("MYPYC_OPT_LEVEL", "3")
    ext_modules = mypycify(["hash_utils/_core.py"], opt_level=opt_level)

setup(ext_modules=ext_modules)
|
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from hash_utils import dict_hash, shape_hash
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class TestDictHash:
    """Behavioural checks for ``dict_hash`` (hashes keys and values)."""

    def test_deterministic(self):
        """Hashing the same object twice gives the same result."""
        payload = {"a": 1, "b": "hello", "c": [1, 2, 3]}
        first = dict_hash(payload)
        second = dict_hash(payload)
        assert first == second

    def test_key_order_independent(self):
        """Keys are sorted, so insertion order does not matter."""
        forward = {"a": 1, "b": 2}
        backward = {"b": 2, "a": 1}
        assert dict_hash(forward) == dict_hash(backward)

    def test_different_values_differ(self):
        """Same key, different int value."""
        assert dict_hash({"a": 1}) != dict_hash({"a": 2})

    def test_different_keys_differ(self):
        """Same value, different key."""
        assert dict_hash({"a": 1}) != dict_hash({"b": 1})

    def test_nested_dicts(self):
        """Equal nested dicts built separately hash equally."""
        left = {"a": {"b": {"c": 1}}}
        right = {"a": {"b": {"c": 1}}}
        assert dict_hash(left) == dict_hash(right)

    def test_nested_dicts_differ(self):
        """A differing leaf deep in the structure changes the hash."""
        left = {"a": {"b": {"c": 1}}}
        right = {"a": {"b": {"c": 2}}}
        assert dict_hash(left) != dict_hash(right)

    def test_lists(self):
        """Equal lists built separately hash equally."""
        left = {"a": [1, 2, 3]}
        right = {"a": [1, 2, 3]}
        assert dict_hash(left) == dict_hash(right)

    def test_list_order_matters(self):
        """Lists are ordered: swapping elements changes the hash."""
        assert dict_hash({"a": [1, 2]}) != dict_hash({"a": [2, 1]})

    def test_empty_dict(self):
        """Two empty dicts hash equally."""
        assert dict_hash({}) == dict_hash({})

    def test_none_values(self):
        """None values are accepted and hash consistently."""
        left = {"a": None}
        right = {"a": None}
        assert dict_hash(left) == dict_hash(right)

    def test_none_vs_zero(self):
        """None is not confused with 0."""
        assert dict_hash({"a": None}) != dict_hash({"a": 0})

    def test_bool_values(self):
        """True and False hash differently."""
        assert dict_hash({"a": True}) != dict_hash({"a": False})

    def test_float_values(self):
        """Equal floats hash equally."""
        left = {"a": 1.5}
        right = {"a": 1.5}
        assert dict_hash(left) == dict_hash(right)

    def test_mixed_types(self):
        """A dict mixing every supported value type hashes consistently."""
        payload = {
            "str": "hello",
            "int": 42,
            "float": 3.14,
            "bool": True,
            "none": None,
            "list": [1, "two", 3.0],
            "dict": {"nested": True},
        }
        assert dict_hash(payload) == dict_hash(payload)

    def test_string_length_matters(self):
        """Strings of different length should hash differently."""
        assert dict_hash({"a": "x"}) != dict_hash({"a": "xx"})

    def test_returns_int(self):
        """The result is a plain Python int."""
        result = dict_hash({"a": 1})
        assert isinstance(result, int)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class TestShapeHash:
    """Behavioural checks for ``shape_hash`` (hashes structure only)."""

    def test_deterministic(self):
        """Hashing the same object twice gives the same result."""
        payload = {"a": 1, "b": "hello"}
        first = shape_hash(payload)
        second = shape_hash(payload)
        assert first == second

    def test_ignores_string_values(self):
        """String content does not influence the shape."""
        left = {"name": "alice", "city": "london"}
        right = {"name": "bob", "city": "paris"}
        assert shape_hash(left) == shape_hash(right)

    def test_ignores_int_values(self):
        """Int content does not influence the shape."""
        left = {"count": 1, "total": 100}
        right = {"count": 999, "total": 0}
        assert shape_hash(left) == shape_hash(right)

    def test_ignores_float_values(self):
        """Float content does not influence the shape."""
        assert shape_hash({"score": 1.5}) == shape_hash({"score": 99.9})

    def test_different_keys_differ(self):
        """Keys are part of the shape."""
        assert shape_hash({"a": 1}) != shape_hash({"b": 1})

    def test_bool_values_matter(self):
        """Unlike other scalars, bool values are part of the shape."""
        assert shape_hash({"enabled": True}) != shape_hash({"enabled": False})

    def test_different_value_types_differ(self):
        """A str value and an int value under the same key differ."""
        assert shape_hash({"a": "string"}) != shape_hash({"a": 42})

    def test_dict_length_matters(self):
        """Adding a key changes the shape."""
        bigger = {"a": 1, "b": 2}
        smaller = {"a": 1}
        assert shape_hash(bigger) != shape_hash(smaller)

    def test_list_length_matters(self):
        """List length is part of the shape."""
        longer = {"items": [1, 2, 3]}
        shorter = {"items": [1]}
        assert shape_hash(longer) != shape_hash(shorter)

    def test_nested_shape_same(self):
        """Two dicts with identical structure but different leaf values."""
        left = {
            "name": "test_user",
            "config": {"enabled": True, "severity": "ERROR", "tags": []},
        }
        right = {
            "name": "other_user",
            "config": {"enabled": True, "severity": "WARNING", "tags": []},
        }
        assert shape_hash(left) == shape_hash(right)

    def test_nested_shape_differs(self):
        """Different structure should produce different hashes."""
        base = {"config": {"enabled": True}}
        extended = {"config": {"enabled": True, "extra": "field"}}
        assert shape_hash(base) != shape_hash(extended)

    def test_empty_dict(self):
        """Two empty dicts hash equally."""
        assert shape_hash({}) == shape_hash({})

    def test_none_tagged(self):
        """None has its own tag and differs from a string value."""
        assert shape_hash({"a": None}) != shape_hash({"a": "hello"})

    def test_key_order_independent(self):
        """Keys are sorted, so insertion order does not matter."""
        forward = {"a": 1, "b": 2}
        backward = {"b": 2, "a": 1}
        assert shape_hash(forward) == shape_hash(backward)

    def test_returns_int(self):
        """The result is a plain Python int."""
        result = shape_hash({"a": 1})
        assert isinstance(result, int)

    def test_realistic_dedup(self):
        """Simulate the jsonschema validation dedup use case."""
        # 100 dicts that differ only in the content of two string values.
        dicts = [
            {
                "name": f"test_node_{i}",
                "unique_id": f"project.model.{i}",
                "config": {
                    "enabled": True,
                    "severity": "ERROR",
                    "tags": [],
                },
                "columns": {},
            }
            for i in range(100)
        ]

        hashes = {shape_hash(entry) for entry in dicts}
        assert len(hashes) == 1, f"Expected 1 unique shape, got {len(hashes)}"
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
class TestDictHashVsShapeHash:
    """Check where ``dict_hash`` and ``shape_hash`` agree and where they differ."""

    def test_dict_hash_distinguishes_values(self):
        """Only dict_hash reacts to a changed string value."""
        left = {"a": "hello"}
        right = {"a": "world"}
        assert dict_hash(left) != dict_hash(right)
        assert shape_hash(left) == shape_hash(right)

    def test_both_distinguish_keys(self):
        """Both functions react to a changed key."""
        left = {"a": 1}
        right = {"b": 1}
        assert dict_hash(left) != dict_hash(right)
        assert shape_hash(left) != shape_hash(right)

    def test_both_distinguish_structure(self):
        """Both functions tell a list value from a dict value."""
        as_list = {"a": [1, 2]}
        as_dict = {"a": {"b": 1}}
        assert dict_hash(as_list) != dict_hash(as_dict)
        assert shape_hash(as_list) != shape_hash(as_dict)
|