individu8 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- individu8-0.1.0/.github/ISSUE_TEMPLATE/bug_report.md +35 -0
- individu8-0.1.0/.github/ISSUE_TEMPLATE/config.yml +1 -0
- individu8-0.1.0/.github/ISSUE_TEMPLATE/feature_request.md +19 -0
- individu8-0.1.0/.github/workflows/ci.yml +36 -0
- individu8-0.1.0/.github/workflows/dependabot.yml +7 -0
- individu8-0.1.0/.github/workflows/publish.yml +40 -0
- individu8-0.1.0/.gitignore +38 -0
- individu8-0.1.0/.pre-commit-config.yaml +14 -0
- individu8-0.1.0/CHANGELOG.md +21 -0
- individu8-0.1.0/LICENSE +21 -0
- individu8-0.1.0/PKG-INFO +246 -0
- individu8-0.1.0/README.md +198 -0
- individu8-0.1.0/pyproject.toml +120 -0
- individu8-0.1.0/src/individu8/__init__.py +6 -0
- individu8-0.1.0/src/individu8/core.py +529 -0
- individu8-0.1.0/tests/test_individu8.py +498 -0
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: Bug report
|
|
3
|
+
about: Something isn't working as expected
|
|
4
|
+
title: '[BUG] '
|
|
5
|
+
labels: bug
|
|
6
|
+
assignees: ''
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## Environment
|
|
10
|
+
|
|
11
|
+
- OS: <!-- e.g. macOS 15.2, Ubuntu 24.04, Windows 11 -->
|
|
12
|
+
- Python version: <!-- e.g. 3.12.3 -->
|
|
13
|
+
- individu8 version: <!-- e.g. 0.1.0 — run: python -c "import individu8; print(individu8.__version__)" -->
|
|
14
|
+
- json_backend: <!-- orjson (default) or stdlib -->
|
|
15
|
+
|
|
16
|
+
## Steps to reproduce
|
|
17
|
+
|
|
18
|
+
```python
|
|
19
|
+
# minimal example that reproduces the issue
|
|
20
|
+
from individu8 import individu8
|
|
21
|
+
|
|
22
|
+
individu8(...)
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Expected result
|
|
26
|
+
|
|
27
|
+
<!-- what you expected to happen -->
|
|
28
|
+
|
|
29
|
+
## Actual result
|
|
30
|
+
|
|
31
|
+
<!-- what actually happened — include the full error message and traceback if applicable -->
|
|
32
|
+
|
|
33
|
+
## Additional context
|
|
34
|
+
|
|
35
|
+
<!-- anything else that might be relevant -->
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
blank_issues_enabled: false
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: Feature request
|
|
3
|
+
about: Suggest an idea or improvement
|
|
4
|
+
title: '[FEAT] '
|
|
5
|
+
labels: enhancement
|
|
6
|
+
assignees: ''
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## Problem
|
|
10
|
+
|
|
11
|
+
<!-- what problem would this solve? -->
|
|
12
|
+
|
|
13
|
+
## Proposed solution
|
|
14
|
+
|
|
15
|
+
<!-- how would you like it to work? -->
|
|
16
|
+
|
|
17
|
+
## Alternatives considered
|
|
18
|
+
|
|
19
|
+
<!-- any other approaches you thought of? -->
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
#.github/workflows/ci.yml
|
|
2
|
+
name: CI
|
|
3
|
+
|
|
4
|
+
on:
|
|
5
|
+
push:
|
|
6
|
+
branches: [main]
|
|
7
|
+
pull_request:
|
|
8
|
+
branches: [main]
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
test:
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
strategy:
|
|
14
|
+
matrix:
|
|
15
|
+
python-version: ["3.10", "3.11", "3.12", "3.13"]
|
|
16
|
+
|
|
17
|
+
steps:
|
|
18
|
+
- uses: actions/checkout@v4
|
|
19
|
+
|
|
20
|
+
- name: Install uv
|
|
21
|
+
uses: astral-sh/setup-uv@v4
|
|
22
|
+
|
|
23
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
24
|
+
run: uv python install ${{ matrix.python-version }}
|
|
25
|
+
|
|
26
|
+
- name: Install dependencies
|
|
27
|
+
run: uv sync --all-extras
|
|
28
|
+
|
|
29
|
+
- name: Lint
|
|
30
|
+
run: uv run ruff check src/ tests/
|
|
31
|
+
|
|
32
|
+
- name: Test
|
|
33
|
+
run: uv run pytest --tb=short -q
|
|
34
|
+
|
|
35
|
+
- name: Security audit
|
|
36
|
+
run: uv run pip-audit
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
build:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
steps:
|
|
11
|
+
- uses: actions/checkout@v4
|
|
12
|
+
|
|
13
|
+
- name: Install uv
|
|
14
|
+
uses: astral-sh/setup-uv@v4
|
|
15
|
+
|
|
16
|
+
- name: Build
|
|
17
|
+
run: uv build
|
|
18
|
+
|
|
19
|
+
- name: Upload dist
|
|
20
|
+
uses: actions/upload-artifact@v4
|
|
21
|
+
with:
|
|
22
|
+
name: dist
|
|
23
|
+
path: dist/
|
|
24
|
+
|
|
25
|
+
publish:
|
|
26
|
+
needs: build
|
|
27
|
+
runs-on: ubuntu-latest
|
|
28
|
+
environment: pypi
|
|
29
|
+
permissions:
|
|
30
|
+
id-token: write # required for OIDC trusted publishing
|
|
31
|
+
|
|
32
|
+
steps:
|
|
33
|
+
- name: Download dist
|
|
34
|
+
uses: actions/download-artifact@v4
|
|
35
|
+
with:
|
|
36
|
+
name: dist
|
|
37
|
+
path: dist/
|
|
38
|
+
|
|
39
|
+
- name: Publish to PyPI
|
|
40
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.pyo
|
|
5
|
+
*.pyd
|
|
6
|
+
.Python
|
|
7
|
+
*.egg-info/
|
|
8
|
+
dist/
|
|
9
|
+
build/
|
|
10
|
+
.eggs/
|
|
11
|
+
|
|
12
|
+
# uv / pip
|
|
13
|
+
.venv/
|
|
14
|
+
venv/
|
|
15
|
+
*.lock
|
|
16
|
+
|
|
17
|
+
# Testing
|
|
18
|
+
.pytest_cache/
|
|
19
|
+
.coverage
|
|
20
|
+
htmlcov/
|
|
21
|
+
.tox/
|
|
22
|
+
|
|
23
|
+
# Ruff
|
|
24
|
+
.ruff_cache/
|
|
25
|
+
|
|
26
|
+
# IDE
|
|
27
|
+
.vscode/
|
|
28
|
+
.idea/
|
|
29
|
+
*.swp
|
|
30
|
+
*.swo
|
|
31
|
+
|
|
32
|
+
# OS
|
|
33
|
+
.DS_Store
|
|
34
|
+
Thumbs.db
|
|
35
|
+
|
|
36
|
+
# Build artifacts
|
|
37
|
+
*.whl
|
|
38
|
+
*.tar.gz
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# .pre-commit-config.yaml
|
|
2
|
+
repos:
|
|
3
|
+
- repo: local
|
|
4
|
+
hooks:
|
|
5
|
+
- id: ruff
|
|
6
|
+
name: ruff
|
|
7
|
+
entry: uv run ruff check src/ tests/
|
|
8
|
+
language: system
|
|
9
|
+
pass_filenames: false
|
|
10
|
+
- id: pytest
|
|
11
|
+
name: pytest
|
|
12
|
+
entry: uv run pytest --tb=short -q
|
|
13
|
+
language: system
|
|
14
|
+
pass_filenames: false
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
## [0.1.0] - 2025-06-21
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
- Initial release
|
|
14
|
+
- `individu8()` function for deterministic hashing of dicts, lists, and JSON/YAML strings
|
|
15
|
+
- Filter pipeline: `exclude_all_keys_starting_with`, `exclude_all_keys_ending_with`,
|
|
16
|
+
`exclude_all_keys_containing`, `exclude`, `include`
|
|
17
|
+
- Jsonpath support in `exclude` and `include` via jsonpath-ng
|
|
18
|
+
- Hash algorithms: `blake2b` (default), `blake2s`, `sha256`, `md5`, `shake128`
|
|
19
|
+
- JSON backends: `orjson` (default, fast), `stdlib` (compatibility)
|
|
20
|
+
- Output formats: `same_as_input` (default), `python`, `json`, `yaml`
|
|
21
|
+
- YAML string input support via pyyaml
|
individu8-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Niklas Sköldmark
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
individu8-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: individu8
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Deterministic hashing of Python dicts, lists, and JSON/YAML strings
|
|
5
|
+
Project-URL: Homepage, https://github.com/niklasskoldmark/individu8
|
|
6
|
+
Project-URL: Repository, https://github.com/niklasskoldmark/individu8
|
|
7
|
+
Project-URL: Issues, https://github.com/niklasskoldmark/individu8/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/niklasskoldmark/individu8/blob/main/CHANGELOG.md
|
|
9
|
+
Author: Niklas Sköldmark
|
|
10
|
+
License: MIT License
|
|
11
|
+
|
|
12
|
+
Copyright (c) 2026 Niklas Sköldmark
|
|
13
|
+
|
|
14
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
15
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
16
|
+
in the Software without restriction, including without limitation the rights
|
|
17
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
18
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
19
|
+
furnished to do so, subject to the following conditions:
|
|
20
|
+
|
|
21
|
+
The above copyright notice and this permission notice shall be included in all
|
|
22
|
+
copies or substantial portions of the Software.
|
|
23
|
+
|
|
24
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
25
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
26
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
27
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
28
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
29
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
30
|
+
SOFTWARE.
|
|
31
|
+
License-File: LICENSE
|
|
32
|
+
Keywords: blake2,content-hash,deduplication,deterministic,etl,fingerprint,jsonpath,uuid-v8
|
|
33
|
+
Classifier: Development Status :: 4 - Beta
|
|
34
|
+
Classifier: Intended Audience :: Developers
|
|
35
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
36
|
+
Classifier: Programming Language :: Python :: 3
|
|
37
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
38
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
39
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
40
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
41
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
42
|
+
Classifier: Topic :: Utilities
|
|
43
|
+
Requires-Python: >=3.10
|
|
44
|
+
Requires-Dist: jsonpath-ng>=1.6
|
|
45
|
+
Requires-Dist: orjson>=3.9
|
|
46
|
+
Requires-Dist: pyyaml>=6.0
|
|
47
|
+
Description-Content-Type: text/markdown
|
|
48
|
+
|
|
49
|
+
# individu8
|
|
50
|
+
|
|
51
|
+
Deterministic hashing of Python dicts, lists, and JSON/YAML strings.
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
from individu8 import individu8
|
|
55
|
+
|
|
56
|
+
individu8({"id": 1, "name": "Alice"})
|
|
57
|
+
# "FszF+jYmhYS17K"
|
|
58
|
+
|
|
59
|
+
individu8([{"id": 1}, {"id": 2}])
|
|
60
|
+
# ["GICezwtC7+vhEA", "DxgzKROIe5u0sQ"]
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Why
|
|
64
|
+
|
|
65
|
+
- **Stable across runs** — same data always produces the same hash, regardless of key insertion order
|
|
66
|
+
- **Flexible filtering** — exclude volatile fields (timestamps, system columns) before hashing
|
|
67
|
+
- **Multiple input formats** — pass a dict, list, JSON string, or YAML string
|
|
68
|
+
- **Multiple output formats** — base64 (default), hex, or UUID v8
|
|
69
|
+
- **Multiple hash algorithms** — blake2b (default), blake2s, sha256, md5, shake128
|
|
70
|
+
- **Predictable length** — `hash_length` controls the exact number of characters returned
|
|
71
|
+
- **Fast** — uses orjson for serialisation by default (5-10x faster than stdlib json)
|
|
72
|
+
|
|
73
|
+
## Install
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
pip install individu8
|
|
77
|
+
# or
|
|
78
|
+
uv add individu8
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## Usage
|
|
82
|
+
|
|
83
|
+
### Basic
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
from individu8 import individu8
|
|
87
|
+
|
|
88
|
+
# hash a dict — returns exactly hash_length characters (default 14)
|
|
89
|
+
individu8({"id": 1, "name": "Alice"})
|
|
90
|
+
# "FszF+jYmhYS17K"
|
|
91
|
+
|
|
92
|
+
# hash a list — returns a list of hashes in the same order
|
|
93
|
+
individu8([{"id": 1}, {"id": 2}])
|
|
94
|
+
# ["GICezwtC7+vhEA", "DxgzKROIe5u0sQ"]
|
|
95
|
+
|
|
96
|
+
# hash a JSON string — identical to dict input
|
|
97
|
+
individu8('{"id": 1, "name": "Alice"}')
|
|
98
|
+
# "FszF+jYmhYS17K"
|
|
99
|
+
|
|
100
|
+
# hash a YAML string — identical to dict input
|
|
101
|
+
individu8("id: 1\nname: Alice")
|
|
102
|
+
# "FszF+jYmhYS17K"
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
### Filtering
|
|
106
|
+
|
|
107
|
+
Filtering is applied as a pipeline in this order:
|
|
108
|
+
|
|
109
|
+
1. `exclude_all_keys_starting_with` / `exclude_all_keys_ending_with` / `exclude_all_keys_containing` — removes matching keys recursively at any depth
|
|
110
|
+
2. `exclude` — removes specific keys or jsonpath paths from the full document
|
|
111
|
+
3. `include` — narrows to only the specified keys or paths
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
# exclude system/metadata keys at any depth
|
|
115
|
+
individu8(data, exclude_all_keys_starting_with=["_meta", "_dlt"])
|
|
116
|
+
individu8(data, exclude_all_keys_ending_with=["_at", "_id"])
|
|
117
|
+
individu8(data, exclude_all_keys_containing=["temp"])
|
|
118
|
+
|
|
119
|
+
# exclude specific top-level keys
|
|
120
|
+
individu8(data, exclude=["updated_at", "created_at"])
|
|
121
|
+
|
|
122
|
+
# exclude a nested key using jsonpath — only removes "code" inside this specific path,
|
|
123
|
+
# not other keys named "code" elsewhere in the document
|
|
124
|
+
individu8(data, exclude=["person.address[*].street.number.code"])
|
|
125
|
+
|
|
126
|
+
# hash only specific fields
|
|
127
|
+
individu8(data, include=["id", "name"])
|
|
128
|
+
|
|
129
|
+
# combine: hash only street data, excluding volatile subfields
|
|
130
|
+
individu8(
|
|
131
|
+
data,
|
|
132
|
+
exclude=["person.address[*].street.updated_at"],
|
|
133
|
+
include=["person.address[*].street"],
|
|
134
|
+
)
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
### Hash length
|
|
138
|
+
|
|
139
|
+
`hash_length` controls the exact number of characters in the returned string.
|
|
140
|
+
The library computes as many bytes as needed internally and truncates to exactly
|
|
141
|
+
`hash_length` characters — you always get exactly what you asked for.
|
|
142
|
+
|
|
143
|
+
```python
|
|
144
|
+
individu8(data, hash_length=8) # 8 chars
|
|
145
|
+
individu8(data, hash_length=14) # 14 chars (default)
|
|
146
|
+
individu8(data, hash_length=20) # 20 chars
|
|
147
|
+
individu8(data, hash_length=32) # 32 chars
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
For clean alignment with base64 encoding use multiples of 4 (12, 16, 20, 24) —
|
|
151
|
+
other values work fine, but the last 1–3 encoded characters (and the entropy they carry) are discarded by
|
|
152
|
+
truncation.
|
|
153
|
+
|
|
154
|
+
For expert control over digest size in bytes, use `hash_bytes` instead:
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
individu8(data, hash_bytes=16) # 16-byte digest → 22 base64 chars
|
|
158
|
+
individu8(data, hash_bytes=32) # 32-byte digest → 43 base64 chars
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
### Hash format
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
# base64 (default) — compact ASCII, no padding; not URL-safe (output may contain "+")
|
|
165
|
+
individu8(data)
|
|
166
|
+
# "FszF+jYmhYS17K"
|
|
167
|
+
|
|
168
|
+
# hex — lowercase hexadecimal, hash_bytes * 2 characters
|
|
169
|
+
individu8(data, hash_format="hex")
|
|
170
|
+
# "19ccbe8d3d2f6b"
|
|
171
|
+
|
|
172
|
+
# UUID v8 — deterministic, RFC 9562 compliant, always 36 characters
|
|
173
|
+
# 122 bits of entropy (128 minus 6 version/variant bits)
|
|
174
|
+
# accepted anywhere a UUID is expected
|
|
175
|
+
individu8(data, hash_format="uuid")
|
|
176
|
+
# "6b5dc393-b52a-8ea0-a003-4f6986d2db77"
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
UUID v8 is the RFC 9562 "custom" version — designed for exactly this use case
|
|
180
|
+
where you want deterministic, content-based UUIDs using your own hash algorithm.
|
|
181
|
+
|
|
182
|
+
### Hash algorithms
|
|
183
|
+
|
|
184
|
+
```python
|
|
185
|
+
individu8(data, hash_algorithm="blake2b") # default — fast, modern, recommended
|
|
186
|
+
individu8(data, hash_algorithm="blake2s") # 32-bit optimised, max hash_bytes=32
|
|
187
|
+
individu8(data, hash_algorithm="sha256") # widely compatible; used by AWS SigV4 and git's optional SHA-256 object format
|
|
188
|
+
individu8(data, hash_algorithm="md5") # legacy, used for ETags/checksums
|
|
189
|
+
individu8(data, hash_algorithm="shake128") # SHA-3, matches dlt _dlt_id format
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
### Output structure
|
|
193
|
+
|
|
194
|
+
Controls what wraps the hash string(s) — separate from `hash_format` which
|
|
195
|
+
controls how each individual hash is encoded.
|
|
196
|
+
|
|
197
|
+
```python
|
|
198
|
+
individu8(data, output="same_as_input") # default — matches input type
|
|
199
|
+
individu8(data, output="python") # always str or list[str]
|
|
200
|
+
individu8(data, output="json") # always a JSON string
|
|
201
|
+
individu8(data, output="yaml") # always a YAML string
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
### JSON backend
|
|
205
|
+
|
|
206
|
+
```python
|
|
207
|
+
individu8(data, json_backend="orjson") # default — 5-10x faster
|
|
208
|
+
individu8(data, json_backend="stdlib") # stdlib json (orjson is still a required install dependency)
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
Both backends produce identical hashes for the same input.
|
|
212
|
+
|
|
213
|
+
## Supported input types
|
|
214
|
+
|
|
215
|
+
The following Python types are handled automatically when they appear as values
|
|
216
|
+
in the data being hashed:
|
|
217
|
+
|
|
218
|
+
| Type | Serialised as |
|
|
219
|
+
|---|---|
|
|
220
|
+
| `Decimal` | string (avoids float precision loss) |
|
|
221
|
+
| `datetime` / `date` / `time` | ISO 8601 string |
|
|
222
|
+
| `UUID` | string |
|
|
223
|
+
| `bytes` | base64 string |
|
|
224
|
+
| `namedtuple` | dict via `_asdict()` |
|
|
225
|
+
| `@dataclass` | dict via `dataclasses.asdict()` |
|
|
226
|
+
| `Enum` | `.value` |
|
|
227
|
+
| pydantic `BaseModel` | dict via `.model_dump()` |
|
|
228
|
+
| objects with `.asdict()` | dict via `.asdict()` |
|
|
229
|
+
|
|
230
|
+
## Development
|
|
231
|
+
|
|
232
|
+
```bash
|
|
233
|
+
git clone https://github.com/niklasskoldmark/individu8
|
|
234
|
+
cd individu8
|
|
235
|
+
uv sync
|
|
236
|
+
uv run pytest
|
|
237
|
+
uv run ruff check src/ tests/
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
## Changelog
|
|
241
|
+
|
|
242
|
+
See [CHANGELOG.md](CHANGELOG.md).
|
|
243
|
+
|
|
244
|
+
## License
|
|
245
|
+
|
|
246
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
# individu8
|
|
2
|
+
|
|
3
|
+
Deterministic hashing of Python dicts, lists, and JSON/YAML strings.
|
|
4
|
+
|
|
5
|
+
```python
|
|
6
|
+
from individu8 import individu8
|
|
7
|
+
|
|
8
|
+
individu8({"id": 1, "name": "Alice"})
|
|
9
|
+
# "FszF+jYmhYS17K"
|
|
10
|
+
|
|
11
|
+
individu8([{"id": 1}, {"id": 2}])
|
|
12
|
+
# ["GICezwtC7+vhEA", "DxgzKROIe5u0sQ"]
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## Why
|
|
16
|
+
|
|
17
|
+
- **Stable across runs** — same data always produces the same hash, regardless of key insertion order
|
|
18
|
+
- **Flexible filtering** — exclude volatile fields (timestamps, system columns) before hashing
|
|
19
|
+
- **Multiple input formats** — pass a dict, list, JSON string, or YAML string
|
|
20
|
+
- **Multiple output formats** — base64 (default), hex, or UUID v8
|
|
21
|
+
- **Multiple hash algorithms** — blake2b (default), blake2s, sha256, md5, shake128
|
|
22
|
+
- **Predictable length** — `hash_length` controls the exact number of characters returned
|
|
23
|
+
- **Fast** — uses orjson for serialisation by default (5-10x faster than stdlib json)
|
|
24
|
+
|
|
25
|
+
## Install
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install individu8
|
|
29
|
+
# or
|
|
30
|
+
uv add individu8
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Usage
|
|
34
|
+
|
|
35
|
+
### Basic
|
|
36
|
+
|
|
37
|
+
```python
|
|
38
|
+
from individu8 import individu8
|
|
39
|
+
|
|
40
|
+
# hash a dict — returns exactly hash_length characters (default 14)
|
|
41
|
+
individu8({"id": 1, "name": "Alice"})
|
|
42
|
+
# "FszF+jYmhYS17K"
|
|
43
|
+
|
|
44
|
+
# hash a list — returns a list of hashes in the same order
|
|
45
|
+
individu8([{"id": 1}, {"id": 2}])
|
|
46
|
+
# ["GICezwtC7+vhEA", "DxgzKROIe5u0sQ"]
|
|
47
|
+
|
|
48
|
+
# hash a JSON string — identical to dict input
|
|
49
|
+
individu8('{"id": 1, "name": "Alice"}')
|
|
50
|
+
# "FszF+jYmhYS17K"
|
|
51
|
+
|
|
52
|
+
# hash a YAML string — identical to dict input
|
|
53
|
+
individu8("id: 1\nname: Alice")
|
|
54
|
+
# "FszF+jYmhYS17K"
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Filtering
|
|
58
|
+
|
|
59
|
+
Filtering is applied as a pipeline in this order:
|
|
60
|
+
|
|
61
|
+
1. `exclude_all_keys_starting_with` / `exclude_all_keys_ending_with` / `exclude_all_keys_containing` — removes matching keys recursively at any depth
|
|
62
|
+
2. `exclude` — removes specific keys or jsonpath paths from the full document
|
|
63
|
+
3. `include` — narrows to only the specified keys or paths
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
# exclude system/metadata keys at any depth
|
|
67
|
+
individu8(data, exclude_all_keys_starting_with=["_meta", "_dlt"])
|
|
68
|
+
individu8(data, exclude_all_keys_ending_with=["_at", "_id"])
|
|
69
|
+
individu8(data, exclude_all_keys_containing=["temp"])
|
|
70
|
+
|
|
71
|
+
# exclude specific top-level keys
|
|
72
|
+
individu8(data, exclude=["updated_at", "created_at"])
|
|
73
|
+
|
|
74
|
+
# exclude a nested key using jsonpath — only removes "code" inside this specific path,
|
|
75
|
+
# not other keys named "code" elsewhere in the document
|
|
76
|
+
individu8(data, exclude=["person.address[*].street.number.code"])
|
|
77
|
+
|
|
78
|
+
# hash only specific fields
|
|
79
|
+
individu8(data, include=["id", "name"])
|
|
80
|
+
|
|
81
|
+
# combine: hash only street data, excluding volatile subfields
|
|
82
|
+
individu8(
|
|
83
|
+
data,
|
|
84
|
+
exclude=["person.address[*].street.updated_at"],
|
|
85
|
+
include=["person.address[*].street"],
|
|
86
|
+
)
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### Hash length
|
|
90
|
+
|
|
91
|
+
`hash_length` controls the exact number of characters in the returned string.
|
|
92
|
+
The library computes as many bytes as needed internally and truncates to exactly
|
|
93
|
+
`hash_length` characters — you always get exactly what you asked for.
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
individu8(data, hash_length=8) # 8 chars
|
|
97
|
+
individu8(data, hash_length=14) # 14 chars (default)
|
|
98
|
+
individu8(data, hash_length=20) # 20 chars
|
|
99
|
+
individu8(data, hash_length=32) # 32 chars
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
For clean alignment with base64 encoding use multiples of 4 (12, 16, 20, 24) —
|
|
103
|
+
other values work fine, but the last 1–3 encoded characters (and the entropy they carry) are discarded by
|
|
104
|
+
truncation.
|
|
105
|
+
|
|
106
|
+
For expert control over digest size in bytes, use `hash_bytes` instead:
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
individu8(data, hash_bytes=16) # 16-byte digest → 22 base64 chars
|
|
110
|
+
individu8(data, hash_bytes=32) # 32-byte digest → 43 base64 chars
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### Hash format
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
# base64 (default) — compact ASCII, no padding; not URL-safe (output may contain "+")
|
|
117
|
+
individu8(data)
|
|
118
|
+
# "FszF+jYmhYS17K"
|
|
119
|
+
|
|
120
|
+
# hex — lowercase hexadecimal, hash_bytes * 2 characters
|
|
121
|
+
individu8(data, hash_format="hex")
|
|
122
|
+
# "19ccbe8d3d2f6b"
|
|
123
|
+
|
|
124
|
+
# UUID v8 — deterministic, RFC 9562 compliant, always 36 characters
|
|
125
|
+
# 122 bits of entropy (128 minus 6 version/variant bits)
|
|
126
|
+
# accepted anywhere a UUID is expected
|
|
127
|
+
individu8(data, hash_format="uuid")
|
|
128
|
+
# "6b5dc393-b52a-8ea0-a003-4f6986d2db77"
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
UUID v8 is the RFC 9562 "custom" version — designed for exactly this use case
|
|
132
|
+
where you want deterministic, content-based UUIDs using your own hash algorithm.
|
|
133
|
+
|
|
134
|
+
### Hash algorithms
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
individu8(data, hash_algorithm="blake2b") # default — fast, modern, recommended
|
|
138
|
+
individu8(data, hash_algorithm="blake2s") # 32-bit optimised, max hash_bytes=32
|
|
139
|
+
individu8(data, hash_algorithm="sha256") # widely compatible; used by AWS SigV4 and git's optional SHA-256 object format
|
|
140
|
+
individu8(data, hash_algorithm="md5") # legacy, used for ETags/checksums
|
|
141
|
+
individu8(data, hash_algorithm="shake128") # SHA-3, matches dlt _dlt_id format
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### Output structure
|
|
145
|
+
|
|
146
|
+
Controls what wraps the hash string(s) — separate from `hash_format` which
|
|
147
|
+
controls how each individual hash is encoded.
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
individu8(data, output="same_as_input") # default — matches input type
|
|
151
|
+
individu8(data, output="python") # always str or list[str]
|
|
152
|
+
individu8(data, output="json") # always a JSON string
|
|
153
|
+
individu8(data, output="yaml") # always a YAML string
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
### JSON backend
|
|
157
|
+
|
|
158
|
+
```python
|
|
159
|
+
individu8(data, json_backend="orjson") # default — 5-10x faster
|
|
160
|
+
individu8(data, json_backend="stdlib") # stdlib json (orjson is still a required install dependency)
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
Both backends produce identical hashes for the same input.
|
|
164
|
+
|
|
165
|
+
## Supported input types
|
|
166
|
+
|
|
167
|
+
The following Python types are handled automatically when they appear as values
|
|
168
|
+
in the data being hashed:
|
|
169
|
+
|
|
170
|
+
| Type | Serialised as |
|
|
171
|
+
|---|---|
|
|
172
|
+
| `Decimal` | string (avoids float precision loss) |
|
|
173
|
+
| `datetime` / `date` / `time` | ISO 8601 string |
|
|
174
|
+
| `UUID` | string |
|
|
175
|
+
| `bytes` | base64 string |
|
|
176
|
+
| `namedtuple` | dict via `_asdict()` |
|
|
177
|
+
| `@dataclass` | dict via `dataclasses.asdict()` |
|
|
178
|
+
| `Enum` | `.value` |
|
|
179
|
+
| pydantic `BaseModel` | dict via `.model_dump()` |
|
|
180
|
+
| objects with `.asdict()` | dict via `.asdict()` |
|
|
181
|
+
|
|
182
|
+
## Development
|
|
183
|
+
|
|
184
|
+
```bash
|
|
185
|
+
git clone https://github.com/niklasskoldmark/individu8
|
|
186
|
+
cd individu8
|
|
187
|
+
uv sync
|
|
188
|
+
uv run pytest
|
|
189
|
+
uv run ruff check src/ tests/
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
## Changelog
|
|
193
|
+
|
|
194
|
+
See [CHANGELOG.md](CHANGELOG.md).
|
|
195
|
+
|
|
196
|
+
## License
|
|
197
|
+
|
|
198
|
+
MIT — see [LICENSE](LICENSE).
|