scicanonicalhash 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
.venv
|
|
2
|
+
.DS_Store
|
|
3
|
+
*/*/__pycache__
|
|
4
|
+
others_projects/sciforge
|
|
5
|
+
scistack-gui/extension/node_modules/
|
|
6
|
+
scistack-gui/frontend/node_modules/__pycache__/
|
|
7
|
+
*.pyc
|
|
8
|
+
__pycache__/
|
|
9
|
+
*.pyc
|
|
10
|
+
*.pyo
|
|
11
|
+
|
|
12
|
+
# Generated database artifacts (DuckDB data/lineage + write-ahead logs)
|
|
13
|
+
*.duckdb
|
|
14
|
+
*.duckdb.wal
|
|
15
|
+
*.wal
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: scicanonicalhash
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Deterministic hashing for arbitrary Python objects
|
|
5
|
+
Project-URL: Repository, https://github.com/example/scicanonicalhash
|
|
6
|
+
Project-URL: Issues, https://github.com/example/scicanonicalhash/issues
|
|
7
|
+
Author: SciStack Contributors
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
Keywords: cache-key,canonical,data-science,deterministic,hash,reproducibility
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
+
Classifier: Typing :: Typed
|
|
22
|
+
Requires-Python: >=3.9
|
|
23
|
+
Provides-Extra: all
|
|
24
|
+
Requires-Dist: numpy>=1.20; extra == 'all'
|
|
25
|
+
Requires-Dist: pandas>=1.3; extra == 'all'
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: mypy>=1.0; extra == 'dev'
|
|
28
|
+
Requires-Dist: pytest-cov>=4.0; extra == 'dev'
|
|
29
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
30
|
+
Requires-Dist: ruff>=0.1.0; extra == 'dev'
|
|
31
|
+
Provides-Extra: numpy
|
|
32
|
+
Requires-Dist: numpy>=1.20; extra == 'numpy'
|
|
33
|
+
Provides-Extra: pandas
|
|
34
|
+
Requires-Dist: pandas>=1.3; extra == 'pandas'
|
|
35
|
+
Description-Content-Type: text/markdown
|
|
36
|
+
|
|
37
|
+
# Canonical Hash
|
|
38
|
+
|
|
39
|
+
Deterministic hashing for arbitrary Python objects.
|
|
40
|
+
|
|
41
|
+
Provides utilities for creating stable, deterministic hashes of Python objects, essential for cache key computation, data versioning, and reproducibility.
|
|
42
|
+
|
|
43
|
+
## Usage
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
from scicanonicalhash import canonical_hash, generate_record_id
|
|
47
|
+
|
|
48
|
+
# Hash any supported Python object
|
|
49
|
+
h = canonical_hash(42)
|
|
50
|
+
h = canonical_hash([1, 2, 3])
|
|
51
|
+
h = canonical_hash({"key": "value"})
|
|
52
|
+
|
|
53
|
+
# Generates a 16-character hex string (first 64 bits of SHA-256)
|
|
54
|
+
assert len(h) == 16
|
|
55
|
+
assert canonical_hash(42) == canonical_hash(42) # Deterministic
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Supported Types
|
|
59
|
+
|
|
60
|
+
1. JSON-serializable primitives (None, bool, int, float, str)
|
|
61
|
+
2. numpy ndarrays (via shape + dtype + raw bytes)
|
|
62
|
+
3. pandas DataFrames (via columns + index + array serialization)
|
|
63
|
+
4. pandas Series (via name + array serialization)
|
|
64
|
+
5. Dicts (sorted keys, recursive serialization)
|
|
65
|
+
6. Lists/tuples (order-preserving, recursive serialization)
|
|
66
|
+
|
|
67
|
+
## `generate_record_id`
|
|
68
|
+
|
|
69
|
+
Generate a deterministic 16-character hex record ID from the record type, schema version, content hash, and metadata:
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
rid = generate_record_id("MyData", 1, "abc123", {"subject": 1})
|
|
73
|
+
```
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# Canonical Hash
|
|
2
|
+
|
|
3
|
+
Deterministic hashing for arbitrary Python objects.
|
|
4
|
+
|
|
5
|
+
Provides utilities for creating stable, deterministic hashes of Python objects, essential for cache key computation, data versioning, and reproducibility.
|
|
6
|
+
|
|
7
|
+
## Usage
|
|
8
|
+
|
|
9
|
+
```python
|
|
10
|
+
from scicanonicalhash import canonical_hash, generate_record_id
|
|
11
|
+
|
|
12
|
+
# Hash any supported Python object
|
|
13
|
+
h = canonical_hash(42)
|
|
14
|
+
h = canonical_hash([1, 2, 3])
|
|
15
|
+
h = canonical_hash({"key": "value"})
|
|
16
|
+
|
|
17
|
+
# Generates a 16-character hex string (first 64 bits of SHA-256)
|
|
18
|
+
assert len(h) == 16
|
|
19
|
+
assert canonical_hash(42) == canonical_hash(42) # Deterministic
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Supported Types
|
|
23
|
+
|
|
24
|
+
1. JSON-serializable primitives (None, bool, int, float, str)
|
|
25
|
+
2. numpy ndarrays (via shape + dtype + raw bytes)
|
|
26
|
+
3. pandas DataFrames (via columns + index + array serialization)
|
|
27
|
+
4. pandas Series (via name + array serialization)
|
|
28
|
+
5. Dicts (sorted keys, recursive serialization)
|
|
29
|
+
6. Lists/tuples (order-preserving, recursive serialization)
|
|
30
|
+
|
|
31
|
+
## `generate_record_id`
|
|
32
|
+
|
|
33
|
+
Generate a deterministic 16-character hex record ID from the record type, schema version, content hash, and metadata:
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
rid = generate_record_id("MyData", 1, "abc123", {"subject": 1})
|
|
37
|
+
```
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "scicanonicalhash"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Deterministic hashing for arbitrary Python objects"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.9"
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "SciStack Contributors" }
|
|
14
|
+
]
|
|
15
|
+
keywords = [
|
|
16
|
+
"hash",
|
|
17
|
+
"deterministic",
|
|
18
|
+
"canonical",
|
|
19
|
+
"cache-key",
|
|
20
|
+
"reproducibility",
|
|
21
|
+
"data-science",
|
|
22
|
+
]
|
|
23
|
+
classifiers = [
|
|
24
|
+
"Development Status :: 4 - Beta",
|
|
25
|
+
"Intended Audience :: Developers",
|
|
26
|
+
"Intended Audience :: Science/Research",
|
|
27
|
+
"License :: OSI Approved :: MIT License",
|
|
28
|
+
"Operating System :: OS Independent",
|
|
29
|
+
"Programming Language :: Python :: 3",
|
|
30
|
+
"Programming Language :: Python :: 3.10",
|
|
31
|
+
"Programming Language :: Python :: 3.11",
|
|
32
|
+
"Programming Language :: Python :: 3.12",
|
|
33
|
+
"Topic :: Scientific/Engineering",
|
|
34
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
35
|
+
"Typing :: Typed",
|
|
36
|
+
]
|
|
37
|
+
dependencies = []
|
|
38
|
+
|
|
39
|
+
[project.optional-dependencies]
|
|
40
|
+
numpy = ["numpy>=1.20"]
|
|
41
|
+
pandas = ["pandas>=1.3"]
|
|
42
|
+
all = ["numpy>=1.20", "pandas>=1.3"]
|
|
43
|
+
dev = [
|
|
44
|
+
"pytest>=7.0",
|
|
45
|
+
"pytest-cov>=4.0",
|
|
46
|
+
"mypy>=1.0",
|
|
47
|
+
"ruff>=0.1.0",
|
|
48
|
+
]
|
|
49
|
+
|
|
50
|
+
[project.urls]
|
|
51
|
+
Repository = "https://github.com/example/scicanonicalhash"
|
|
52
|
+
Issues = "https://github.com/example/scicanonicalhash/issues"
|
|
53
|
+
|
|
54
|
+
[tool.hatch.build.targets.sdist]
|
|
55
|
+
include = [
|
|
56
|
+
"/src",
|
|
57
|
+
]
|
|
58
|
+
|
|
59
|
+
[tool.hatch.build.targets.wheel]
|
|
60
|
+
packages = ["src/scicanonicalhash"]
|
|
61
|
+
|
|
62
|
+
[tool.pytest.ini_options]
|
|
63
|
+
testpaths = ["tests"]
|
|
64
|
+
pythonpath = ["src"]
|
|
65
|
+
|
|
66
|
+
[tool.mypy]
|
|
67
|
+
python_version = "3.10"
|
|
68
|
+
strict = true
|
|
69
|
+
warn_return_any = true
|
|
70
|
+
warn_unused_configs = true
|
|
71
|
+
|
|
72
|
+
[tool.ruff]
|
|
73
|
+
target-version = "py310"
|
|
74
|
+
line-length = 88
|
|
75
|
+
|
|
76
|
+
[tool.ruff.lint]
|
|
77
|
+
select = [
|
|
78
|
+
"E", # pycodestyle errors
|
|
79
|
+
"W", # pycodestyle warnings
|
|
80
|
+
"F", # Pyflakes
|
|
81
|
+
"I", # isort
|
|
82
|
+
"B", # flake8-bugbear
|
|
83
|
+
"C4", # flake8-comprehensions
|
|
84
|
+
"UP", # pyupgrade
|
|
85
|
+
]
|
|
86
|
+
ignore = [
|
|
87
|
+
"E501", # line too long (handled by formatter)
|
|
88
|
+
]
|
|
89
|
+
|
|
90
|
+
[tool.ruff.lint.isort]
|
|
91
|
+
known-first-party = ["scicanonicalhash"]
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""Deterministic hashing for arbitrary Python objects.
|
|
2
|
+
|
|
3
|
+
This package provides utilities for creating stable, deterministic hashes
|
|
4
|
+
of Python objects, essential for cache key computation, data versioning,
|
|
5
|
+
and reproducibility in data pipelines.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from scicanonicalhash.hashing import canonical_hash, generate_record_id
|
|
9
|
+
|
|
10
|
+
__all__ = ["canonical_hash", "generate_record_id"]
|
|
11
|
+
__version__ = "0.1.0"
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"""Deterministic hashing for arbitrary Python objects.
|
|
2
|
+
|
|
3
|
+
This module provides utilities for creating stable, deterministic hashes
|
|
4
|
+
of Python objects, which is essential for cache key computation, data
|
|
5
|
+
versioning, and reproducibility.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import hashlib
|
|
9
|
+
import json
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def canonical_hash(obj: Any) -> str:
    """
    Compute a short, reproducible fingerprint of a Python object.

    The object is first reduced to a canonical byte string by
    ``_serialize_for_hash`` and that byte string is then digested with
    SHA-256. Supported inputs:

    1. None, bool, int, float, str (encoded as JSON)
    2. numpy ndarrays (shape + dtype + raw bytes)
    3. pandas DataFrames (columns + index + values)
    4. pandas Series (name + values)
    5. dicts (entries ordered by key, serialized recursively)
    6. lists/tuples (order kept, serialized recursively)

    Anything else is rejected with ValueError.

    Args:
        obj: The object to fingerprint

    Returns:
        The first 16 hex characters (64 bits) of the SHA-256 digest

    Raises:
        ValueError: If ``obj`` (or something nested inside it) has an
            unsupported type

    Example:
        >>> digest = canonical_hash({"key": "value"})
        >>> len(digest)
        16
        >>> canonical_hash([1, 2, 3]) == canonical_hash([1, 2, 3])
        True
        >>> canonical_hash([1, 2, 3]) == canonical_hash([1, 2, 4])
        False
    """
    hasher = hashlib.sha256(_serialize_for_hash(obj))
    full_hex = hasher.hexdigest()
    # Keep only the leading 64 bits (16 hex characters).
    return full_hex[:16]
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _serialize_for_hash(obj: Any) -> bytes:
    """Convert an object into a canonical byte string for hashing.

    Dispatch order (first match wins):

    1. ``None``/``bool``/``int``/``float``/``str``: JSON text, UTF-8 encoded.
    2. ``dict``: ``b"dict:"`` + serialized keys and values joined by ``b"|"``,
       with entries ordered by ``str(key)`` (ties broken by the serialized
       key bytes so the result never depends on insertion order).
    3. ``list``/``tuple``: ``b"list:"``/``b"tuple:"`` + serialized items
       joined by ``b"|"``.
    4. Objects exposing ``tobytes``, ``dtype`` and ``shape`` (numpy arrays):
       ``b"ndarray:"`` + shape + dtype + raw bytes. Object-dtype arrays are
       serialized element by element instead of through ``tobytes()``.
    5. Objects exposing ``to_numpy`` and ``columns`` (pandas DataFrame):
       columns + index + values.
    6. Objects exposing ``to_numpy`` and ``name`` (pandas Series):
       name + values.
    7. ``array.array``: converted with ``numpy.array`` and handled as in 4
       (numpy must be importable for this branch).

    Args:
        obj: The object to serialize.

    Returns:
        The canonical byte representation of ``obj``.

    Raises:
        ValueError: If ``obj`` or any nested element matches none of the
            cases above.
    """
    # NOTE(review): the b"|" join used for containers is not self-delimiting,
    # e.g. [[1, 2], 3] and [[1, 2, 3]] both serialize to b"list:list:1|2|3".
    # Fixing that changes every previously issued hash, so it is left as-is
    # here and should be handled together with a format/version bump.

    # Primitives - use JSON for stability
    if isinstance(obj, (type(None), bool, int, float, str)):
        return json.dumps(obj).encode("utf-8")

    # Dicts - sort keys for determinism
    if isinstance(obj, dict):
        entries = [(str(k), _serialize_for_hash(k), v) for k, v in obj.items()]
        # str(key) alone can tie (1 vs "1"); the serialized key bytes break
        # the tie so insertion order can never leak into the hash. Ordering
        # for keys with distinct str() values is unchanged.
        entries.sort(key=lambda entry: entry[:2])
        parts = []
        for _, key_bytes, value in entries:
            parts.append(key_bytes)
            parts.append(_serialize_for_hash(value))
        return b"dict:" + b"|".join(parts)

    # Lists/tuples - preserve order
    if isinstance(obj, (list, tuple)):
        type_prefix = b"list:" if isinstance(obj, list) else b"tuple:"
        return type_prefix + b"|".join(_serialize_for_hash(item) for item in obj)

    # Numpy arrays - use shape, dtype, and raw bytes
    if hasattr(obj, "tobytes") and hasattr(obj, "dtype") and hasattr(obj, "shape"):
        header = (
            b"ndarray:"
            + str(obj.shape).encode()
            + b":"
            + str(obj.dtype).encode()
            + b":"
        )
        if getattr(obj.dtype, "kind", None) == "O":
            # tobytes() on an object array dumps object pointers, which are
            # not reproducible; hash the element values instead.
            return header + _serialize_for_hash(obj.tolist())
        return header + obj.tobytes()

    if hasattr(obj, "to_numpy"):
        # Pandas DataFrame
        if hasattr(obj, "columns"):
            arr = obj.to_numpy()
            cols = list(obj.columns)
            idx = list(obj.index) if hasattr(obj, "index") else []
            return (
                b"dataframe:"
                + _serialize_for_hash(cols)
                + b":"
                + _serialize_for_hash(idx)
                + b":"
                + _serialize_for_hash(arr)
            )

        # Pandas Series (the index is not part of the serialization)
        if hasattr(obj, "name"):
            return (
                b"series:"
                + _serialize_for_hash(obj.name)
                + b":"
                + _serialize_for_hash(obj.to_numpy())
            )

    # Python array.array (MATLAB bridge can produce these)
    import array as _array_mod

    if isinstance(obj, _array_mod.array):
        # Imported lazily: numpy is an optional dependency of the package.
        import numpy as np

        return _serialize_for_hash(np.array(obj))

    # Unsupported type
    raise ValueError(f"Unserializable data type: {type(obj)}")
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def generate_record_id(
    class_name: str,
    schema_version: int,
    content_hash: str,
    metadata: dict,
) -> str:
    """
    Derive a deterministic record ID from a record's identifying parts.

    The metadata dict is reduced to its canonical hash, the four parts are
    laid out as ``class:...|schema:...|content:...|meta:...`` and that text
    is digested with SHA-256. Identical inputs always give the same ID.

    Args:
        class_name: The record type (e.g., "RotationMatrix")
        schema_version: Integer version of the serialization schema
        content_hash: Pre-computed hash of the data content
        metadata: The addressing metadata (subject, trial, etc.)

    Returns:
        The first 16 hex characters of the SHA-256 digest

    Raises:
        ValueError: If ``metadata`` contains a value that
            ``canonical_hash`` cannot serialize

    Example:
        >>> rid = generate_record_id("MyData", 1, "abc123", {"subject": 1})
        >>> len(rid)
        16
    """
    meta_digest = canonical_hash(metadata)
    payload = "|".join(
        (
            f"class:{class_name}",
            f"schema:{schema_version}",
            f"content:{content_hash}",
            f"meta:{meta_digest}",
        )
    )
    hasher = hashlib.sha256()
    hasher.update(payload.encode("utf-8"))
    return hasher.hexdigest()[:16]
|