scicanonicalhash 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,15 @@
1
+ .venv
2
+ .DS_Store
3
+ */*/__pycache__
4
+ others_projects/sciforge
5
+ scistack-gui/extension/node_modules/
6
+ scistack-gui/frontend/node_modules/__pycache__/
7
+ *.pyc
8
+ __pycache__/
9
+ *.pyc
10
+ *.pyo
11
+
12
+ # Generated database artifacts (DuckDB data/lineage + write-ahead logs)
13
+ *.duckdb
14
+ *.duckdb.wal
15
+ *.wal
@@ -0,0 +1,73 @@
1
+ Metadata-Version: 2.4
2
+ Name: scicanonicalhash
3
+ Version: 0.1.0
4
+ Summary: Deterministic hashing for arbitrary Python objects
5
+ Project-URL: Repository, https://github.com/example/scicanonicalhash
6
+ Project-URL: Issues, https://github.com/example/scicanonicalhash/issues
7
+ Author: SciStack Contributors
8
+ License-Expression: MIT
9
+ Keywords: cache-key,canonical,data-science,deterministic,hash,reproducibility
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Scientific/Engineering
20
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
+ Classifier: Typing :: Typed
22
+ Requires-Python: >=3.9
23
+ Provides-Extra: all
24
+ Requires-Dist: numpy>=1.20; extra == 'all'
25
+ Requires-Dist: pandas>=1.3; extra == 'all'
26
+ Provides-Extra: dev
27
+ Requires-Dist: mypy>=1.0; extra == 'dev'
28
+ Requires-Dist: pytest-cov>=4.0; extra == 'dev'
29
+ Requires-Dist: pytest>=7.0; extra == 'dev'
30
+ Requires-Dist: ruff>=0.1.0; extra == 'dev'
31
+ Provides-Extra: numpy
32
+ Requires-Dist: numpy>=1.20; extra == 'numpy'
33
+ Provides-Extra: pandas
34
+ Requires-Dist: pandas>=1.3; extra == 'pandas'
35
+ Description-Content-Type: text/markdown
36
+
37
+ # Canonical Hash
38
+
39
+ Deterministic hashing for arbitrary Python objects.
40
+
41
+ Provides utilities for creating stable, deterministic hashes of Python objects, essential for cache key computation, data versioning, and reproducibility.
42
+
43
+ ## Usage
44
+
45
+ ```python
46
+ from scicanonicalhash import canonical_hash, generate_record_id
47
+
48
+ # Hash any supported Python object
49
+ h = canonical_hash(42)
50
+ h = canonical_hash([1, 2, 3])
51
+ h = canonical_hash({"key": "value"})
52
+
53
+ # Generates a 16-character hex string (first 64 bits of SHA-256)
54
+ assert len(h) == 16
55
+ assert canonical_hash(42) == canonical_hash(42) # Deterministic
56
+ ```
57
+
58
+ ## Supported Types
59
+
60
+ 1. JSON-serializable primitives (None, bool, int, float, str)
61
+ 2. numpy ndarrays (via shape + dtype + raw bytes)
62
+ 3. pandas DataFrames (via columns + index + array serialization)
63
+ 4. pandas Series (via name + array serialization)
64
+ 5. Dicts (sorted keys, recursive serialization)
65
+ 6. Lists/tuples (order-preserving, recursive serialization)
66
+
67
+ ## `generate_record_id`
68
+
69
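+ Generate a deterministic 16-character hex record ID from the record type, schema version, content hash, and metadata (a truncated SHA-256 digest, so collisions are unlikely but not impossible):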
70
+
71
+ ```python
72
+ rid = generate_record_id("MyData", 1, "abc123", {"subject": 1})
73
+ ```
@@ -0,0 +1,37 @@
1
+ # Canonical Hash
2
+
3
+ Deterministic hashing for arbitrary Python objects.
4
+
5
+ Provides utilities for creating stable, deterministic hashes of Python objects, essential for cache key computation, data versioning, and reproducibility.
6
+
7
+ ## Usage
8
+
9
+ ```python
10
+ from scicanonicalhash import canonical_hash, generate_record_id
11
+
12
+ # Hash any supported Python object
13
+ h = canonical_hash(42)
14
+ h = canonical_hash([1, 2, 3])
15
+ h = canonical_hash({"key": "value"})
16
+
17
+ # Generates a 16-character hex string (first 64 bits of SHA-256)
18
+ assert len(h) == 16
19
+ assert canonical_hash(42) == canonical_hash(42) # Deterministic
20
+ ```
21
+
22
+ ## Supported Types
23
+
24
+ 1. JSON-serializable primitives (None, bool, int, float, str)
25
+ 2. numpy ndarrays (via shape + dtype + raw bytes)
26
+ 3. pandas DataFrames (via columns + index + array serialization)
27
+ 4. pandas Series (via name + array serialization)
28
+ 5. Dicts (sorted keys, recursive serialization)
29
+ 6. Lists/tuples (order-preserving, recursive serialization)
30
+
31
+ ## `generate_record_id`
32
+
33
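+ Generate a deterministic 16-character hex record ID from the record type, schema version, content hash, and metadata (a truncated SHA-256 digest, so collisions are unlikely but not impossible):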
34
+
35
+ ```python
36
+ rid = generate_record_id("MyData", 1, "abc123", {"subject": 1})
37
+ ```
@@ -0,0 +1,91 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "scicanonicalhash"
7
+ version = "0.1.0"
8
+ description = "Deterministic hashing for arbitrary Python objects"
9
+ readme = "README.md"
10
+ license = "MIT"
11
+ requires-python = ">=3.9"
12
+ authors = [
13
+ { name = "SciStack Contributors" }
14
+ ]
15
+ keywords = [
16
+ "hash",
17
+ "deterministic",
18
+ "canonical",
19
+ "cache-key",
20
+ "reproducibility",
21
+ "data-science",
22
+ ]
23
+ classifiers = [
24
+ "Development Status :: 4 - Beta",
25
+ "Intended Audience :: Developers",
26
+ "Intended Audience :: Science/Research",
27
+ "License :: OSI Approved :: MIT License",
28
+ "Operating System :: OS Independent",
29
+ "Programming Language :: Python :: 3",
30
+ "Programming Language :: Python :: 3.10",
31
+ "Programming Language :: Python :: 3.11",
32
+ "Programming Language :: Python :: 3.12",
33
+ "Topic :: Scientific/Engineering",
34
+ "Topic :: Software Development :: Libraries :: Python Modules",
35
+ "Typing :: Typed",
36
+ ]
37
+ dependencies = []
38
+
39
+ [project.optional-dependencies]
40
+ numpy = ["numpy>=1.20"]
41
+ pandas = ["pandas>=1.3"]
42
+ all = ["numpy>=1.20", "pandas>=1.3"]
43
+ dev = [
44
+ "pytest>=7.0",
45
+ "pytest-cov>=4.0",
46
+ "mypy>=1.0",
47
+ "ruff>=0.1.0",
48
+ ]
49
+
50
+ [project.urls]
51
+ Repository = "https://github.com/example/scicanonicalhash"
52
+ Issues = "https://github.com/example/scicanonicalhash/issues"
53
+
54
+ [tool.hatch.build.targets.sdist]
55
+ include = [
56
+ "/src",
57
+ ]
58
+
59
+ [tool.hatch.build.targets.wheel]
60
+ packages = ["src/scicanonicalhash"]
61
+
62
+ [tool.pytest.ini_options]
63
+ testpaths = ["tests"]
64
+ pythonpath = ["src"]
65
+
66
+ [tool.mypy]
67
+ python_version = "3.10"
68
+ strict = true
69
+ warn_return_any = true
70
+ warn_unused_configs = true
71
+
72
+ [tool.ruff]
73
+ target-version = "py310"
74
+ line-length = 88
75
+
76
+ [tool.ruff.lint]
77
+ select = [
78
+ "E", # pycodestyle errors
79
+ "W", # pycodestyle warnings
80
+ "F", # Pyflakes
81
+ "I", # isort
82
+ "B", # flake8-bugbear
83
+ "C4", # flake8-comprehensions
84
+ "UP", # pyupgrade
85
+ ]
86
+ ignore = [
87
+ "E501", # line too long (handled by formatter)
88
+ ]
89
+
90
+ [tool.ruff.lint.isort]
91
+ known-first-party = ["scicanonicalhash"]
@@ -0,0 +1,11 @@
1
+ """Deterministic hashing for arbitrary Python objects.
2
+
3
+ This package provides utilities for creating stable, deterministic hashes
4
+ of Python objects, essential for cache key computation, data versioning,
5
+ and reproducibility in data pipelines.
6
+ """
7
+
8
+ from scicanonicalhash.hashing import canonical_hash, generate_record_id
9
+
10
+ __all__ = ["canonical_hash", "generate_record_id"]
11
+ __version__ = "0.1.0"
@@ -0,0 +1,147 @@
1
+ """Deterministic hashing for arbitrary Python objects.
2
+
3
+ This module provides utilities for creating stable, deterministic hashes
4
+ of Python objects, which is essential for cache key computation, data
5
+ versioning, and reproducibility.
6
+ """
7
+
8
+ import hashlib
9
+ import json
10
+ from typing import Any
11
+
12
+
13
def canonical_hash(obj: Any) -> str:
    """
    Generate a deterministic hash for arbitrary Python objects.

    Strategy:
    1. For JSON-serializable primitives (None, bool, int, float, str): use JSON
    2. For numpy ndarrays: use shape + dtype + raw bytes (object-dtype arrays
       are serialized element by element instead, see ``_serialize_for_hash``)
    3. For pandas DataFrames: use columns + index + array serialization
    4. For pandas Series: use name + array serialization
    5. For dicts: sort keys, recursively serialize
    6. For lists/tuples: preserve order, recursively serialize
    7. For ``array.array``: convert to a numpy array and serialize that
    8. For other objects: raise ValueError

    Args:
        obj: Any Python object to hash

    Returns:
        16-character hex string (first 64 bits of SHA-256)

    Raises:
        ValueError: If an unserializable object is provided

    Example:
        >>> h = canonical_hash(42)
        >>> len(h) == 16 and all(c in '0123456789abcdef' for c in h)
        True
        >>> canonical_hash(42) == canonical_hash(42)  # Deterministic
        True
        >>> canonical_hash([1, 2, 3]) != canonical_hash([1, 2, 4])  # Content-sensitive
        True
    """
    serialized = _serialize_for_hash(obj)
    # hexdigest() is 64 hex chars; the first 16 correspond to 64 bits.
    return hashlib.sha256(serialized).hexdigest()[:16]


def _serialize_for_hash(obj: Any) -> bytes:
    """Convert object to bytes for hashing.

    The checks below are duck-typed (``hasattr``) so that numpy and pandas
    never have to be imported just to recognise their objects. The order of
    the checks matters: the ndarray test must run before the pandas tests.

    Args:
        obj: The object to serialize.

    Returns:
        A byte string that is fed to SHA-256 by ``canonical_hash``.

    Raises:
        ValueError: If ``obj`` (or anything nested inside it) is of an
            unsupported type.
    """
    # NOTE(review): parts of a container are joined with a bare b"|" and carry
    # no length prefix, so differently nested sequences can serialize to the
    # same bytes (e.g. [[1, 2], 3] and [[1, 2, 3]]). Fixing this changes every
    # container digest, so it needs a deliberate, versioned format change.

    # Primitives - use JSON for stability
    if isinstance(obj, (type(None), bool, int, float, str)):
        return json.dumps(obj).encode("utf-8")

    # Dicts - sort keys for determinism
    if isinstance(obj, dict):
        # Primary sort key is str(key), exactly as before. The type name only
        # breaks ties between keys that stringify identically (1 vs "1"), which
        # previously fell back to insertion order and made the hash unstable.
        sorted_items = sorted(
            obj.items(),
            key=lambda item: (str(item[0]), type(item[0]).__name__),
        )
        parts = []
        for k, v in sorted_items:
            parts.append(_serialize_for_hash(k))
            parts.append(_serialize_for_hash(v))
        return b"dict:" + b"|".join(parts)

    # Lists/tuples - preserve order
    if isinstance(obj, (list, tuple)):
        type_prefix = b"list:" if isinstance(obj, list) else b"tuple:"
        parts = [_serialize_for_hash(item) for item in obj]
        return type_prefix + b"|".join(parts)

    # Numpy arrays - use shape, dtype, and raw bytes
    if hasattr(obj, "tobytes") and hasattr(obj, "dtype") and hasattr(obj, "shape"):
        header = (
            b"ndarray:"
            + str(obj.shape).encode()
            + b":"
            + str(obj.dtype).encode()
            + b":"
        )
        if getattr(obj.dtype, "hasobject", False):
            # For object dtypes tobytes() returns the raw object pointers, not
            # the values, so the bytes differ between processes. Serialize the
            # elements themselves; unsupported elements raise ValueError.
            return header + _serialize_for_hash(obj.tolist())
        return header + obj.tobytes()

    # Pandas DataFrame
    if hasattr(obj, "to_numpy") and hasattr(obj, "columns"):
        arr = obj.to_numpy()
        cols = list(obj.columns)
        idx = list(obj.index) if hasattr(obj, "index") else []
        return (
            b"dataframe:"
            + _serialize_for_hash(cols)
            + b":"
            + _serialize_for_hash(idx)
            + b":"
            + _serialize_for_hash(arr)
        )

    # Pandas Series
    if hasattr(obj, "to_numpy") and hasattr(obj, "name") and not hasattr(obj, "columns"):
        return (
            b"series:"
            + _serialize_for_hash(obj.name)
            + b":"
            + _serialize_for_hash(obj.to_numpy())
        )

    # Python array.array (MATLAB bridge can produce these)
    import array as _array_mod
    if isinstance(obj, _array_mod.array):
        # numpy is only imported on this path, so it stays an optional
        # dependency for every other supported type.
        import numpy as np
        return _serialize_for_hash(np.array(obj))

    # Unsupported type
    raise ValueError(f"Unserializable data type: {type(obj)}")
112
+
113
+
114
def generate_record_id(
    class_name: str,
    schema_version: int,
    content_hash: str,
    metadata: dict,
) -> str:
    """
    Derive a record ID from a record's type, schema, content and metadata.

    The four inputs are rendered as labelled fields, joined with ``|`` and
    hashed with SHA-256. The metadata dict is first reduced to its own
    ``canonical_hash`` digest, so it must contain only hashable-by-
    ``canonical_hash`` values.

    Args:
        class_name: The record type (e.g., "RotationMatrix")
        schema_version: Integer version of the serialization schema
        content_hash: Pre-computed hash of the data content
        metadata: The addressing metadata (subject, trial, etc.)

    Returns:
        16-character hex string (the first 16 hex digits of the SHA-256 digest)

    Example:
        >>> rid = generate_record_id("MyData", 1, "abc123", {"subject": 1})
        >>> len(rid) == 16 and all(c in '0123456789abcdef' for c in rid)
        True
    """
    meta_digest = canonical_hash(metadata)
    fields = (
        f"class:{class_name}",
        f"schema:{schema_version}",
        f"content:{content_hash}",
        f"meta:{meta_digest}",
    )
    hasher = hashlib.sha256()
    hasher.update("|".join(fields).encode("utf-8"))
    # Keep only the leading 16 hex characters of the 64-character digest.
    return hasher.hexdigest()[:16]