pydiverse-common 0.3.6__tar.gz → 0.3.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/PKG-INFO +1 -1
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/docs/source/changelog.md +10 -1
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/pyproject.toml +1 -1
- pydiverse_common-0.3.8/src/pydiverse/common/util/hashing.py +113 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/tests/test_util.py +84 -0
- pydiverse_common-0.3.8/typos.toml +2 -0
- pydiverse_common-0.3.6/src/pydiverse/common/util/hashing.py +0 -32
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/.gitattributes +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/.github/CODEOWNERS +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/.github/dependabot.yml +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/.github/scripts/check_deps.sh +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/.github/workflows/release.yml +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/.github/workflows/tests.yml +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/.github/workflows/update-lockfiles.yml +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/.gitignore +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/.pre-commit-config.yaml +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/.readthedocs.yaml +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/LICENSE +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/README.md +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/docs/Makefile +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/docs/make.bat +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/docs/package/README.md +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/docs/source/conf.py +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/docs/source/index.md +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/docs/source/license.md +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/docs/source/reference/api.rst +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/pixi.lock +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/pixi.toml +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/pytest.ini +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/src/pydiverse/common/__init__.py +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/src/pydiverse/common/dtypes.py +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/src/pydiverse/common/errors/__init__.py +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/src/pydiverse/common/testing.py +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/src/pydiverse/common/util/__init__.py +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/src/pydiverse/common/util/computation_tracing.py +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/src/pydiverse/common/util/deep_map.py +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/src/pydiverse/common/util/deep_merge.py +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/src/pydiverse/common/util/disposable.py +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/src/pydiverse/common/util/import_.py +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/src/pydiverse/common/util/structlog.py +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/src/pydiverse/common/version.py +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/tests/conftest.py +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/tests/dtypes/test_dtype_pandas.py +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/tests/dtypes/test_dtype_polars.py +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/tests/dtypes/test_dtype_pyarrow.py +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/tests/dtypes/test_dtype_sqlalchemy.py +0 -0
- {pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/tests/test_version.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: pydiverse-common
|
3
|
-
Version: 0.3.6
|
3
|
+
Version: 0.3.8
|
4
4
|
Summary: Common functionality shared between pydiverse libraries
|
5
5
|
Author: QuantCo, Inc.
|
6
6
|
Author-email: Martin Trautmann <windiana@users.sf.net>, Finn Rudolph <finn.rudolph@t-online.de>
|
@@ -1,9 +1,18 @@
|
|
1
1
|
# Changelog
|
2
2
|
|
3
|
-
## 0.3.
|
3
|
+
## 0.3.8 (2025-08-19)
|
4
|
+
- fixed util.hashing.hash_polars_dataframe for simple dataframe
|
5
|
+
|
6
|
+
## 0.3.7 (2025-08-19)
|
7
|
+
- support util.hashing.hash_polars_dataframe
|
8
|
+
|
9
|
+
## 0.3.6 (2025-08-01)
|
4
10
|
- hack structlog / dask / pytest capture incompatibility
|
5
11
|
(structlog._output.stderr != sys.stderr leads to pickle error)
|
6
12
|
|
13
|
+
## 0.3.5 (2025-06-27)
|
14
|
+
- added enum type
|
15
|
+
|
7
16
|
## 0.3.4 (2025-06-10)
|
8
17
|
- fixed pypi package dependencies
|
9
18
|
|
@@ -0,0 +1,113 @@
|
|
1
|
+
# Copyright (c) QuantCo and pydiverse contributors 2025-2025
|
2
|
+
# SPDX-License-Identifier: BSD-3-Clause
|
3
|
+
import base64
|
4
|
+
import hashlib
|
5
|
+
import types
|
6
|
+
import warnings
|
7
|
+
|
8
|
+
try:
|
9
|
+
import polars as pl
|
10
|
+
except ImportError:
|
11
|
+
pl = types.ModuleType("pl")
|
12
|
+
pl.DataFrame = None
|
13
|
+
|
14
|
+
|
15
|
+
def stable_hash(*args: str) -> str:
|
16
|
+
"""Compute a hash over a set of strings
|
17
|
+
|
18
|
+
:param args: Some strings from which to compute the cache key
|
19
|
+
:return: A sha256 base32 digest, trimmed to 20 char length
|
20
|
+
"""
|
21
|
+
|
22
|
+
combined_hash = hashlib.sha256(b"PYDIVERSE")
|
23
|
+
for arg in args:
|
24
|
+
arg_bytes = str(arg).encode("utf8")
|
25
|
+
arg_bytes_len = len(arg_bytes).to_bytes(length=8, byteorder="big")
|
26
|
+
|
27
|
+
combined_hash.update(arg_bytes_len)
|
28
|
+
combined_hash.update(arg_bytes)
|
29
|
+
|
30
|
+
# Only take first 20 characters of base32 digest (100 bits). This
|
31
|
+
# provides 50 bits of collision resistance, which is more than enough.
|
32
|
+
# To illustrate: If you were to generate 1k hashes per second,
|
33
|
+
# you still would have to wait over 800k years until you encounter
|
34
|
+
# a collision.
|
35
|
+
|
36
|
+
# NOTE: Can't use base64 because it contains lower and upper case
|
37
|
+
# letters; identifiers in pipedag are all lowercase
|
38
|
+
hash_digest = combined_hash.digest()
|
39
|
+
hash_str = base64.b32encode(hash_digest).decode("ascii").lower()
|
40
|
+
return hash_str[:20]
|
41
|
+
|
42
|
+
|
43
|
+
def hash_polars_dataframe(df: pl.DataFrame, use_init_repr=False) -> str:
|
44
|
+
if not use_init_repr:
|
45
|
+
try:
|
46
|
+
|
47
|
+
def unnest_all(df: pl.DataFrame) -> pl.DataFrame:
|
48
|
+
while struct_cols_and_dtypes := [
|
49
|
+
(col, dtype)
|
50
|
+
for col, dtype in df.schema.items()
|
51
|
+
if dtype == pl.Struct
|
52
|
+
]:
|
53
|
+
df = df.with_columns(
|
54
|
+
pl.col(struct_col_name).struct.rename_fields(
|
55
|
+
[stable_hash(struct_col_name, struct_field_name)]
|
56
|
+
)
|
57
|
+
for struct_col_name, struct_field_name in struct_cols_and_dtypes
|
58
|
+
).unnest(
|
59
|
+
struct_col_name for struct_col_name, _ in struct_cols_and_dtypes
|
60
|
+
)
|
61
|
+
return df
|
62
|
+
|
63
|
+
schema_hash = stable_hash(repr(df.schema))
|
64
|
+
if df.is_empty():
|
65
|
+
content_hash = "empty"
|
66
|
+
else:
|
67
|
+
# Since we need to operate on all lists, we need to access them first
|
68
|
+
# if they are within a struct.
|
69
|
+
df = unnest_all(df)
|
70
|
+
array_columns = [
|
71
|
+
col for col, dtype in df.schema.items() if dtype == pl.Array
|
72
|
+
]
|
73
|
+
list_columns = [
|
74
|
+
col for col, dtype in df.schema.items() if dtype == pl.List
|
75
|
+
]
|
76
|
+
lf = df.lazy()
|
77
|
+
if array_columns:
|
78
|
+
lf = lf.with_columns(pl.col(array_columns).reshape([-1]).implode())
|
79
|
+
lf = lf.with_columns(
|
80
|
+
# Necessary because hash() does not work on lists of strings.
|
81
|
+
# This can be removed when
|
82
|
+
# https://github.com/pola-rs/polars/issues/21523 is resolved
|
83
|
+
# in all supported versions of polars.
|
84
|
+
pl.selectors.by_dtype(pl.List(pl.String)).list.eval(
|
85
|
+
pl.element().hash()
|
86
|
+
)
|
87
|
+
)
|
88
|
+
if list_columns or array_columns:
|
89
|
+
# Necessary because hash_rows() does not work on lists.
|
90
|
+
# This can be removed when
|
91
|
+
# https://github.com/pola-rs/polars/issues/24121 is resolved
|
92
|
+
# in all supported versions of polars.
|
93
|
+
lf = lf.with_columns(pl.col(*list_columns, *array_columns).hash())
|
94
|
+
|
95
|
+
content_hash = str(
|
96
|
+
lf.collect()
|
97
|
+
.hash_rows() # We get a Series of hashes, one for each row
|
98
|
+
# Since polars only hashes rows, we need to implode the Series into
|
99
|
+
# a single row to get a single hash
|
100
|
+
.implode()
|
101
|
+
.hash()
|
102
|
+
.item()
|
103
|
+
)
|
104
|
+
return "0" + stable_hash(schema_hash, content_hash)
|
105
|
+
except Exception:
|
106
|
+
warnings.warn(
|
107
|
+
"Failed to compute hash for polars DataFrame in fast way. "
|
108
|
+
"Falling back to to_init_repr() method.",
|
109
|
+
stacklevel=1,
|
110
|
+
)
|
111
|
+
|
112
|
+
# fallback to to_init_repr string representation
|
113
|
+
return "1" + stable_hash(df.to_init_repr(len(df)))
|
@@ -1,12 +1,20 @@
|
|
1
1
|
# Copyright (c) QuantCo and pydiverse contributors 2025-2025
|
2
2
|
# SPDX-License-Identifier: BSD-3-Clause
|
3
3
|
import traceback
|
4
|
+
import types
|
4
5
|
from dataclasses import dataclass
|
5
6
|
|
6
7
|
import pytest
|
7
8
|
|
8
9
|
from pydiverse.common.errors import DisposedError
|
9
10
|
from pydiverse.common.util import Disposable, deep_map, requires
|
11
|
+
from pydiverse.common.util.hashing import hash_polars_dataframe
|
12
|
+
|
13
|
+
try:
|
14
|
+
import polars as pl
|
15
|
+
except ImportError:
|
16
|
+
pl = types.ModuleType("polars")
|
17
|
+
pl.DataFrame = None
|
10
18
|
|
11
19
|
|
12
20
|
def test_requires():
|
@@ -112,3 +120,79 @@ def test_deep_map():
|
|
112
120
|
res = deep_map([1, d.values()], lambda x: 2 if x == 1 else x)
|
113
121
|
assert res[0] == 2
|
114
122
|
assert list(res[1]) == list(d.values())
|
123
|
+
|
124
|
+
|
125
|
+
def check_df_hashes(df1_a: pl.DataFrame, other_dfs: list[pl.DataFrame]) -> None:
|
126
|
+
assert hash_polars_dataframe(df1_a)[0] == "0"
|
127
|
+
assert hash_polars_dataframe(df1_a, use_init_repr=True)[0] == "1"
|
128
|
+
assert hash_polars_dataframe(df1_a) == hash_polars_dataframe(df1_a)
|
129
|
+
assert hash_polars_dataframe(df1_a, use_init_repr=True) == hash_polars_dataframe(
|
130
|
+
df1_a, use_init_repr=True
|
131
|
+
)
|
132
|
+
for df1_other in other_dfs:
|
133
|
+
assert hash_polars_dataframe(df1_other)[0] == "0"
|
134
|
+
assert hash_polars_dataframe(df1_other, use_init_repr=True)[0] == "1"
|
135
|
+
assert hash_polars_dataframe(df1_a) != hash_polars_dataframe(df1_other)
|
136
|
+
assert hash_polars_dataframe(
|
137
|
+
df1_a, use_init_repr=True
|
138
|
+
) != hash_polars_dataframe(df1_other, use_init_repr=True)
|
139
|
+
assert hash_polars_dataframe(df1_other) == hash_polars_dataframe(df1_other)
|
140
|
+
assert hash_polars_dataframe(
|
141
|
+
df1_other, use_init_repr=True
|
142
|
+
) == hash_polars_dataframe(df1_other, use_init_repr=True)
|
143
|
+
|
144
|
+
|
145
|
+
@pytest.mark.skipif(pl.DataFrame is None, reason="requires polars")
|
146
|
+
def test_hashing_basic():
|
147
|
+
df1_a = pl.DataFrame(dict(x=[1]))
|
148
|
+
df1_b = pl.DataFrame(dict(y=[1]))
|
149
|
+
df1_c = pl.DataFrame(dict(x=[2]))
|
150
|
+
df1_d = pl.DataFrame(dict(x=[1.0]))
|
151
|
+
df1_e = pl.DataFrame(dict(x=[]))
|
152
|
+
|
153
|
+
check_df_hashes(df1_a, [df1_b, df1_c, df1_d, df1_e])
|
154
|
+
|
155
|
+
|
156
|
+
@pytest.mark.skipif(pl.DataFrame is None, reason="requires polars")
|
157
|
+
def test_hashing():
|
158
|
+
df1_a = pl.DataFrame(
|
159
|
+
data=dict(x=[["foo", "bar"], [""]], y=[[1, 2], None], z=[1, 2])
|
160
|
+
).with_columns(s=pl.struct("x", "y"))
|
161
|
+
df1_b = pl.DataFrame(
|
162
|
+
data=dict(x=[["foo", "bar"], [""]], z=[[1, 2], None], y=[1, 2])
|
163
|
+
).with_columns(s=pl.struct("x", "y"))
|
164
|
+
df1_c = pl.DataFrame(
|
165
|
+
data=dict(x=[["foo", "baR"], [""]], y=[[1, 2], None], z=[1, 2])
|
166
|
+
).with_columns(s=pl.struct("x", "y"))
|
167
|
+
df1_d = pl.DataFrame(
|
168
|
+
data=dict(x=[["foo", "bar"], [""]], y=[[1, 3], None], z=[1, 2])
|
169
|
+
).with_columns(s=pl.struct("x", "y"))
|
170
|
+
df1_e = pl.DataFrame(
|
171
|
+
data=dict(x=[["foo", "bar"], [""]], y=[[1, 3], []], z=[1, 2])
|
172
|
+
).with_columns(s=pl.struct("x", "y"))
|
173
|
+
df1_f = pl.DataFrame(
|
174
|
+
data=dict(x=[["foo", "bar"], [""]], y=[[1, 2], None], z=[1, 2])
|
175
|
+
).with_columns(s=pl.struct("x", "z"))
|
176
|
+
|
177
|
+
check_df_hashes(df1_a, [df1_b, df1_c, df1_d, df1_e, df1_f])
|
178
|
+
|
179
|
+
|
180
|
+
@pytest.mark.skipif(pl.DataFrame is None, reason="requires polars")
|
181
|
+
def test_hashing_array():
|
182
|
+
df1_a = pl.DataFrame(
|
183
|
+
data=dict(x=[[[1], [2], [3]]]), schema=dict(x=pl.Array(pl.UInt16, shape=(3, 1)))
|
184
|
+
)
|
185
|
+
df1_b = pl.DataFrame(
|
186
|
+
data=dict(y=[[[1], [2], [3]]]), schema=dict(y=pl.Array(pl.UInt16, shape=(3, 1)))
|
187
|
+
)
|
188
|
+
df1_c = pl.DataFrame(
|
189
|
+
data=dict(x=[[[1], [3], [2]]]), schema=dict(x=pl.Array(pl.UInt16, shape=(3, 1)))
|
190
|
+
)
|
191
|
+
df1_d = pl.DataFrame(
|
192
|
+
data=dict(x=[[[1, 2, 3]]]), schema=dict(x=pl.Array(pl.UInt16, shape=(1, 3)))
|
193
|
+
)
|
194
|
+
df1_e = pl.DataFrame(
|
195
|
+
data=dict(x=[[1, 2, 3]]), schema=dict(x=pl.Array(pl.UInt16, shape=3))
|
196
|
+
)
|
197
|
+
|
198
|
+
check_df_hashes(df1_a, [df1_b, df1_c, df1_d, df1_e])
|
@@ -1,32 +0,0 @@
|
|
1
|
-
# Copyright (c) QuantCo and pydiverse contributors 2025-2025
|
2
|
-
# SPDX-License-Identifier: BSD-3-Clause
|
3
|
-
import base64
|
4
|
-
import hashlib
|
5
|
-
|
6
|
-
|
7
|
-
def stable_hash(*args: str) -> str:
|
8
|
-
"""Compute a hash over a set of strings
|
9
|
-
|
10
|
-
:param args: Some strings from which to compute the cache key
|
11
|
-
:return: A sha256 base32 digest, trimmed to 20 char length
|
12
|
-
"""
|
13
|
-
|
14
|
-
combined_hash = hashlib.sha256(b"PYDIVERSE")
|
15
|
-
for arg in args:
|
16
|
-
arg_bytes = str(arg).encode("utf8")
|
17
|
-
arg_bytes_len = len(arg_bytes).to_bytes(length=8, byteorder="big")
|
18
|
-
|
19
|
-
combined_hash.update(arg_bytes_len)
|
20
|
-
combined_hash.update(arg_bytes)
|
21
|
-
|
22
|
-
# Only take first 20 characters of base32 digest (100 bits). This
|
23
|
-
# provides 50 bits of collision resistance, which is more than enough.
|
24
|
-
# To illustrate: If you were to generate 1k hashes per second,
|
25
|
-
# you still would have to wait over 800k years until you encounter
|
26
|
-
# a collision.
|
27
|
-
|
28
|
-
# NOTE: Can't use base64 because it contains lower and upper case
|
29
|
-
# letters; identifiers in pipedag are all lowercase
|
30
|
-
hash_digest = combined_hash.digest()
|
31
|
-
hash_str = base64.b32encode(hash_digest).decode("ascii").lower()
|
32
|
-
return hash_str[:20]
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{pydiverse_common-0.3.6 → pydiverse_common-0.3.8}/src/pydiverse/common/util/computation_tracing.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|