pydiverse-common 0.3.6__tar.gz → 0.3.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/PKG-INFO +1 -1
  2. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/docs/source/changelog.md +7 -1
  3. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/pyproject.toml +1 -1
  4. pydiverse_common-0.3.7/src/pydiverse/common/util/hashing.py +110 -0
  5. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/tests/test_util.py +73 -0
  6. pydiverse_common-0.3.7/typos.toml +2 -0
  7. pydiverse_common-0.3.6/src/pydiverse/common/util/hashing.py +0 -32
  8. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/.gitattributes +0 -0
  9. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/.github/CODEOWNERS +0 -0
  10. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  11. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/.github/dependabot.yml +0 -0
  12. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/.github/scripts/check_deps.sh +0 -0
  13. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/.github/workflows/release.yml +0 -0
  14. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/.github/workflows/tests.yml +0 -0
  15. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/.github/workflows/update-lockfiles.yml +0 -0
  16. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/.gitignore +0 -0
  17. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/.pre-commit-config.yaml +0 -0
  18. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/.readthedocs.yaml +0 -0
  19. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/LICENSE +0 -0
  20. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/README.md +0 -0
  21. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/docs/Makefile +0 -0
  22. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/docs/make.bat +0 -0
  23. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/docs/package/README.md +0 -0
  24. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/docs/source/conf.py +0 -0
  25. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/docs/source/index.md +0 -0
  26. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/docs/source/license.md +0 -0
  27. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/docs/source/reference/api.rst +0 -0
  28. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/pixi.lock +0 -0
  29. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/pixi.toml +0 -0
  30. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/pytest.ini +0 -0
  31. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/src/pydiverse/common/__init__.py +0 -0
  32. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/src/pydiverse/common/dtypes.py +0 -0
  33. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/src/pydiverse/common/errors/__init__.py +0 -0
  34. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/src/pydiverse/common/testing.py +0 -0
  35. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/src/pydiverse/common/util/__init__.py +0 -0
  36. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/src/pydiverse/common/util/computation_tracing.py +0 -0
  37. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/src/pydiverse/common/util/deep_map.py +0 -0
  38. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/src/pydiverse/common/util/deep_merge.py +0 -0
  39. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/src/pydiverse/common/util/disposable.py +0 -0
  40. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/src/pydiverse/common/util/import_.py +0 -0
  41. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/src/pydiverse/common/util/structlog.py +0 -0
  42. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/src/pydiverse/common/version.py +0 -0
  43. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/tests/conftest.py +0 -0
  44. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/tests/dtypes/test_dtype_pandas.py +0 -0
  45. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/tests/dtypes/test_dtype_polars.py +0 -0
  46. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/tests/dtypes/test_dtype_pyarrow.py +0 -0
  47. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/tests/dtypes/test_dtype_sqlalchemy.py +0 -0
  48. {pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/tests/test_version.py +0 -0
{pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydiverse-common
-Version: 0.3.6
+Version: 0.3.7
 Summary: Common functionality shared between pydiverse libraries
 Author: QuantCo, Inc.
 Author-email: Martin Trautmann <windiana@users.sf.net>, Finn Rudolph <finn.rudolph@t-online.de>
{pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/docs/source/changelog.md
@@ -1,9 +1,15 @@
 # Changelog
 
-## 0.3.5 (2025-08-01)
+## 0.3.7 (2025-08-19)
+- support util.hashing.hash_polars_dataframe
+
+## 0.3.6 (2025-08-01)
 - hack structlog / dask / pytest capture incompatibility
   (structlog._output.stderr != sys.stderr leads to pickle error)
 
+## 0.3.5 (2025-06-27)
+- added enum type
+
 ## 0.3.4 (2025-06-10)
 - fixed pypi package dependencies
 
{pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "pydiverse-common"
-version = "0.3.6"
+version = "0.3.7"
 description = "Common functionality shared between pydiverse libraries"
 authors = [
   { name = "QuantCo, Inc." },
pydiverse_common-0.3.7/src/pydiverse/common/util/hashing.py (new file)
@@ -0,0 +1,110 @@
+# Copyright (c) QuantCo and pydiverse contributors 2025-2025
+# SPDX-License-Identifier: BSD-3-Clause
+import base64
+import hashlib
+import types
+import warnings
+
+try:
+    import polars as pl
+except ImportError:
+    pl = types.ModuleType("pl")
+    pl.DataFrame = None
+
+
+def stable_hash(*args: str) -> str:
+    """Compute a hash over a set of strings
+
+    :param args: Some strings from which to compute the cache key
+    :return: A sha256 base32 digest, trimmed to 20 char length
+    """
+
+    combined_hash = hashlib.sha256(b"PYDIVERSE")
+    for arg in args:
+        arg_bytes = str(arg).encode("utf8")
+        arg_bytes_len = len(arg_bytes).to_bytes(length=8, byteorder="big")
+
+        combined_hash.update(arg_bytes_len)
+        combined_hash.update(arg_bytes)
+
+    # Only take first 20 characters of base32 digest (100 bits). This
+    # provides 50 bits of collision resistance, which is more than enough.
+    # To illustrate: If you were to generate 1k hashes per second,
+    # you still would have to wait over 800k years until you encounter
+    # a collision.
+
+    # NOTE: Can't use base64 because it contains lower and upper case
+    # letters; identifiers in pipedag are all lowercase
+    hash_digest = combined_hash.digest()
+    hash_str = base64.b32encode(hash_digest).decode("ascii").lower()
+    return hash_str[:20]
+
+
+def hash_polars_dataframe(df: pl.DataFrame, use_init_repr=False) -> str:
+    if not use_init_repr:
+        try:
+
+            def unnest_all(df: pl.DataFrame) -> pl.DataFrame:
+                while struct_cols_and_dtypes := [
+                    (col, dtype)
+                    for col, dtype in df.schema.items()
+                    if dtype == pl.Struct
+                ]:
+                    df = df.with_columns(
+                        pl.col(struct_col_name).struct.rename_fields(
+                            [stable_hash(struct_col_name, struct_field_name)]
+                        )
+                        for struct_col_name, struct_field_name in struct_cols_and_dtypes
+                    ).unnest(
+                        struct_col_name for struct_col_name, _ in struct_cols_and_dtypes
+                    )
+                return df
+
+            schema_hash = stable_hash(repr(df.schema))
+            if df.is_empty():
+                content_hash = "empty"
+            else:
+                # Since we need to operate on all lists, we need to access them first
+                # if they are within a struct.
+                df = unnest_all(df)
+                array_columns = [
+                    col for col, dtype in df.schema.items() if dtype == pl.Array
+                ]
+                list_columns = [
+                    col for col, dtype in df.schema.items() if dtype == pl.List
+                ]
+                content_hash = str(
+                    df.lazy()
+                    .with_columns(pl.col(array_columns).reshape([-1]).implode())
+                    .with_columns(
+                        # Necessary because hash() does not work on lists of strings.
+                        # This can be removed when
+                        # https://github.com/pola-rs/polars/issues/21523 is resolved
+                        # in all supported versions of polars.
+                        pl.selectors.by_dtype(pl.List(pl.String)).list.eval(
+                            pl.element().hash()
+                        )
+                    )
+                    # Necessary because hash_rows() does not work on lists.
+                    # This can be removed when
+                    # https://github.com/pola-rs/polars/issues/24121 is resolved
+                    # in all supported versions of polars.
+                    .with_columns(pl.col(*list_columns, *array_columns).hash())
+                    .collect()
+                    .hash_rows()  # We get a Series of hashes, one for each row
+                    # Since polars only hashes rows, we need to implode the Series into
+                    # a single row to get a single hash
+                    .implode()
+                    .hash()
+                    .item()
+                )
+            return "0" + stable_hash(schema_hash, content_hash)
+        except Exception:
+            warnings.warn(
+                "Failed to compute hash for polars DataFrame in fast way. "
+                "Falling back to to_init_repr() method.",
+                stacklevel=1,
+            )
+
+    # fallback to to_init_repr string representation
+    return "1" + stable_hash(df.to_init_repr(len(df)))
{pydiverse_common-0.3.6 → pydiverse_common-0.3.7}/tests/test_util.py
@@ -1,12 +1,20 @@
 # Copyright (c) QuantCo and pydiverse contributors 2025-2025
 # SPDX-License-Identifier: BSD-3-Clause
 import traceback
+import types
 from dataclasses import dataclass
 
 import pytest
 
 from pydiverse.common.errors import DisposedError
 from pydiverse.common.util import Disposable, deep_map, requires
+from pydiverse.common.util.hashing import hash_polars_dataframe
+
+try:
+    import polars as pl
+except ImportError:
+    pl = types.ModuleType("polars")
+    pl.DataFrame = None
 
 
 def test_requires():
@@ -112,3 +120,68 @@ def test_deep_map():
     res = deep_map([1, d.values()], lambda x: 2 if x == 1 else x)
     assert res[0] == 2
     assert list(res[1]) == list(d.values())
+
+
+def check_df_hashes(df1_a: pl.DataFrame, other_dfs: list[pl.DataFrame]) -> None:
+    assert hash_polars_dataframe(df1_a)[0] == "0"
+    assert hash_polars_dataframe(df1_a, use_init_repr=True)[0] == "1"
+    assert hash_polars_dataframe(df1_a) == hash_polars_dataframe(df1_a)
+    assert hash_polars_dataframe(df1_a, use_init_repr=True) == hash_polars_dataframe(
+        df1_a, use_init_repr=True
+    )
+    for df1_other in other_dfs:
+        assert hash_polars_dataframe(df1_other)[0] == "0"
+        assert hash_polars_dataframe(df1_other, use_init_repr=True)[0] == "1"
+        assert hash_polars_dataframe(df1_a) != hash_polars_dataframe(df1_other)
+        assert hash_polars_dataframe(
+            df1_a, use_init_repr=True
+        ) != hash_polars_dataframe(df1_other, use_init_repr=True)
+        assert hash_polars_dataframe(df1_other) == hash_polars_dataframe(df1_other)
+        assert hash_polars_dataframe(
+            df1_other, use_init_repr=True
+        ) == hash_polars_dataframe(df1_other, use_init_repr=True)
+
+
+@pytest.mark.skipif(pl.DataFrame is None, reason="requires polars")
+def test_hashing():
+    df1_a = pl.DataFrame(
+        data=dict(x=[["foo", "bar"], [""]], y=[[1, 2], None], z=[1, 2])
+    ).with_columns(s=pl.struct("x", "y"))
+    df1_b = pl.DataFrame(
+        data=dict(x=[["foo", "bar"], [""]], z=[[1, 2], None], y=[1, 2])
+    ).with_columns(s=pl.struct("x", "y"))
+    df1_c = pl.DataFrame(
+        data=dict(x=[["foo", "baR"], [""]], y=[[1, 2], None], z=[1, 2])
+    ).with_columns(s=pl.struct("x", "y"))
+    df1_d = pl.DataFrame(
+        data=dict(x=[["foo", "bar"], [""]], y=[[1, 3], None], z=[1, 2])
+    ).with_columns(s=pl.struct("x", "y"))
+    df1_e = pl.DataFrame(
+        data=dict(x=[["foo", "bar"], [""]], y=[[1, 3], []], z=[1, 2])
+    ).with_columns(s=pl.struct("x", "y"))
+    df1_f = pl.DataFrame(
+        data=dict(x=[["foo", "bar"], [""]], y=[[1, 2], None], z=[1, 2])
+    ).with_columns(s=pl.struct("x", "z"))
+
+    check_df_hashes(df1_a, [df1_b, df1_c, df1_d, df1_e, df1_f])
+
+
+@pytest.mark.skipif(pl.DataFrame is None, reason="requires polars")
+def test_hashing_array():
+    df1_a = pl.DataFrame(
+        data=dict(x=[[[1], [2], [3]]]), schema=dict(x=pl.Array(pl.UInt16, shape=(3, 1)))
+    )
+    df1_b = pl.DataFrame(
+        data=dict(y=[[[1], [2], [3]]]), schema=dict(y=pl.Array(pl.UInt16, shape=(3, 1)))
+    )
+    df1_c = pl.DataFrame(
+        data=dict(x=[[[1], [3], [2]]]), schema=dict(x=pl.Array(pl.UInt16, shape=(3, 1)))
+    )
+    df1_d = pl.DataFrame(
+        data=dict(x=[[[1, 2, 3]]]), schema=dict(x=pl.Array(pl.UInt16, shape=(1, 3)))
+    )
+    df1_e = pl.DataFrame(
+        data=dict(x=[[1, 2, 3]]), schema=dict(x=pl.Array(pl.UInt16, shape=3))
+    )
+
+    check_df_hashes(df1_a, [df1_b, df1_c, df1_d, df1_e])
pydiverse_common-0.3.7/typos.toml (new file)
@@ -0,0 +1,2 @@
+[default]
+extend-ignore-words-re = ["^ba$"]
pydiverse_common-0.3.6/src/pydiverse/common/util/hashing.py (removed; its content moved into the 0.3.7 file added above)
@@ -1,32 +0,0 @@
-# Copyright (c) QuantCo and pydiverse contributors 2025-2025
-# SPDX-License-Identifier: BSD-3-Clause
-import base64
-import hashlib
-
-
-def stable_hash(*args: str) -> str:
-    """Compute a hash over a set of strings
-
-    :param args: Some strings from which to compute the cache key
-    :return: A sha256 base32 digest, trimmed to 20 char length
-    """
-
-    combined_hash = hashlib.sha256(b"PYDIVERSE")
-    for arg in args:
-        arg_bytes = str(arg).encode("utf8")
-        arg_bytes_len = len(arg_bytes).to_bytes(length=8, byteorder="big")
-
-        combined_hash.update(arg_bytes_len)
-        combined_hash.update(arg_bytes)
-
-    # Only take first 20 characters of base32 digest (100 bits). This
-    # provides 50 bits of collision resistance, which is more than enough.
-    # To illustrate: If you were to generate 1k hashes per second,
-    # you still would have to wait over 800k years until you encounter
-    # a collision.
-
-    # NOTE: Can't use base64 because it contains lower and upper case
-    # letters; identifiers in pipedag are all lowercase
-    hash_digest = combined_hash.digest()
-    hash_str = base64.b32encode(hash_digest).decode("ascii").lower()
-    return hash_str[:20]
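
Editor's note: `stable_hash` (unchanged in substance between the two versions) length-prefixes every argument before feeding it to sha256. A short sketch of the property this buys, with a hypothetical `naive_hash` for contrast that is not part of the package:

```python
import hashlib

from pydiverse.common.util.hashing import stable_hash

# Hypothetical variant without the 8-byte length prefix, for contrast only.
def naive_hash(*args: str) -> str:
    h = hashlib.sha256(b"PYDIVERSE")
    for arg in args:
        h.update(arg.encode("utf8"))
    return h.hexdigest()

# Plain concatenation cannot tell where one argument ends and the next
# begins: both calls below hash the identical byte stream b"PYDIVERSEabc".
assert naive_hash("ab", "c") == naive_hash("a", "bc")

# The length prefix makes argument boundaries part of the hashed input,
# so the same two calls produce different digests.
assert stable_hash("ab", "c") != stable_hash("a", "bc")
```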