pydiverse-common 0.3.6__py3-none-any.whl → 0.3.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pydiverse/common/util/hashing.py +81 -0
- {pydiverse_common-0.3.6.dist-info → pydiverse_common-0.3.8.dist-info}/METADATA +1 -1
- {pydiverse_common-0.3.6.dist-info → pydiverse_common-0.3.8.dist-info}/RECORD +5 -5
- {pydiverse_common-0.3.6.dist-info → pydiverse_common-0.3.8.dist-info}/WHEEL +0 -0
- {pydiverse_common-0.3.6.dist-info → pydiverse_common-0.3.8.dist-info}/licenses/LICENSE +0 -0
pydiverse/common/util/hashing.py
CHANGED
@@ -2,6 +2,14 @@
|
|
2
2
|
# SPDX-License-Identifier: BSD-3-Clause
|
3
3
|
import base64
|
4
4
|
import hashlib
|
5
|
+
import types
|
6
|
+
import warnings
|
7
|
+
|
8
|
+
try:
|
9
|
+
import polars as pl
|
10
|
+
except ImportError:
|
11
|
+
pl = types.ModuleType("pl")
|
12
|
+
pl.DataFrame = None
|
5
13
|
|
6
14
|
|
7
15
|
def stable_hash(*args: str) -> str:
|
@@ -30,3 +38,76 @@ def stable_hash(*args: str) -> str:
|
|
30
38
|
hash_digest = combined_hash.digest()
|
31
39
|
hash_str = base64.b32encode(hash_digest).decode("ascii").lower()
|
32
40
|
return hash_str[:20]
|
41
|
+
|
42
|
+
|
43
|
+
def _unnest_all_structs(df: pl.DataFrame) -> pl.DataFrame:
    """Flatten all struct columns of ``df``, including nested ones.

    Each pass renames the fields of every struct column to
    ``stable_hash(<column name>, <field name>)`` and then unnests the column.
    The loop repeats until no struct column is left, so structs nested inside
    structs are flattened as well.

    Every field gets its own new name. The previous implementation unpacked
    the ``(column, dtype)`` pairs as ``(column, field_name)`` and handed
    ``rename_fields`` a single name built from the dtype, independent of the
    number of fields in the struct.
    """
    while struct_cols_and_dtypes := [
        (col, dtype) for col, dtype in df.schema.items() if dtype == pl.Struct
    ]:
        df = df.with_columns(
            [
                pl.col(col).struct.rename_fields(
                    [stable_hash(col, field.name) for field in dtype.fields]
                )
                for col, dtype in struct_cols_and_dtypes
            ]
        ).unnest([col for col, _ in struct_cols_and_dtypes])
    return df


def _fast_hash_polars_dataframe(df: pl.DataFrame) -> str:
    """Hash schema and content of ``df`` using polars' native hashing.

    Returns ``stable_hash(schema_hash, content_hash)`` where ``schema_hash`` is
    derived from ``repr(df.schema)`` and ``content_hash`` is the literal string
    ``"empty"`` for an empty frame, or otherwise the stringified hash of all
    row hashes.
    """
    schema_hash = stable_hash(repr(df.schema))
    if df.is_empty():
        return stable_hash(schema_hash, "empty")

    # Since we need to operate on all lists, we need to access them first
    # if they are within a struct.
    df = _unnest_all_structs(df)
    array_columns = [col for col, dtype in df.schema.items() if dtype == pl.Array]
    list_columns = [col for col, dtype in df.schema.items() if dtype == pl.List]

    lf = df.lazy()
    if array_columns:
        lf = lf.with_columns(pl.col(array_columns).reshape([-1]).implode())
    lf = lf.with_columns(
        # Necessary because hash() does not work on lists of strings.
        # This can be removed when
        # https://github.com/pola-rs/polars/issues/21523 is resolved
        # in all supported versions of polars.
        pl.selectors.by_dtype(pl.List(pl.String)).list.eval(pl.element().hash())
    )
    if list_columns or array_columns:
        # Necessary because hash_rows() does not work on lists.
        # This can be removed when
        # https://github.com/pola-rs/polars/issues/24121 is resolved
        # in all supported versions of polars.
        lf = lf.with_columns(pl.col(*list_columns, *array_columns).hash())

    content_hash = str(
        lf.collect()
        .hash_rows()  # We get a Series of hashes, one for each row
        # Since polars only hashes rows, we need to implode the Series into
        # a single row to get a single hash
        .implode()
        .hash()
        .item()
    )
    return stable_hash(schema_hash, content_hash)


def hash_polars_dataframe(df: pl.DataFrame, use_init_repr: bool = False) -> str:
    """Compute a hash string for a polars DataFrame.

    Args:
        df: The DataFrame to hash.
        use_init_repr: If true, skip the fast path and hash the string
            produced by ``df.to_init_repr(len(df))`` directly.

    Returns:
        ``"0"`` followed by a stable hash when the fast path succeeds, or
        ``"1"`` followed by a stable hash of the init-repr string when the
        fallback is used (either on request or because the fast path raised).

    Warns:
        Emits a warning when the fast path raises and the fallback is taken.
    """
    if not use_init_repr:
        try:
            return "0" + _fast_hash_polars_dataframe(df)
        except Exception:
            # Best effort: any failure in the fast path degrades to the slower
            # representation-based hash instead of propagating.
            warnings.warn(
                "Failed to compute hash for polars DataFrame in fast way. "
                "Falling back to to_init_repr() method.",
                # Attribute the warning to the caller, not to this module.
                stacklevel=2,
            )

    # fallback to to_init_repr string representation
    return "1" + stable_hash(df.to_init_repr(len(df)))
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: pydiverse-common
|
3
|
-
Version: 0.3.6
|
3
|
+
Version: 0.3.8
|
4
4
|
Summary: Common functionality shared between pydiverse libraries
|
5
5
|
Author: QuantCo, Inc.
|
6
6
|
Author-email: Martin Trautmann <windiana@users.sf.net>, Finn Rudolph <finn.rudolph@t-online.de>
|
@@ -8,10 +8,10 @@ pydiverse/common/util/computation_tracing.py,sha256=HeXRHRUI8vxpzQ27Xcpa0StndSTP
|
|
8
8
|
pydiverse/common/util/deep_map.py,sha256=JtY5ViWMMelOiLzPF7ZjzruCfB-bETISGxCk37qETxg,2540
|
9
9
|
pydiverse/common/util/deep_merge.py,sha256=bV5p5_lsC-9nFah28EiEyG2h6U3Z5AuTqSooxOgCHN0,1929
|
10
10
|
pydiverse/common/util/disposable.py,sha256=4XoGz70YRWA9TAqnUBvRCTAdsOGBviFN0gzxU7veY9o,993
|
11
|
-
pydiverse/common/util/hashing.py,sha256=
|
11
|
+
pydiverse/common/util/hashing.py,sha256=8Z1NybJ_zd3ONpn5annHGjowwArWkd2ZkCtlb3dtz_Q,4576
|
12
12
|
pydiverse/common/util/import_.py,sha256=K7dSgz4YyrqEvqhoOzbwgD7D8HScMoO5XoSWtjbaoUs,4056
|
13
13
|
pydiverse/common/util/structlog.py,sha256=xxhauxMuyxcKXTVg1MiPTkuvPBj8Zcr4o_v8Bq59Nig,3778
|
14
|
-
pydiverse_common-0.3.
|
15
|
-
pydiverse_common-0.3.
|
16
|
-
pydiverse_common-0.3.
|
17
|
-
pydiverse_common-0.3.
|
14
|
+
pydiverse_common-0.3.8.dist-info/METADATA,sha256=ptAGp299BY9NSaM-XEaojLzhL_KVc0SEY-MFqqqAwL0,3399
|
15
|
+
pydiverse_common-0.3.8.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
16
|
+
pydiverse_common-0.3.8.dist-info/licenses/LICENSE,sha256=AcE6SDVuAq6v9ZLE_8eOCe_NvSE0rAPR3NR7lSowYh4,1517
|
17
|
+
pydiverse_common-0.3.8.dist-info/RECORD,,
|
File without changes
|
File without changes
|