pydiverse-common 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,14 @@
2
2
  # SPDX-License-Identifier: BSD-3-Clause
3
3
  import base64
4
4
  import hashlib
5
+ import types
6
+ import warnings
7
+
8
+ try:
9
+ import polars as pl
10
+ except ImportError:
11
+ pl = types.ModuleType("pl")
12
+ pl.DataFrame = None
5
13
 
6
14
 
7
15
  def stable_hash(*args: str) -> str:
@@ -30,3 +38,73 @@ def stable_hash(*args: str) -> str:
30
38
  hash_digest = combined_hash.digest()
31
39
  hash_str = base64.b32encode(hash_digest).decode("ascii").lower()
32
40
  return hash_str[:20]
41
+
42
+
43
def _unnest_all_structs(df: pl.DataFrame) -> pl.DataFrame:
    """Flatten all struct columns of ``df`` into top-level columns.

    Unnesting is repeated until no column of dtype ``pl.Struct`` is left, so
    structs nested inside structs are flattened as well. Before unnesting,
    every field is renamed to ``stable_hash(<struct column>, <field name>)``
    so that fields of different structs (or a field and an existing column)
    do not end up with the same column name.
    """
    while struct_columns := [
        (name, dtype) for name, dtype in df.schema.items() if dtype == pl.Struct
    ]:
        df = df.with_columns(
            [
                # One new name per field of the struct; rename_fields assigns
                # the names positionally.
                pl.col(name).struct.rename_fields(
                    [stable_hash(name, field.name) for field in dtype.fields]
                )
                for name, dtype in struct_columns
            ]
        ).unnest([name for name, _ in struct_columns])
    return df


def hash_polars_dataframe(df: pl.DataFrame, use_init_repr=False) -> str:
    """Compute a short hash string for the schema and content of ``df``.

    Two strategies exist, and the first character of the result tells which
    one produced the hash:

    * ``"0"`` + hash: fast path based on polars' own ``hash_rows()``/``hash()``
      combined with a hash of ``repr(df.schema)``.
    * ``"1"`` + hash: fallback based on ``df.to_init_repr(len(df))``.

    Args:
        df: The DataFrame to hash.
        use_init_repr: If true, skip the fast path and always use the
            ``to_init_repr()`` based hash.

    Returns:
        The strategy marker (``"0"`` or ``"1"``) followed by the value
        returned by ``stable_hash``.

    Warns:
        UserWarning: If the fast path raises any exception; the function then
            falls back to the ``to_init_repr()`` based hash.
    """
    if not use_init_repr:
        try:
            schema_hash = stable_hash(repr(df.schema))
            if df.is_empty():
                content_hash = "empty"
            else:
                # Since we need to operate on all lists, we need to access them
                # first if they are within a struct.
                # Keep the flattened frame in its own variable: if anything
                # below fails, the fallback must hash the caller's frame and
                # not the renamed/unnested one.
                flat = _unnest_all_structs(df)
                array_columns = [
                    col for col, dtype in flat.schema.items() if dtype == pl.Array
                ]
                list_columns = [
                    col for col, dtype in flat.schema.items() if dtype == pl.List
                ]
                content_hash = str(
                    flat.lazy()
                    .with_columns(pl.col(array_columns).reshape([-1]).implode())
                    .with_columns(
                        # Necessary because hash() does not work on lists of
                        # strings. This can be removed when
                        # https://github.com/pola-rs/polars/issues/21523 is
                        # resolved in all supported versions of polars.
                        pl.selectors.by_dtype(pl.List(pl.String)).list.eval(
                            pl.element().hash()
                        )
                    )
                    # Necessary because hash_rows() does not work on lists.
                    # This can be removed when
                    # https://github.com/pola-rs/polars/issues/24121 is resolved
                    # in all supported versions of polars.
                    # The names are passed as one list (not unpacked) so that
                    # the call stays valid when there are no list/array
                    # columns at all.
                    .with_columns(pl.col([*list_columns, *array_columns]).hash())
                    .collect()
                    .hash_rows()  # We get a Series of hashes, one for each row
                    # Since polars only hashes rows, we need to implode the
                    # Series into a single row to get a single hash
                    .implode()
                    .hash()
                    .item()
                )
            return "0" + stable_hash(schema_hash, content_hash)
        except Exception:
            # Deliberate best-effort: any failure of the fast path (including
            # polars not being installed) degrades to the slow path below.
            warnings.warn(
                "Failed to compute hash for polars DataFrame in fast way. "
                "Falling back to to_init_repr() method.",
                stacklevel=1,
            )

    # fallback to to_init_repr string representation
    return "1" + stable_hash(df.to_init_repr(len(df)))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pydiverse-common
3
- Version: 0.3.6
3
+ Version: 0.3.7
4
4
  Summary: Common functionality shared between pydiverse libraries
5
5
  Author: QuantCo, Inc.
6
6
  Author-email: Martin Trautmann <windiana@users.sf.net>, Finn Rudolph <finn.rudolph@t-online.de>
@@ -8,10 +8,10 @@ pydiverse/common/util/computation_tracing.py,sha256=HeXRHRUI8vxpzQ27Xcpa0StndSTP
8
8
  pydiverse/common/util/deep_map.py,sha256=JtY5ViWMMelOiLzPF7ZjzruCfB-bETISGxCk37qETxg,2540
9
9
  pydiverse/common/util/deep_merge.py,sha256=bV5p5_lsC-9nFah28EiEyG2h6U3Z5AuTqSooxOgCHN0,1929
10
10
  pydiverse/common/util/disposable.py,sha256=4XoGz70YRWA9TAqnUBvRCTAdsOGBviFN0gzxU7veY9o,993
11
- pydiverse/common/util/hashing.py,sha256=6x77BKg-w61u59fuTe9di0BtU-kEKH6UTRcKsRoYJ84,1196
11
+ pydiverse/common/util/hashing.py,sha256=TquRSYuArnupP9X_8G6C15L1WxvZT2TtFLyI3IIE7Do,4503
12
12
  pydiverse/common/util/import_.py,sha256=K7dSgz4YyrqEvqhoOzbwgD7D8HScMoO5XoSWtjbaoUs,4056
13
13
  pydiverse/common/util/structlog.py,sha256=xxhauxMuyxcKXTVg1MiPTkuvPBj8Zcr4o_v8Bq59Nig,3778
14
- pydiverse_common-0.3.6.dist-info/METADATA,sha256=TVaCnX9IArQwB64MGJ4wfVv1EE7KUDI51u9d0QnFXeY,3399
15
- pydiverse_common-0.3.6.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
16
- pydiverse_common-0.3.6.dist-info/licenses/LICENSE,sha256=AcE6SDVuAq6v9ZLE_8eOCe_NvSE0rAPR3NR7lSowYh4,1517
17
- pydiverse_common-0.3.6.dist-info/RECORD,,
14
+ pydiverse_common-0.3.7.dist-info/METADATA,sha256=SUa7l3bVxbutbqpILoMMZOPs3SHSkYucd0c_2VZt75A,3399
15
+ pydiverse_common-0.3.7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
16
+ pydiverse_common-0.3.7.dist-info/licenses/LICENSE,sha256=AcE6SDVuAq6v9ZLE_8eOCe_NvSE0rAPR3NR7lSowYh4,1517
17
+ pydiverse_common-0.3.7.dist-info/RECORD,,