pydiverse-common 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pydiverse/common/dtypes.py +7 -4
- pydiverse/common/util/hashing.py +78 -0
- pydiverse/common/util/structlog.py +10 -1
- {pydiverse_common-0.3.5.dist-info → pydiverse_common-0.3.7.dist-info}/METADATA +1 -1
- {pydiverse_common-0.3.5.dist-info → pydiverse_common-0.3.7.dist-info}/RECORD +7 -7
- {pydiverse_common-0.3.5.dist-info → pydiverse_common-0.3.7.dist-info}/WHEEL +0 -0
- {pydiverse_common-0.3.5.dist-info → pydiverse_common-0.3.7.dist-info}/licenses/LICENSE +0 -0
pydiverse/common/dtypes.py
CHANGED
@@ -103,7 +103,10 @@ class Dtype:
|
|
103
103
|
type_, pd.core.dtypes.common.classes(np_dtype)
|
104
104
|
)
|
105
105
|
|
106
|
-
if pd.api.types.is_signed_integer_dtype(pandas_type):
|
106
|
+
workaround = (
|
107
|
+
pandas_type is not np.floating
|
108
|
+
) # see https://github.com/pandas-dev/pandas/issues/62018
|
109
|
+
if workaround and pd.api.types.is_signed_integer_dtype(pandas_type):
|
107
110
|
if is_np_dtype(pandas_type, np.int64):
|
108
111
|
return Int64()
|
109
112
|
elif is_np_dtype(pandas_type, np.int32):
|
@@ -113,7 +116,7 @@ class Dtype:
|
|
113
116
|
elif is_np_dtype(pandas_type, np.int8):
|
114
117
|
return Int8()
|
115
118
|
raise TypeError
|
116
|
-
if pd.api.types.is_unsigned_integer_dtype(pandas_type):
|
119
|
+
if workaround and pd.api.types.is_unsigned_integer_dtype(pandas_type):
|
117
120
|
if is_np_dtype(pandas_type, np.uint64):
|
118
121
|
return UInt64()
|
119
122
|
elif is_np_dtype(pandas_type, np.uint32):
|
@@ -123,8 +126,8 @@ class Dtype:
|
|
123
126
|
elif is_np_dtype(pandas_type, np.uint8):
|
124
127
|
return UInt8()
|
125
128
|
raise TypeError
|
126
|
-
if pd.api.types.is_float_dtype(pandas_type):
|
127
|
-
if is_np_dtype(pandas_type, np.float64):
|
129
|
+
if not workaround or pd.api.types.is_float_dtype(pandas_type):
|
130
|
+
if not workaround or is_np_dtype(pandas_type, np.float64):
|
128
131
|
return Float64()
|
129
132
|
elif is_np_dtype(pandas_type, np.float32):
|
130
133
|
return Float32()
|
pydiverse/common/util/hashing.py
CHANGED
@@ -2,6 +2,14 @@
|
|
2
2
|
# SPDX-License-Identifier: BSD-3-Clause
|
3
3
|
import base64
|
4
4
|
import hashlib
|
5
|
+
import types
|
6
|
+
import warnings
|
7
|
+
|
8
|
+
try:
|
9
|
+
import polars as pl
|
10
|
+
except ImportError:
|
11
|
+
pl = types.ModuleType("pl")
|
12
|
+
pl.DataFrame = None
|
5
13
|
|
6
14
|
|
7
15
|
def stable_hash(*args: str) -> str:
|
@@ -30,3 +38,73 @@ def stable_hash(*args: str) -> str:
|
|
30
38
|
hash_digest = combined_hash.digest()
|
31
39
|
hash_str = base64.b32encode(hash_digest).decode("ascii").lower()
|
32
40
|
return hash_str[:20]
|
41
|
+
|
42
|
+
|
43
|
+
def hash_polars_dataframe(df: pl.DataFrame, use_init_repr=False) -> str:
|
44
|
+
if not use_init_repr:
|
45
|
+
try:
|
46
|
+
|
47
|
+
def unnest_all(df: pl.DataFrame) -> pl.DataFrame:
|
48
|
+
while struct_cols_and_dtypes := [
|
49
|
+
(col, dtype)
|
50
|
+
for col, dtype in df.schema.items()
|
51
|
+
if dtype == pl.Struct
|
52
|
+
]:
|
53
|
+
df = df.with_columns(
|
54
|
+
pl.col(struct_col_name).struct.rename_fields(
|
55
|
+
[stable_hash(struct_col_name, struct_field_name)]
|
56
|
+
)
|
57
|
+
for struct_col_name, struct_field_name in struct_cols_and_dtypes
|
58
|
+
).unnest(
|
59
|
+
struct_col_name for struct_col_name, _ in struct_cols_and_dtypes
|
60
|
+
)
|
61
|
+
return df
|
62
|
+
|
63
|
+
schema_hash = stable_hash(repr(df.schema))
|
64
|
+
if df.is_empty():
|
65
|
+
content_hash = "empty"
|
66
|
+
else:
|
67
|
+
# Since we need to operate on all lists, we need to access them first
|
68
|
+
# if they are within a struct.
|
69
|
+
df = unnest_all(df)
|
70
|
+
array_columns = [
|
71
|
+
col for col, dtype in df.schema.items() if dtype == pl.Array
|
72
|
+
]
|
73
|
+
list_columns = [
|
74
|
+
col for col, dtype in df.schema.items() if dtype == pl.List
|
75
|
+
]
|
76
|
+
content_hash = str(
|
77
|
+
df.lazy()
|
78
|
+
.with_columns(pl.col(array_columns).reshape([-1]).implode())
|
79
|
+
.with_columns(
|
80
|
+
# Necessary because hash() does not work on lists of strings.
|
81
|
+
# This can be removed when
|
82
|
+
# https://github.com/pola-rs/polars/issues/21523 is resolved
|
83
|
+
# in all supported versions of polars.
|
84
|
+
pl.selectors.by_dtype(pl.List(pl.String)).list.eval(
|
85
|
+
pl.element().hash()
|
86
|
+
)
|
87
|
+
)
|
88
|
+
# Necessary because hash_rows() does not work on lists.
|
89
|
+
# This can be removed when
|
90
|
+
# https://github.com/pola-rs/polars/issues/24121 is resolved
|
91
|
+
# in all supported versions of polars.
|
92
|
+
.with_columns(pl.col(*list_columns, *array_columns).hash())
|
93
|
+
.collect()
|
94
|
+
.hash_rows() # We get a Series of hashes, one for each row
|
95
|
+
# Since polars only hashes rows, we need to implode the Series into
|
96
|
+
# a single row to get a single hash
|
97
|
+
.implode()
|
98
|
+
.hash()
|
99
|
+
.item()
|
100
|
+
)
|
101
|
+
return "0" + stable_hash(schema_hash, content_hash)
|
102
|
+
except Exception:
|
103
|
+
warnings.warn(
|
104
|
+
"Failed to compute hash for polars DataFrame in fast way. "
|
105
|
+
"Falling back to to_init_repr() method.",
|
106
|
+
stacklevel=1,
|
107
|
+
)
|
108
|
+
|
109
|
+
# fallback to to_init_repr string representation
|
110
|
+
return "1" + stable_hash(df.to_init_repr(len(df)))
|
pydiverse/common/util/structlog.py
CHANGED
@@ -80,7 +80,7 @@ class PydiverseConsoleRenderer(structlog.dev.ConsoleRenderer):
|
|
80
80
|
|
81
81
|
def setup_logging(
|
82
82
|
log_level=logging.INFO,
|
83
|
-
log_stream=sys.stderr,
|
83
|
+
log_stream=None,
|
84
84
|
timestamp_format="%Y-%m-%d %H:%M:%S.%f",
|
85
85
|
):
|
86
86
|
"""Configures structlog and logging with sane defaults."""
|
@@ -92,6 +92,15 @@ def setup_logging(
|
|
92
92
|
level=log_level,
|
93
93
|
handlers=[StructlogHandler()],
|
94
94
|
)
|
95
|
+
if log_stream is None:
|
96
|
+
try:
|
97
|
+
# hack to avoid dask pickling problems with pytest capture
|
98
|
+
import structlog._output
|
99
|
+
|
100
|
+
structlog._output.stderr = sys.stderr
|
101
|
+
finally:
|
102
|
+
pass
|
103
|
+
log_stream = sys.stderr
|
95
104
|
# Configure structlog
|
96
105
|
structlog.configure(
|
97
106
|
processors=[
|
{pydiverse_common-0.3.5.dist-info → pydiverse_common-0.3.7.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: pydiverse-common
|
3
|
-
Version: 0.3.5
|
3
|
+
Version: 0.3.7
|
4
4
|
Summary: Common functionality shared between pydiverse libraries
|
5
5
|
Author: QuantCo, Inc.
|
6
6
|
Author-email: Martin Trautmann <windiana@users.sf.net>, Finn Rudolph <finn.rudolph@t-online.de>
|
{pydiverse_common-0.3.5.dist-info → pydiverse_common-0.3.7.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
pydiverse/common/__init__.py,sha256=J7b4iStFyaEMYre_jdlZ4l_8dLyrMWCIpQdsMQcB8aI,806
|
2
|
-
pydiverse/common/dtypes.py,sha256=
|
2
|
+
pydiverse/common/dtypes.py,sha256=LYZKaKYq_4uI4kUhoaCTTo5j1SRurswIOfN11Bkz25A,15986
|
3
3
|
pydiverse/common/testing.py,sha256=FcivI5wn0X3gzJhwnysKvCOgjSTTXaN6FtSFJ72jfSg,341
|
4
4
|
pydiverse/common/version.py,sha256=1IU_m4r76_Qq0u-Tyo2_bERZFOkh0ZFueVzDqcCfLO0,336
|
5
5
|
pydiverse/common/errors/__init__.py,sha256=FNeEfVbUa23b9sHkFsmxHYhY6sRgjaZysPQmlovpJrI,262
|
@@ -8,10 +8,10 @@ pydiverse/common/util/computation_tracing.py,sha256=HeXRHRUI8vxpzQ27Xcpa0StndSTP
|
|
8
8
|
pydiverse/common/util/deep_map.py,sha256=JtY5ViWMMelOiLzPF7ZjzruCfB-bETISGxCk37qETxg,2540
|
9
9
|
pydiverse/common/util/deep_merge.py,sha256=bV5p5_lsC-9nFah28EiEyG2h6U3Z5AuTqSooxOgCHN0,1929
|
10
10
|
pydiverse/common/util/disposable.py,sha256=4XoGz70YRWA9TAqnUBvRCTAdsOGBviFN0gzxU7veY9o,993
|
11
|
-
pydiverse/common/util/hashing.py,sha256=
|
11
|
+
pydiverse/common/util/hashing.py,sha256=TquRSYuArnupP9X_8G6C15L1WxvZT2TtFLyI3IIE7Do,4503
|
12
12
|
pydiverse/common/util/import_.py,sha256=K7dSgz4YyrqEvqhoOzbwgD7D8HScMoO5XoSWtjbaoUs,4056
|
13
|
-
pydiverse/common/util/structlog.py,sha256=
|
14
|
-
pydiverse_common-0.3.5.dist-info/METADATA,sha256=
|
15
|
-
pydiverse_common-0.3.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
16
|
-
pydiverse_common-0.3.5.dist-info/licenses/LICENSE,sha256=AcE6SDVuAq6v9ZLE_8eOCe_NvSE0rAPR3NR7lSowYh4,1517
|
17
|
-
pydiverse_common-0.3.5.dist-info/RECORD,,
|
13
|
+
pydiverse/common/util/structlog.py,sha256=xxhauxMuyxcKXTVg1MiPTkuvPBj8Zcr4o_v8Bq59Nig,3778
|
14
|
+
pydiverse_common-0.3.7.dist-info/METADATA,sha256=SUa7l3bVxbutbqpILoMMZOPs3SHSkYucd0c_2VZt75A,3399
|
15
|
+
pydiverse_common-0.3.7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
16
|
+
pydiverse_common-0.3.7.dist-info/licenses/LICENSE,sha256=AcE6SDVuAq6v9ZLE_8eOCe_NvSE0rAPR3NR7lSowYh4,1517
|
17
|
+
pydiverse_common-0.3.7.dist-info/RECORD,,
|
File without changes
|
File without changes
|