pydiverse-common 0.3.7__tar.gz → 0.3.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/PKG-INFO +1 -1
  2. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/docs/source/changelog.md +6 -0
  3. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/pyproject.toml +1 -1
  4. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/src/pydiverse/common/dtypes.py +27 -3
  5. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/src/pydiverse/common/util/hashing.py +16 -13
  6. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/tests/dtypes/test_dtype_pyarrow.py +14 -0
  7. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/tests/test_util.py +11 -0
  8. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/.gitattributes +0 -0
  9. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/.github/CODEOWNERS +0 -0
  10. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  11. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/.github/dependabot.yml +0 -0
  12. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/.github/scripts/check_deps.sh +0 -0
  13. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/.github/workflows/release.yml +0 -0
  14. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/.github/workflows/tests.yml +0 -0
  15. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/.github/workflows/update-lockfiles.yml +0 -0
  16. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/.gitignore +0 -0
  17. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/.pre-commit-config.yaml +0 -0
  18. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/.readthedocs.yaml +0 -0
  19. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/LICENSE +0 -0
  20. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/README.md +0 -0
  21. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/docs/Makefile +0 -0
  22. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/docs/make.bat +0 -0
  23. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/docs/package/README.md +0 -0
  24. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/docs/source/conf.py +0 -0
  25. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/docs/source/index.md +0 -0
  26. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/docs/source/license.md +0 -0
  27. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/docs/source/reference/api.rst +0 -0
  28. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/pixi.lock +0 -0
  29. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/pixi.toml +0 -0
  30. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/pytest.ini +0 -0
  31. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/src/pydiverse/common/__init__.py +0 -0
  32. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/src/pydiverse/common/errors/__init__.py +0 -0
  33. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/src/pydiverse/common/testing.py +0 -0
  34. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/src/pydiverse/common/util/__init__.py +0 -0
  35. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/src/pydiverse/common/util/computation_tracing.py +0 -0
  36. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/src/pydiverse/common/util/deep_map.py +0 -0
  37. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/src/pydiverse/common/util/deep_merge.py +0 -0
  38. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/src/pydiverse/common/util/disposable.py +0 -0
  39. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/src/pydiverse/common/util/import_.py +0 -0
  40. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/src/pydiverse/common/util/structlog.py +0 -0
  41. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/src/pydiverse/common/version.py +0 -0
  42. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/tests/conftest.py +0 -0
  43. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/tests/dtypes/test_dtype_pandas.py +0 -0
  44. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/tests/dtypes/test_dtype_polars.py +0 -0
  45. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/tests/dtypes/test_dtype_sqlalchemy.py +0 -0
  46. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/tests/test_version.py +0 -0
  47. {pydiverse_common-0.3.7 → pydiverse_common-0.3.9}/typos.toml +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pydiverse-common
3
- Version: 0.3.7
3
+ Version: 0.3.9
4
4
  Summary: Common functionality shared between pydiverse libraries
5
5
  Author: QuantCo, Inc.
6
6
  Author-email: Martin Trautmann <windiana@users.sf.net>, Finn Rudolph <finn.rudolph@t-online.de>
@@ -1,5 +1,11 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.3.9 (2025-08-21)
4
+ - fix enum pyarrow dtype
5
+
6
+ ## 0.3.8 (2025-08-19)
7
+ - fixed util.hashing.hash_polars_dataframe for simple dataframe
8
+
3
9
  ## 0.3.7 (2025-08-19)
4
10
  - support util.hashing.hash_polars_dataframe
5
11
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "pydiverse-common"
3
- version = "0.3.7"
3
+ version = "0.3.9"
4
4
  description = "Common functionality shared between pydiverse libraries"
5
5
  authors = [
6
6
  { name = "QuantCo, Inc." },
@@ -201,6 +201,11 @@ class Dtype:
201
201
  return NullType()
202
202
  if pa.types.is_list(arrow_type):
203
203
  return List(Dtype.from_arrow(arrow_type.value_type))
204
+ if pa.types.is_dictionary(arrow_type):
205
+ raise RuntimeError(
206
+ "Most likely this is an Enum type. But metadata about categories is "
207
+ "only in the pyarrow field and not in the pyarrow dtype"
208
+ )
204
209
  raise TypeError
205
210
 
206
211
  @staticmethod
@@ -268,7 +273,7 @@ class Dtype:
268
273
  if backend == PandasBackend.NUMPY:
269
274
  return self.to_pandas_nullable(backend)
270
275
  if backend == PandasBackend.ARROW:
271
- if self == String():
276
+ if self == String() or isinstance(self, Enum):
272
277
  return pd.StringDtype(storage="pyarrow")
273
278
  return pd.ArrowDtype(self.to_arrow())
274
279
 
@@ -355,6 +360,12 @@ class Dtype:
355
360
  NullType(): pa.null(),
356
361
  }[self]
357
362
 
363
+ def to_arrow_field(self, name: str, nullable: bool = True):
364
+ """Convert this Dtype to a PyArrow Field."""
365
+ import pyarrow as pa
366
+
367
+ return pa.field(name, self.to_arrow(), nullable=nullable)
368
+
358
369
  def to_polars(self: "Dtype"):
359
370
  """Convert this Dtype to a Polars type."""
360
371
  import polars as pl
@@ -506,6 +517,19 @@ class Enum(String):
506
517
  def to_arrow(self):
507
518
  import pyarrow as pa
508
519
 
509
- # There is also pa.dictionary(), which seems to be kind of similar to an enum.
510
- # Maybe it is better to convert to this.
520
+ # enum categories can only be maintained in pyarrow field (see to_arrow_field)
511
521
  return pa.string()
522
+
523
+ def to_arrow_field(self, name: str, nullable: bool = True):
524
+ """Convert this Dtype to a PyArrow Field."""
525
+ import pyarrow as pa
526
+
527
+ # try to mimic what polars does
528
+ return pa.field(
529
+ name,
530
+ pa.dictionary(pa.uint32(), pa.large_string()),
531
+ nullable=nullable,
532
+ metadata={
533
+ "_PL_ENUM_VALUES": "".join([f"{len(c)};{c}" for c in self.categories])
534
+ },
535
+ )
@@ -73,24 +73,27 @@ def hash_polars_dataframe(df: pl.DataFrame, use_init_repr=False) -> str:
73
73
  list_columns = [
74
74
  col for col, dtype in df.schema.items() if dtype == pl.List
75
75
  ]
76
- content_hash = str(
77
- df.lazy()
78
- .with_columns(pl.col(array_columns).reshape([-1]).implode())
79
- .with_columns(
80
- # Necessary because hash() does not work on lists of strings.
81
- # This can be removed when
82
- # https://github.com/pola-rs/polars/issues/21523 is resolved
83
- # in all supported versions of polars.
84
- pl.selectors.by_dtype(pl.List(pl.String)).list.eval(
85
- pl.element().hash()
86
- )
76
+ lf = df.lazy()
77
+ if array_columns:
78
+ lf = lf.with_columns(pl.col(array_columns).reshape([-1]).implode())
79
+ lf = lf.with_columns(
80
+ # Necessary because hash() does not work on lists of strings.
81
+ # This can be removed when
82
+ # https://github.com/pola-rs/polars/issues/21523 is resolved
83
+ # in all supported versions of polars.
84
+ pl.selectors.by_dtype(pl.List(pl.String)).list.eval(
85
+ pl.element().hash()
87
86
  )
87
+ )
88
+ if list_columns or array_columns:
88
89
  # Necessary because hash_rows() does not work on lists.
89
90
  # This can be removed when
90
91
  # https://github.com/pola-rs/polars/issues/24121 is resolved
91
92
  # in all supported versions of polars.
92
- .with_columns(pl.col(*list_columns, *array_columns).hash())
93
- .collect()
93
+ lf = lf.with_columns(pl.col(*list_columns, *array_columns).hash())
94
+
95
+ content_hash = str(
96
+ lf.collect()
94
97
  .hash_rows() # We get a Series of hashes, one for each row
95
98
  # Since polars only hashes rows, we need to implode the Series into
96
99
  # a single row to get a single hash
@@ -95,6 +95,20 @@ def test_dtype_to_pyarrow():
95
95
  assert_conversion(Datetime(), pa.timestamp("us"))
96
96
 
97
97
 
98
+ @pytest.mark.skipif(pa is None, reason="requires pyarrow")
99
+ def test_dtype_to_pyarrow_enum():
100
+ import polars as pl
101
+
102
+ def assert_conversion(type_: Dtype, expected_dtype):
103
+ df = pl.DataFrame(dict(x=["a"]), schema=dict(x=expected_dtype))
104
+ expected = df.to_arrow().schema
105
+ actual = pa.schema([type_.to_arrow_field("x", nullable=True)])
106
+ assert actual == expected
107
+ assert actual.field(0).metadata == expected.field(0).metadata
108
+
109
+ assert_conversion(Enum("a", "b;c"), pl.Enum(["a", "b;c"]))
110
+
111
+
98
112
  @pytest.mark.skipif(pa is None, reason="requires pyarrow")
99
113
  @pytest.mark.parametrize(
100
114
  "type_",
@@ -142,6 +142,17 @@ def check_df_hashes(df1_a: pl.DataFrame, other_dfs: list[pl.DataFrame]) -> None:
142
142
  ) == hash_polars_dataframe(df1_other, use_init_repr=True)
143
143
 
144
144
 
145
+ @pytest.mark.skipif(pl.DataFrame is None, reason="requires polars")
146
+ def test_hashing_basic():
147
+ df1_a = pl.DataFrame(dict(x=[1]))
148
+ df1_b = pl.DataFrame(dict(y=[1]))
149
+ df1_c = pl.DataFrame(dict(x=[2]))
150
+ df1_d = pl.DataFrame(dict(x=[1.0]))
151
+ df1_e = pl.DataFrame(dict(x=[]))
152
+
153
+ check_df_hashes(df1_a, [df1_b, df1_c, df1_d, df1_e])
154
+
155
+
145
156
  @pytest.mark.skipif(pl.DataFrame is None, reason="requires polars")
146
157
  def test_hashing():
147
158
  df1_a = pl.DataFrame(