xolars 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xolars/__init__.py ADDED
@@ -0,0 +1,12 @@
1
+ """xolars — an xarray Dataset paired with per-dimension Polars frames.
2
+
3
+ The :class:`Xolars` container keeps an :class:`xarray.Dataset` and one Polars
4
+ frame per dimension aligned to the Dataset's coordinate order, including under
5
+ ``isel``/``sel`` selection and zarr+parquet round-trips.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from xolars._core import Xolars
11
+
12
+ __all__ = ["Xolars"]
xolars/_core.py ADDED
@@ -0,0 +1,133 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Hashable, Mapping
4
+ from pathlib import Path
5
+ from typing import Any, Generic, Iterable, TypeVar, cast
6
+
7
+ import numpy as np
8
+ import polars as pl
9
+ import xarray as xr
10
+ from attrs import define, evolve
11
+ from typing_extensions import Self
12
+ from xarray.core.types import ErrorOptionsWithWarn, ZarrWriteModes
13
+
14
# Frame type carried by an ``Xolars`` instance: eager or lazy Polars frame.
F = TypeVar("F", bound=pl.DataFrame | pl.LazyFrame)


@define(frozen=True)
class Xolars(Generic[F]):
    """An xarray Dataset paired with one Polars frame per dimension.

    Each frame in ``df`` is keyed by a dimension of ``ds`` and must contain a
    column named after that dimension. On construction every frame is
    validated against the Dataset's values for that dimension and its rows
    are reordered to follow the Dataset's order.
    """

    # The Dataset whose dimensions the frames describe.
    ds: xr.Dataset
    # Per-dimension frames, keyed by dimension name.
    df: Mapping[Hashable, F]

    def __attrs_post_init__(self) -> None:
        """Validate every frame against ``ds`` and store the reordered frames.

        Raises:
            ValueError: If a key of ``df`` is not a dimension of ``ds``, if a
                frame lacks a column named after its dimension, or if the
                values of that column do not match the Dataset's values for
                the dimension.
        """
        new_df = {}
        for dim, frame in self.df.items():
            dim_str = str(dim)
            if dim_str not in self.ds.sizes:
                raise ValueError(
                    f"'{dim}' is not a dimension in ds. Available: {list(self.ds.dims)}"
                )
            col_names = (
                frame.collect_schema().names()
                if isinstance(frame, pl.LazyFrame)
                else frame.columns
            )
            if dim_str not in col_names:
                raise ValueError(
                    f"DataFrame for dim '{dim}' is missing column '{dim_str}'"
                )
            # Collect only the dim column for validation (cheap — just IDs)
            if isinstance(frame, pl.LazyFrame):
                id_col: pl.Series = frame.select(dim_str).collect()[dim_str]
            else:
                id_col = cast(pl.DataFrame, frame)[dim_str]
            ds_dim_values = self.ds[dim_str]
            # intersect1d returns the unique shared values, so the length
            # comparison below only passes when both sides hold the same
            # values and neither side contains duplicates.
            common = np.intersect1d(ds_dim_values.to_numpy(), id_col.to_numpy())
            if len(common) != len(ds_dim_values) or len(common) != len(id_col):
                raise ValueError(
                    f"DataFrame for dim '{dim}' values don't match ds['{dim_str}'] coordinates. "
                    f"Expected {len(ds_dim_values)} number of shared values, got {len(common)}"
                )
            new_df[dim] = _reorder(frame, ds_dim_values)
        # The class is frozen, so bypass attrs' __setattr__ guard.
        object.__setattr__(self, "df", new_df)

    @classmethod
    def open(cls, path: Path | str) -> Xolars[pl.LazyFrame]:
        """Open a directory written by :meth:`write`.

        Args:
            path: Directory containing ``dataset.zarr`` and one
                ``<dim>.parquet`` file per frame. Plain strings are accepted
                as well as ``Path`` objects.

        Returns:
            An ``Xolars`` whose frames are lazy parquet scans, keyed by the
            stem of each parquet file name.
        """
        # Coerce up front: ``/`` and ``glob`` below are Path-only operations.
        root = Path(path)
        ds = xr.open_zarr(root / "dataset.zarr")
        df: dict[Hashable, pl.LazyFrame] = {}
        for parquet_path in sorted(root.glob("*.parquet")):
            df[parquet_path.stem] = pl.scan_parquet(parquet_path)
        return Xolars(ds=ds, df=df)

    def write(self, path: Path | str, mode: ZarrWriteModes) -> None:
        """Write the Dataset as zarr and each frame as parquet under ``path``.

        Args:
            path: Target directory; created (with parents) if missing. Plain
                strings are accepted as well as ``Path`` objects.
            mode: Write mode forwarded to ``Dataset.to_zarr``.
        """
        # Coerce up front: ``mkdir`` and ``/`` below are Path-only operations.
        root = Path(path)
        root.mkdir(parents=True, exist_ok=True)
        self.ds.to_zarr(root / "dataset.zarr", mode=mode)
        for dim, frame in self.df.items():
            out = root / f"{dim}.parquet"
            if isinstance(frame, pl.LazyFrame):
                frame.sink_parquet(out)
            else:
                cast(pl.DataFrame, frame).write_parquet(out)

    def isel(
        self,
        indexers: Mapping[Any, Any] | None = None,
        drop: bool = False,
        missing_dims: ErrorOptionsWithWarn = "raise",
        **indexers_kwargs: Any,
    ) -> Self:
        """Index the Dataset with ``Dataset.isel`` and filter frames to match.

        ``indexers`` and ``indexers_kwargs`` are merged (keyword arguments win
        on conflict) and forwarded to ``Dataset.isel`` together with ``drop``
        and ``missing_dims``. Frames of indexed dimensions are filtered to the
        surviving values; frames of dimensions removed by the selection are
        dropped.

        Returns:
            A new instance built with ``evolve`` from the selected Dataset
            and the filtered frames.
        """
        merged = dict(indexers or {})
        merged.update(indexers_kwargs)
        new_ds = self.ds.isel(merged, drop=drop, missing_dims=missing_dims)
        return evolve(self, ds=new_ds, df=_filter_df(self.df, new_ds, merged))

    def sel(
        self,
        indexers: Mapping[Any, Any] | None = None,
        method: str | None = None,
        tolerance: int | float | Iterable[int | float] | None = None,
        drop: bool = False,
        **indexers_kwargs: Any,
    ) -> Self:
        """Index the Dataset with ``Dataset.sel`` and filter frames to match.

        ``indexers`` and ``indexers_kwargs`` are merged (keyword arguments win
        on conflict) and forwarded to ``Dataset.sel`` together with ``method``,
        ``tolerance`` and ``drop``. Frames of indexed dimensions are filtered
        to the surviving values; frames of dimensions removed by the selection
        are dropped.

        Returns:
            A new instance built with ``evolve`` from the selected Dataset
            and the filtered frames.
        """
        merged = dict(indexers or {})
        merged.update(indexers_kwargs)
        new_ds = self.ds.sel(merged, method=method, tolerance=tolerance, drop=drop)
        return evolve(self, ds=new_ds, df=_filter_df(self.df, new_ds, merged))

    def collect(self) -> Xolars[pl.DataFrame]:
        """Return a new ``Xolars`` with every lazy frame collected.

        Frames that are already eager are passed through unchanged.
        """
        df: dict[Hashable, pl.DataFrame] = {
            k: v.collect() if isinstance(v, pl.LazyFrame) else cast(pl.DataFrame, v)
            for k, v in self.df.items()
        }
        return Xolars(self.ds, df)
104
+
105
+
106
def _reorder(frame: F, coords: xr.DataArray) -> F:
    """Return ``frame`` with its rows arranged in the order of ``coords``.

    A lookup table pairing each coordinate value with its position is
    right-joined onto the frame using the column named after ``coords``; the
    result is sorted by that position and the temporary ``__i__`` column is
    removed.
    """
    key = str(coords.name)
    positions = pl.DataFrame({key: coords.to_numpy()}).with_row_index("__i__")
    # A lazy frame can only be joined with another lazy frame.
    other = positions.lazy() if isinstance(frame, pl.LazyFrame) else positions
    aligned = cast(Any, frame).join(other, on=key, how="right")
    ordered = aligned.sort("__i__")
    return cast(F, ordered.drop("__i__"))
115
+
116
+
117
def _filter_df(
    df: Mapping[Hashable, F],
    new_ds: xr.Dataset,
    merged: dict[str, Any],
) -> dict[Hashable, F]:
    """Restrict per-dimension frames to the values that survive a selection.

    Frames whose dimension is not a key of ``merged`` are passed through
    untouched. Frames whose dimension was indexed are filtered to the values
    of that dimension in ``new_ds``, or omitted entirely when the dimension
    is no longer present in ``new_ds.dims``.
    """
    result: dict[Hashable, F] = {}
    for dim, frame in df.items():
        name = str(dim)
        if name not in merged:
            # Dimension was not indexed: keep the frame as-is.
            result[dim] = frame
            continue
        if name not in new_ds.dims:
            # A scalar index removed this dimension, so its frame goes too.
            continue
        surviving = new_ds[name].to_numpy()
        result[dim] = cast(F, frame.filter(pl.col(name).is_in(surviving)))
    return result
xolars/py.typed ADDED
File without changes
@@ -0,0 +1,96 @@
1
+ Metadata-Version: 2.4
2
+ Name: xolars
3
+ Version: 0.2.0
4
+ Summary: Xarray + Polars: an xarray Dataset paired with per-dimension Polars frames that stay aligned under selection.
5
+ Keywords: xarray,polars,zarr,dataframe,dataset,bioinformatics
6
+ Author: d-laub
7
+ Author-email: d-laub <dlaub@ucsd.edu>
8
+ License-Expression: Apache-2.0
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: License :: OSI Approved :: Apache Software License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Classifier: Typing :: Typed
17
+ Requires-Dist: numpy<3
18
+ Requires-Dist: polars>=1.39,<2
19
+ Requires-Dist: xarray>=2026.2.0,<2027
20
+ Requires-Dist: attrs>=24
21
+ Requires-Dist: typing-extensions>=4.10
22
+ Requires-Dist: zarr>=3.1.5,<4
23
+ Requires-Python: >=3.11
24
+ Project-URL: Homepage, https://github.com/d-laub/xolars
25
+ Project-URL: Repository, https://github.com/d-laub/xolars
26
+ Project-URL: Issues, https://github.com/d-laub/xolars/issues
27
+ Description-Content-Type: text/markdown
28
+
29
+ # xolars
30
+
31
+ **X**array + p**olars**: an [xarray](https://xarray.dev) `Dataset` paired with one
32
+ [Polars](https://pola.rs) frame per dimension, kept aligned to the Dataset's
33
+ coordinate order — including under `isel`/`sel` selection and zarr + parquet
34
+ round-trips.
35
+
36
+ ## Install
37
+
38
+ ```bash
39
+ uv add xolars # or: pip install xolars
40
+ ```
41
+
42
+ ## Usage
43
+
44
+ ```python
45
+ import numpy as np
46
+ import polars as pl
47
+ import xarray as xr
48
+ from xolars import Xolars
49
+
50
+ ds = xr.Dataset(
51
+ {"expr": (["gene_id", "sample_id"], np.arange(12.0).reshape(3, 4))},
52
+ coords={"gene_id": ["G1", "G2", "G3"], "sample_id": ["S1", "S2", "S3", "S4"]},
53
+ )
54
+ genes = pl.DataFrame({"gene_id": ["G1", "G2", "G3"], "chrom": ["c1", "c2", "c3"]})
55
+
56
+ xol = Xolars(ds=ds, df={"gene_id": genes})
57
+
58
+ # Selection filters the Dataset AND every per-dimension frame, together:
59
+ sub = xol.sel(gene_id=["G3", "G1"])
60
+ assert list(sub.df["gene_id"]["gene_id"]) == list(sub.ds["gene_id"].values)
61
+
62
+ # Persist to zarr (Dataset) + parquet (per-dim frames), then reopen lazily:
63
+ xol.write("mydata.xolars", mode="w")
64
+ reloaded = Xolars.open("mydata.xolars") # frames are pl.LazyFrame
65
+ eager = reloaded.collect() # -> pl.DataFrame
66
+ ```
67
+
68
+ `Xolars` is a frozen, generic container: `Xolars[pl.LazyFrame]` after `open`,
69
+ `Xolars[pl.DataFrame]` after `collect`. Construction validates that each frame's
70
+ dim column exactly matches the Dataset coordinate (same set of values, no duplicates)
71
+ and reorders rows to the Dataset's order.
72
+
73
+ ## Development
74
+
75
+ ```bash
76
+ uv sync # create .venv with runtime + dev deps
77
+ uv run pytest -q # tests
78
+ uv run prek run --all-files # ruff + pyrefly + hygiene hooks
79
+ uv run prek install # enable git hooks locally
80
+ ```
81
+
82
+ ## Releasing
83
+
84
+ Releases run via the manual `Release` GitHub Actions workflow
85
+ (`workflow_dispatch`), which uses [commitizen](https://commitizen-tools.github.io/commitizen/)
86
+ to bump the version from conventional commits, tag, create a GitHub release, and
87
+ publish to PyPI via OIDC trusted publishing. Before the workflow can succeed,
88
+ the following one-time setup is required on GitHub (out of scope for the initial
89
+ extraction):
90
+
91
+ 1. Push this repository to `https://github.com/d-laub/xolars`.
92
+ 2. Add a `GH_ACTIONS` repository secret (a PAT able to push to `main`) so the
93
+ bump commit + tag can be pushed past branch protection.
94
+ 3. Configure a `pypi` environment and enable
95
+ [PyPI trusted publishing](https://docs.pypi.org/trusted-publishers/) for the
96
+ `xolars` project pointing at the `publish` job.
@@ -0,0 +1,6 @@
1
+ xolars/__init__.py,sha256=4A92AJpojB0dlDPjZxoNIkwJf104NYo03SufwsGHvMI,383
2
+ xolars/_core.py,sha256=7-Bdds1ZK47JFdymP8u6rpIqLkqGzTzLQ4b5sTmxGEk,5141
3
+ xolars/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ xolars-0.2.0.dist-info/WHEEL,sha256=LlB9zUOn921TC3CC5yCeS6O5jsLlxqKqIpg_Zk6XXcQ,81
5
+ xolars-0.2.0.dist-info/METADATA,sha256=CKfBctG6h43ifYFwujqh7jtiJM92dOPYi1j8ExtPkak,3662
6
+ xolars-0.2.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: uv 0.11.20
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any