xolars 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xolars-0.2.0/PKG-INFO ADDED
@@ -0,0 +1,96 @@
1
+ Metadata-Version: 2.4
2
+ Name: xolars
3
+ Version: 0.2.0
4
+ Summary: Xarray + Polars: an xarray Dataset paired with per-dimension Polars frames that stay aligned under selection.
5
+ Keywords: xarray,polars,zarr,dataframe,dataset,bioinformatics
6
+ Author: d-laub
7
+ Author-email: d-laub <dlaub@ucsd.edu>
8
+ License-Expression: Apache-2.0
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: License :: OSI Approved :: Apache Software License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Classifier: Typing :: Typed
17
+ Requires-Dist: numpy<3
18
+ Requires-Dist: polars>=1.39,<2
19
+ Requires-Dist: xarray>=2026.2.0,<2027
20
+ Requires-Dist: attrs>=24
21
+ Requires-Dist: typing-extensions>=4.10
22
+ Requires-Dist: zarr>=3.1.5,<4
23
+ Requires-Python: >=3.11
24
+ Project-URL: Homepage, https://github.com/d-laub/xolars
25
+ Project-URL: Repository, https://github.com/d-laub/xolars
26
+ Project-URL: Issues, https://github.com/d-laub/xolars/issues
27
+ Description-Content-Type: text/markdown
28
+
29
+ # xolars
30
+
31
+ **X**array + p**olars**: an [xarray](https://xarray.dev) `Dataset` paired with one
32
+ [Polars](https://pola.rs) frame per dimension, kept aligned to the Dataset's
33
+ coordinate order — including under `isel`/`sel` selection and zarr + parquet
34
+ round-trips.
35
+
36
+ ## Install
37
+
38
+ ```bash
39
+ uv add xolars # or: pip install xolars
40
+ ```
41
+
42
+ ## Usage
43
+
44
+ ```python
45
+ import numpy as np
46
+ import polars as pl
47
+ import xarray as xr
48
+ from xolars import Xolars
49
+
50
+ ds = xr.Dataset(
51
+ {"expr": (["gene_id", "sample_id"], np.arange(12.0).reshape(3, 4))},
52
+ coords={"gene_id": ["G1", "G2", "G3"], "sample_id": ["S1", "S2", "S3", "S4"]},
53
+ )
54
+ genes = pl.DataFrame({"gene_id": ["G1", "G2", "G3"], "chrom": ["c1", "c2", "c3"]})
55
+
56
+ xol = Xolars(ds=ds, df={"gene_id": genes})
57
+
58
+ # Selection filters the Dataset AND every per-dimension frame, together:
59
+ sub = xol.sel(gene_id=["G3", "G1"])
60
+ assert list(sub.df["gene_id"]["gene_id"]) == list(sub.ds["gene_id"].values)
61
+
62
+ # Persist to zarr (Dataset) + parquet (per-dim frames), then reopen lazily:
63
+ xol.write("mydata.xolars", mode="w")
64
+ reloaded = Xolars.open("mydata.xolars") # frames are pl.LazyFrame
65
+ eager = reloaded.collect() # -> pl.DataFrame
66
+ ```
67
+
68
+ `Xolars` is a frozen, generic container: `Xolars[pl.LazyFrame]` after `open`,
69
+ `Xolars[pl.DataFrame]` after `collect`. Construction validates that each frame's
70
+ dim column exactly matches the Dataset coordinate (same set, same multiplicity)
71
+ and reorders rows to the Dataset's order.
72
+
73
+ ## Development
74
+
75
+ ```bash
76
+ uv sync # create .venv with runtime + dev deps
77
+ uv run pytest -q # tests
78
+ uv run prek run --all-files # ruff + pyrefly + hygiene hooks
79
+ uv run prek install # enable git hooks locally
80
+ ```
81
+
82
+ ## Releasing
83
+
84
+ Releases run via the manual `Release` GitHub Actions workflow
85
+ (`workflow_dispatch`), which uses [commitizen](https://commitizen-tools.github.io/commitizen/)
86
+ to bump the version from conventional commits, tag, create a GitHub release, and
87
+ publish to PyPI via OIDC trusted publishing. Before the workflow can succeed,
88
+ the following one-time setup is required on GitHub (out of scope for the initial
89
+ extraction):
90
+
91
+ 1. Push this repository to `https://github.com/d-laub/xolars`.
92
+ 2. Add a `GH_ACTIONS` repository secret (a PAT able to push to `main`) so the
93
+ bump commit + tag can be pushed past branch protection.
94
+ 3. Configure a `pypi` environment and enable
95
+ [PyPI trusted publishing](https://docs.pypi.org/trusted-publishers/) for the
96
+ `xolars` project pointing at the `publish` job.
xolars-0.2.0/README.md ADDED
@@ -0,0 +1,68 @@
1
+ # xolars
2
+
3
+ **X**array + p**olars**: an [xarray](https://xarray.dev) `Dataset` paired with one
4
+ [Polars](https://pola.rs) frame per dimension, kept aligned to the Dataset's
5
+ coordinate order — including under `isel`/`sel` selection and zarr + parquet
6
+ round-trips.
7
+
8
+ ## Install
9
+
10
+ ```bash
11
+ uv add xolars # or: pip install xolars
12
+ ```
13
+
14
+ ## Usage
15
+
16
+ ```python
17
+ import numpy as np
18
+ import polars as pl
19
+ import xarray as xr
20
+ from xolars import Xolars
21
+
22
+ ds = xr.Dataset(
23
+ {"expr": (["gene_id", "sample_id"], np.arange(12.0).reshape(3, 4))},
24
+ coords={"gene_id": ["G1", "G2", "G3"], "sample_id": ["S1", "S2", "S3", "S4"]},
25
+ )
26
+ genes = pl.DataFrame({"gene_id": ["G1", "G2", "G3"], "chrom": ["c1", "c2", "c3"]})
27
+
28
+ xol = Xolars(ds=ds, df={"gene_id": genes})
29
+
30
+ # Selection filters the Dataset AND every per-dimension frame, together:
31
+ sub = xol.sel(gene_id=["G3", "G1"])
32
+ assert list(sub.df["gene_id"]["gene_id"]) == list(sub.ds["gene_id"].values)
33
+
34
+ # Persist to zarr (Dataset) + parquet (per-dim frames), then reopen lazily:
35
+ xol.write("mydata.xolars", mode="w")
36
+ reloaded = Xolars.open("mydata.xolars") # frames are pl.LazyFrame
37
+ eager = reloaded.collect() # -> pl.DataFrame
38
+ ```
39
+
40
+ `Xolars` is a frozen, generic container: `Xolars[pl.LazyFrame]` after `open`,
41
+ `Xolars[pl.DataFrame]` after `collect`. Construction validates that each frame's
42
+ dim column exactly matches the Dataset coordinate (same set, same multiplicity)
43
+ and reorders rows to the Dataset's order.
44
+
45
+ ## Development
46
+
47
+ ```bash
48
+ uv sync # create .venv with runtime + dev deps
49
+ uv run pytest -q # tests
50
+ uv run prek run --all-files # ruff + pyrefly + hygiene hooks
51
+ uv run prek install # enable git hooks locally
52
+ ```
53
+
54
+ ## Releasing
55
+
56
+ Releases run via the manual `Release` GitHub Actions workflow
57
+ (`workflow_dispatch`), which uses [commitizen](https://commitizen-tools.github.io/commitizen/)
58
+ to bump the version from conventional commits, tag, create a GitHub release, and
59
+ publish to PyPI via OIDC trusted publishing. Before the workflow can succeed,
60
+ the following one-time setup is required on GitHub (out of scope for the initial
61
+ extraction):
62
+
63
+ 1. Push this repository to `https://github.com/d-laub/xolars`.
64
+ 2. Add a `GH_ACTIONS` repository secret (a PAT able to push to `main`) so the
65
+ bump commit + tag can be pushed past branch protection.
66
+ 3. Configure a `pypi` environment and enable
67
+ [PyPI trusted publishing](https://docs.pypi.org/trusted-publishers/) for the
68
+ `xolars` project pointing at the `publish` job.
@@ -0,0 +1,55 @@
1
+ [build-system]
2
+ requires = ["uv_build>=0.9.17,<0.10.0"]
3
+ build-backend = "uv_build"
4
+
5
+ [project]
6
+ name = "xolars"
7
+ version = "0.2.0"
8
+ description = "Xarray + Polars: an xarray Dataset paired with per-dimension Polars frames that stay aligned under selection."
9
+ readme = "README.md"
10
+ license = "Apache-2.0"
11
+ requires-python = ">=3.11"
12
+ authors = [{ name = "d-laub", email = "dlaub@ucsd.edu" }]
13
+ keywords = ["xarray", "polars", "zarr", "dataframe", "dataset", "bioinformatics"]
14
+ classifiers = [
15
+ "Development Status :: 3 - Alpha",
16
+ "Intended Audience :: Science/Research",
17
+ "License :: OSI Approved :: Apache Software License",
18
+ "Programming Language :: Python :: 3",
19
+ "Programming Language :: Python :: 3.11",
20
+ "Programming Language :: Python :: 3.12",
21
+ "Programming Language :: Python :: 3.13",
22
+ "Typing :: Typed",
23
+ ]
24
+ dependencies = [
25
+ "numpy<3",
26
+ "polars>=1.39,<2",
27
+ "xarray>=2026.2.0,<2027",
28
+ "attrs>=24",
29
+ "typing_extensions>=4.10",
30
+ "zarr>=3.1.5,<4",
31
+ ]
32
+
33
+ [project.urls]
34
+ Homepage = "https://github.com/d-laub/xolars"
35
+ Repository = "https://github.com/d-laub/xolars"
36
+ Issues = "https://github.com/d-laub/xolars/issues"
37
+
38
+ [dependency-groups]
39
+ dev = [
40
+ "pytest>=8",
41
+ "ruff>=0.15",
42
+ "pyrefly>=0.16",
43
+ "prek>=0.2",
44
+ "commitizen>=4",
45
+ ]
46
+
47
+ [tool.commitizen]
48
+ name = "cz_conventional_commits"
49
+ version_provider = "pep621"
50
+ tag_format = "v$version"
51
+ update_changelog_on_bump = true
52
+ major_version_zero = true
53
+
54
+ [tool.pyrefly]
55
+ project-includes = ["src", "tests"]
@@ -0,0 +1,12 @@
1
+ """xolars — an xarray Dataset paired with per-dimension Polars frames.
2
+
3
+ The :class:`Xolars` container keeps an :class:`xarray.Dataset` and one Polars
4
+ frame per dimension aligned to the Dataset's coordinate order, including under
5
+ ``isel``/``sel`` selection and zarr+parquet round-trips.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from xolars._core import Xolars
11
+
12
+ __all__ = ["Xolars"]
@@ -0,0 +1,133 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Hashable, Mapping
4
+ from pathlib import Path
5
+ from typing import Any, Generic, Iterable, TypeVar, cast
6
+
7
+ import numpy as np
8
+ import polars as pl
9
+ import xarray as xr
10
+ from attrs import define, evolve
11
+ from typing_extensions import Self
12
+ from xarray.core.types import ErrorOptionsWithWarn, ZarrWriteModes
13
+
14
+ F = TypeVar("F", bound=pl.DataFrame | pl.LazyFrame)
15
+
16
+
17
@define(frozen=True)
class Xolars(Generic[F]):
    """An xarray Dataset paired with one Polars frame per dimension.

    On construction every frame in ``df`` is validated against the Dataset
    coordinate of the same name and its rows are reordered to follow that
    coordinate's order (see ``__attrs_post_init__``).
    """

    # Dataset whose per-dimension values define the row order of each frame.
    ds: xr.Dataset
    # Dimension name -> frame; each frame must contain a column named after
    # its dimension. Replaced by a reordered copy during construction.
    df: Mapping[Hashable, F]

    def __attrs_post_init__(self):
        """Validate each frame against ``ds`` and reorder it to match.

        Raises:
            ValueError: if a key of ``df`` is not a dimension of ``ds``, if a
                frame lacks a column named after its dimension, or if that
                column's values do not match the Dataset values for the
                dimension.
        """
        new_df = {}
        for dim, frame in self.df.items():
            dim_str = str(dim)
            if dim_str not in self.ds.sizes:
                raise ValueError(
                    f"'{dim}' is not a dimension in ds. Available: {list(self.ds.dims)}"
                )
            col_names = (
                frame.collect_schema().names()
                if isinstance(frame, pl.LazyFrame)
                else frame.columns
            )
            if dim_str not in col_names:
                raise ValueError(
                    f"DataFrame for dim '{dim}' is missing column '{dim_str}'"
                )
            # Collect only the dim column for validation (cheap — just IDs)
            if isinstance(frame, pl.LazyFrame):
                df_dim_values: pl.Series = frame.select(dim_str).collect()[dim_str]
            else:
                df_dim_values = cast(pl.DataFrame, frame)[dim_str]
            ds_dim_values = self.ds[dim_str]
            # np.intersect1d returns the sorted *unique* shared values, so the
            # length comparison below only passes when both sides hold the
            # same values with no duplicates on either side.
            common = np.intersect1d(ds_dim_values.to_numpy(), df_dim_values.to_numpy())
            if len(common) != len(ds_dim_values) or len(common) != len(df_dim_values):
                raise ValueError(
                    f"DataFrame for dim '{dim}' values don't match ds['{dim_str}'] coordinates. "
                    f"Expected {len(ds_dim_values)} number of shared values, got {len(common)}"
                )
            new_df[dim] = _reorder(frame, ds_dim_values)
        # The class is frozen, so bypass attrs' __setattr__ guard to store the
        # reordered frames.
        object.__setattr__(self, "df", new_df)

    @classmethod
    def open(cls, path: str | Path) -> Xolars[pl.LazyFrame]:
        """Open a directory previously produced by :meth:`write`.

        Args:
            path: Directory containing ``dataset.zarr`` and one
                ``<dim>.parquet`` file per frame. A plain string is accepted
                and converted to a ``Path``.

        Returns:
            A container whose frames are lazily scanned parquet files, keyed
            by each file's stem.
        """
        # Coerce so string paths work with the ``/`` and ``.glob`` calls below.
        path = Path(path)
        ds = xr.open_zarr(path / "dataset.zarr")
        df: dict[Hashable, pl.LazyFrame] = {}
        for parquet_path in sorted(path.glob("*.parquet")):
            df[parquet_path.stem] = pl.scan_parquet(parquet_path)
        return Xolars(ds=ds, df=df)

    def write(self, path: str | Path, mode: ZarrWriteModes):
        """Persist the Dataset as zarr and each frame as parquet under ``path``.

        Args:
            path: Target directory; created (with parents) if missing. A plain
                string is accepted and converted to a ``Path``.
            mode: Forwarded to ``Dataset.to_zarr``.
        """
        # Coerce so string paths work with ``.mkdir`` and ``/`` below.
        path = Path(path)
        path.mkdir(parents=True, exist_ok=True)
        self.ds.to_zarr(path / "dataset.zarr", mode=mode)
        for dim, frame in self.df.items():
            out = path / f"{dim}.parquet"
            if isinstance(frame, pl.LazyFrame):
                frame.sink_parquet(out)
            else:
                cast(pl.DataFrame, frame).write_parquet(out)

    def isel(
        self,
        indexers: Mapping[Any, Any] | None = None,
        drop: bool = False,
        missing_dims: ErrorOptionsWithWarn = "raise",
        **indexers_kwargs: Any,
    ) -> Self:
        """Positionally index the Dataset and filter the affected frames.

        ``indexers`` and ``indexers_kwargs`` are merged (keyword arguments
        win on conflict) and forwarded to ``Dataset.isel`` together with
        ``drop`` and ``missing_dims``. Frames for indexed dimensions are
        filtered to the values remaining in the result; a frame whose
        dimension no longer exists in the result is dropped.
        """
        merged = dict(indexers or {})
        merged.update(indexers_kwargs)
        new_ds = self.ds.isel(merged, drop=drop, missing_dims=missing_dims)
        return evolve(self, ds=new_ds, df=_filter_df(self.df, new_ds, merged))

    def sel(
        self,
        indexers: Mapping[Any, Any] | None = None,
        method: str | None = None,
        tolerance: int | float | Iterable[int | float] | None = None,
        drop: bool = False,
        **indexers_kwargs: Any,
    ) -> Self:
        """Label-index the Dataset and filter the affected frames.

        ``indexers`` and ``indexers_kwargs`` are merged (keyword arguments
        win on conflict) and forwarded to ``Dataset.sel`` together with
        ``method``, ``tolerance`` and ``drop``. Frames for indexed dimensions
        are filtered to the values remaining in the result; a frame whose
        dimension no longer exists in the result is dropped.
        """
        merged = dict(indexers or {})
        merged.update(indexers_kwargs)
        new_ds = self.ds.sel(merged, method=method, tolerance=tolerance, drop=drop)
        return evolve(self, ds=new_ds, df=_filter_df(self.df, new_ds, merged))

    def collect(self) -> Xolars[pl.DataFrame]:
        """Return a container in which every lazy frame has been collected.

        Frames that are already eager are passed through unchanged.
        """
        df: dict[Hashable, pl.DataFrame] = {
            k: v.collect() if isinstance(v, pl.LazyFrame) else cast(pl.DataFrame, v)
            for k, v in self.df.items()
        }
        return Xolars(self.ds, df)
104
+
105
+
106
def _reorder(frame: F, coords: xr.DataArray) -> F:
    """Return ``frame`` with its rows arranged in the order of ``coords``.

    A helper table holding the coordinate values and their positions is
    right-joined onto ``frame`` using the column named after ``coords``; the
    result is sorted by position and the temporary ``__i__`` position column
    is removed. Lazy input stays lazy, eager input stays eager.
    """
    key = str(coords.name)
    # One row per coordinate value, tagged with its position in ``coords``.
    positions = pl.DataFrame({key: coords.to_numpy()}).with_row_index("__i__")
    if isinstance(frame, pl.LazyFrame):
        aligned = frame.join(positions.lazy(), on=key, how="right")
    else:
        aligned = cast(pl.DataFrame, frame).join(positions, on=key, how="right")
    ordered = aligned.sort("__i__")
    return cast(F, ordered.drop("__i__"))
115
+
116
+
117
def _filter_df(
    df: Mapping[Hashable, F],
    new_ds: xr.Dataset,
    merged: dict[str, Any],
) -> dict[Hashable, F]:
    """Restrict the frames of indexed dimensions to the values left in ``new_ds``.

    Frames whose dimension is not a key of ``merged`` are passed through
    untouched. A frame whose dimension was indexed but is no longer a
    dimension of ``new_ds`` is omitted from the result. Every other indexed
    frame keeps only the rows whose dimension column value appears in
    ``new_ds``; row order is not changed here.
    """
    filtered: dict[Hashable, F] = {}
    for dim, frame in df.items():
        name = str(dim)
        if name not in merged:
            # Dimension was not indexed: keep the frame as it is.
            filtered[dim] = frame
            continue
        if name not in new_ds.dims:
            # scalar index dropped this dimension
            continue
        surviving = new_ds[name].to_numpy()
        filtered[dim] = cast(F, frame.filter(pl.col(name).is_in(surviving)))
    return filtered
File without changes