xolars 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xolars-0.2.0/PKG-INFO +96 -0
- xolars-0.2.0/README.md +68 -0
- xolars-0.2.0/pyproject.toml +55 -0
- xolars-0.2.0/src/xolars/__init__.py +12 -0
- xolars-0.2.0/src/xolars/_core.py +133 -0
- xolars-0.2.0/src/xolars/py.typed +0 -0
xolars-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: xolars
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Xarray + Polars: an xarray Dataset paired with per-dimension Polars frames that stay aligned under selection.
|
|
5
|
+
Keywords: xarray,polars,zarr,dataframe,dataset,bioinformatics
|
|
6
|
+
Author: d-laub
|
|
7
|
+
Author-email: d-laub <dlaub@ucsd.edu>
|
|
8
|
+
License-Expression: Apache-2.0
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Science/Research
|
|
11
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Classifier: Typing :: Typed
|
|
17
|
+
Requires-Dist: numpy<3
|
|
18
|
+
Requires-Dist: polars>=1.39,<2
|
|
19
|
+
Requires-Dist: xarray>=2026.2.0,<2027
|
|
20
|
+
Requires-Dist: attrs>=24
|
|
21
|
+
Requires-Dist: typing-extensions>=4.10
|
|
22
|
+
Requires-Dist: zarr>=3.1.5,<4
|
|
23
|
+
Requires-Python: >=3.11
|
|
24
|
+
Project-URL: Homepage, https://github.com/d-laub/xolars
|
|
25
|
+
Project-URL: Repository, https://github.com/d-laub/xolars
|
|
26
|
+
Project-URL: Issues, https://github.com/d-laub/xolars/issues
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
|
|
29
|
+
# xolars
|
|
30
|
+
|
|
31
|
+
**X**array + p**olars**: an [xarray](https://xarray.dev) `Dataset` paired with one
|
|
32
|
+
[Polars](https://pola.rs) frame per dimension, kept aligned to the Dataset's
|
|
33
|
+
coordinate order — including under `isel`/`sel` selection and zarr + parquet
|
|
34
|
+
round-trips.
|
|
35
|
+
|
|
36
|
+
## Install
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
uv add xolars # or: pip install xolars
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Usage
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
import numpy as np
|
|
46
|
+
import polars as pl
|
|
47
|
+
import xarray as xr
|
|
48
|
+
from xolars import Xolars
|
|
49
|
+
|
|
50
|
+
ds = xr.Dataset(
|
|
51
|
+
{"expr": (["gene_id", "sample_id"], np.arange(12.0).reshape(3, 4))},
|
|
52
|
+
coords={"gene_id": ["G1", "G2", "G3"], "sample_id": ["S1", "S2", "S3", "S4"]},
|
|
53
|
+
)
|
|
54
|
+
genes = pl.DataFrame({"gene_id": ["G1", "G2", "G3"], "chrom": ["c1", "c2", "c3"]})
|
|
55
|
+
|
|
56
|
+
xol = Xolars(ds=ds, df={"gene_id": genes})
|
|
57
|
+
|
|
58
|
+
# Selection filters the Dataset AND every per-dimension frame, together:
|
|
59
|
+
sub = xol.sel(gene_id=["G3", "G1"])
|
|
60
|
+
assert list(sub.df["gene_id"]["gene_id"]) == list(sub.ds["gene_id"].values)
|
|
61
|
+
|
|
62
|
+
# Persist to zarr (Dataset) + parquet (per-dim frames), then reopen lazily:
|
|
63
|
+
xol.write("mydata.xolars", mode="w")
|
|
64
|
+
reloaded = Xolars.open("mydata.xolars") # frames are pl.LazyFrame
|
|
65
|
+
eager = reloaded.collect() # -> pl.DataFrame
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
`Xolars` is a frozen, generic container: `Xolars[pl.LazyFrame]` after `open`,
|
|
69
|
+
`Xolars[pl.DataFrame]` after `collect`. Construction validates that each frame's
|
|
70
|
+
dim column exactly matches the Dataset coordinate (same set, same multiplicity)
|
|
71
|
+
and reorders rows to the Dataset's order.
|
|
72
|
+
|
|
73
|
+
## Development
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
uv sync # create .venv with runtime + dev deps
|
|
77
|
+
uv run pytest -q # tests
|
|
78
|
+
uv run prek run --all-files # ruff + pyrefly + hygiene hooks
|
|
79
|
+
uv run prek install # enable git hooks locally
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Releasing
|
|
83
|
+
|
|
84
|
+
Releases run via the manual `Release` GitHub Actions workflow
|
|
85
|
+
(`workflow_dispatch`), which uses [commitizen](https://commitizen-tools.github.io/commitizen/)
|
|
86
|
+
to bump the version from conventional commits, tag, create a GitHub release, and
|
|
87
|
+
publish to PyPI via OIDC trusted publishing. Before the workflow can succeed,
|
|
88
|
+
the following one-time setup is required on GitHub (out of scope for the initial
|
|
89
|
+
extraction):
|
|
90
|
+
|
|
91
|
+
1. Push this repository to `https://github.com/d-laub/xolars`.
|
|
92
|
+
2. Add a `GH_ACTIONS` repository secret (a PAT able to push to `main`) so the
|
|
93
|
+
bump commit + tag can be pushed past branch protection.
|
|
94
|
+
3. Configure a `pypi` environment and enable
|
|
95
|
+
[PyPI trusted publishing](https://docs.pypi.org/trusted-publishers/) for the
|
|
96
|
+
`xolars` project pointing at the `publish` job.
|
xolars-0.2.0/README.md
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# xolars
|
|
2
|
+
|
|
3
|
+
**X**array + p**olars**: an [xarray](https://xarray.dev) `Dataset` paired with one
|
|
4
|
+
[Polars](https://pola.rs) frame per dimension, kept aligned to the Dataset's
|
|
5
|
+
coordinate order — including under `isel`/`sel` selection and zarr + parquet
|
|
6
|
+
round-trips.
|
|
7
|
+
|
|
8
|
+
## Install
|
|
9
|
+
|
|
10
|
+
```bash
|
|
11
|
+
uv add xolars # or: pip install xolars
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
## Usage
|
|
15
|
+
|
|
16
|
+
```python
|
|
17
|
+
import numpy as np
|
|
18
|
+
import polars as pl
|
|
19
|
+
import xarray as xr
|
|
20
|
+
from xolars import Xolars
|
|
21
|
+
|
|
22
|
+
ds = xr.Dataset(
|
|
23
|
+
{"expr": (["gene_id", "sample_id"], np.arange(12.0).reshape(3, 4))},
|
|
24
|
+
coords={"gene_id": ["G1", "G2", "G3"], "sample_id": ["S1", "S2", "S3", "S4"]},
|
|
25
|
+
)
|
|
26
|
+
genes = pl.DataFrame({"gene_id": ["G1", "G2", "G3"], "chrom": ["c1", "c2", "c3"]})
|
|
27
|
+
|
|
28
|
+
xol = Xolars(ds=ds, df={"gene_id": genes})
|
|
29
|
+
|
|
30
|
+
# Selection filters the Dataset AND every per-dimension frame, together:
|
|
31
|
+
sub = xol.sel(gene_id=["G3", "G1"])
|
|
32
|
+
assert list(sub.df["gene_id"]["gene_id"]) == list(sub.ds["gene_id"].values)
|
|
33
|
+
|
|
34
|
+
# Persist to zarr (Dataset) + parquet (per-dim frames), then reopen lazily:
|
|
35
|
+
xol.write("mydata.xolars", mode="w")
|
|
36
|
+
reloaded = Xolars.open("mydata.xolars") # frames are pl.LazyFrame
|
|
37
|
+
eager = reloaded.collect() # -> pl.DataFrame
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
`Xolars` is a frozen, generic container: `Xolars[pl.LazyFrame]` after `open`,
|
|
41
|
+
`Xolars[pl.DataFrame]` after `collect`. Construction validates that each frame's
|
|
42
|
+
dim column exactly matches the Dataset coordinate (same set of values, with no duplicates)
|
|
43
|
+
and reorders rows to the Dataset's order.
|
|
44
|
+
|
|
45
|
+
## Development
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
uv sync # create .venv with runtime + dev deps
|
|
49
|
+
uv run pytest -q # tests
|
|
50
|
+
uv run prek run --all-files # ruff + pyrefly + hygiene hooks
|
|
51
|
+
uv run prek install # enable git hooks locally
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Releasing
|
|
55
|
+
|
|
56
|
+
Releases run via the manual `Release` GitHub Actions workflow
|
|
57
|
+
(`workflow_dispatch`), which uses [commitizen](https://commitizen-tools.github.io/commitizen/)
|
|
58
|
+
to bump the version from conventional commits, tag, create a GitHub release, and
|
|
59
|
+
publish to PyPI via OIDC trusted publishing. Before the workflow can succeed,
|
|
60
|
+
the following one-time setup is required on GitHub (out of scope for the initial
|
|
61
|
+
extraction):
|
|
62
|
+
|
|
63
|
+
1. Push this repository to `https://github.com/d-laub/xolars`.
|
|
64
|
+
2. Add a `GH_ACTIONS` repository secret (a PAT able to push to `main`) so the
|
|
65
|
+
bump commit + tag can be pushed past branch protection.
|
|
66
|
+
3. Configure a `pypi` environment and enable
|
|
67
|
+
[PyPI trusted publishing](https://docs.pypi.org/trusted-publishers/) for the
|
|
68
|
+
`xolars` project pointing at the `publish` job.
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["uv_build>=0.9.17,<0.10.0"]
|
|
3
|
+
build-backend = "uv_build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "xolars"
|
|
7
|
+
version = "0.2.0"
|
|
8
|
+
description = "Xarray + Polars: an xarray Dataset paired with per-dimension Polars frames that stay aligned under selection."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "Apache-2.0"
|
|
11
|
+
requires-python = ">=3.11"
|
|
12
|
+
authors = [{ name = "d-laub", email = "dlaub@ucsd.edu" }]
|
|
13
|
+
keywords = ["xarray", "polars", "zarr", "dataframe", "dataset", "bioinformatics"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 3 - Alpha",
|
|
16
|
+
"Intended Audience :: Science/Research",
|
|
17
|
+
"License :: OSI Approved :: Apache Software License",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Programming Language :: Python :: 3.11",
|
|
20
|
+
"Programming Language :: Python :: 3.12",
|
|
21
|
+
"Programming Language :: Python :: 3.13",
|
|
22
|
+
"Typing :: Typed",
|
|
23
|
+
]
|
|
24
|
+
dependencies = [
|
|
25
|
+
"numpy<3",
|
|
26
|
+
"polars>=1.39,<2",
|
|
27
|
+
"xarray>=2026.2.0,<2027",
|
|
28
|
+
"attrs>=24",
|
|
29
|
+
"typing_extensions>=4.10",
|
|
30
|
+
"zarr>=3.1.5,<4",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.urls]
|
|
34
|
+
Homepage = "https://github.com/d-laub/xolars"
|
|
35
|
+
Repository = "https://github.com/d-laub/xolars"
|
|
36
|
+
Issues = "https://github.com/d-laub/xolars/issues"
|
|
37
|
+
|
|
38
|
+
[dependency-groups]
|
|
39
|
+
dev = [
|
|
40
|
+
"pytest>=8",
|
|
41
|
+
"ruff>=0.15",
|
|
42
|
+
"pyrefly>=0.16",
|
|
43
|
+
"prek>=0.2",
|
|
44
|
+
"commitizen>=4",
|
|
45
|
+
]
|
|
46
|
+
|
|
47
|
+
[tool.commitizen]
|
|
48
|
+
name = "cz_conventional_commits"
|
|
49
|
+
version_provider = "pep621"
|
|
50
|
+
tag_format = "v$version"
|
|
51
|
+
update_changelog_on_bump = true
|
|
52
|
+
major_version_zero = true
|
|
53
|
+
|
|
54
|
+
[tool.pyrefly]
|
|
55
|
+
project-includes = ["src", "tests"]
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""xolars — an xarray Dataset paired with per-dimension Polars frames.
|
|
2
|
+
|
|
3
|
+
The :class:`Xolars` container keeps an :class:`xarray.Dataset` and one Polars
|
|
4
|
+
frame per dimension aligned to the Dataset's coordinate order, including under
|
|
5
|
+
``isel``/``sel`` selection and zarr+parquet round-trips.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from xolars._core import Xolars
|
|
11
|
+
|
|
12
|
+
__all__ = ["Xolars"]
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Hashable, Mapping
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any, Generic, Iterable, TypeVar, cast
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
import polars as pl
|
|
9
|
+
import xarray as xr
|
|
10
|
+
from attrs import define, evolve
|
|
11
|
+
from typing_extensions import Self
|
|
12
|
+
from xarray.core.types import ErrorOptionsWithWarn, ZarrWriteModes
|
|
13
|
+
|
|
14
|
+
# Type of the per-dimension frames held by a container: either an eager
# ``pl.DataFrame`` or a lazy ``pl.LazyFrame`` (the TypeVar is bound to their union).
F = TypeVar("F", bound=pl.DataFrame | pl.LazyFrame)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@define(frozen=True)
class Xolars(Generic[F]):
    """An xarray Dataset paired with one Polars frame per dimension.

    ``df`` maps a dimension name of ``ds`` to a frame that contains a column
    of the same name. On construction every frame is validated against the
    Dataset's values for that dimension and its rows are reordered to the
    Dataset's order (see ``__attrs_post_init__``).

    The class is generic over the frame type ``F``: ``open`` produces
    ``Xolars[pl.LazyFrame]`` and ``collect`` produces ``Xolars[pl.DataFrame]``.
    """

    ds: xr.Dataset
    df: Mapping[Hashable, F]

    def __attrs_post_init__(self) -> None:
        """Validate each frame against ``ds`` and store it in coordinate order.

        For every ``(dim, frame)`` entry of ``df``:

        * ``str(dim)`` must be a dimension of ``ds``;
        * the frame must have a column named ``str(dim)``;
        * that column and ``ds[str(dim)]`` must hold the same values. The
          check compares lengths against ``np.intersect1d``, which returns
          unique values, so a duplicated value on either side is rejected.

        Raises:
            ValueError: if any of the checks above fails.
        """
        aligned: dict[Hashable, F] = {}
        for dim, frame in self.df.items():
            dim_str = str(dim)
            if dim_str not in self.ds.sizes:
                raise ValueError(
                    f"'{dim}' is not a dimension in ds. Available: {list(self.ds.dims)}"
                )
            col_names = (
                frame.collect_schema().names()
                if isinstance(frame, pl.LazyFrame)
                else frame.columns
            )
            if dim_str not in col_names:
                raise ValueError(
                    f"DataFrame for dim '{dim}' is missing column '{dim_str}'"
                )
            # Collect only the dim column for validation (cheap — just IDs)
            if isinstance(frame, pl.LazyFrame):
                id_col: pl.Series = frame.select(dim_str).collect()[dim_str]
            else:
                id_col = cast(pl.DataFrame, frame)[dim_str]
            ds_dim_values = self.ds[dim_str]
            common = np.intersect1d(ds_dim_values.to_numpy(), id_col.to_numpy())
            if len(common) != len(ds_dim_values) or len(common) != len(id_col):
                raise ValueError(
                    f"DataFrame for dim '{dim}' values don't match ds['{dim_str}'] coordinates. "
                    f"Expected {len(ds_dim_values)} number of shared values, got {len(common)}"
                )
            aligned[dim] = _reorder(frame, ds_dim_values)
        # The class is frozen, so the attribute has to be replaced through
        # object.__setattr__ rather than a normal assignment.
        object.__setattr__(self, "df", aligned)

    @classmethod
    def open(cls, path: str | Path) -> Xolars[pl.LazyFrame]:
        """Open a container previously stored with ``write``.

        Args:
            path: Directory holding ``dataset.zarr`` and one
                ``<dim>.parquet`` file per frame. A plain string is accepted
                and converted to a ``Path``.

        Returns:
            A container whose frames are lazy parquet scans, keyed by the
            stem of each ``*.parquet`` file found directly under ``path``.
        """
        # Coerce first: the README passes a str, which has no ``/`` or ``glob``.
        root = Path(path)
        ds = xr.open_zarr(root / "dataset.zarr")
        df: dict[Hashable, pl.LazyFrame] = {}
        for parquet_path in sorted(root.glob("*.parquet")):
            df[parquet_path.stem] = pl.scan_parquet(parquet_path)
        return Xolars(ds=ds, df=df)

    def write(self, path: str | Path, mode: ZarrWriteModes) -> None:
        """Store the Dataset as zarr and every frame as parquet under ``path``.

        Args:
            path: Target directory; created (with parents) if missing. A plain
                string is accepted and converted to a ``Path``.
            mode: Forwarded to ``Dataset.to_zarr`` only; the parquet files are
                written without consulting it.
        """
        root = Path(path)
        root.mkdir(parents=True, exist_ok=True)
        self.ds.to_zarr(root / "dataset.zarr", mode=mode)
        for dim, frame in self.df.items():
            out = root / f"{dim}.parquet"
            if isinstance(frame, pl.LazyFrame):
                frame.sink_parquet(out)
            else:
                cast(pl.DataFrame, frame).write_parquet(out)

    def isel(
        self,
        indexers: Mapping[Any, Any] | None = None,
        drop: bool = False,
        missing_dims: ErrorOptionsWithWarn = "raise",
        **indexers_kwargs: Any,
    ) -> Self:
        """Positional selection applied to the Dataset and the matching frames.

        ``indexers`` and ``indexers_kwargs`` are merged (keyword entries win)
        and passed to ``Dataset.isel`` together with ``drop`` and
        ``missing_dims``. Frames of indexed dimensions are filtered to the
        values left in the result; the new container is built with ``evolve``.
        """
        merged = dict(indexers or {})
        merged.update(indexers_kwargs)
        new_ds = self.ds.isel(merged, drop=drop, missing_dims=missing_dims)
        return evolve(self, ds=new_ds, df=_filter_df(self.df, new_ds, merged))

    def sel(
        self,
        indexers: Mapping[Any, Any] | None = None,
        method: str | None = None,
        tolerance: int | float | Iterable[int | float] | None = None,
        drop: bool = False,
        **indexers_kwargs: Any,
    ) -> Self:
        """Label-based selection applied to the Dataset and the matching frames.

        ``indexers`` and ``indexers_kwargs`` are merged (keyword entries win)
        and passed to ``Dataset.sel`` together with ``method``, ``tolerance``
        and ``drop``. Frames of indexed dimensions are filtered to the values
        left in the result; the new container is built with ``evolve``.
        """
        merged = dict(indexers or {})
        merged.update(indexers_kwargs)
        new_ds = self.ds.sel(merged, method=method, tolerance=tolerance, drop=drop)
        return evolve(self, ds=new_ds, df=_filter_df(self.df, new_ds, merged))

    def collect(self) -> Xolars[pl.DataFrame]:
        """Return a container in which every lazy frame has been collected.

        Frames that are already ``pl.DataFrame`` are passed through as-is.
        """
        df: dict[Hashable, pl.DataFrame] = {
            k: v.collect() if isinstance(v, pl.LazyFrame) else cast(pl.DataFrame, v)
            for k, v in self.df.items()
        }
        return Xolars(self.ds, df)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _reorder(frame: F, coords: xr.DataArray) -> F:
    """Return ``frame`` with its rows arranged in the order of ``coords``.

    The coordinate values are given a running row index, right-joined onto
    the frame by the coordinate's name, and the result is sorted by that
    index; the temporary index column is dropped before returning.
    """
    key = str(coords.name)
    rank_col = "__i__"
    ranked = pl.DataFrame({key: coords.to_numpy()}).with_row_index(rank_col)
    if isinstance(frame, pl.LazyFrame):
        lazy_joined = frame.join(ranked.lazy(), on=key, how="right")
        return cast(F, lazy_joined.sort(rank_col).drop(rank_col))
    eager_joined = cast(pl.DataFrame, frame).join(ranked, on=key, how="right")
    return cast(F, eager_joined.sort(rank_col).drop(rank_col))
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _filter_df(
    df: Mapping[Hashable, F],
    new_ds: xr.Dataset,
    merged: dict[str, Any],
) -> dict[Hashable, F]:
    """Restrict the frames of indexed dimensions to the values left in ``new_ds``.

    Frames whose dimension is not a key of ``merged`` are carried over
    unchanged. A frame whose dimension was indexed but is no longer a
    dimension of ``new_ds`` is left out of the result.
    """
    filtered: dict[Hashable, F] = {}
    for dim, frame in df.items():
        name = str(dim)
        if name not in merged:
            # Dimension was not indexed: keep the frame untouched.
            filtered[dim] = frame
            continue
        if name not in new_ds.dims:
            # scalar index dropped this dimension
            continue
        surviving = new_ds[name].to_numpy()
        filtered[dim] = cast(F, frame.filter(pl.col(name).is_in(surviving)))
    return filtered
|
|
File without changes
|