ngio 0.4.8__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ngio/__init__.py +5 -2
- ngio/common/__init__.py +11 -6
- ngio/common/_masking_roi.py +34 -54
- ngio/common/_pyramid.py +322 -75
- ngio/common/_roi.py +258 -330
- ngio/experimental/iterators/_feature.py +3 -3
- ngio/experimental/iterators/_rois_utils.py +10 -11
- ngio/hcs/_plate.py +192 -136
- ngio/images/_abstract_image.py +539 -35
- ngio/images/_create_synt_container.py +45 -47
- ngio/images/_create_utils.py +406 -0
- ngio/images/_image.py +524 -248
- ngio/images/_label.py +257 -180
- ngio/images/_masked_image.py +2 -2
- ngio/images/_ome_zarr_container.py +658 -255
- ngio/io_pipes/_io_pipes.py +9 -9
- ngio/io_pipes/_io_pipes_masked.py +7 -7
- ngio/io_pipes/_io_pipes_roi.py +6 -6
- ngio/io_pipes/_io_pipes_types.py +3 -3
- ngio/io_pipes/_match_shape.py +6 -8
- ngio/io_pipes/_ops_slices_utils.py +8 -5
- ngio/ome_zarr_meta/__init__.py +29 -18
- ngio/ome_zarr_meta/_meta_handlers.py +402 -689
- ngio/ome_zarr_meta/ngio_specs/__init__.py +4 -0
- ngio/ome_zarr_meta/ngio_specs/_axes.py +152 -51
- ngio/ome_zarr_meta/ngio_specs/_dataset.py +13 -22
- ngio/ome_zarr_meta/ngio_specs/_ngio_hcs.py +129 -91
- ngio/ome_zarr_meta/ngio_specs/_ngio_image.py +69 -69
- ngio/ome_zarr_meta/v04/__init__.py +5 -1
- ngio/ome_zarr_meta/v04/{_v04_spec_utils.py → _v04_spec.py} +55 -86
- ngio/ome_zarr_meta/v05/__init__.py +27 -0
- ngio/ome_zarr_meta/v05/_custom_models.py +18 -0
- ngio/ome_zarr_meta/v05/_v05_spec.py +495 -0
- ngio/resources/__init__.py +1 -1
- ngio/resources/resource_model.py +1 -1
- ngio/tables/_tables_container.py +82 -24
- ngio/tables/backends/_abstract_backend.py +7 -0
- ngio/tables/backends/_anndata.py +60 -7
- ngio/tables/backends/_anndata_utils.py +2 -4
- ngio/tables/backends/_csv.py +3 -19
- ngio/tables/backends/_json.py +10 -13
- ngio/tables/backends/_parquet.py +3 -31
- ngio/tables/backends/_py_arrow_backends.py +222 -0
- ngio/tables/backends/_utils.py +1 -1
- ngio/tables/v1/_roi_table.py +41 -24
- ngio/utils/__init__.py +8 -12
- ngio/utils/_cache.py +48 -0
- ngio/utils/_zarr_utils.py +354 -236
- {ngio-0.4.8.dist-info → ngio-0.5.0.dist-info}/METADATA +12 -5
- ngio-0.5.0.dist-info/RECORD +88 -0
- ngio/images/_create.py +0 -276
- ngio/tables/backends/_non_zarr_backends.py +0 -196
- ngio/utils/_logger.py +0 -50
- ngio-0.4.8.dist-info/RECORD +0 -85
- {ngio-0.4.8.dist-info → ngio-0.5.0.dist-info}/WHEEL +0 -0
- {ngio-0.4.8.dist-info → ngio-0.5.0.dist-info}/licenses/LICENSE +0 -0
ngio/tables/_tables_container.py
CHANGED
|
@@ -229,10 +229,10 @@ class ImplementedTables:
|
|
|
229
229
|
|
|
230
230
|
|
|
231
231
|
class TablesContainer:
|
|
232
|
-
"""A class to handle the /
|
|
232
|
+
"""A class to handle the /tables group in an OME-NGFF file."""
|
|
233
233
|
|
|
234
234
|
def __init__(self, group_handler: ZarrGroupHandler) -> None:
|
|
235
|
-
"""Initialize the
|
|
235
|
+
"""Initialize the TablesContainer."""
|
|
236
236
|
self._group_handler = group_handler
|
|
237
237
|
|
|
238
238
|
# Validate the group
|
|
@@ -252,17 +252,24 @@ class TablesContainer:
|
|
|
252
252
|
)
|
|
253
253
|
|
|
254
254
|
def _get_tables_list(self) -> list[str]:
|
|
255
|
-
"""
|
|
255
|
+
"""Return the list of table names from the group attributes."""
|
|
256
256
|
attrs = self._group_handler.load_attrs()
|
|
257
257
|
return attrs.get("tables", [])
|
|
258
258
|
|
|
259
259
|
def _get_table_group_handler(self, name: str) -> ZarrGroupHandler:
|
|
260
260
|
"""Get the group handler for a table."""
|
|
261
|
-
handler = self._group_handler.
|
|
261
|
+
handler = self._group_handler.get_handler(path=name)
|
|
262
262
|
return handler
|
|
263
263
|
|
|
264
264
|
def list(self, filter_types: TypedTable | str | None = None) -> list[str]:
|
|
265
|
-
"""List all
|
|
265
|
+
"""List all tables in the group.
|
|
266
|
+
|
|
267
|
+
Args:
|
|
268
|
+
filter_types: If provided, only return tables of this type.
|
|
269
|
+
|
|
270
|
+
Returns:
|
|
271
|
+
A list of table names.
|
|
272
|
+
"""
|
|
266
273
|
tables = self._get_tables_list()
|
|
267
274
|
if filter_types is None:
|
|
268
275
|
return tables
|
|
@@ -281,7 +288,16 @@ class TablesContainer:
|
|
|
281
288
|
backend: TableBackend | None = None,
|
|
282
289
|
strict: bool = True,
|
|
283
290
|
) -> Table:
|
|
284
|
-
"""Get a
|
|
291
|
+
"""Get a table from the group.
|
|
292
|
+
|
|
293
|
+
Args:
|
|
294
|
+
name: The name of the table.
|
|
295
|
+
backend: The backend to use for reading the table.
|
|
296
|
+
strict: If True, raise an error if the table type is not implemented.
|
|
297
|
+
|
|
298
|
+
Returns:
|
|
299
|
+
The table object.
|
|
300
|
+
"""
|
|
285
301
|
if name not in self.list():
|
|
286
302
|
raise NgioValueError(f"Table '{name}' not found in the group.")
|
|
287
303
|
|
|
@@ -301,7 +317,16 @@ class TablesContainer:
|
|
|
301
317
|
table_cls: type[TableType],
|
|
302
318
|
backend: TableBackend | None = None,
|
|
303
319
|
) -> TableType:
|
|
304
|
-
"""Get a table from the group as a specific type.
|
|
320
|
+
"""Get a table from the group as a specific type.
|
|
321
|
+
|
|
322
|
+
Args:
|
|
323
|
+
name: The name of the table.
|
|
324
|
+
table_cls: The table class to use for loading the table.
|
|
325
|
+
backend: The backend to use for reading the table.
|
|
326
|
+
|
|
327
|
+
Returns:
|
|
328
|
+
The table object of the specified type.
|
|
329
|
+
"""
|
|
305
330
|
if name not in self.list():
|
|
306
331
|
raise NgioValueError(f"Table '{name}' not found in the group.")
|
|
307
332
|
|
|
@@ -311,6 +336,27 @@ class TablesContainer:
|
|
|
311
336
|
backend=backend,
|
|
312
337
|
) # type: ignore[return-value]
|
|
313
338
|
|
|
339
|
+
def delete(self, name: str, missing_ok: bool = False) -> None:
    """Remove a table from the group and unregister it.

    Args:
        name (str): Name of the table to remove.
        missing_ok (bool): When True, return silently if no table with
            this name is registered.

    Raises:
        NgioValueError: If the table is not registered and ``missing_ok``
            is False.
    """
    registered = self._get_tables_list()
    if name in registered:
        # Drop the table group first, then store the shortened table list.
        self._group_handler.delete_group(name)
        registered.remove(name)
        self._group_handler.write_attrs({"tables": registered})
        return

    if not missing_ok:
        raise NgioValueError(
            f"Table '{name}' not found in the Tables group. "
            f"Available tables: {registered}"
        )
|
|
359
|
+
|
|
314
360
|
def add(
|
|
315
361
|
self,
|
|
316
362
|
name: str,
|
|
@@ -318,7 +364,14 @@ class TablesContainer:
|
|
|
318
364
|
backend: TableBackend = DefaultTableBackend,
|
|
319
365
|
overwrite: bool = False,
|
|
320
366
|
) -> None:
|
|
321
|
-
"""Add a table to the group.
|
|
367
|
+
"""Add a table to the group.
|
|
368
|
+
|
|
369
|
+
Args:
|
|
370
|
+
name: The name of the table.
|
|
371
|
+
table: The table object to add.
|
|
372
|
+
backend: The backend to use for writing the table.
|
|
373
|
+
overwrite: Whether to overwrite an existing table with the same name.
|
|
374
|
+
"""
|
|
322
375
|
existing_tables = self._get_tables_list()
|
|
323
376
|
if name in existing_tables and not overwrite:
|
|
324
377
|
raise NgioValueError(
|
|
@@ -326,9 +379,7 @@ class TablesContainer:
|
|
|
326
379
|
"Use overwrite=True to replace it."
|
|
327
380
|
)
|
|
328
381
|
|
|
329
|
-
table_handler = self._group_handler.
|
|
330
|
-
path=name, overwrite=overwrite
|
|
331
|
-
)
|
|
382
|
+
table_handler = self._group_handler.get_handler(path=name, overwrite=overwrite)
|
|
332
383
|
|
|
333
384
|
if backend is None:
|
|
334
385
|
backend = table.backend_name
|
|
@@ -360,12 +411,9 @@ def open_tables_container(
|
|
|
360
411
|
store: StoreOrGroup,
|
|
361
412
|
cache: bool = False,
|
|
362
413
|
mode: AccessModeLiteral = "r+",
|
|
363
|
-
parallel_safe: bool = False,
|
|
364
414
|
) -> TablesContainer:
|
|
365
415
|
"""Open a table handler from a Zarr store."""
|
|
366
|
-
handler = ZarrGroupHandler(
|
|
367
|
-
store=store, cache=cache, mode=mode, parallel_safe=parallel_safe
|
|
368
|
-
)
|
|
416
|
+
handler = ZarrGroupHandler(store=store, cache=cache, mode=mode)
|
|
369
417
|
return TablesContainer(handler)
|
|
370
418
|
|
|
371
419
|
|
|
@@ -374,11 +422,12 @@ def open_table(
|
|
|
374
422
|
backend: TableBackend | None = None,
|
|
375
423
|
cache: bool = False,
|
|
376
424
|
mode: AccessModeLiteral = "r+",
|
|
377
|
-
parallel_safe: bool = False,
|
|
378
425
|
) -> Table:
|
|
379
426
|
"""Open a table from a Zarr store."""
|
|
380
427
|
handler = ZarrGroupHandler(
|
|
381
|
-
store=store,
|
|
428
|
+
store=store,
|
|
429
|
+
cache=cache,
|
|
430
|
+
mode=mode,
|
|
382
431
|
)
|
|
383
432
|
meta = _get_meta(handler)
|
|
384
433
|
return ImplementedTables().get_table(
|
|
@@ -392,11 +441,12 @@ def open_table_as(
|
|
|
392
441
|
backend: TableBackend | None = None,
|
|
393
442
|
cache: bool = False,
|
|
394
443
|
mode: AccessModeLiteral = "r+",
|
|
395
|
-
parallel_safe: bool = False,
|
|
396
444
|
) -> TableType:
|
|
397
445
|
"""Open a table from a Zarr store as a specific type."""
|
|
398
446
|
handler = ZarrGroupHandler(
|
|
399
|
-
store=store,
|
|
447
|
+
store=store,
|
|
448
|
+
cache=cache,
|
|
449
|
+
mode=mode,
|
|
400
450
|
)
|
|
401
451
|
return table_cls.from_handler(
|
|
402
452
|
handler=handler,
|
|
@@ -410,12 +460,20 @@ def write_table(
|
|
|
410
460
|
backend: TableBackend = DefaultTableBackend,
|
|
411
461
|
cache: bool = False,
|
|
412
462
|
mode: AccessModeLiteral = "a",
|
|
413
|
-
parallel_safe: bool = False,
|
|
414
463
|
) -> None:
|
|
415
|
-
"""Write a table to a Zarr store.
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
464
|
+
"""Write a table to a Zarr store.
|
|
465
|
+
|
|
466
|
+
A table will be created at the given store location.
|
|
467
|
+
|
|
468
|
+
Args:
|
|
469
|
+
store (StoreOrGroup): The Zarr store or group to write the table to.
|
|
470
|
+
table (Table): The table to write.
|
|
471
|
+
backend (TableBackend): The backend to use for writing the table.
|
|
472
|
+
cache (bool): Whether to use caching for the Zarr group handler.
|
|
473
|
+
mode (AccessModeLiteral): The access mode to use for the Zarr group handler.
|
|
474
|
+
|
|
475
|
+
"""
|
|
476
|
+
handler = ZarrGroupHandler(store=store, cache=cache, mode=mode)
|
|
419
477
|
table.set_backend(
|
|
420
478
|
handler=handler,
|
|
421
479
|
backend=backend,
|
|
@@ -198,6 +198,13 @@ class AbstractTableBackend(ABC):
|
|
|
198
198
|
if metadata is None:
|
|
199
199
|
metadata = {}
|
|
200
200
|
|
|
201
|
+
attrs = self._group_handler.reopen_group().attrs.asdict()
|
|
202
|
+
# This is required by anndata to identify the format
|
|
203
|
+
if "encoding-type" in attrs:
|
|
204
|
+
metadata["encoding-type"] = attrs["encoding-type"]
|
|
205
|
+
if "encoding-version" in attrs:
|
|
206
|
+
metadata["encoding-version"] = attrs["encoding-version"]
|
|
207
|
+
|
|
201
208
|
backend_metadata = BackendMeta(
|
|
202
209
|
backend=self.backend_name(),
|
|
203
210
|
index_key=self.index_key,
|
ngio/tables/backends/_anndata.py
CHANGED
|
@@ -1,7 +1,10 @@
|
|
|
1
|
+
import zarr
|
|
1
2
|
from anndata import AnnData
|
|
3
|
+
from anndata._settings import settings
|
|
2
4
|
from pandas import DataFrame
|
|
3
5
|
from polars import DataFrame as PolarsDataFrame
|
|
4
6
|
from polars import LazyFrame
|
|
7
|
+
from zarr.storage import FsspecStore, LocalStore, MemoryStore
|
|
5
8
|
|
|
6
9
|
from ngio.tables.backends._abstract_backend import AbstractTableBackend
|
|
7
10
|
from ngio.tables.backends._anndata_utils import (
|
|
@@ -12,7 +15,7 @@ from ngio.tables.backends._utils import (
|
|
|
12
15
|
convert_polars_to_anndata,
|
|
13
16
|
normalize_anndata,
|
|
14
17
|
)
|
|
15
|
-
from ngio.utils import NgioValueError
|
|
18
|
+
from ngio.utils import NgioValueError, copy_group
|
|
16
19
|
|
|
17
20
|
|
|
18
21
|
class AnnDataBackend(AbstractTableBackend):
|
|
@@ -40,6 +43,7 @@ class AnnDataBackend(AbstractTableBackend):
|
|
|
40
43
|
|
|
41
44
|
def load_as_anndata(self) -> AnnData:
|
|
42
45
|
"""Load the table as an AnnData object."""
|
|
46
|
+
settings.zarr_write_format = self._group_handler.zarr_format
|
|
43
47
|
anndata = custom_anndata_read_zarr(self._group_handler._group)
|
|
44
48
|
anndata = normalize_anndata(anndata, index_key=self.index_key)
|
|
45
49
|
return anndata
|
|
@@ -48,17 +52,66 @@ class AnnDataBackend(AbstractTableBackend):
|
|
|
48
52
|
"""Load the table as an AnnData object."""
|
|
49
53
|
return self.load_as_anndata()
|
|
50
54
|
|
|
55
|
+
def _write_to_local_store(
    self, store: LocalStore, path: str, table: AnnData
) -> None:
    """Serialize the AnnData table below the root of a LocalStore.

    Args:
        store: Local store whose ``root`` is used as the base location.
        path: Path of the target group, appended to the store root.
        table: The AnnData object to write.
    """
    # AnnData receives a plain "<root>/<group path>" string location.
    table.write_zarr(f"{store.root}/{path}")
|
|
61
|
+
|
|
62
|
+
def _write_to_fsspec_store(
    self, store: FsspecStore, path: str, table: AnnData
) -> None:
    """Serialize the AnnData table through the filesystem of a FsspecStore.

    Args:
        store: Fsspec-backed store providing ``path`` and ``fs``.
        path: Path of the target group, appended to the store path.
        table: The AnnData object to write.
    """
    target_url = f"{store.path}/{path}"
    # AnnData writes into a mapper obtained from the store's own filesystem.
    table.write_zarr(store.fs.get_mapper(target_url))
|
|
70
|
+
|
|
71
|
+
def _write_to_memory_store(
    self, store: MemoryStore, path: str, table: AnnData
) -> None:
    """Write the AnnData table into the handler's group via a scratch store.

    The table is first serialized into a fresh, temporary MemoryStore; the
    resulting group is then copied into the group held by the handler.
    The ``store`` and ``path`` arguments are accepted but not used.

    Args:
        store: Unused.
        path: Unused.
        table: The AnnData object to write.
    """
    scratch_store = MemoryStore()
    table.write_zarr(scratch_store)
    copy_group(
        zarr.open_group(scratch_store, mode="r"),
        self._group_handler._group,
        suppress_warnings=True,
    )
|
|
83
|
+
|
|
51
84
|
def write_from_anndata(self, table: AnnData) -> None:
|
|
52
85
|
"""Serialize the table from an AnnData object."""
|
|
53
|
-
|
|
54
|
-
|
|
86
|
+
# Make sure to use the correct zarr format
|
|
87
|
+
settings.zarr_write_format = self._group_handler.zarr_format
|
|
88
|
+
store = self._group_handler.store
|
|
89
|
+
path = self._group_handler.group.path
|
|
90
|
+
if isinstance(store, LocalStore):
|
|
91
|
+
self._write_to_local_store(
|
|
92
|
+
store,
|
|
93
|
+
path,
|
|
94
|
+
table,
|
|
95
|
+
)
|
|
96
|
+
elif isinstance(store, FsspecStore):
|
|
97
|
+
self._write_to_fsspec_store(
|
|
98
|
+
store,
|
|
99
|
+
path,
|
|
100
|
+
table,
|
|
101
|
+
)
|
|
102
|
+
elif isinstance(store, MemoryStore):
|
|
103
|
+
self._write_to_memory_store(
|
|
104
|
+
store,
|
|
105
|
+
path,
|
|
106
|
+
table,
|
|
107
|
+
)
|
|
108
|
+
else:
|
|
55
109
|
raise NgioValueError(
|
|
56
|
-
f"Ngio does not support writing
|
|
57
|
-
f"store of type {type(
|
|
110
|
+
f"Ngio does not support writing an AnnData table to a "
|
|
111
|
+
f"store of type {type(store)}. "
|
|
58
112
|
"Please make sure to use a compatible "
|
|
59
|
-
"store like a
|
|
113
|
+
"store like a LocalStore, or FsspecStore."
|
|
60
114
|
)
|
|
61
|
-
table.write_zarr(full_url) # type: ignore (AnnData writer requires a str path)
|
|
62
115
|
|
|
63
116
|
def write_from_pandas(self, table: DataFrame) -> None:
|
|
64
117
|
"""Serialize the table from a pandas DataFrame."""
|
|
@@ -34,10 +34,6 @@ def custom_anndata_read_zarr(
|
|
|
34
34
|
elem_to_read (Sequence[str] | None): The elements to read from the store.
|
|
35
35
|
"""
|
|
36
36
|
group = open_group_wrapper(store=store, mode="r")
|
|
37
|
-
|
|
38
|
-
if not isinstance(group.store, zarr.DirectoryStore):
|
|
39
|
-
elem_to_read = ["X", "obs", "var"]
|
|
40
|
-
|
|
41
37
|
if elem_to_read is None:
|
|
42
38
|
elem_to_read = [
|
|
43
39
|
"X",
|
|
@@ -87,6 +83,8 @@ def custom_anndata_read_zarr(
|
|
|
87
83
|
if isinstance(group["obs"], zarr.Array):
|
|
88
84
|
_clean_uns(adata)
|
|
89
85
|
|
|
86
|
+
if isinstance(adata, dict):
|
|
87
|
+
adata = AnnData(**adata) # type: ignore
|
|
90
88
|
if not isinstance(adata, AnnData):
|
|
91
89
|
raise NgioValueError(f"Expected an AnnData object, but got {type(adata)}")
|
|
92
90
|
return adata
|
ngio/tables/backends/_csv.py
CHANGED
|
@@ -1,20 +1,7 @@
|
|
|
1
|
-
|
|
2
|
-
import polars as pl
|
|
1
|
+
from ngio.tables.backends._py_arrow_backends import PyArrowBackend
|
|
3
2
|
|
|
4
|
-
from ngio.tables.backends._non_zarr_backends import NonZarrBaseBackend
|
|
5
3
|
|
|
6
|
-
|
|
7
|
-
def write_lf_to_csv(path: str, table: pl.DataFrame) -> None:
|
|
8
|
-
"""Write a polars DataFrame to a CSV file."""
|
|
9
|
-
table.write_csv(path)
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
def write_df_to_csv(path: str, table: pd.DataFrame) -> None:
|
|
13
|
-
"""Write a pandas DataFrame to a CSV file."""
|
|
14
|
-
table.to_csv(path, index=False)
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
class CsvTableBackend(NonZarrBaseBackend):
|
|
4
|
+
class CsvTableBackend(PyArrowBackend):
|
|
18
5
|
"""A class to load and write small tables in CSV format."""
|
|
19
6
|
|
|
20
7
|
def __init__(
|
|
@@ -22,11 +9,8 @@ class CsvTableBackend(NonZarrBaseBackend):
|
|
|
22
9
|
):
|
|
23
10
|
"""Initialize the CsvTableBackend."""
|
|
24
11
|
super().__init__(
|
|
25
|
-
lf_reader=pl.scan_csv,
|
|
26
|
-
df_reader=pd.read_csv,
|
|
27
|
-
lf_writer=write_lf_to_csv,
|
|
28
|
-
df_writer=write_df_to_csv,
|
|
29
12
|
table_name="table.csv",
|
|
13
|
+
table_format="csv",
|
|
30
14
|
)
|
|
31
15
|
|
|
32
16
|
@staticmethod
|
ngio/tables/backends/_json.py
CHANGED
|
@@ -8,7 +8,7 @@ from ngio.tables.backends._utils import (
|
|
|
8
8
|
normalize_pandas_df,
|
|
9
9
|
normalize_polars_lf,
|
|
10
10
|
)
|
|
11
|
-
from ngio.utils import
|
|
11
|
+
from ngio.utils import NgioError
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
class JsonTableBackend(AbstractTableBackend):
|
|
@@ -37,22 +37,19 @@ class JsonTableBackend(AbstractTableBackend):
|
|
|
37
37
|
def _get_table_group(self):
|
|
38
38
|
"""Get the table group, creating it if it doesn't exist."""
|
|
39
39
|
try:
|
|
40
|
-
table_group = self._group_handler.get_group(path="table")
|
|
41
|
-
except
|
|
42
|
-
|
|
40
|
+
table_group = self._group_handler.get_group(path="table", create_mode=True)
|
|
41
|
+
except NgioError as e:
|
|
42
|
+
raise NgioError(
|
|
43
|
+
"Could not get or create a 'table' group in the store "
|
|
44
|
+
f"{self._group_handler.store} path "
|
|
45
|
+
f"{self._group_handler.group.path}/table."
|
|
46
|
+
) from e
|
|
43
47
|
return table_group
|
|
44
48
|
|
|
45
|
-
def _load_as_pandas_df(self) -> DataFrame:
|
|
46
|
-
"""Load the table as a pandas DataFrame."""
|
|
47
|
-
table_group = self._get_table_group()
|
|
48
|
-
table_dict = dict(table_group.attrs)
|
|
49
|
-
|
|
50
|
-
data_frame = pd.DataFrame.from_dict(table_dict)
|
|
51
|
-
return data_frame
|
|
52
|
-
|
|
53
49
|
def load_as_pandas_df(self) -> DataFrame:
|
|
54
50
|
"""Load the table as a pandas DataFrame."""
|
|
55
|
-
|
|
51
|
+
table_dict = self._get_table_group().attrs.asdict()
|
|
52
|
+
data_frame = pd.DataFrame.from_dict(table_dict)
|
|
56
53
|
data_frame = normalize_pandas_df(
|
|
57
54
|
data_frame,
|
|
58
55
|
index_key=self.index_key,
|
ngio/tables/backends/_parquet.py
CHANGED
|
@@ -1,32 +1,7 @@
|
|
|
1
|
-
|
|
2
|
-
import polars as pl
|
|
1
|
+
from ngio.tables.backends._py_arrow_backends import PyArrowBackend
|
|
3
2
|
|
|
4
|
-
from ngio.tables.backends._non_zarr_backends import NonZarrBaseBackend
|
|
5
3
|
|
|
6
|
-
|
|
7
|
-
def write_lf_to_parquet(path: str, table: pl.DataFrame) -> None:
|
|
8
|
-
"""Write a polars DataFrame to a Parquet file."""
|
|
9
|
-
# make categorical into string (for pandas compatibility)
|
|
10
|
-
schema = table.collect_schema()
|
|
11
|
-
|
|
12
|
-
categorical_columns = []
|
|
13
|
-
for name, dtype in zip(schema.names(), schema.dtypes(), strict=True):
|
|
14
|
-
if dtype == pl.Categorical:
|
|
15
|
-
categorical_columns.append(name)
|
|
16
|
-
|
|
17
|
-
for col in categorical_columns:
|
|
18
|
-
table = table.with_columns(pl.col(col).cast(pl.Utf8))
|
|
19
|
-
|
|
20
|
-
# write to parquet
|
|
21
|
-
table.write_parquet(path)
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
def write_df_to_parquet(path: str, table: pd.DataFrame) -> None:
|
|
25
|
-
"""Write a pandas DataFrame to a Parquet file."""
|
|
26
|
-
table.to_parquet(path, index=False)
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
class ParquetTableBackend(NonZarrBaseBackend):
|
|
4
|
+
class ParquetTableBackend(PyArrowBackend):
|
|
30
5
|
"""A class to load and write small tables in Parquet format."""
|
|
31
6
|
|
|
32
7
|
def __init__(
|
|
@@ -34,11 +9,8 @@ class ParquetTableBackend(NonZarrBaseBackend):
|
|
|
34
9
|
):
|
|
35
10
|
"""Initialize the ParquetTableBackend."""
|
|
36
11
|
super().__init__(
|
|
37
|
-
lf_reader=pl.scan_parquet,
|
|
38
|
-
df_reader=pd.read_parquet,
|
|
39
|
-
lf_writer=write_lf_to_parquet,
|
|
40
|
-
df_writer=write_df_to_parquet,
|
|
41
12
|
table_name="table.parquet",
|
|
13
|
+
table_format="parquet",
|
|
42
14
|
)
|
|
43
15
|
|
|
44
16
|
@staticmethod
|
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
from typing import Literal
|
|
2
|
+
|
|
3
|
+
import polars as pl
|
|
4
|
+
import pyarrow as pa
|
|
5
|
+
import pyarrow.csv as pa_csv
|
|
6
|
+
import pyarrow.dataset as pa_ds
|
|
7
|
+
import pyarrow.fs as pa_fs
|
|
8
|
+
import pyarrow.parquet as pa_parquet
|
|
9
|
+
from pandas import DataFrame
|
|
10
|
+
from polars import DataFrame as PolarsDataFrame
|
|
11
|
+
from polars import LazyFrame
|
|
12
|
+
from zarr.storage import FsspecStore, LocalStore, MemoryStore, ZipStore
|
|
13
|
+
|
|
14
|
+
from ngio.tables.backends._abstract_backend import AbstractTableBackend
|
|
15
|
+
from ngio.tables.backends._utils import normalize_pandas_df, normalize_polars_lf
|
|
16
|
+
from ngio.utils import NgioValueError
|
|
17
|
+
from ngio.utils._zarr_utils import _make_sync_fs
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class PyArrowBackend(AbstractTableBackend):
    """Base backend that stores a table as one CSV or Parquet file via PyArrow.

    The table lives in a single file called ``table_name`` under the path of
    the handler's Zarr group. How that file is reached depends on the type of
    the underlying Zarr store (local directory, fsspec filesystem or
    in-memory). Subclasses pick the file name and format and must override
    ``backend_name``.
    """

    def __init__(
        self,
        table_name: str,
        table_format: Literal["csv", "parquet"] = "parquet",
    ):
        """Initialize the backend.

        Args:
            table_name: File name of the serialized table (e.g. "table.csv").
            table_format: Serialization format, either "csv" or "parquet".
        """
        self.table_name = table_name
        self.table_format = table_format

    @staticmethod
    def implements_anndata() -> bool:
        """Whether the handler implements the anndata protocol."""
        return False

    @staticmethod
    def implements_pandas() -> bool:
        """Whether the handler implements the dataframe protocol."""
        return True

    @staticmethod
    def implements_polars() -> bool:
        """Whether the handler implements the polars protocol."""
        return True

    @staticmethod
    def backend_name() -> str:
        """Return the name of the backend."""
        raise NotImplementedError(
            "The backend_name method must be implemented in the subclass."
        )

    def _raise_store_type_not_supported(self, operation: str = "reading"):
        """Raise an error for unsupported store types.

        Args:
            operation: Verb used in the message. "writing" produces
                "writing ... to", any other value produces "<operation> ... from".

        Raises:
            NgioValueError: Always.
        """
        ext = self.table_name.split(".")[-1]
        store = self._group_handler.store
        preposition = "to" if operation == "writing" else "from"
        raise NgioValueError(
            f"Ngio does not support {operation} a {ext} table {preposition} a "
            f"store of type {type(store)}. "
            "Please make sure to use a compatible "
            "store like a LocalStore, or "
            "FsspecStore, or MemoryStore, or ZipStore."
        )

    def _load_from_local_store(self, store: LocalStore, path: str) -> pa_ds.Dataset:
        """Load the table from a directory store."""
        root_path = store.root
        table_path = f"{root_path}/{path}/{self.table_name}"
        dataset = pa_ds.dataset(table_path, format=self.table_format)
        return dataset

    def _load_from_fsspec_store(self, store: FsspecStore, path: str) -> pa_ds.Dataset:
        """Load the table from an FS store."""
        table_path = f"{store.path}/{path}/{self.table_name}"
        # The store's filesystem is converted with _make_sync_fs before use.
        fs = _make_sync_fs(store.fs)
        dataset = pa_ds.dataset(table_path, format=self.table_format, filesystem=fs)
        return dataset

    def _load_from_in_memory_store(
        self, store: MemoryStore, path: str
    ) -> pa_ds.Dataset:
        """Load the table from an in-memory store.

        The in-memory write path keeps the pyarrow Table object itself in the
        store's internal dict, so the value is looked up and wrapped directly.

        Raises:
            NgioValueError: If no entry exists for the table, or the entry is
                not a pyarrow Table.
        """
        table_path = f"{path}/{self.table_name}"
        table = store._store_dict.get(table_path)
        if table is None:
            raise NgioValueError(
                f"Table {self.table_name} not found in the in-memory store at "
                f"path {path}."
            )
        # Explicit check instead of `assert`, which is stripped under `-O`.
        if not isinstance(table, pa.Table):
            raise NgioValueError(
                f"Expected a pyarrow Table for {self.table_name} in the "
                f"in-memory store at path {path}, but got {type(table)}."
            )
        dataset = pa_ds.dataset(table)
        return dataset

    def _load_from_zip_store(self, store: ZipStore, path: str) -> pa_ds.Dataset:
        """Load the table from a zip store."""
        raise NotImplementedError("Zip store loading is not implemented yet.")

    def _load_pyarrow_dataset(self) -> pa_ds.Dataset:
        """Load the table as a pyarrow Dataset.

        Dispatches on the type of the handler's store.

        Raises:
            NgioValueError: If the store type is not supported.
        """
        store = self._group_handler.store
        path = self._group_handler.group.path
        if isinstance(store, LocalStore):
            return self._load_from_local_store(store, path)
        elif isinstance(store, FsspecStore):
            return self._load_from_fsspec_store(store, path)
        elif isinstance(store, MemoryStore):
            return self._load_from_in_memory_store(store, path)
        elif isinstance(store, ZipStore):
            return self._load_from_zip_store(store, path)
        self._raise_store_type_not_supported()

    def load_as_pandas_df(self) -> DataFrame:
        """Load the table as a pandas DataFrame."""
        dataset = self._load_pyarrow_dataset()
        dataframe = dataset.to_table().to_pandas()
        dataframe = normalize_pandas_df(
            dataframe,
            index_key=self.index_key,
            index_type=self.index_type,
            reset_index=False,
        )
        return dataframe

    def load(self) -> DataFrame:
        """Load the table as a pandas DataFrame."""
        return self.load_as_pandas_df()

    def load_as_polars_lf(self) -> LazyFrame:
        """Load the table as a polars LazyFrame."""
        dataset = self._load_pyarrow_dataset()
        lazy_frame = pl.scan_pyarrow_dataset(dataset)
        if not isinstance(lazy_frame, LazyFrame):
            raise NgioValueError(
                "Table is not a lazy frame. Please report this issue as an ngio bug."
                f" {type(lazy_frame)}"
            )

        lazy_frame = normalize_polars_lf(
            lazy_frame,
            index_key=self.index_key,
            index_type=self.index_type,
        )
        return lazy_frame

    def _write_to_stream(self, stream, table: pa.Table) -> None:
        """Write the table to a stream.

        Args:
            stream: Destination handed to the PyArrow writer. Callers pass
                either a path string (local store) or an open output stream
                (fsspec store).
            table: The pyarrow Table to serialize.

        Raises:
            NgioValueError: If ``table_format`` is neither "parquet" nor "csv".
        """
        if self.table_format == "parquet":
            pa_parquet.write_table(table, stream)
        elif self.table_format == "csv":
            pa_csv.write_csv(table, stream)
        else:
            raise NgioValueError(
                f"Unsupported table format: {self.table_format}. "
                "Supported formats are 'parquet' and 'csv'."
            )

    def _write_to_local_store(
        self, store: LocalStore, path: str, table: pa.Table
    ) -> None:
        """Write the table to a directory store."""
        root_path = store.root
        table_path = f"{root_path}/{path}/{self.table_name}"
        self._write_to_stream(table_path, table)

    def _write_to_fsspec_store(
        self, store: FsspecStore, path: str, table: pa.Table
    ) -> None:
        """Write the table to an FS store."""
        table_path = f"{store.path}/{path}/{self.table_name}"
        fs = _make_sync_fs(store.fs)
        # Wrap the fsspec filesystem so PyArrow can open an output stream on it.
        fs = pa_fs.PyFileSystem(pa_fs.FSSpecHandler(fs))
        with fs.open_output_stream(table_path) as out_stream:
            self._write_to_stream(out_stream, table)

    def _write_to_in_memory_store(
        self, store: MemoryStore, path: str, table: pa.Table
    ) -> None:
        """Write the table to an in-memory store.

        The pyarrow Table object is stored as-is (not serialized) in the
        store's internal dict.
        """
        table_path = f"{path}/{self.table_name}"
        store._store_dict[table_path] = table

    def _write_to_zip_store(self, store: ZipStore, path: str, table: pa.Table) -> None:
        """Write the table to a zip store."""
        raise NotImplementedError("Writing to zip store is not implemented yet.")

    def _write_pyarrow_dataset(self, dataset: pa.Table) -> None:
        """Write a pyarrow Table to the handler's store.

        Dispatches on the type of the handler's store.

        Raises:
            NgioValueError: If the store type is not supported.
        """
        store = self._group_handler.store
        path = self._group_handler.group.path
        if isinstance(store, LocalStore):
            return self._write_to_local_store(store=store, path=path, table=dataset)
        elif isinstance(store, FsspecStore):
            return self._write_to_fsspec_store(store=store, path=path, table=dataset)
        elif isinstance(store, MemoryStore):
            return self._write_to_in_memory_store(store=store, path=path, table=dataset)
        elif isinstance(store, ZipStore):
            return self._write_to_zip_store(store=store, path=path, table=dataset)
        # Say "writing ... to" here instead of the default "reading ... from".
        self._raise_store_type_not_supported(operation="writing")

    def write_from_pandas(self, table: DataFrame) -> None:
        """Write the table from a pandas DataFrame."""
        table = normalize_pandas_df(
            table,
            index_key=self.index_key,
            index_type=self.index_type,
            reset_index=True,
        )
        table = pa.Table.from_pandas(table, preserve_index=False)
        self._write_pyarrow_dataset(table)

    def write_from_polars(self, table: PolarsDataFrame | LazyFrame) -> None:
        """Write the table from a polars DataFrame or LazyFrame."""
        table = normalize_polars_lf(
            table,
            index_key=self.index_key,
            index_type=self.index_type,
        )

        # A LazyFrame must be materialized before conversion to Arrow.
        if isinstance(table, LazyFrame):
            table = table.collect()
        table = table.to_arrow()
        self._write_pyarrow_dataset(table)
|