ngio 0.2.7__py3-none-any.whl → 0.3.0a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ngio/common/__init__.py +16 -0
- ngio/common/_table_ops.py +471 -0
- ngio/hcs/plate.py +430 -72
- ngio/images/ome_zarr_container.py +99 -68
- ngio/ome_zarr_meta/ngio_specs/_channels.py +28 -2
- ngio/tables/__init__.py +8 -1
- ngio/tables/abstract_table.py +268 -0
- ngio/tables/backends/__init__.py +18 -0
- ngio/tables/backends/_abstract_backend.py +58 -80
- ngio/tables/backends/_anndata_v1.py +4 -0
- ngio/tables/backends/_csv_v1.py +23 -150
- ngio/tables/backends/_json_v1.py +3 -0
- ngio/tables/backends/_non_zarr_backends_v1.py +196 -0
- ngio/tables/backends/_parquet_v1.py +47 -0
- ngio/tables/backends/_table_backends.py +34 -15
- ngio/tables/backends/_utils.py +147 -1
- ngio/tables/tables_container.py +180 -92
- ngio/tables/v1/__init__.py +8 -1
- ngio/tables/v1/_condition_table.py +67 -0
- ngio/tables/v1/_feature_table.py +62 -126
- ngio/tables/v1/_generic_table.py +14 -163
- ngio/tables/v1/_roi_table.py +281 -201
- ngio/utils/_fractal_fsspec_store.py +29 -0
- {ngio-0.2.7.dist-info → ngio-0.3.0a0.dist-info}/METADATA +3 -3
- {ngio-0.2.7.dist-info → ngio-0.3.0a0.dist-info}/RECORD +27 -23
- ngio/tables/_validators.py +0 -108
- {ngio-0.2.7.dist-info → ngio-0.3.0a0.dist-info}/WHEEL +0 -0
- {ngio-0.2.7.dist-info → ngio-0.3.0a0.dist-info}/licenses/LICENSE +0 -0
|
@@ -5,15 +5,13 @@ from anndata import AnnData
|
|
|
5
5
|
from pandas import DataFrame
|
|
6
6
|
from polars import DataFrame as PolarsDataFrame
|
|
7
7
|
from polars import LazyFrame
|
|
8
|
-
from pydantic import BaseModel
|
|
8
|
+
from pydantic import BaseModel, ConfigDict
|
|
9
9
|
|
|
10
10
|
from ngio.tables.backends._utils import (
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
convert_polars_to_anndata,
|
|
16
|
-
convert_polars_to_pandas,
|
|
11
|
+
TabularData,
|
|
12
|
+
convert_to_anndata,
|
|
13
|
+
convert_to_pandas,
|
|
14
|
+
convert_to_polars,
|
|
17
15
|
)
|
|
18
16
|
from ngio.utils import NgioValueError, ZarrGroupHandler
|
|
19
17
|
|
|
@@ -21,29 +19,30 @@ from ngio.utils import NgioValueError, ZarrGroupHandler
|
|
|
21
19
|
class BackendMeta(BaseModel):
|
|
22
20
|
"""Metadata for the backend."""
|
|
23
21
|
|
|
24
|
-
backend: str
|
|
22
|
+
backend: str = "anndata_v1"
|
|
25
23
|
index_key: str | None = None
|
|
26
24
|
index_type: Literal["int", "str"] | None = None
|
|
27
25
|
|
|
26
|
+
model_config = ConfigDict(extra="allow")
|
|
27
|
+
|
|
28
28
|
|
|
29
29
|
class AbstractTableBackend(ABC):
|
|
30
30
|
"""Abstract class for table backends."""
|
|
31
31
|
|
|
32
|
-
def
|
|
32
|
+
def set_group_handler(
|
|
33
33
|
self,
|
|
34
34
|
group_handler: ZarrGroupHandler,
|
|
35
35
|
index_key: str | None = None,
|
|
36
36
|
index_type: Literal["int", "str"] | None = None,
|
|
37
|
-
):
|
|
38
|
-
"""
|
|
37
|
+
) -> None:
|
|
38
|
+
"""Attach a group handler to the backend.
|
|
39
39
|
|
|
40
|
-
|
|
40
|
+
Index keys and index types are used to ensure that the
|
|
41
|
+
serialization and deserialization of the table
|
|
42
|
+
is consistent across different backends.
|
|
41
43
|
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
containing the table data.
|
|
45
|
-
index_key (str): The column name to use as the index of the DataFrame.
|
|
46
|
-
index_type (str): The type of the index column in the DataFrame.
|
|
44
|
+
Making sure that this is consistent is
|
|
45
|
+
a duty of the backend implementations.
|
|
47
46
|
"""
|
|
48
47
|
self._group_handler = group_handler
|
|
49
48
|
self._index_key = index_key
|
|
@@ -67,7 +66,11 @@ class AbstractTableBackend(ABC):
|
|
|
67
66
|
"""Check if the backend implements the anndata protocol.
|
|
68
67
|
|
|
69
68
|
If this is True, the backend should implement the
|
|
70
|
-
`
|
|
69
|
+
`write_from_anndata` method.
|
|
70
|
+
|
|
71
|
+
AnnData objects are more complex than DataFrames,
|
|
72
|
+
so if this is true the backend should implement the
|
|
73
|
+
full serialization of the AnnData object.
|
|
71
74
|
|
|
72
75
|
If this is False, these methods should raise a
|
|
73
76
|
`NotImplementedError`.
|
|
@@ -80,7 +83,7 @@ class AbstractTableBackend(ABC):
|
|
|
80
83
|
"""Check if the backend implements the pandas protocol.
|
|
81
84
|
|
|
82
85
|
If this is True, the backend should implement the
|
|
83
|
-
`
|
|
86
|
+
`write_from_dataframe` methods.
|
|
84
87
|
|
|
85
88
|
If this is False, these methods should raise a
|
|
86
89
|
`NotImplementedError`.
|
|
@@ -93,7 +96,7 @@ class AbstractTableBackend(ABC):
|
|
|
93
96
|
"""Check if the backend implements the polars protocol.
|
|
94
97
|
|
|
95
98
|
If this is True, the backend should implement the
|
|
96
|
-
`
|
|
99
|
+
`write_from_polars` methods.
|
|
97
100
|
|
|
98
101
|
If this is False, these methods should raise a
|
|
99
102
|
`NotImplementedError`.
|
|
@@ -122,6 +125,16 @@ class AbstractTableBackend(ABC):
|
|
|
122
125
|
)
|
|
123
126
|
return self._index_type # type: ignore[return-value]
|
|
124
127
|
|
|
128
|
+
@abstractmethod
|
|
129
|
+
def load(self) -> TabularData:
|
|
130
|
+
"""Load the table from the store.
|
|
131
|
+
|
|
132
|
+
This is a generic load method.
|
|
133
|
+
Based on the explicit mode or the type of the table,
|
|
134
|
+
it will call the appropriate load method.
|
|
135
|
+
"""
|
|
136
|
+
...
|
|
137
|
+
|
|
125
138
|
def load_as_anndata(self) -> AnnData:
|
|
126
139
|
"""Load the table as an AnnData object.
|
|
127
140
|
|
|
@@ -129,70 +142,35 @@ class AbstractTableBackend(ABC):
|
|
|
129
142
|
selecting columns is not implemented, because it is not
|
|
130
143
|
straightforward to do so for an arbitrary AnnData object.
|
|
131
144
|
"""
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
elif self.implements_polars():
|
|
138
|
-
return convert_polars_to_anndata(
|
|
139
|
-
self.load_as_polars_lf(),
|
|
140
|
-
index_key=self.index_key,
|
|
141
|
-
)
|
|
142
|
-
else:
|
|
143
|
-
raise NgioValueError(
|
|
144
|
-
"Backend does not implement any of the protocols. "
|
|
145
|
-
"A backend should implement at least one of the "
|
|
146
|
-
"following protocols: anndata, pandas, polars."
|
|
147
|
-
)
|
|
145
|
+
table = self.load()
|
|
146
|
+
return convert_to_anndata(
|
|
147
|
+
table,
|
|
148
|
+
index_key=self.index_key,
|
|
149
|
+
)
|
|
148
150
|
|
|
149
151
|
def load_as_pandas_df(self) -> DataFrame:
|
|
150
152
|
"""Load the table as a pandas DataFrame.
|
|
151
153
|
|
|
152
154
|
If columns are provided, the table should be filtered
|
|
153
155
|
"""
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
elif self.implements_polars():
|
|
161
|
-
return convert_polars_to_pandas(
|
|
162
|
-
self.load_as_polars_lf(),
|
|
163
|
-
index_key=self.index_key,
|
|
164
|
-
index_type=self.index_type,
|
|
165
|
-
)
|
|
166
|
-
else:
|
|
167
|
-
raise NgioValueError(
|
|
168
|
-
"Backend does not implement any of the protocols. "
|
|
169
|
-
"A backend should implement at least one of the "
|
|
170
|
-
"following protocols: anndata, pandas, polars."
|
|
171
|
-
)
|
|
156
|
+
table = self.load()
|
|
157
|
+
return convert_to_pandas(
|
|
158
|
+
table,
|
|
159
|
+
index_key=self.index_key,
|
|
160
|
+
index_type=self.index_type,
|
|
161
|
+
)
|
|
172
162
|
|
|
173
163
|
def load_as_polars_lf(self) -> LazyFrame:
|
|
174
164
|
"""Load the table as a polars LazyFrame.
|
|
175
165
|
|
|
176
166
|
If columns are provided, the table should be filtered
|
|
177
167
|
"""
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
elif self.implements_pandas():
|
|
185
|
-
return convert_pandas_to_polars(
|
|
186
|
-
self.load_as_pandas_df(),
|
|
187
|
-
index_key=self.index_key,
|
|
188
|
-
index_type=self.index_type,
|
|
189
|
-
).lazy()
|
|
190
|
-
else:
|
|
191
|
-
raise NgioValueError(
|
|
192
|
-
"Backend does not implement any of the protocols. "
|
|
193
|
-
"A backend should implement at least one of the "
|
|
194
|
-
"following protocols: anndata, pandas, polars."
|
|
195
|
-
)
|
|
168
|
+
table = self.load()
|
|
169
|
+
return convert_to_polars(
|
|
170
|
+
table,
|
|
171
|
+
index_key=self.index_key,
|
|
172
|
+
index_type=self.index_type,
|
|
173
|
+
)
|
|
196
174
|
|
|
197
175
|
def write_from_pandas(self, table: DataFrame) -> None:
|
|
198
176
|
"""Serialize the table from a pandas DataFrame."""
|
|
@@ -230,7 +208,7 @@ class AbstractTableBackend(ABC):
|
|
|
230
208
|
|
|
231
209
|
def write(
|
|
232
210
|
self,
|
|
233
|
-
|
|
211
|
+
table_data: TabularData,
|
|
234
212
|
metadata: dict | None = None,
|
|
235
213
|
mode: Literal["pandas", "anndata", "polars"] | None = None,
|
|
236
214
|
) -> None:
|
|
@@ -240,15 +218,15 @@ class AbstractTableBackend(ABC):
|
|
|
240
218
|
Based on the explicit mode or the type of the table,
|
|
241
219
|
it will call the appropriate write method.
|
|
242
220
|
"""
|
|
243
|
-
if mode == "pandas" or isinstance(
|
|
244
|
-
self.write_from_pandas(
|
|
245
|
-
elif mode == "anndata" or isinstance(
|
|
246
|
-
self.write_from_anndata(
|
|
247
|
-
elif mode == "polars" or isinstance(
|
|
248
|
-
self.write_from_polars(
|
|
221
|
+
if mode == "pandas" or isinstance(table_data, DataFrame):
|
|
222
|
+
self.write_from_pandas(table_data) # type: ignore[arg-type]
|
|
223
|
+
elif mode == "anndata" or isinstance(table_data, AnnData):
|
|
224
|
+
self.write_from_anndata(table_data) # type: ignore[arg-type]
|
|
225
|
+
elif mode == "polars" or isinstance(table_data, PolarsDataFrame | LazyFrame):
|
|
226
|
+
self.write_from_polars(table_data)
|
|
249
227
|
else:
|
|
250
228
|
raise NgioValueError(
|
|
251
|
-
f"Unsupported table type {type(
|
|
229
|
+
f"Unsupported table type {type(table_data)}. "
|
|
252
230
|
"Please specify the mode explicitly. "
|
|
253
231
|
"Supported serialization modes are: "
|
|
254
232
|
"'pandas', 'anndata', 'polars'."
|
|
@@ -44,6 +44,10 @@ class AnnDataBackend(AbstractTableBackend):
|
|
|
44
44
|
anndata = normalize_anndata(anndata, index_key=self.index_key)
|
|
45
45
|
return anndata
|
|
46
46
|
|
|
47
|
+
def load(self) -> AnnData:
|
|
48
|
+
"""Load the table as an AnnData object."""
|
|
49
|
+
return self.load_as_anndata()
|
|
50
|
+
|
|
47
51
|
def write_from_anndata(self, table: AnnData) -> None:
|
|
48
52
|
"""Serialize the table from an AnnData object."""
|
|
49
53
|
full_url = self._group_handler.full_url
|
ngio/tables/backends/_csv_v1.py
CHANGED
|
@@ -1,162 +1,35 @@
|
|
|
1
|
-
import io
|
|
2
|
-
|
|
3
1
|
import pandas as pd
|
|
4
2
|
import polars as pl
|
|
5
|
-
from pandas import DataFrame
|
|
6
|
-
from polars import DataFrame as PolarsDataFrame
|
|
7
|
-
from polars import LazyFrame
|
|
8
|
-
from zarr.storage import DirectoryStore, FSStore
|
|
9
|
-
|
|
10
|
-
from ngio.tables.backends._abstract_backend import AbstractTableBackend
|
|
11
|
-
from ngio.tables.backends._utils import normalize_pandas_df, normalize_polars_lf
|
|
12
|
-
from ngio.utils import NgioFileNotFoundError, NgioValueError
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
class CsvTableBackend(AbstractTableBackend):
|
|
16
|
-
"""A class to load and write small tables in CSV format."""
|
|
17
|
-
|
|
18
|
-
csv_name = "table.csv"
|
|
19
|
-
|
|
20
|
-
@staticmethod
|
|
21
|
-
def backend_name() -> str:
|
|
22
|
-
"""Return the name of the backend."""
|
|
23
|
-
return "experimental_csv_v1"
|
|
24
3
|
|
|
25
|
-
|
|
26
|
-
def implements_anndata() -> bool:
|
|
27
|
-
"""Whether the handler implements the anndata protocol."""
|
|
28
|
-
return False
|
|
4
|
+
from ngio.tables.backends._non_zarr_backends_v1 import NonZarrBaseBackend
|
|
29
5
|
|
|
30
|
-
@staticmethod
|
|
31
|
-
def implements_pandas() -> bool:
|
|
32
|
-
"""Whether the handler implements the dataframe protocol."""
|
|
33
|
-
return True
|
|
34
6
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
return True
|
|
7
|
+
def write_lf_to_csv(path: str, table: pl.DataFrame) -> None:
|
|
8
|
+
"""Write a polars DataFrame to a CSV file."""
|
|
9
|
+
table.write_csv(path)
|
|
39
10
|
|
|
40
|
-
def _load_from_directory_store(self, reader):
|
|
41
|
-
"""Load the table from a directory store."""
|
|
42
|
-
url = self._group_handler.full_url
|
|
43
|
-
if url is None:
|
|
44
|
-
raise NgioValueError(
|
|
45
|
-
f"Ngio does not support reading a CSV file from a "
|
|
46
|
-
f"store of type {type(self._group_handler)}. "
|
|
47
|
-
"Please make sure to use a compatible "
|
|
48
|
-
"store like a zarr.DirectoryStore."
|
|
49
|
-
)
|
|
50
|
-
csv_path = f"{url}/{self.csv_name}"
|
|
51
|
-
dataframe = reader(csv_path)
|
|
52
|
-
return dataframe
|
|
53
11
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
if bytes_table is None:
|
|
58
|
-
raise NgioFileNotFoundError(f"No table found at {self.csv_name}. ")
|
|
59
|
-
dataframe = reader(io.BytesIO(bytes_table))
|
|
60
|
-
return dataframe
|
|
12
|
+
def write_df_to_csv(path: str, table: pd.DataFrame) -> None:
|
|
13
|
+
"""Write a pandas DataFrame to a CSV file."""
|
|
14
|
+
table.to_csv(path, index=False)
|
|
61
15
|
|
|
62
|
-
def load_as_pandas_df(self) -> DataFrame:
|
|
63
|
-
"""Load the table as a pandas DataFrame."""
|
|
64
|
-
store = self._group_handler.store
|
|
65
|
-
if isinstance(store, DirectoryStore):
|
|
66
|
-
dataframe = self._load_from_directory_store(reader=pd.read_csv)
|
|
67
|
-
elif isinstance(store, FSStore):
|
|
68
|
-
dataframe = self._load_from_fs_store(reader=pd.read_csv)
|
|
69
|
-
else:
|
|
70
|
-
raise NgioValueError(
|
|
71
|
-
f"Ngio does not support reading a CSV file from a "
|
|
72
|
-
f"store of type {type(store)}. "
|
|
73
|
-
"Please make sure to use a compatible "
|
|
74
|
-
"store like a zarr.DirectoryStore or "
|
|
75
|
-
"zarr.FSStore."
|
|
76
|
-
)
|
|
77
16
|
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
index_key=self.index_key,
|
|
81
|
-
index_type=self.index_type,
|
|
82
|
-
reset_index=False,
|
|
83
|
-
)
|
|
84
|
-
return dataframe
|
|
85
|
-
|
|
86
|
-
def load_as_polars_lf(self) -> LazyFrame:
|
|
87
|
-
"""Load the table as a polars LazyFrame."""
|
|
88
|
-
store = self._group_handler.store
|
|
89
|
-
if isinstance(store, DirectoryStore):
|
|
90
|
-
lazy_frame = self._load_from_directory_store(reader=pl.scan_csv)
|
|
91
|
-
elif isinstance(store, FSStore):
|
|
92
|
-
lazy_frame = self._load_from_fs_store(reader=pl.scan_csv)
|
|
93
|
-
else:
|
|
94
|
-
raise NgioValueError(
|
|
95
|
-
f"Ngio does not support reading a CSV file from a "
|
|
96
|
-
f"store of type {type(store)}. "
|
|
97
|
-
"Please make sure to use a compatible "
|
|
98
|
-
"store like a zarr.DirectoryStore or "
|
|
99
|
-
"zarr.FSStore."
|
|
100
|
-
)
|
|
101
|
-
if not isinstance(lazy_frame, LazyFrame):
|
|
102
|
-
raise NgioValueError(
|
|
103
|
-
"Table is not a lazy frame. Please report this issue as an ngio bug."
|
|
104
|
-
f" {type(lazy_frame)}"
|
|
105
|
-
)
|
|
106
|
-
|
|
107
|
-
lazy_frame = normalize_polars_lf(
|
|
108
|
-
lazy_frame,
|
|
109
|
-
index_key=self.index_key,
|
|
110
|
-
index_type=self.index_type,
|
|
111
|
-
)
|
|
112
|
-
return lazy_frame
|
|
113
|
-
|
|
114
|
-
def _get_store_url(self) -> str:
|
|
115
|
-
"""Get the store URL."""
|
|
116
|
-
store = self._group_handler.store
|
|
117
|
-
if isinstance(store, DirectoryStore):
|
|
118
|
-
full_url = self._group_handler.full_url
|
|
119
|
-
else:
|
|
120
|
-
raise NgioValueError(
|
|
121
|
-
f"Ngio does not support writing a CSV file to a "
|
|
122
|
-
f"store of type {type(store)}. "
|
|
123
|
-
"Please make sure to use a compatible "
|
|
124
|
-
"store like a zarr.DirectoryStore or "
|
|
125
|
-
"zarr.FSStore."
|
|
126
|
-
)
|
|
127
|
-
if full_url is None:
|
|
128
|
-
raise NgioValueError(
|
|
129
|
-
f"Ngio does not support writing a CSV file to a "
|
|
130
|
-
f"store of type {type(store)}. "
|
|
131
|
-
"Please make sure to use a compatible "
|
|
132
|
-
"store like a zarr.DirectoryStore or "
|
|
133
|
-
"zarr.FSStore."
|
|
134
|
-
)
|
|
135
|
-
return full_url
|
|
136
|
-
|
|
137
|
-
def write_from_pandas(self, table: DataFrame) -> None:
|
|
138
|
-
"""Write the table from a pandas DataFrame."""
|
|
139
|
-
table = normalize_pandas_df(
|
|
140
|
-
table,
|
|
141
|
-
index_key=self.index_key,
|
|
142
|
-
index_type=self.index_type,
|
|
143
|
-
reset_index=True,
|
|
144
|
-
)
|
|
145
|
-
full_url = self._get_store_url()
|
|
146
|
-
csv_path = f"{full_url}/{self.csv_name}"
|
|
147
|
-
table.to_csv(csv_path, index=False)
|
|
17
|
+
class CsvTableBackend(NonZarrBaseBackend):
|
|
18
|
+
"""A class to load and write small tables in CSV format."""
|
|
148
19
|
|
|
149
|
-
def
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
20
|
+
def __init__(
|
|
21
|
+
self,
|
|
22
|
+
):
|
|
23
|
+
"""Initialize the CsvTableBackend."""
|
|
24
|
+
super().__init__(
|
|
25
|
+
lf_reader=pl.scan_csv,
|
|
26
|
+
df_reader=pd.read_csv,
|
|
27
|
+
lf_writer=write_lf_to_csv,
|
|
28
|
+
df_writer=write_df_to_csv,
|
|
29
|
+
table_name="table.csv",
|
|
155
30
|
)
|
|
156
31
|
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
csv_path = f"{full_url}/{self.csv_name}"
|
|
162
|
-
table.write_csv(csv_path)
|
|
32
|
+
@staticmethod
|
|
33
|
+
def backend_name() -> str:
|
|
34
|
+
"""Return the name of the backend."""
|
|
35
|
+
return "experimental_csv_v1"
|
ngio/tables/backends/_json_v1.py
CHANGED
|
@@ -61,6 +61,9 @@ class JsonTableBackend(AbstractTableBackend):
|
|
|
61
61
|
)
|
|
62
62
|
return data_frame
|
|
63
63
|
|
|
64
|
+
def load(self) -> DataFrame:
|
|
65
|
+
return self.load_as_pandas_df()
|
|
66
|
+
|
|
64
67
|
def _write_from_dict(self, table: dict) -> None:
|
|
65
68
|
"""Write the table from a dictionary to the store."""
|
|
66
69
|
table_group = self._get_table_group()
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
import io
|
|
2
|
+
from collections.abc import Callable
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from pandas import DataFrame
|
|
6
|
+
from polars import DataFrame as PolarsDataFrame
|
|
7
|
+
from polars import LazyFrame
|
|
8
|
+
from zarr.storage import DirectoryStore, FSStore
|
|
9
|
+
|
|
10
|
+
from ngio.tables.backends._abstract_backend import AbstractTableBackend
|
|
11
|
+
from ngio.tables.backends._utils import normalize_pandas_df, normalize_polars_lf
|
|
12
|
+
from ngio.utils import NgioFileNotFoundError, NgioValueError
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class NonZarrBaseBackend(AbstractTableBackend):
|
|
16
|
+
"""A class to load and write small tables in CSV format."""
|
|
17
|
+
|
|
18
|
+
def __init__(
|
|
19
|
+
self,
|
|
20
|
+
df_reader: Callable[[Any], DataFrame],
|
|
21
|
+
lf_reader: Callable[[Any], LazyFrame],
|
|
22
|
+
df_writer: Callable[[str, DataFrame], None],
|
|
23
|
+
lf_writer: Callable[[str, PolarsDataFrame], None],
|
|
24
|
+
table_name: str,
|
|
25
|
+
):
|
|
26
|
+
self.df_reader = df_reader
|
|
27
|
+
self.lf_reader = lf_reader
|
|
28
|
+
self.df_writer = df_writer
|
|
29
|
+
self.lf_writer = lf_writer
|
|
30
|
+
self.table_name = table_name
|
|
31
|
+
|
|
32
|
+
@staticmethod
|
|
33
|
+
def implements_anndata() -> bool:
|
|
34
|
+
"""Whether the handler implements the anndata protocol."""
|
|
35
|
+
return False
|
|
36
|
+
|
|
37
|
+
@staticmethod
|
|
38
|
+
def implements_pandas() -> bool:
|
|
39
|
+
"""Whether the handler implements the dataframe protocol."""
|
|
40
|
+
return True
|
|
41
|
+
|
|
42
|
+
@staticmethod
|
|
43
|
+
def implements_polars() -> bool:
|
|
44
|
+
"""Whether the handler implements the polars protocol."""
|
|
45
|
+
return True
|
|
46
|
+
|
|
47
|
+
@staticmethod
|
|
48
|
+
def backend_name() -> str:
|
|
49
|
+
"""Return the name of the backend."""
|
|
50
|
+
raise NotImplementedError(
|
|
51
|
+
"The backend_name method must be implemented in the subclass."
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
def _load_from_directory_store(self, reader):
|
|
55
|
+
"""Load the table from a directory store."""
|
|
56
|
+
url = self._group_handler.full_url
|
|
57
|
+
if url is None:
|
|
58
|
+
ext = self.table_name.split(".")[-1]
|
|
59
|
+
raise NgioValueError(
|
|
60
|
+
f"Ngio does not support reading a {ext} table from a "
|
|
61
|
+
f"store of type {type(self._group_handler)}. "
|
|
62
|
+
"Please make sure to use a compatible "
|
|
63
|
+
"store like a zarr.DirectoryStore."
|
|
64
|
+
)
|
|
65
|
+
table_path = f"{url}/{self.table_name}"
|
|
66
|
+
dataframe = reader(table_path)
|
|
67
|
+
return dataframe
|
|
68
|
+
|
|
69
|
+
def _load_from_fs_store_df(self, reader):
|
|
70
|
+
"""Load the table from an FS store."""
|
|
71
|
+
path = self._group_handler.group.path
|
|
72
|
+
table_path = f"{path}/{self.table_name}"
|
|
73
|
+
bytes_table = self._group_handler.store.get(table_path)
|
|
74
|
+
if bytes_table is None:
|
|
75
|
+
raise NgioFileNotFoundError(f"No table found at {table_path}. ")
|
|
76
|
+
dataframe = reader(io.BytesIO(bytes_table))
|
|
77
|
+
return dataframe
|
|
78
|
+
|
|
79
|
+
def _load_from_fs_store_lf(self, reader):
|
|
80
|
+
"""Load the table from an FS store."""
|
|
81
|
+
full_url = self._group_handler.full_url
|
|
82
|
+
parquet_path = f"{full_url}/{self.table_name}"
|
|
83
|
+
store_fs = self._group_handler.store.fs # type: ignore
|
|
84
|
+
with store_fs.open(parquet_path, "rb") as f:
|
|
85
|
+
dataframe = reader(f)
|
|
86
|
+
return dataframe
|
|
87
|
+
|
|
88
|
+
def load_as_pandas_df(self) -> DataFrame:
|
|
89
|
+
"""Load the table as a pandas DataFrame."""
|
|
90
|
+
store = self._group_handler.store
|
|
91
|
+
if isinstance(store, DirectoryStore):
|
|
92
|
+
dataframe = self._load_from_directory_store(reader=self.df_reader)
|
|
93
|
+
elif isinstance(store, FSStore):
|
|
94
|
+
dataframe = self._load_from_fs_store_df(reader=self.df_reader)
|
|
95
|
+
else:
|
|
96
|
+
ext = self.table_name.split(".")[-1]
|
|
97
|
+
raise NgioValueError(
|
|
98
|
+
f"Ngio does not support reading a {ext} table from a "
|
|
99
|
+
f"store of type {type(store)}. "
|
|
100
|
+
"Please make sure to use a compatible "
|
|
101
|
+
"store like a zarr.DirectoryStore or "
|
|
102
|
+
"zarr.FSStore."
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
dataframe = normalize_pandas_df(
|
|
106
|
+
dataframe,
|
|
107
|
+
index_key=self.index_key,
|
|
108
|
+
index_type=self.index_type,
|
|
109
|
+
reset_index=False,
|
|
110
|
+
)
|
|
111
|
+
return dataframe
|
|
112
|
+
|
|
113
|
+
def load(self) -> DataFrame:
|
|
114
|
+
"""Load the table as a pandas DataFrame."""
|
|
115
|
+
return self.load_as_pandas_df()
|
|
116
|
+
|
|
117
|
+
def load_as_polars_lf(self) -> LazyFrame:
|
|
118
|
+
"""Load the table as a polars LazyFrame."""
|
|
119
|
+
store = self._group_handler.store
|
|
120
|
+
if isinstance(store, DirectoryStore):
|
|
121
|
+
lazy_frame = self._load_from_directory_store(reader=self.lf_reader)
|
|
122
|
+
elif isinstance(store, FSStore):
|
|
123
|
+
lazy_frame = self._load_from_fs_store_lf(reader=self.lf_reader)
|
|
124
|
+
else:
|
|
125
|
+
ext = self.table_name.split(".")[-1]
|
|
126
|
+
raise NgioValueError(
|
|
127
|
+
f"Ngio does not support reading a {ext} from a "
|
|
128
|
+
f"store of type {type(store)}. "
|
|
129
|
+
"Please make sure to use a compatible "
|
|
130
|
+
"store like a zarr.DirectoryStore or "
|
|
131
|
+
"zarr.FSStore."
|
|
132
|
+
)
|
|
133
|
+
if not isinstance(lazy_frame, LazyFrame):
|
|
134
|
+
raise NgioValueError(
|
|
135
|
+
"Table is not a lazy frame. Please report this issue as an ngio bug."
|
|
136
|
+
f" {type(lazy_frame)}"
|
|
137
|
+
)
|
|
138
|
+
|
|
139
|
+
lazy_frame = normalize_polars_lf(
|
|
140
|
+
lazy_frame,
|
|
141
|
+
index_key=self.index_key,
|
|
142
|
+
index_type=self.index_type,
|
|
143
|
+
)
|
|
144
|
+
return lazy_frame
|
|
145
|
+
|
|
146
|
+
def _get_store_url(self) -> str:
|
|
147
|
+
"""Get the store URL."""
|
|
148
|
+
store = self._group_handler.store
|
|
149
|
+
if isinstance(store, DirectoryStore):
|
|
150
|
+
full_url = self._group_handler.full_url
|
|
151
|
+
else:
|
|
152
|
+
ext = self.table_name.split(".")[-1]
|
|
153
|
+
raise NgioValueError(
|
|
154
|
+
f"Ngio does not support writing a {ext} file to a "
|
|
155
|
+
f"store of type {type(store)}. "
|
|
156
|
+
"Please make sure to use a compatible "
|
|
157
|
+
"store like a zarr.DirectoryStore or "
|
|
158
|
+
"zarr.FSStore."
|
|
159
|
+
)
|
|
160
|
+
if full_url is None:
|
|
161
|
+
ext = self.table_name.split(".")[-1]
|
|
162
|
+
raise NgioValueError(
|
|
163
|
+
f"Ngio does not support writing a {ext} file to a "
|
|
164
|
+
f"store of type {type(store)}. "
|
|
165
|
+
"Please make sure to use a compatible "
|
|
166
|
+
"store like a zarr.DirectoryStore or "
|
|
167
|
+
"zarr.FSStore."
|
|
168
|
+
)
|
|
169
|
+
return full_url
|
|
170
|
+
|
|
171
|
+
def write_from_pandas(self, table: DataFrame) -> None:
|
|
172
|
+
"""Write the table from a pandas DataFrame."""
|
|
173
|
+
table = normalize_pandas_df(
|
|
174
|
+
table,
|
|
175
|
+
index_key=self.index_key,
|
|
176
|
+
index_type=self.index_type,
|
|
177
|
+
reset_index=True,
|
|
178
|
+
)
|
|
179
|
+
full_url = self._get_store_url()
|
|
180
|
+
table_path = f"{full_url}/{self.table_name}"
|
|
181
|
+
self.df_writer(table_path, table)
|
|
182
|
+
|
|
183
|
+
def write_from_polars(self, table: PolarsDataFrame | LazyFrame) -> None:
|
|
184
|
+
"""Write the table from a polars DataFrame or LazyFrame."""
|
|
185
|
+
table = normalize_polars_lf(
|
|
186
|
+
table,
|
|
187
|
+
index_key=self.index_key,
|
|
188
|
+
index_type=self.index_type,
|
|
189
|
+
)
|
|
190
|
+
|
|
191
|
+
if isinstance(table, LazyFrame):
|
|
192
|
+
table = table.collect()
|
|
193
|
+
|
|
194
|
+
full_url = self._get_store_url()
|
|
195
|
+
table_path = f"{full_url}/{self.table_name}"
|
|
196
|
+
self.lf_writer(table_path, table)
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import polars as pl
|
|
3
|
+
|
|
4
|
+
from ngio.tables.backends._non_zarr_backends_v1 import NonZarrBaseBackend
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def write_lf_to_parquet(path: str, table: pl.DataFrame) -> None:
|
|
8
|
+
"""Write a polars DataFrame to a Parquet file."""
|
|
9
|
+
# make categorical into string (for pandas compatibility)
|
|
10
|
+
schema = table.collect_schema()
|
|
11
|
+
|
|
12
|
+
categorical_columns = []
|
|
13
|
+
for name, dtype in zip(schema.names(), schema.dtypes(), strict=True):
|
|
14
|
+
if dtype == pl.Categorical:
|
|
15
|
+
categorical_columns.append(name)
|
|
16
|
+
|
|
17
|
+
for col in categorical_columns:
|
|
18
|
+
table = table.with_columns(pl.col(col).cast(pl.Utf8))
|
|
19
|
+
|
|
20
|
+
# write to parquet
|
|
21
|
+
table.write_parquet(path)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def write_df_to_parquet(path: str, table: pd.DataFrame) -> None:
|
|
25
|
+
"""Write a pandas DataFrame to a Parquet file."""
|
|
26
|
+
table.to_parquet(path, index=False)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class ParquetTableBackend(NonZarrBaseBackend):
|
|
30
|
+
"""A class to load and write small tables in Parquet format."""
|
|
31
|
+
|
|
32
|
+
def __init__(
|
|
33
|
+
self,
|
|
34
|
+
):
|
|
35
|
+
"""Initialize the ParquetTableBackend."""
|
|
36
|
+
super().__init__(
|
|
37
|
+
lf_reader=pl.scan_parquet,
|
|
38
|
+
df_reader=pd.read_parquet,
|
|
39
|
+
lf_writer=write_lf_to_parquet,
|
|
40
|
+
df_writer=write_df_to_parquet,
|
|
41
|
+
table_name="table.parquet",
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
@staticmethod
|
|
45
|
+
def backend_name() -> str:
|
|
46
|
+
"""Return the name of the backend."""
|
|
47
|
+
return "experimental_parquet_v1"
|