ngio 0.2.9__py3-none-any.whl → 0.3.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,34 +1,54 @@
1
1
  """Ngio Tables backend implementations."""
2
2
 
3
3
  from ngio.tables.backends._abstract_backend import AbstractTableBackend, BackendMeta
4
+ from ngio.tables.backends._anndata import AnnDataBackend
5
+ from ngio.tables.backends._csv import CsvTableBackend
6
+ from ngio.tables.backends._json import JsonTableBackend
7
+ from ngio.tables.backends._parquet import ParquetTableBackend
4
8
  from ngio.tables.backends._table_backends import (
5
9
  ImplementedTableBackends,
10
+ TableBackend,
6
11
  TableBackendProtocol,
7
12
  )
8
13
  from ngio.tables.backends._utils import (
14
+ TabularData,
9
15
  convert_anndata_to_pandas,
10
16
  convert_anndata_to_polars,
11
17
  convert_pandas_to_anndata,
12
18
  convert_pandas_to_polars,
13
19
  convert_polars_to_anndata,
14
20
  convert_polars_to_pandas,
21
+ convert_to_anndata,
22
+ convert_to_pandas,
23
+ convert_to_polars,
15
24
  normalize_anndata,
16
25
  normalize_pandas_df,
17
26
  normalize_polars_lf,
27
+ normalize_table,
18
28
  )
19
29
 
20
30
  __all__ = [
21
31
  "AbstractTableBackend",
32
+ "AnnDataBackend",
22
33
  "BackendMeta",
34
+ "CsvTableBackend",
23
35
  "ImplementedTableBackends",
36
+ "JsonTableBackend",
37
+ "ParquetTableBackend",
38
+ "TableBackend",
24
39
  "TableBackendProtocol",
40
+ "TabularData",
25
41
  "convert_anndata_to_pandas",
26
42
  "convert_anndata_to_polars",
27
43
  "convert_pandas_to_anndata",
28
44
  "convert_pandas_to_polars",
29
45
  "convert_polars_to_anndata",
30
46
  "convert_polars_to_pandas",
47
+ "convert_to_anndata",
48
+ "convert_to_pandas",
49
+ "convert_to_polars",
31
50
  "normalize_anndata",
32
51
  "normalize_pandas_df",
33
52
  "normalize_polars_lf",
53
+ "normalize_table",
34
54
  ]
@@ -5,15 +5,13 @@ from anndata import AnnData
5
5
  from pandas import DataFrame
6
6
  from polars import DataFrame as PolarsDataFrame
7
7
  from polars import LazyFrame
8
- from pydantic import BaseModel
8
+ from pydantic import BaseModel, ConfigDict
9
9
 
10
10
  from ngio.tables.backends._utils import (
11
- convert_anndata_to_pandas,
12
- convert_anndata_to_polars,
13
- convert_pandas_to_anndata,
14
- convert_pandas_to_polars,
15
- convert_polars_to_anndata,
16
- convert_polars_to_pandas,
11
+ TabularData,
12
+ convert_to_anndata,
13
+ convert_to_pandas,
14
+ convert_to_polars,
17
15
  )
18
16
  from ngio.utils import NgioValueError, ZarrGroupHandler
19
17
 
@@ -21,29 +19,30 @@ from ngio.utils import NgioValueError, ZarrGroupHandler
21
19
  class BackendMeta(BaseModel):
22
20
  """Metadata for the backend."""
23
21
 
24
- backend: str | None = None
22
+ backend: str = "anndata"
25
23
  index_key: str | None = None
26
24
  index_type: Literal["int", "str"] | None = None
27
25
 
26
+ model_config = ConfigDict(extra="allow")
27
+
28
28
 
29
29
  class AbstractTableBackend(ABC):
30
30
  """Abstract class for table backends."""
31
31
 
32
- def __init__(
32
+ def set_group_handler(
33
33
  self,
34
34
  group_handler: ZarrGroupHandler,
35
35
  index_key: str | None = None,
36
36
  index_type: Literal["int", "str"] | None = None,
37
- ):
38
- """Initialize the handler.
37
+ ) -> None:
38
+ """Attach a group handler to the backend.
39
39
 
40
- This is a base class for the table backends protocol.
40
+ Index keys and index types are used to ensure that the
41
+ serialization and deserialization of the table
42
+ is consistent across different backends.
41
43
 
42
- Args:
43
- group_handler (ZarrGroupHandler): An object to handle the Zarr group
44
- containing the table data.
45
- index_key (str): The column name to use as the index of the DataFrame.
46
- index_type (str): The type of the index column in the DataFrame.
44
+ Making sure that this is consistent is
45
+ a duty of the backend implementations.
47
46
  """
48
47
  self._group_handler = group_handler
49
48
  self._index_key = index_key
@@ -67,7 +66,11 @@ class AbstractTableBackend(ABC):
67
66
  """Check if the backend implements the anndata protocol.
68
67
 
69
68
  If this is True, the backend should implement the
70
- `load_as_anndata` and `write_from_anndata` methods.
69
+ `write_from_anndata` method.
70
+
71
+ AnnData objects are more complex than DataFrames,
72
+ so if this is true the backend should implement the
73
+ full serialization of the AnnData object.
71
74
 
72
75
  If this is False, these methods should raise a
73
76
  `NotImplementedError`.
@@ -80,7 +83,7 @@ class AbstractTableBackend(ABC):
80
83
  """Check if the backend implements the pandas protocol.
81
84
 
82
85
  If this is True, the backend should implement the
83
- `load_as_dataframe` and `write_from_dataframe` methods.
86
+ `write_from_dataframe` methods.
84
87
 
85
88
  If this is False, these methods should raise a
86
89
  `NotImplementedError`.
@@ -93,7 +96,7 @@ class AbstractTableBackend(ABC):
93
96
  """Check if the backend implements the polars protocol.
94
97
 
95
98
  If this is True, the backend should implement the
96
- `load_as_polars` and `write_from_polars` methods.
99
+ `write_from_polars` methods.
97
100
 
98
101
  If this is False, these methods should raise a
99
102
  `NotImplementedError`.
@@ -122,6 +125,16 @@ class AbstractTableBackend(ABC):
122
125
  )
123
126
  return self._index_type # type: ignore[return-value]
124
127
 
128
+ @abstractmethod
129
+ def load(self) -> TabularData:
130
+ """Load the table from the store.
131
+
132
+ This is a generic load method.
133
+ Based on the explicit mode or the type of the table,
134
+ it will call the appropriate load method.
135
+ """
136
+ ...
137
+
125
138
  def load_as_anndata(self) -> AnnData:
126
139
  """Load the table as an AnnData object.
127
140
 
@@ -129,70 +142,35 @@ class AbstractTableBackend(ABC):
129
142
  selecting columns is not implemented, because it is not
130
143
  straightforward to do so for an arbitrary AnnData object.
131
144
  """
132
- if self.implements_pandas():
133
- return convert_pandas_to_anndata(
134
- self.load_as_pandas_df(),
135
- index_key=self.index_key,
136
- )
137
- elif self.implements_polars():
138
- return convert_polars_to_anndata(
139
- self.load_as_polars_lf(),
140
- index_key=self.index_key,
141
- )
142
- else:
143
- raise NgioValueError(
144
- "Backend does not implement any of the protocols. "
145
- "A backend should implement at least one of the "
146
- "following protocols: anndata, pandas, polars."
147
- )
145
+ table = self.load()
146
+ return convert_to_anndata(
147
+ table,
148
+ index_key=self.index_key,
149
+ )
148
150
 
149
151
  def load_as_pandas_df(self) -> DataFrame:
150
152
  """Load the table as a pandas DataFrame.
151
153
 
152
154
  If columns are provided, the table should be filtered
153
155
  """
154
- if self.implements_anndata():
155
- return convert_anndata_to_pandas(
156
- self.load_as_anndata(),
157
- index_key=self.index_key,
158
- index_type=self.index_type,
159
- )
160
- elif self.implements_polars():
161
- return convert_polars_to_pandas(
162
- self.load_as_polars_lf(),
163
- index_key=self.index_key,
164
- index_type=self.index_type,
165
- )
166
- else:
167
- raise NgioValueError(
168
- "Backend does not implement any of the protocols. "
169
- "A backend should implement at least one of the "
170
- "following protocols: anndata, pandas, polars."
171
- )
156
+ table = self.load()
157
+ return convert_to_pandas(
158
+ table,
159
+ index_key=self.index_key,
160
+ index_type=self.index_type,
161
+ )
172
162
 
173
163
  def load_as_polars_lf(self) -> LazyFrame:
174
164
  """Load the table as a polars LazyFrame.
175
165
 
176
166
  If columns are provided, the table should be filtered
177
167
  """
178
- if self.implements_anndata():
179
- return convert_anndata_to_polars(
180
- self.load_as_anndata(),
181
- index_key=self.index_key,
182
- index_type=self.index_type,
183
- ).lazy()
184
- elif self.implements_pandas():
185
- return convert_pandas_to_polars(
186
- self.load_as_pandas_df(),
187
- index_key=self.index_key,
188
- index_type=self.index_type,
189
- ).lazy()
190
- else:
191
- raise NgioValueError(
192
- "Backend does not implement any of the protocols. "
193
- "A backend should implement at least one of the "
194
- "following protocols: anndata, pandas, polars."
195
- )
168
+ table = self.load()
169
+ return convert_to_polars(
170
+ table,
171
+ index_key=self.index_key,
172
+ index_type=self.index_type,
173
+ )
196
174
 
197
175
  def write_from_pandas(self, table: DataFrame) -> None:
198
176
  """Serialize the table from a pandas DataFrame."""
@@ -230,7 +208,7 @@ class AbstractTableBackend(ABC):
230
208
 
231
209
  def write(
232
210
  self,
233
- table: DataFrame | AnnData | PolarsDataFrame | LazyFrame,
211
+ table_data: TabularData,
234
212
  metadata: dict | None = None,
235
213
  mode: Literal["pandas", "anndata", "polars"] | None = None,
236
214
  ) -> None:
@@ -240,15 +218,15 @@ class AbstractTableBackend(ABC):
240
218
  Based on the explicit mode or the type of the table,
241
219
  it will call the appropriate write method.
242
220
  """
243
- if mode == "pandas" or isinstance(table, DataFrame):
244
- self.write_from_pandas(table) # type: ignore[arg-type]
245
- elif mode == "anndata" or isinstance(table, AnnData):
246
- self.write_from_anndata(table) # type: ignore[arg-type]
247
- elif mode == "polars" or isinstance(table, PolarsDataFrame | LazyFrame):
248
- self.write_from_polars(table)
221
+ if mode == "pandas" or isinstance(table_data, DataFrame):
222
+ self.write_from_pandas(table_data) # type: ignore[arg-type]
223
+ elif mode == "anndata" or isinstance(table_data, AnnData):
224
+ self.write_from_anndata(table_data) # type: ignore[arg-type]
225
+ elif mode == "polars" or isinstance(table_data, PolarsDataFrame | LazyFrame):
226
+ self.write_from_polars(table_data)
249
227
  else:
250
228
  raise NgioValueError(
251
- f"Unsupported table type {type(table)}. "
229
+ f"Unsupported table type {type(table_data)}. "
252
230
  "Please specify the mode explicitly. "
253
231
  "Supported serialization modes are: "
254
232
  "'pandas', 'anndata', 'polars'."
@@ -21,7 +21,7 @@ class AnnDataBackend(AbstractTableBackend):
21
21
  @staticmethod
22
22
  def backend_name() -> str:
23
23
  """Return the name of the backend."""
24
- return "anndata_v1"
24
+ return "anndata"
25
25
 
26
26
  @staticmethod
27
27
  def implements_anndata() -> bool:
@@ -44,6 +44,10 @@ class AnnDataBackend(AbstractTableBackend):
44
44
  anndata = normalize_anndata(anndata, index_key=self.index_key)
45
45
  return anndata
46
46
 
47
+ def load(self) -> AnnData:
48
+ """Load the table as an AnnData object."""
49
+ return self.load_as_anndata()
50
+
47
51
  def write_from_anndata(self, table: AnnData) -> None:
48
52
  """Serialize the table from an AnnData object."""
49
53
  full_url = self._group_handler.full_url
@@ -0,0 +1,35 @@
1
+ import pandas as pd
2
+ import polars as pl
3
+
4
+ from ngio.tables.backends._non_zarr_backends import NonZarrBaseBackend
5
+
6
+
7
+ def write_lf_to_csv(path: str, table: pl.DataFrame) -> None:
8
+ """Write a polars DataFrame to a CSV file."""
9
+ table.write_csv(path)
10
+
11
+
12
+ def write_df_to_csv(path: str, table: pd.DataFrame) -> None:
13
+ """Write a pandas DataFrame to a CSV file."""
14
+ table.to_csv(path, index=False)
15
+
16
+
17
+ class CsvTableBackend(NonZarrBaseBackend):
18
+ """A class to load and write small tables in CSV format."""
19
+
20
+ def __init__(
21
+ self,
22
+ ):
23
+ """Initialize the CsvTableBackend."""
24
+ super().__init__(
25
+ lf_reader=pl.scan_csv,
26
+ df_reader=pd.read_csv,
27
+ lf_writer=write_lf_to_csv,
28
+ df_writer=write_df_to_csv,
29
+ table_name="table.csv",
30
+ )
31
+
32
+ @staticmethod
33
+ def backend_name() -> str:
34
+ """Return the name of the backend."""
35
+ return "csv"
@@ -17,7 +17,7 @@ class JsonTableBackend(AbstractTableBackend):
17
17
  @staticmethod
18
18
  def backend_name() -> str:
19
19
  """Return the name of the backend."""
20
- return "experimental_json_v1"
20
+ return "json"
21
21
 
22
22
  @staticmethod
23
23
  def implements_anndata() -> bool:
@@ -61,6 +61,9 @@ class JsonTableBackend(AbstractTableBackend):
61
61
  )
62
62
  return data_frame
63
63
 
64
+ def load(self) -> DataFrame:
65
+ return self.load_as_pandas_df()
66
+
64
67
  def _write_from_dict(self, table: dict) -> None:
65
68
  """Write the table from a dictionary to the store."""
66
69
  table_group = self._get_table_group()
@@ -1,7 +1,7 @@
1
1
  import io
2
+ from collections.abc import Callable
3
+ from typing import Any
2
4
 
3
- import pandas as pd
4
- import polars as pl
5
5
  from pandas import DataFrame
6
6
  from polars import DataFrame as PolarsDataFrame
7
7
  from polars import LazyFrame
@@ -12,15 +12,22 @@ from ngio.tables.backends._utils import normalize_pandas_df, normalize_polars_lf
12
12
  from ngio.utils import NgioFileNotFoundError, NgioValueError
13
13
 
14
14
 
15
- class CsvTableBackend(AbstractTableBackend):
15
+ class NonZarrBaseBackend(AbstractTableBackend):
16
16
  """A class to load and write small tables in CSV format."""
17
17
 
18
- csv_name = "table.csv"
19
-
20
- @staticmethod
21
- def backend_name() -> str:
22
- """Return the name of the backend."""
23
- return "experimental_csv_v1"
18
+ def __init__(
19
+ self,
20
+ df_reader: Callable[[Any], DataFrame],
21
+ lf_reader: Callable[[Any], LazyFrame],
22
+ df_writer: Callable[[str, DataFrame], None],
23
+ lf_writer: Callable[[str, PolarsDataFrame], None],
24
+ table_name: str,
25
+ ):
26
+ self.df_reader = df_reader
27
+ self.lf_reader = lf_reader
28
+ self.df_writer = df_writer
29
+ self.lf_writer = lf_writer
30
+ self.table_name = table_name
24
31
 
25
32
  @staticmethod
26
33
  def implements_anndata() -> bool:
@@ -37,38 +44,58 @@ class CsvTableBackend(AbstractTableBackend):
37
44
  """Whether the handler implements the polars protocol."""
38
45
  return True
39
46
 
47
+ @staticmethod
48
+ def backend_name() -> str:
49
+ """Return the name of the backend."""
50
+ raise NotImplementedError(
51
+ "The backend_name method must be implemented in the subclass."
52
+ )
53
+
40
54
  def _load_from_directory_store(self, reader):
41
55
  """Load the table from a directory store."""
42
56
  url = self._group_handler.full_url
43
57
  if url is None:
58
+ ext = self.table_name.split(".")[-1]
44
59
  raise NgioValueError(
45
- f"Ngio does not support reading a CSV file from a "
60
+ f"Ngio does not support reading a {ext} table from a "
46
61
  f"store of type {type(self._group_handler)}. "
47
62
  "Please make sure to use a compatible "
48
63
  "store like a zarr.DirectoryStore."
49
64
  )
50
- csv_path = f"{url}/{self.csv_name}"
51
- dataframe = reader(csv_path)
65
+ table_path = f"{url}/{self.table_name}"
66
+ dataframe = reader(table_path)
52
67
  return dataframe
53
68
 
54
- def _load_from_fs_store(self, reader):
69
+ def _load_from_fs_store_df(self, reader):
55
70
  """Load the table from an FS store."""
56
- bytes_table = self._group_handler.store.get(self.csv_name)
71
+ path = self._group_handler.group.path
72
+ table_path = f"{path}/{self.table_name}"
73
+ bytes_table = self._group_handler.store.get(table_path)
57
74
  if bytes_table is None:
58
- raise NgioFileNotFoundError(f"No table found at {self.csv_name}. ")
75
+ raise NgioFileNotFoundError(f"No table found at {table_path}. ")
59
76
  dataframe = reader(io.BytesIO(bytes_table))
60
77
  return dataframe
61
78
 
79
+ def _load_from_fs_store_lf(self, reader):
80
+ """Load the table from an FS store."""
81
+ full_url = self._group_handler.full_url
82
+ parquet_path = f"{full_url}/{self.table_name}"
83
+ store_fs = self._group_handler.store.fs # type: ignore
84
+ with store_fs.open(parquet_path, "rb") as f:
85
+ dataframe = reader(f)
86
+ return dataframe
87
+
62
88
  def load_as_pandas_df(self) -> DataFrame:
63
89
  """Load the table as a pandas DataFrame."""
64
90
  store = self._group_handler.store
65
91
  if isinstance(store, DirectoryStore):
66
- dataframe = self._load_from_directory_store(reader=pd.read_csv)
92
+ dataframe = self._load_from_directory_store(reader=self.df_reader)
67
93
  elif isinstance(store, FSStore):
68
- dataframe = self._load_from_fs_store(reader=pd.read_csv)
94
+ dataframe = self._load_from_fs_store_df(reader=self.df_reader)
69
95
  else:
96
+ ext = self.table_name.split(".")[-1]
70
97
  raise NgioValueError(
71
- f"Ngio does not support reading a CSV file from a "
98
+ f"Ngio does not support reading a {ext} table from a "
72
99
  f"store of type {type(store)}. "
73
100
  "Please make sure to use a compatible "
74
101
  "store like a zarr.DirectoryStore or "
@@ -83,16 +110,21 @@ class CsvTableBackend(AbstractTableBackend):
83
110
  )
84
111
  return dataframe
85
112
 
113
+ def load(self) -> DataFrame:
114
+ """Load the table as a pandas DataFrame."""
115
+ return self.load_as_pandas_df()
116
+
86
117
  def load_as_polars_lf(self) -> LazyFrame:
87
118
  """Load the table as a polars LazyFrame."""
88
119
  store = self._group_handler.store
89
120
  if isinstance(store, DirectoryStore):
90
- lazy_frame = self._load_from_directory_store(reader=pl.scan_csv)
121
+ lazy_frame = self._load_from_directory_store(reader=self.lf_reader)
91
122
  elif isinstance(store, FSStore):
92
- lazy_frame = self._load_from_fs_store(reader=pl.scan_csv)
123
+ lazy_frame = self._load_from_fs_store_lf(reader=self.lf_reader)
93
124
  else:
125
+ ext = self.table_name.split(".")[-1]
94
126
  raise NgioValueError(
95
- f"Ngio does not support reading a CSV file from a "
127
+ f"Ngio does not support reading a {ext} from a "
96
128
  f"store of type {type(store)}. "
97
129
  "Please make sure to use a compatible "
98
130
  "store like a zarr.DirectoryStore or "
@@ -117,16 +149,18 @@ class CsvTableBackend(AbstractTableBackend):
117
149
  if isinstance(store, DirectoryStore):
118
150
  full_url = self._group_handler.full_url
119
151
  else:
152
+ ext = self.table_name.split(".")[-1]
120
153
  raise NgioValueError(
121
- f"Ngio does not support writing a CSV file to a "
154
+ f"Ngio does not support writing a {ext} file to a "
122
155
  f"store of type {type(store)}. "
123
156
  "Please make sure to use a compatible "
124
157
  "store like a zarr.DirectoryStore or "
125
158
  "zarr.FSStore."
126
159
  )
127
160
  if full_url is None:
161
+ ext = self.table_name.split(".")[-1]
128
162
  raise NgioValueError(
129
- f"Ngio does not support writing a CSV file to a "
163
+ f"Ngio does not support writing a {ext} file to a "
130
164
  f"store of type {type(store)}. "
131
165
  "Please make sure to use a compatible "
132
166
  "store like a zarr.DirectoryStore or "
@@ -143,8 +177,8 @@ class CsvTableBackend(AbstractTableBackend):
143
177
  reset_index=True,
144
178
  )
145
179
  full_url = self._get_store_url()
146
- csv_path = f"{full_url}/{self.csv_name}"
147
- table.to_csv(csv_path, index=False)
180
+ table_path = f"{full_url}/{self.table_name}"
181
+ self.df_writer(table_path, table)
148
182
 
149
183
  def write_from_polars(self, table: PolarsDataFrame | LazyFrame) -> None:
150
184
  """Write the table from a polars DataFrame or LazyFrame."""
@@ -158,5 +192,5 @@ class CsvTableBackend(AbstractTableBackend):
158
192
  table = table.collect()
159
193
 
160
194
  full_url = self._get_store_url()
161
- csv_path = f"{full_url}/{self.csv_name}"
162
- table.write_csv(csv_path)
195
+ table_path = f"{full_url}/{self.table_name}"
196
+ self.lf_writer(table_path, table)
@@ -0,0 +1,47 @@
1
+ import pandas as pd
2
+ import polars as pl
3
+
4
+ from ngio.tables.backends._non_zarr_backends import NonZarrBaseBackend
5
+
6
+
7
+ def write_lf_to_parquet(path: str, table: pl.DataFrame) -> None:
8
+ """Write a polars DataFrame to a Parquet file."""
9
+ # make categorical into string (for pandas compatibility)
10
+ schema = table.collect_schema()
11
+
12
+ categorical_columns = []
13
+ for name, dtype in zip(schema.names(), schema.dtypes(), strict=True):
14
+ if dtype == pl.Categorical:
15
+ categorical_columns.append(name)
16
+
17
+ for col in categorical_columns:
18
+ table = table.with_columns(pl.col(col).cast(pl.Utf8))
19
+
20
+ # write to parquet
21
+ table.write_parquet(path)
22
+
23
+
24
+ def write_df_to_parquet(path: str, table: pd.DataFrame) -> None:
25
+ """Write a pandas DataFrame to a Parquet file."""
26
+ table.to_parquet(path, index=False)
27
+
28
+
29
+ class ParquetTableBackend(NonZarrBaseBackend):
30
+ """A class to load and write small tables in Parquet format."""
31
+
32
+ def __init__(
33
+ self,
34
+ ):
35
+ """Initialize the ParquetTableBackend."""
36
+ super().__init__(
37
+ lf_reader=pl.scan_parquet,
38
+ df_reader=pd.read_parquet,
39
+ lf_writer=write_lf_to_parquet,
40
+ df_writer=write_df_to_parquet,
41
+ table_name="table.parquet",
42
+ )
43
+
44
+ @staticmethod
45
+ def backend_name() -> str:
46
+ """Return the name of the backend."""
47
+ return "parquet"