scdataloader 1.6.3__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scdataloader/mapped.py ADDED
@@ -0,0 +1,700 @@
1
+ from __future__ import annotations
2
+
3
+ from collections import Counter
4
+ from functools import reduce
5
+ from typing import TYPE_CHECKING, Literal
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ from lamindb_setup.core.upath import UPath
10
+
11
+ from lamindb.core.storage._anndata_accessor import (
12
+ ArrayType,
13
+ ArrayTypes,
14
+ GroupType,
15
+ GroupTypes,
16
+ StorageType,
17
+ _safer_read_index,
18
+ get_spec,
19
+ registry,
20
+ )
21
+
22
+ if TYPE_CHECKING:
23
+ from lamindb_setup.core.types import UPathStr
24
+
25
+
26
class _Connect:
    """Context manager that yields a readable store for one storage entry.

    If ``storage`` is a ``UPath`` it is opened with ``registry.open("h5py", ...)``
    and both returned handles are closed again on exit. Any other object is
    handed back unchanged and is left open on exit.
    """

    def __init__(self, storage):
        self.to_close = isinstance(storage, UPath)
        if self.to_close:
            self.conn, self.store = registry.open("h5py", storage)
        else:
            self.conn, self.store = None, storage

    def __enter__(self):
        return self.store

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def close(self):
        """Close the store, then the connection, but only if opened here."""
        if self.to_close:
            for handle in (self.store, self.conn):
                if hasattr(handle, "close"):
                    handle.close()
48
+
49
+
50
def mapped(
    self,
    layers_keys: str | list[str] | None = None,
    obs_keys: str | list[str] | None = None,
    obsm_keys: str | list[str] | None = None,
    obs_filter: tuple[str, str | tuple[str, ...]] | None = None,
    join: Literal["inner", "outer"] | None = "inner",
    encode_labels: bool | list[str] = True,
    unknown_label: str | dict[str, str] | None = None,
    cache_categories: bool = True,
    parallel: bool = False,
    dtype: str | None = None,
    stream: bool = False,
    is_run_input: bool | None = None,
    meta_assays: list[str] = ["EFO:0022857", "EFO:0010961"],
) -> MappedCollection:
    """Build a :class:`MappedCollection` from the artifacts of a collection.

    Only artifacts with suffix ``.h5ad`` or ``.zarr`` are used; others are
    skipped with a printed notice.

    Args:
        self: The collection object. ``self._artifacts`` is used when
            ``self._state.adding`` is true, ``self.ordered_artifacts.all()``
            otherwise.
        layers_keys, obs_keys, obsm_keys, obs_filter, join, encode_labels,
            unknown_label, cache_categories, parallel, dtype: Forwarded
            unchanged to :class:`MappedCollection`.
        stream: If ``False`` each artifact is resolved with ``artifact.cache()``;
            if ``True`` ``artifact.path`` is used instead.
        is_run_input: Accepted for interface compatibility; not used by this
            function.
        meta_assays: Forwarded to :class:`MappedCollection` (as a copy).

    Returns:
        The constructed :class:`MappedCollection`.
    """
    if self._state.adding:
        artifacts = self._artifacts
        print("The collection isn't saved, consider calling `.save()`")
    else:
        artifacts = self.ordered_artifacts.all()

    path_list = []
    for artifact in artifacts:
        if artifact.suffix not in {".h5ad", ".zarr"}:
            print(f"Ignoring artifact with suffix {artifact.suffix}")
            continue
        path_list.append(artifact.path if stream else artifact.cache())

    ds = MappedCollection(
        path_list,
        layers_keys,
        obs_keys,
        obsm_keys,
        obs_filter,
        join,
        encode_labels,
        unknown_label,
        cache_categories,
        parallel,
        dtype,
        # Previously dropped: the caller's choice never reached the dataset.
        # Copy so the shared default list is never handed out by reference.
        meta_assays=list(meta_assays),
    )
    return ds
95
+
96
+
97
+ _decode = np.frompyfunc(lambda x: x.decode("utf-8"), 1, 1)
98
+
99
+
100
+ class MappedCollection:
101
+ """Map-style collection for use in data loaders.
102
+
103
+ This class virtually concatenates `AnnData` arrays as a `pytorch map-style dataset
104
+ <https://pytorch.org/docs/stable/data.html#map-style-datasets>`__.
105
+
106
+ If your `AnnData` collection is in the cloud, move them into a local cache
107
+ first for faster access.
108
+
109
+ `__getitem__` of the `MappedCollection` object takes a single integer index
110
+ and returns a dictionary with the observation data sample for this index from
111
+ the `AnnData` objects in `path_list`. The dictionary has keys for `layers_keys`
112
+ (`.X` is in `"X"`), `obs_keys`, `obsm_keys` (under `f"obsm_{key}"`) and also `"_store_idx"`
113
+ for the index of the `AnnData` object containing this observation sample.
114
+
115
+ .. note::
116
+
117
+ For a guide, see :doc:`docs:scrna-mappedcollection`.
118
+
119
+ For more convenient use within :class:`~lamindb.core.MappedCollection`,
120
+ see :meth:`~lamindb.Collection.mapped`.
121
+
122
+ This currently only works for collections of `AnnData` objects.
123
+
124
+ The implementation was influenced by the `SCimilarity
125
+ <https://github.com/Genentech/scimilarity>`__ data loader.
126
+
127
+
128
+ Args:
129
+ path_list: A list of paths to `AnnData` objects stored in `.h5ad` or `.zarr` formats.
130
+ layers_keys: Keys from the ``.layers`` slot. ``layers_keys=None`` or ``"X"`` in the list
131
+ retrieves ``.X``.
132
+ obsm_keys: Keys from the ``.obsm`` slots.
133
+ obs_keys: Keys from the ``.obs`` slots.
134
+ obs_filter: Select only observations with these values for the given obs column.
135
+ Should be a tuple with an obs column name as the first element
136
+ and filtering values (a string or a tuple of strings) as the second element.
137
+ join: `"inner"` or `"outer"` virtual joins. If ``None`` is passed,
138
+ does not join.
139
+ encode_labels: Encode labels into integers.
140
+ Can be a list with elements from ``obs_keys``.
141
+ unknown_label: Encode this label to -1.
142
+ Can be a dictionary with keys from ``obs_keys`` if ``encode_labels=True``
143
+ or from ``encode_labels`` if it is a list.
144
+ cache_categories: Enable caching categories of ``obs_keys`` for faster access.
145
+ parallel: Enable sampling with multiple processes.
146
+ dtype: Convert numpy arrays from ``.X``, ``.layers`` and ``.obsm`` to this dtype.
147
+ """
148
+
149
+ def __init__(
150
+ self,
151
+ path_list: list[UPathStr],
152
+ layers_keys: str | list[str] | None = None,
153
+ obs_keys: str | list[str] | None = None,
154
+ obsm_keys: str | list[str] | None = None,
155
+ obs_filter: tuple[str, str | tuple[str, ...]] | None = None,
156
+ join: Literal["inner", "outer"] | None = "inner",
157
+ encode_labels: bool | list[str] = True,
158
+ unknown_label: str | dict[str, str] | None = None,
159
+ cache_categories: bool = True,
160
+ parallel: bool = False,
161
+ dtype: str | None = None,
162
+ metacell_mode: float = 0.0,
163
+ meta_assays: list[str] = ["EFO:0022857", "EFO:0010961"],
164
+ ):
165
+ if join not in {None, "inner", "outer"}: # pragma: nocover
166
+ raise ValueError(
167
+ f"join must be one of None, 'inner, or 'outer' but was {type(join)}"
168
+ )
169
+
170
+ self.filtered = obs_filter is not None
171
+ if self.filtered and len(obs_filter) != 2:
172
+ raise ValueError(
173
+ "obs_filter should be a tuple with obs column name "
174
+ "as the first element and filtering values as the second element"
175
+ )
176
+
177
+ if layers_keys is None:
178
+ self.layers_keys = ["X"]
179
+ else:
180
+ self.layers_keys = (
181
+ [layers_keys] if isinstance(layers_keys, str) else layers_keys
182
+ )
183
+
184
+ obsm_keys = [obsm_keys] if isinstance(obsm_keys, str) else obsm_keys
185
+ self.obsm_keys = obsm_keys
186
+
187
+ obs_keys = [obs_keys] if isinstance(obs_keys, str) else obs_keys
188
+ self.obs_keys = obs_keys
189
+
190
+ if isinstance(encode_labels, list):
191
+ if len(encode_labels) == 0:
192
+ encode_labels = False
193
+ elif obs_keys is None or not all(
194
+ enc_label in obs_keys for enc_label in encode_labels
195
+ ):
196
+ raise ValueError(
197
+ "All elements of `encode_labels` should be in `obs_keys`."
198
+ )
199
+ else:
200
+ if encode_labels:
201
+ encode_labels = obs_keys if obs_keys is not None else False
202
+ self.encode_labels = encode_labels
203
+
204
+ if encode_labels and isinstance(unknown_label, dict):
205
+ if not all(unkey in encode_labels for unkey in unknown_label): # type: ignore
206
+ raise ValueError(
207
+ "All keys of `unknown_label` should be in `encode_labels` and `obs_keys`."
208
+ )
209
+ self.unknown_label = unknown_label
210
+
211
+ self.storages = [] # type: ignore
212
+ self.conns = [] # type: ignore
213
+ self.parallel = parallel
214
+ self.metacell_mode = metacell_mode
215
+ self.path_list = path_list
216
+ self.meta_assays = meta_assays
217
+ self._make_connections(path_list, parallel)
218
+
219
+ self._cache_cats: dict = {}
220
+ if self.obs_keys is not None:
221
+ if cache_categories:
222
+ self._cache_categories(self.obs_keys)
223
+ self.encoders: dict = {}
224
+ if self.encode_labels:
225
+ self._make_encoders(self.encode_labels) # type: ignore
226
+
227
+ self.n_obs_list = []
228
+ self.indices_list = []
229
+ for i, storage in enumerate(self.storages):
230
+ with _Connect(storage) as store:
231
+ X = store["X"]
232
+ store_path = self.path_list[i]
233
+ self._check_csc_raise_error(X, "X", store_path)
234
+ if self.filtered:
235
+ obs_filter_key, obs_filter_values = obs_filter
236
+ indices_storage = np.where(
237
+ np.isin(
238
+ self._get_labels(store, obs_filter_key), obs_filter_values
239
+ )
240
+ )[0]
241
+ n_obs_storage = len(indices_storage)
242
+ else:
243
+ if isinstance(X, ArrayTypes): # type: ignore
244
+ n_obs_storage = X.shape[0]
245
+ else:
246
+ n_obs_storage = X.attrs["shape"][0]
247
+ indices_storage = np.arange(n_obs_storage)
248
+ self.n_obs_list.append(n_obs_storage)
249
+ self.indices_list.append(indices_storage)
250
+ for layer_key in self.layers_keys:
251
+ if layer_key == "X":
252
+ continue
253
+ self._check_csc_raise_error(
254
+ store["layers"][layer_key],
255
+ f"layers/{layer_key}",
256
+ store_path,
257
+ )
258
+ if self.obsm_keys is not None:
259
+ for obsm_key in self.obsm_keys:
260
+ self._check_csc_raise_error(
261
+ store["obsm"][obsm_key],
262
+ f"obsm/{obsm_key}",
263
+ store_path,
264
+ )
265
+ self.n_obs = sum(self.n_obs_list)
266
+
267
+ self.indices = np.hstack(self.indices_list)
268
+ self.storage_idx = np.repeat(np.arange(len(self.storages)), self.n_obs_list)
269
+
270
+ self.join_vars: Literal["inner", "outer"] | None = join
271
+ self.var_indices: list | None = None
272
+ self.var_joint: pd.Index | None = None
273
+ self.n_vars_list: list | None = None
274
+ self.var_list: list | None = None
275
+ self.n_vars: int | None = None
276
+ if self.join_vars is not None:
277
+ self._make_join_vars()
278
+ self.n_vars = len(self.var_joint)
279
+
280
+ self._dtype = dtype
281
+ self._closed = False
282
+
283
def _make_connections(self, path_list: list, parallel: bool):
    """Fill ``self.conns`` and ``self.storages`` with one entry per path.

    A path that exists and is a file is opened with the ``"h5py"`` backend,
    or, when ``parallel`` is true, stored as the bare path with a ``None``
    connection. Every other path is opened with the ``"zarr"`` backend.
    """
    for raw_path in path_list:
        upath = UPath(raw_path)
        is_single_file = upath.exists() and upath.is_file()  # type: ignore
        if not is_single_file:
            conn, storage = registry.open("zarr", upath)
        elif parallel:
            conn, storage = None, upath
        else:
            conn, storage = registry.open("h5py", upath)
        self.conns.append(conn)
        self.storages.append(storage)
295
+
296
def _cache_categories(self, obs_keys: list):
    """Read the categories of every key in ``obs_keys`` from every storage.

    The result is stored in ``self._cache_cats`` as ``{key: [cats per storage]}``.
    An entry is ``None`` when ``_get_categories`` finds no categories for that
    key in that storage; byte categories are decoded to ``str``.
    """
    self._cache_cats = {}
    for label in obs_keys:
        per_storage = []
        for storage in self.storages:
            with _Connect(storage) as store:
                cats = self._get_categories(store, label)
                if cats is not None:
                    # Guard the ``cats[0]`` probe: a column may have zero categories.
                    if len(cats) > 0 and isinstance(cats[0], bytes):
                        cats = _decode(cats)
                    else:
                        cats = cats[...]
            per_storage.append(cats)
        self._cache_cats[label] = per_storage
308
+
309
+ def _make_encoders(self, encode_labels: list):
310
+ for label in encode_labels:
311
+ cats = self.get_merged_categories(label)
312
+ encoder = {}
313
+ if isinstance(self.unknown_label, dict):
314
+ unknown_label = self.unknown_label.get(label, None)
315
+ else:
316
+ unknown_label = self.unknown_label
317
+ if unknown_label is not None and unknown_label in cats:
318
+ cats.remove(unknown_label)
319
+ encoder[unknown_label] = -1
320
+ encoder.update({cat: i for i, cat in enumerate(cats)})
321
+ self.encoders[label] = encoder
322
+
323
def _read_vars(self):
    """Load the ``var`` index of every storage.

    Populates ``self.var_list`` (one index per storage, read with
    ``_safer_read_index``) and ``self.n_vars_list`` (their lengths).
    """
    self.var_list = []
    self.n_vars_list = []
    for storage in self.storages:
        with _Connect(storage) as store:
            var_index = _safer_read_index(store["var"])
            self.var_list.append(var_index)
            self.n_vars_list.append(len(var_index))
331
+
332
+ def _make_join_vars(self):
333
+ if self.var_list is None:
334
+ self._read_vars()
335
+ vars_eq = all(self.var_list[0].equals(vrs) for vrs in self.var_list[1:])
336
+ if vars_eq:
337
+ self.join_vars = None
338
+ self.var_joint = self.var_list[0]
339
+ return
340
+
341
+ if self.join_vars == "inner":
342
+ self.var_joint = reduce(pd.Index.intersection, self.var_list)
343
+ if len(self.var_joint) == 0:
344
+ raise ValueError(
345
+ "The provided AnnData objects don't have shared varibales.\n"
346
+ "Use join='outer'."
347
+ )
348
+ self.var_indices = [
349
+ vrs.get_indexer(self.var_joint) for vrs in self.var_list
350
+ ]
351
+ elif self.join_vars == "outer":
352
+ self.var_joint = reduce(pd.Index.union, self.var_list)
353
+ self.var_indices = [
354
+ self.var_joint.get_indexer(vrs) for vrs in self.var_list
355
+ ]
356
+
357
+ def check_vars_sorted(self, ascending: bool = True) -> bool:
358
+ """Returns `True` if all variables are sorted in all objects."""
359
+ if self.var_list is None:
360
+ self._read_vars()
361
+ if ascending:
362
+ vrs_sort_status = (vrs.is_monotonic_increasing for vrs in self.var_list)
363
+ else:
364
+ vrs_sort_status = (vrs.is_monotonic_decreasing for vrs in self.var_list)
365
+ return all(vrs_sort_status)
366
+
367
+ def check_vars_non_aligned(self, vars: pd.Index | list) -> list[int]:
368
+ """Returns indices of objects with non-aligned variables.
369
+
370
+ Args:
371
+ vars: Check alignment against these variables.
372
+ """
373
+ if self.var_list is None:
374
+ self._read_vars()
375
+ vars = pd.Index(vars)
376
+ return [i for i, vrs in enumerate(self.var_list) if not vrs.equals(vars)]
377
+
378
def _check_csc_raise_error(
    self, elem: GroupType | ArrayType, key: str, path: UPathStr
):
    """Reject csc-encoded matrices.

    Array-typed elements pass straight through. For any other element the
    encoding type reported by ``get_spec`` is inspected; if it is
    ``"csc_matrix"`` the open connections are closed (unless running in
    parallel mode) and a ``ValueError`` is raised.
    """
    if isinstance(elem, ArrayTypes):  # type: ignore
        return
    if get_spec(elem).encoding_type != "csc_matrix":
        return
    if not self.parallel:
        self.close()
    raise ValueError(
        f"{key} in {path} is a csc matrix, `MappedCollection` doesn't support this format yet."
    )
389
+
390
+ def __len__(self):
391
+ return self.n_obs
392
+
393
+ @property
394
+ def shape(self) -> tuple[int, int]:
395
+ """Shape of the (virtually aligned) dataset."""
396
+ return (self.n_obs, self.n_vars)
397
+
398
+ @property
399
+ def original_shapes(self) -> list[tuple[int, int]]:
400
+ """Shapes of the underlying AnnData objects."""
401
+ if self.n_vars_list is None:
402
+ n_vars_list = [None] * len(self.n_obs_list)
403
+ else:
404
+ n_vars_list = self.n_vars_list
405
+ return list(zip(self.n_obs_list, n_vars_list))
406
+
407
def __getitem__(self, idx: int):
    """Return the sample at position ``idx`` as a dictionary.

    Keys of the result:
        * one entry per key in ``layers_keys`` (``"X"`` for ``.X``),
        * ``f"obsm_{key}"`` for every key in ``obsm_keys``,
        * ``"_store_idx"``: index of the storage the sample comes from,
        * one entry per key in ``obs_keys`` (encoded if an encoder exists),
        * ``"is_meta"``: ``True`` if the sample's ``assay_ontology_term_id``
          is in ``meta_assays`` or if metacell aggregation was applied.

    With ``metacell_mode > 0`` the sample is, with that probability, turned
    into a metacell: up to three observations selected from its row of
    ``obsp["distances"]`` are added to the last requested layer.
    """
    obs_idx = self.indices[idx]
    storage_idx = self.storage_idx[idx]
    var_idxs_join = (
        self.var_indices[storage_idx] if self.var_indices is not None else None
    )

    with _Connect(self.storages[storage_idx]) as store:
        out = {}
        # Track the last layer and its data source explicitly; the metacell
        # step below must not pick up whatever a later loop last touched.
        last_layer_key = None
        last_layer_data = None
        for layers_key in self.layers_keys:
            layer_data = (
                store["X"] if layers_key == "X" else store["layers"][layers_key]
            )
            out[layers_key] = self._get_data_idx(
                layer_data, obs_idx, self.join_vars, var_idxs_join, self.n_vars
            )
            last_layer_key, last_layer_data = layers_key, layer_data

        if self.obsm_keys is not None:
            for obsm_key in self.obsm_keys:
                out[f"obsm_{obsm_key}"] = self._get_data_idx(
                    store["obsm"][obsm_key], obs_idx
                )
        out["_store_idx"] = storage_idx

        raw_labels = {}
        if self.obs_keys is not None:
            for label in self.obs_keys:
                if label in self._cache_cats:
                    cats = self._cache_cats[label][storage_idx]
                    if cats is None:
                        cats = []
                else:
                    cats = None
                label_idx = self._get_obs_idx(store, obs_idx, label, cats)
                raw_labels[label] = label_idx
                if label in self.encoders:
                    label_idx = self.encoders[label][label_idx]
                out[label] = label_idx

        out["is_meta"] = False
        assay_key = "assay_ontology_term_id"
        # ``raw_labels`` is empty when ``obs_keys`` is None, so this no longer
        # raises for the default configuration.
        if len(self.meta_assays) > 0 and assay_key in raw_labels:
            # Compare the un-encoded label: the encoded integer can never equal
            # an ontology id string. The encoded value is still checked so that
            # anything that matched before keeps matching.
            if (
                raw_labels[assay_key] in self.meta_assays
                or out[assay_key] in self.meta_assays
            ):
                out["is_meta"] = True
                return out

        if self.metacell_mode > 0:
            if np.random.random() < self.metacell_mode:
                out["is_meta"] = True
                distances = self._get_data_idx(store["obsp"]["distances"], obs_idx)
                # Sort key is -1/(d - 1e-6): positive distances come first in
                # increasing order, zero entries last.
                nn_idx = np.argsort(-1 / (distances - 1e-6))[:3]
                for i in nn_idx:
                    out[last_layer_key] += self._get_data_idx(
                        last_layer_data,
                        i,
                        self.join_vars,
                        var_idxs_join,
                        self.n_vars,
                    )

    return out
459
+
460
def _get_data_idx(
    self,
    lazy_data: ArrayType | GroupType,
    idx: int,
    join_vars: Literal["inner", "outer"] | None = None,
    var_idxs_join: list | None = None,
    n_vars_out: int | None = None,
):
    """Read row ``idx`` of ``lazy_data`` as a dense array.

    Array-typed data is indexed directly; anything else is treated as a csr
    group with ``data`` / ``indices`` / ``indptr`` members. With
    ``join_vars="outer"`` the values are scattered into a zero array of length
    ``n_vars_out`` at ``var_idxs_join``; with ``"inner"`` the row is
    sub-selected with ``var_idxs_join``. ``self._dtype``, if set, is the dtype
    of the result.
    """
    target_dtype = self._dtype

    if isinstance(lazy_data, ArrayTypes):  # type: ignore
        row = lazy_data[idx]  # type: ignore
        if join_vars == "outer":
            out_dtype = row.dtype if target_dtype is None else target_dtype
            dense = np.zeros(n_vars_out, dtype=out_dtype)
            dense[var_idxs_join] = row
            return dense
        # no join -> the row as is; inner join -> only the shared variables
        dense = row if join_vars is None else row[var_idxs_join]
        if target_dtype is not None:
            dense = dense.astype(target_dtype, copy=False)
        return dense

    # assume csr_matrix here
    data = lazy_data["data"]  # type: ignore
    indices = lazy_data["indices"]  # type: ignore
    indptr = lazy_data["indptr"]  # type: ignore
    bounds = slice(*(indptr[idx : idx + 2]))
    values = data[bounds]
    out_dtype = values.dtype if target_dtype is None else target_dtype
    columns = indices[bounds]
    if join_vars == "outer":
        dense = np.zeros(n_vars_out, dtype=out_dtype)
        dense[var_idxs_join[columns]] = values
        return dense
    dense = np.zeros(lazy_data.attrs["shape"][1], dtype=out_dtype)  # type: ignore
    dense[columns] = values
    if join_vars == "inner":
        dense = dense[var_idxs_join]
    return dense
500
+
501
def _get_obs_idx(
    self,
    storage: StorageType,
    idx: int,
    label_key: str,
    categories: list | None = None,
):
    """Return the label stored under ``label_key`` for observation ``idx``.

    The raw value (or categorical code) is read from ``storage["obs"]``. If
    non-empty categories are available, either passed in or looked up with
    ``_get_categories``, the value is used as an index into them. A ``bytes``
    result is decoded as UTF-8.
    """
    obs = storage["obs"]  # type: ignore
    # how backwards compatible do we want to be here actually?
    if isinstance(obs, ArrayTypes):  # type: ignore
        field_position = obs.dtype.names.index(label_key)
        label = obs[idx][field_position]
    else:
        column = obs[label_key]
        if isinstance(column, ArrayTypes):  # type: ignore
            label = column[idx]
        else:
            label = column["codes"][idx]

    cats = (
        categories
        if categories is not None
        else self._get_categories(storage, label_key)
    )
    if cats is not None and len(cats) > 0:
        label = cats[label]
    if isinstance(label, bytes):
        label = label.decode("utf-8")
    return label
528
+
529
+ def get_label_weights(
530
+ self,
531
+ obs_keys: str | list[str],
532
+ scaler: float | None = None,
533
+ return_categories: bool = False,
534
+ ):
535
+ """Get all weights for the given label keys.
536
+
537
+ This counts the number of labels for each label and returns
538
+ weights for each obs label accoding to the formula `1 / num of this label in the data`.
539
+ If `scaler` is provided, then `scaler / (scaler + num of this label in the data)`.
540
+
541
+ Args:
542
+ obs_keys: A key in the ``.obs`` slots or a list of keys. If a list is provided,
543
+ the labels from the obs keys will be concatenated with ``"__"`` delimeter
544
+ scaler: Use this number to scale the provided weights.
545
+ return_categories: If `False`, returns weights for each observation,
546
+ can be directly passed to a sampler. If `True`, returns a dictionary with
547
+ unique categories for labels (concatenated if `obs_keys` is a list)
548
+ and their weights.
549
+ """
550
+ if isinstance(obs_keys, str):
551
+ obs_keys = [obs_keys]
552
+ labels_list = []
553
+ for label_key in obs_keys:
554
+ labels_to_str = self.get_merged_labels(label_key).astype(str).astype("O")
555
+ labels_list.append(labels_to_str)
556
+ if len(labels_list) > 1:
557
+ labels = ["__".join(labels_obs) for labels_obs in zip(*labels_list)]
558
+ else:
559
+ labels = labels_list[0]
560
+ counter = Counter(labels)
561
+ MIN, MAX = counter.values().min(), counter.values().max()
562
+ if return_categories:
563
+ return {
564
+ k: 1.0 / v
565
+ if scaler is None
566
+ else (MAX / scaler) / ((1 + v - MIN) + MAX / scaler)
567
+ for k, v in counter.items()
568
+ }
569
+ counts = np.array([counter[label] for label in labels])
570
+ if scaler is None:
571
+ weights = 1.0 / counts
572
+ else:
573
+ weights = (MAX / scaler) / ((1 + counts - MIN) + MAX / scaler)
574
+ return weights
575
+
576
def get_merged_labels(self, label_key: str):
    """Get merged labels for `label_key` from all `.obs`.

    Labels are read per storage, restricted to the selected observations
    when an obs filter is active, and concatenated in storage order.
    """
    merged = []
    for position, storage in enumerate(self.storages):
        with _Connect(storage) as store:
            store_labels = self._get_labels(store, label_key, storage_idx=position)
            if self.filtered:
                store_labels = store_labels[self.indices_list[position]]
            merged.append(store_labels)
    return np.hstack(merged)
586
+
587
def get_merged_categories(self, label_key: str):
    """Get merged categories for `label_key` from all `.obs`.

    For every storage the categories are taken from the cache if present,
    otherwise read with ``_get_categories``. When a storage has no categories
    for the key, its raw values from ``_get_codes`` are used instead. Byte
    values are decoded.

    Returns:
        A sorted list of the distinct values over all storages.
    """
    merged = set()
    for i, storage in enumerate(self.storages):
        with _Connect(storage) as store:
            if label_key in self._cache_cats:
                cats = self._cache_cats[label_key][i]
            else:
                cats = self._get_categories(store, label_key)
            values = cats if cats is not None else self._get_codes(store, label_key)
            # Guard the ``values[0]`` probe so an empty array is not an error.
            if len(values) > 0 and isinstance(values[0], bytes):
                values = _decode(values)
            merged.update(values)
    return sorted(merged)
604
+
605
def _get_categories(self, storage: StorageType, label_key: str):
    """Look up the categories of ``label_key``; ``None`` if there are none.

    Lookup order, depending on how ``obs`` is stored:
        * ``obs`` is an array: ``uns[f"{label_key}_categories"]``;
        * ``obs`` has a ``"__categories"`` member: ``obs["__categories"][label_key]``;
        * the column is a group: its ``"categories"`` member;
        * otherwise: the ``"categories"`` attribute of the column.
    """
    obs = storage["obs"]  # type: ignore

    if isinstance(obs, ArrayTypes):  # type: ignore
        uns = storage["uns"]  # type: ignore
        uns_key = f"{label_key}_categories"
        return uns[uns_key] if uns_key in uns else None

    if "__categories" in obs:
        shared = obs["__categories"]
        return shared[label_key] if label_key in shared else None

    labels = obs[label_key]
    holder = labels if isinstance(labels, GroupTypes) else labels.attrs  # type: ignore
    return holder["categories"] if "categories" in holder else None
633
+
634
def _get_codes(self, storage: StorageType, label_key: str):
    """Read the full column ``label_key`` from ``storage["obs"]``.

    Returns the stored values for a plain array column, or the ``"codes"``
    member for a group-encoded (categorical) column.
    """
    obs = storage["obs"]  # type: ignore
    label = obs[label_key]
    # When ``obs`` itself is an array (legacy structured layout) the field read
    # above is the column. The previous code had no usable return on this path.
    # NOTE(review): assumes reading a field of a structured ``obs`` yields
    # something indexable with ``[...]`` - confirm against a legacy file.
    if isinstance(obs, ArrayTypes) or isinstance(label, ArrayTypes):  # type: ignore
        return label[...]
    return label["codes"][...]
645
+
646
def _get_labels(
    self, storage: StorageType, label_key: str, storage_idx: int | None = None
):
    """Return the labels of all observations of one storage for ``label_key``.

    The raw values from ``_get_codes`` are decoded if they are bytes. If
    non-empty categories exist, from the cache when ``storage_idx`` is given
    and the key is cached, otherwise via ``_get_categories``, the values are
    used as indices into them.
    """
    codes = self._get_codes(storage, label_key)
    # ``len(...) > 0`` guards keep empty arrays from failing on the ``[0]`` probe.
    if len(codes) > 0 and isinstance(codes[0], bytes):
        labels = _decode(codes)
    else:
        labels = codes

    if storage_idx is not None and label_key in self._cache_cats:
        cats = self._cache_cats[label_key][storage_idx]
    else:
        cats = self._get_categories(storage, label_key)

    # Same non-empty condition as in ``_get_obs_idx``.
    if cats is not None and len(cats) > 0:
        if isinstance(cats[0], bytes):
            cats = _decode(cats)
        labels = cats[labels]
    return labels
660
+
661
+ def close(self):
662
+ """Close connections to array streaming backend.
663
+
664
+ No effect if `parallel=True`.
665
+ """
666
+ for storage in self.storages:
667
+ if hasattr(storage, "close"):
668
+ storage.close()
669
+ for conn in self.conns:
670
+ if hasattr(conn, "close"):
671
+ conn.close()
672
+ self._closed = True
673
+
674
+ @property
675
+ def closed(self) -> bool:
676
+ """Check if connections to array streaming backend are closed.
677
+
678
+ Does not matter if `parallel=True`.
679
+ """
680
+ return self._closed
681
+
682
+ def __enter__(self):
683
+ return self
684
+
685
+ def __exit__(self, exc_type, exc_val, exc_tb):
686
+ self.close()
687
+
688
@staticmethod
def torch_worker_init_fn(worker_id):
    """`worker_init_fn` for `torch.utils.data.DataLoader`.

    Switches the worker's dataset copy to non-parallel mode and opens a fresh
    set of connections for it.

    Improves performance for `num_workers > 1`.
    """
    from torch.utils.data import get_worker_info

    dataset = get_worker_info().dataset
    dataset.parallel = False
    dataset.storages = []
    dataset.conns = []
    dataset._make_connections(dataset.path_list, parallel=False)