lamindb 0.76.6__py3-none-any.whl → 0.76.8__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (61)
  1. lamindb/__init__.py +113 -113
  2. lamindb/_artifact.py +1205 -1174
  3. lamindb/_can_validate.py +579 -579
  4. lamindb/_collection.py +387 -382
  5. lamindb/_curate.py +1601 -1601
  6. lamindb/_feature.py +155 -155
  7. lamindb/_feature_set.py +242 -242
  8. lamindb/_filter.py +23 -23
  9. lamindb/_finish.py +256 -256
  10. lamindb/_from_values.py +382 -382
  11. lamindb/_is_versioned.py +40 -40
  12. lamindb/_parents.py +476 -476
  13. lamindb/_query_manager.py +125 -125
  14. lamindb/_query_set.py +362 -362
  15. lamindb/_record.py +649 -649
  16. lamindb/_run.py +57 -57
  17. lamindb/_save.py +308 -295
  18. lamindb/_storage.py +14 -14
  19. lamindb/_transform.py +127 -127
  20. lamindb/_ulabel.py +56 -56
  21. lamindb/_utils.py +9 -9
  22. lamindb/_view.py +72 -72
  23. lamindb/core/__init__.py +94 -93
  24. lamindb/core/_context.py +574 -558
  25. lamindb/core/_data.py +438 -438
  26. lamindb/core/_feature_manager.py +867 -866
  27. lamindb/core/_label_manager.py +253 -252
  28. lamindb/core/_mapped_collection.py +597 -597
  29. lamindb/core/_settings.py +187 -187
  30. lamindb/core/_sync_git.py +138 -138
  31. lamindb/core/_track_environment.py +27 -27
  32. lamindb/core/datasets/__init__.py +59 -59
  33. lamindb/core/datasets/_core.py +571 -571
  34. lamindb/core/datasets/_fake.py +36 -36
  35. lamindb/core/exceptions.py +90 -77
  36. lamindb/core/fields.py +12 -12
  37. lamindb/core/loaders.py +164 -0
  38. lamindb/core/schema.py +56 -56
  39. lamindb/core/storage/__init__.py +25 -25
  40. lamindb/core/storage/_anndata_accessor.py +740 -740
  41. lamindb/core/storage/_anndata_sizes.py +41 -41
  42. lamindb/core/storage/_backed_access.py +98 -98
  43. lamindb/core/storage/_tiledbsoma.py +204 -196
  44. lamindb/core/storage/_valid_suffixes.py +21 -21
  45. lamindb/core/storage/_zarr.py +110 -110
  46. lamindb/core/storage/objects.py +62 -62
  47. lamindb/core/storage/paths.py +172 -245
  48. lamindb/core/subsettings/__init__.py +12 -12
  49. lamindb/core/subsettings/_creation_settings.py +38 -38
  50. lamindb/core/subsettings/_transform_settings.py +21 -21
  51. lamindb/core/types.py +19 -19
  52. lamindb/core/versioning.py +158 -158
  53. lamindb/integrations/__init__.py +12 -12
  54. lamindb/integrations/_vitessce.py +107 -107
  55. lamindb/setup/__init__.py +14 -14
  56. lamindb/setup/core/__init__.py +4 -4
  57. {lamindb-0.76.6.dist-info → lamindb-0.76.8.dist-info}/LICENSE +201 -201
  58. {lamindb-0.76.6.dist-info → lamindb-0.76.8.dist-info}/METADATA +5 -5
  59. lamindb-0.76.8.dist-info/RECORD +60 -0
  60. {lamindb-0.76.6.dist-info → lamindb-0.76.8.dist-info}/WHEEL +1 -1
  61. lamindb-0.76.6.dist-info/RECORD +0 -59
lamindb/_collection.py CHANGED
@@ -1,382 +1,387 @@
 from __future__ import annotations
 
 from collections import defaultdict
 from typing import (
     TYPE_CHECKING,
     Any,
     Iterable,
     Literal,
 )
 
 import anndata as ad
 import lamindb_setup as ln_setup
 import pandas as pd
 from lamin_utils import logger
 from lamindb_setup.core._docs import doc_args
 from lamindb_setup.core.hashing import hash_set
 from lnschema_core.models import (
     Collection,
     CollectionArtifact,
     FeatureSet,
 )
 from lnschema_core.types import VisibilityChoice
 
 from lamindb._utils import attach_func_to_class_method
 from lamindb.core._data import _track_run_input, describe, view_lineage
 from lamindb.core._mapped_collection import MappedCollection
 from lamindb.core.versioning import process_revises
 
 from . import Artifact, Run
 from ._record import init_self_from_db, update_attributes
 from .core._data import (
     add_transform_to_kwargs,
     get_run,
     save_feature_set_links,
     save_feature_sets,
 )
 from .core._settings import settings
 
 if TYPE_CHECKING:
     from lamindb.core.storage import UPath
 
     from ._query_set import QuerySet
 
 
 class CollectionFeatureManager:
     """Query features of artifact in collection."""
 
     def __init__(self, collection: Collection):
         self._collection = collection
 
     def get_feature_sets_union(self) -> dict[str, FeatureSet]:
         links_feature_set_artifact = Artifact.feature_sets.through.objects.filter(
             artifact_id__in=self._collection.artifacts.values_list("id", flat=True)
         )
         feature_sets_by_slots = defaultdict(list)
         for link in links_feature_set_artifact:
             feature_sets_by_slots[link.slot].append(link.featureset_id)
         feature_sets_union = {}
         for slot, feature_set_ids_slot in feature_sets_by_slots.items():
             feature_set_1 = FeatureSet.get(id=feature_set_ids_slot[0])
             related_name = feature_set_1._get_related_name()
             features_registry = getattr(FeatureSet, related_name).field.model
             # this way of writing the __in statement turned out to be the fastest
             # evaluated on a link table with 16M entries connecting 500 feature sets with
             # 60k genes
             feature_ids = (
                 features_registry.feature_sets.through.objects.filter(
                     featureset_id__in=feature_set_ids_slot
                 )
                 .values(f"{features_registry.__name__.lower()}_id")
                 .distinct()
             )
             features = features_registry.filter(id__in=feature_ids)
             feature_sets_union[slot] = FeatureSet(features, dtype=feature_set_1.dtype)
         return feature_sets_union
 
 
 def __init__(
     collection: Collection,
     *args,
     **kwargs,
 ):
     collection.features = CollectionFeatureManager(collection)
     if len(args) == len(collection._meta.concrete_fields):
         super(Collection, collection).__init__(*args, **kwargs)
         return None
     # now we proceed with the user-facing constructor
     if len(args) > 1:
         raise ValueError("Only one non-keyword arg allowed: artifacts")
     artifacts: Artifact | Iterable[Artifact] = (
         kwargs.pop("artifacts") if len(args) == 0 else args[0]
     )
     meta_artifact: Artifact | None = (
         kwargs.pop("meta_artifact") if "meta_artifact" in kwargs else None
     )
     name: str | None = kwargs.pop("name") if "name" in kwargs else None
     description: str | None = (
         kwargs.pop("description") if "description" in kwargs else None
     )
     reference: str | None = kwargs.pop("reference") if "reference" in kwargs else None
     reference_type: str | None = (
         kwargs.pop("reference_type") if "reference_type" in kwargs else None
     )
     run: Run | None = kwargs.pop("run") if "run" in kwargs else None
     revises: Collection | None = kwargs.pop("revises") if "revises" in kwargs else None
     version: str | None = kwargs.pop("version") if "version" in kwargs else None
     visibility: int | None = (
         kwargs.pop("visibility")
         if "visibility" in kwargs
         else VisibilityChoice.default.value
     )
     if "is_new_version_of" in kwargs:
         logger.warning("`is_new_version_of` will be removed soon, please use `revises`")
         revises = kwargs.pop("is_new_version_of")
     if not len(kwargs) == 0:
         raise ValueError(
             f"Only artifacts, name, run, description, reference, reference_type, visibility can be passed, you passed: {kwargs}"
         )
     provisional_uid, version, name, revises = process_revises(
         revises, version, name, Collection
     )
     run = get_run(run)
     if isinstance(artifacts, Artifact):
         artifacts = [artifacts]
     else:
         if not hasattr(artifacts, "__getitem__"):
             raise ValueError("Artifact or List[Artifact] is allowed.")
         assert isinstance(artifacts[0], Artifact)  # type: ignore  # noqa: S101
     hash = from_artifacts(artifacts)  # type: ignore
     if meta_artifact is not None:
         if not isinstance(meta_artifact, Artifact):
             raise ValueError("meta_artifact has to be an Artifact")
         if isinstance(meta_artifact, Artifact):
             if meta_artifact._state.adding:
                 raise ValueError(
                     "Save meta_artifact artifact before creating collection!"
                 )
     # we ignore collections in trash containing the same hash
     if hash is not None:
         existing_collection = Collection.filter(hash=hash).one_or_none()
     else:
         existing_collection = None
     if existing_collection is not None:
         logger.warning(
             f"returning existing collection with same hash: {existing_collection}"
         )
         # update the run of the existing artifact
         if run is not None:
             # save the information that this artifact was previously
             # produced by another run
             if existing_collection.run is not None:
                 existing_collection.run._output_collections_with_later_updates.add(
                     existing_collection
                 )
             # update the run of the artifact with the latest run
             existing_collection.run = run
             existing_collection.transform = run.transform
         init_self_from_db(collection, existing_collection)
         update_attributes(collection, {"description": description, "name": name})
     else:
         kwargs = {}
         add_transform_to_kwargs(kwargs, run)
         search_names_setting = settings.creation.search_names
         if revises is not None and name == revises.name:
             settings.creation.search_names = False
         super(Collection, collection).__init__(
             uid=provisional_uid,
             name=name,
             description=description,
             reference=reference,
             reference_type=reference_type,
             meta_artifact=meta_artifact,
             hash=hash,
             run=run,
             version=version,
             visibility=visibility,
             revises=revises,
             **kwargs,
         )
         settings.creation.search_names = search_names_setting
     collection._artifacts = artifacts
     # register provenance
     if revises is not None:
         _track_run_input(revises, run=run)
     _track_run_input(artifacts, run=run)
 
 
 # internal function, not exposed to user
 def from_artifacts(artifacts: Iterable[Artifact]) -> tuple[str, dict[str, str]]:
     # assert all artifacts are already saved
     saved = not any(artifact._state.adding for artifact in artifacts)
     if not saved:
         raise ValueError("Not all artifacts are yet saved, please save them")
     # validate consistency of hashes - we do not allow duplicate hashes
     hashes = [artifact.hash for artifact in artifacts if artifact.hash is not None]
     hashes_set = set(hashes)
     if len(hashes) != len(hashes_set):
         seen = set()
         non_unique = [x for x in hashes if x in seen or seen.add(x)]  # type: ignore
         raise ValueError(
             "Please pass artifacts with distinct hashes: these ones are non-unique"
             f" {non_unique}"
         )
     hash = hash_set(hashes_set)
     return hash
 
 
 # docstring handled through attach_func_to_class_method
 def mapped(
     self,
     layers_keys: str | list[str] | None = None,
     obs_keys: str | list[str] | None = None,
     obsm_keys: str | list[str] | None = None,
     join: Literal["inner", "outer"] | None = "inner",
     encode_labels: bool | list[str] = True,
     unknown_label: str | dict[str, str] | None = None,
     cache_categories: bool = True,
     parallel: bool = False,
     dtype: str | None = None,
     stream: bool = False,
     is_run_input: bool | None = None,
 ) -> MappedCollection:
     path_list = []
-    for artifact in self.ordered_artifacts.all():
+    if self._state.adding:
+        artifacts = self._artifacts
+        logger.warning("The collection isn't saved, consider calling `.save()`")
+    else:
+        artifacts = self.ordered_artifacts.all()
+    for artifact in artifacts:
         if artifact.suffix not in {".h5ad", ".zarr"}:
             logger.warning(f"Ignoring artifact with suffix {artifact.suffix}")
             continue
         elif not stream:
             path_list.append(artifact.cache())
         else:
             path_list.append(artifact.path)
     ds = MappedCollection(
         path_list,
         layers_keys,
         obs_keys,
         obsm_keys,
         join,
         encode_labels,
         unknown_label,
         cache_categories,
         parallel,
         dtype,
     )
     # track only if successful
     _track_run_input(self, is_run_input)
     return ds
 
 
 # docstring handled through attach_func_to_class_method
 def cache(self, is_run_input: bool | None = None) -> list[UPath]:
     path_list = []
     for artifact in self.ordered_artifacts.all():
         path_list.append(artifact.cache())
     _track_run_input(self, is_run_input)
     return path_list
 
 
 # docstring handled through attach_func_to_class_method
 def load(
     self,
     join: Literal["inner", "outer"] = "outer",
     is_run_input: bool | None = None,
     **kwargs,
 ) -> Any:
     # cannot call _track_run_input here, see comment further down
     all_artifacts = self.ordered_artifacts.all()
     suffixes = [artifact.suffix for artifact in all_artifacts]
     if len(set(suffixes)) != 1:
         raise RuntimeError(
             "Can only load collections where all artifacts have the same suffix"
         )
     # because we're tracking data flow on the collection-level, here, we don't
     # want to track it on the artifact-level
     objects = [artifact.load(is_run_input=False) for artifact in all_artifacts]
     artifact_uids = [artifact.uid for artifact in all_artifacts]
     if isinstance(objects[0], pd.DataFrame):
         concat_object = pd.concat(objects, join=join)
     elif isinstance(objects[0], ad.AnnData):
         concat_object = ad.concat(
             objects, join=join, label="artifact_uid", keys=artifact_uids
         )
     # only call it here because there might be errors during concat
     _track_run_input(self, is_run_input)
     return concat_object
 
 
 # docstring handled through attach_func_to_class_method
 def delete(self, permanent: bool | None = None) -> None:
     # change visibility to trash
     trash_visibility = VisibilityChoice.trash.value
     if self.visibility > trash_visibility and permanent is not True:
         self.visibility = trash_visibility
         self.save()
         logger.warning(f"moved collection to trash (visibility = {trash_visibility})")
         return
 
     # permanent delete
     if permanent is None:
         response = input(
             "Collection record is already in trash! Are you sure to delete it from your"
             " database? (y/n) You can't undo this action."
         )
         delete_record = response == "y"
     else:
         delete_record = permanent
 
     if delete_record:
         super(Collection, self).delete()
 
 
 # docstring handled through attach_func_to_class_method
 def save(self, using: str | None = None) -> Collection:
     if self.meta_artifact is not None:
         self.meta_artifact.save()
     # we don't need to save feature sets again
     save_feature_sets(self)
     super(Collection, self).save()
     # we don't allow updating the collection of artifacts
     # if users want to update the set of artifacts, they
     # have to create a new collection
     if hasattr(self, "_artifacts"):
         links = [
             CollectionArtifact(collection_id=self.id, artifact_id=artifact.id)
             for artifact in self._artifacts
         ]
         # the below seems to preserve the order of the list in the
         # auto-incrementing integer primary
         # merely using .artifacts.set(*...) doesn't achieve this
         # we need ignore_conflicts=True so that this won't error if links already exist
         CollectionArtifact.objects.bulk_create(links, ignore_conflicts=True)
     save_feature_set_links(self)
     if using is not None:
         logger.warning("using argument is ignored")
     return self
 
 
 # docstring handled through attach_func_to_class_method
 def restore(self) -> None:
     self.visibility = VisibilityChoice.default.value
     self.save()
 
 
 @property  # type: ignore
 @doc_args(Collection.ordered_artifacts.__doc__)
 def ordered_artifacts(self) -> QuerySet:
     """{}"""  # noqa: D415
     return self.artifacts.order_by("links_collection__id")
 
 
 @property  # type: ignore
 @doc_args(Collection.data_artifact.__doc__)
 def data_artifact(self) -> Artifact | None:
     """{}"""  # noqa: D415
     return self.artifacts.first()
 
 
 METHOD_NAMES = [
     "__init__",
     "mapped",
     "cache",
     "load",
     "delete",
     "save",
     "restore",
 ]
 
 if ln_setup._TESTING:
     from inspect import signature
 
     SIGS = {
         name: signature(getattr(Collection, name))
         for name in METHOD_NAMES
         if name != "__init__"
     }
 
 for name in METHOD_NAMES:
     attach_func_to_class_method(name, Collection, globals())
 
 Collection.ordered_artifacts = ordered_artifacts
 Collection.data_artifact = data_artifact
 Collection.describe = describe
 Collection.view_lineage = view_lineage
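
The only functional change in lamindb/_collection.py between 0.76.6 and 0.76.8 is the guard at the top of mapped(): a collection that has not been saved yet no longer iterates the database-backed ordered_artifacts relation (which requires a saved record) but falls back to the in-memory _artifacts list and emits a warning. Below is a minimal usage sketch of that behavior, not code from the diff; it assumes a configured lamindb instance that already holds saved .h5ad artifacts with a cell_type column in obs, and the collection name is purely illustrative.

import lamindb as ln

# saved AnnData artifacts in the current instance (assumed to exist)
artifacts = ln.Artifact.filter(suffix=".h5ad").all()

# construct the collection but do not save it yet
collection = ln.Collection(artifacts, name="scrna-batches")

# 0.76.8 warns "The collection isn't saved, consider calling `.save()`"
# and maps over the in-memory `_artifacts` instead of `ordered_artifacts`
ds = collection.mapped(obs_keys=["cell_type"])
print(len(ds))  # total number of observations across the mapped artifacts
ds.close()      # release the underlying file handles

Once the collection is saved, mapped() reverts to the ordered_artifacts query, whose ordering follows the auto-incrementing id of the collection-artifact link table (see ordered_artifacts in the diff above).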