lamindb 0.76.8__py3-none-any.whl → 0.76.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. lamindb/__init__.py +113 -113
  2. lamindb/_artifact.py +1205 -1205
  3. lamindb/_can_validate.py +579 -579
  4. lamindb/_collection.py +389 -387
  5. lamindb/_curate.py +1601 -1601
  6. lamindb/_feature.py +155 -155
  7. lamindb/_feature_set.py +242 -242
  8. lamindb/_filter.py +23 -23
  9. lamindb/_finish.py +256 -256
  10. lamindb/_from_values.py +382 -382
  11. lamindb/_is_versioned.py +40 -40
  12. lamindb/_parents.py +476 -476
  13. lamindb/_query_manager.py +125 -125
  14. lamindb/_query_set.py +362 -362
  15. lamindb/_record.py +649 -649
  16. lamindb/_run.py +57 -57
  17. lamindb/_save.py +308 -308
  18. lamindb/_storage.py +14 -14
  19. lamindb/_transform.py +127 -127
  20. lamindb/_ulabel.py +56 -56
  21. lamindb/_utils.py +9 -9
  22. lamindb/_view.py +72 -72
  23. lamindb/core/__init__.py +94 -94
  24. lamindb/core/_context.py +574 -574
  25. lamindb/core/_data.py +438 -438
  26. lamindb/core/_feature_manager.py +867 -867
  27. lamindb/core/_label_manager.py +253 -253
  28. lamindb/core/_mapped_collection.py +631 -597
  29. lamindb/core/_settings.py +187 -187
  30. lamindb/core/_sync_git.py +138 -138
  31. lamindb/core/_track_environment.py +27 -27
  32. lamindb/core/datasets/__init__.py +59 -59
  33. lamindb/core/datasets/_core.py +581 -571
  34. lamindb/core/datasets/_fake.py +36 -36
  35. lamindb/core/exceptions.py +90 -90
  36. lamindb/core/fields.py +12 -12
  37. lamindb/core/loaders.py +164 -164
  38. lamindb/core/schema.py +56 -56
  39. lamindb/core/storage/__init__.py +25 -25
  40. lamindb/core/storage/_anndata_accessor.py +740 -740
  41. lamindb/core/storage/_anndata_sizes.py +41 -41
  42. lamindb/core/storage/_backed_access.py +98 -98
  43. lamindb/core/storage/_tiledbsoma.py +204 -204
  44. lamindb/core/storage/_valid_suffixes.py +21 -21
  45. lamindb/core/storage/_zarr.py +110 -110
  46. lamindb/core/storage/objects.py +62 -62
  47. lamindb/core/storage/paths.py +172 -172
  48. lamindb/core/subsettings/__init__.py +12 -12
  49. lamindb/core/subsettings/_creation_settings.py +38 -38
  50. lamindb/core/subsettings/_transform_settings.py +21 -21
  51. lamindb/core/types.py +19 -19
  52. lamindb/core/versioning.py +158 -158
  53. lamindb/integrations/__init__.py +12 -12
  54. lamindb/integrations/_vitessce.py +107 -107
  55. lamindb/setup/__init__.py +14 -14
  56. lamindb/setup/core/__init__.py +4 -4
  57. {lamindb-0.76.8.dist-info → lamindb-0.76.9.dist-info}/LICENSE +201 -201
  58. {lamindb-0.76.8.dist-info → lamindb-0.76.9.dist-info}/METADATA +4 -4
  59. lamindb-0.76.9.dist-info/RECORD +60 -0
  60. {lamindb-0.76.8.dist-info → lamindb-0.76.9.dist-info}/WHEEL +1 -1
  61. lamindb-0.76.8.dist-info/RECORD +0 -60
lamindb/_collection.py CHANGED
@@ -1,387 +1,389 @@
1
- from __future__ import annotations
2
-
3
- from collections import defaultdict
4
- from typing import (
5
- TYPE_CHECKING,
6
- Any,
7
- Iterable,
8
- Literal,
9
- )
10
-
11
- import anndata as ad
12
- import lamindb_setup as ln_setup
13
- import pandas as pd
14
- from lamin_utils import logger
15
- from lamindb_setup.core._docs import doc_args
16
- from lamindb_setup.core.hashing import hash_set
17
- from lnschema_core.models import (
18
- Collection,
19
- CollectionArtifact,
20
- FeatureSet,
21
- )
22
- from lnschema_core.types import VisibilityChoice
23
-
24
- from lamindb._utils import attach_func_to_class_method
25
- from lamindb.core._data import _track_run_input, describe, view_lineage
26
- from lamindb.core._mapped_collection import MappedCollection
27
- from lamindb.core.versioning import process_revises
28
-
29
- from . import Artifact, Run
30
- from ._record import init_self_from_db, update_attributes
31
- from .core._data import (
32
- add_transform_to_kwargs,
33
- get_run,
34
- save_feature_set_links,
35
- save_feature_sets,
36
- )
37
- from .core._settings import settings
38
-
39
- if TYPE_CHECKING:
40
- from lamindb.core.storage import UPath
41
-
42
- from ._query_set import QuerySet
43
-
44
-
45
- class CollectionFeatureManager:
46
- """Query features of artifact in collection."""
47
-
48
- def __init__(self, collection: Collection):
49
- self._collection = collection
50
-
51
- def get_feature_sets_union(self) -> dict[str, FeatureSet]:
52
- links_feature_set_artifact = Artifact.feature_sets.through.objects.filter(
53
- artifact_id__in=self._collection.artifacts.values_list("id", flat=True)
54
- )
55
- feature_sets_by_slots = defaultdict(list)
56
- for link in links_feature_set_artifact:
57
- feature_sets_by_slots[link.slot].append(link.featureset_id)
58
- feature_sets_union = {}
59
- for slot, feature_set_ids_slot in feature_sets_by_slots.items():
60
- feature_set_1 = FeatureSet.get(id=feature_set_ids_slot[0])
61
- related_name = feature_set_1._get_related_name()
62
- features_registry = getattr(FeatureSet, related_name).field.model
63
- # this way of writing the __in statement turned out to be the fastest
64
- # evaluated on a link table with 16M entries connecting 500 feature sets with
65
- # 60k genes
66
- feature_ids = (
67
- features_registry.feature_sets.through.objects.filter(
68
- featureset_id__in=feature_set_ids_slot
69
- )
70
- .values(f"{features_registry.__name__.lower()}_id")
71
- .distinct()
72
- )
73
- features = features_registry.filter(id__in=feature_ids)
74
- feature_sets_union[slot] = FeatureSet(features, dtype=feature_set_1.dtype)
75
- return feature_sets_union
76
-
77
-
78
- def __init__(
79
- collection: Collection,
80
- *args,
81
- **kwargs,
82
- ):
83
- collection.features = CollectionFeatureManager(collection)
84
- if len(args) == len(collection._meta.concrete_fields):
85
- super(Collection, collection).__init__(*args, **kwargs)
86
- return None
87
- # now we proceed with the user-facing constructor
88
- if len(args) > 1:
89
- raise ValueError("Only one non-keyword arg allowed: artifacts")
90
- artifacts: Artifact | Iterable[Artifact] = (
91
- kwargs.pop("artifacts") if len(args) == 0 else args[0]
92
- )
93
- meta_artifact: Artifact | None = (
94
- kwargs.pop("meta_artifact") if "meta_artifact" in kwargs else None
95
- )
96
- name: str | None = kwargs.pop("name") if "name" in kwargs else None
97
- description: str | None = (
98
- kwargs.pop("description") if "description" in kwargs else None
99
- )
100
- reference: str | None = kwargs.pop("reference") if "reference" in kwargs else None
101
- reference_type: str | None = (
102
- kwargs.pop("reference_type") if "reference_type" in kwargs else None
103
- )
104
- run: Run | None = kwargs.pop("run") if "run" in kwargs else None
105
- revises: Collection | None = kwargs.pop("revises") if "revises" in kwargs else None
106
- version: str | None = kwargs.pop("version") if "version" in kwargs else None
107
- visibility: int | None = (
108
- kwargs.pop("visibility")
109
- if "visibility" in kwargs
110
- else VisibilityChoice.default.value
111
- )
112
- if "is_new_version_of" in kwargs:
113
- logger.warning("`is_new_version_of` will be removed soon, please use `revises`")
114
- revises = kwargs.pop("is_new_version_of")
115
- if not len(kwargs) == 0:
116
- raise ValueError(
117
- f"Only artifacts, name, run, description, reference, reference_type, visibility can be passed, you passed: {kwargs}"
118
- )
119
- provisional_uid, version, name, revises = process_revises(
120
- revises, version, name, Collection
121
- )
122
- run = get_run(run)
123
- if isinstance(artifacts, Artifact):
124
- artifacts = [artifacts]
125
- else:
126
- if not hasattr(artifacts, "__getitem__"):
127
- raise ValueError("Artifact or List[Artifact] is allowed.")
128
- assert isinstance(artifacts[0], Artifact) # type: ignore # noqa: S101
129
- hash = from_artifacts(artifacts) # type: ignore
130
- if meta_artifact is not None:
131
- if not isinstance(meta_artifact, Artifact):
132
- raise ValueError("meta_artifact has to be an Artifact")
133
- if isinstance(meta_artifact, Artifact):
134
- if meta_artifact._state.adding:
135
- raise ValueError(
136
- "Save meta_artifact artifact before creating collection!"
137
- )
138
- # we ignore collections in trash containing the same hash
139
- if hash is not None:
140
- existing_collection = Collection.filter(hash=hash).one_or_none()
141
- else:
142
- existing_collection = None
143
- if existing_collection is not None:
144
- logger.warning(
145
- f"returning existing collection with same hash: {existing_collection}"
146
- )
147
- # update the run of the existing artifact
148
- if run is not None:
149
- # save the information that this artifact was previously
150
- # produced by another run
151
- if existing_collection.run is not None:
152
- existing_collection.run._output_collections_with_later_updates.add(
153
- existing_collection
154
- )
155
- # update the run of the artifact with the latest run
156
- existing_collection.run = run
157
- existing_collection.transform = run.transform
158
- init_self_from_db(collection, existing_collection)
159
- update_attributes(collection, {"description": description, "name": name})
160
- else:
161
- kwargs = {}
162
- add_transform_to_kwargs(kwargs, run)
163
- search_names_setting = settings.creation.search_names
164
- if revises is not None and name == revises.name:
165
- settings.creation.search_names = False
166
- super(Collection, collection).__init__(
167
- uid=provisional_uid,
168
- name=name,
169
- description=description,
170
- reference=reference,
171
- reference_type=reference_type,
172
- meta_artifact=meta_artifact,
173
- hash=hash,
174
- run=run,
175
- version=version,
176
- visibility=visibility,
177
- revises=revises,
178
- **kwargs,
179
- )
180
- settings.creation.search_names = search_names_setting
181
- collection._artifacts = artifacts
182
- # register provenance
183
- if revises is not None:
184
- _track_run_input(revises, run=run)
185
- _track_run_input(artifacts, run=run)
186
-
187
-
188
- # internal function, not exposed to user
189
- def from_artifacts(artifacts: Iterable[Artifact]) -> tuple[str, dict[str, str]]:
190
- # assert all artifacts are already saved
191
- saved = not any(artifact._state.adding for artifact in artifacts)
192
- if not saved:
193
- raise ValueError("Not all artifacts are yet saved, please save them")
194
- # validate consistency of hashes - we do not allow duplicate hashes
195
- hashes = [artifact.hash for artifact in artifacts if artifact.hash is not None]
196
- hashes_set = set(hashes)
197
- if len(hashes) != len(hashes_set):
198
- seen = set()
199
- non_unique = [x for x in hashes if x in seen or seen.add(x)] # type: ignore
200
- raise ValueError(
201
- "Please pass artifacts with distinct hashes: these ones are non-unique"
202
- f" {non_unique}"
203
- )
204
- hash = hash_set(hashes_set)
205
- return hash
206
-
207
-
208
- # docstring handled through attach_func_to_class_method
209
- def mapped(
210
- self,
211
- layers_keys: str | list[str] | None = None,
212
- obs_keys: str | list[str] | None = None,
213
- obsm_keys: str | list[str] | None = None,
214
- join: Literal["inner", "outer"] | None = "inner",
215
- encode_labels: bool | list[str] = True,
216
- unknown_label: str | dict[str, str] | None = None,
217
- cache_categories: bool = True,
218
- parallel: bool = False,
219
- dtype: str | None = None,
220
- stream: bool = False,
221
- is_run_input: bool | None = None,
222
- ) -> MappedCollection:
223
- path_list = []
224
- if self._state.adding:
225
- artifacts = self._artifacts
226
- logger.warning("The collection isn't saved, consider calling `.save()`")
227
- else:
228
- artifacts = self.ordered_artifacts.all()
229
- for artifact in artifacts:
230
- if artifact.suffix not in {".h5ad", ".zarr"}:
231
- logger.warning(f"Ignoring artifact with suffix {artifact.suffix}")
232
- continue
233
- elif not stream:
234
- path_list.append(artifact.cache())
235
- else:
236
- path_list.append(artifact.path)
237
- ds = MappedCollection(
238
- path_list,
239
- layers_keys,
240
- obs_keys,
241
- obsm_keys,
242
- join,
243
- encode_labels,
244
- unknown_label,
245
- cache_categories,
246
- parallel,
247
- dtype,
248
- )
249
- # track only if successful
250
- _track_run_input(self, is_run_input)
251
- return ds
252
-
253
-
254
- # docstring handled through attach_func_to_class_method
255
- def cache(self, is_run_input: bool | None = None) -> list[UPath]:
256
- path_list = []
257
- for artifact in self.ordered_artifacts.all():
258
- path_list.append(artifact.cache())
259
- _track_run_input(self, is_run_input)
260
- return path_list
261
-
262
-
263
- # docstring handled through attach_func_to_class_method
264
- def load(
265
- self,
266
- join: Literal["inner", "outer"] = "outer",
267
- is_run_input: bool | None = None,
268
- **kwargs,
269
- ) -> Any:
270
- # cannot call _track_run_input here, see comment further down
271
- all_artifacts = self.ordered_artifacts.all()
272
- suffixes = [artifact.suffix for artifact in all_artifacts]
273
- if len(set(suffixes)) != 1:
274
- raise RuntimeError(
275
- "Can only load collections where all artifacts have the same suffix"
276
- )
277
- # because we're tracking data flow on the collection-level, here, we don't
278
- # want to track it on the artifact-level
279
- objects = [artifact.load(is_run_input=False) for artifact in all_artifacts]
280
- artifact_uids = [artifact.uid for artifact in all_artifacts]
281
- if isinstance(objects[0], pd.DataFrame):
282
- concat_object = pd.concat(objects, join=join)
283
- elif isinstance(objects[0], ad.AnnData):
284
- concat_object = ad.concat(
285
- objects, join=join, label="artifact_uid", keys=artifact_uids
286
- )
287
- # only call it here because there might be errors during concat
288
- _track_run_input(self, is_run_input)
289
- return concat_object
290
-
291
-
292
- # docstring handled through attach_func_to_class_method
293
- def delete(self, permanent: bool | None = None) -> None:
294
- # change visibility to trash
295
- trash_visibility = VisibilityChoice.trash.value
296
- if self.visibility > trash_visibility and permanent is not True:
297
- self.visibility = trash_visibility
298
- self.save()
299
- logger.warning(f"moved collection to trash (visibility = {trash_visibility})")
300
- return
301
-
302
- # permanent delete
303
- if permanent is None:
304
- response = input(
305
- "Collection record is already in trash! Are you sure to delete it from your"
306
- " database? (y/n) You can't undo this action."
307
- )
308
- delete_record = response == "y"
309
- else:
310
- delete_record = permanent
311
-
312
- if delete_record:
313
- super(Collection, self).delete()
314
-
315
-
316
- # docstring handled through attach_func_to_class_method
317
- def save(self, using: str | None = None) -> Collection:
318
- if self.meta_artifact is not None:
319
- self.meta_artifact.save()
320
- # we don't need to save feature sets again
321
- save_feature_sets(self)
322
- super(Collection, self).save()
323
- # we don't allow updating the collection of artifacts
324
- # if users want to update the set of artifacts, they
325
- # have to create a new collection
326
- if hasattr(self, "_artifacts"):
327
- links = [
328
- CollectionArtifact(collection_id=self.id, artifact_id=artifact.id)
329
- for artifact in self._artifacts
330
- ]
331
- # the below seems to preserve the order of the list in the
332
- # auto-incrementing integer primary
333
- # merely using .artifacts.set(*...) doesn't achieve this
334
- # we need ignore_conflicts=True so that this won't error if links already exist
335
- CollectionArtifact.objects.bulk_create(links, ignore_conflicts=True)
336
- save_feature_set_links(self)
337
- if using is not None:
338
- logger.warning("using argument is ignored")
339
- return self
340
-
341
-
342
- # docstring handled through attach_func_to_class_method
343
- def restore(self) -> None:
344
- self.visibility = VisibilityChoice.default.value
345
- self.save()
346
-
347
-
348
- @property # type: ignore
349
- @doc_args(Collection.ordered_artifacts.__doc__)
350
- def ordered_artifacts(self) -> QuerySet:
351
- """{}""" # noqa: D415
352
- return self.artifacts.order_by("links_collection__id")
353
-
354
-
355
- @property # type: ignore
356
- @doc_args(Collection.data_artifact.__doc__)
357
- def data_artifact(self) -> Artifact | None:
358
- """{}""" # noqa: D415
359
- return self.artifacts.first()
360
-
361
-
362
- METHOD_NAMES = [
363
- "__init__",
364
- "mapped",
365
- "cache",
366
- "load",
367
- "delete",
368
- "save",
369
- "restore",
370
- ]
371
-
372
- if ln_setup._TESTING:
373
- from inspect import signature
374
-
375
- SIGS = {
376
- name: signature(getattr(Collection, name))
377
- for name in METHOD_NAMES
378
- if name != "__init__"
379
- }
380
-
381
- for name in METHOD_NAMES:
382
- attach_func_to_class_method(name, Collection, globals())
383
-
384
- Collection.ordered_artifacts = ordered_artifacts
385
- Collection.data_artifact = data_artifact
386
- Collection.describe = describe
387
- Collection.view_lineage = view_lineage
1
+ from __future__ import annotations
2
+
3
+ from collections import defaultdict
4
+ from typing import (
5
+ TYPE_CHECKING,
6
+ Any,
7
+ Iterable,
8
+ Literal,
9
+ )
10
+
11
+ import anndata as ad
12
+ import lamindb_setup as ln_setup
13
+ import pandas as pd
14
+ from lamin_utils import logger
15
+ from lamindb_setup.core._docs import doc_args
16
+ from lamindb_setup.core.hashing import hash_set
17
+ from lnschema_core.models import (
18
+ Collection,
19
+ CollectionArtifact,
20
+ FeatureSet,
21
+ )
22
+ from lnschema_core.types import VisibilityChoice
23
+
24
+ from lamindb._utils import attach_func_to_class_method
25
+ from lamindb.core._data import _track_run_input, describe, view_lineage
26
+ from lamindb.core._mapped_collection import MappedCollection
27
+ from lamindb.core.versioning import process_revises
28
+
29
+ from . import Artifact, Run
30
+ from ._record import init_self_from_db, update_attributes
31
+ from .core._data import (
32
+ add_transform_to_kwargs,
33
+ get_run,
34
+ save_feature_set_links,
35
+ save_feature_sets,
36
+ )
37
+ from .core._settings import settings
38
+
39
+ if TYPE_CHECKING:
40
+ from lamindb.core.storage import UPath
41
+
42
+ from ._query_set import QuerySet
43
+
44
+
45
+ class CollectionFeatureManager:
46
+ """Query features of artifact in collection."""
47
+
48
+ def __init__(self, collection: Collection):
49
+ self._collection = collection
50
+
51
+ def get_feature_sets_union(self) -> dict[str, FeatureSet]:
52
+ links_feature_set_artifact = Artifact.feature_sets.through.objects.filter(
53
+ artifact_id__in=self._collection.artifacts.values_list("id", flat=True)
54
+ )
55
+ feature_sets_by_slots = defaultdict(list)
56
+ for link in links_feature_set_artifact:
57
+ feature_sets_by_slots[link.slot].append(link.featureset_id)
58
+ feature_sets_union = {}
59
+ for slot, feature_set_ids_slot in feature_sets_by_slots.items():
60
+ feature_set_1 = FeatureSet.get(id=feature_set_ids_slot[0])
61
+ related_name = feature_set_1._get_related_name()
62
+ features_registry = getattr(FeatureSet, related_name).field.model
63
+ # this way of writing the __in statement turned out to be the fastest
64
+ # evaluated on a link table with 16M entries connecting 500 feature sets with
65
+ # 60k genes
66
+ feature_ids = (
67
+ features_registry.feature_sets.through.objects.filter(
68
+ featureset_id__in=feature_set_ids_slot
69
+ )
70
+ .values(f"{features_registry.__name__.lower()}_id")
71
+ .distinct()
72
+ )
73
+ features = features_registry.filter(id__in=feature_ids)
74
+ feature_sets_union[slot] = FeatureSet(features, dtype=feature_set_1.dtype)
75
+ return feature_sets_union
76
+
77
+
78
+ def __init__(
79
+ collection: Collection,
80
+ *args,
81
+ **kwargs,
82
+ ):
83
+ collection.features = CollectionFeatureManager(collection)
84
+ if len(args) == len(collection._meta.concrete_fields):
85
+ super(Collection, collection).__init__(*args, **kwargs)
86
+ return None
87
+ # now we proceed with the user-facing constructor
88
+ if len(args) > 1:
89
+ raise ValueError("Only one non-keyword arg allowed: artifacts")
90
+ artifacts: Artifact | Iterable[Artifact] = (
91
+ kwargs.pop("artifacts") if len(args) == 0 else args[0]
92
+ )
93
+ meta_artifact: Artifact | None = (
94
+ kwargs.pop("meta_artifact") if "meta_artifact" in kwargs else None
95
+ )
96
+ name: str | None = kwargs.pop("name") if "name" in kwargs else None
97
+ description: str | None = (
98
+ kwargs.pop("description") if "description" in kwargs else None
99
+ )
100
+ reference: str | None = kwargs.pop("reference") if "reference" in kwargs else None
101
+ reference_type: str | None = (
102
+ kwargs.pop("reference_type") if "reference_type" in kwargs else None
103
+ )
104
+ run: Run | None = kwargs.pop("run") if "run" in kwargs else None
105
+ revises: Collection | None = kwargs.pop("revises") if "revises" in kwargs else None
106
+ version: str | None = kwargs.pop("version") if "version" in kwargs else None
107
+ visibility: int | None = (
108
+ kwargs.pop("visibility")
109
+ if "visibility" in kwargs
110
+ else VisibilityChoice.default.value
111
+ )
112
+ if "is_new_version_of" in kwargs:
113
+ logger.warning("`is_new_version_of` will be removed soon, please use `revises`")
114
+ revises = kwargs.pop("is_new_version_of")
115
+ if not len(kwargs) == 0:
116
+ raise ValueError(
117
+ f"Only artifacts, name, run, description, reference, reference_type, visibility can be passed, you passed: {kwargs}"
118
+ )
119
+ provisional_uid, version, name, revises = process_revises(
120
+ revises, version, name, Collection
121
+ )
122
+ run = get_run(run)
123
+ if isinstance(artifacts, Artifact):
124
+ artifacts = [artifacts]
125
+ else:
126
+ if not hasattr(artifacts, "__getitem__"):
127
+ raise ValueError("Artifact or List[Artifact] is allowed.")
128
+ assert isinstance(artifacts[0], Artifact) # type: ignore # noqa: S101
129
+ hash = from_artifacts(artifacts) # type: ignore
130
+ if meta_artifact is not None:
131
+ if not isinstance(meta_artifact, Artifact):
132
+ raise ValueError("meta_artifact has to be an Artifact")
133
+ if isinstance(meta_artifact, Artifact):
134
+ if meta_artifact._state.adding:
135
+ raise ValueError(
136
+ "Save meta_artifact artifact before creating collection!"
137
+ )
138
+ # we ignore collections in trash containing the same hash
139
+ if hash is not None:
140
+ existing_collection = Collection.filter(hash=hash).one_or_none()
141
+ else:
142
+ existing_collection = None
143
+ if existing_collection is not None:
144
+ logger.warning(
145
+ f"returning existing collection with same hash: {existing_collection}"
146
+ )
147
+ # update the run of the existing artifact
148
+ if run is not None:
149
+ # save the information that this artifact was previously
150
+ # produced by another run
151
+ if existing_collection.run is not None:
152
+ existing_collection.run._output_collections_with_later_updates.add(
153
+ existing_collection
154
+ )
155
+ # update the run of the artifact with the latest run
156
+ existing_collection.run = run
157
+ existing_collection.transform = run.transform
158
+ init_self_from_db(collection, existing_collection)
159
+ update_attributes(collection, {"description": description, "name": name})
160
+ else:
161
+ kwargs = {}
162
+ add_transform_to_kwargs(kwargs, run)
163
+ search_names_setting = settings.creation.search_names
164
+ if revises is not None and name == revises.name:
165
+ settings.creation.search_names = False
166
+ super(Collection, collection).__init__(
167
+ uid=provisional_uid,
168
+ name=name,
169
+ description=description,
170
+ reference=reference,
171
+ reference_type=reference_type,
172
+ meta_artifact=meta_artifact,
173
+ hash=hash,
174
+ run=run,
175
+ version=version,
176
+ visibility=visibility,
177
+ revises=revises,
178
+ **kwargs,
179
+ )
180
+ settings.creation.search_names = search_names_setting
181
+ collection._artifacts = artifacts
182
+ # register provenance
183
+ if revises is not None:
184
+ _track_run_input(revises, run=run)
185
+ _track_run_input(artifacts, run=run)
186
+
187
+
188
+ # internal function, not exposed to user
189
+ def from_artifacts(artifacts: Iterable[Artifact]) -> tuple[str, dict[str, str]]:
190
+ # assert all artifacts are already saved
191
+ saved = not any(artifact._state.adding for artifact in artifacts)
192
+ if not saved:
193
+ raise ValueError("Not all artifacts are yet saved, please save them")
194
+ # validate consistency of hashes - we do not allow duplicate hashes
195
+ hashes = [artifact.hash for artifact in artifacts if artifact.hash is not None]
196
+ hashes_set = set(hashes)
197
+ if len(hashes) != len(hashes_set):
198
+ seen = set()
199
+ non_unique = [x for x in hashes if x in seen or seen.add(x)] # type: ignore
200
+ raise ValueError(
201
+ "Please pass artifacts with distinct hashes: these ones are non-unique"
202
+ f" {non_unique}"
203
+ )
204
+ hash = hash_set(hashes_set)
205
+ return hash
206
+
207
+
208
+ # docstring handled through attach_func_to_class_method
209
+ def mapped(
210
+ self,
211
+ layers_keys: str | list[str] | None = None,
212
+ obs_keys: str | list[str] | None = None,
213
+ obsm_keys: str | list[str] | None = None,
214
+ obs_filter: tuple[str, str | tuple[str, ...]] | None = None,
215
+ join: Literal["inner", "outer"] | None = "inner",
216
+ encode_labels: bool | list[str] = True,
217
+ unknown_label: str | dict[str, str] | None = None,
218
+ cache_categories: bool = True,
219
+ parallel: bool = False,
220
+ dtype: str | None = None,
221
+ stream: bool = False,
222
+ is_run_input: bool | None = None,
223
+ ) -> MappedCollection:
224
+ path_list = []
225
+ if self._state.adding:
226
+ artifacts = self._artifacts
227
+ logger.warning("The collection isn't saved, consider calling `.save()`")
228
+ else:
229
+ artifacts = self.ordered_artifacts.all()
230
+ for artifact in artifacts:
231
+ if artifact.suffix not in {".h5ad", ".zarr"}:
232
+ logger.warning(f"Ignoring artifact with suffix {artifact.suffix}")
233
+ continue
234
+ elif not stream:
235
+ path_list.append(artifact.cache())
236
+ else:
237
+ path_list.append(artifact.path)
238
+ ds = MappedCollection(
239
+ path_list,
240
+ layers_keys,
241
+ obs_keys,
242
+ obsm_keys,
243
+ obs_filter,
244
+ join,
245
+ encode_labels,
246
+ unknown_label,
247
+ cache_categories,
248
+ parallel,
249
+ dtype,
250
+ )
251
+ # track only if successful
252
+ _track_run_input(self, is_run_input)
253
+ return ds
254
+
255
+
256
+ # docstring handled through attach_func_to_class_method
257
+ def cache(self, is_run_input: bool | None = None) -> list[UPath]:
258
+ path_list = []
259
+ for artifact in self.ordered_artifacts.all():
260
+ path_list.append(artifact.cache())
261
+ _track_run_input(self, is_run_input)
262
+ return path_list
263
+
264
+
265
+ # docstring handled through attach_func_to_class_method
266
+ def load(
267
+ self,
268
+ join: Literal["inner", "outer"] = "outer",
269
+ is_run_input: bool | None = None,
270
+ **kwargs,
271
+ ) -> Any:
272
+ # cannot call _track_run_input here, see comment further down
273
+ all_artifacts = self.ordered_artifacts.all()
274
+ suffixes = [artifact.suffix for artifact in all_artifacts]
275
+ if len(set(suffixes)) != 1:
276
+ raise RuntimeError(
277
+ "Can only load collections where all artifacts have the same suffix"
278
+ )
279
+ # because we're tracking data flow on the collection-level, here, we don't
280
+ # want to track it on the artifact-level
281
+ objects = [artifact.load(is_run_input=False) for artifact in all_artifacts]
282
+ artifact_uids = [artifact.uid for artifact in all_artifacts]
283
+ if isinstance(objects[0], pd.DataFrame):
284
+ concat_object = pd.concat(objects, join=join)
285
+ elif isinstance(objects[0], ad.AnnData):
286
+ concat_object = ad.concat(
287
+ objects, join=join, label="artifact_uid", keys=artifact_uids
288
+ )
289
+ # only call it here because there might be errors during concat
290
+ _track_run_input(self, is_run_input)
291
+ return concat_object
292
+
293
+
294
+ # docstring handled through attach_func_to_class_method
295
+ def delete(self, permanent: bool | None = None) -> None:
296
+ # change visibility to trash
297
+ trash_visibility = VisibilityChoice.trash.value
298
+ if self.visibility > trash_visibility and permanent is not True:
299
+ self.visibility = trash_visibility
300
+ self.save()
301
+ logger.warning(f"moved collection to trash (visibility = {trash_visibility})")
302
+ return
303
+
304
+ # permanent delete
305
+ if permanent is None:
306
+ response = input(
307
+ "Collection record is already in trash! Are you sure to delete it from your"
308
+ " database? (y/n) You can't undo this action."
309
+ )
310
+ delete_record = response == "y"
311
+ else:
312
+ delete_record = permanent
313
+
314
+ if delete_record:
315
+ super(Collection, self).delete()
316
+
317
+
318
+ # docstring handled through attach_func_to_class_method
319
+ def save(self, using: str | None = None) -> Collection:
320
+ if self.meta_artifact is not None:
321
+ self.meta_artifact.save()
322
+ # we don't need to save feature sets again
323
+ save_feature_sets(self)
324
+ super(Collection, self).save()
325
+ # we don't allow updating the collection of artifacts
326
+ # if users want to update the set of artifacts, they
327
+ # have to create a new collection
328
+ if hasattr(self, "_artifacts"):
329
+ links = [
330
+ CollectionArtifact(collection_id=self.id, artifact_id=artifact.id)
331
+ for artifact in self._artifacts
332
+ ]
333
+ # the below seems to preserve the order of the list in the
334
+ # auto-incrementing integer primary
335
+ # merely using .artifacts.set(*...) doesn't achieve this
336
+ # we need ignore_conflicts=True so that this won't error if links already exist
337
+ CollectionArtifact.objects.bulk_create(links, ignore_conflicts=True)
338
+ save_feature_set_links(self)
339
+ if using is not None:
340
+ logger.warning("using argument is ignored")
341
+ return self
342
+
343
+
344
+ # docstring handled through attach_func_to_class_method
345
+ def restore(self) -> None:
346
+ self.visibility = VisibilityChoice.default.value
347
+ self.save()
348
+
349
+
350
+ @property # type: ignore
351
+ @doc_args(Collection.ordered_artifacts.__doc__)
352
+ def ordered_artifacts(self) -> QuerySet:
353
+ """{}""" # noqa: D415
354
+ return self.artifacts.order_by("links_collection__id")
355
+
356
+
357
+ @property # type: ignore
358
+ @doc_args(Collection.data_artifact.__doc__)
359
+ def data_artifact(self) -> Artifact | None:
360
+ """{}""" # noqa: D415
361
+ return self.artifacts.first()
362
+
363
+
364
+ METHOD_NAMES = [
365
+ "__init__",
366
+ "mapped",
367
+ "cache",
368
+ "load",
369
+ "delete",
370
+ "save",
371
+ "restore",
372
+ ]
373
+
374
+ if ln_setup._TESTING:
375
+ from inspect import signature
376
+
377
+ SIGS = {
378
+ name: signature(getattr(Collection, name))
379
+ for name in METHOD_NAMES
380
+ if name != "__init__"
381
+ }
382
+
383
+ for name in METHOD_NAMES:
384
+ attach_func_to_class_method(name, Collection, globals())
385
+
386
+ Collection.ordered_artifacts = ordered_artifacts
387
+ Collection.data_artifact = data_artifact
388
+ Collection.describe = describe
389
+ Collection.view_lineage = view_lineage