lamindb 1.1.0__py3-none-any.whl → 1.2a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. lamindb/__init__.py +31 -26
  2. lamindb/_finish.py +9 -1
  3. lamindb/_tracked.py +26 -3
  4. lamindb/_view.py +2 -3
  5. lamindb/base/__init__.py +1 -1
  6. lamindb/base/ids.py +1 -10
  7. lamindb/base/users.py +1 -4
  8. lamindb/core/__init__.py +7 -65
  9. lamindb/core/_context.py +41 -10
  10. lamindb/core/_mapped_collection.py +4 -2
  11. lamindb/core/_settings.py +6 -6
  12. lamindb/core/_sync_git.py +1 -1
  13. lamindb/core/_track_environment.py +2 -1
  14. lamindb/core/datasets/_small.py +3 -3
  15. lamindb/core/loaders.py +22 -9
  16. lamindb/core/storage/_anndata_accessor.py +8 -3
  17. lamindb/core/storage/_backed_access.py +14 -7
  18. lamindb/core/storage/_pyarrow_dataset.py +24 -9
  19. lamindb/core/storage/_tiledbsoma.py +6 -4
  20. lamindb/core/storage/_zarr.py +32 -11
  21. lamindb/core/storage/objects.py +59 -26
  22. lamindb/core/storage/paths.py +16 -13
  23. lamindb/curators/__init__.py +173 -145
  24. lamindb/errors.py +1 -1
  25. lamindb/integrations/_vitessce.py +4 -4
  26. lamindb/migrations/0089_subsequent_runs.py +159 -0
  27. lamindb/migrations/0090_runproject_project_runs.py +73 -0
  28. lamindb/migrations/{0088_squashed.py → 0090_squashed.py} +245 -177
  29. lamindb/models/__init__.py +79 -0
  30. lamindb/{core → models}/_describe.py +3 -3
  31. lamindb/{core → models}/_django.py +8 -5
  32. lamindb/{core → models}/_feature_manager.py +103 -87
  33. lamindb/{_from_values.py → models/_from_values.py} +5 -2
  34. lamindb/{core/versioning.py → models/_is_versioned.py} +94 -6
  35. lamindb/{core → models}/_label_manager.py +10 -17
  36. lamindb/{core/relations.py → models/_relations.py} +8 -1
  37. lamindb/models/artifact.py +2601 -0
  38. lamindb/{_can_curate.py → models/can_curate.py} +349 -180
  39. lamindb/models/collection.py +683 -0
  40. lamindb/models/core.py +135 -0
  41. lamindb/models/feature.py +643 -0
  42. lamindb/models/flextable.py +163 -0
  43. lamindb/{_parents.py → models/has_parents.py} +55 -49
  44. lamindb/models/project.py +384 -0
  45. lamindb/{_query_manager.py → models/query_manager.py} +10 -8
  46. lamindb/{_query_set.py → models/query_set.py} +52 -30
  47. lamindb/models/record.py +1757 -0
  48. lamindb/models/run.py +563 -0
  49. lamindb/{_save.py → models/save.py} +18 -8
  50. lamindb/models/schema.py +732 -0
  51. lamindb/models/transform.py +360 -0
  52. lamindb/models/ulabel.py +249 -0
  53. {lamindb-1.1.0.dist-info → lamindb-1.2a2.dist-info}/METADATA +5 -5
  54. lamindb-1.2a2.dist-info/RECORD +94 -0
  55. lamindb/_artifact.py +0 -1361
  56. lamindb/_collection.py +0 -440
  57. lamindb/_feature.py +0 -316
  58. lamindb/_is_versioned.py +0 -40
  59. lamindb/_record.py +0 -1065
  60. lamindb/_run.py +0 -60
  61. lamindb/_schema.py +0 -347
  62. lamindb/_storage.py +0 -15
  63. lamindb/_transform.py +0 -170
  64. lamindb/_ulabel.py +0 -56
  65. lamindb/_utils.py +0 -9
  66. lamindb/base/validation.py +0 -63
  67. lamindb/core/_data.py +0 -491
  68. lamindb/core/fields.py +0 -12
  69. lamindb/models.py +0 -4435
  70. lamindb-1.1.0.dist-info/RECORD +0 -95
  71. {lamindb-1.1.0.dist-info → lamindb-1.2a2.dist-info}/LICENSE +0 -0
  72. {lamindb-1.1.0.dist-info → lamindb-1.2a2.dist-info}/WHEEL +0 -0
lamindb/models/collection.py (new file)
@@ -0,0 +1,683 @@
+ from __future__ import annotations
+
+ import warnings
+ from typing import (
+     TYPE_CHECKING,
+     Any,
+     Literal,
+     overload,
+ )
+
+ import anndata as ad
+ import pandas as pd
+ from django.db import models
+ from django.db.models import CASCADE, PROTECT
+ from lamin_utils import logger
+ from lamindb_setup.core.hashing import HASH_LENGTH, hash_set
+
+ from lamindb.base.fields import (
+     CharField,
+     ForeignKey,
+     OneToOneField,
+     TextField,
+ )
+
+ from ..base.ids import base62_20
+ from ..core._mapped_collection import MappedCollection
+ from ..core.storage._pyarrow_dataset import _is_pyarrow_dataset, _open_pyarrow_dataset
+ from ..errors import FieldValidationError
+ from ..models._is_versioned import process_revises
+ from ._is_versioned import IsVersioned
+ from .artifact import (
+     Artifact,
+     _populate_subsequent_runs_,
+     _track_run_input,
+     describe_artifact_collection,
+     get_run,
+     save_schema_links,
+ )
+ from .has_parents import view_lineage
+ from .record import (
+     BasicRecord,
+     LinkORM,
+     Record,
+     _get_record_kwargs,
+     init_self_from_db,
+     update_attributes,
+ )
+ from .run import Run, TracksRun, TracksUpdates
+
+ if TYPE_CHECKING:
+     from collections.abc import Iterable
+
+     from pyarrow.dataset import Dataset as PyArrowDataset
+
+     from ..core.storage import UPath
+     from .project import Project, Reference
+     from .query_set import QuerySet
+     from .transform import Transform
+     from .ulabel import ULabel
+
+
+ # below is a draft for the future, see also the tests in test_collection.py
+ #
+ # class CollectionFeatureManager:
+ #     """Query features of artifacts in a collection."""
+
+ #     def __init__(self, collection: Collection):
+ #         self._collection = collection
+
+ #     def _get_staged_feature_sets_union(self) -> dict[str, Schema]:
+ #         links_schema_artifact = Artifact.feature_sets.through.objects.filter(
+ #             artifact_id__in=self._collection.artifacts.values_list("id", flat=True)
+ #         )
+ #         feature_sets_by_slots = defaultdict(list)
+ #         for link in links_schema_artifact:
+ #             feature_sets_by_slots[link.slot].append(link.schema_id)
+ #         feature_sets_union = {}
+ #         for slot, schema_ids_slot in feature_sets_by_slots.items():
+ #             schema_1 = Schema.get(id=schema_ids_slot[0])
+ #             related_name = schema_1._get_related_name()
+ #             features_registry = getattr(Schema, related_name).field.model
+ #             # this way of writing the __in statement turned out to be the fastest
+ #             # evaluated on a link table with 16M entries connecting 500 feature sets
+ #             # with 60k genes
+ #             feature_ids = (
+ #                 features_registry.schemas.through.objects.filter(
+ #                     schema_id__in=schema_ids_slot
+ #                 )
+ #                 .values(f"{features_registry.__name__.lower()}_id")
+ #                 .distinct()
+ #             )
+ #             features = features_registry.filter(id__in=feature_ids)
+ #             feature_sets_union[slot] = Schema(features, dtype=schema_1.dtype)
+ #         return feature_sets_union
+
+
+ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
+     """Collections of artifacts.
+
+     Collections provide a simple way of versioning collections of artifacts.
+
+     Args:
+         artifacts: `list[Artifact]` A list of artifacts.
+         key: `str` A file-path like key, analogous to the `key` parameter of `Artifact` and `Transform`.
+         description: `str | None = None` A description.
+         revises: `Collection | None = None` An old version of the collection.
+         run: `Run | None = None` The run that creates the collection.
+         meta: `Artifact | None = None` An artifact that defines metadata for the collection.
+         reference: `str | None = None` A simple reference, e.g. an external ID or a URL.
+         reference_type: `str | None = None` A way to indicate the type of the simple reference, e.g., `"url"`.
+
+     See Also:
+         :class:`~lamindb.Artifact`
+
+     Examples:
+
+         Create a collection from a list of :class:`~lamindb.Artifact` objects:
+
+         >>> collection = ln.Collection([artifact1, artifact2], key="my_project/my_collection")
+
+         Create a collection that groups a data & a metadata artifact (e.g., here :doc:`docs:rxrx`):
+
+         >>> collection = ln.Collection(data_artifact, key="my_project/my_collection", meta=metadata_artifact)
+
+     """
+
+     class Meta(Record.Meta, IsVersioned.Meta, TracksRun.Meta, TracksUpdates.Meta):
+         abstract = False
+
+     _len_full_uid: int = 20
+     _len_stem_uid: int = 16
+     _name_field: str = "key"
+
+     id: int = models.AutoField(primary_key=True)
+     """Internal id, valid only in one DB instance."""
+     uid: str = CharField(
+         editable=False,
+         unique=True,
+         db_index=True,
+         max_length=_len_full_uid,
+         default=base62_20,
+     )
+     """Universal id, valid across DB instances."""
+     key: str = CharField(db_index=True)
+     """Name or path-like key."""
+     # below is the only case in which we use a TextField
+     # for description; we do so because users had descriptions exceeding 255 chars
+     # in their instances
+     description: str | None = TextField(null=True, db_index=True)
+     """A description or title."""
+     hash: str | None = CharField(
+         max_length=HASH_LENGTH, db_index=True, null=True, unique=True
+     )
+     """Hash of collection content."""
+     reference: str | None = CharField(max_length=255, db_index=True, null=True)
+     """A reference like URL or external ID."""
+     reference_type: str | None = CharField(max_length=25, db_index=True, null=True)
+     """Type of reference, e.g., cellxgene Census collection_id."""
+     ulabels: ULabel = models.ManyToManyField(
+         "ULabel", through="CollectionULabel", related_name="collections"
+     )
+     """ULabels sampled in the collection (see :class:`~lamindb.Feature`)."""
+     run: Run | None = ForeignKey(
+         Run, PROTECT, related_name="output_collections", null=True, default=None
+     )
+     """:class:`~lamindb.Run` that created the `collection`."""
+     input_of_runs: Run = models.ManyToManyField(Run, related_name="input_collections")
+     """Runs that use this collection as an input."""
+     _subsequent_runs: Run = models.ManyToManyField(
+         "Run",
+         related_name="_recreated_collections",
+         db_table="lamindb_collection__previous_runs",  # legacy name, change in lamindb v2
+     )
+     """Runs that re-created the record after initial creation."""
+     artifacts: Artifact = models.ManyToManyField(
+         "Artifact", related_name="collections", through="CollectionArtifact"
+     )
+     """Artifacts in collection."""
+     meta_artifact: Artifact | None = OneToOneField(
+         "Artifact",
+         PROTECT,
+         null=True,
+         unique=True,
+         related_name="_meta_of_collection",
+     )
+     """An artifact that stores metadata that indexes a collection.
+
+     It has a 1:1 correspondence with an artifact. If needed, you can access the
+     collection from the artifact via a private field:
+     `artifact._meta_of_collection`.
+     """
+     _actions: Artifact = models.ManyToManyField(Artifact, related_name="+")
+     """Actions to attach for the UI."""
+     projects: Project
+     """Linked projects."""
+     references: Reference
+     """Linked references."""
+
+     @overload
+     def __init__(
+         self,
+         artifacts: list[Artifact],
+         key: str,
+         description: str | None = None,
+         meta: Any | None = None,
+         reference: str | None = None,
+         reference_type: str | None = None,
+         run: Run | None = None,
+         revises: Collection | None = None,
+     ): ...
+
+     @overload
+     def __init__(
+         self,
+         *db_args,
+     ): ...
+
+     def __init__(
+         self,
+         *args,
+         **kwargs,
+     ):
+         if len(args) == len(self._meta.concrete_fields):
+             super().__init__(*args, **kwargs)
+             return None
+         # now we proceed with the user-facing constructor
+         if len(args) > 1:
+             raise ValueError("Only one non-keyword arg allowed: artifacts")
+         artifacts: Artifact | Iterable[Artifact] = (
+             kwargs.pop("artifacts") if len(args) == 0 else args[0]
+         )
+         meta_artifact: Artifact | None = kwargs.pop("meta_artifact", None)
+         tmp_key: str | None = kwargs.pop("key", None)
+         description: str | None = kwargs.pop("description", None)
+         reference: str | None = kwargs.pop("reference", None)
+         reference_type: str | None = kwargs.pop("reference_type", None)
+         run: Run | None = kwargs.pop("run", None)
+         revises: Collection | None = kwargs.pop("revises", None)
+         version: str | None = kwargs.pop("version", None)
+         _branch_code: int | None = kwargs.pop("_branch_code", 1)
+         key: str
+         if "name" in kwargs:
+             key = kwargs.pop("name")
+             warnings.warn(
+                 f"argument `name` will be removed, please pass {key} to `key` instead",
+                 FutureWarning,
+                 stacklevel=2,
+             )
+         else:
+             key = tmp_key
+         if not len(kwargs) == 0:
+             valid_keywords = ", ".join(
+                 [val[0] for val in _get_record_kwargs(Collection)]
+             )
+             raise FieldValidationError(
+                 f"Only {valid_keywords} can be passed, you passed: {kwargs}"
+             )
+         if revises is None:
+             revises = (
+                 Collection.filter(key=key, is_latest=True)
+                 .order_by("-created_at")
+                 .first()
+             )
+         provisional_uid, version, key, description, revises = process_revises(
+             revises, version, key, description, Collection
+         )
+         run = get_run(run)
+         if isinstance(artifacts, Artifact):
+             artifacts = [artifacts]
+         else:
+             if not hasattr(artifacts, "__getitem__"):
+                 raise ValueError("Artifact or list[Artifact] is allowed.")
+             assert isinstance(artifacts[0], Artifact)  # type: ignore # noqa: S101
+         hash = from_artifacts(artifacts)  # type: ignore
+         if meta_artifact is not None:
+             if not isinstance(meta_artifact, Artifact):
+                 raise ValueError("meta_artifact has to be an Artifact")
+             if isinstance(meta_artifact, Artifact):
+                 if meta_artifact._state.adding:
+                     raise ValueError(
+                         "Save meta_artifact artifact before creating collection!"
+                     )
+         # we ignore collections in trash containing the same hash
+         if hash is not None:
+             existing_collection = Collection.filter(hash=hash).one_or_none()
+         else:
+             existing_collection = None
+         if existing_collection is not None:
+             logger.warning(
+                 f"returning existing collection with same hash: {existing_collection}; if you intended to query to track this collection as an input, use: ln.Collection.get()"
+             )
+             if run is not None:
+                 existing_collection._populate_subsequent_runs(run)
+             init_self_from_db(self, existing_collection)
+             update_attributes(self, {"description": description, "key": key})
+         else:
+             _skip_validation = revises is not None and key == revises.key
+             super().__init__(  # type: ignore
+                 uid=provisional_uid,
+                 key=key,
+                 description=description,
+                 reference=reference,
+                 reference_type=reference_type,
+                 meta_artifact=meta_artifact,
+                 hash=hash,
+                 run=run,
+                 version=version,
+                 _branch_code=_branch_code,
+                 revises=revises,
+                 _skip_validation=_skip_validation,
+             )
+         self._artifacts = artifacts
+         if revises is not None:
+             _track_run_input(revises, run=run)
+         _track_run_input(artifacts, run=run)
+
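For orientation, a minimal usage sketch of the constructor above (illustrative, not part of the diff; it assumes a configured lamindb instance and two already-saved artifacts `artifact1` and `artifact2`). Because `from_artifacts()` hashes the set of artifact hashes, re-creating a collection from the same artifacts, in any order, returns the existing record:

    import lamindb as ln

    collection = ln.Collection([artifact1, artifact2], key="my_project/my_collection").save()

    # same artifacts, different order: same content hash, so the
    # existing record is returned rather than a duplicate being created
    same = ln.Collection([artifact2, artifact1], key="my_project/my_collection")
    assert same.uid == collection.uid
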
+     def append(self, artifact: Artifact, run: Run | None = None) -> Collection:
+         """Append an artifact to the collection.
+
+         This does not modify the original collection in-place, but returns a new version
+         of the original collection with the appended artifact.
+
+         Args:
+             artifact: An artifact to add to the collection.
+             run: The run that creates the new version of the collection.
+
+         Examples::
+
+             collection_v1 = ln.Collection(artifact, key="My collection").save()
+             collection_v2 = collection_v1.append(another_artifact)  # returns a new version of the collection
+             collection_v2.save()  # save the new version
+
+         """
+         return Collection(  # type: ignore
+             self.artifacts.all().list() + [artifact],
+             # key is automatically derived from revises.key
+             description=self.description,
+             revises=self,
+             run=run,
+         )
+
+     def open(self, is_run_input: bool | None = None) -> PyArrowDataset:
+         """Return a cloud-backed pyarrow Dataset.
+
+         Works for `pyarrow`-compatible formats.
+
+         Notes:
+             For more info, see tutorial: :doc:`/arrays`.
+         """
+         if self._state.adding:
+             artifacts = self._artifacts
+             logger.warning("the collection isn't saved, consider calling `.save()`")
+         else:
+             artifacts = self.ordered_artifacts.all()
+         paths = [artifact.path for artifact in artifacts]
+         # this checks that the filesystem is the same for all paths
+         # this is a requirement of pyarrow.dataset.dataset
+         fs = paths[0].fs
+         for path in paths[1:]:
+             # this assumes that the filesystems are cached by fsspec
+             if path.fs is not fs:
+                 raise ValueError(
+                     "The collection has artifacts with different filesystems, this is not supported."
+                 )
+         if not _is_pyarrow_dataset(paths):
+             suffixes = {path.suffix for path in paths}
+             suffixes_str = ", ".join(suffixes)
+             err_msg = (
+                 "This collection is not compatible with pyarrow.dataset.dataset(), "
+             )
+             err_msg += (
+                 f"the artifacts have incompatible file types: {suffixes_str}"
+                 if len(suffixes) > 1
+                 else f"the file type {suffixes_str} is not supported by pyarrow."
+             )
+             raise ValueError(err_msg)
+         dataset = _open_pyarrow_dataset(paths)
+         # track only if successful
+         _track_run_input(self, is_run_input)
+         return dataset
+
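A sketch of consuming the returned dataset (illustrative, not part of the diff; assumes a saved collection of, e.g., parquet artifacts on one filesystem):

    dataset = collection.open()          # pyarrow.dataset.Dataset over all artifact paths
    preview = dataset.head(5)            # peek at the first rows without a full download
    df = dataset.to_table().to_pandas()  # materialize everything in memory
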
+     def mapped(
+         self,
+         layers_keys: str | list[str] | None = None,
+         obs_keys: str | list[str] | None = None,
+         obsm_keys: str | list[str] | None = None,
+         obs_filter: dict[str, str | list[str]] | None = None,
+         join: Literal["inner", "outer"] | None = "inner",
+         encode_labels: bool | list[str] = True,
+         unknown_label: str | dict[str, str] | None = None,
+         cache_categories: bool = True,
+         parallel: bool = False,
+         dtype: str | None = None,
+         stream: bool = False,
+         is_run_input: bool | None = None,
+     ) -> MappedCollection:
+         """Return a map-style dataset.
+
+         Returns a `pytorch map-style dataset
+         <https://pytorch.org/docs/stable/data.html#map-style-datasets>`__ by
+         virtually concatenating `AnnData` arrays.
+
+         If your `AnnData` collection is in the cloud, move the artifacts into a
+         local cache first via :meth:`~lamindb.Collection.cache`.
+
+         `__getitem__` of the `MappedCollection` object takes a single integer index
+         and returns a dictionary with the observation data sample for this index from
+         the `AnnData` objects in the collection. The dictionary has keys for `layers_keys`
+         (`.X` is in `"X"`), `obs_keys`, `obsm_keys` (under `f"obsm_{key}"`) and also `"_store_idx"`
+         for the index of the `AnnData` object containing this observation sample.
+
+         .. note::
+
+             For a guide, see :doc:`docs:scrna-mappedcollection`.
+
+             This method currently only works for collections of `AnnData` artifacts.
+
+         Args:
+             layers_keys: Keys from the ``.layers`` slot. ``layers_keys=None`` or ``"X"`` in the list
+                 retrieves ``.X``.
+             obs_keys: Keys from the ``.obs`` slots.
+             obsm_keys: Keys from the ``.obsm`` slots.
+             obs_filter: Select only observations with these values for the given obs columns.
+                 Should be a dictionary with obs column names as keys
+                 and filtering values (a string or a list of strings) as values.
+             join: `"inner"` or `"outer"` virtual joins. If ``None`` is passed,
+                 does not join.
+             encode_labels: Encode labels into integers.
+                 Can be a list with elements from ``obs_keys``.
+             unknown_label: Encode this label to -1.
+                 Can be a dictionary with keys from ``obs_keys`` if ``encode_labels=True``
+                 or from ``encode_labels`` if it is a list.
+             cache_categories: Enable caching categories of ``obs_keys`` for faster access.
+             parallel: Enable sampling with multiple processes.
+             dtype: Convert numpy arrays from ``.X``, ``.layers`` and ``.obsm`` to this dtype.
+             stream: Whether to stream data from the array backend.
+             is_run_input: Whether to track this collection as run input.
+
+         Examples:
+             >>> import lamindb as ln
+             >>> from torch.utils.data import DataLoader
+             >>> collection = ln.Collection.get(description="my collection")
+             >>> mapped = collection.mapped(obs_keys=["cell_type", "batch"])
+             >>> dl = DataLoader(mapped, batch_size=128, shuffle=True)
+         """
+         path_list = []
+         if self._state.adding:
+             artifacts = self._artifacts
+             logger.warning("the collection isn't saved, consider calling `.save()`")
+         else:
+             artifacts = self.ordered_artifacts.all()
+         for artifact in artifacts:
+             if ".h5ad" not in artifact.suffix and ".zarr" not in artifact.suffix:
+                 logger.warning(f"ignoring artifact with suffix {artifact.suffix}")
+                 continue
+             elif not stream:
+                 path_list.append(artifact.cache())
+             else:
+                 path_list.append(artifact.path)
+         ds = MappedCollection(
+             path_list,
+             layers_keys,
+             obs_keys,
+             obsm_keys,
+             obs_filter,
+             join,
+             encode_labels,
+             unknown_label,
+             cache_categories,
+             parallel,
+             dtype,
+         )
+         # track only if successful
+         _track_run_input(self, is_run_input)
+         return ds
+
+     def cache(self, is_run_input: bool | None = None) -> list[UPath]:
+         """Download cloud artifacts in collection to local cache.
+
+         Follows syncing logic: only downloads artifacts that are not yet cached or are outdated.
+
+         Returns paths to locally cached on-disk artifacts.
+
+         Args:
+             is_run_input: Whether to track this collection as run input.
+         """
+         path_list = []
+         for artifact in self.ordered_artifacts.all():
+             path_list.append(artifact.cache())
+         _track_run_input(self, is_run_input)
+         return path_list
+
+     def load(
+         self,
+         join: Literal["inner", "outer"] = "outer",
+         is_run_input: bool | None = None,
+         **kwargs,
+     ) -> Any:
+         """Cache and load to memory.
+
+         Returns an in-memory representation if possible, such as a concatenated `DataFrame` or `AnnData` object.
+         """
+         # cannot call _track_run_input here, see comment further down
+         all_artifacts = self.ordered_artifacts.all()
+         suffixes = [artifact.suffix for artifact in all_artifacts]
+         if len(set(suffixes)) != 1:
+             raise RuntimeError(
+                 "Can only load collections where all artifacts have the same suffix"
+             )
+         # because we're tracking data flow on the collection-level, here, we don't
+         # want to track it on the artifact-level
+         objects = [artifact.load(is_run_input=False) for artifact in all_artifacts]
+         artifact_uids = [artifact.uid for artifact in all_artifacts]
+         if isinstance(objects[0], pd.DataFrame):
+             concat_object = pd.concat(objects, join=join)
+         elif isinstance(objects[0], ad.AnnData):
+             concat_object = ad.concat(
+                 objects, join=join, label="artifact_uid", keys=artifact_uids
+             )
+         # only call it here because there might be errors during concat
+         _track_run_input(self, is_run_input)
+         return concat_object
+
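A sketch of the concatenation behavior of `load()` (illustrative, not part of the diff; assumes a saved collection of `.h5ad` artifacts named `collection`):

    adata = collection.load(join="inner")  # ad.concat across all artifacts
    # because ad.concat is called with label="artifact_uid", each observation
    # carries the uid of the artifact it came from
    print(adata.obs["artifact_uid"].value_counts())
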
+     def delete(self, permanent: bool | None = None) -> None:
+         """Delete collection.
+
+         Args:
+             permanent: Whether to permanently delete the collection record (skips trash).
+
+         Examples:
+
+             For any `Collection` object `collection`, call:
+
+             >>> collection.delete()
+         """
+         # change _branch_code to trash
+         trash__branch_code = -1
+         if self._branch_code > trash__branch_code and permanent is not True:
+             self._branch_code = trash__branch_code
+             self.save()
+             logger.warning(
+                 f"moved collection to trash (_branch_code = {trash__branch_code})"
+             )
+             return
+
+         # permanent delete
+         if permanent is None:
+             response = input(
+                 "Collection record is already in trash! Are you sure you want to delete"
+                 " it from your database? (y/n) You can't undo this action."
+             )
+             delete_record = response == "y"
+         else:
+             delete_record = permanent
+
+         if delete_record:
+             super().delete()
+
+     def save(self, using: str | None = None) -> Collection:
+         """Save the collection and underlying artifacts to database & storage.
+
+         Args:
+             using: The database to which you want to save.
+
+         Examples:
+             >>> collection = ln.Collection(artifact, key="my-collection")
+             >>> collection.save()
+         """
+         if self.meta_artifact is not None:
+             self.meta_artifact.save()
+         super().save()
+         # we don't allow updating the collection of artifacts
+         # if users want to update the set of artifacts, they
+         # have to create a new collection
+         if hasattr(self, "_artifacts"):
+             links = [
+                 CollectionArtifact(collection_id=self.id, artifact_id=artifact.id)  # type: ignore
+                 for artifact in self._artifacts
+             ]
+             # the below seems to preserve the order of the list in the
+             # auto-incrementing integer primary key
+             # merely using .artifacts.set(*...) doesn't achieve this
+             # we need ignore_conflicts=True so that this won't error if links already exist
+             CollectionArtifact.objects.bulk_create(links, ignore_conflicts=True)
+         save_schema_links(self)
+         if using is not None:
+             logger.warning("using argument is ignored")
+         return self
+
+     def restore(self) -> None:
+         """Restore collection record from trash.
+
+         Examples:
+
+             For any `Collection` object `collection`, call:
+
+             >>> collection.restore()
+         """
+         self._branch_code = 1
+         self.save()
+
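A sketch of the trash semantics implemented by `delete()` and `restore()` above (illustrative, not part of the diff):

    collection.delete()                 # sets _branch_code = -1, i.e., moves the record to trash
    collection.restore()                # sets _branch_code = 1, i.e., brings it back
    collection.delete(permanent=True)   # deletes the record from the database without prompting
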
+     @property
+     def transform(self) -> Transform | None:
+         """Transform whose run created the collection."""
+         return self.run.transform if self.run is not None else None
+
+     @property
+     def name(self) -> str:
+         """Name of the collection.
+
+         Splits `key` on `/` and returns the last element.
+         """
+         return self.key.split("/")[-1]
+
+     @property
+     def ordered_artifacts(self) -> QuerySet:
+         """Ordered `QuerySet` of `.artifacts`.
+
+         Accessing the many-to-many field `collection.artifacts` directly gives
+         you non-deterministic order.
+
+         Using the property `.ordered_artifacts` allows you to iterate through a set
+         that's ordered in the order of creation.
+         """
+         return self.artifacts.order_by("links_collection__id")
+
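For illustration (not part of the diff), iterating in creation order over a saved collection:

    for artifact in collection.ordered_artifacts:
        # ordered by the auto-incrementing id of the CollectionArtifact link table
        print(artifact.uid, artifact.key)
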
+     @property
+     def data_artifact(self) -> Artifact | None:
+         """Access to a single data artifact.
+
+         If the collection has a single data & metadata artifact, this allows access via::
+
+             collection.data_artifact  # first & only element of collection.artifacts
+             collection.meta_artifact  # metadata
+
+         """
+         return self.artifacts.first()
+
+     def describe(self) -> None:
+         """Describe relations of the record.
+
+         Examples:
+             >>> collection.describe()
+         """
+         return describe_artifact_collection(self)
+
+     def _populate_subsequent_runs(self, run: Run) -> None:
+         _populate_subsequent_runs_(self, run)
+
+
+ # internal function, not exposed to user
+ def from_artifacts(artifacts: Iterable[Artifact]) -> str:
+     # assert all artifacts are already saved
+     saved = not any(artifact._state.adding for artifact in artifacts)
+     if not saved:
+         raise ValueError("Not all artifacts are yet saved, please save them")
+     # validate consistency of hashes - we do not allow duplicate hashes
+     hashes = [artifact.hash for artifact in artifacts if artifact.hash is not None]
+     hashes_set = set(hashes)
+     if len(hashes) != len(hashes_set):
+         seen = set()
+         non_unique = [x for x in hashes if x in seen or seen.add(x)]  # type: ignore
+         raise ValueError(
+             "Please pass artifacts with distinct hashes: these ones are non-unique"
+             f" {non_unique}"
+         )
+     hash = hash_set(hashes_set)
+     return hash
+
+
+ class CollectionArtifact(BasicRecord, LinkORM, TracksRun):
+     id: int = models.BigAutoField(primary_key=True)
+     collection: Collection = ForeignKey(
+         Collection, CASCADE, related_name="links_artifact"
+     )
+     artifact: Artifact = ForeignKey(Artifact, PROTECT, related_name="links_collection")
+
+     class Meta:
+         unique_together = ("collection", "artifact")
+
+
+ # mypy: ignore-errors
+ Collection.view_lineage = view_lineage
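
To close, an end-to-end sketch of the `Collection` API laid out in this file (illustrative only; assumes a configured lamindb instance, saved `AnnData` artifacts, and a further saved artifact `new_artifact`, which is hypothetical):

    import lamindb as ln
    from torch.utils.data import DataLoader

    artifacts = ln.Artifact.filter(suffix=".h5ad").list()
    collection = ln.Collection(artifacts, key="scrna/all-batches").save()

    paths = collection.cache()                          # download to local cache
    mapped = collection.mapped(obs_keys=["cell_type"])  # pytorch map-style dataset
    loader = DataLoader(mapped, batch_size=128, shuffle=True)

    # appending creates and saves a new version under the same key
    collection_v2 = collection.append(new_artifact).save()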