lamindb 1.1.0__py3-none-any.whl → 1.2a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +31 -26
- lamindb/_finish.py +9 -1
- lamindb/_tracked.py +26 -3
- lamindb/_view.py +2 -3
- lamindb/base/__init__.py +1 -1
- lamindb/base/ids.py +1 -10
- lamindb/base/users.py +1 -4
- lamindb/core/__init__.py +7 -65
- lamindb/core/_context.py +41 -10
- lamindb/core/_mapped_collection.py +4 -2
- lamindb/core/_settings.py +6 -6
- lamindb/core/_sync_git.py +1 -1
- lamindb/core/_track_environment.py +2 -1
- lamindb/core/datasets/_small.py +3 -3
- lamindb/core/loaders.py +22 -9
- lamindb/core/storage/_anndata_accessor.py +8 -3
- lamindb/core/storage/_backed_access.py +14 -7
- lamindb/core/storage/_pyarrow_dataset.py +24 -9
- lamindb/core/storage/_tiledbsoma.py +6 -4
- lamindb/core/storage/_zarr.py +32 -11
- lamindb/core/storage/objects.py +59 -26
- lamindb/core/storage/paths.py +16 -13
- lamindb/curators/__init__.py +173 -145
- lamindb/errors.py +1 -1
- lamindb/integrations/_vitessce.py +4 -4
- lamindb/migrations/0089_subsequent_runs.py +159 -0
- lamindb/migrations/0090_runproject_project_runs.py +73 -0
- lamindb/migrations/{0088_squashed.py → 0090_squashed.py} +245 -177
- lamindb/models/__init__.py +79 -0
- lamindb/{core → models}/_describe.py +3 -3
- lamindb/{core → models}/_django.py +8 -5
- lamindb/{core → models}/_feature_manager.py +103 -87
- lamindb/{_from_values.py → models/_from_values.py} +5 -2
- lamindb/{core/versioning.py → models/_is_versioned.py} +94 -6
- lamindb/{core → models}/_label_manager.py +10 -17
- lamindb/{core/relations.py → models/_relations.py} +8 -1
- lamindb/models/artifact.py +2601 -0
- lamindb/{_can_curate.py → models/can_curate.py} +349 -180
- lamindb/models/collection.py +683 -0
- lamindb/models/core.py +135 -0
- lamindb/models/feature.py +643 -0
- lamindb/models/flextable.py +163 -0
- lamindb/{_parents.py → models/has_parents.py} +55 -49
- lamindb/models/project.py +384 -0
- lamindb/{_query_manager.py → models/query_manager.py} +10 -8
- lamindb/{_query_set.py → models/query_set.py} +52 -30
- lamindb/models/record.py +1757 -0
- lamindb/models/run.py +563 -0
- lamindb/{_save.py → models/save.py} +18 -8
- lamindb/models/schema.py +732 -0
- lamindb/models/transform.py +360 -0
- lamindb/models/ulabel.py +249 -0
- {lamindb-1.1.0.dist-info → lamindb-1.2a2.dist-info}/METADATA +5 -5
- lamindb-1.2a2.dist-info/RECORD +94 -0
- lamindb/_artifact.py +0 -1361
- lamindb/_collection.py +0 -440
- lamindb/_feature.py +0 -316
- lamindb/_is_versioned.py +0 -40
- lamindb/_record.py +0 -1065
- lamindb/_run.py +0 -60
- lamindb/_schema.py +0 -347
- lamindb/_storage.py +0 -15
- lamindb/_transform.py +0 -170
- lamindb/_ulabel.py +0 -56
- lamindb/_utils.py +0 -9
- lamindb/base/validation.py +0 -63
- lamindb/core/_data.py +0 -491
- lamindb/core/fields.py +0 -12
- lamindb/models.py +0 -4435
- lamindb-1.1.0.dist-info/RECORD +0 -95
- {lamindb-1.1.0.dist-info → lamindb-1.2a2.dist-info}/LICENSE +0 -0
- {lamindb-1.1.0.dist-info → lamindb-1.2a2.dist-info}/WHEEL +0 -0
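The most consequential structural change in the summary above is the split of the monolithic `lamindb/models.py` (removed, −4435 lines) into a `lamindb/models/` package, with the former private top-level modules (`_artifact.py`, `_collection.py`, …) moving under it. Below is a minimal sketch of what this implies for imports, assuming the new `lamindb/models/__init__.py` (+79 lines) re-exports the moved registry classes:

```python
# Public imports should be unaffected by the split, assuming
# lamindb/models/__init__.py re-exports the registry classes:
from lamindb.models import Artifact, Collection

# Deep imports now resolve to submodules of the new package,
# e.g. the Collection class shown in the hunk below:
from lamindb.models.collection import Collection  # lamindb >= 1.2a2
```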
@@ -0,0 +1,683 @@

```python
from __future__ import annotations

import warnings
from typing import (
    TYPE_CHECKING,
    Any,
    Literal,
    overload,
)

import anndata as ad
import pandas as pd
from django.db import models
from django.db.models import CASCADE, PROTECT
from lamin_utils import logger
from lamindb_setup.core.hashing import HASH_LENGTH, hash_set

from lamindb.base.fields import (
    CharField,
    ForeignKey,
    OneToOneField,
    TextField,
)

from ..base.ids import base62_20
from ..core._mapped_collection import MappedCollection
from ..core.storage._pyarrow_dataset import _is_pyarrow_dataset, _open_pyarrow_dataset
from ..errors import FieldValidationError
from ..models._is_versioned import process_revises
from ._is_versioned import IsVersioned
from .artifact import (
    Artifact,
    _populate_subsequent_runs_,
    _track_run_input,
    describe_artifact_collection,
    get_run,
    save_schema_links,
)
from .has_parents import view_lineage
from .record import (
    BasicRecord,
    LinkORM,
    Record,
    _get_record_kwargs,
    init_self_from_db,
    update_attributes,
)
from .run import Run, TracksRun, TracksUpdates

if TYPE_CHECKING:
    from collections.abc import Iterable

    from pyarrow.dataset import Dataset as PyArrowDataset

    from ..core.storage import UPath
    from .project import Project, Reference
    from .query_set import QuerySet
    from .transform import Transform
    from .ulabel import ULabel


# below is a draft for the future, see also the tests in test_collection.py
#
# class CollectionFeatureManager:
#     """Query features of artifact in collection."""

#     def __init__(self, collection: Collection):
#         self._collection = collection

#     def _get_staged_feature_sets_union(self) -> dict[str, Schema]:
#         links_schema_artifact = Artifact.feature_sets.through.objects.filter(
#             artifact_id__in=self._collection.artifacts.values_list("id", flat=True)
#         )
#         feature_sets_by_slots = defaultdict(list)
#         for link in links_schema_artifact:
#             feature_sets_by_slots[link.slot].append(link.schema_id)
#         feature_sets_union = {}
#         for slot, schema_ids_slot in feature_sets_by_slots.items():
#             schema_1 = Schema.get(id=schema_ids_slot[0])
#             related_name = schema_1._get_related_name()
#             features_registry = getattr(Schema, related_name).field.model
#             # this way of writing the __in statement turned out to be the fastest
#             # evaluated on a link table with 16M entries connecting 500 feature sets with
#             # 60k genes
#             feature_ids = (
#                 features_registry.schemas.through.objects.filter(
#                     schema_id__in=schema_ids_slot
#                 )
#                 .values(f"{features_registry.__name__.lower()}_id")
#                 .distinct()
#             )
#             features = features_registry.filter(id__in=feature_ids)
#             feature_sets_union[slot] = Schema(features, dtype=schema_1.dtype)
#         return feature_sets_union


class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
    """Collections of artifacts.

    Collections provide a simple way of versioning collections of artifacts.

    Args:
        artifacts: `list[Artifact]` A list of artifacts.
        key: `str` A file-path like key, analogous to the `key` parameter of `Artifact` and `Transform`.
        description: `str | None = None` A description.
        revises: `Collection | None = None` An old version of the collection.
        run: `Run | None = None` The run that creates the collection.
        meta: `Artifact | None = None` An artifact that defines metadata for the collection.
        reference: `str | None = None` A simple reference, e.g. an external ID or a URL.
        reference_type: `str | None = None` A way to indicate the type of the simple reference, e.g., `"url"`.

    See Also:
        :class:`~lamindb.Artifact`

    Examples:

        Create a collection from a list of :class:`~lamindb.Artifact` objects:

        >>> collection = ln.Collection([artifact1, artifact2], key="my_project/my_collection")

        Create a collection that groups a data & a metadata artifact (e.g., here :doc:`docs:rxrx`):

        >>> collection = ln.Collection(data_artifact, key="my_project/my_collection", meta=metadata_artifact)

    """

    class Meta(Record.Meta, IsVersioned.Meta, TracksRun.Meta, TracksUpdates.Meta):
        abstract = False

    _len_full_uid: int = 20
    _len_stem_uid: int = 16
    _name_field: str = "key"

    id: int = models.AutoField(primary_key=True)
    """Internal id, valid only in one DB instance."""
    uid: str = CharField(
        editable=False,
        unique=True,
        db_index=True,
        max_length=_len_full_uid,
        default=base62_20,
    )
    """Universal id, valid across DB instances."""
    key: str = CharField(db_index=True)
    """Name or path-like key."""
    # below is the only case in which we use a TextField
    # for description; we do so because users had descriptions exceeding 255 chars
    # in their instances
    description: str | None = TextField(null=True, db_index=True)
    """A description or title."""
    hash: str | None = CharField(
        max_length=HASH_LENGTH, db_index=True, null=True, unique=True
    )
    """Hash of collection content."""
    reference: str | None = CharField(max_length=255, db_index=True, null=True)
    """A reference like URL or external ID."""
    # also for reference_type here, we allow an extra long max_length
    reference_type: str | None = CharField(max_length=25, db_index=True, null=True)
    """Type of reference, e.g., cellxgene Census collection_id."""
    ulabels: ULabel = models.ManyToManyField(
        "ULabel", through="CollectionULabel", related_name="collections"
    )
    """ULabels sampled in the collection (see :class:`~lamindb.Feature`)."""
    run: Run | None = ForeignKey(
        Run, PROTECT, related_name="output_collections", null=True, default=None
    )
    """:class:`~lamindb.Run` that created the `collection`."""
    input_of_runs: Run = models.ManyToManyField(Run, related_name="input_collections")
    """Runs that use this collection as an input."""
    _subsequent_runs: Run = models.ManyToManyField(
        "Run",
        related_name="_recreated_collections",
        db_table="lamindb_collection__previous_runs",  # legacy name, change in lamindb v2
    )
    """Runs that re-created the record after initial creation."""
    artifacts: Artifact = models.ManyToManyField(
        "Artifact", related_name="collections", through="CollectionArtifact"
    )
    """Artifacts in collection."""
    meta_artifact: Artifact | None = OneToOneField(
        "Artifact",
        PROTECT,
        null=True,
        unique=True,
        related_name="_meta_of_collection",
    )
    """An artifact that stores metadata that indexes a collection.

    It has a 1:1 correspondence with an artifact. If needed, you can access the
    collection from the artifact via a private field:
    `artifact._meta_of_collection`.
    """
    _actions: Artifact = models.ManyToManyField(Artifact, related_name="+")
    """Actions to attach for the UI."""
    projects: Project
    """Linked projects."""
    references: Reference
    """Linked references."""

    @overload
    def __init__(
        self,
        artifacts: list[Artifact],
        key: str,
        description: str | None = None,
        meta: Any | None = None,
        reference: str | None = None,
        reference_type: str | None = None,
        run: Run | None = None,
        revises: Collection | None = None,
    ): ...

    @overload
    def __init__(
        self,
        *db_args,
    ): ...

    def __init__(
        self,
        *args,
        **kwargs,
    ):
        if len(args) == len(self._meta.concrete_fields):
            super().__init__(*args, **kwargs)
            return None
        # now we proceed with the user-facing constructor
        if len(args) > 1:
            raise ValueError("Only one non-keyword arg allowed: artifacts")
        artifacts: Artifact | Iterable[Artifact] = (
            kwargs.pop("artifacts") if len(args) == 0 else args[0]
        )
        meta_artifact: Artifact | None = kwargs.pop("meta_artifact", None)
        tmp_key: str | None = kwargs.pop("key", None)
        description: str | None = kwargs.pop("description", None)
        reference: str | None = kwargs.pop("reference", None)
        reference_type: str | None = kwargs.pop("reference_type", None)
        run: Run | None = kwargs.pop("run", None)
        revises: Collection | None = kwargs.pop("revises", None)
        version: str | None = kwargs.pop("version", None)
        _branch_code: int | None = kwargs.pop("_branch_code", 1)
        key: str
        if "name" in kwargs:
            key = kwargs.pop("name")
            warnings.warn(
                f"argument `name` will be removed, please pass {key} to `key` instead",
                FutureWarning,
                stacklevel=2,
            )
        else:
            key = tmp_key
        if not len(kwargs) == 0:
            valid_keywords = ", ".join(
                [val[0] for val in _get_record_kwargs(Collection)]
            )
            raise FieldValidationError(
                f"Only {valid_keywords} can be passed, you passed: {kwargs}"
            )
        if revises is None:
            revises = (
                Collection.filter(key=key, is_latest=True)
                .order_by("-created_at")
                .first()
            )
        provisional_uid, version, key, description, revises = process_revises(
            revises, version, key, description, Collection
        )
        run = get_run(run)
        if isinstance(artifacts, Artifact):
            artifacts = [artifacts]
        else:
            if not hasattr(artifacts, "__getitem__"):
                raise ValueError("Artifact or list[Artifact] is allowed.")
            assert isinstance(artifacts[0], Artifact)  # type: ignore  # noqa: S101
        hash = from_artifacts(artifacts)  # type: ignore
        if meta_artifact is not None:
            if not isinstance(meta_artifact, Artifact):
                raise ValueError("meta_artifact has to be an Artifact")
            if isinstance(meta_artifact, Artifact):
                if meta_artifact._state.adding:
                    raise ValueError(
                        "Save meta_artifact artifact before creating collection!"
                    )
        # we ignore collections in trash containing the same hash
        if hash is not None:
            existing_collection = Collection.filter(hash=hash).one_or_none()
        else:
            existing_collection = None
        if existing_collection is not None:
            logger.warning(
                f"returning existing collection with same hash: {existing_collection}; if you intended to query to track this collection as an input, use: ln.Collection.get()"
            )
            if run is not None:
                existing_collection._populate_subsequent_runs(run)
            init_self_from_db(self, existing_collection)
            update_attributes(self, {"description": description, "key": key})
        else:
            _skip_validation = revises is not None and key == revises.key
            super().__init__(  # type: ignore
                uid=provisional_uid,
                key=key,
                description=description,
                reference=reference,
                reference_type=reference_type,
                meta_artifact=meta_artifact,
                hash=hash,
                run=run,
                version=version,
                _branch_code=_branch_code,
                revises=revises,
                _skip_validation=_skip_validation,
            )
        self._artifacts = artifacts
        if revises is not None:
            _track_run_input(revises, run=run)
        _track_run_input(artifacts, run=run)

    def append(self, artifact: Artifact, run: Run | None = None) -> Collection:
        """Append an artifact to the collection.

        This does not modify the original collection in-place, but returns a new version
        of the original collection with the appended artifact.

        Args:
            artifact: An artifact to add to the collection.
            run: The run that creates the new version of the collection.

        Examples::

            collection_v1 = ln.Collection(artifact, key="My collection").save()
            collection_v2 = collection_v1.append(another_artifact)  # returns a new version of the collection
            collection_v2.save()  # save the new version

        """
        return Collection(  # type: ignore
            self.artifacts.all().list() + [artifact],
            # key is automatically derived from revises.key
            description=self.description,
            revises=self,
            run=run,
        )

    def open(self, is_run_input: bool | None = None) -> PyArrowDataset:
        """Return a cloud-backed pyarrow Dataset.

        Works for `pyarrow` compatible formats.

        Notes:
            For more info, see tutorial: :doc:`/arrays`.
        """
        if self._state.adding:
            artifacts = self._artifacts
            logger.warning("the collection isn't saved, consider calling `.save()`")
        else:
            artifacts = self.ordered_artifacts.all()
        paths = [artifact.path for artifact in artifacts]
        # this checks that the filesystem is the same for all paths
        # this is a requirement of pyarrow.dataset.dataset
        fs = paths[0].fs
        for path in paths[1:]:
            # this assumes that the filesystems are cached by fsspec
            if path.fs is not fs:
                raise ValueError(
                    "The collection has artifacts with different filesystems, this is not supported."
                )
        if not _is_pyarrow_dataset(paths):
            suffixes = {path.suffix for path in paths}
            suffixes_str = ", ".join(suffixes)
            err_msg = (
                "This collection is not compatible with pyarrow.dataset.dataset(), "
            )
            err_msg += (
                f"the artifacts have incompatible file types: {suffixes_str}"
                if len(suffixes) > 1
                else f"the file type {suffixes_str} is not supported by pyarrow."
            )
            raise ValueError(err_msg)
        dataset = _open_pyarrow_dataset(paths)
        # track only if successful
        _track_run_input(self, is_run_input)
        return dataset

    def mapped(
        self,
        layers_keys: str | list[str] | None = None,
        obs_keys: str | list[str] | None = None,
        obsm_keys: str | list[str] | None = None,
        obs_filter: dict[str, str | list[str]] | None = None,
        join: Literal["inner", "outer"] | None = "inner",
        encode_labels: bool | list[str] = True,
        unknown_label: str | dict[str, str] | None = None,
        cache_categories: bool = True,
        parallel: bool = False,
        dtype: str | None = None,
        stream: bool = False,
        is_run_input: bool | None = None,
    ) -> MappedCollection:
        """Return a map-style dataset.

        Returns a `pytorch map-style dataset
        <https://pytorch.org/docs/stable/data.html#map-style-datasets>`__ by
        virtually concatenating `AnnData` arrays.

        If your `AnnData` objects are in the cloud, move them into a local
        cache first via :meth:`~lamindb.Collection.cache`.

        `__getitem__` of the `MappedCollection` object takes a single integer index
        and returns a dictionary with the observation data sample for this index from
        the `AnnData` objects in the collection. The dictionary has keys for `layers_keys`
        (`.X` is in `"X"`), `obs_keys`, `obsm_keys` (under `f"obsm_{key}"`) and also `"_store_idx"`
        for the index of the `AnnData` object containing this observation sample.

        .. note::

            For a guide, see :doc:`docs:scrna-mappedcollection`.

            This method currently only works for collections of `AnnData` artifacts.

        Args:
            layers_keys: Keys from the ``.layers`` slot. ``layers_keys=None`` or ``"X"`` in the list
                retrieves ``.X``.
            obs_keys: Keys from the ``.obs`` slots.
            obsm_keys: Keys from the ``.obsm`` slots.
            obs_filter: Select only observations with these values for the given obs columns.
                Should be a dictionary with obs column names as keys
                and filtering values (a string or a list of strings) as values.
            join: `"inner"` or `"outer"` virtual joins. If ``None`` is passed,
                does not join.
            encode_labels: Encode labels into integers.
                Can be a list with elements from ``obs_keys``.
            unknown_label: Encode this label to -1.
                Can be a dictionary with keys from ``obs_keys`` if ``encode_labels=True``
                or from ``encode_labels`` if it is a list.
            cache_categories: Enable caching categories of ``obs_keys`` for faster access.
            parallel: Enable sampling with multiple processes.
            dtype: Convert numpy arrays from ``.X``, ``.layers`` and ``.obsm`` to this dtype.
            stream: Whether to stream data from the array backend.
            is_run_input: Whether to track this collection as run input.

        Examples:
            >>> import lamindb as ln
            >>> from torch.utils.data import DataLoader
            >>> collection = ln.Collection.get(description="my collection")
            >>> mapped = collection.mapped(obs_keys=["cell_type", "batch"])
            >>> dl = DataLoader(mapped, batch_size=128, shuffle=True)
        """
        path_list = []
        if self._state.adding:
            artifacts = self._artifacts
            logger.warning("the collection isn't saved, consider calling `.save()`")
        else:
            artifacts = self.ordered_artifacts.all()
        for artifact in artifacts:
            if ".h5ad" not in artifact.suffix and ".zarr" not in artifact.suffix:
                logger.warning(f"ignoring artifact with suffix {artifact.suffix}")
                continue
            elif not stream:
                path_list.append(artifact.cache())
            else:
                path_list.append(artifact.path)
        ds = MappedCollection(
            path_list,
            layers_keys,
            obs_keys,
            obsm_keys,
            obs_filter,
            join,
            encode_labels,
            unknown_label,
            cache_categories,
            parallel,
            dtype,
        )
        # track only if successful
        _track_run_input(self, is_run_input)
        return ds

    def cache(self, is_run_input: bool | None = None) -> list[UPath]:
        """Download cloud artifacts in collection to local cache.

        Follows syncing logic: only caches outdated artifacts.

        Returns paths to locally cached on-disk artifacts.

        Args:
            is_run_input: Whether to track this collection as run input.
        """
        path_list = []
        for artifact in self.ordered_artifacts.all():
            path_list.append(artifact.cache())
        _track_run_input(self, is_run_input)
        return path_list

    def load(
        self,
        join: Literal["inner", "outer"] = "outer",
        is_run_input: bool | None = None,
        **kwargs,
    ) -> Any:
        """Stage and load to memory.

        Returns an in-memory representation if possible, such as a concatenated `DataFrame` or `AnnData` object.
        """
        # cannot call _track_run_input here, see comment further down
        all_artifacts = self.ordered_artifacts.all()
        suffixes = [artifact.suffix for artifact in all_artifacts]
        if len(set(suffixes)) != 1:
            raise RuntimeError(
                "Can only load collections where all artifacts have the same suffix"
            )
        # because we're tracking data flow on the collection-level, here, we don't
        # want to track it on the artifact-level
        objects = [artifact.load(is_run_input=False) for artifact in all_artifacts]
        artifact_uids = [artifact.uid for artifact in all_artifacts]
        if isinstance(objects[0], pd.DataFrame):
            concat_object = pd.concat(objects, join=join)
        elif isinstance(objects[0], ad.AnnData):
            concat_object = ad.concat(
                objects, join=join, label="artifact_uid", keys=artifact_uids
            )
        # only call it here because there might be errors during concat
        _track_run_input(self, is_run_input)
        return concat_object

    def delete(self, permanent: bool | None = None) -> None:
        """Delete collection.

        Args:
            permanent: Whether to permanently delete the collection record (skips trash).

        Examples:

            For any `Collection` object `collection`, call:

            >>> collection.delete()
        """
        # change _branch_code to trash
        trash__branch_code = -1
        if self._branch_code > trash__branch_code and permanent is not True:
            self._branch_code = trash__branch_code
            self.save()
            logger.warning(
                f"moved collection to trash (_branch_code = {trash__branch_code})"
            )
            return

        # permanent delete
        if permanent is None:
            response = input(
                "Collection record is already in trash! Are you sure to delete it from your"
                " database? (y/n) You can't undo this action."
            )
            delete_record = response == "y"
        else:
            delete_record = permanent

        if delete_record:
            super().delete()

    def save(self, using: str | None = None) -> Collection:
        """Save the collection and underlying artifacts to database & storage.

        Args:
            using: The database to which you want to save.

        Examples:
            >>> collection = ln.Collection([artifact1, artifact2], key="my_collection").save()
        """
        if self.meta_artifact is not None:
            self.meta_artifact.save()
        super().save()
        # we don't allow updating the collection of artifacts
        # if users want to update the set of artifacts, they
        # have to create a new collection
        if hasattr(self, "_artifacts"):
            links = [
                CollectionArtifact(collection_id=self.id, artifact_id=artifact.id)  # type: ignore
                for artifact in self._artifacts
            ]
            # the below seems to preserve the order of the list in the
            # auto-incrementing integer primary key
            # merely using .artifacts.set(*...) doesn't achieve this
            # we need ignore_conflicts=True so that this won't error if links already exist
            CollectionArtifact.objects.bulk_create(links, ignore_conflicts=True)
        save_schema_links(self)
        if using is not None:
            logger.warning("using argument is ignored")
        return self

    def restore(self) -> None:
        """Restore collection record from trash.

        Examples:

            For any `Collection` object `collection`, call:

            >>> collection.restore()
        """
        self._branch_code = 1
        self.save()

    @property
    def transform(self) -> Transform | None:
        """Transform whose run created the collection."""
        return self.run.transform if self.run is not None else None

    @property
    def name(self) -> str:
        """Name of the collection.

        Splits `key` on `/` and returns the last element.
        """
        return self.key.split("/")[-1]

    @property
    def ordered_artifacts(self) -> QuerySet:
        """Ordered `QuerySet` of `.artifacts`.

        Accessing the many-to-many field `collection.artifacts` directly gives
        you non-deterministic order.

        Using the property `.ordered_artifacts` lets you iterate through a set
        that's ordered in the order of creation.
        """
        return self.artifacts.order_by("links_collection__id")

    @property
    def data_artifact(self) -> Artifact | None:
        """Access to a single data artifact.

        If the collection has a single data & metadata artifact, this allows access via::

            collection.data_artifact  # first & only element of collection.artifacts
            collection.meta_artifact  # metadata

        """
        return self.artifacts.first()

    def describe(self) -> None:
        """Describe relations of record.

        Examples:
            >>> collection.describe()
        """
        return describe_artifact_collection(self)

    def _populate_subsequent_runs(self, run: Run) -> None:
        _populate_subsequent_runs_(self, run)


# internal function, not exposed to user
def from_artifacts(artifacts: Iterable[Artifact]) -> str:
    # assert all artifacts are already saved
    saved = not any(artifact._state.adding for artifact in artifacts)
    if not saved:
        raise ValueError("Not all artifacts are yet saved, please save them")
    # validate consistency of hashes - we do not allow duplicate hashes
    hashes = [artifact.hash for artifact in artifacts if artifact.hash is not None]
    hashes_set = set(hashes)
    if len(hashes) != len(hashes_set):
        seen = set()
        non_unique = [x for x in hashes if x in seen or seen.add(x)]  # type: ignore
        raise ValueError(
            "Please pass artifacts with distinct hashes: these ones are non-unique"
            f" {non_unique}"
        )
    hash = hash_set(hashes_set)
    return hash


class CollectionArtifact(BasicRecord, LinkORM, TracksRun):
    id: int = models.BigAutoField(primary_key=True)
    collection: Collection = ForeignKey(
        Collection, CASCADE, related_name="links_artifact"
    )
    artifact: Artifact = ForeignKey(Artifact, PROTECT, related_name="links_collection")

    class Meta:
        unique_together = ("collection", "artifact")


# mypy: ignore-errors
Collection.view_lineage = view_lineage
```
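For orientation, here is a minimal usage sketch of the `Collection` API added by this file, following its own docstrings; the file paths and variable names are hypothetical and assume an initialized lamindb instance:

```python
import lamindb as ln

# hypothetical saved artifacts (the constructor requires saved artifacts)
artifact1 = ln.Artifact("./data/batch1.h5ad", key="batch1.h5ad").save()
artifact2 = ln.Artifact("./data/batch2.h5ad", key="batch2.h5ad").save()

# create & save a versioned collection with a file-path-like key
collection = ln.Collection([artifact1, artifact2], key="my_project/my_collection").save()

# append() does not mutate in place; it returns a new version via revises
artifact3 = ln.Artifact("./data/batch3.h5ad", key="batch3.h5ad").save()
collection_v2 = collection.append(artifact3).save()

# iterate in creation order, then load into one concatenated AnnData
print(collection_v2.ordered_artifacts.all())
adata = collection_v2.load(join="outer")
```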