lamindb 1.1.1__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. lamindb/__init__.py +30 -25
  2. lamindb/_tracked.py +1 -1
  3. lamindb/_view.py +2 -3
  4. lamindb/base/__init__.py +1 -1
  5. lamindb/base/ids.py +1 -10
  6. lamindb/core/__init__.py +7 -65
  7. lamindb/core/_compat.py +60 -0
  8. lamindb/core/_context.py +43 -20
  9. lamindb/core/_settings.py +6 -6
  10. lamindb/core/_sync_git.py +1 -1
  11. lamindb/core/loaders.py +30 -19
  12. lamindb/core/storage/_backed_access.py +4 -2
  13. lamindb/core/storage/_tiledbsoma.py +8 -6
  14. lamindb/core/storage/_zarr.py +104 -25
  15. lamindb/core/storage/objects.py +63 -28
  16. lamindb/core/storage/paths.py +4 -1
  17. lamindb/core/types.py +10 -0
  18. lamindb/curators/__init__.py +100 -85
  19. lamindb/errors.py +1 -1
  20. lamindb/integrations/_vitessce.py +4 -4
  21. lamindb/migrations/0089_subsequent_runs.py +159 -0
  22. lamindb/migrations/0090_runproject_project_runs.py +73 -0
  23. lamindb/migrations/{0088_squashed.py → 0090_squashed.py} +245 -177
  24. lamindb/models/__init__.py +79 -0
  25. lamindb/{core → models}/_describe.py +3 -3
  26. lamindb/{core → models}/_django.py +8 -5
  27. lamindb/{core → models}/_feature_manager.py +103 -87
  28. lamindb/{_from_values.py → models/_from_values.py} +5 -2
  29. lamindb/{core/versioning.py → models/_is_versioned.py} +94 -6
  30. lamindb/{core → models}/_label_manager.py +10 -17
  31. lamindb/{core/relations.py → models/_relations.py} +8 -1
  32. lamindb/models/artifact.py +2602 -0
  33. lamindb/{_can_curate.py → models/can_curate.py} +349 -180
  34. lamindb/models/collection.py +683 -0
  35. lamindb/models/core.py +135 -0
  36. lamindb/models/feature.py +643 -0
  37. lamindb/models/flextable.py +163 -0
  38. lamindb/{_parents.py → models/has_parents.py} +55 -49
  39. lamindb/models/project.py +384 -0
  40. lamindb/{_query_manager.py → models/query_manager.py} +10 -8
  41. lamindb/{_query_set.py → models/query_set.py} +40 -26
  42. lamindb/models/record.py +1762 -0
  43. lamindb/models/run.py +563 -0
  44. lamindb/{_save.py → models/save.py} +9 -7
  45. lamindb/models/schema.py +732 -0
  46. lamindb/models/transform.py +360 -0
  47. lamindb/models/ulabel.py +249 -0
  48. {lamindb-1.1.1.dist-info → lamindb-1.2.0.dist-info}/METADATA +6 -6
  49. {lamindb-1.1.1.dist-info → lamindb-1.2.0.dist-info}/RECORD +51 -51
  50. lamindb/_artifact.py +0 -1379
  51. lamindb/_collection.py +0 -440
  52. lamindb/_feature.py +0 -316
  53. lamindb/_is_versioned.py +0 -40
  54. lamindb/_record.py +0 -1064
  55. lamindb/_run.py +0 -60
  56. lamindb/_schema.py +0 -347
  57. lamindb/_storage.py +0 -15
  58. lamindb/_transform.py +0 -170
  59. lamindb/_ulabel.py +0 -56
  60. lamindb/_utils.py +0 -9
  61. lamindb/base/validation.py +0 -63
  62. lamindb/core/_data.py +0 -491
  63. lamindb/core/fields.py +0 -12
  64. lamindb/models.py +0 -4475
  65. {lamindb-1.1.1.dist-info → lamindb-1.2.0.dist-info}/LICENSE +0 -0
  66. {lamindb-1.1.1.dist-info → lamindb-1.2.0.dist-info}/WHEEL +0 -0
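
The bulk of this release is a re-layout: the 4,475-line lamindb/models.py is split into a lamindb/models/ package, and most top-level private modules (_artifact.py, _collection.py, _record.py, ...) are deleted or moved into it. Below is a minimal sketch of what the renames imply for imports, assuming the new lamindb/models/__init__.py (+79 lines) re-exports the registry classes under their old names:

    # worked in 1.1.1, where lamindb.models was a single module; assuming the
    # new package __init__ re-exports the same names, it still works in 1.2.0:
    from lamindb.models import Artifact, Collection

    # new per-registry submodules in 1.2.0 (paths taken from the file list above);
    # importing from them directly is a hypothetical, not a documented API:
    from lamindb.models.collection import Collection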
lamindb/_collection.py DELETED
@@ -1,440 +0,0 @@
- from __future__ import annotations
-
- import warnings
- from collections import defaultdict
- from typing import (
-     TYPE_CHECKING,
-     Any,
-     Literal,
- )
-
- import anndata as ad
- import lamindb_setup as ln_setup
- import pandas as pd
- from lamin_utils import logger
- from lamindb_setup.core._docs import doc_args
- from lamindb_setup.core.hashing import hash_set
-
- from ._parents import view_lineage
- from ._record import _get_record_kwargs, init_self_from_db, update_attributes
- from ._utils import attach_func_to_class_method
- from .core._data import (
-     _track_run_input,
-     describe,
-     get_run,
-     save_schema_links,
-     save_staged_feature_sets,
- )
- from .core._mapped_collection import MappedCollection
- from .core.storage._pyarrow_dataset import _is_pyarrow_dataset, _open_pyarrow_dataset
- from .core.versioning import process_revises
- from .errors import FieldValidationError
- from .models import (
-     Artifact,
-     Collection,
-     CollectionArtifact,
-     Run,
-     Schema,
- )
-
- if TYPE_CHECKING:
-     from collections.abc import Iterable
-
-     from pyarrow.dataset import Dataset as PyArrowDataset
-
-     from ._query_set import QuerySet
-     from .core.storage import UPath
-
-
- class CollectionFeatureManager:
-     """Query features of artifact in collection."""
-
-     def __init__(self, collection: Collection):
-         self._collection = collection
-
-     def _get_staged_feature_sets_union(self) -> dict[str, Schema]:
-         links_schema_artifact = Artifact.feature_sets.through.objects.filter(
-             artifact_id__in=self._collection.artifacts.values_list("id", flat=True)
-         )
-         feature_sets_by_slots = defaultdict(list)
-         for link in links_schema_artifact:
-             feature_sets_by_slots[link.slot].append(link.schema_id)
-         feature_sets_union = {}
-         for slot, schema_ids_slot in feature_sets_by_slots.items():
-             schema_1 = Schema.get(id=schema_ids_slot[0])
-             related_name = schema_1._get_related_name()
-             features_registry = getattr(Schema, related_name).field.model
-             # this way of writing the __in statement turned out to be the fastest
-             # evaluated on a link table with 16M entries connecting 500 feature sets with
-             # 60k genes
-             feature_ids = (
-                 features_registry.schemas.through.objects.filter(
-                     schema_id__in=schema_ids_slot
-                 )
-                 .values(f"{features_registry.__name__.lower()}_id")
-                 .distinct()
-             )
-             features = features_registry.filter(id__in=feature_ids)
-             feature_sets_union[slot] = Schema(features, dtype=schema_1.dtype)
-         return feature_sets_union
-
-
- def __init__(
-     collection: Collection,
-     *args,
-     **kwargs,
- ):
-     collection.features = CollectionFeatureManager(collection)
-     if len(args) == len(collection._meta.concrete_fields):
-         super(Collection, collection).__init__(*args, **kwargs)
-         return None
-     # now we proceed with the user-facing constructor
-     if len(args) > 1:
-         raise ValueError("Only one non-keyword arg allowed: artifacts")
-     artifacts: Artifact | Iterable[Artifact] = (
-         kwargs.pop("artifacts") if len(args) == 0 else args[0]
-     )
-     meta_artifact: Artifact | None = kwargs.pop("meta_artifact", None)
-     tmp_key: str | None = kwargs.pop("key", None)
-     description: str | None = kwargs.pop("description", None)
-     reference: str | None = kwargs.pop("reference", None)
-     reference_type: str | None = kwargs.pop("reference_type", None)
-     run: Run | None = kwargs.pop("run", None)
-     revises: Collection | None = kwargs.pop("revises", None)
-     version: str | None = kwargs.pop("version", None)
-     _branch_code: int | None = kwargs.pop("_branch_code", 1)
-     key: str
-     if "name" in kwargs:
-         key = kwargs.pop("name")
-         warnings.warn(
-             f"argument `name` will be removed, please pass {key} to `key` instead",
-             FutureWarning,
-             stacklevel=2,
-         )
-     else:
-         key = tmp_key
-     if not len(kwargs) == 0:
-         valid_keywords = ", ".join([val[0] for val in _get_record_kwargs(Collection)])
-         raise FieldValidationError(
-             f"Only {valid_keywords} can be passed, you passed: {kwargs}"
-         )
-     if revises is None:
-         revises = (
-             Collection.filter(key=key, is_latest=True).order_by("-created_at").first()
-         )
-     provisional_uid, version, key, description, revises = process_revises(
-         revises, version, key, description, Collection
-     )
-     run = get_run(run)
-     if isinstance(artifacts, Artifact):
-         artifacts = [artifacts]
-     else:
-         if not hasattr(artifacts, "__getitem__"):
-             raise ValueError("Artifact or list[Artifact] is allowed.")
-         assert isinstance(artifacts[0], Artifact)  # type: ignore # noqa: S101
-     hash = from_artifacts(artifacts)  # type: ignore
-     if meta_artifact is not None:
-         if not isinstance(meta_artifact, Artifact):
-             raise ValueError("meta_artifact has to be an Artifact")
-         if isinstance(meta_artifact, Artifact):
-             if meta_artifact._state.adding:
-                 raise ValueError(
-                     "Save meta_artifact artifact before creating collection!"
-                 )
-     # we ignore collections in trash containing the same hash
-     if hash is not None:
-         existing_collection = Collection.filter(hash=hash).one_or_none()
-     else:
-         existing_collection = None
-     if existing_collection is not None:
-         logger.warning(
-             f"returning existing collection with same hash: {existing_collection}; if you intended to query to track this collection as an input, use: ln.Collection.get()"
-         )
-         # update the run of the existing collection
-         if run is not None:
-             # save the information that this collection was previously produced
-             # by another run
-             # note: same logic exists for _output_artifacts_with_later_updates
-             if existing_collection.run is not None and existing_collection.run != run:
-                 existing_collection.run._output_collections_with_later_updates.add(
-                     existing_collection
-                 )
-             # update the run of the collection with the latest run
-             existing_collection.run = run
-         init_self_from_db(collection, existing_collection)
-         update_attributes(collection, {"description": description, "key": key})
-     else:
-         _skip_validation = revises is not None and key == revises.key
-         super(Collection, collection).__init__(  # type: ignore
-             uid=provisional_uid,
-             key=key,
-             description=description,
-             reference=reference,
-             reference_type=reference_type,
-             meta_artifact=meta_artifact,
-             hash=hash,
-             run=run,
-             version=version,
-             _branch_code=_branch_code,
-             revises=revises,
-             _skip_validation=_skip_validation,
-         )
-     collection._artifacts = artifacts
-     # register provenance
-     if revises is not None:
-         _track_run_input(revises, run=run)
-     _track_run_input(artifacts, run=run)
-
-
- # docstring handled through attach_func_to_class_method
- def append(self, artifact: Artifact, run: Run | None = None) -> Collection:
-     return Collection(  # type: ignore
-         self.artifacts.all().list() + [artifact],
-         # key is automatically taken from revises.key
-         description=self.description,
-         revises=self,
-         run=run,
-     )
-
-
- # internal function, not exposed to user
- def from_artifacts(artifacts: Iterable[Artifact]) -> tuple[str, dict[str, str]]:
-     # assert all artifacts are already saved
-     saved = not any(artifact._state.adding for artifact in artifacts)
-     if not saved:
-         raise ValueError("Not all artifacts are yet saved, please save them")
-     # validate consistency of hashes - we do not allow duplicate hashes
-     hashes = [artifact.hash for artifact in artifacts if artifact.hash is not None]
-     hashes_set = set(hashes)
-     if len(hashes) != len(hashes_set):
-         seen = set()
-         non_unique = [x for x in hashes if x in seen or seen.add(x)]  # type: ignore
-         raise ValueError(
-             "Please pass artifacts with distinct hashes: these ones are non-unique"
-             f" {non_unique}"
-         )
-     hash = hash_set(hashes_set)
-     return hash
-
-
- # docstring handled through attach_func_to_class_method
- def open(self, is_run_input: bool | None = None) -> PyArrowDataset:
-     if self._state.adding:
-         artifacts = self._artifacts
-         logger.warning("the collection isn't saved, consider calling `.save()`")
-     else:
-         artifacts = self.ordered_artifacts.all()
-     paths = [artifact.path for artifact in artifacts]
-     # this checks that the filesystem is the same for all paths
-     # this is a requirement of pyarrow.dataset.dataset
-     fs = paths[0].fs
-     for path in paths[1:]:
-         # this assumes that the filesystems are cached by fsspec
-         if path.fs is not fs:
-             raise ValueError(
-                 "The collection has artifacts with different filesystems, this is not supported."
-             )
-     if not _is_pyarrow_dataset(paths):
-         suffixes = {path.suffix for path in paths}
-         suffixes_str = ", ".join(suffixes)
-         err_msg = "This collection is not compatible with pyarrow.dataset.dataset(), "
-         err_msg += (
-             f"the artifacts have incompatible file types: {suffixes_str}"
-             if len(suffixes) > 1
-             else f"the file type {suffixes_str} is not supported by pyarrow."
-         )
-         raise ValueError(err_msg)
-     dataset = _open_pyarrow_dataset(paths)
-     # track only if successful
-     _track_run_input(self, is_run_input)
-     return dataset
-
-
- # docstring handled through attach_func_to_class_method
- def mapped(
-     self,
-     layers_keys: str | list[str] | None = None,
-     obs_keys: str | list[str] | None = None,
-     obsm_keys: str | list[str] | None = None,
-     obs_filter: dict[str, str | list[str]] | None = None,
-     join: Literal["inner", "outer"] | None = "inner",
-     encode_labels: bool | list[str] = True,
-     unknown_label: str | dict[str, str] | None = None,
-     cache_categories: bool = True,
-     parallel: bool = False,
-     dtype: str | None = None,
-     stream: bool = False,
-     is_run_input: bool | None = None,
- ) -> MappedCollection:
-     path_list = []
-     if self._state.adding:
-         artifacts = self._artifacts
-         logger.warning("the collection isn't saved, consider calling `.save()`")
-     else:
-         artifacts = self.ordered_artifacts.all()
-     for artifact in artifacts:
-         if ".h5ad" not in artifact.suffix and ".zarr" not in artifact.suffix:
-             logger.warning(f"ignoring artifact with suffix {artifact.suffix}")
-             continue
-         elif not stream:
-             path_list.append(artifact.cache())
-         else:
-             path_list.append(artifact.path)
-     ds = MappedCollection(
-         path_list,
-         layers_keys,
-         obs_keys,
-         obsm_keys,
-         obs_filter,
-         join,
-         encode_labels,
-         unknown_label,
-         cache_categories,
-         parallel,
-         dtype,
-     )
-     # track only if successful
-     _track_run_input(self, is_run_input)
-     return ds
-
-
- # docstring handled through attach_func_to_class_method
- def cache(self, is_run_input: bool | None = None) -> list[UPath]:
-     path_list = []
-     for artifact in self.ordered_artifacts.all():
-         path_list.append(artifact.cache())
-     _track_run_input(self, is_run_input)
-     return path_list
-
-
- # docstring handled through attach_func_to_class_method
- def load(
-     self,
-     join: Literal["inner", "outer"] = "outer",
-     is_run_input: bool | None = None,
-     **kwargs,
- ) -> Any:
-     # cannot call _track_run_input here, see comment further down
-     all_artifacts = self.ordered_artifacts.all()
-     suffixes = [artifact.suffix for artifact in all_artifacts]
-     if len(set(suffixes)) != 1:
-         raise RuntimeError(
-             "Can only load collections where all artifacts have the same suffix"
-         )
-     # because we're tracking data flow on the collection-level, here, we don't
-     # want to track it on the artifact-level
-     objects = [artifact.load(is_run_input=False) for artifact in all_artifacts]
-     artifact_uids = [artifact.uid for artifact in all_artifacts]
-     if isinstance(objects[0], pd.DataFrame):
-         concat_object = pd.concat(objects, join=join)
-     elif isinstance(objects[0], ad.AnnData):
-         concat_object = ad.concat(
-             objects, join=join, label="artifact_uid", keys=artifact_uids
-         )
-     # only call it here because there might be errors during concat
-     _track_run_input(self, is_run_input)
-     return concat_object
-
-
- # docstring handled through attach_func_to_class_method
- def delete(self, permanent: bool | None = None) -> None:
-     # change _branch_code to trash
-     trash__branch_code = -1
-     if self._branch_code > trash__branch_code and permanent is not True:
-         self._branch_code = trash__branch_code
-         self.save()
-         logger.warning(
-             f"moved collection to trash (_branch_code = {trash__branch_code})"
-         )
-         return
-
-     # permanent delete
-     if permanent is None:
-         response = input(
-             "Collection record is already in trash! Are you sure to delete it from your"
-             " database? (y/n) You can't undo this action."
-         )
-         delete_record = response == "y"
-     else:
-         delete_record = permanent
-
-     if delete_record:
-         super(Collection, self).delete()
-
-
- # docstring handled through attach_func_to_class_method
- def save(self, using: str | None = None) -> Collection:
-     if self.meta_artifact is not None:
-         self.meta_artifact.save()
-     # we don't need to save feature sets again
-     save_staged_feature_sets(self)
-     super(Collection, self).save()
-     # we don't allow updating the collection of artifacts
-     # if users want to update the set of artifacts, they
-     # have to create a new collection
-     if hasattr(self, "_artifacts"):
-         links = [
-             CollectionArtifact(collection_id=self.id, artifact_id=artifact.id)  # type: ignore
-             for artifact in self._artifacts
-         ]
-         # the below seems to preserve the order of the list in the
-         # auto-incrementing integer primary
-         # merely using .artifacts.set(*...) doesn't achieve this
-         # we need ignore_conflicts=True so that this won't error if links already exist
-         CollectionArtifact.objects.bulk_create(links, ignore_conflicts=True)
-     save_schema_links(self)
-     if using is not None:
-         logger.warning("using argument is ignored")
-     return self
-
-
- # docstring handled through attach_func_to_class_method
- def restore(self) -> None:
-     self._branch_code = 1
-     self.save()
-
-
- @property  # type: ignore
- @doc_args(Collection.ordered_artifacts.__doc__)
- def ordered_artifacts(self) -> QuerySet:
-     """{}"""  # noqa: D415
-     # tracking is done via QueryManager (_query_manager.py)
-     return self.artifacts.order_by("links_collection__id")
-
-
- @property  # type: ignore
- @doc_args(Collection.data_artifact.__doc__)
- def data_artifact(self) -> Artifact | None:
-     """{}"""  # noqa: D415
-     return self.artifacts.first()
-
-
- METHOD_NAMES = [
-     "__init__",
-     "append",
-     "open",
-     "mapped",
-     "cache",
-     "load",
-     "delete",
-     "save",
-     "restore",
- ]
-
- if ln_setup._TESTING:
-     from inspect import signature
-
-     SIGS = {
-         name: signature(getattr(Collection, name))
-         for name in METHOD_NAMES
-         if name != "__init__"
-     }
-
- for name in METHOD_NAMES:
-     attach_func_to_class_method(name, Collection, globals())
-
- # mypy: ignore-errors
- Collection.ordered_artifacts = ordered_artifacts
- Collection.data_artifact = data_artifact
- Collection.describe = describe
- Collection.view_lineage = view_lineage
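
For orientation, a minimal usage sketch of the Collection API that this deleted module implemented, with call signatures taken from the functions above; the artifact keys are hypothetical and 1.2.0 behavior may differ now that this logic lives in lamindb/models/collection.py:

    import lamindb as ln

    # two previously saved artifacts (hypothetical keys)
    a1 = ln.Artifact.get(key="batch1.h5ad")
    a2 = ln.Artifact.get(key="batch2.h5ad")

    # __init__ hashes the set of artifact hashes and returns an existing
    # collection if one with the same hash already exists
    collection = ln.Collection([a1, a2], key="my-dataset").save()

    # append() returns a new collection version that revises the current one
    collection_v2 = collection.append(ln.Artifact.get(key="batch3.h5ad")).save()

    # load() concatenates homogeneous artifacts via pd.concat / ad.concat
    adata = collection_v2.load(join="outer")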