lamindb 0.77.2__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. lamindb/__init__.py +39 -32
  2. lamindb/_artifact.py +95 -64
  3. lamindb/_can_curate.py +19 -10
  4. lamindb/_collection.py +51 -49
  5. lamindb/_feature.py +9 -9
  6. lamindb/_finish.py +99 -86
  7. lamindb/_from_values.py +20 -17
  8. lamindb/_is_versioned.py +2 -1
  9. lamindb/_parents.py +23 -16
  10. lamindb/_query_manager.py +3 -3
  11. lamindb/_query_set.py +85 -18
  12. lamindb/_record.py +121 -46
  13. lamindb/_run.py +3 -3
  14. lamindb/_save.py +14 -8
  15. lamindb/{_feature_set.py → _schema.py} +34 -31
  16. lamindb/_storage.py +2 -1
  17. lamindb/_transform.py +51 -23
  18. lamindb/_ulabel.py +17 -8
  19. lamindb/_view.py +15 -14
  20. lamindb/base/__init__.py +24 -0
  21. lamindb/base/fields.py +281 -0
  22. lamindb/base/ids.py +103 -0
  23. lamindb/base/types.py +51 -0
  24. lamindb/base/users.py +30 -0
  25. lamindb/base/validation.py +67 -0
  26. lamindb/core/__init__.py +19 -14
  27. lamindb/core/_context.py +297 -228
  28. lamindb/core/_data.py +44 -49
  29. lamindb/core/_describe.py +41 -31
  30. lamindb/core/_django.py +59 -44
  31. lamindb/core/_feature_manager.py +192 -168
  32. lamindb/core/_label_manager.py +22 -22
  33. lamindb/core/_mapped_collection.py +17 -14
  34. lamindb/core/_settings.py +1 -12
  35. lamindb/core/_sync_git.py +56 -9
  36. lamindb/core/_track_environment.py +1 -1
  37. lamindb/core/datasets/_core.py +5 -6
  38. lamindb/core/exceptions.py +0 -7
  39. lamindb/core/fields.py +1 -1
  40. lamindb/core/loaders.py +18 -2
  41. lamindb/core/{schema.py → relations.py} +22 -19
  42. lamindb/core/storage/_anndata_accessor.py +1 -2
  43. lamindb/core/storage/_backed_access.py +2 -1
  44. lamindb/core/storage/_tiledbsoma.py +40 -13
  45. lamindb/core/storage/objects.py +1 -1
  46. lamindb/core/storage/paths.py +13 -8
  47. lamindb/core/subsettings/__init__.py +0 -2
  48. lamindb/core/types.py +2 -23
  49. lamindb/core/versioning.py +11 -7
  50. lamindb/{_curate.py → curators/__init__.py} +700 -57
  51. lamindb/curators/_spatial.py +528 -0
  52. lamindb/integrations/_vitessce.py +1 -3
  53. lamindb/migrations/0052_squashed.py +1261 -0
  54. lamindb/migrations/0053_alter_featureset_hash_alter_paramvalue_created_by_and_more.py +57 -0
  55. lamindb/migrations/0054_alter_feature_previous_runs_and_more.py +35 -0
  56. lamindb/migrations/0055_artifact_type_artifactparamvalue_and_more.py +61 -0
  57. lamindb/migrations/0056_rename_ulabel_ref_is_name_artifactulabel_label_ref_is_name_and_more.py +22 -0
  58. lamindb/migrations/0057_link_models_latest_report_and_others.py +356 -0
  59. lamindb/migrations/0058_artifact__actions_collection__actions.py +22 -0
  60. lamindb/migrations/0059_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +31 -0
  61. lamindb/migrations/0060_alter_artifact__actions.py +22 -0
  62. lamindb/migrations/0061_alter_collection_meta_artifact_alter_run_environment_and_more.py +45 -0
  63. lamindb/migrations/0062_add_is_latest_field.py +32 -0
  64. lamindb/migrations/0063_populate_latest_field.py +45 -0
  65. lamindb/migrations/0064_alter_artifact_version_alter_collection_version_and_more.py +33 -0
  66. lamindb/migrations/0065_remove_collection_feature_sets_and_more.py +22 -0
  67. lamindb/migrations/0066_alter_artifact__feature_values_and_more.py +352 -0
  68. lamindb/migrations/0067_alter_featurevalue_unique_together_and_more.py +20 -0
  69. lamindb/migrations/0068_alter_artifactulabel_unique_together_and_more.py +20 -0
  70. lamindb/migrations/0069_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +1294 -0
  71. lamindb/migrations/0069_squashed.py +1770 -0
  72. lamindb/migrations/0070_lamindbv1_migrate_data.py +78 -0
  73. lamindb/migrations/0071_lamindbv1_migrate_schema.py +741 -0
  74. lamindb/migrations/0072_remove_user__branch_code_remove_user_aux_and_more.py +148 -0
  75. lamindb/migrations/0073_merge_ourprojects.py +945 -0
  76. lamindb/migrations/0074_lamindbv1_part4.py +374 -0
  77. lamindb/migrations/0075_lamindbv1_part5.py +276 -0
  78. lamindb/migrations/0076_lamindbv1_part6.py +621 -0
  79. lamindb/migrations/0077_lamindbv1_part6b.py +228 -0
  80. lamindb/migrations/0078_lamindbv1_part6c.py +468 -0
  81. lamindb/migrations/0079_alter_rundata_value_json_and_more.py +36 -0
  82. lamindb/migrations/__init__.py +0 -0
  83. lamindb/models.py +4064 -0
  84. {lamindb-0.77.2.dist-info → lamindb-1.0.0.dist-info}/METADATA +15 -20
  85. lamindb-1.0.0.dist-info/RECORD +100 -0
  86. {lamindb-0.77.2.dist-info → lamindb-1.0.0.dist-info}/WHEEL +1 -1
  87. lamindb/core/subsettings/_transform_settings.py +0 -21
  88. lamindb-0.77.2.dist-info/RECORD +0 -63
  89. {lamindb-0.77.2.dist-info → lamindb-1.0.0.dist-info}/LICENSE +0 -0
lamindb/models.py ADDED
@@ -0,0 +1,4064 @@
1
+ from __future__ import annotations
2
+
3
+ import sys
4
+ from collections import defaultdict
5
+ from datetime import date, datetime # noqa: TC003
6
+ from itertools import chain
7
+ from typing import (
8
+ TYPE_CHECKING,
9
+ Any,
10
+ Literal,
11
+ NamedTuple,
12
+ overload,
13
+ )
14
+
15
+ from django.core.validators import RegexValidator
16
+ from django.db import IntegrityError, models
17
+ from django.db.models import CASCADE, PROTECT, Field, Q
18
+ from django.db.models.base import ModelBase
19
+ from django.db.models.fields.related import (
20
+ ManyToManyField,
21
+ ManyToManyRel,
22
+ ManyToOneRel,
23
+ )
24
+ from lamin_utils import colors
25
+ from lamindb_setup import _check_instance_setup
26
+ from lamindb_setup.core.hashing import HASH_LENGTH, hash_dict
27
+
28
+ from lamindb.base import deprecated, doc_args
29
+ from lamindb.base.fields import (
30
+ BigIntegerField,
31
+ BooleanField,
32
+ CharField,
33
+ DateField,
34
+ DateTimeField,
35
+ EmailField,
36
+ ForeignKey,
37
+ IntegerField,
38
+ JSONField,
39
+ OneToOneField,
40
+ TextField,
41
+ URLField,
42
+ )
43
+
44
+ from .base.ids import base62_8, base62_12, base62_20
45
+ from .base.types import (
46
+ ArtifactKind,
47
+ FeatureDtype,
48
+ FieldAttr,
49
+ ListLike,
50
+ StrField,
51
+ TransformType,
52
+ )
53
+ from .base.users import current_user_id
54
+
55
+ if TYPE_CHECKING:
56
+ from collections.abc import Iterable
57
+ from pathlib import Path
58
+
59
+ import numpy as np
60
+ import pandas as pd
61
+ from anndata import AnnData
62
+ from lamin_utils._inspect import InspectResult
63
+ from lamindb_setup.core.types import UPathStr
64
+ from mudata import MuData
65
+ from pyarrow.dataset import Dataset as PyArrowDataset
66
+ from tiledbsoma import Collection as SOMACollection
67
+ from tiledbsoma import Experiment as SOMAExperiment
68
+ from upath import UPath
69
+
70
+ from lamindb.core import LabelManager, MappedCollection, QuerySet, RecordList
71
+ from lamindb.core.storage import AnnDataAccessor, BackedAccessor
72
+
73
+
74
# Module-level flag used by `current_run()`: it is assigned the result of
# `_check_instance_setup()` and is re-evaluated on every call while still falsy.
_TRACKING_READY: bool | None = None
75
+
76
+
77
class IsVersioned(models.Model):
    """Base class for versioned models.

    Records of one version family share the same `stem_uid`, i.e., the first
    `_len_stem_uid` characters of their `uid`.
    """

    class Meta:
        abstract = True

    # number of leading `uid` characters that form the stem uid; set by subclasses
    _len_stem_uid: int

    version: str | None = CharField(max_length=30, null=True, db_index=True)
    """Version (default `None`).

    Defines version of a family of records characterized by the same `stem_uid`.

    Consider using `semantic versioning <https://semver.org>`__
    with `Python versioning <https://peps.python.org/pep-0440/>`__.
    """
    is_latest: bool = BooleanField(default=True, db_index=True)
    """Boolean flag that indicates whether a record is the latest in its version family."""

    @overload
    def __init__(self): ...

    @overload
    def __init__(
        self,
        *db_args,
    ): ...

    def __init__(
        self,
        *args,
        **kwargs,
    ):
        """Store the optional `revises` keyword and forward the rest to Django.

        Args:
            *args: Positional arguments passed on to `models.Model.__init__`.
            **kwargs: Keyword arguments passed on to `models.Model.__init__`;
                `revises` (a record of the version family) is removed first
                and kept on the instance as `_revises`.
        """
        # `revises` is popped so that it never reaches `models.Model.__init__`
        self._revises = kwargs.pop("revises", None)
        super().__init__(*args, **kwargs)

    @property
    def stem_uid(self) -> str:
        """Universal id characterizing the version family.

        The full uid of a record is obtained via concatenating the stem uid and version information::

            stem_uid = random_base62(n_char)  # a random base62 sequence of length 12 (transform) or 16 (artifact, collection)
            version_uid = "0000"  # an auto-incrementing 4-digit base62 number
            uid = f"{stem_uid}{version_uid}"  # concatenate the stem_uid & version_uid

        """
        return self.uid[: self._len_stem_uid]  # type: ignore

    @property
    def versions(self) -> QuerySet:
        """Lists all records of the same version family.

        This is a property, access it without parentheses:

        >>> new_artifact = ln.Artifact(df2, revises=artifact).save()
        >>> new_artifact.versions.df()
        """
        db = self._state.db
        # records loaded from a non-default database are queried on that same database
        if db is not None and db != "default":
            return self.__class__.using(db).filter(uid__startswith=self.stem_uid)  # type: ignore
        else:
            return self.__class__.filter(uid__startswith=self.stem_uid)  # type: ignore

    def _add_to_version_family(self, revises: IsVersioned, version: str | None = None):
        """Add current record to a version family.

        This is a stub without implementation in this class body.

        Args:
            revises: a record that belongs to the version family.
            version: semantic version of the record.
        """
        pass
147
+
148
+
149
def current_run() -> Run | None:
    """Return `lamindb.context.run`, or `None` while the instance setup check fails.

    Serves as the default callable of the `run` foreign key in `TracksRun`.
    """
    global _TRACKING_READY

    # keep a truthy flag as is; otherwise ask lamindb_setup again
    _TRACKING_READY = _TRACKING_READY or _check_instance_setup()
    if not _TRACKING_READY:
        return None

    # imported lazily, only once the setup check has passed
    import lamindb.core

    return lamindb.context.run
160
+
161
+
162
class TracksRun(models.Model):
    """Base class tracking latest run, creating user, and `created_at` timestamp."""

    class Meta:
        abstract = True

    # the timestamp default is declared on the database side (`db_default=Now()`)
    created_at: datetime = DateTimeField(
        editable=False, db_default=models.functions.Now(), db_index=True
    )
    """Time of creation of record."""
    # `current_user_id` is passed as a callable default
    # NOTE(review): `ForeignKey` is lamindb's wrapper from `lamindb.base.fields`;
    # presumably it mirrors Django's `ForeignKey` semantics -- confirm
    created_by: User = ForeignKey(
        "lamindb.User",
        PROTECT,
        editable=False,
        default=current_user_id,
        related_name="+",
    )
    """Creator of record."""
    # `current_run` is passed as a callable default; it returns `None` while the
    # instance setup check fails, hence `null=True`
    run: Run | None = ForeignKey(
        "lamindb.Run", PROTECT, null=True, default=current_run, related_name="+"
    )
    """Last run that created or updated the record."""

    # the overloads only document the two supported call shapes for type checkers
    @overload
    def __init__(self): ...

    @overload
    def __init__(
        self,
        *db_args,
    ): ...

    def __init__(
        self,
        *args,
        **kwargs,
    ):
        # plain pass-through to the parent initializer
        super().__init__(*args, **kwargs)
200
+
201
+
202
class TracksUpdates(models.Model):
    """Base class tracking previous runs and `updated_at` timestamp."""

    # NOTE(review): only `updated_at` is declared in this class body; the
    # "previous runs" mentioned in the docstring are presumably declared on
    # the concrete registries -- confirm

    class Meta:
        abstract = True

    # the timestamp default is declared on the database side (`db_default=Now()`)
    # NOTE(review): a `db_default` alone applies on insert; refreshing the value
    # on later saves is presumably handled elsewhere -- confirm
    updated_at: datetime = DateTimeField(
        editable=False, db_default=models.functions.Now(), db_index=True
    )
    """Time of last update to record."""

    # the overloads only document the two supported call shapes for type checkers
    @overload
    def __init__(self): ...

    @overload
    def __init__(
        self,
        *db_args,
    ): ...

    def __init__(
        self,
        *args,
        **kwargs,
    ):
        # plain pass-through to the parent initializer
        super().__init__(*args, **kwargs)
228
+
229
+
230
class CanCurate:
    """Base class providing :class:`~lamindb.core.Record`-based validation.

    All methods in this class body are documented stubs without implementation.
    """

    @classmethod
    def inspect(
        cls,
        values: ListLike,
        field: str | StrField | None = None,
        *,
        mute: bool = False,
        organism: str | Record | None = None,
        source: Record | None = None,
    ) -> InspectResult:
        """Inspect if values are mappable to a field.

        Being mappable means that an exact match exists.

        Args:
            values: Values that will be checked against the field.
            field: The field of values. Examples are `'ontology_id'` to map
                against the source ID or `'name'` to map against the ontologies
                field names.
            mute: Whether to mute logging.
            organism: An Organism name or record.
            source: A `bionty.Source` record that specifies the version to inspect against.

        See Also:
            :meth:`~lamindb.core.CanCurate.validate`

        Examples:
            >>> import bionty as bt
            >>> bt.settings.organism = "human"
            >>> ln.save(bt.Gene.from_values(["A1CF", "A1BG", "BRCA2"], field="symbol"))
            >>> gene_symbols = ["A1CF", "A1BG", "FANCD1", "FANCD20"]
            >>> result = bt.Gene.inspect(gene_symbols, field=bt.Gene.symbol)
            >>> result.validated
            ['A1CF', 'A1BG']
            >>> result.non_validated
            ['FANCD1', 'FANCD20']
        """
        pass

    @classmethod
    def validate(
        cls,
        values: ListLike,
        field: str | StrField | None = None,
        *,
        mute: bool = False,
        organism: str | Record | None = None,
        source: Record | None = None,
    ) -> np.ndarray:
        """Validate values against existing values of a string field.

        Note this is strict validation, only asserts exact matches.

        Args:
            values: Values that will be validated against the field.
            field: The field of values.
                Examples are `'ontology_id'` to map against the source ID
                or `'name'` to map against the ontologies field names.
            mute: Whether to mute logging.
            organism: An Organism name or record.
            source: A `bionty.Source` record that specifies the version to validate against.

        Returns:
            A vector of booleans indicating if an element is validated.

        See Also:
            :meth:`~lamindb.core.CanCurate.inspect`

        Examples:
            >>> import bionty as bt
            >>> bt.settings.organism = "human"
            >>> ln.save(bt.Gene.from_values(["A1CF", "A1BG", "BRCA2"], field="symbol"))
            >>> gene_symbols = ["A1CF", "A1BG", "FANCD1", "FANCD20"]
            >>> bt.Gene.validate(gene_symbols, field=bt.Gene.symbol)
            array([ True, True, False, False])
        """
        pass

    # takes `cls` and is called on the registry class in the examples below,
    # hence a classmethod like `inspect`, `validate` and `standardize`
    @classmethod
    def from_values(
        cls,
        values: ListLike,
        field: StrField | None = None,
        create: bool = False,
        organism: Record | str | None = None,
        source: Record | None = None,
        mute: bool = False,
    ) -> RecordList:
        """Bulk create validated records by parsing values for an identifier (such as a name or an id).

        Args:
            values: A list of values for an identifier, e.g.
                `["name1", "name2"]`.
            field: A `Record` field to look up, e.g., `bt.CellMarker.name`.
            create: Whether to create records if they don't exist.
            organism: A `bionty.Organism` name or record.
            source: A `bionty.Source` record to validate against to create records for.
            mute: Whether to mute logging.

        Returns:
            A list of validated records. For bionty registries, also returns knowledge-coupled records.

        Notes:
            For more info, see tutorial: :doc:`docs:bio-registries`.

        Examples:

            Bulk create from non-validated values will log warnings & returns empty list:

            >>> ulabels = ln.ULabel.from_values(["benchmark", "prediction", "test"], field="name")
            >>> assert len(ulabels) == 0

            Bulk create records from validated values returns the corresponding existing records:

            >>> ln.save([ln.ULabel(name=name) for name in ["benchmark", "prediction", "test"]])
            >>> ulabels = ln.ULabel.from_values(["benchmark", "prediction", "test"], field="name")
            >>> assert len(ulabels) == 3

            Bulk create records from public reference:

            >>> import bionty as bt
            >>> records = bt.CellType.from_values(["T cell", "B cell"], field="name")
            >>> records
        """
        pass

    @classmethod
    def standardize(
        cls,
        values: Iterable,
        field: str | StrField | None = None,
        *,
        return_field: str | StrField | None = None,
        return_mapper: bool = False,
        case_sensitive: bool = False,
        mute: bool = False,
        public_aware: bool = True,
        keep: Literal["first", "last", False] = "first",
        synonyms_field: str = "synonyms",
        organism: str | Record | None = None,
        source: Record | None = None,
    ) -> list[str] | dict[str, str]:
        """Maps input synonyms to standardized names.

        Args:
            values: Identifiers that will be standardized.
            field: The field representing the standardized names.
            return_field: The field to return. Defaults to field.
            return_mapper: If `True`, returns `{input_value: standardized_name}`.
            case_sensitive: Whether the mapping is case sensitive.
            mute: Whether to mute logging.
            public_aware: Whether to standardize from Bionty reference. Defaults to `True` for Bionty registries.
            keep: When a synonym maps to multiple names, determines which duplicates to mark as `pd.DataFrame.duplicated`:
                - `"first"`: returns the first mapped standardized name
                - `"last"`: returns the last mapped standardized name
                - `False`: returns all mapped standardized name.

                When `keep` is `False`, the returned list of standardized names will contain nested lists in case of duplicates.

                When a field is converted into return_field, keep marks which matches to keep when multiple return_field values map to the same field value.
            synonyms_field: A field containing the concatenated synonyms.
            organism: An Organism name or record.
            source: A `bionty.Source` record that specifies the version to validate against.

        Returns:
            If `return_mapper` is `False`: a list of standardized names. Otherwise,
            a dictionary of mapped values with mappable synonyms as keys and
            standardized names as values.

        See Also:
            :meth:`~lamindb.core.CanCurate.add_synonym`
                Add synonyms.
            :meth:`~lamindb.core.CanCurate.remove_synonym`
                Remove synonyms.

        Examples:
            >>> import bionty as bt
            >>> bt.settings.organism = "human"
            >>> ln.save(bt.Gene.from_values(["A1CF", "A1BG", "BRCA2"], field="symbol"))
            >>> gene_synonyms = ["A1CF", "A1BG", "FANCD1", "FANCD20"]
            >>> standardized_names = bt.Gene.standardize(gene_synonyms)
            >>> standardized_names
            ['A1CF', 'A1BG', 'BRCA2', 'FANCD20']
        """
        pass

    def add_synonym(
        self,
        synonym: str | ListLike,
        force: bool = False,
        save: bool | None = None,
    ):
        """Add synonyms to a record.

        Args:
            synonym: The synonyms to add to the record.
            force: Whether to add synonyms even if they are already synonyms of other records.
            save: Whether to save the record to the database.

        See Also:
            :meth:`~lamindb.core.CanCurate.remove_synonym`
                Remove synonyms.

        Examples:
            >>> import bionty as bt
            >>> bt.CellType.from_source(name="T cell").save()
            >>> lookup = bt.CellType.lookup()
            >>> record = lookup.t_cell
            >>> record.synonyms
            'T-cell|T lymphocyte|T-lymphocyte'
            >>> record.add_synonym("T cells")
            >>> record.synonyms
            'T cells|T-cell|T-lymphocyte|T lymphocyte'
        """
        pass

    def remove_synonym(self, synonym: str | ListLike):
        """Remove synonyms from a record.

        Args:
            synonym: The synonym values to remove.

        See Also:
            :meth:`~lamindb.core.CanCurate.add_synonym`
                Add synonyms.

        Examples:
            >>> import bionty as bt
            >>> bt.CellType.from_source(name="T cell").save()
            >>> lookup = bt.CellType.lookup()
            >>> record = lookup.t_cell
            >>> record.synonyms
            'T-cell|T lymphocyte|T-lymphocyte'
            >>> record.remove_synonym("T-cell")
            >>> record.synonyms
            'T lymphocyte|T-lymphocyte'
        """
        pass

    def set_abbr(self, value: str):
        """Set value for abbr field and add to synonyms.

        Args:
            value: A value for an abbreviation.

        See Also:
            :meth:`~lamindb.core.CanCurate.add_synonym`

        Examples:
            >>> import bionty as bt
            >>> bt.ExperimentalFactor.from_source(name="single-cell RNA sequencing").save()
            >>> scrna = bt.ExperimentalFactor.get(name="single-cell RNA sequencing")
            >>> scrna.abbr
            None
            >>> scrna.synonyms
            'single-cell RNA-seq|single-cell transcriptome sequencing|scRNA-seq|single cell RNA sequencing'
            >>> scrna.set_abbr("scRNA")
            >>> scrna.abbr
            'scRNA'
            >>> scrna.synonyms
            'scRNA|single-cell RNA-seq|single cell RNA sequencing|single-cell transcriptome sequencing|scRNA-seq'
            >>> scrna.save()
        """
        pass
495
+
496
+
497
class HasParents:
    """Mixin for hierarchical registries (ontologies).

    The methods in this class body are documented stubs without implementation.
    """

    def view_parents(
        self,
        field: StrField | None = None,
        with_children: bool = False,
        distance: int = 5,
    ):
        """View parents in an ontology.

        Args:
            field: Field to display on graph.
            with_children: Whether to also show children.
            distance: Maximum distance still shown.

        Ontological hierarchies: :class:`~lamindb.ULabel` (project & sub-project), :class:`~bionty.CellType` (cell type & subtype).

        Examples:
            >>> import bionty as bt
            >>> bt.Tissue.from_source(name="subsegmental bronchus").save()
            >>> record = bt.Tissue.get(name="respiratory tube")
            >>> record.view_parents()
            >>> record.view_parents(with_children=True)
        """

    def query_parents(self) -> QuerySet:
        """Query parents in an ontology."""

    def query_children(self) -> QuerySet:
        """Query children in an ontology."""
531
+
532
+
533
class ValidateFields:
    """Marker class without any members of its own."""
535
+
536
+
537
# Example snippet that `doc_args` inserts into the `{}` placeholder of the
# `Registry` and `Record` class docstrings.
# NOTE(review): the code lines are indented so that they form a reST literal
# block below `Example::` -- confirm the exact indentation against the original file
RECORD_REGISTRY_EXAMPLE = """Example::

        from lamindb import Record, fields

        # sub-classing `Record` creates a new registry
        class Experiment(Record):
            name: str = fields.CharField()

        # instantiating `Experiment` creates a record `experiment`
        experiment = Experiment(name="my experiment")

        # you can save the record to the database
        experiment.save()

        # `Experiment` refers to the registry, which you can query
        df = Experiment.filter(name__startswith="my ").df()
"""
554
+
555
+
556
# this is the metaclass for Record
@doc_args(RECORD_REGISTRY_EXAMPLE)
class Registry(ModelBase):
    """Metaclass for :class:`~lamindb.core.Record`.

    Each `Registry` *object* is a `Record` *class* and corresponds to a table in the metadata SQL database.

    You work with `Registry` objects whenever you use *class methods* of `Record`.

    You call any subclass of `Record` a "registry" and their objects "records". A `Record` object corresponds to a row in the SQL table.

    If you want to create a new registry, you sub-class `Record`.

    {}

    Note: `Registry` inherits from Django's `ModelBase`.
    """

    def __new__(cls, name, bases, attrs, **kwargs):
        # plain pass-through to `ModelBase.__new__`
        new_class = super().__new__(cls, name, bases, attrs, **kwargs)
        return new_class

    # below creates a sensible auto-complete behavior that differs across the
    # class and instance level in Jupyter Editors it doesn't have any effect for
    # static type analyzer like pylance used in VSCode
    def __dir__(cls):
        """List attribute names for class-level auto-complete.

        Dunder names are always dropped. Unless `sphinx` is imported, callables
        are only kept if they are `classmethod`, `staticmethod` or `type`
        objects, which hides instance methods at the class level.

        Returns:
            Attribute names in first-seen order: first the accepted names from
            the `__dict__` of every class in the MRO, then the remaining
            non-dunder names of `dir(Registry)`.
        """
        # this is needed to bring auto-complete on the class-level back
        # https://laminlabs.slack.com/archives/C04FPE8V01W/p1717535625268849
        # Filter class attributes, excluding instance methods
        exclude_instance_methods = "sphinx" not in sys.modules
        # https://laminlabs.slack.com/archives/C04FPE8V01W/p1721134595920959

        def include_attribute(attr_name, attr_value):
            if attr_name.startswith("__"):
                return False
            if exclude_instance_methods and callable(attr_value):
                return isinstance(attr_value, (classmethod, staticmethod, type))
            return True

        # check also inherited attributes
        if hasattr(cls, "mro"):
            attrs = chain.from_iterable(c.__dict__.items() for c in cls.mro())
        else:
            attrs = cls.__dict__.items()

        result = []
        # `seen` mirrors `result` for O(1) membership tests; only accepted
        # names are recorded, so a name rejected on a subclass can still be
        # accepted from a base class, exactly as with a list lookup
        seen = set()
        for attr_name, attr_value in attrs:
            if attr_name not in seen and include_attribute(attr_name, attr_value):
                seen.add(attr_name)
                result.append(attr_name)

        # Add non-dunder attributes from Registry
        for attr in dir(Registry):
            if not attr.startswith("__") and attr not in seen:
                seen.add(attr)
                result.append(attr)
        return result

    def __repr__(cls) -> str:
        return registry_repr(cls)

    def lookup(
        cls,
        field: StrField | None = None,
        return_field: StrField | None = None,
    ) -> NamedTuple:
        """Return an auto-complete object for a field.

        Args:
            field: The field to look up the values for. Defaults to first string field.
            return_field: The field to return. If `None`, returns the whole record.

        Returns:
            A `NamedTuple` of lookup information of the field values with a
            dictionary converter.

        See Also:
            :meth:`~lamindb.core.Record.search`

        Examples:
            >>> import bionty as bt
            >>> bt.settings.organism = "human"
            >>> bt.Gene.from_source(symbol="ADGB-DT").save()
            >>> lookup = bt.Gene.lookup()
            >>> lookup.adgb_dt
            >>> lookup_dict = lookup.dict()
            >>> lookup_dict['ADGB-DT']
            >>> lookup_by_ensembl_id = bt.Gene.lookup(field="ensembl_gene_id")
            >>> lookup_by_ensembl_id.ensg00000002745
            >>> lookup_return_symbols = bt.Gene.lookup(field="ensembl_gene_id", return_field="symbol")
        """
        pass

    def filter(cls, *queries, **expressions) -> QuerySet:
        """Query records.

        Args:
            queries: One or multiple `Q` objects.
            expressions: Fields and values passed as Django query expressions.

        Returns:
            A :class:`~lamindb.core.QuerySet`.

        See Also:
            - Guide: :doc:`docs:registries`
            - Django documentation: `Queries <https://docs.djangoproject.com/en/stable/topics/db/queries/>`__

        Examples:
            >>> ln.ULabel(name="my label").save()
            >>> ln.ULabel.filter(name__startswith="my").df()
        """
        pass

    def get(
        cls,
        idlike: int | str | None = None,
        **expressions,
    ) -> Record:
        """Get a single record.

        Args:
            idlike: Either a uid stub, uid or an integer id.
            expressions: Fields and values passed as Django query expressions.

        Returns:
            A record.

        Raises:
            :exc:`docs:lamindb.core.exceptions.DoesNotExist`: In case no matching record is found.

        See Also:
            - Guide: :doc:`docs:registries`
            - Django documentation: `Queries <https://docs.djangoproject.com/en/stable/topics/db/queries/>`__

        Examples:
            >>> ulabel = ln.ULabel.get("FvtpPJLJ")
            >>> ulabel = ln.ULabel.get(name="my-label")
        """
        pass

    def df(
        cls,
        include: str | list[str] | None = None,
        features: bool | list[str] = False,
        limit: int = 100,
    ) -> pd.DataFrame:
        """Convert to `pd.DataFrame`.

        By default, shows all direct fields, except `updated_at`.

        Use arguments `include` or `features` to include other data.

        Args:
            include: Related fields to include as columns. Takes strings of
                form `"ulabels__name"`, `"cell_types__name"`, etc. or a list
                of such strings.
            features: If `True`, map all features of the
                :class:`~lamindb.Feature` registry onto the resulting
                `DataFrame`. Only available for `Artifact`.
            limit: Maximum number of rows to display from a Pandas DataFrame.
                Defaults to 100 to reduce database load.

        Examples:

            Include the name of the creator in the `DataFrame`:

            >>> ln.ULabel.df(include="created_by__name")

            Include display of features for `Artifact`:

            >>> df = ln.Artifact.df(features=True)
            >>> ln.view(df)  # visualize with type annotations

            Only include select features:

            >>> df = ln.Artifact.df(features=["cell_type_by_expert", "cell_type_by_model"])
        """
        pass

    def search(
        cls,
        string: str,
        *,
        field: StrField | None = None,
        limit: int | None = 20,
        case_sensitive: bool = False,
    ) -> QuerySet:
        """Search.

        Args:
            string: The input string to match against the field ontology values.
            field: The field or fields to search. Search all string fields by default.
            limit: Maximum amount of top results to return.
            case_sensitive: Whether the match is case sensitive.

        Returns:
            A :class:`~lamindb.core.QuerySet` of search results, as declared by
            the return annotation.

        See Also:
            :meth:`~lamindb.core.Record.filter`
            :meth:`~lamindb.core.Record.lookup`

        Examples:
            >>> ulabels = ln.ULabel.from_values(["ULabel1", "ULabel2", "ULabel3"], field="name")
            >>> ln.save(ulabels)
            >>> ln.ULabel.search("ULabel2")
        """
        pass

    def using(
        cls,
        instance: str | None,
    ) -> QuerySet:
        """Use a non-default LaminDB instance.

        Args:
            instance: An instance identifier of form "account_handle/instance_name".

        Examples:
            >>> ln.ULabel.using("account_handle/instance_name").search("ULabel7", field="name")
                        uid    score
            name
            ULabel7  g7Hk9b2v  100.0
            ULabel5  t4Jm6s0q   75.0
            ULabel6  r2Xw8p1z   75.0
        """
        pass

    def __get_module_name__(cls) -> str:
        """Return the short module name of the registry.

        Takes the top-level package of `cls.__module__`, strips the
        `lnschema_` prefix and maps `lamindb` to `core`.
        """
        schema_module_name = cls.__module__.split(".")[0]
        module_name = schema_module_name.replace("lnschema_", "")
        if module_name == "lamindb":
            module_name = "core"
        return module_name

    @deprecated("__get_module_name__")
    def __get_schema_name__(cls) -> str:
        return cls.__get_module_name__()

    def __get_name_with_module__(cls) -> str:
        """Return the class name, prefixed with `<module>.` unless the module is `core`."""
        module_name = cls.__get_module_name__()
        if module_name == "core":
            module_prefix = ""
        else:
            module_prefix = f"{module_name}."
        return f"{module_prefix}{cls.__name__}"

    @deprecated("__get_name_with_module__")
    def __get_name_with_schema__(cls) -> str:
        return cls.__get_name_with_module__()
805
+
806
+
807
class BasicRecord(models.Model, metaclass=Registry):
    """Basic metadata record.

    Shares the methods of `Record` but declares none of its additional fields.

    Mainly used for LinkORMs and similar.
    """

    class Meta:
        abstract = True
817
+
818
+
819
class Space(BasicRecord):
    """Spaces.

    `Record` points to this registry through its `space` foreign key.
    """

    id: int = models.SmallAutoField(primary_key=True)
    """Internal id, valid only in one DB instance."""
    name: str = models.CharField(max_length=100, db_index=True)
    """Name of space."""
    # the same 8-character default is declared both in Python and in the database
    uid: str = CharField(
        unique=True,
        max_length=12,
        default="00000000",
        db_default="00000000",
        db_index=True,
    )
    """Universal id."""
    description: str | None = CharField(null=True)
    """Description of space."""
    created_at: datetime = DateTimeField(auto_now_add=True, db_index=True)
    """Time of creation of record."""
    # nullable with `default=None`, so a space can be stored without a creator
    created_by: User = ForeignKey(
        "User", CASCADE, default=None, related_name="+", null=True
    )
    """Creator of space."""
842
+
843
+
844
+ @doc_args(RECORD_REGISTRY_EXAMPLE)
845
+ class Record(BasicRecord, metaclass=Registry):
846
+ """Metadata record.
847
+
848
+ Every `Record` is a data model that comes with a registry in form of a SQL
849
+ table in your database.
850
+
851
+ Sub-classing `Record` creates a new registry while instantiating a `Record`
852
+ creates a new record.
853
+
854
+ {}
855
+
856
+ `Record`'s metaclass is :class:`~lamindb.core.Registry`.
857
+
858
+ `Record` inherits from Django's `Model` class. Why does LaminDB call it `Record`
859
+ and not `Model`? The term `Record` can't lead to confusion with statistical,
860
+ machine learning or biological models.
861
+ """
862
+
863
+ _branch_code: int = models.SmallIntegerField(db_index=True, default=1, db_default=1)
864
+ """Whether record is on a branch, in archive or in trash.
865
+
866
+ This dictates whether a record appears in queries & searches.
867
+
868
+ Coding is as follows:
869
+
870
+ - 3: template (hidden in queries & searches)
871
+ - 2: draft (hidden in queries & searches)
872
+ - 1: default (visible in queries & searches)
873
+ - 0: archive (hidden, meant to be kept)
874
+ - -1: trash (hidden, scheduled for deletion)
875
+
876
+ Any integer higher than 3 codes a branch that's involved in a pull request.
877
+ """
878
+ space: Space = ForeignKey(Space, PROTECT, default=1, db_default=1)
879
+ """The space in which the record lives."""
880
+ _aux: dict[str, Any] | None = JSONField(default=None, db_default=None, null=True)
881
+ """Auxiliary field for dictionary-like metadata."""
882
+
883
+ def save(self, *args, **kwargs) -> Record:
884
+ """Save.
885
+
886
+ Always saves to the default database.
887
+ """
888
+ # we need this here because we're using models also from plain
889
+ # django outside of lamindb
890
+ super().save(*args, **kwargs)
891
+ return self
892
+
893
+ def delete(self) -> None:
894
+ """Delete."""
895
+ pass
896
+
897
+ class Meta:
898
+ abstract = True
899
+
900
+
901
+ class FeatureManager:
902
+ """Feature manager."""
903
+
904
+ pass
905
+
906
+
907
+ class ParamManager:
908
+ """Param manager."""
909
+
910
+ pass
911
+
912
+
913
+ class ParamManagerArtifact(ParamManager):
914
+ """Param manager."""
915
+
916
+ pass
917
+
918
+
919
+ class ParamManagerRun(ParamManager):
920
+ """Param manager."""
921
+
922
+ pass
923
+
924
+
925
+ # -------------------------------------------------------------------------------------
926
+ # A note on required fields at the Record level
927
+ #
928
+ # As Django does most of its validation on the Form-level, it doesn't offer functionality
929
+ # for validating the integrity of a Record object upon instantiation (similar to pydantic)
930
+ #
931
+ # For required fields, we define them as commonly done on the SQL level together
932
+ # with a validator in Record (validate_required_fields)
933
+ #
934
+ # This goes against the Django convention, but goes with the SQLModel convention
935
+ # (Optional fields can be null on the SQL level, non-optional fields cannot)
936
+ #
937
+ # Due to Django's convention where CharFieldAttr has pre-configured (null=False, default=""), marking
938
+ # a required field necessitates passing `default=None`. Without the validator it would trigger
939
+ # an error at the SQL-level, with it, it triggers it at instantiation
940
+
941
+ # -------------------------------------------------------------------------------------
942
+ # A note on class and instance methods of core Record
943
+ #
944
+ # All of these are defined and tested within lamindb, in files starting with _{orm_name}.py
945
+
946
+ # -------------------------------------------------------------------------------------
947
+ # A note on maximal lengths of char fields
948
+ #
949
+ # 100 characters:
950
+ # "Raindrops pitter-pattered on the windowpane, blurring the"
951
+ # "city lights outside, curled up with a mug."
952
+ # A good maximal length for a name (title).
953
+ #
954
+ # 150 characters: We choose this for name maximal length because some users like long names.
955
+ #
956
+ # 255 characters:
957
+ # "In creating a precise 255-character paragraph, one engages in"
958
+ # "a dance of words, where clarity meets brevity. Every syllable counts,"
959
+ # "illustrating the skill in compact expression, ensuring the essence of the"
960
+ # "message shines through within the exacting limit."
961
+ # This is a good maximal length for a description field.
962
+
963
+
964
+ class User(BasicRecord, CanCurate):
965
+ """Users.
966
+
967
+ All data in this registry is synched from `lamin.ai` to ensure a universal
968
+ user identity. There is no need to manually create records.
969
+
970
+ Examples:
971
+
972
+ Query a user by handle:
973
+
974
+ >>> user = ln.User.get(handle="testuser1")
975
+ >>> user
976
+ """
977
+
978
+ _name_field: str = "handle"
979
+
980
+ id: int = models.AutoField(primary_key=True)
981
+ """Internal id, valid only in one DB instance."""
982
+ uid: str = CharField(unique=True, db_index=True, max_length=8)
983
+ """Universal id, valid across DB instances."""
984
+ handle: str = CharField(max_length=30, unique=True, db_index=True)
985
+ """Universal handle, valid across DB instances (required)."""
986
+ name: str | None = CharField(max_length=150, db_index=True, null=True)
987
+ """Name (optional).""" # has to match hub specification, where it's also optional
988
+ created_artifacts: Artifact
989
+ """Artifacts created by user."""
990
+ created_transforms: Transform
991
+ """Transforms created by user."""
992
+ created_runs: Run
993
+ """Runs created by user."""
994
+ created_at: datetime = DateTimeField(auto_now_add=True, db_index=True)
995
+ """Time of creation of record."""
996
+ updated_at: datetime = DateTimeField(auto_now=True, db_index=True)
997
+ """Time of last update to record."""
998
+
999
+ @overload
1000
+ def __init__(
1001
+ self,
1002
+ handle: str,
1003
+ email: str,
1004
+ name: str | None,
1005
+ ): ...
1006
+
1007
+ @overload
1008
+ def __init__(
1009
+ self,
1010
+ *db_args,
1011
+ ): ...
1012
+
1013
+ def __init__(
1014
+ self,
1015
+ *args,
1016
+ **kwargs,
1017
+ ):
1018
+ super().__init__(*args, **kwargs)
1019
+
1020
+
1021
+ class Storage(Record, TracksRun, TracksUpdates):
1022
+ """Storage locations.
1023
+
1024
+ A storage location is either a directory/folder (local or in the cloud) or
1025
+ an entire S3/GCP bucket.
1026
+
1027
+ A LaminDB instance can manage and link multiple storage locations. But any
1028
+ storage location is managed by *at most one* LaminDB instance.
1029
+
1030
+ .. dropdown:: Managed vs. linked storage locations
1031
+
1032
+ The LaminDB instance can update & delete artifacts in managed storage
1033
+ locations but merely read artifacts in linked storage locations.
1034
+
1035
+ When you transfer artifacts from another instance, the default is to
1036
+ only copy metadata into the target instance, but merely link the data.
1037
+
1038
+ The `instance_uid` field indicates the managing LaminDB instance of a
1039
+ storage location.
1040
+
1041
+ When you delete a LaminDB instance, you'll be warned about data in managed
1042
+ storage locations while data in linked storage locations is ignored.
1043
+
1044
+ See Also:
1045
+ :attr:`~lamindb.core.Settings.storage`
1046
+ Default storage.
1047
+ :attr:`~lamindb.setup.core.StorageSettings`
1048
+ Storage settings.
1049
+
1050
+ Examples:
1051
+
1052
+ Configure the default storage location upon initiation of a LaminDB instance::
1053
+
1054
+ lamin init --storage ./mydata # or "s3://my-bucket" or "gs://my-bucket"
1055
+
1056
+ View the default storage location:
1057
+
1058
+ >>> ln.settings.storage
1059
+ PosixPath('/home/runner/work/lamindb/lamindb/docs/guide/mydata')
1060
+
1061
+ Dynamically change the default storage:
1062
+
1063
+ >>> ln.settings.storage = "./storage_2" # or a cloud bucket
1064
+ """
1065
+
1066
+ class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta):
1067
+ abstract = False
1068
+
1069
+ _name_field: str = "root"
1070
+
1071
+ id: int = models.AutoField(primary_key=True)
1072
+ """Internal id, valid only in one DB instance."""
1073
+ uid: str = CharField(unique=True, max_length=12, default=base62_12, db_index=True)
1074
+ """Universal id, valid across DB instances."""
1075
+ # we are very conservative here with 255 characters
1076
+ root: str = CharField(max_length=255, db_index=True, unique=True)
1077
+ """Root path of storage: an s3 path, a local path, etc. (required)."""
1078
+ description: str | None = CharField(max_length=255, db_index=True, null=True)
1079
+ """A description of what the storage location is used for (optional)."""
1080
+ type: str = CharField(max_length=30, db_index=True)
1081
+ """Can be "local" vs. "s3" vs. "gs"."""
1082
+ region: str | None = CharField(max_length=64, db_index=True, null=True)
1083
+ """Cloud storage region, if applicable."""
1084
+ instance_uid: str | None = CharField(max_length=12, db_index=True, null=True)
1085
+ """Instance that manages this storage location."""
1086
+ artifacts: Artifact
1087
+ """Artifacts contained in this storage location."""
1088
+
1089
+ @overload
1090
+ def __init__(
1091
+ self,
1092
+ root: str,
1093
+ type: str,
1094
+ region: str | None,
1095
+ ): ...
1096
+
1097
+ @overload
1098
+ def __init__(
1099
+ self,
1100
+ *db_args,
1101
+ ): ...
1102
+
1103
+ def __init__(
1104
+ self,
1105
+ *args,
1106
+ **kwargs,
1107
+ ):
1108
+ super().__init__(*args, **kwargs)
1109
+
1110
+ @property
1111
+ def path(self) -> Path | UPath:
1112
+ """Bucket or folder path.
1113
+
1114
+ Cloud storage bucket:
1115
+
1116
+ >>> ln.Storage("s3://my-bucket").save()
1117
+
1118
+ Directory/folder in cloud storage:
1119
+
1120
+ >>> ln.Storage("s3://my-bucket/my-directory").save()
1121
+
1122
+ Local directory/folder:
1123
+
1124
+ >>> ln.Storage("./my-directory").save()
1125
+ """
1126
+ pass
1127
+
1128
+
1129
+ # does not inherit from TracksRun because the Transform
1130
+ # is needed to define a run
1131
+ class Transform(Record, IsVersioned):
1132
+ """Data transformations.
1133
+
1134
+ A "transform" can refer to a Python function, a script, a notebook, or a
1135
+ pipeline. If you execute a transform, you generate a run
1136
+ (:class:`~lamindb.Run`). A run has inputs and outputs.
1137
+
1138
+ A pipeline is typically created with a workflow tool (Nextflow, Snakemake,
1139
+ Prefect, Flyte, MetaFlow, redun, Airflow, ...) and stored in a versioned
1140
+ repository.
1141
+
1142
+ Transforms are versioned so that a given transform version maps on a given
1143
+ source code version.
1144
+
1145
+ .. dropdown:: Can I sync transforms to git?
1146
+
1147
+ If you switch on
1148
+ :attr:`~lamindb.core.Settings.sync_git_repo` a script-like transform is
1149
+ synched to its hashed state in a git repository upon calling `ln.track()`.
1150
+
1151
+ >>> ln.settings.sync_git_repo = "https://github.com/laminlabs/lamindb"
1152
+ >>> ln.track()
1153
+
1154
+ The definition of transforms and runs is consistent with the OpenLineage
1155
+ specification where a :class:`~lamindb.Transform` record would be called a
1156
+ "job" and a :class:`~lamindb.Run` record a "run".
1157
+
1158
+ Args:
1159
+ name: `str` A name or title.
1160
+ key: `str | None = None` A short name or path-like semantic key.
1161
+ type: `TransformType | None = "pipeline"` See :class:`~lamindb.base.types.TransformType`.
1162
+ revises: `Transform | None = None` An old version of the transform.
1163
+
1164
+ See Also:
1165
+ :meth:`~lamindb.core.Context.track`
1166
+ Globally track a script, notebook or pipeline run.
1167
+ :class:`~lamindb.Run`
1168
+ Executions of transforms.
1169
+
1170
+ Notes:
1171
+ - :doc:`docs:track`
1172
+ - :doc:`docs:data-flow`
1173
+ - :doc:`docs:redun`
1174
+ - :doc:`docs:nextflow`
1175
+ - :doc:`docs:snakemake`
1176
+
1177
+ Examples:
1178
+
1179
+ Create a transform for a pipeline:
1180
+
1181
+ >>> transform = ln.Transform(name="Cell Ranger", version="7.2.0", type="pipeline").save()
1182
+
1183
+ Create a transform from a notebook:
1184
+
1185
+ >>> ln.track()
1186
+
1187
+ View predecessors of a transform:
1188
+
1189
+ >>> transform.view_lineage()
1190
+ """
1191
+
1192
+ class Meta(Record.Meta, IsVersioned.Meta):
1193
+ abstract = False
1194
+
1195
+ _len_stem_uid: int = 12
1196
+ _len_full_uid: int = 16
1197
+ _name_field: str = "key"
1198
+
1199
+ id: int = models.AutoField(primary_key=True)
1200
+ """Internal id, valid only in one DB instance."""
1201
+ uid: str = CharField(unique=True, db_index=True, max_length=_len_full_uid)
1202
+ """Universal id."""
1203
+ key: str | None = CharField(db_index=True, null=True)
1204
+ """A name or "/"-separated path-like string.
1205
+
1206
+ All transforms with the same key are part of the same version family.
1207
+ """
1208
+ description: str | None = CharField(db_index=True, null=True)
1209
+ """A description."""
1210
+ type: TransformType = CharField(
1211
+ max_length=20,
1212
+ db_index=True,
1213
+ default="pipeline",
1214
+ )
1215
+ """:class:`~lamindb.base.types.TransformType` (default `"pipeline"`)."""
1216
+ source_code: str | None = TextField(null=True)
1217
+ """Source code of the transform.
1218
+
1219
+ .. versionchanged:: 0.75
1220
+ The `source_code` field is no longer an artifact, but a text field.
1221
+ """
1222
+ hash: str | None = CharField(max_length=HASH_LENGTH, db_index=True, null=True)
1223
+ """Hash of the source code."""
1224
+ reference: str | None = CharField(max_length=255, db_index=True, null=True)
1225
+ """Reference for the transform, e.g., a URL."""
1226
+ reference_type: str | None = CharField(max_length=25, db_index=True, null=True)
1227
+ """Reference type of the transform, e.g., 'url'."""
1228
+ runs: Run
1229
+ """Runs of this transform."""
1230
+ ulabels: ULabel = models.ManyToManyField(
1231
+ "ULabel", through="TransformULabel", related_name="transforms"
1232
+ )
1233
+ """ULabel annotations of this transform."""
1234
+ predecessors: Transform = models.ManyToManyField(
1235
+ "self", symmetrical=False, related_name="successors"
1236
+ )
1237
+ """Preceding transforms.
1238
+
1239
+ These are auto-populated whenever an artifact or collection serves as a run
1240
+ input, e.g., `artifact.run` and `artifact.transform` get populated & saved.
1241
+
1242
+ The table provides a more convenient method to query for the predecessors that
1243
+ bypasses querying the :class:`~lamindb.Run`.
1244
+
1245
+ It also allows manually adding predecessors whose outputs are not tracked in a run.
1246
+ """
1247
+ successors: Transform
1248
+ """Subsequent transforms.
1249
+
1250
+ See :attr:`~lamindb.Transform.predecessors`.
1251
+ """
1252
+ output_artifacts: Artifact
1253
+ """The artifacts generated by all runs of this transform.
1254
+
1255
+ If you're looking for the outputs of a single run, see :attr:`lamindb.Run.output_artifacts`.
1256
+ """
1257
+ output_collections: Collection
1258
+ """The collections generated by all runs of this transform.
1259
+
1260
+ If you're looking for the outputs of a single run, see :attr:`lamindb.Run.output_collections`.
1261
+ """
1262
+ created_at: datetime = DateTimeField(auto_now_add=True, db_index=True)
1263
+ """Time of creation of record."""
1264
+ updated_at: datetime = DateTimeField(auto_now=True, db_index=True)
1265
+ """Time of last update to record."""
1266
+ created_by: User = ForeignKey(
1267
+ User, PROTECT, default=current_user_id, related_name="created_transforms"
1268
+ )
1269
+ """Creator of record."""
1270
+ _template: Transform | None = ForeignKey(
1271
+ "Transform", PROTECT, related_name="_derived_from", default=None, null=True
1272
+ )
1273
+ """Creating template."""
1274
+
1275
+ @overload
1276
+ def __init__(
1277
+ self,
1278
+ name: str,
1279
+ key: str | None = None,
1280
+ type: TransformType | None = None,
1281
+ revises: Transform | None = None,
1282
+ ): ...
1283
+
1284
+ @overload
1285
+ def __init__(
1286
+ self,
1287
+ *db_args,
1288
+ ): ...
1289
+
1290
+ def __init__(
1291
+ self,
1292
+ *args,
1293
+ **kwargs,
1294
+ ):
1295
+ super().__init__(*args, **kwargs)
1296
+
1297
+ @property
1298
+ def name(self) -> str:
1299
+ """Name of the transform.
1300
+
1301
+ Splits `key` on `/` and returns the last element.
1302
+ """
1303
+ return self.key.split("/")[-1]
1304
+
1305
+ @property
1306
+ def latest_run(self) -> Run:
1307
+ """The latest run of this transform."""
1308
+ pass
1309
+
1310
+ def view_lineage(self) -> None:
1311
+ """View lineage of transforms."""
1312
+ pass
1313
+
1314
+
1315
+ class Param(Record, CanCurate, TracksRun, TracksUpdates):
1316
+ """Parameters of runs & models."""
1317
+
1318
+ class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta):
1319
+ abstract = False
1320
+
1321
+ _name_field: str = "name"
1322
+
1323
+ name: str = CharField(max_length=100, db_index=True)
1324
+ dtype: str = CharField(max_length=64, db_index=True)
1325
+ """Data type ("num", "cat", "int", "float", "bool", "datetime").
1326
+
1327
+ For categorical types, can define from which registry values are
1328
+ sampled, e.g., `cat[ULabel]` or `cat[bionty.CellType]`.
1329
+ """
1330
+ type: Param | None = ForeignKey("self", PROTECT, null=True, related_name="records")
1331
+ """Type of param (e.g., 'Pipeline', 'ModelTraining', 'PostProcessing').
1332
+
1333
+ Allows grouping params by type, e.g., all read outs, all metrics, etc.
1334
+ """
1335
+ records: Param
1336
+ """Records of this type."""
1337
+ is_type: bool = BooleanField(default=None, db_index=True, null=True)
1338
+ """Distinguish types from instances of the type."""
1339
+ _expect_many: bool = models.BooleanField(default=False, db_default=False)
1340
+ """Indicates whether values for this param are expected to occur a single or multiple times for an artifact/run (default `False`).
1341
+
1342
+ - if it's `False` (default), the values mean artifact/run-level values and a dtype of `datetime` means `datetime`
1343
+ - if it's `True`, the values are from an aggregation; this seems like an edge case, but it could be relevant when characterizing a model ensemble trained with different parameters
1344
+ """
1345
+ schemas: Schema = models.ManyToManyField(
1346
+ "Schema", through="SchemaParam", related_name="params"
1347
+ )
1348
+ """Schemas linked to this param."""
1349
+ # backward fields
1350
+ values: ParamValue
1351
+ """Values for this parameter."""
1352
+
1353
+
1354
+ # FeatureValue behaves in many ways like a link in a LinkORM
1355
+ # in particular, we don't want a _public field on it
1356
+ # Also, we don't inherit from TracksRun because a ParamValue
1357
+ # is typically created before a run is created and we want to
1358
+ # avoid delete cycles (for Model params though it might be helpful)
1359
+ class ParamValue(Record):
1360
+ """Parameter values.
1361
+
1362
+ Is largely analogous to `FeatureValue`.
1363
+ """
1364
+
1365
+ # we do not have a unique constraint on param & value because it leads to hashing errors
1366
+ # for large dictionaries: https://lamin.ai/laminlabs/lamindata/transform/jgTrkoeuxAfs0000
1367
+ # we do not hash values because we have `get_or_create` logic all over the place
1368
+ # and also for checking whether the (param, value) combination exists
1369
+ # there does not seem an issue with querying for a dict-like value
1370
+ # https://lamin.ai/laminlabs/lamindata/transform/jgTrkoeuxAfs0001
1371
+ _name_field: str = "value"
1372
+
1373
+ param: Param = ForeignKey(Param, CASCADE, related_name="values")
1374
+ """The dimension metadata."""
1375
+ value: Any = (
1376
+ models.JSONField()
1377
+ ) # stores float, integer, boolean, datetime or dictionaries
1378
+ """The JSON-like value."""
1379
+ # it'd be confusing and hard to populate a run here because these
1380
+ # values are typically created upon creating a run
1381
+ # hence, ParamValue does _not_ inherit from TracksRun but manually
1382
+ # adds created_at & created_by
1383
+ # because ParamValue cannot be updated, we don't need updated_at
1384
+ created_at: datetime = DateTimeField(auto_now_add=True, db_index=True)
1385
+ """Time of creation of record."""
1386
+ created_by: User = ForeignKey(
1387
+ User, PROTECT, default=current_user_id, related_name="+"
1388
+ )
1389
+ """Creator of record."""
1390
+ hash: str = CharField(max_length=HASH_LENGTH, null=True, db_index=True)
1391
+
1392
+ class Meta:
1393
+ constraints = [
1394
+ # For simple types, use direct value comparison
1395
+ models.UniqueConstraint(
1396
+ fields=["param", "value"],
1397
+ name="unique_simple_param_value",
1398
+ condition=Q(hash__isnull=True),
1399
+ ),
1400
+ # For complex types (dictionaries), use hash
1401
+ models.UniqueConstraint(
1402
+ fields=["param", "hash"],
1403
+ name="unique_complex_param_value",
1404
+ condition=Q(hash__isnull=False),
1405
+ ),
1406
+ ]
1407
+
1408
+ @classmethod
1409
+ def get_or_create(cls, param, value):
1410
+ # Simple types: int, float, str, bool
1411
+ if isinstance(value, (int, float, str, bool)):
1412
+ try:
1413
+ return cls.objects.create(param=param, value=value, hash=None), False
1414
+ except IntegrityError:
1415
+ return cls.objects.get(param=param, value=value), True
1416
+
1417
+ # Complex types: dict, list
1418
+ else:
1419
+ hash = hash_dict(value)
1420
+ try:
1421
+ return cls.objects.create(param=param, value=value, hash=hash), False
1422
+ except IntegrityError:
1423
+ return cls.objects.get(param=param, hash=hash), True
1424
+
1425
+
1426
+ class Run(Record):
1427
+ """Runs of transforms.
1428
+
1429
+ Args:
1430
+ transform: `Transform` A :class:`~lamindb.Transform` record.
1431
+ reference: `str | None = None` For instance, an external ID or a download URL.
1432
+ reference_type: `str | None = None` For instance, `redun_id`, `nextflow_id` or `url`.
1433
+
1434
+ See Also:
1435
+ :meth:`~lamindb.core.Context.track`
1436
+ Track global run & transform records for a notebook or pipeline.
1437
+
1438
+ Examples:
1439
+
1440
+ Create a run record:
1441
+
1442
+ >>> ln.Transform(name="Cell Ranger", version="7.2.0", type="pipeline").save()
1443
+ >>> transform = ln.Transform.get(name="Cell Ranger", version="7.2.0")
1444
+ >>> run = ln.Run(transform)
1445
+
1446
+ Create a global run context for a custom transform:
1447
+
1448
+ >>> ln.track(transform=transform)
1449
+ >>> ln.context.run # globally available run
1450
+
1451
+ Track a global run context for a notebook or script:
1452
+
1453
+ >>> ln.track() # Jupyter notebook metadata is automatically parsed
1454
+ >>> ln.context.run
1455
+ """
1456
+
1457
+ _name_field: str = "started_at"
1458
+
1459
+ params: ParamManager = ParamManagerRun # type: ignore
1460
+ """Param manager.
1461
+
1462
+ Guide: :ref:`track-run-parameters`
1463
+
1464
+ Example::
1465
+
1466
+ run.params.add_values({
1467
+ "learning_rate": 0.01,
1468
+ "input_dir": "s3://my-bucket/mydataset",
1469
+ "downsample": True,
1470
+ "preprocess_params": {
1471
+ "normalization_type": "cool",
1472
+ "subset_highlyvariable": True,
1473
+ },
1474
+ })
1475
+ """
1476
+
1477
+ id: int = models.BigAutoField(primary_key=True)
1478
+ """Internal id, valid only in one DB instance."""
1479
+ uid: str = CharField(unique=True, db_index=True, max_length=20, default=base62_20)
1480
+ """Universal id, valid across DB instances."""
1481
+ name: str | None = CharField(max_length=150, null=True)
1482
+ """A name."""
1483
+ transform = ForeignKey(Transform, CASCADE, related_name="runs")
1484
+ """The transform :class:`~lamindb.Transform` that is being run."""
1485
+ started_at: datetime = DateTimeField(auto_now_add=True, db_index=True)
1486
+ """Start time of run."""
1487
+ finished_at: datetime | None = DateTimeField(db_index=True, null=True, default=None)
1488
+ """Finished time of run."""
1489
+ # we don't want to make below a OneToOne because there could be the same trivial report
1490
+ # generated for many different runs
1491
+ report: Artifact | None = ForeignKey(
1492
+ "Artifact", PROTECT, null=True, related_name="_report_of", default=None
1493
+ )
1494
+ """Report of run, e.g., an html file."""
1495
+ _logfile: Artifact | None = ForeignKey(
1496
+ "Artifact", PROTECT, null=True, related_name="_logfile_of", default=None
1497
+ )
1498
+ """Logfile of run."""
1499
+ environment: Artifact | None = ForeignKey(
1500
+ "Artifact", PROTECT, null=True, related_name="_environment_of", default=None
1501
+ )
1502
+ """Computational environment for the run.
1503
+
1504
+ For instance, `Dockerfile`, `docker image`, `requirements.txt`, `environment.yml`, etc.
1505
+ """
1506
+ input_artifacts: Artifact
1507
+ """The artifacts serving as input for this run.
1508
+
1509
+ Related accessor: :attr:`~lamindb.Artifact.input_of_runs`.
1510
+ """
1511
+ output_artifacts: Artifact
1512
+ """The artifacts generated by this run.
1513
+
1514
+ Related accessor: via :attr:`~lamindb.Artifact.run`
1515
+ """
1516
+ input_collections: Collection
1517
+ """The collections serving as input for this run."""
1518
+ output_collections: Collection
1519
+ """The collections generated by this run."""
1520
+ _param_values: ParamValue = models.ManyToManyField(
1521
+ ParamValue, through="RunParamValue", related_name="runs"
1522
+ )
1523
+ """Parameter values."""
1524
+ reference: str | None = CharField(max_length=255, db_index=True, null=True)
1525
+ """A reference like a URL or external ID (such as from a workflow manager)."""
1526
+ reference_type: str | None = CharField(max_length=25, db_index=True, null=True)
1527
+ """Type of reference such as a workflow manager execution ID."""
1528
+ created_at: datetime = DateTimeField(auto_now_add=True, db_index=True)
1529
+ """Time of first creation. Mismatches ``started_at`` if the run is re-run."""
1530
+ created_by: User = ForeignKey(
1531
+ User, CASCADE, default=current_user_id, related_name="created_runs"
1532
+ )
1533
+ """Creator of run."""
1534
+ initiated_by_run: Run | None = ForeignKey(
1535
+ "Run", CASCADE, null=True, related_name="initiated_runs", default=None
1536
+ )
1537
+ """The run that triggered the current run.
1538
+
1539
+ This is not a preceding run. The preceding runs ("predecessors") are the set
1540
+ of runs that produced the output artifacts that serve as the inputs for the
1541
+ present run.
1542
+
1543
+ Be careful with using this field at this point.
1544
+ """
1545
+ children: Run
1546
+ """The runs that are triggered by this run."""
1547
+ _is_consecutive: bool | None = BooleanField(null=True)
1548
+ """Indicates whether code was consecutively executed. Is relevant for notebooks."""
1549
+ _status_code: int = models.SmallIntegerField(default=0, db_index=True)
1550
+ """Status code of the run.
1551
+
1552
+ - 0: scheduled
1553
+ - 1: started
1554
+ - 2: errored
1555
+ - 3: aborted
1556
+ - 4: completed
1557
+ """
1558
+
1559
+ @overload
1560
+ def __init__(
1561
+ self,
1562
+ transform: Transform,
1563
+ reference: str | None = None,
1564
+ reference_type: str | None = None,
1565
+ ): ...
1566
+
1567
+ @overload
1568
+ def __init__(
1569
+ self,
1570
+ *db_args,
1571
+ ): ...
1572
+
1573
+ def __init__(
1574
+ self,
1575
+ *args,
1576
+ **kwargs,
1577
+ ):
1578
+ super().__init__(*args, **kwargs)
1579
+
1580
+
1581
+ class ULabel(Record, HasParents, CanCurate, TracksRun, TracksUpdates):
1582
+ """Universal labels.
1583
+
1584
+ Args:
1585
+ name: `str` A name.
1586
+ description: `str` A description.
1587
+ reference: `str | None = None` For instance, an external ID or a URL.
1588
+ reference_type: `str | None = None` For instance, `"url"`.
1589
+
1590
+ A `ULabel` record provides the easiest way to annotate a dataset
1591
+ with a label: `"My project"`, `"curated"`, or `"Batch X"`:
1592
+
1593
+ >>> my_project = ULabel(name="My project")
1594
+ >>> my_project.save()
1595
+ >>> artifact.ulabels.add(my_project)
1596
+
1597
+ Often, a ulabel is measured *within* a dataset. For instance, an artifact
1598
+ might characterize 2 species of the Iris flower (`"setosa"` &
1599
+ `"versicolor"`) measured by a `"species"` feature. Use the
1600
+ :class:`~lamindb.Curator` flow to automatically parse, validate, and
1601
+ annotate with labels that are contained in `DataFrame` or `AnnData`
1602
+ artifacts.
1603
+
1604
+ .. note::
1605
+
1606
+ If you work with complex entities like cell lines, cell types, tissues,
1607
+ etc., consider using the pre-defined biological registries in
1608
+ :mod:`bionty` to label artifacts & collections.
1609
+
1610
+ If you work with biological samples, likely, the only sustainable way of
1611
+ tracking metadata, is to create a custom schema module.
1612
+
1613
+ See Also:
1614
+ :class:`~lamindb.Feature`
1615
+ Dimensions of measurement for artifacts & collections.
1616
+ :attr:`~lamindb.Artifact.features`
1617
+ Feature manager for an artifact.
1618
+
1619
+ Examples:
1620
+
1621
+ Create a new label:
1622
+
1623
+ >>> train_split = ln.ULabel(name="train").save()
1624
+
1625
+ Organize labels in a hierarchy:
1626
+
1627
+ >>> split_type = ln.ULabel(name="Split", is_type=True).save()
1628
+ >>> train_split = ln.ULabel(name="train", type=split_type).save()
1629
+
1630
+ Label an artifact:
1631
+
1632
+ >>> artifact.ulabels.add(ulabel)
1633
+
1634
+ Query by `ULabel`:
1635
+
1636
+ >>> ln.Artifact.filter(ulabels=train_split)
1637
+ """
1638
+
1639
+ class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta):
1640
+ abstract = False
1641
+
1642
+ _name_field: str = "name"
1643
+
1644
+ id: int = models.AutoField(primary_key=True)
1645
+ """Internal id, valid only in one DB instance."""
1646
+ uid: str = CharField(unique=True, db_index=True, max_length=8, default=base62_8)
1647
+ """A universal random id, valid across DB instances."""
1648
+ name: str = CharField(max_length=150, db_index=True)
1649
+ """Name or title of ulabel (`unique=True`)."""
1650
+ type: ULabel | None = ForeignKey("self", PROTECT, null=True, related_name="records")
1651
+ """Type of ulabel, e.g., `"donor"`, `"split"`, etc.
1652
+
1653
+ Allows to group ulabels by type, e.g., all donors, all split ulabels, etc.
1654
+ """
1655
+ records: ULabel
1656
+ """Records of this type."""
1657
+ is_type: bool = BooleanField(default=None, db_index=True, null=True)
1658
+ """Distinguish types from instances of the type.
1659
+
1660
+ For example, a ulabel "Project" would be a type, and the actual projects "Project 1", "Project 2", would be records of that `type`.
1661
+ """
1662
+ description: str | None = TextField(null=True)
1663
+ """A description (optional)."""
1664
+ reference: str | None = CharField(max_length=255, db_index=True, null=True)
1665
+ """A reference like URL or external ID."""
1666
+ reference_type: str | None = CharField(max_length=25, db_index=True, null=True)
1667
+ """Type of reference such as a donor_id from Vendor X."""
1668
+ parents: ULabel = models.ManyToManyField(
1669
+ "self", symmetrical=False, related_name="children"
1670
+ )
1671
+ """Parent entities of this ulabel.
1672
+
1673
+ For advanced use cases, you can build an ontology under a given `type`.
1674
+
1675
+ Say, if you modeled `CellType` as a `ULabel`, you would introduce a type `CellType` and model the hierarchy of cell types under it.
1676
+ """
1677
+ children: ULabel
1678
+ """Child entities of this ulabel.
1679
+
1680
+ Reverse accessor for parents.
1681
+ """
1682
+ transforms: Transform
1683
+ """Transforms annotated with this ulabel."""
1684
+ artifacts: Artifact
1685
+ """Artifacts annotated with this ulabel."""
1686
+ collections: Collection
1687
+ """Collections annotated with this ulabel."""
1688
+
1689
+ @overload
1690
+ def __init__(
1691
+ self,
1692
+ name: str,
1693
+ description: str | None = None,
1694
+ reference: str | None = None,
1695
+ reference_type: str | None = None,
1696
+ ): ...
1697
+
1698
+ @overload
1699
+ def __init__(
1700
+ self,
1701
+ *db_args,
1702
+ ): ...
1703
+
1704
+ def __init__(
1705
+ self,
1706
+ *args,
1707
+ **kwargs,
1708
+ ):
1709
+ pass
1710
+
1711
+
1712
class Feature(Record, CanCurate, TracksRun, TracksUpdates):
    """Dataset dimensions.

    A feature represents a dimension of a dataset, such as a column in a
    `DataFrame`. The `Feature` registry organizes metadata of features.

    The `Feature` registry helps you organize and query datasets based on their
    features and corresponding label annotations. For instance, when working
    with a "T cell" label, it could be measured through different features
    such as `"cell_type_by_expert"` where an expert manually classified the
    cell, or `"cell_type_by_model"` where a computational model made the
    classification.

    The two most important metadata of a feature are its `name` and the `dtype`.
    In addition to typical data types, LaminDB has a `"num"` `dtype` to
    concisely denote the union of all numerical types.

    Args:
        name: `str` Name of the feature, typically a column name.
        dtype: `FeatureDtype | Registry | list[Registry]` See :class:`~lamindb.base.types.FeatureDtype`.
            For categorical types, can define from which registry values are
            sampled, e.g., `ULabel` or `[ULabel, bionty.CellType]`.
        unit: `str | None = None` Unit of measure, ideally SI (`"m"`, `"s"`, `"kg"`, etc.) or `"normalized"` etc.
        description: `str | None = None` A description.
        synonyms: `str | None = None` Bar-separated synonyms.

    Note:

        For more control, you can use :mod:`bionty` registries to manage simple
        biological entities like genes, proteins & cell markers. Or you define
        custom registries to manage high-level derived features like gene sets.

    See Also:
        :meth:`~lamindb.Feature.from_df`
            Create feature records from DataFrame.
        :attr:`~lamindb.Artifact.features`
            Feature manager of an artifact or collection.
        :class:`~lamindb.ULabel`
            Universal labels.
        :class:`~lamindb.Schema`
            Feature sets.

    Example:

        A simple `"str"` feature.

        >>> ln.Feature(
        ...     name="sample_note",
        ...     dtype="str",
        ... ).save()

        A dtype `"cat[ULabel]"` can be more easily passed as below.

        >>> ln.Feature(
        ...     name="project",
        ...     dtype=ln.ULabel,
        ... ).save()

        A dtype `"cat[ULabel|bionty.CellType]"` can be more easily passed as below.

        >>> ln.Feature(
        ...     name="cell_type",
        ...     dtype=[ln.ULabel, bt.CellType],
        ... ).save()

    Hint:

        *Features* and *labels* denote two ways of using entities to organize data:

        1. A feature qualifies *what* is measured, i.e., a numerical or categorical random variable
        2. A label *is* a measured value, i.e., a category

        Consider annotating a dataset with the fact that it measured the
        expression of 30k genes: genes relate to the dataset as feature
        identifiers through a feature set with 30k members. Now consider
        annotating the artifact with the fact that it measured the knock-out
        of 3 genes: here, the 3 genes act as labels of the dataset.

        Re-shaping data can introduce ambiguity among features & labels. If this
        happened, ask yourself what the joint measurement was: a feature
        qualifies variables in a joint measurement. The canonical data matrix
        lists jointly measured variables in the columns.

    """

    class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta):
        abstract = False

    _name_field: str = "name"

    id: int = models.AutoField(primary_key=True)
    """Internal id, valid only in one DB instance."""
    uid: str = CharField(unique=True, db_index=True, max_length=12, default=base62_12)
    """Universal id, valid across DB instances."""
    name: str = CharField(max_length=150, db_index=True, unique=True)
    """Name of feature (`unique=True`)."""
    dtype: FeatureDtype = CharField(db_index=True)
    """Data type (:class:`~lamindb.base.types.FeatureDtype`).

    For categorical types, can define from which registry values are
    sampled, e.g., `'cat[ULabel]'` or `'cat[bionty.CellType]'`. Unions are also
    allowed if the feature samples from two registries, e.g., `'cat[ULabel|bionty.CellType]'`
    """
    type: Feature | None = ForeignKey(
        "self", PROTECT, null=True, related_name="records"
    )
    """Type of feature (e.g., 'Readout', 'Metric', 'Metadata', 'ExpertAnnotation', 'ModelPrediction').

    Allows to group features by type, e.g., all read outs, all metrics, etc.
    """
    records: Feature
    """Records of this type."""
    is_type: bool = BooleanField(default=None, db_index=True, null=True)
    """Distinguish types from instances of the type."""
    unit: str | None = CharField(max_length=30, db_index=True, null=True)
    """Unit of measure, ideally SI (`m`, `s`, `kg`, etc.) or 'normalized' etc. (optional)."""
    description: str | None = TextField(db_index=True, null=True)
    """A description."""
    array_rank: int = models.SmallIntegerField(default=0, db_index=True)
    """Rank of feature.

    Number of indices of the array: 0 for scalar, 1 for vector, 2 for matrix.

    Is called `.ndim` in `numpy` and `pytorch` but shouldn't be confused with
    the dimension of the feature space.
    """
    array_size: int = models.IntegerField(default=0, db_index=True)
    """Number of elements of the feature.

    Total number of elements (product of shape components) of the array.

    - A number or string (a scalar): 1
    - A 50-dimensional embedding: 50
    - A 25 x 25 image: 625
    """
    array_shape: list[int] | None = JSONField(default=None, db_default=None, null=True)
    """Shape of the feature.

    - A number or string (a scalar): [1]
    - A 50-dimensional embedding: [50]
    - A 25 x 25 image: [25, 25]

    Is stored as a list rather than a tuple because it's serialized as JSON.
    """
    proxy_dtype: FeatureDtype | None = CharField(default=None, null=True)
    """Proxy data type.

    If the feature is an image it's often stored via a path to the image file. Hence, while the dtype might be
    image with a certain shape, the proxy dtype would be str.
    """
    synonyms: str | None = TextField(null=True)
    """Bar-separated (|) synonyms (optional)."""
    # we define the below ManyToMany on the feature model because it parallels
    # how other registries (like Gene, Protein, etc.) relate to Schema
    # it makes the API more consistent
    schemas: Schema = models.ManyToManyField(
        "Schema", through="SchemaFeature", related_name="features"
    )
    """Feature sets linked to this feature."""
    _expect_many: bool = models.BooleanField(default=True, db_default=True)
    """Indicates whether values for this feature are expected to occur a single or multiple times for an artifact (default `True`).

    - if it's `True` (default), the values come from an observation-level aggregation and a dtype of `datetime` on the observation-level mean `set[datetime]` on the artifact-level
    - if it's `False` it's an artifact-level value and datetime means datetime; this is an edge case because an arbitrary artifact would always be a set of arbitrary measurements that would need to be aggregated ("one just happens to measure a single cell line in that artifact")
    """
    # NOTE(review): nullable JSON field without a docstring; its contents are
    # not visible from this class.
    _curation: dict[str, Any] = JSONField(default=None, db_default=None, null=True)
    # backward fields
    values: FeatureValue
    """Values for this feature."""

    # The optional arguments default to `None`, matching the `Args` section of
    # the class docstring and the examples that only pass `name` and `dtype`.
    @overload
    def __init__(
        self,
        name: str,
        dtype: FeatureDtype | Registry | list[Registry],
        unit: str | None = None,
        description: str | None = None,
        synonyms: str | None = None,
    ): ...

    @overload
    def __init__(
        self,
        *db_args,
    ): ...

    def __init__(
        self,
        *args,
        **kwargs,
    ):
        # Stub body: nothing is executed here.
        # NOTE(review): presumably the real constructor is attached elsewhere
        # in the package — confirm.
        pass

    @classmethod
    def from_df(cls, df: pd.DataFrame, field: FieldAttr | None = None) -> RecordList:
        """Create Feature records for columns.

        Args:
            df: The dataframe whose columns are turned into features.
            field: Optional registry field (semantics not visible in this
                stub — confirm against the implementation).
        """
        pass

    def save(self, *args, **kwargs) -> Feature:
        """Save."""
        pass
1913
+
1914
+
1915
class FeatureValue(Record, TracksRun):
    """Non-categorical features values.

    Categorical feature values are stored in their respective registries:
    :class:`~lamindb.ULabel`, :class:`~bionty.CellType`, etc.

    Unlike for ULabel, in `FeatureValue`, values are grouped by features and
    not by an ontological hierarchy.
    """

    # NOTE(review): the comment below says there is no unique constraint on
    # feature & value, yet `Meta.constraints` declares two conditional unique
    # constraints; the comment looks stale — confirm and update.
    # we do not have a unique constraint on feature & value because it leads to hashing errors
    # for large dictionaries: https://lamin.ai/laminlabs/lamindata/transform/jgTrkoeuxAfs0000
    # we do not hash values because we have `get_or_create` logic all over the place
    # and also for checking whether the (feature, value) combination exists
    # there does not seem an issue with querying for a dict-like value
    # https://lamin.ai/laminlabs/lamindata/transform/jgTrkoeuxAfs0001

    _name_field: str = "value"

    feature: Feature | None = ForeignKey(
        Feature, CASCADE, null=True, related_name="values", default=None
    )
    """The dimension metadata."""
    value: Any = models.JSONField()
    """The JSON-like value."""
    hash: str | None = CharField(max_length=HASH_LENGTH, null=True, db_index=True)
    """Value hash (`None` for simple values, see :meth:`get_or_create`)."""

    # NOTE(review): the class derives from `Record` but `Meta` derives from
    # `BasicRecord.Meta` — confirm this is intended.
    class Meta(BasicRecord.Meta, TracksRun.Meta):
        constraints = [
            # For simple types, use direct value comparison
            models.UniqueConstraint(
                fields=["feature", "value"],
                name="unique_simple_feature_value",
                condition=Q(hash__isnull=True),
            ),
            # For complex types (dictionaries), use hash
            models.UniqueConstraint(
                fields=["feature", "hash"],
                name="unique_complex_feature_value",
                condition=Q(hash__isnull=False),
            ),
        ]

    @classmethod
    def get_or_create(cls, feature, value):
        """Create the `(feature, value)` record or fetch the existing one.

        Simple values (`int`, `float`, `str`, `bool`) are stored with
        `hash=None` and looked up by `value`; any other value is hashed with
        `hash_dict` and looked up by `hash`.

        Args:
            feature: The feature the value belongs to.
            value: The JSON-like value.

        Returns:
            A tuple `(record, existed)`: `existed` is `False` if the record was
            newly created and `True` if an existing record was fetched after a
            uniqueness violation. Note that this is the inverse of the
            `created` flag returned by Django's `QuerySet.get_or_create`.
        """
        from django.db import transaction

        if isinstance(value, (int, float, str, bool)):
            # Simple types: int, float, str, bool
            value_hash = None
            lookup = {"value": value}
        else:
            # Complex types: dict, list
            value_hash = hash_dict(value)
            lookup = {"hash": value_hash}
        try:
            # The savepoint keeps an enclosing transaction usable after the
            # IntegrityError, so that the fallback query below can run.
            with transaction.atomic():
                record = cls.objects.create(
                    feature=feature, value=value, hash=value_hash
                )
        except IntegrityError:
            return cls.objects.get(feature=feature, **lookup), True
        return record, False
1979
+
1980
+
1981
class Schema(Record, CanCurate, TracksRun):
    """Feature sets (dataset schemas).

    Stores references to dataset schemas: these are the sets of columns in a dataset
    that correspond to :class:`~lamindb.Feature`, :class:`~bionty.Gene`, :class:`~bionty.Protein` or other
    entities.

    .. dropdown:: Why does LaminDB model feature sets, not just features?

        1. Performance: Imagine you measure the same panel of 20k transcripts in
           1M samples. By modeling the panel as a feature set, you can link all
           your artifacts against one feature set and only need to store 1M
           instead of 1M x 20k = 20B links.
        2. Interpretation: Model protein panels, gene panels, etc.
        3. Data integration: Feature sets provide the information that determines whether two datasets can be meaningfully concatenated.

        These reasons do not hold for label sets. Hence, LaminDB does not model label sets.

    Args:
        features: `Iterable[Record]` An iterable of :class:`~lamindb.Feature`
            records to hash, e.g., `[Feature(...), Feature(...)]`. Is turned into
            a set upon instantiation. If you'd like to pass values, use
            :meth:`~lamindb.Schema.from_values` or
            :meth:`~lamindb.Schema.from_df`.
        dtype: `str | None = None` The simple type. Defaults to
            `None` for sets of :class:`~lamindb.Feature` records.
            Otherwise defaults to `"num"` (e.g., for sets of :class:`~bionty.Gene`).
        name: `str | None = None` A name.

    Note:

        A feature set can be identified by the `hash` of its feature uids.
        It's stored in the `.hash` field.

        A `slot` provides a string key to access feature sets.
        It's typically the accessor within the registered data object, here `pd.DataFrame.columns`.

    See Also:
        :meth:`~lamindb.Schema.from_values`
            Create from values.
        :meth:`~lamindb.Schema.from_df`
            Create from dataframe columns.

    Examples:

        Create a feature set / schema from df with types:

        >>> df = pd.DataFrame({"feat1": [1, 2], "feat2": [3.1, 4.2], "feat3": ["cond1", "cond2"]})
        >>> feature_set = ln.Schema.from_df(df)

        Create a feature set / schema from features:

        >>> features = [ln.Feature(name=feat, dtype="float").save() for feat in ["feat1", "feat2"]]
        >>> feature_set = ln.Schema(features)

        Create a feature set / schema from feature values:

        >>> import bionty as bt
        >>> feature_set = ln.Schema.from_values(adata.var["ensemble_id"], bt.Gene.ensembl_gene_id, organism="mouse").save()

        Link a feature set to an artifact:

        >>> artifact.features.add_feature_set(feature_set, slot="var")

    """

    # NOTE(review): `Meta` derives from `TracksUpdates.Meta` although the class
    # itself does not list `TracksUpdates` as a base — confirm this is intended.
    class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta):
        abstract = False

    _name_field: str = "name"

    id: int = models.AutoField(primary_key=True)
    """Internal id, valid only in one DB instance."""
    uid: str = CharField(unique=True, db_index=True, max_length=20)
    """A universal id (hash of the set of feature values)."""
    name: str | None = CharField(max_length=150, null=True)
    """A name."""
    n: int = IntegerField()
    """Number of features in the set."""
    dtype: str | None = CharField(max_length=64, null=True)
    """Data type, e.g., "num", "float", "int". Is `None` for :class:`~lamindb.Feature`.

    For :class:`~lamindb.Feature`, types are expected to be heterogeneous and defined on a per-feature level.
    """
    # _itype: ContentType = models.ForeignKey(ContentType, on_delete=models.CASCADE)
    # ""Index of the registry that stores the feature identifiers, e.g., `Feature` or `Gene`."""
    itype: str | None = CharField(max_length=120, db_index=True, null=True)
    """A registry that stores feature identifiers used in this schema, e.g., `'Feature'` or `'bionty.Gene'`.

    Depending on the registry, `.members` stores, e.g., `Feature` or `bionty.Gene` records.

    .. versionchanged:: 1.0.0
        Was called `registry` before.
    """
    # The foreign key targets "self", hence the related records are schemas.
    type: Schema | None = ForeignKey(
        "self", PROTECT, null=True, related_name="records"
    )
    """Type of feature set (e.g., 'ExpressionPanel', 'ProteinPanel', 'Multimodal', 'Metadata', 'Embedding').

    Allows to group feature sets by type, e.g., all measurements evaluating gene expression vs. protein expression vs. multi modal.
    """
    records: Schema
    """Records of this type."""
    is_type: bool = BooleanField(default=None, db_index=True, null=True)
    """Distinguish types from instances of the type."""
    otype: str | None = CharField(max_length=64, db_index=True, null=True)
    """Default Python object type, e.g., DataFrame, AnnData."""
    hash: str | None = CharField(max_length=HASH_LENGTH, db_index=True, null=True)
    """A hash of the set of feature identifiers.

    For a composite schema, the hash of hashes.
    """
    minimal_set: bool = BooleanField(default=True, db_index=True)
    """Whether the schema contains a minimal set of linked features (default `True`).

    If `False`, no features are linked to this schema.

    If `True`, features are linked and considered as a minimally required set in validation.
    """
    ordered_set: bool = BooleanField(default=False, db_index=True)
    """Whether the linked features are ordered (default `False`)."""
    maximal_set: bool = BooleanField(default=False, db_index=True)
    """If `False`, additional features are allowed (default `False`).

    If `True`, the minimal set is a maximal set and no additional features are allowed.
    """
    composite: Schema | None = ForeignKey(
        "self", PROTECT, related_name="components", default=None, null=True
    )
    """The composite schema that contains this schema as a component.

    The composite schema composes multiple simpler schemas into one object.

    For example, an AnnData composes multiple schemas: `var[DataFrameT]`, `obs[DataFrame]`, `obsm[Array]`, `uns[dict]`, etc.
    """
    slot: str | None = CharField(max_length=100, db_index=True, null=True)
    """The slot in which the schema is stored in the composite schema."""
    validated_by: Schema | None = ForeignKey(
        "self", PROTECT, related_name="validated_schemas", default=None, null=True
    )
    """The schema that validated this schema during curation.

    When performing validation, the schema that enforced validation is often less concrete than what is validated.

    For instance, the set of measured features might be a superset of the minimally required set of features.

    Often, the curating schema does not specify any concrete features at all.
    """
    features: Feature
    """The features contained in the schema."""
    params: Param
    """The params contained in the schema."""
    artifacts: Artifact
    """The artifacts that observe this schema."""
    # NOTE(review): nullable JSON field without a docstring; its contents are
    # not visible from this class.
    _curation: dict[str, Any] = JSONField(default=None, db_default=None, null=True)

    @overload
    def __init__(
        self,
        features: Iterable[Record],
        dtype: str | None = None,
        name: str | None = None,
    ): ...

    @overload
    def __init__(
        self,
        *db_args,
    ): ...

    def __init__(
        self,
        *args,
        **kwargs,
    ):
        # Stub body: nothing is executed here.
        # NOTE(review): presumably the real constructor is attached elsewhere
        # in the package — confirm.
        pass

    @classmethod
    def from_values(  # type: ignore
        cls,
        values: ListLike,
        field: FieldAttr = Feature.name,
        type: str | None = None,
        name: str | None = None,
        mute: bool = False,
        organism: Record | str | None = None,
        source: Record | None = None,
        raise_validation_error: bool = True,
    ) -> Schema:
        """Create feature set for validated features.

        Args:
            values: A list of values, like feature names or ids.
            field: The field of a reference registry to map values.
            type: The simple type.
                Defaults to `None` if reference registry is :class:`~lamindb.Feature`,
                defaults to `"float"` otherwise.
            name: A name.
            organism: An organism to resolve gene mapping.
            source: A public ontology to resolve feature identifier mapping.
            raise_validation_error: Whether to raise a validation error if some values are not valid.

        Raises:
            ValidationError: If some values are not valid.

        Examples:

            >>> features = [ln.Feature(name=feat, dtype="str").save() for feat in ["feat11", "feat21"]]
            >>> schema = ln.Schema.from_values(features)

            >>> genes = ["ENSG00000139618", "ENSG00000198786"]
            >>> schema = ln.Schema.from_values(genes, bt.Gene.ensembl_gene_id, "float")
        """
        pass

    @classmethod
    def from_df(
        cls,
        df: pd.DataFrame,
        field: FieldAttr = Feature.name,
        name: str | None = None,
        mute: bool = False,
        organism: Record | str | None = None,
        source: Record | None = None,
    ) -> Schema | None:
        """Create feature set for validated features."""
        pass

    def save(self, *args, **kwargs) -> Schema:
        """Save."""
        pass

    @property
    def members(self) -> QuerySet:
        """A queryset for the individual records of the set."""
        pass

    # Deprecated alias of `itype`; both getter and setter forward to that field.
    @property
    @deprecated("itype")
    def registry(self) -> str | None:
        return self.itype

    @registry.setter
    def registry(self, value) -> None:
        self.itype = value
2226
+
2227
+
2228
+ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2229
+ """Datasets & models stored as files, folders, or arrays.
2230
+
2231
+ Artifacts manage data in local or remote storage.
2232
+
2233
+ Some artifacts are array-like, e.g., when stored as `.parquet`, `.h5ad`,
2234
+ `.zarr`, or `.tiledb`.
2235
+
2236
+ Args:
2237
+ data: `UPathStr` A path to a local or remote folder or file.
2238
+ type: `Literal["dataset", "model"] | None = None` The artifact type.
2239
+ key: `str | None = None` A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
2240
+ description: `str | None = None` A description.
2241
+ revises: `Artifact | None = None` Previous version of the artifact. Triggers a revision.
2242
+ run: `Run | None = None` The run that creates the artifact.
2243
+
2244
+ .. dropdown:: Typical storage formats & their API accessors
2245
+
2246
+ Arrays:
2247
+
2248
+ - Table: `.csv`, `.tsv`, `.parquet`, `.ipc` ⟷ `DataFrame`, `pyarrow.Table`
2249
+ - Annotated matrix: `.h5ad`, `.h5mu`, `.zrad` ⟷ `AnnData`, `MuData`
2250
+ - Generic array: HDF5 group, zarr group, TileDB store ⟷ HDF5, zarr, TileDB loaders
2251
+
2252
+ Non-arrays:
2253
+
2254
+ - Image: `.jpg`, `.png` ⟷ `np.ndarray`, ...
2255
+ - Fastq: `.fastq` ⟷ /
2256
+ - VCF: `.vcf` ⟷ /
2257
+ - QC: `.html` ⟷ /
2258
+
2259
+ You'll find these values in the `suffix` & `accessor` fields.
2260
+
2261
+ LaminDB makes some default choices (e.g., serialize a `DataFrame` as a `.parquet` file).
2262
+
2263
+ See Also:
2264
+ :class:`~lamindb.Storage`
2265
+ Storage locations for artifacts.
2266
+ :class:`~lamindb.Collection`
2267
+ Collections of artifacts.
2268
+ :meth:`~lamindb.Artifact.from_df`
2269
+ Create an artifact from a `DataFrame`.
2270
+ :meth:`~lamindb.Artifact.from_anndata`
2271
+ Create an artifact from an `AnnData`.
2272
+
2273
+ Examples:
2274
+
2275
+ Create an artifact from a file path and pass `description`:
2276
+
2277
+ >>> artifact = ln.Artifact("s3://my_bucket/my_folder/my_file.csv", description="My file")
2278
+ >>> artifact = ln.Artifact("./my_local_file.jpg", description="My image")
2279
+
2280
+ You can also pass `key` to create a virtual filepath hierarchy:
2281
+
2282
+ >>> artifact = ln.Artifact("./my_local_file.jpg", key="example_datasets/dataset1.jpg")
2283
+
2284
+ What works for files also works for folders:
2285
+
2286
+ >>> artifact = ln.Artifact("s3://my_bucket/my_folder", description="My folder")
2287
+ >>> artifact = ln.Artifact("./my_local_folder", description="My local folder")
2288
+ >>> artifact = ln.Artifact("./my_local_folder", key="project1/my_target_folder")
2289
+
2290
+ .. dropdown:: Why does the API look this way?
2291
+
2292
+ It's inspired by APIs building on AWS S3.
2293
+
2294
+ Both boto3 and quilt select a bucket (akin to default storage in LaminDB) and define a target path through a `key` argument.
2295
+
2296
+ In `boto3 <https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/bucket/upload_file.html>`__::
2297
+
2298
+ # signature: S3.Bucket.upload_file(filepath, key)
2299
+ import boto3
2300
+ s3 = boto3.resource('s3')
2301
+ bucket = s3.Bucket('mybucket')
2302
+ bucket.upload_file('/tmp/hello.txt', 'hello.txt')
2303
+
2304
+ In `quilt3 <https://docs.quiltdata.com/api-reference/bucket>`__::
2305
+
2306
+ # signature: quilt3.Bucket.put_file(key, filepath)
2307
+ import quilt3
2308
+ bucket = quilt3.Bucket('mybucket')
2309
+ bucket.put_file('hello.txt', '/tmp/hello.txt')
2310
+
2311
+
2312
+ Make a new version of an artifact:
2313
+
2314
+ >>> artifact = ln.Artifact.from_df(df, key="example_datasets/dataset1.parquet").save()
2315
+ >>> artifact_v2 = ln.Artifact(df_updated, key="example_datasets/dataset1.parquet").save()
2316
+
2317
+ Alternatively, if you don't want to provide a value for `key`, you can use `revises`:
2318
+
2319
+ >>> artifact = ln.Artifact.from_df(df, description="My dataframe").save()
2320
+ >>> artifact_v2 = ln.Artifact(df_updated, revises=artifact).save()
2321
+
2322
+ """
2323
+
2324
+ class Meta(Record.Meta, IsVersioned.Meta, TracksRun.Meta, TracksUpdates.Meta):
2325
+ abstract = False
2326
+
2327
+ _len_full_uid: int = 20
2328
+ _len_stem_uid: int = 16
2329
+
2330
+ params: ParamManager = ParamManagerArtifact # type: ignore
2331
+ """Param manager.
2332
+
2333
+ Example::
2334
+
2335
+ artifact.params.add_values({
2336
+ "hidden_size": 32,
2337
+ "bottleneck_size": 16,
2338
+ "batch_size": 32,
2339
+ "preprocess_params": {
2340
+ "normalization_type": "cool",
2341
+ "subset_highlyvariable": True,
2342
+ },
2343
+ })
2344
+ """
2345
+
2346
+ features: FeatureManager = FeatureManager # type: ignore
2347
+ """Feature manager.
2348
+
2349
+ Features denote dataset dimensions, i.e., the variables that measure labels & numbers.
2350
+
2351
+ Annotate with features & values::
2352
+
2353
+ artifact.features.add_values({
2354
+ "species": organism, # here, organism is an Organism record
2355
+ "scientist": ['Barbara McClintock', 'Edgar Anderson'],
2356
+ "temperature": 27.6,
2357
+ "study": "Candidate marker study"
2358
+ })
2359
+
2360
+ Query for features & values::
2361
+
2362
+ ln.Artifact.features.filter(scientist="Barbara McClintock")
2363
+
2364
+ Features may or may not be part of the artifact content in storage. For
2365
+ instance, the :class:`~lamindb.Curator` flow validates the columns of a
2366
+ `DataFrame`-like artifact and annotates it with features corresponding to
2367
+ these columns. `artifact.features.add_values`, by contrast, does not
2368
+ validate the content of the artifact.
2369
+ """
2370
+
2371
+ @property
2372
+ def labels(self) -> LabelManager:
2373
+ """Label manager.
2374
+
2375
+ To annotate with labels, you typically use the registry-specific accessors,
2376
+ for instance :attr:`~lamindb.Artifact.ulabels`::
2377
+
2378
+ candidate_marker_study = ln.ULabel(name="Candidate marker study").save()
2379
+ artifact.ulabels.add(candidate_marker_study)
2380
+
2381
+ Similarly, you query based on these accessors::
2382
+
2383
+ ln.Artifact.filter(ulabels__name="Candidate marker study").all()
2384
+
2385
+ Unlike the registry-specific accessors, the `.labels` accessor provides
2386
+ a way of associating labels with features::
2387
+
2388
+ study = ln.Feature(name="study", dtype="cat").save()
2389
+ artifact.labels.add(candidate_marker_study, feature=study)
2390
+
2391
+ Note that the above is equivalent to::
2392
+
2393
+ artifact.features.add_values({"study": candidate_marker_study})
2394
+ """
2395
+ from lamindb.core._label_manager import LabelManager
2396
+
2397
+ return LabelManager(self)
2398
+
2399
+ id: int = models.AutoField(primary_key=True)
2400
+ """Internal id, valid only in one DB instance."""
2401
+ uid: str = CharField(unique=True, db_index=True, max_length=_len_full_uid)
2402
+ """A universal random id."""
2403
+ key: str | None = CharField(db_index=True, null=True)
2404
+ """A (virtual) relative file path within the artifact's storage location.
2405
+
2406
+ Setting a `key` is useful to automatically group artifacts into a version family.
2407
+
2408
+ LaminDB defaults to a virtual file path to make renaming of data in object storage easy.
2409
+
2410
+ If you register existing files in a storage location, the `key` equals the
2411
+ actual filepath on the underlying filesystem or object store.
2412
+ """
2413
+ description: str | None = CharField(db_index=True, null=True)
2414
+ """A description.
2415
+
2416
+ LaminDB doesn't require you to pass a key, you can
2417
+ """
2418
+ storage: Storage = ForeignKey(Storage, PROTECT, related_name="artifacts")
2419
+ """Storage location, e.g. an S3 or GCP bucket or a local directory."""
2420
+ suffix: str = CharField(max_length=30, db_index=True)
2421
+ # Initially, we thought about having this be nullable to indicate folders
2422
+ # But, for instance, .zarr is stored in a folder that ends with a .zarr suffix
2423
+ """Path suffix or empty string if no canonical suffix exists.
2424
+
2425
+ This is either a file suffix (`".csv"`, `".h5ad"`, etc.) or the empty string "".
2426
+ """
2427
+ kind: ArtifactKind | None = CharField(
2428
+ max_length=20,
2429
+ db_index=True,
2430
+ null=True,
2431
+ )
2432
+ """:class:`~lamindb.base.types.ArtifactKind` (default `None`)."""
2433
+ otype: str | None = CharField(max_length=64, db_index=True, null=True)
2434
+ """Default Python object type, e.g., DataFrame, AnnData."""
2435
+ size: int | None = BigIntegerField(null=True, db_index=True, default=None)
2436
+ """Size in bytes.
2437
+
2438
+ Examples: 1KB is 1e3 bytes, 1MB is 1e6, 1GB is 1e9, 1TB is 1e12 etc.
2439
+ """
2440
+ hash: str | None = CharField(max_length=HASH_LENGTH, db_index=True, null=True)
2441
+ """Hash or pseudo-hash of artifact content.
2442
+
2443
+ Useful to ascertain integrity and avoid duplication.
2444
+ """
2445
+ n_files: int | None = BigIntegerField(null=True, db_index=True, default=None)
2446
+ """Number of files for folder-like artifacts, `None` for file-like artifacts.
2447
+
2448
+ Note that some arrays are also stored as folders, e.g., `.zarr` or `.tiledbsoma`.
2449
+
2450
+ .. versionchanged:: 1.0
2451
+ Renamed from `n_objects` to `n_files`.
2452
+ """
2453
+ n_observations: int | None = BigIntegerField(null=True, db_index=True, default=None)
2454
+ """Number of observations.
2455
+
2456
+ Typically, this denotes the first array dimension.
2457
+ """
2458
+ _hash_type: str | None = CharField(max_length=30, db_index=True, null=True)
2459
+ """Type of hash."""
2460
+ ulabels: ULabel = models.ManyToManyField(
2461
+ ULabel, through="ArtifactULabel", related_name="artifacts"
2462
+ )
2463
+ """The ulabels measured in the artifact (:class:`~lamindb.ULabel`)."""
2464
+ run: Run | None = ForeignKey(
2465
+ Run, PROTECT, related_name="output_artifacts", null=True, default=None
2466
+ )
2467
+ """Run that created the artifact."""
2468
+ input_of_runs: Run = models.ManyToManyField(Run, related_name="input_artifacts")
2469
+ """Runs that use this artifact as an input."""
2470
+ # if the artifact is replicated or updated in a new run, we link the previous
2471
+ # run in previous_runs
2472
+ _previous_runs: Run = models.ManyToManyField(
2473
+ "Run", related_name="_output_artifacts_with_later_updates"
2474
+ )
2475
+ """Sequence of runs that created or updated the record."""
2476
+ collections: Collection
2477
+ """The collections that this artifact is part of."""
2478
+ schema: Schema | None = ForeignKey(
2479
+ Schema, PROTECT, null=True, default=None, related_name="artifacts"
2480
+ )
2481
+ """The schema of the artifact (to be populated in lamindb 1.1)."""
2482
+ _schemas_m2m: Schema = models.ManyToManyField(
2483
+ Schema, related_name="_artifacts_m2m", through="ArtifactSchema"
2484
+ )
2485
+ """[For backward compatibility] The feature sets measured in the artifact."""
2486
+ _feature_values: FeatureValue = models.ManyToManyField(
2487
+ FeatureValue, through="ArtifactFeatureValue", related_name="artifacts"
2488
+ )
2489
+ """Non-categorical feature values for annotation."""
2490
+ _param_values: ParamValue = models.ManyToManyField(
2491
+ ParamValue, through="ArtifactParamValue", related_name="artifacts"
2492
+ )
2493
+ """Parameter values."""
2494
+ _key_is_virtual: bool = BooleanField()
2495
+ """Indicates whether `key` is virtual or part of an actual file path."""
2496
+ # be mindful that below, passing related_name="+" leads to errors
2497
+ _actions: Artifact = models.ManyToManyField(
2498
+ "self", symmetrical=False, related_name="_action_targets"
2499
+ )
2500
+ """Actions to attach for the UI."""
2501
+ created_by: User = ForeignKey(
2502
+ "lamindb.User",
2503
+ PROTECT,
2504
+ default=current_user_id,
2505
+ related_name="created_artifacts",
2506
+ )
2507
+ """Creator of record."""
2508
+ _overwrite_versions: bool = BooleanField(default=None)
2509
+ """Indicates whether to store or overwrite versions.
2510
+
2511
+ It defaults to False for file-like artifacts and to True for folder-like artifacts.
2512
+ """
2513
+
2514
+ @overload
2515
+ def __init__(
2516
+ self,
2517
+ # we're not choosing the name "path" for this arg because
2518
+ # it'd be confusing with `artifact.path`, which is not the same
2519
+ # so "data" conveys better that this is input data that's ingested
2520
+ # and will be moved to a target path at `artifact.path`
2521
+ # also internally, we sometimes pass "data objects" like a DataFrame
2522
+ # here; and we might refactor this but we might also keep that internal
2523
+ # usage
2524
+ data: UPathStr,
2525
+ type: ArtifactKind | None = None,
2526
+ key: str | None = None,
2527
+ description: str | None = None,
2528
+ revises: Artifact | None = None,
2529
+ run: Run | None = None,
2530
+ ): ...
2531
+
2532
+ @overload
2533
+ def __init__(
2534
+ self,
2535
+ *db_args,
2536
+ ): ...
2537
+
2538
+ def __init__(
2539
+ self,
2540
+ *args,
2541
+ **kwargs,
2542
+ ):
2543
+ pass
2544
+
2545
+ @property
2546
+ @deprecated("kind")
2547
+ def type(self) -> str:
2548
+ return self.kind
2549
+
2550
+ @property
2551
+ @deprecated("otype")
2552
+ def _accessor(self) -> str:
2553
+ return self.otype
2554
+
2555
+ @property
2556
+ def transform(self) -> Transform | None:
2557
+ """Transform whose run created the artifact."""
2558
+ return self.run.transform if self.run is not None else None
2559
+
2560
+ @property
2561
+ @deprecated("n_files")
2562
+ def n_objects(self) -> int:
2563
+ return self.n_files
2564
+
2565
+ @property
2566
+ def feature_sets(self) -> QuerySet[Schema]:
2567
+ """Feature sets linked to this artifact."""
2568
+ return self._schemas_m2m
2569
+
2570
+ # add the below because this is what people will have in their code
2571
+ # if they implement the recommended migration strategy
2572
+ # - FeatureSet -> Schema
2573
+ # - featureset -> schema
2574
+ # - feature_set -> schema
2575
+ # @property
2576
+ # def schemas(self) -> QuerySet[Schema]:
2577
+ # """Schemas linked to artifact via many-to-many relationship.
2578
+
2579
+ # Is now mediating the private `._schemas_m2m` relationship during
2580
+ # a transition period to better schema management.
2581
+
2582
+ # .. versionchanged:: 1.0
2583
+ # Was previously called `.feature_sets`.
2584
+
2585
+ # """
2586
+ # return self._schemas_m2m
2587
+
2588
+ @property
2589
+ def path(self) -> Path:
2590
+ """Path.
2591
+
2592
+ File in cloud storage, here AWS S3:
2593
+
2594
+ >>> artifact = ln.Artifact("s3://my-bucket/my-file.csv").save()
2595
+ >>> artifact.path
2596
+ S3Path('s3://my-bucket/my-file.csv')
2597
+
2598
+ File in local storage:
2599
+
2600
+ >>> ln.Artifact("./myfile.csv", key="myfile").save()
2601
+ >>> artifact = ln.Artifact.get(key="myfile")
2602
+ >>> artifact.path
2603
+ PosixPath('/home/runner/work/lamindb/lamindb/docs/guide/mydata/myfile.csv')
2604
+ """
2605
+ pass
2606
+
2607
+ @classmethod
2608
+ def from_df(
2609
+ cls,
2610
+ df: pd.DataFrame,
2611
+ key: str | None = None,
2612
+ description: str | None = None,
2613
+ run: Run | None = None,
2614
+ revises: Artifact | None = None,
2615
+ **kwargs,
2616
+ ) -> Artifact:
2617
+ """Create from `DataFrame`, validate & link features.
2618
+
2619
+ Args:
2620
+ df: A `DataFrame` object.
2621
+ key: A relative path within default storage,
2622
+ e.g., `"myfolder/myfile.parquet"`.
2623
+ description: A description.
2624
+ revises: An old version of the artifact.
2625
+ run: The run that creates the artifact.
2626
+
2627
+ See Also:
2628
+ :class:`~lamindb.Collection`
2629
+ Track collections.
2630
+ :class:`~lamindb.Feature`
2631
+ Track features.
2632
+
2633
+ Examples:
2634
+ >>> df = ln.core.datasets.df_iris_in_meter_batch1()
2635
+ >>> df.head()
2636
+ sepal_length sepal_width petal_length petal_width iris_organism_code
2637
+ 0 0.051 0.035 0.014 0.002 0
2638
+ 1 0.049 0.030 0.014 0.002 0
2639
+ 2 0.047 0.032 0.013 0.002 0
2640
+ 3 0.046 0.031 0.015 0.002 0
2641
+ 4 0.050 0.036 0.014 0.002 0
2642
+ >>> artifact = ln.Artifact.from_df(df, description="Iris flower collection batch1")
2643
+ >>> artifact.save()
2644
+ """
2645
+ pass
2646
+
2647
+ @classmethod
2648
+ def from_anndata(
2649
+ cls,
2650
+ adata: AnnData | UPathStr,
2651
+ key: str | None = None,
2652
+ description: str | None = None,
2653
+ run: Run | None = None,
2654
+ revises: Artifact | None = None,
2655
+ **kwargs,
2656
+ ) -> Artifact:
2657
+ """Create from ``AnnData``, validate & link features.
2658
+
2659
+ Args:
2660
+ adata: An `AnnData` object or a path to an AnnData-like object.
2661
+ key: A relative path within default storage,
2662
+ e.g., `"myfolder/myfile.h5ad"`.
2663
+ description: A description.
2664
+ revises: An old version of the artifact.
2665
+ run: The run that creates the artifact.
2666
+
2667
+ See Also:
2668
+
2669
+ :class:`~lamindb.Collection`
2670
+ Track collections.
2671
+ :class:`~lamindb.Feature`
2672
+ Track features.
2673
+
2674
+ Examples:
2675
+ >>> import bionty as bt
2676
+ >>> bt.settings.organism = "human"
2677
+ >>> adata = ln.core.datasets.anndata_with_obs()
2678
+ >>> artifact = ln.Artifact.from_anndata(adata, description="mini anndata with obs")
2679
+ >>> artifact.save()
2680
+ """
2681
+ pass
2682
+
2683
+ @classmethod
2684
+ def from_mudata(
2685
+ cls,
2686
+ mdata: MuData,
2687
+ key: str | None = None,
2688
+ description: str | None = None,
2689
+ run: Run | None = None,
2690
+ revises: Artifact | None = None,
2691
+ **kwargs,
2692
+ ) -> Artifact:
2693
+ """Create from ``MuData``, validate & link features.
2694
+
2695
+ Args:
2696
+ mdata: A `MuData` object.
2697
+ key: A relative path within default storage,
2698
+ e.g., `"myfolder/myfile.h5mu"`.
2699
+ description: A description.
2700
+ revises: An old version of the artifact.
2701
+ run: The run that creates the artifact.
2702
+
2703
+ See Also:
2704
+ :class:`~lamindb.Collection`
2705
+ Track collections.
2706
+ :class:`~lamindb.Feature`
2707
+ Track features.
2708
+
2709
+ Examples:
2710
+ >>> import bionty as bt
2711
+ >>> bt.settings.organism = "human"
2712
+ >>> mdata = ln.core.datasets.mudata_papalexi21_subset()
2713
+ >>> artifact = ln.Artifact.from_mudata(mdata, description="a mudata object")
2714
+ >>> artifact.save()
2715
+ """
2716
+ pass
2717
+
2718
+ @classmethod
2719
+ def from_dir(
2720
+ cls,
2721
+ path: UPathStr,
2722
+ key: str | None = None,
2723
+ *,
2724
+ run: Run | None = None,
2725
+ ) -> list[Artifact]:
2726
+ """Create a list of artifact objects from a directory.
2727
+
2728
+ Hint:
2729
+ If you have a high number of files (several 100k) and don't want to
2730
+ track them individually, create a single :class:`~lamindb.Artifact` via
2731
+ ``Artifact(path)`` for them. See, e.g., :doc:`docs:rxrx`.
2732
+
2733
+ Args:
2734
+ path: Source path of folder.
2735
+ key: Key for storage destination. If `None` and
2736
+ directory is in a registered location, the inferred `key` will
2737
+ reflect the relative position. If `None` and directory is outside
2738
+ of a registered storage location, the inferred key defaults to `path.name`.
2739
+ run: A `Run` object.
2740
+
2741
+ Examples:
2742
+ >>> dir_path = ln.core.datasets.generate_cell_ranger_files("sample_001", ln.settings.storage)
2743
+ >>> artifacts = ln.Artifact.from_dir(dir_path)
2744
+ >>> ln.save(artifacts)
2745
+ """
2746
+ pass
2747
+
2748
+ def replace(
2749
+ self,
2750
+ data: UPathStr,
2751
+ run: Run | None = None,
2752
+ format: str | None = None,
2753
+ ) -> None:
2754
+ """Replace artifact content.
2755
+
2756
+ Args:
2757
+ data: A file path.
2758
+ run: The run that created the artifact gets
2759
+ auto-linked if ``ln.track()`` was called.
2760
+
2761
+ Examples:
2762
+ Say we made a change to the content of an artifact, e.g., edited the image
2763
+ `paradisi05_laminopathic_nuclei.jpg`.
2764
+
2765
+ This is how we replace the old file in storage with the new file:
2766
+
2767
+ >>> artifact.replace("paradisi05_laminopathic_nuclei.jpg")
2768
+ >>> artifact.save()
2769
+
2770
+ Note that this neither changes the storage key nor the filename.
2771
+
2772
+ However, it will update the suffix if it changes.
2773
+ """
2774
+ pass
2775
+
2776
+ def open(
2777
+ self, mode: str = "r", is_run_input: bool | None = None
2778
+ ) -> (
2779
+ AnnDataAccessor
2780
+ | BackedAccessor
2781
+ | SOMACollection
2782
+ | SOMAExperiment
2783
+ | PyArrowDataset
2784
+ ):
2785
+ """Return a cloud-backed data object.
2786
+
2787
+ Works for `AnnData` (`.h5ad` and `.zarr`), generic `hdf5` and `zarr`,
2788
+ `tiledbsoma` objects (`.tiledbsoma`), `pyarrow` compatible formats.
2789
+
2790
+ Args:
2791
+ mode: can only be `"w"` (write mode) for `tiledbsoma` stores,
2792
+ otherwise should be always `"r"` (read-only mode).
2793
+
2794
+ Notes:
2795
+ For more info, see tutorial: :doc:`/arrays`.
2796
+
2797
+ Examples:
2798
+
2799
+ Read AnnData in backed mode from cloud:
2800
+
2801
+ >>> artifact = ln.Artifact.get(key="lndb-storage/pbmc68k.h5ad")
2802
+ >>> artifact.open()
2803
+ AnnDataAccessor object with n_obs × n_vars = 70 × 765
2804
+ constructed for the AnnData object pbmc68k.h5ad
2805
+ ...
2806
+ """
2807
+ pass
2808
+
2809
+ def load(self, is_run_input: bool | None = None, **kwargs) -> Any:
2810
+ """Cache and load into memory.
2811
+
2812
+ See all :mod:`~lamindb.core.loaders`.
2813
+
2814
+ Examples:
2815
+
2816
+ Load a `DataFrame`-like artifact:
2817
+
2818
+ >>> artifact.load().head()
2819
+ sepal_length sepal_width petal_length petal_width iris_organism_code
2820
+ 0 0.051 0.035 0.014 0.002 0
2821
+ 1 0.049 0.030 0.014 0.002 0
2822
+ 2 0.047 0.032 0.013 0.002 0
2823
+ 3 0.046 0.031 0.015 0.002 0
2824
+ 4 0.050 0.036 0.014 0.002 0
2825
+
2826
+ Load an `AnnData`-like artifact:
2827
+
2828
+ >>> artifact.load()
2829
+ AnnData object with n_obs × n_vars = 70 × 765
2830
+
2831
+ Fall back to :meth:`~lamindb.Artifact.cache` if no in-memory representation is configured:
2832
+
2833
+ >>> artifact.load()
2834
+ PosixPath('/home/runner/work/lamindb/lamindb/docs/guide/mydata/.lamindb/jb7BY5UJoQVGMUOKiLcn.jpg')
2835
+ """
2836
+ pass
2837
+
2838
+ def cache(self, is_run_input: bool | None = None) -> Path:
2839
+ """Download cloud artifact to local cache.
2840
+
2841
+ Follows synching logic: only caches an artifact if it's outdated in the local cache.
2842
+
2843
+ Returns a path to a locally cached on-disk object (say a `.jpg` file).
2844
+
2845
+ Examples:
2846
+
2847
+ Sync file from cloud and return the local path of the cache:
2848
+
2849
+ >>> artifact.cache()
2850
+ PosixPath('/home/runner/work/Caches/lamindb/lamindb-ci/lndb-storage/pbmc68k.h5ad')
2851
+ """
2852
+ pass
2853
+
2854
+ def delete(
2855
+ self, permanent: bool | None = None, storage: bool | None = None
2856
+ ) -> None:
2857
+ """Trash or permanently delete.
2858
+
2859
+ A first call to `.delete()` puts an artifact into the trash (sets `_branch_code` to `-1`).
2860
+ A second call permanently deletes the artifact.
2861
+ If it is a folder artifact with multiple versions, deleting a non-latest version
2862
+ will not delete the underlying storage by default (if `storage=True` is not specified).
2863
+ Deleting the latest version will delete all the versions for folder artifacts.
2864
+
2865
+ FAQ: :doc:`docs:faq/storage`
2866
+
2867
+ Args:
2868
+ permanent: Permanently delete the artifact (skip trash).
2869
+ storage: Indicate whether you want to delete the artifact in storage.
2870
+
2871
+ Examples:
2872
+
2873
+ For an `Artifact` object `artifact`, call:
2874
+
2875
+ >>> artifact = ln.Artifact.filter(key="some.csv").one()
2876
+ >>> artifact.delete() # delete a single file artifact
2877
+
2878
+ >>> artifact = ln.Artifact.filter(key="some.tiledbsoma", is_latest=False).first()
2879
+ >>> artifact.delete() # delete an old version, the data will not be deleted
2880
+
2881
+ >>> artifact = ln.Artifact.filter(key="some.tiledbsoma", is_latest=True).one()
2882
+ >>> artifact.delete() # delete all versions, the data will be deleted or prompted for deletion.
2883
+ """
2884
+ pass
2885
+
2886
+ def save(self, upload: bool | None = None, **kwargs) -> Artifact:
2887
+ """Save to database & storage.
2888
+
2889
+ Args:
2890
+ upload: Trigger upload to cloud storage in instances with hybrid storage mode.
2891
+
2892
+ Examples:
2893
+ >>> artifact = ln.Artifact("./myfile.csv", description="myfile")
2894
+ >>> artifact.save()
2895
+ """
2896
+ pass
2897
+
2898
+ def restore(self) -> None:
2899
+ """Restore from trash.
2900
+
2901
+ Examples:
2902
+
2903
+ For any `Artifact` object `artifact`, call:
2904
+
2905
+ >>> artifact.restore()
2906
+ """
2907
+ pass
2908
+
2909
+ def describe(self) -> None:
2910
+ """Describe relations of record.
2911
+
2912
+ Examples:
2913
+ >>> artifact.describe()
2914
+ """
2915
+ pass
2916
+
2917
+
2918
+ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
2919
+ """Collections of artifacts.
2920
+
2921
+ Collections provide a simple way of versioning collections of artifacts.
2922
+
2923
+ Args:
2924
+ artifacts: `list[Artifact]` A list of artifacts.
2925
+ name: `str` A name.
2926
+ description: `str | None = None` A description.
2927
+ revises: `Collection | None = None` An old version of the collection.
2928
+ run: `Run | None = None` The run that creates the collection.
2929
+ meta: `Artifact | None = None` An artifact that defines metadata for the collection.
2930
+ reference: `str | None = None` For instance, an external ID or a URL.
2931
+ reference_type: `str | None = None` For instance, `"url"`.
2932
+
2933
+ See Also:
2934
+ :class:`~lamindb.Artifact`
2935
+
2936
+ Examples:
2937
+
2938
+ Create a collection from a list of :class:`~lamindb.Artifact` objects:
2939
+
2940
+ >>> collection = ln.Collection([artifact1, artifact2], name="My collection")
2941
+
2942
+ Create a collection that groups a data & a metadata artifact (e.g., here :doc:`docs:rxrx`):
2943
+
2944
+ >>> collection = ln.Collection(data_artifact, name="My collection", meta=metadata_artifact)
2945
+
2946
+ """
2947
+
2948
+ class Meta(Record.Meta, IsVersioned.Meta, TracksRun.Meta, TracksUpdates.Meta):
2949
+ abstract = False
2950
+
2951
+ _len_full_uid: int = 20
2952
+ _len_stem_uid: int = 16
2953
+ _name_field: str = "key"
2954
+
2955
+ id: int = models.AutoField(primary_key=True)
2956
+ """Internal id, valid only in one DB instance."""
2957
+ uid: str = CharField(
2958
+ unique=True, db_index=True, max_length=_len_full_uid, default=base62_20
2959
+ )
2960
+ """Universal id, valid across DB instances."""
2961
+ key: str = CharField(db_index=True)
2962
+ """Name or path-like key."""
2963
+ description: str | None = TextField(null=True)
2964
+ """A description or title."""
2965
+ hash: str | None = CharField(max_length=HASH_LENGTH, db_index=True, null=True)
2966
+ """Hash of collection content. 86 base64 chars allow to store 64 bytes, 512 bits."""
2967
+ reference: str | None = CharField(max_length=255, db_index=True, null=True)
2968
+ """A reference like URL or external ID."""
2969
+ # also for reference_type here, we allow an extra long max_length
2970
+ reference_type: str | None = CharField(max_length=25, db_index=True, null=True)
2971
+ """Type of reference, e.g., cellxgene Census collection_id."""
2972
+ ulabels: ULabel = models.ManyToManyField(
2973
+ "ULabel", through="CollectionULabel", related_name="collections"
2974
+ )
2975
+ """ULabels sampled in the collection (see :class:`~lamindb.Feature`)."""
2976
+ run: Run | None = ForeignKey(
2977
+ Run, PROTECT, related_name="output_collections", null=True, default=None
2978
+ )
2979
+ """:class:`~lamindb.Run` that created the `collection`."""
2980
+ input_of_runs: Run = models.ManyToManyField(Run, related_name="input_collections")
2981
+ """Runs that use this collection as an input."""
2982
+ _previous_runs: Run = models.ManyToManyField(
2983
+ "Run", related_name="_output_collections_with_later_updates"
2984
+ )
2985
+ """Sequence of runs that created or updated the record."""
2986
+ artifacts: Artifact = models.ManyToManyField(
2987
+ "Artifact", related_name="collections", through="CollectionArtifact"
2988
+ )
2989
+ """Artifacts in collection."""
2990
+ meta_artifact: Artifact | None = OneToOneField(
2991
+ "Artifact",
2992
+ PROTECT,
2993
+ null=True,
2994
+ unique=True,
2995
+ related_name="_meta_of_collection",
2996
+ )
2997
+ """An artifact that stores metadata that indexes a collection.
2998
+
2999
+ It has a 1:1 correspondence with an artifact. If needed, you can access the
3000
+ collection from the artifact via a private field:
3001
+ `artifact._meta_of_collection`.
3002
+ """
3003
+ _actions: Artifact = models.ManyToManyField(Artifact, related_name="+")
3004
+ """Actions to attach for the UI."""
3005
+
3006
+ @overload
3007
+ def __init__(
3008
+ self,
3009
+ artifacts: list[Artifact],
3010
+ name: str,
3011
+ description: str | None = None,
3012
+ meta: Any | None = None,
3013
+ reference: str | None = None,
3014
+ reference_type: str | None = None,
3015
+ run: Run | None = None,
3016
+ revises: Collection | None = None,
3017
+ ): ...
3018
+
3019
+ @overload
3020
+ def __init__(
3021
+ self,
3022
+ *db_args,
3023
+ ): ...
3024
+
3025
+ def __init__(
3026
+ self,
3027
+ *args,
3028
+ **kwargs,
3029
+ ):
3030
+ pass
3031
+
3032
+ def append(self, artifact: Artifact, run: Run | None = None) -> Collection:
3033
+ """Add an artifact to the collection.
3034
+
3035
+ Creates a new version of the collection.
3036
+
3037
+ Args:
3038
+ artifact: An artifact to add to the collection.
3039
+ run: The run that creates the new version of the collection.
3040
+
3041
+ .. versionadded:: 0.76.14
3042
+ """
3043
+ pass
3044
+
3045
+ def mapped(
3046
+ self,
3047
+ layers_keys: str | list[str] | None = None,
3048
+ obs_keys: str | list[str] | None = None,
3049
+ obsm_keys: str | list[str] | None = None,
3050
+ obs_filter: dict[str, str | tuple[str, ...]] | None = None,
3051
+ join: Literal["inner", "outer"] | None = "inner",
3052
+ encode_labels: bool | list[str] = True,
3053
+ unknown_label: str | dict[str, str] | None = None,
3054
+ cache_categories: bool = True,
3055
+ parallel: bool = False,
3056
+ dtype: str | None = None,
3057
+ stream: bool = False,
3058
+ is_run_input: bool | None = None,
3059
+ ) -> MappedCollection:
3060
+ """Return a map-style dataset.
3061
+
3062
+ Returns a `pytorch map-style dataset
3063
+ <https://pytorch.org/docs/stable/data.html#map-style-datasets>`__ by
3064
+ virtually concatenating `AnnData` arrays.
3065
+
3066
+ If your `AnnData` collection is in the cloud, move them into a local
3067
+ cache first via :meth:`~lamindb.Collection.cache`.
3068
+
3069
+ `__getitem__` of the `MappedCollection` object takes a single integer index
3070
+ and returns a dictionary with the observation data sample for this index from
3071
+ the `AnnData` objects in the collection. The dictionary has keys for `layers_keys`
3072
+ (`.X` is in `"X"`), `obs_keys`, `obsm_keys` (under `f"obsm_{key}"`) and also `"_store_idx"`
3073
+ for the index of the `AnnData` object containing this observation sample.
3074
+
3075
+ .. note::
3076
+
3077
+ For a guide, see :doc:`docs:scrna-mappedcollection`.
3078
+
3079
+ This method currently only works for collections of `AnnData` artifacts.
3080
+
3081
+ Args:
3082
+ layers_keys: Keys from the ``.layers`` slot. ``layers_keys=None`` or ``"X"`` in the list
3083
+ retrieves ``.X``.
3084
+ obs_keys: Keys from the ``.obs`` slots.
3085
+ obsm_keys: Keys from the ``.obsm`` slots.
3086
+ obs_filter: Select only observations with these values for the given obs columns.
3087
+ Should be a dictionary with obs column names as keys
3088
+ and filtering values (a string or a tuple of strings) as values.
3089
+ join: `"inner"` or `"outer"` virtual joins. If ``None`` is passed,
3090
+ does not join.
3091
+ encode_labels: Encode labels into integers.
3092
+ Can be a list with elements from ``obs_keys``.
3093
+ unknown_label: Encode this label to -1.
3094
+ Can be a dictionary with keys from ``obs_keys`` if ``encode_labels=True``
3095
+ or from ``encode_labels`` if it is a list.
3096
+ cache_categories: Enable caching categories of ``obs_keys`` for faster access.
3097
+ parallel: Enable sampling with multiple processes.
3098
+ dtype: Convert numpy arrays from ``.X``, ``.layers`` and ``.obsm``
3099
+ stream: Whether to stream data from the array backend.
3100
+ is_run_input: Whether to track this collection as run input.
3101
+
3102
+ Examples:
3103
+ >>> import lamindb as ln
3104
+ >>> from torch.utils.data import DataLoader
3105
+ >>> collection = ln.Collection.get(description="my collection")
3106
+ >>> mapped = collection.mapped(obs_keys=["cell_type", "batch"])
3107
+ >>> dl = DataLoader(mapped, batch_size=128, shuffle=True)
3108
+ """
3109
+ pass
3110
+
3111
+ def cache(self, is_run_input: bool | None = None) -> list[UPath]:
3112
+ """Download cloud artifacts in collection to local cache.
3113
+
3114
+ Follows synching logic: only caches outdated artifacts.
3115
+
3116
+ Returns paths to locally cached on-disk artifacts.
3117
+
3118
+ Args:
3119
+ is_run_input: Whether to track this collection as run input.
3120
+ """
3121
+ pass
3122
+
3123
+ def load(
3124
+ self,
3125
+ join: Literal["inner", "outer"] = "outer",
3126
+ is_run_input: bool | None = None,
3127
+ **kwargs,
3128
+ ) -> Any:
3129
+ """Stage and load to memory.
3130
+
3131
+ Returns in-memory representation if possible such as a concatenated `DataFrame` or `AnnData` object.
3132
+ """
3133
+ pass
3134
+
3135
+ def delete(self, permanent: bool | None = None) -> None:
3136
+ """Delete collection.
3137
+
3138
+ Args:
3139
+ permanent: Whether to permanently delete the collection record (skips trash).
3140
+
3141
+ Examples:
3142
+
3143
+ For any `Collection` object `collection`, call:
3144
+
3145
+ >>> collection.delete()
3146
+ """
3147
+ pass
3148
+
3149
+ def save(self, using: str | None = None) -> Collection:
3150
+ """Save the collection and underlying artifacts to database & storage.
3151
+
3152
+ Args:
3153
+ using: The database to which you want to save.
3154
+
3155
+ Examples:
3156
+ >>> collection = ln.Collection([artifact1, artifact2], name="myfile")
3157
+ >>> collection.save()
3158
+ """
3159
+ pass
3160
+
3161
+ def restore(self) -> None:
3162
+ """Restore collection record from trash.
3163
+
3164
+ Examples:
3165
+
3166
+ For any `Collection` object `collection`, call:
3167
+
3168
+ >>> collection.restore()
3169
+ """
3170
+ pass
3171
+
3172
+ @property
3173
+ def transform(self) -> Transform | None:
3174
+ """Transform whose run created the collection."""
3175
+ return self.run.transform if self.run is not None else None
3176
+
3177
+ @property
3178
+ def name(self) -> str:
3179
+ """Name of the collection.
3180
+
3181
+ Splits `key` on `/` and returns the last element.
3182
+ """
3183
+ return self.key.split("/")[-1]
3184
+
3185
+ @property
3186
+ def ordered_artifacts(self) -> QuerySet:
3187
+ """Ordered `QuerySet` of `.artifacts`.
3188
+
3189
+ Accessing the many-to-many field `collection.artifacts` directly gives
3190
+ you non-deterministic order.
3191
+
3192
+ Using the property `.ordered_artifacts` allows you to iterate through a set
3193
+ that's ordered in the order of creation.
3194
+ """
3195
+ pass
3196
+
3197
+ @property
3198
+ def data_artifact(self) -> Artifact | None:
3199
+ """Access to a single data artifact.
3200
+
3201
+ If the collection has a single data & metadata artifact, this allows access via::
3202
+
3203
+ collection.data_artifact # first & only element of collection.artifacts
3204
+ collection.meta_artifact # metadata
3205
+
3206
+ """
3207
+ pass
3208
+
3209
+ def describe(self) -> None:
3210
+ """Describe relations of record.
3211
+
3212
+ Examples:
3213
+ >>> collection.describe()
3214
+ """
3215
+ pass
3216
+
3217
+
3218
+ # -------------------------------------------------------------------------------------
3219
+ # Project management
3220
+
3221
+
3222
+ class Person(Record, CanCurate, TracksRun, TracksUpdates, ValidateFields):
3223
+ """Persons.
3224
+
3225
+ This registry is distinct from `User` and purely exists for project management.
3226
+
3227
+ You'll soon be able to conveniently create persons from users.
3228
+
3229
+ Example:
3230
+ >>> person = Person(
3231
+ ... name="Jane Doe",
3232
+ ... email="jane.doe@example.com",
3233
+ ... external=False,
3234
+ ... ).save()
3235
+ """
3236
+
3237
+ class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta):
3238
+ abstract = False
3239
+
3240
+ id: int = models.AutoField(primary_key=True)
3241
+ """Internal id, valid only in one DB instance."""
3242
+ uid: str = CharField(unique=True, max_length=8, db_index=True, default=base62_8)
3243
+ """Universal id, valid across DB instances."""
3244
+ name: str = CharField(db_index=True)
3245
+ """Name of the person (forename(s) lastname)."""
3246
+ email: str | None = EmailField(null=True, default=None)
3247
+ """Email of the person."""
3248
+ external: bool = BooleanField(default=True, db_index=True)
3249
+ """Whether the person is external to the organization."""
3250
+
3251
+
3252
+ class Project(Record, CanCurate, TracksRun, TracksUpdates, ValidateFields):
3253
+ """Projects.
3254
+
3255
+ Example:
3256
+ >>> project = Project(
3257
+ ... name="My Project Name",
3258
+ ... abbr="MPN",
3259
+ ... url="https://example.com/my_project",
3260
+ ... ).save()
3261
+ """
3262
+
3263
+ class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta):
3264
+ abstract = False
3265
+
3266
+ id: int = models.AutoField(primary_key=True)
3267
+ """Internal id, valid only in one DB instance."""
3268
+ uid: str = CharField(unique=True, max_length=12, db_index=True, default=base62_12)
3269
+ """Universal id, valid across DB instances."""
3270
+ name: str = CharField(db_index=True)
3271
+ """Title or name of the Project."""
3272
+ type: str | None = CharField(max_length=64, db_index=True, null=True)
3273
+ """A free-form type."""
3274
+ abbr: str | None = CharField(max_length=32, db_index=True, null=True)
3275
+ """An abbreviation."""
3276
+ url: str | None = URLField(max_length=255, null=True, default=None)
3277
+ """A URL."""
3278
+ start_date: date | None = DateField(null=True, default=None)
3279
+ """Date of start of the project."""
3280
+ end_date: date | None = DateField(null=True, default=None)
3281
+ """Date of end of the project."""
3282
+ parents: Project = models.ManyToManyField(
3283
+ "self", symmetrical=False, related_name="children"
3284
+ )
3285
+ """Parent projects."""
3286
+ children: Project
3287
+ """Child projects.
3288
+
3289
+ Reverse accessor for parents.
3290
+ """
3291
+ artifacts: Artifact = models.ManyToManyField(
3292
+ Artifact, through="ArtifactProject", related_name="projects"
3293
+ )
3294
+ """Artifacts associated with this Project."""
3295
+ transforms: Transform = models.ManyToManyField(
3296
+ Transform, through="TransformProject", related_name="projects"
3297
+ )
3298
+ """Transforms associated with this project."""
3299
+ ulabels: ULabel = models.ManyToManyField(
3300
+ ULabel, through="ULabelProject", related_name="projects"
3301
+ )
3302
+ """ULabels associated with this project."""
3303
+ features: ULabel = models.ManyToManyField(
3304
+ Feature, through="FeatureProject", related_name="projects"
3305
+ )
3306
+ """Features associated with this project."""
3307
+ schemas: ULabel = models.ManyToManyField(
3308
+ Schema, through="SchemaProject", related_name="projects"
3309
+ )
3310
+ """Schemas associated with this project."""
3311
+ collections: Collection = models.ManyToManyField(
3312
+ Collection, through="CollectionProject", related_name="projects"
3313
+ )
3314
+ """Collections associated with this project."""
3315
+ persons: Person = models.ManyToManyField(Person, related_name="projects")
3316
+ """Persons associated with this project."""
3317
+ references: Reference = models.ManyToManyField("Reference", related_name="projects")
3318
+ """References associated with this project."""
3319
+ _status_code: int = models.SmallIntegerField(default=0, db_index=True)
3320
+ """Status code."""
3321
+
3322
+
3323
+ class Reference(Record, CanCurate, TracksRun, TracksUpdates, ValidateFields):
3324
+ """References such as internal studies, papers, documents, or URLs.
3325
+
3326
+ Example:
3327
+ >>> reference = Reference(
3328
+ ... name="A Paper Title",
3329
+ ... abbr="APT",
3330
+ ... url="https://doi.org/10.1000/xyz123",
3331
+ ... pubmed_id=12345678,
3332
+ ... doi="10.1000/xyz123",
3333
+ ... description="Good paper.",
3334
+ ... text="Some text I want to be searchable.",
3335
+ ... date=date(2023, 11, 21),
3336
+ ... ).save()
3337
+ """
3338
+
3339
+ class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta):
3340
+ abstract = False
3341
+
3342
+ id: int = models.AutoField(primary_key=True)
3343
+ """Internal id, valid only in one DB instance."""
3344
+ uid: str = CharField(unique=True, max_length=12, db_index=True, default=base62_12)
3345
+ """Universal id, valid across DB instances."""
3346
+ name: str = CharField(db_index=True)
3347
+ """Title or name of the reference document."""
3348
+ abbr: str | None = CharField(
3349
+ max_length=32,
3350
+ db_index=True,
3351
+ null=True,
3352
+ )
3353
+ """An abbreviation for the reference."""
3354
+ type: Reference | None = ForeignKey(
3355
+ "self", PROTECT, null=True, related_name="records"
3356
+ )
3357
+ """Type of reference (e.g., 'Study', 'Paper', 'Preprint').
3358
+
3359
+ Allows grouping references by type, e.g., internal studies vs. all papers.
3360
+ """
3361
+ records: Reference
3362
+ """Records of this type."""
3363
+ is_type: bool = BooleanField(default=None, db_index=True, null=True)
3364
+ """Distinguish types from instances of the type."""
3365
+ url: str | None = URLField(null=True)
3366
+ """URL linking to the reference."""
3367
+ pubmed_id: int | None = BigIntegerField(null=True, db_index=True)
3368
+ """A PubMed ID."""
3369
+ doi: str | None = CharField(
3370
+ null=True,
3371
+ db_index=True,
3372
+ validators=[
3373
+ RegexValidator(
3374
+ regex=r"^(?:https?://(?:dx\.)?doi\.org/|doi:|DOI:)?10\.\d+/.*$",
3375
+ message="Must be a DOI (e.g., 10.1000/xyz123 or https://doi.org/10.1000/xyz123)",
3376
+ )
3377
+ ],
3378
+ )
3379
+ """Digital Object Identifier (DOI) for the reference."""
3380
+ description: str | None = TextField(null=True)
3381
+ """Description of the reference."""
3382
+ text: str | None = TextField(null=True)
3383
+ """Abstract or full text of the reference to make it searchable."""
3384
+ date: date | None = DateField(null=True, default=None)
3385
+ """Date of creation or publication of the reference."""
3386
+ authors: Person = models.ManyToManyField(Person, related_name="references")
3387
+ """All people associated with this reference."""
3388
+ artifacts: Artifact = models.ManyToManyField(
3389
+ Artifact, through="ArtifactReference", related_name="references"
3390
+ )
3391
+ """Artifacts associated with this reference."""
3392
+ transforms: Artifact = models.ManyToManyField(
3393
+ Transform, through="TransformReference", related_name="references"
3394
+ )
3395
+ """Transforms associated with this reference."""
3396
+ collections: Artifact = models.ManyToManyField(
3397
+ Collection, through="CollectionReference", related_name="references"
3398
+ )
3399
+ """Collections associated with this reference."""
3400
+
3401
+
3402
+ # -------------------------------------------------------------------------------------
3403
+ # Data models
3404
+
3405
+ from django.contrib.postgres.fields import JSONField
3406
+ from django.core.exceptions import ValidationError
3407
+ from django.db import models
3408
+
3409
+
3410
class DataMixin(models.Model):
    # Abstract base for "one typed value per row" tables: every row points at
    # either a feature or a param and stores its value in exactly one of the
    # value_* columns below.
    space: Space = ForeignKey(Space, PROTECT, default=1, db_default=1)
    feature = ForeignKey(
        Feature, null=True, blank=True, on_delete=models.CASCADE, related_name="+"
    )
    param = ForeignKey(
        Param, null=True, blank=True, on_delete=models.CASCADE, related_name="+"
    )
    row = IntegerField(help_text="Use -1 for result data")

    # Value fields
    value_int = models.BigIntegerField(null=True, blank=True)
    value_float = models.FloatField(null=True, blank=True)
    value_str = models.TextField(null=True, blank=True)
    value_upath = models.CharField(max_length=255, null=True, blank=True)
    value_datetime = models.DateTimeField(null=True, blank=True)
    value_ulabel = models.ForeignKey(
        ULabel, null=True, blank=True, on_delete=models.CASCADE, related_name="+"
    )
    value_person = models.ForeignKey(
        Person, null=True, blank=True, on_delete=models.CASCADE, related_name="+"
    )
    value_artifact = models.ForeignKey(
        Artifact, null=True, blank=True, on_delete=models.CASCADE, related_name="+"
    )
    value_collection = models.ForeignKey(
        Collection, null=True, blank=True, on_delete=models.CASCADE, related_name="+"
    )
    value_project = models.ForeignKey(
        Project, null=True, blank=True, on_delete=models.CASCADE, related_name="+"
    )
    value_json = models.JSONField(null=True, blank=True)

    class Meta:
        abstract = True

    def clean(self):
        """Validate that the row is well-formed.

        Raises:
            ValidationError: If not exactly one of ``feature`` / ``param`` is set,
                or if not exactly one ``value_*`` field is set.
        """
        # Validate feature/param mutual exclusivity
        if (self.feature is not None) == (self.param is not None):
            raise ValidationError("Exactly one of feature or param must be set")

        # Validate value fields: every value_* column declared above takes part
        # in the check, otherwise a row holding only a person, collection or
        # project would be rejected and a row holding two values could pass.
        values = [
            self.value_int,
            self.value_float,
            self.value_str,
            self.value_upath,
            self.value_datetime,
            self.value_ulabel,
            self.value_person,
            self.value_artifact,
            self.value_collection,
            self.value_project,
            self.value_json,
        ]
        non_null_count = sum(1 for v in values if v is not None)

        if non_null_count != 1:
            raise ValidationError("Exactly one value field must be set")
3466
+
3467
+
3468
class RunData(BasicRecord, DataMixin):
    # A single data value attached to a run; the feature/param reference, the
    # row number and the value_* columns are inherited from DataMixin.
    run = models.ForeignKey("Run", on_delete=models.CASCADE, related_name="data")

    class Meta:
        constraints = [
            # Database-level guard: exactly one of feature / param is non-NULL.
            models.CheckConstraint(
                check=(
                    models.Q(feature__isnull=False, param__isnull=True)
                    | models.Q(feature__isnull=True, param__isnull=False)
                ),
                name="run_data_feature_param_mutex",
            ),
            # NOTE(review): the check above forces one of feature / param to be
            # NULL in every row; databases that treat NULLs as distinct in
            # unique constraints would then never reject a duplicate here —
            # confirm against the target backends.
            models.UniqueConstraint(
                fields=["run", "row", "feature", "param"], name="run_data_unique"
            ),
        ]
        indexes = [
            models.Index(fields=["run", "row"]),
            models.Index(fields=["feature"]),
            models.Index(fields=["param"]),
        ]
3489
+
3490
+
3491
class TidyTable(Record, TracksRun, TracksUpdates):
    # A named table that can optionally be tied to a schema and grouped under
    # another TidyTable acting as its type.
    uid: str = CharField(unique=True, max_length=12, db_index=True, default=base62_12)
    name = CharField()
    # Deleting the schema only clears this reference (SET_NULL).
    schema: Schema | None = ForeignKey(
        Schema, null=True, on_delete=models.SET_NULL, related_name="_tidytables"
    )
    type: TidyTable | None = ForeignKey(
        "self", PROTECT, null=True, related_name="records"
    )
    """Type of tidy table, e.g., `Cell`, `SampleSheet`, etc."""
    # Reverse accessor of `type` (related_name="records"), hence TidyTable.
    records: TidyTable
    """Records of this type."""
    is_type: bool = BooleanField(default=None, db_index=True, null=True)
    """Distinguish types from instances of the type."""
    description: str = TextField()
    projects: Project = ManyToManyField(Project, related_name="_tidytables")
    ulabels: ULabel = ManyToManyField(ULabel, related_name="_tidytables")

    class Meta:
        indexes = [models.Index(fields=["uid"]), models.Index(fields=["name"])]
3511
+
3512
+
3513
class TidyTableData(BasicRecord, DataMixin):
    # A single data value belonging to a TidyTable; the feature/param
    # reference, the row number and the value_* columns come from DataMixin.
    tidytable = models.ForeignKey(
        TidyTable, on_delete=models.CASCADE, related_name="data"
    )

    class Meta:
        constraints = [
            # Database-level guard: exactly one of feature / param is non-NULL.
            models.CheckConstraint(
                check=(
                    models.Q(feature__isnull=False, param__isnull=True)
                    | models.Q(feature__isnull=True, param__isnull=False)
                ),
                name="tidy_table_data_feature_param_mutex",
            ),
            # NOTE(review): the check above forces one of feature / param to be
            # NULL in every row; databases that treat NULLs as distinct in
            # unique constraints would then never reject a duplicate here —
            # confirm against the target backends.
            models.UniqueConstraint(
                fields=["tidytable", "row", "feature", "param"],
                name="tidy_table_data_unique",
            ),
        ]
        indexes = [
            models.Index(fields=["tidytable", "row"]),
            models.Index(fields=["feature"]),
            models.Index(fields=["param"]),
        ]
3537
+
3538
+
3539
+ # -------------------------------------------------------------------------------------
3540
+ # Link models
3541
+
3542
+
3543
class LinkORM:
    """Marker base class shared by the link models; it defines no members."""
3545
+
3546
+
3547
class SchemaFeature(BasicRecord, LinkORM):
    # Link row pairing a schema with a feature; each pair may occur only once.
    # related_name="+" means no reverse accessor is created on either side.
    id: int = models.BigAutoField(primary_key=True)
    schema: Schema = ForeignKey(Schema, CASCADE, related_name="+")
    feature: Feature = ForeignKey(Feature, PROTECT, related_name="+")

    class Meta:
        unique_together = ("schema", "feature")
3554
+
3555
+
3556
class SchemaParam(BasicRecord, LinkORM):
    # Link row pairing a schema with a param; each pair may occur only once.
    # related_name="+" means no reverse accessor is created on either side.
    id: int = models.BigAutoField(primary_key=True)
    schema: Schema = ForeignKey(Schema, CASCADE, related_name="+")
    param: Param = ForeignKey(Param, PROTECT, related_name="+")

    class Meta:
        unique_together = ("schema", "param")
3563
+
3564
+
3565
class ArtifactSchema(BasicRecord, LinkORM, TracksRun):
    # Link row pairing an artifact with a schema, with an optional slot name;
    # each (artifact, schema) pair may occur only once.
    id: int = models.BigAutoField(primary_key=True)
    artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="_links_schema")
    # we follow the lower() case convention rather than snake case for link models
    schema: Schema = ForeignKey(Schema, PROTECT, related_name="_links_artifact")
    slot: str | None = CharField(max_length=40, null=True)
    feature_ref_is_semantic: bool | None = BooleanField(
        null=True
    )  # like Feature name or Gene symbol or CellMarker name

    class Meta:
        unique_together = ("artifact", "schema")
3577
+
3578
+
3579
class CollectionArtifact(BasicRecord, LinkORM, TracksRun):
    # Link row pairing a collection with an artifact; each pair may occur
    # only once.
    id: int = models.BigAutoField(primary_key=True)
    collection: Collection = ForeignKey(
        Collection, CASCADE, related_name="links_artifact"
    )
    artifact: Artifact = ForeignKey(Artifact, PROTECT, related_name="links_collection")

    class Meta:
        unique_together = ("collection", "artifact")
3588
+
3589
+
3590
class ArtifactULabel(BasicRecord, LinkORM, TracksRun):
    # Link row pairing an artifact with a ulabel, optionally qualified by the
    # feature under which the label was attached.
    id: int = models.BigAutoField(primary_key=True)
    artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="links_ulabel")
    ulabel: ULabel = ForeignKey(ULabel, PROTECT, related_name="links_artifact")
    feature: Feature | None = ForeignKey(
        Feature, PROTECT, null=True, related_name="links_artifactulabel", default=None
    )
    label_ref_is_name: bool | None = BooleanField(null=True)
    feature_ref_is_name: bool | None = BooleanField(null=True)

    class Meta:
        # can have the same label linked to the same artifact if the feature is
        # different
        unique_together = ("artifact", "ulabel", "feature")
3604
+
3605
+
3606
class TransformULabel(BasicRecord, LinkORM, TracksRun):
    # Link row pairing a transform with a ulabel; each pair may occur only once.
    id: int = models.BigAutoField(primary_key=True)
    transform: Transform = ForeignKey(Transform, CASCADE, related_name="links_ulabel")
    ulabel: ULabel = ForeignKey(ULabel, PROTECT, related_name="links_transform")

    class Meta:
        unique_together = ("transform", "ulabel")
3613
+
3614
+
3615
class CollectionULabel(BasicRecord, LinkORM, TracksRun):
    # Link row pairing a collection with a ulabel, optionally qualified by a
    # feature.
    id: int = models.BigAutoField(primary_key=True)
    collection: Collection = ForeignKey(
        Collection, CASCADE, related_name="links_ulabel"
    )
    ulabel: ULabel = ForeignKey(ULabel, PROTECT, related_name="links_collection")
    feature: Feature | None = ForeignKey(
        Feature, PROTECT, null=True, related_name="links_collectionulabel", default=None
    )
    label_ref_is_name: bool | None = BooleanField(null=True)
    feature_ref_is_name: bool | None = BooleanField(null=True)

    class Meta:
        # NOTE(review): unlike ArtifactULabel, `feature` is not part of the
        # uniqueness key, so the same label cannot be linked to a collection
        # under two different features — confirm this is intended.
        unique_together = ("collection", "ulabel")
3629
+
3630
+
3631
class ArtifactFeatureValue(BasicRecord, LinkORM, TracksRun):
    # Link row pairing an artifact with a feature value; each pair may occur
    # only once. related_name="+" means no reverse accessor is created.
    id: int = models.BigAutoField(primary_key=True)
    artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="+")
    # we follow the lower() case convention rather than snake case for link models
    featurevalue = ForeignKey(FeatureValue, PROTECT, related_name="+")

    class Meta:
        unique_together = ("artifact", "featurevalue")
3639
+
3640
+
3641
class RunParamValue(BasicRecord, LinkORM):
    # Link row pairing a run with a param value; each pair may occur only once.
    # This class does not inherit TracksRun and declares created_at /
    # created_by itself.
    id: int = models.BigAutoField(primary_key=True)
    run: Run = ForeignKey(Run, CASCADE, related_name="+")
    # we follow the lower() case convention rather than snake case for link models
    paramvalue: ParamValue = ForeignKey(ParamValue, PROTECT, related_name="+")
    created_at: datetime = DateTimeField(auto_now_add=True, db_index=True)
    """Time of creation of record."""
    created_by: User = ForeignKey(
        "lamindb.User", PROTECT, default=current_user_id, related_name="+"
    )
    """Creator of record."""

    class Meta:
        unique_together = ("run", "paramvalue")
3655
+
3656
+
3657
class ArtifactParamValue(BasicRecord, LinkORM, TracksRun):
    # Link row pairing an artifact with a param value; each pair may occur
    # only once. related_name="+" means no reverse accessor is created.
    id: int = models.BigAutoField(primary_key=True)
    artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="+")
    # we follow the lower() case convention rather than snake case for link models
    paramvalue: ParamValue = ForeignKey(ParamValue, PROTECT, related_name="+")

    class Meta:
        unique_together = ("artifact", "paramvalue")
3665
+
3666
+
3667
+ # -------------------------------------------------------------------------------------
3668
+ # Link models for project management
3669
+
3670
+
3671
class ArtifactProject(BasicRecord, LinkORM, TracksRun):
    # Link row pairing an artifact with a project, optionally qualified by a
    # feature.
    id: int = models.BigAutoField(primary_key=True)
    artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="links_project")
    project: Project = ForeignKey(Project, PROTECT, related_name="links_artifact")
    feature: Feature | None = ForeignKey(
        Feature,
        PROTECT,
        null=True,
        default=None,
        related_name="links_artifactproject",
    )
    label_ref_is_name: bool | None = BooleanField(null=True, default=None)
    feature_ref_is_name: bool | None = BooleanField(null=True, default=None)

    class Meta:
        # can have the same label linked to the same artifact if the feature is different
        unique_together = ("artifact", "project", "feature")
3688
+
3689
+
3690
class TransformProject(BasicRecord, LinkORM, TracksRun):
    # Link row pairing a transform with a project; each pair may occur only once.
    id: int = models.BigAutoField(primary_key=True)
    transform: Transform = ForeignKey(Transform, CASCADE, related_name="links_project")
    project: Project = ForeignKey(Project, PROTECT, related_name="links_transform")

    class Meta:
        unique_together = ("transform", "project")
3697
+
3698
+
3699
class CollectionProject(BasicRecord, LinkORM, TracksRun):
    # Link row pairing a collection with a project; each pair may occur only
    # once.
    id: int = models.BigAutoField(primary_key=True)
    collection: Collection = ForeignKey(
        Collection, CASCADE, related_name="links_project"
    )
    project: Project = ForeignKey(Project, PROTECT, related_name="links_collection")

    class Meta:
        unique_together = ("collection", "project")
3708
+
3709
+
3710
class ULabelProject(BasicRecord, LinkORM, TracksRun):
    # Link row pairing a ulabel with a project; each pair may occur only once.
    id: int = models.BigAutoField(primary_key=True)
    # The foreign key targets ULabel, so the annotation is ULabel as well.
    ulabel: ULabel = ForeignKey(ULabel, CASCADE, related_name="links_project")
    project: Project = ForeignKey(Project, PROTECT, related_name="links_ulabel")

    class Meta:
        unique_together = ("ulabel", "project")
3717
+
3718
+
3719
class FeatureProject(BasicRecord, LinkORM, TracksRun):
    # Link row pairing a feature with a project; each pair may occur only once.
    id: int = models.BigAutoField(primary_key=True)
    feature: Feature = ForeignKey(Feature, CASCADE, related_name="links_project")
    project: Project = ForeignKey(Project, PROTECT, related_name="links_feature")

    class Meta:
        unique_together = ("feature", "project")
3726
+
3727
+
3728
class SchemaProject(BasicRecord, LinkORM, TracksRun):
    # Link row pairing a schema with a project; each pair may occur only once.
    id: int = models.BigAutoField(primary_key=True)
    schema: Schema = ForeignKey(Schema, CASCADE, related_name="links_project")
    project: Project = ForeignKey(Project, PROTECT, related_name="links_schema")

    class Meta:
        unique_together = ("schema", "project")
3735
+
3736
+
3737
class ArtifactReference(BasicRecord, LinkORM, TracksRun):
    # Through model of the Reference.artifacts many-to-many relation; a link
    # can optionally be qualified by a feature.
    id: int = models.BigAutoField(primary_key=True)
    artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="links_reference")
    reference: Reference = ForeignKey(Reference, PROTECT, related_name="links_artifact")
    feature: Feature | None = ForeignKey(
        Feature,
        PROTECT,
        null=True,
        default=None,
        related_name="links_artifactreference",
    )
    label_ref_is_name: bool | None = BooleanField(null=True, default=None)
    feature_ref_is_name: bool | None = BooleanField(null=True, default=None)

    class Meta:
        # can have the same label linked to the same artifact if the feature is different
        unique_together = ("artifact", "reference", "feature")
3754
+
3755
+
3756
class TransformReference(BasicRecord, LinkORM, TracksRun):
    # Through model of the Reference.transforms many-to-many relation; each
    # (transform, reference) pair may occur only once.
    id: int = models.BigAutoField(primary_key=True)
    transform: Transform = ForeignKey(
        Transform, CASCADE, related_name="links_reference"
    )
    reference: Reference = ForeignKey(
        Reference, PROTECT, related_name="links_transform"
    )

    class Meta:
        unique_together = ("transform", "reference")
3767
+
3768
+
3769
class CollectionReference(BasicRecord, LinkORM, TracksRun):
    # Through model of the Reference.collections many-to-many relation; each
    # (collection, reference) pair may occur only once.
    id: int = models.BigAutoField(primary_key=True)
    collection: Collection = ForeignKey(
        Collection, CASCADE, related_name="links_reference"
    )
    reference: Reference = ForeignKey(
        Reference, PROTECT, related_name="links_collection"
    )

    class Meta:
        unique_together = ("collection", "reference")
3780
+
3781
+
3782
+ # class Migration(Record):
3783
+ # app = CharField(max_length=255)
3784
+ # name = CharField(max_length=255)
3785
+ # applied: datetime = DateTimeField()
3786
+
3787
+ # class Meta:
3788
+ # db_table = "django_migrations"
3789
+ # managed = False
3790
+
3791
+
3792
+ # -------------------------------------------------------------------------------------
3793
+ # Low-level logic needed in lamindb-setup
3794
+
3795
+ # Below is needed within lnschema-core because lamindb-setup already performs
3796
+ # some logging
3797
+
3798
+
3799
def format_field_value(value: datetime | str | Any) -> Any:
    """Format a field value for display in a record representation.

    Datetimes are rendered as ``YYYY-MM-DD HH:MM:SS TZ`` without quotes.
    Strings are wrapped in single quotes; a string that parses as an ISO
    datetime is rendered in the same timestamp layout first. Every other
    value is handed back unchanged.
    """
    from datetime import datetime

    timestamp_layout = "%Y-%m-%d %H:%M:%S %Z"

    if isinstance(value, datetime):
        return value.strftime(timestamp_layout)

    if not isinstance(value, str):
        return value

    shown = value
    try:
        shown = datetime.fromisoformat(shown)
        shown = shown.strftime(timestamp_layout)
    except ValueError:
        # not an ISO timestamp: keep whatever has been assigned so far
        pass
    return f"'{shown}'"
3814
+
3815
+
3816
class RegistryInfo:
    """Summarize the fields of a registry for display.

    Wraps a registry (a model class exposing ``_meta``) and splits its fields
    into simple and relational ones, returned either as field objects or as a
    pre-formatted multi-line string.
    """

    def __init__(self, registry: Registry):
        self.registry = registry

    def _get_type_for_field(self, field_name: str) -> str:
        """Return the related model's name, or the internal field type if there is none."""
        field = self.registry._meta.get_field(field_name)
        related_model = getattr(field, "related_model", None)
        related_model_name = related_model.__name__ if related_model else None
        return related_model_name if related_model_name else field.get_internal_type()

    def _get_base_class_fields(self) -> list[str]:
        """Return the field names declared on the registry's direct bases that expose ``_meta``."""
        return [
            field.name
            for base in self.registry.__bases__
            if hasattr(base, "_meta")
            for field in base._meta.get_fields()
        ]

    def _reorder_fields_by_class(self, fields_to_order: list[Field]) -> list[Field]:
        """Reorders the fields so that base class fields come last."""
        # Computed once: the set of base-class names does not depend on the
        # field being tested, so there is no need to rebuild it per field.
        base_class_field_names = set(self._get_base_class_fields())
        non_base_class_fields = [
            field
            for field in fields_to_order
            if field.name not in base_class_field_names
        ]
        found_base_class_fields = [
            field for field in fields_to_order if field.name in base_class_field_names
        ]
        return non_base_class_fields + found_base_class_fields

    def get_simple_fields(self, return_str: bool = False) -> Any:
        """Return the non-relational, public fields of the registry.

        Relational fields, fields whose name starts with ``_`` and the ``id``
        field are left out.

        Args:
            return_str: If True, return a formatted string instead of the
                list of field objects.
        """
        simple_fields = [
            field
            for field in self.registry._meta.get_fields()
            if not (
                isinstance(field, ManyToOneRel)
                or isinstance(field, ManyToManyRel)
                or isinstance(field, ManyToManyField)
                or isinstance(field, ForeignKey)
                or field.name.startswith("_")
                or field.name == "id"
            )
        ]
        simple_fields = self._reorder_fields_by_class(simple_fields)
        if not return_str:
            return simple_fields
        else:
            repr_str = f" {colors.italic('Simple fields')}\n"
            if simple_fields:
                repr_str += "".join(
                    [
                        f" .{field_name.name}: {self._get_type_for_field(field_name.name)}\n"
                        for field_name in simple_fields
                    ]
                )
            return repr_str

    def get_relational_fields(self, return_str: bool = False):
        """Return the relational fields of the registry.

        Fields whose name starts with ``links_`` or ``_`` are left out.

        Args:
            return_str: If True, return a formatted string with one section
                for core fields and one per external module. Otherwise return
                a tuple ``(core_fields, external_fields_by_module)``.
        """
        # we ignore ManyToOneRel because it leads to so much clutter in the API
        # also note that our general guideline is to have related_name="+"
        # for ForeignKey fields
        relational_fields = (ManyToOneRel, ManyToManyRel, ManyToManyField, ForeignKey)

        class_specific_relational_fields = [
            field
            for field in self.registry._meta.fields + self.registry._meta.many_to_many
            if isinstance(field, relational_fields)
            and not field.name.startswith(("links_", "_"))
        ]

        non_class_specific_relational_fields = [
            field
            for field in self.registry._meta.get_fields()
            if isinstance(field, relational_fields)
            and not field.name.startswith(("links_", "_"))
        ]
        non_class_specific_relational_fields = self._reorder_fields_by_class(
            non_class_specific_relational_fields
        )

        # Ensure that class specific fields (e.g. Artifact) come before non-class specific fields (e.g. collection)
        filtered_non_class_specific = [
            field
            for field in non_class_specific_relational_fields
            if field not in class_specific_relational_fields
        ]
        ordered_relational_fields = (
            class_specific_relational_fields + filtered_non_class_specific
        )

        core_module_fields = []
        external_modules_fields = []
        for field in ordered_relational_fields:
            # the dotted name is parsed out of the field's repr; a name with
            # exactly one dot that does not mention lamindb counts as external
            field_name = repr(field).split(": ")[1][:-1]
            if field_name.count(".") == 1 and "lamindb" not in field_name:
                external_modules_fields.append(field)
            else:
                core_module_fields.append(field)

        def _get_related_field_type(field) -> str:
            field_type = (
                field.related_model.__get_name_with_module__()
                .replace(
                    "Artifact", ""
                )  # some fields have an unnecessary 'Artifact' in their name
                .replace(
                    "Collection", ""
                )  # some fields have an unnecessary 'Collection' in their name
            )
            # fall back to the plain type when stripping left nothing behind
            return (
                self._get_type_for_field(field.name)
                if not field_type.strip()
                else field_type
            )

        core_module_fields_formatted = [
            f" .{field.name}: {_get_related_field_type(field)}\n"
            for field in core_module_fields
        ]
        external_modules_fields_formatted = [
            f" .{field.name}: {_get_related_field_type(field)}\n"
            for field in external_modules_fields
        ]

        if not return_str:
            external_modules_fields_by_modules = defaultdict(list)
            for field_str, field in zip(
                external_modules_fields_formatted, external_modules_fields
            ):
                field_type = field_str.split(":")[1].split()[0]
                module_name = field_type.split(".")[0]
                external_modules_fields_by_modules[module_name].append(field)
            return core_module_fields, external_modules_fields_by_modules
        else:
            repr_str = ""

            # Non-external relational fields
            if core_module_fields:
                repr_str += f" {colors.italic('Relational fields')}\n"
                repr_str += "".join(core_module_fields_formatted)

            # External relational fields
            external_modules = set()
            for field in external_modules_fields_formatted:
                field_type = field.split(":")[1].split()[0]
                external_modules.add(field_type.split(".")[0])

            if external_modules:
                # We want Bionty to show up before other modules
                external_modules = (
                    ["bionty"] + sorted(external_modules - {"bionty"})  # type: ignore
                    if "bionty" in external_modules
                    else sorted(external_modules)
                )
                for ext_module in external_modules:
                    # NOTE(review): this is a substring match on the whole
                    # formatted line, so a field whose own name contains the
                    # module name would also be listed — confirm that is fine.
                    ext_module_fields = [
                        field
                        for field in external_modules_fields_formatted
                        if ext_module in field
                    ]

                    if ext_module_fields:
                        repr_str += (
                            f" {colors.italic(f'{ext_module.capitalize()} fields')}\n"
                        )
                        repr_str += "".join(ext_module_fields)

            return repr_str
3989
+
3990
+
3991
def registry_repr(cls):
    """Shows fields."""
    # registry name in green, then the simple and the relational section
    header = f"{colors.green(cls.__name__)}\n"
    info = RegistryInfo(cls)
    sections = (
        header,
        info.get_simple_fields(return_str=True),
        info.get_relational_fields(return_str=True),
    )
    return "".join(sections).rstrip("\n")
3999
+
4000
+
4001
def record_repr(
    self: Record, include_foreign_keys: bool = True, exclude_field_names=None
) -> str:
    """Build the ``ClassName(field=value, ...)`` representation of a record.

    Args:
        include_foreign_keys: Whether to append the ``<name>_id`` attribute of
            every foreign key field.
        exclude_field_names: Names of non-foreign-key fields to leave out.
            Defaults to ``["id", "updated_at", "source_code"]``.

    Returns:
        The class name followed by the comma-separated ``name=value`` pairs;
        ``uid`` is listed first, ``created_at`` last, and fields whose
        formatted value is ``None`` are omitted.
    """
    if exclude_field_names is None:
        exclude_field_names = ["id", "updated_at", "source_code"]
    field_names = [
        field.name
        for field in self._meta.fields
        if (not isinstance(field, ForeignKey) and field.name not in exclude_field_names)
    ]
    if include_foreign_keys:
        field_names += [
            f"{field.name}_id"
            for field in self._meta.fields
            if isinstance(field, ForeignKey)
        ]
    if "created_at" in field_names:
        field_names.remove("created_at")
        field_names.append("created_at")
    # Membership is tested first so that an empty list of field names does not
    # raise IndexError on field_names[0].
    if "uid" in field_names and field_names[0] != "uid":
        field_names.remove("uid")
        field_names.insert(0, "uid")
    fields_str = {}
    for k in field_names:
        if not k.startswith("_") and hasattr(self, k):
            value = getattr(self, k)
            # Force strip the time component of the version
            if k == "version" and value:
                fields_str[k] = f"'{str(value).split()[0]}'"
            else:
                fields_str[k] = format_field_value(value)
    fields_joined_str = ", ".join(
        [f"{k}={fields_str[k]}" for k in fields_str if fields_str[k] is not None]
    )
    return f"{self.__class__.__name__}({fields_joined_str})"
4036
+
4037
+
4038
+ # below is code to further format the repr of a record
4039
+ #
4040
+ # def format_repr(
4041
+ # record: Record, exclude_field_names: str | list[str] | None = None
4042
+ # ) -> str:
4043
+ # if isinstance(exclude_field_names, str):
4044
+ # exclude_field_names = [exclude_field_names]
4045
+ # exclude_field_names_init = ["id", "created_at", "updated_at"]
4046
+ # if exclude_field_names is not None:
4047
+ # exclude_field_names_init += exclude_field_names
4048
+ # return record.__repr__(
4049
+ # include_foreign_keys=False, exclude_field_names=exclude_field_names_init
4050
+ # )
4051
+
4052
+
4053
# record_repr serves as both repr() and str() of every Record.
Record.__repr__ = record_repr  # type: ignore
Record.__str__ = record_repr  # type: ignore
4055
+
4056
+
4057
def deferred_attribute__repr__(self):
    """Render a field attribute as ``FieldAttr(<Model>.<field name>)``."""
    model_name = self.field.model.__name__
    field_name = self.field.name
    return f"FieldAttr({model_name}.{field_name})"
4059
+
4060
+
4061
# Field attributes are shown as FieldAttr(<Model>.<field name>).
FieldAttr.__repr__ = deferred_attribute__repr__  # type: ignore
# backward compatibility: the names below are plain aliases of the classes
# they are assigned from
CanValidate = CanCurate
FeatureSet = Schema