lamindb 1.1.0__py3-none-any.whl → 1.2a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. lamindb/__init__.py +31 -26
  2. lamindb/_finish.py +9 -1
  3. lamindb/_tracked.py +26 -3
  4. lamindb/_view.py +2 -3
  5. lamindb/base/__init__.py +1 -1
  6. lamindb/base/ids.py +1 -10
  7. lamindb/base/users.py +1 -4
  8. lamindb/core/__init__.py +7 -65
  9. lamindb/core/_context.py +41 -10
  10. lamindb/core/_mapped_collection.py +4 -2
  11. lamindb/core/_settings.py +6 -6
  12. lamindb/core/_sync_git.py +1 -1
  13. lamindb/core/_track_environment.py +2 -1
  14. lamindb/core/datasets/_small.py +3 -3
  15. lamindb/core/loaders.py +22 -9
  16. lamindb/core/storage/_anndata_accessor.py +8 -3
  17. lamindb/core/storage/_backed_access.py +14 -7
  18. lamindb/core/storage/_pyarrow_dataset.py +24 -9
  19. lamindb/core/storage/_tiledbsoma.py +6 -4
  20. lamindb/core/storage/_zarr.py +32 -11
  21. lamindb/core/storage/objects.py +59 -26
  22. lamindb/core/storage/paths.py +16 -13
  23. lamindb/curators/__init__.py +173 -145
  24. lamindb/errors.py +1 -1
  25. lamindb/integrations/_vitessce.py +4 -4
  26. lamindb/migrations/0089_subsequent_runs.py +159 -0
  27. lamindb/migrations/0090_runproject_project_runs.py +73 -0
  28. lamindb/migrations/{0088_squashed.py → 0090_squashed.py} +245 -177
  29. lamindb/models/__init__.py +79 -0
  30. lamindb/{core → models}/_describe.py +3 -3
  31. lamindb/{core → models}/_django.py +8 -5
  32. lamindb/{core → models}/_feature_manager.py +103 -87
  33. lamindb/{_from_values.py → models/_from_values.py} +5 -2
  34. lamindb/{core/versioning.py → models/_is_versioned.py} +94 -6
  35. lamindb/{core → models}/_label_manager.py +10 -17
  36. lamindb/{core/relations.py → models/_relations.py} +8 -1
  37. lamindb/models/artifact.py +2601 -0
  38. lamindb/{_can_curate.py → models/can_curate.py} +349 -180
  39. lamindb/models/collection.py +683 -0
  40. lamindb/models/core.py +135 -0
  41. lamindb/models/feature.py +643 -0
  42. lamindb/models/flextable.py +163 -0
  43. lamindb/{_parents.py → models/has_parents.py} +55 -49
  44. lamindb/models/project.py +384 -0
  45. lamindb/{_query_manager.py → models/query_manager.py} +10 -8
  46. lamindb/{_query_set.py → models/query_set.py} +52 -30
  47. lamindb/models/record.py +1757 -0
  48. lamindb/models/run.py +563 -0
  49. lamindb/{_save.py → models/save.py} +18 -8
  50. lamindb/models/schema.py +732 -0
  51. lamindb/models/transform.py +360 -0
  52. lamindb/models/ulabel.py +249 -0
  53. {lamindb-1.1.0.dist-info → lamindb-1.2a2.dist-info}/METADATA +5 -5
  54. lamindb-1.2a2.dist-info/RECORD +94 -0
  55. lamindb/_artifact.py +0 -1361
  56. lamindb/_collection.py +0 -440
  57. lamindb/_feature.py +0 -316
  58. lamindb/_is_versioned.py +0 -40
  59. lamindb/_record.py +0 -1065
  60. lamindb/_run.py +0 -60
  61. lamindb/_schema.py +0 -347
  62. lamindb/_storage.py +0 -15
  63. lamindb/_transform.py +0 -170
  64. lamindb/_ulabel.py +0 -56
  65. lamindb/_utils.py +0 -9
  66. lamindb/base/validation.py +0 -63
  67. lamindb/core/_data.py +0 -491
  68. lamindb/core/fields.py +0 -12
  69. lamindb/models.py +0 -4435
  70. lamindb-1.1.0.dist-info/RECORD +0 -95
  71. {lamindb-1.1.0.dist-info → lamindb-1.2a2.dist-info}/LICENSE +0 -0
  72. {lamindb-1.1.0.dist-info → lamindb-1.2a2.dist-info}/WHEEL +0 -0
lamindb/models.py DELETED
@@ -1,4435 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import sys
4
- from collections import defaultdict
5
- from datetime import date, datetime # noqa: TC003
6
- from itertools import chain
7
- from typing import (
8
- TYPE_CHECKING,
9
- Any,
10
- Literal,
11
- NamedTuple,
12
- overload,
13
- )
14
-
15
- from django.core.validators import RegexValidator
16
- from django.db import IntegrityError, models
17
- from django.db.models import CASCADE, PROTECT, Field, Q
18
- from django.db.models.base import ModelBase
19
- from django.db.models.fields.related import (
20
- ManyToManyField,
21
- ManyToManyRel,
22
- ManyToOneRel,
23
- )
24
- from lamin_utils import colors
25
- from lamindb_setup import _check_instance_setup
26
- from lamindb_setup.core.hashing import HASH_LENGTH, hash_dict
27
-
28
- from lamindb.base import deprecated, doc_args
29
- from lamindb.base.fields import (
30
- BigIntegerField,
31
- BooleanField,
32
- CharField,
33
- DateField,
34
- DateTimeField,
35
- EmailField,
36
- ForeignKey,
37
- IntegerField,
38
- JSONField,
39
- OneToOneField,
40
- TextField,
41
- URLField,
42
- )
43
-
44
- from .base.ids import base62_8, base62_12, base62_20
45
- from .base.types import (
46
- ArtifactKind,
47
- FeatureDtype,
48
- FieldAttr,
49
- ListLike,
50
- StrField,
51
- TransformType,
52
- )
53
- from .base.users import current_user_id
54
-
55
- if TYPE_CHECKING:
56
- from collections.abc import Iterable
57
- from pathlib import Path
58
-
59
- import numpy as np
60
- import pandas as pd
61
- from anndata import AnnData
62
- from lamin_utils._inspect import InspectResult
63
- from lamindb_setup.core.types import UPathStr
64
- from mudata import MuData
65
- from pyarrow.dataset import Dataset as PyArrowDataset
66
- from tiledbsoma import Collection as SOMACollection
67
- from tiledbsoma import Experiment as SOMAExperiment
68
- from tiledbsoma import Measurement as SOMAMeasurement
69
- from upath import UPath
70
-
71
- from lamindb.core import LabelManager, MappedCollection, QuerySet, RecordList
72
- from lamindb.core.storage import AnnDataAccessor, BackedAccessor
73
-
74
-
75
- _TRACKING_READY: bool | None = None
76
-
77
-
78
- class IsVersioned(models.Model):
79
- """Base class for versioned models."""
80
-
81
- class Meta:
82
- abstract = True
83
-
84
- _len_stem_uid: int
85
-
86
- version: str | None = CharField(max_length=30, null=True, db_index=True)
87
- """Version (default `None`).
88
-
89
- Defines version of a family of records characterized by the same `stem_uid`.
90
-
91
- Consider using `semantic versioning <https://semver.org>`__
92
- with `Python versioning <https://peps.python.org/pep-0440/>`__.
93
- """
94
- is_latest: bool = BooleanField(default=True, db_index=True)
95
- """Boolean flag that indicates whether a record is the latest in its version family."""
96
-
97
- @overload
98
- def __init__(self): ...
99
-
100
- @overload
101
- def __init__(
102
- self,
103
- *db_args,
104
- ): ...
105
-
106
- def __init__(
107
- self,
108
- *args,
109
- **kwargs,
110
- ):
111
- self._revises = kwargs.pop("revises") if "revises" in kwargs else None
112
- super().__init__(*args, **kwargs)
113
-
114
- @property
115
- def stem_uid(self) -> str:
116
- """Universal id characterizing the version family.
117
-
118
- The full uid of a record is obtained via concatenating the stem uid and version information::
119
-
120
- stem_uid = random_base62(n_char) # a random base62 sequence of length 12 (transform) or 16 (artifact, collection)
121
- version_uid = "0000" # an auto-incrementing 4-digit base62 number
122
- uid = f"{stem_uid}{version_uid}" # concatenate the stem_uid & version_uid
123
-
124
- """
125
- return self.uid[: self._len_stem_uid] # type: ignore
126
-
127
- @property
128
- def versions(self) -> QuerySet:
129
- """Lists all records of the same version family.
130
-
131
- >>> new_artifact = ln.Artifact(df2, revises=artifact).save()
132
- >>> new_artifact.versions()
133
- """
134
- db = self._state.db
135
- if db is not None and db != "default":
136
- return self.__class__.using(db).filter(uid__startswith=self.stem_uid) # type: ignore
137
- else:
138
- return self.__class__.filter(uid__startswith=self.stem_uid) # type: ignore
139
-
140
- def _add_to_version_family(self, revises: IsVersioned, version: str | None = None):
141
- """Add current record to a version family.
142
-
143
- Args:
144
- revises: a record that belongs to the version family.
145
- version: semantic version of the record.
146
- """
147
- pass
148
-
149
-
150
- def current_run() -> Run | None:
151
- global _TRACKING_READY
152
-
153
- if not _TRACKING_READY:
154
- _TRACKING_READY = _check_instance_setup()
155
- if _TRACKING_READY:
156
- import lamindb
157
-
158
- # also see get_run() in core._data
159
- run = lamindb._tracked.get_current_tracked_run()
160
- if run is None:
161
- run = lamindb.context.run
162
- return run
163
- else:
164
- return None
165
-
166
-
167
- class TracksRun(models.Model):
168
- """Base class tracking latest run, creating user, and `created_at` timestamp."""
169
-
170
- class Meta:
171
- abstract = True
172
-
173
- created_at: datetime = DateTimeField(
174
- editable=False, db_default=models.functions.Now(), db_index=True
175
- )
176
- """Time of creation of record."""
177
- created_by: User = ForeignKey(
178
- "lamindb.User",
179
- PROTECT,
180
- editable=False,
181
- default=current_user_id,
182
- related_name="+",
183
- )
184
- """Creator of record."""
185
- run: Run | None = ForeignKey(
186
- "lamindb.Run", PROTECT, null=True, default=current_run, related_name="+"
187
- )
188
- """Last run that created or updated the record."""
189
-
190
- @overload
191
- def __init__(self): ...
192
-
193
- @overload
194
- def __init__(
195
- self,
196
- *db_args,
197
- ): ...
198
-
199
- def __init__(
200
- self,
201
- *args,
202
- **kwargs,
203
- ):
204
- super().__init__(*args, **kwargs)
205
-
206
-
207
- class TracksUpdates(models.Model):
208
- """Base class tracking previous runs and `updated_at` timestamp."""
209
-
210
- class Meta:
211
- abstract = True
212
-
213
- updated_at: datetime = DateTimeField(
214
- editable=False, db_default=models.functions.Now(), db_index=True
215
- )
216
- """Time of last update to record."""
217
-
218
- @overload
219
- def __init__(self): ...
220
-
221
- @overload
222
- def __init__(
223
- self,
224
- *db_args,
225
- ): ...
226
-
227
- def __init__(
228
- self,
229
- *args,
230
- **kwargs,
231
- ):
232
- super().__init__(*args, **kwargs)
233
-
234
-
235
- class CanCurate:
236
- """Base class providing :class:`~lamindb.core.Record`-based validation."""
237
-
238
- @classmethod
239
- def inspect(
240
- cls,
241
- values: ListLike,
242
- field: str | StrField | None = None,
243
- *,
244
- mute: bool = False,
245
- organism: str | Record | None = None,
246
- source: Record | None = None,
247
- strict_source: bool = False,
248
- ) -> InspectResult:
249
- """Inspect if values are mappable to a field.
250
-
251
- Being mappable means that an exact match exists.
252
-
253
- Args:
254
- values: Values that will be checked against the field.
255
- field: The field of values. Examples are `'ontology_id'` to map
256
- against the source ID or `'name'` to map against the ontologies
257
- field names.
258
- mute: Whether to mute logging.
259
- organism: An Organism name or record.
260
- source: A `bionty.Source` record that specifies the version to inspect against.
261
- strict_source: Determines the validation behavior against records in the registry.
262
- - If `False`, validation will include all records in the registry, ignoring the specified source.
263
- - If `True`, validation will only include records in the registry that are linked to the specified source.
264
- Note: this parameter won't affect validation against bionty/public sources.
265
-
266
- See Also:
267
- :meth:`~lamindb.core.CanCurate.validate`
268
-
269
- Examples:
270
- >>> import bionty as bt
271
- >>> bt.settings.organism = "human"
272
- >>> ln.save(bt.Gene.from_values(["A1CF", "A1BG", "BRCA2"], field="symbol"))
273
- >>> gene_symbols = ["A1CF", "A1BG", "FANCD1", "FANCD20"]
274
- >>> result = bt.Gene.inspect(gene_symbols, field=bt.Gene.symbol)
275
- >>> result.validated
276
- ['A1CF', 'A1BG']
277
- >>> result.non_validated
278
- ['FANCD1', 'FANCD20']
279
- """
280
- pass
281
-
282
- @classmethod
283
- def validate(
284
- cls,
285
- values: ListLike,
286
- field: str | StrField | None = None,
287
- *,
288
- mute: bool = False,
289
- organism: str | Record | None = None,
290
- source: Record | None = None,
291
- strict_source: bool = False,
292
- ) -> np.ndarray:
293
- """Validate values against existing values of a string field.
294
-
295
- Note this is strict_source validation, only asserts exact matches.
296
-
297
- Args:
298
- values: Values that will be validated against the field.
299
- field: The field of values.
300
- Examples are `'ontology_id'` to map against the source ID
301
- or `'name'` to map against the ontologies field names.
302
- mute: Whether to mute logging.
303
- organism: An Organism name or record.
304
- source: A `bionty.Source` record that specifies the version to validate against.
305
- strict_source: Determines the validation behavior against records in the registry.
306
- - If `False`, validation will include all records in the registry, ignoring the specified source.
307
- - If `True`, validation will only include records in the registry that are linked to the specified source.
308
- Note: this parameter won't affect validation against bionty/public sources.
309
-
310
- Returns:
311
- A vector of booleans indicating if an element is validated.
312
-
313
- See Also:
314
- :meth:`~lamindb.core.CanCurate.inspect`
315
-
316
- Examples:
317
- >>> import bionty as bt
318
- >>> bt.settings.organism = "human"
319
- >>> ln.save(bt.Gene.from_values(["A1CF", "A1BG", "BRCA2"], field="symbol"))
320
- >>> gene_symbols = ["A1CF", "A1BG", "FANCD1", "FANCD20"]
321
- >>> bt.Gene.validate(gene_symbols, field=bt.Gene.symbol)
322
- array([ True, True, False, False])
323
- """
324
- pass
325
-
326
- def from_values(
327
- cls,
328
- values: ListLike,
329
- field: StrField | None = None,
330
- create: bool = False,
331
- organism: Record | str | None = None,
332
- source: Record | None = None,
333
- mute: bool = False,
334
- ) -> RecordList:
335
- """Bulk create validated records by parsing values for an identifier such as a name or an id).
336
-
337
- Args:
338
- values: A list of values for an identifier, e.g.
339
- `["name1", "name2"]`.
340
- field: A `Record` field to look up, e.g., `bt.CellMarker.name`.
341
- create: Whether to create records if they don't exist.
342
- organism: A `bionty.Organism` name or record.
343
- source: A `bionty.Source` record to validate against to create records for.
344
- mute: Whether to mute logging.
345
-
346
- Returns:
347
- A list of validated records. For bionty registries. Also returns knowledge-coupled records.
348
-
349
- Notes:
350
- For more info, see tutorial: :doc:`docs:bio-registries`.
351
-
352
- Examples:
353
-
354
- Bulk create from non-validated values will log warnings & returns empty list:
355
-
356
- >>> ulabels = ln.ULabel.from_values(["benchmark", "prediction", "test"], field="name")
357
- >>> assert len(ulabels) == 0
358
-
359
- Bulk create records from validated values returns the corresponding existing records:
360
-
361
- >>> ln.save([ln.ULabel(name=name) for name in ["benchmark", "prediction", "test"]])
362
- >>> ulabels = ln.ULabel.from_values(["benchmark", "prediction", "test"], field="name")
363
- >>> assert len(ulabels) == 3
364
-
365
- Bulk create records from public reference:
366
-
367
- >>> import bionty as bt
368
- >>> records = bt.CellType.from_values(["T cell", "B cell"], field="name")
369
- >>> records
370
- """
371
- pass
372
-
373
- @classmethod
374
- def standardize(
375
- cls,
376
- values: Iterable,
377
- field: str | StrField | None = None,
378
- *,
379
- return_field: str | StrField | None = None,
380
- return_mapper: bool = False,
381
- case_sensitive: bool = False,
382
- mute: bool = False,
383
- public_aware: bool = True,
384
- keep: Literal["first", "last", False] = "first",
385
- synonyms_field: str = "synonyms",
386
- organism: str | Record | None = None,
387
- source: Record | None = None,
388
- strict_source: bool = False,
389
- ) -> list[str] | dict[str, str]:
390
- """Maps input synonyms to standardized names.
391
-
392
- Args:
393
- values: Identifiers that will be standardized.
394
- field: The field representing the standardized names.
395
- return_field: The field to return. Defaults to field.
396
- return_mapper: If `True`, returns `{input_value: standardized_name}`.
397
- case_sensitive: Whether the mapping is case sensitive.
398
- mute: Whether to mute logging.
399
- public_aware: Whether to standardize from Bionty reference. Defaults to `True` for Bionty registries.
400
- keep: When a synonym maps to multiple names, determines which duplicates to mark as `pd.DataFrame.duplicated`:
401
- - `"first"`: returns the first mapped standardized name
402
- - `"last"`: returns the last mapped standardized name
403
- - `False`: returns all mapped standardized name.
404
-
405
- When `keep` is `False`, the returned list of standardized names will contain nested lists in case of duplicates.
406
-
407
- When a field is converted into return_field, keep marks which matches to keep when multiple return_field values map to the same field value.
408
- synonyms_field: A field containing the concatenated synonyms.
409
- organism: An Organism name or record.
410
- source: A `bionty.Source` record that specifies the version to validate against.
411
- strict_source: Determines the validation behavior against records in the registry.
412
- - If `False`, validation will include all records in the registry, ignoring the specified source.
413
- - If `True`, validation will only include records in the registry that are linked to the specified source.
414
- Note: this parameter won't affect validation against bionty/public sources.
415
-
416
- Returns:
417
- If `return_mapper` is `False`: a list of standardized names. Otherwise,
418
- a dictionary of mapped values with mappable synonyms as keys and
419
- standardized names as values.
420
-
421
- See Also:
422
- :meth:`~lamindb.core.CanCurate.add_synonym`
423
- Add synonyms.
424
- :meth:`~lamindb.core.CanCurate.remove_synonym`
425
- Remove synonyms.
426
-
427
- Examples:
428
- >>> import bionty as bt
429
- >>> bt.settings.organism = "human"
430
- >>> ln.save(bt.Gene.from_values(["A1CF", "A1BG", "BRCA2"], field="symbol"))
431
- >>> gene_synonyms = ["A1CF", "A1BG", "FANCD1", "FANCD20"]
432
- >>> standardized_names = bt.Gene.standardize(gene_synonyms)
433
- >>> standardized_names
434
- ['A1CF', 'A1BG', 'BRCA2', 'FANCD20']
435
- """
436
- pass
437
-
438
- def add_synonym(
439
- self,
440
- synonym: str | ListLike,
441
- force: bool = False,
442
- save: bool | None = None,
443
- ):
444
- """Add synonyms to a record.
445
-
446
- Args:
447
- synonym: The synonyms to add to the record.
448
- force: Whether to add synonyms even if they are already synonyms of other records.
449
- save: Whether to save the record to the database.
450
-
451
- See Also:
452
- :meth:`~lamindb.core.CanCurate.remove_synonym`
453
- Remove synonyms.
454
-
455
- Examples:
456
- >>> import bionty as bt
457
- >>> bt.CellType.from_source(name="T cell").save()
458
- >>> lookup = bt.CellType.lookup()
459
- >>> record = lookup.t_cell
460
- >>> record.synonyms
461
- 'T-cell|T lymphocyte|T-lymphocyte'
462
- >>> record.add_synonym("T cells")
463
- >>> record.synonyms
464
- 'T cells|T-cell|T-lymphocyte|T lymphocyte'
465
- """
466
- pass
467
-
468
- def remove_synonym(self, synonym: str | ListLike):
469
- """Remove synonyms from a record.
470
-
471
- Args:
472
- synonym: The synonym values to remove.
473
-
474
- See Also:
475
- :meth:`~lamindb.core.CanCurate.add_synonym`
476
- Add synonyms
477
-
478
- Examples:
479
- >>> import bionty as bt
480
- >>> bt.CellType.from_source(name="T cell").save()
481
- >>> lookup = bt.CellType.lookup()
482
- >>> record = lookup.t_cell
483
- >>> record.synonyms
484
- 'T-cell|T lymphocyte|T-lymphocyte'
485
- >>> record.remove_synonym("T-cell")
486
- 'T lymphocyte|T-lymphocyte'
487
- """
488
- pass
489
-
490
- def set_abbr(self, value: str):
491
- """Set value for abbr field and add to synonyms.
492
-
493
- Args:
494
- value: A value for an abbreviation.
495
-
496
- See Also:
497
- :meth:`~lamindb.core.CanCurate.add_synonym`
498
-
499
- Examples:
500
- >>> import bionty as bt
501
- >>> bt.ExperimentalFactor.from_source(name="single-cell RNA sequencing").save()
502
- >>> scrna = bt.ExperimentalFactor.get(name="single-cell RNA sequencing")
503
- >>> scrna.abbr
504
- None
505
- >>> scrna.synonyms
506
- 'single-cell RNA-seq|single-cell transcriptome sequencing|scRNA-seq|single cell RNA sequencing'
507
- >>> scrna.set_abbr("scRNA")
508
- >>> scrna.abbr
509
- 'scRNA'
510
- >>> scrna.synonyms
511
- 'scRNA|single-cell RNA-seq|single cell RNA sequencing|single-cell transcriptome sequencing|scRNA-seq'
512
- >>> scrna.save()
513
- """
514
- pass
515
-
516
-
517
- class HasParents:
518
- """Base class for hierarchical registries (ontologies)."""
519
-
520
- def view_parents(
521
- self,
522
- field: StrField | None = None,
523
- with_children: bool = False,
524
- distance: int = 5,
525
- ):
526
- """View parents in an ontology.
527
-
528
- Args:
529
- field: Field to display on graph
530
- with_children: Whether to also show children.
531
- distance: Maximum distance still shown.
532
-
533
- Ontological hierarchies: :class:`~lamindb.ULabel` (project & sub-project), :class:`~bionty.CellType` (cell type & subtype).
534
-
535
- Examples:
536
- >>> import bionty as bt
537
- >>> bt.Tissue.from_source(name="subsegmental bronchus").save()
538
- >>> record = bt.Tissue.get(name="respiratory tube")
539
- >>> record.view_parents()
540
- >>> tissue.view_parents(with_children=True)
541
- """
542
- pass
543
-
544
- def query_parents(self) -> QuerySet:
545
- """Query parents in an ontology."""
546
- pass
547
-
548
- def query_children(self) -> QuerySet:
549
- """Query children in an ontology."""
550
- pass
551
-
552
-
553
- class ValidateFields:
554
- pass
555
-
556
-
557
- RECORD_REGISTRY_EXAMPLE = """Example::
558
-
559
- from lamindb import Record, fields
560
-
561
- # sub-classing `Record` creates a new registry
562
- class Experiment(Record):
563
- name: str = fields.CharField()
564
-
565
- # instantiating `Experiment` creates a record `experiment`
566
- experiment = Experiment(name="my experiment")
567
-
568
- # you can save the record to the database
569
- experiment.save()
570
-
571
- # `Experiment` refers to the registry, which you can query
572
- df = Experiment.filter(name__startswith="my ").df()
573
- """
574
-
575
-
576
- # this is the metaclass for Record
577
- @doc_args(RECORD_REGISTRY_EXAMPLE)
578
- class Registry(ModelBase):
579
- """Metaclass for :class:`~lamindb.core.Record`.
580
-
581
- Each `Registry` *object* is a `Record` *class* and corresponds to a table in the metadata SQL database.
582
-
583
- You work with `Registry` objects whenever you use *class methods* of `Record`.
584
-
585
- You call any subclass of `Record` a "registry" and their objects "records". A `Record` object corresponds to a row in the SQL table.
586
-
587
- If you want to create a new registry, you sub-class `Record`.
588
-
589
- {}
590
-
591
- Note: `Registry` inherits from Django's `ModelBase`.
592
- """
593
-
594
- def __new__(cls, name, bases, attrs, **kwargs):
595
- new_class = super().__new__(cls, name, bases, attrs, **kwargs)
596
- return new_class
597
-
598
- # below creates a sensible auto-complete behavior that differs across the
599
- # class and instance level in Jupyter Editors it doesn't have any effect for
600
- # static type analyzer like pylance used in VSCode
601
- def __dir__(cls):
602
- # this is needed to bring auto-complete on the class-level back
603
- # https://laminlabs.slack.com/archives/C04FPE8V01W/p1717535625268849
604
- # Filter class attributes, excluding instance methods
605
- exclude_instance_methods = "sphinx" not in sys.modules
606
- # https://laminlabs.slack.com/archives/C04FPE8V01W/p1721134595920959
607
-
608
- def include_attribute(attr_name, attr_value):
609
- if attr_name.startswith("__"):
610
- return False
611
- if exclude_instance_methods and callable(attr_value):
612
- return isinstance(attr_value, (classmethod, staticmethod, type))
613
- return True
614
-
615
- # check also inherited attributes
616
- if hasattr(cls, "mro"):
617
- attrs = chain(*(c.__dict__.items() for c in cls.mro()))
618
- else:
619
- attrs = cls.__dict__.items()
620
-
621
- result = []
622
- for attr_name, attr_value in attrs:
623
- if attr_name not in result and include_attribute(attr_name, attr_value):
624
- result.append(attr_name)
625
-
626
- # Add non-dunder attributes from Registry
627
- for attr in dir(Registry):
628
- if not attr.startswith("__") and attr not in result:
629
- result.append(attr)
630
- return result
631
-
632
- def __repr__(cls) -> str:
633
- return registry_repr(cls)
634
-
635
- def lookup(
636
- cls,
637
- field: StrField | None = None,
638
- return_field: StrField | None = None,
639
- ) -> NamedTuple:
640
- """Return an auto-complete object for a field.
641
-
642
- Args:
643
- field: The field to look up the values for. Defaults to first string field.
644
- return_field: The field to return. If `None`, returns the whole record.
645
-
646
- Returns:
647
- A `NamedTuple` of lookup information of the field values with a
648
- dictionary converter.
649
-
650
- See Also:
651
- :meth:`~lamindb.core.Record.search`
652
-
653
- Examples:
654
- >>> import bionty as bt
655
- >>> bt.settings.organism = "human"
656
- >>> bt.Gene.from_source(symbol="ADGB-DT").save()
657
- >>> lookup = bt.Gene.lookup()
658
- >>> lookup.adgb_dt
659
- >>> lookup_dict = lookup.dict()
660
- >>> lookup_dict['ADGB-DT']
661
- >>> lookup_by_ensembl_id = bt.Gene.lookup(field="ensembl_gene_id")
662
- >>> genes.ensg00000002745
663
- >>> lookup_return_symbols = bt.Gene.lookup(field="ensembl_gene_id", return_field="symbol")
664
- """
665
- pass
666
-
667
- def filter(cls, *queries, **expressions) -> QuerySet:
668
- """Query records.
669
-
670
- Args:
671
- queries: One or multiple `Q` objects.
672
- expressions: Fields and values passed as Django query expressions.
673
-
674
- Returns:
675
- A :class:`~lamindb.core.QuerySet`.
676
-
677
- See Also:
678
- - Guide: :doc:`docs:registries`
679
- - Django documentation: `Queries <https://docs.djangoproject.com/en/stable/topics/db/queries/>`__
680
-
681
- Examples:
682
- >>> ln.ULabel(name="my label").save()
683
- >>> ln.ULabel.filter(name__startswith="my").df()
684
- """
685
- pass
686
-
687
- def get(
688
- cls,
689
- idlike: int | str | None = None,
690
- **expressions,
691
- ) -> Record:
692
- """Get a single record.
693
-
694
- Args:
695
- idlike: Either a uid stub, uid or an integer id.
696
- expressions: Fields and values passed as Django query expressions.
697
-
698
- Returns:
699
- A record.
700
-
701
- Raises:
702
- :exc:`docs:lamindb.core.exceptions.DoesNotExist`: In case no matching record is found.
703
-
704
- See Also:
705
- - Guide: :doc:`docs:registries`
706
- - Django documentation: `Queries <https://docs.djangoproject.com/en/stable/topics/db/queries/>`__
707
-
708
- Examples:
709
- >>> ulabel = ln.ULabel.get("FvtpPJLJ")
710
- >>> ulabel = ln.ULabel.get(name="my-label")
711
- """
712
- pass
713
-
714
- def df(
715
- cls,
716
- include: str | list[str] | None = None,
717
- features: bool | list[str] = False,
718
- limit: int = 100,
719
- ) -> pd.DataFrame:
720
- """Convert to `pd.DataFrame`.
721
-
722
- By default, shows all direct fields, except `updated_at`.
723
-
724
- Use arguments `include` or `feature` to include other data.
725
-
726
- Args:
727
- include: Related fields to include as columns. Takes strings of
728
- form `"ulabels__name"`, `"cell_types__name"`, etc. or a list
729
- of such strings.
730
- features: If `True`, map all features of the
731
- :class:`~lamindb.Feature` registry onto the resulting
732
- `DataFrame`. Only available for `Artifact`.
733
- limit: Maximum number of rows to display from a Pandas DataFrame.
734
- Defaults to 100 to reduce database load.
735
-
736
- Examples:
737
-
738
- Include the name of the creator in the `DataFrame`:
739
-
740
- >>> ln.ULabel.df(include="created_by__name"])
741
-
742
- Include display of features for `Artifact`:
743
-
744
- >>> df = ln.Artifact.df(features=True)
745
- >>> ln.view(df) # visualize with type annotations
746
-
747
- Only include select features:
748
-
749
- >>> df = ln.Artifact.df(features=["cell_type_by_expert", "cell_type_by_model"])
750
- """
751
- pass
752
-
753
- def search(
754
- cls,
755
- string: str,
756
- *,
757
- field: StrField | None = None,
758
- limit: int | None = 20,
759
- case_sensitive: bool = False,
760
- ) -> QuerySet:
761
- """Search.
762
-
763
- Args:
764
- string: The input string to match against the field ontology values.
765
- field: The field or fields to search. Search all string fields by default.
766
- limit: Maximum amount of top results to return.
767
- case_sensitive: Whether the match is case sensitive.
768
-
769
- Returns:
770
- A sorted `DataFrame` of search results with a score in column `score`.
771
- If `return_queryset` is `True`. `QuerySet`.
772
-
773
- See Also:
774
- :meth:`~lamindb.core.Record.filter`
775
- :meth:`~lamindb.core.Record.lookup`
776
-
777
- Examples:
778
- >>> ulabels = ln.ULabel.from_values(["ULabel1", "ULabel2", "ULabel3"], field="name")
779
- >>> ln.save(ulabels)
780
- >>> ln.ULabel.search("ULabel2")
781
- """
782
- pass
783
-
784
- def using(
785
- cls,
786
- instance: str | None,
787
- ) -> QuerySet:
788
- """Use a non-default LaminDB instance.
789
-
790
- Args:
791
- instance: An instance identifier of form "account_handle/instance_name".
792
-
793
- Examples:
794
- >>> ln.ULabel.using("account_handle/instance_name").search("ULabel7", field="name")
795
- uid score
796
- name
797
- ULabel7 g7Hk9b2v 100.0
798
- ULabel5 t4Jm6s0q 75.0
799
- ULabel6 r2Xw8p1z 75.0
800
- """
801
- pass
802
-
803
- def __get_module_name__(cls) -> str:
804
- schema_module_name = cls.__module__.split(".")[0]
805
- module_name = schema_module_name.replace("lnschema_", "")
806
- if module_name == "lamindb":
807
- module_name = "core"
808
- return module_name
809
-
810
- @deprecated("__get_module_name__")
811
- def __get_schema_name__(cls) -> str:
812
- return cls.__get_module_name__()
813
-
814
- def __get_name_with_module__(cls) -> str:
815
- module_name = cls.__get_module_name__()
816
- if module_name == "core":
817
- module_prefix = ""
818
- else:
819
- module_prefix = f"{module_name}."
820
- return f"{module_prefix}{cls.__name__}"
821
-
822
- @deprecated("__get_name_with_module__")
823
- def __get_name_with_schema__(cls) -> str:
824
- return cls.__get_name_with_module__()
825
-
826
-
827
- class BasicRecord(models.Model, metaclass=Registry):
828
- """Basic metadata record.
829
-
830
- It has the same methods as Record, but doesn't have the additional fields.
831
-
832
- It's mainly used for LinkORMs and similar.
833
- """
834
-
835
- class Meta:
836
- abstract = True
837
-
838
-
839
- class Space(BasicRecord):
840
- """Spaces."""
841
-
842
- id: int = models.SmallAutoField(primary_key=True)
843
- """Internal id, valid only in one DB instance."""
844
- name: str = models.CharField(max_length=100, db_index=True)
845
- """Name of space."""
846
- uid: str = CharField(
847
- editable=False,
848
- unique=True,
849
- max_length=12,
850
- default="00000000",
851
- db_default="00000000",
852
- db_index=True,
853
- )
854
- """Universal id."""
855
- description: str | None = CharField(null=True)
856
- """Description of space."""
857
- created_at: datetime = DateTimeField(
858
- editable=False, db_default=models.functions.Now(), db_index=True
859
- )
860
- """Time of creation of record."""
861
- created_by: User = ForeignKey(
862
- "User", CASCADE, default=None, related_name="+", null=True
863
- )
864
- """Creator of run."""
865
-
866
-
867
- @doc_args(RECORD_REGISTRY_EXAMPLE)
868
- class Record(BasicRecord, metaclass=Registry):
869
- """Metadata record.
870
-
871
- Every `Record` is a data model that comes with a registry in form of a SQL
872
- table in your database.
873
-
874
- Sub-classing `Record` creates a new registry while instantiating a `Record`
875
- creates a new record.
876
-
877
- {}
878
-
879
- `Record`'s metaclass is :class:`~lamindb.core.Registry`.
880
-
881
- `Record` inherits from Django's `Model` class. Why does LaminDB call it `Record`
882
- and not `Model`? The term `Record` can't lead to confusion with statistical,
883
- machine learning or biological models.
884
- """
885
-
886
- _branch_code: int = models.SmallIntegerField(db_index=True, default=1, db_default=1)
887
- """Whether record is on a branch, in archive or in trash.
888
-
889
- This dictates whether a record appears in queries & searches.
890
-
891
- Coding is as follows:
892
-
893
- - 3: template (hidden in queries & searches)
894
- - 2: draft (hidden in queries & searches)
895
- - 1: default (visible in queries & searches)
896
- - 0: archive (hidden, meant to be kept)
897
- - -1: trash (hidden, scheduled for deletion)
898
-
899
- Any integer higher than >3 codes a branch that's involved in a pull request.
900
- """
901
- space: Space = ForeignKey(Space, PROTECT, default=1, db_default=1)
902
- """The space in which the record lives."""
903
- _aux: dict[str, Any] | None = JSONField(default=None, db_default=None, null=True)
904
- """Auxiliary field for dictionary-like metadata."""
905
-
906
- def save(self, *args, **kwargs) -> Record:
907
- """Save.
908
-
909
- Always saves to the default database.
910
- """
911
- # we need this here because we're using models also from plain
912
- # django outside of lamindb
913
- super().save(*args, **kwargs)
914
- return self
915
-
916
- def delete(self) -> None:
917
- """Delete."""
918
- pass
919
-
920
- class Meta:
921
- abstract = True
922
-
923
-
924
- class FeatureManager:
925
- """Feature manager."""
926
-
927
- pass
928
-
929
-
930
- class ParamManager:
931
- """Param manager."""
932
-
933
- pass
934
-
935
-
936
- class ParamManagerArtifact(ParamManager):
937
- """Param manager."""
938
-
939
- pass
940
-
941
-
942
- class ParamManagerRun(ParamManager):
943
- """Param manager."""
944
-
945
- pass
946
-
947
-
948
- # -------------------------------------------------------------------------------------
949
- # A note on required fields at the Record level
950
- #
951
- # As Django does most of its validation on the Form-level, it doesn't offer functionality
952
- # for validating the integrity of an Record object upon instantation (similar to pydantic)
953
- #
954
- # For required fields, we define them as commonly done on the SQL level together
955
- # with a validator in Record (validate_required_fields)
956
- #
957
- # This goes against the Django convention, but goes with the SQLModel convention
958
- # (Optional fields can be null on the SQL level, non-optional fields cannot)
959
- #
960
- # Due to Django's convention where CharFieldAttr has pre-configured (null=False, default=""), marking
961
- # a required field necessitates passing `default=None`. Without the validator it would trigger
962
- # an error at the SQL-level, with it, it triggers it at instantiation
963
-
964
- # -------------------------------------------------------------------------------------
965
- # A note on class and instance methods of core Record
966
- #
967
- # All of these are defined and tested within lamindb, in files starting with _{orm_name}.py
968
-
969
- # -------------------------------------------------------------------------------------
970
- # A note on maximal lengths of char fields
971
- #
972
- # 100 characters:
973
- # "Raindrops pitter-pattered on the windowpane, blurring the"
974
- # "city lights outside, curled up with a mug."
975
- # A good maximal length for a name (title).
976
- #
977
- # 150 characters: We choose this for name maximal length because some users like long names.
978
- #
979
- # 255 characters:
980
- # "In creating a precise 255-character paragraph, one engages in"
981
- # "a dance of words, where clarity meets brevity. Every syllable counts,"
982
- # "illustrating the skill in compact expression, ensuring the essence of the"
983
- # "message shines through within the exacting limit."
984
- # This is a good maximal length for a description field.
985
-
986
-
987
- class User(BasicRecord, CanCurate):
988
- """Users.
989
-
990
- All data in this registry is synched from `lamin.ai` to ensure a universal
991
- user identity. There is no need to manually create records.
992
-
993
- Examples:
994
-
995
- Query a user by handle:
996
-
997
- >>> user = ln.User.get(handle="testuser1")
998
- >>> user
999
- """
1000
-
1001
- _name_field: str = "handle"
1002
-
1003
- id: int = models.AutoField(primary_key=True)
1004
- """Internal id, valid only in one DB instance."""
1005
- uid: str = CharField(editable=False, unique=True, db_index=True, max_length=8)
1006
- """Universal id, valid across DB instances."""
1007
- handle: str = CharField(max_length=30, unique=True, db_index=True)
1008
- """Universal handle, valid across DB instances (required)."""
1009
- name: str | None = CharField(max_length=150, db_index=True, null=True)
1010
- """Name (optional).""" # has to match hub specification, where it's also optional
1011
- created_artifacts: Artifact
1012
- """Artifacts created by user."""
1013
- created_transforms: Transform
1014
- """Transforms created by user."""
1015
- created_runs: Run
1016
- """Runs created by user."""
1017
- created_at: datetime = DateTimeField(
1018
- editable=False, db_default=models.functions.Now(), db_index=True
1019
- )
1020
- """Time of creation of record."""
1021
- updated_at: datetime = DateTimeField(
1022
- editable=False, db_default=models.functions.Now(), db_index=True
1023
- )
1024
- """Time of last update to record."""
1025
-
1026
- @overload
1027
- def __init__(
1028
- self,
1029
- handle: str,
1030
- email: str,
1031
- name: str | None,
1032
- ): ...
1033
-
1034
- @overload
1035
- def __init__(
1036
- self,
1037
- *db_args,
1038
- ): ...
1039
-
1040
- def __init__(
1041
- self,
1042
- *args,
1043
- **kwargs,
1044
- ):
1045
- super().__init__(*args, **kwargs)
1046
-
1047
-
1048
- class Storage(Record, TracksRun, TracksUpdates):
1049
- """Storage locations.
1050
-
1051
- A storage location is either a directory/folder (local or in the cloud) or
1052
- an entire S3/GCP bucket.
1053
-
1054
- A LaminDB instance can manage and link multiple storage locations. But any
1055
- storage location is managed by *at most one* LaminDB instance.
1056
-
1057
- .. dropdown:: Managed vs. linked storage locations
1058
-
1059
- The LaminDB instance can update & delete artifacts in managed storage
1060
- locations but merely read artifacts in linked storage locations.
1061
-
1062
- When you transfer artifacts from another instance, the default is to
1063
- only copy metadata into the target instance, but merely link the data.
1064
-
1065
- The `instance_uid` field indicates the managing LaminDB instance of a
1066
- storage location.
1067
-
1068
- When you delete a LaminDB instance, you'll be warned about data in managed
1069
- storage locations while data in linked storage locations is ignored.
1070
-
1071
- See Also:
1072
- :attr:`~lamindb.core.Settings.storage`
1073
- Default storage.
1074
- :attr:`~lamindb.setup.core.StorageSettings`
1075
- Storage settings.
1076
-
1077
- Examples:
1078
-
1079
- Configure the default storage location upon initiation of a LaminDB instance::
1080
-
1081
- lamin init --storage ./mydata # or "s3://my-bucket" or "gs://my-bucket"
1082
-
1083
- View the default storage location:
1084
-
1085
- >>> ln.settings.storage
1086
- PosixPath('/home/runner/work/lamindb/lamindb/docs/guide/mydata')
1087
-
1088
- Dynamically change the default storage:
1089
-
1090
- >>> ln.settings.storage = "./storage_2" # or a cloud bucket
1091
- """
1092
-
1093
- class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta):
1094
- abstract = False
1095
-
1096
- _name_field: str = "root"
1097
-
1098
- id: int = models.AutoField(primary_key=True)
1099
- """Internal id, valid only in one DB instance."""
1100
- uid: str = CharField(
1101
- editable=False, unique=True, max_length=12, default=base62_12, db_index=True
1102
- )
1103
- """Universal id, valid across DB instances."""
1104
- # we are very conservative here with 255 characters
1105
- root: str = CharField(db_index=True, unique=True)
1106
- """Root path of storage. n s3 path. local path, etc. (required)."""
1107
- description: str | None = CharField(db_index=True, null=True)
1108
- """A description of what the storage location is used for (optional)."""
1109
- type: str = CharField(max_length=30, db_index=True)
1110
- """Can be "local" vs. "s3" vs. "gs"."""
1111
- region: str | None = CharField(max_length=64, db_index=True, null=True)
1112
- """Cloud storage region, if applicable."""
1113
- instance_uid: str | None = CharField(max_length=12, db_index=True, null=True)
1114
- """Instance that manages this storage location."""
1115
- artifacts: Artifact
1116
- """Artifacts contained in this storage location."""
1117
-
1118
- @overload
1119
- def __init__(
1120
- self,
1121
- root: str,
1122
- type: str,
1123
- region: str | None,
1124
- ): ...
1125
-
1126
- @overload
1127
- def __init__(
1128
- self,
1129
- *db_args,
1130
- ): ...
1131
-
1132
- def __init__(
1133
- self,
1134
- *args,
1135
- **kwargs,
1136
- ):
1137
- super().__init__(*args, **kwargs)
1138
-
1139
- @property
1140
- def path(self) -> Path | UPath:
1141
- """Bucket or folder path.
1142
-
1143
- Cloud storage bucket:
1144
-
1145
- >>> ln.Storage("s3://my-bucket").save()
1146
-
1147
- Directory/folder in cloud storage:
1148
-
1149
- >>> ln.Storage("s3://my-bucket/my-directory").save()
1150
-
1151
- Local directory/folder:
1152
-
1153
- >>> ln.Storage("./my-directory").save()
1154
- """
1155
- pass
1156
-
1157
-
1158
- # does not inherit from TracksRun because the Transform
1159
- # is needed to define a run
1160
- class Transform(Record, IsVersioned):
1161
- """Data transformations.
1162
-
1163
- A "transform" can refer to a Python function, a script, a notebook, or a
1164
- pipeline. If you execute a transform, you generate a run
1165
- (:class:`~lamindb.Run`). A run has inputs and outputs.
1166
-
1167
- A pipeline is typically created with a workflow tool (Nextflow, Snakemake,
1168
- Prefect, Flyte, MetaFlow, redun, Airflow, ...) and stored in a versioned
1169
- repository.
1170
-
1171
- Transforms are versioned so that a given transform version maps on a given
1172
- source code version.
1173
-
1174
- .. dropdown:: Can I sync transforms to git?
1175
-
1176
- If you switch on
1177
- :attr:`~lamindb.core.Settings.sync_git_repo` a script-like transform is
1178
- synched to its hashed state in a git repository upon calling `ln.track()`.
1179
-
1180
- >>> ln.settings.sync_git_repo = "https://github.com/laminlabs/lamindb"
1181
- >>> ln.track()
1182
-
1183
- The definition of transforms and runs is consistent the OpenLineage
1184
- specification where a :class:`~lamindb.Transform` record would be called a
1185
- "job" and a :class:`~lamindb.Run` record a "run".
1186
-
1187
- Args:
1188
- name: `str` A name or title.
1189
- key: `str | None = None` A short name or path-like semantic key.
1190
- type: `TransformType | None = "pipeline"` See :class:`~lamindb.base.types.TransformType`.
1191
- revises: `Transform | None = None` An old version of the transform.
1192
-
1193
- See Also:
1194
- :meth:`~lamindb.core.Context.track`
1195
- Globally track a script, notebook or pipeline run.
1196
- :class:`~lamindb.Run`
1197
- Executions of transforms.
1198
-
1199
- Notes:
1200
- - :doc:`docs:track`
1201
- - :doc:`docs:data-flow`
1202
- - :doc:`docs:redun`
1203
- - :doc:`docs:nextflow`
1204
- - :doc:`docs:snakemake`
1205
-
1206
- Examples:
1207
-
1208
- Create a transform for a pipeline:
1209
-
1210
- >>> transform = ln.Transform(key="Cell Ranger", version="7.2.0", type="pipeline").save()
1211
-
1212
- Create a transform from a notebook:
1213
-
1214
- >>> ln.track()
1215
-
1216
- View predecessors of a transform:
1217
-
1218
- >>> transform.view_lineage()
1219
- """
1220
-
1221
- class Meta(Record.Meta, IsVersioned.Meta):
1222
- abstract = False
1223
-
1224
- _len_stem_uid: int = 12
1225
- _len_full_uid: int = 16
1226
- _name_field: str = "key"
1227
-
1228
- id: int = models.AutoField(primary_key=True)
1229
- """Internal id, valid only in one DB instance."""
1230
- uid: str = CharField(
1231
- editable=False, unique=True, db_index=True, max_length=_len_full_uid
1232
- )
1233
- """Universal id."""
1234
- key: str | None = CharField(db_index=True, null=True)
1235
- """A name or "/"-separated path-like string.
1236
-
1237
- All transforms with the same key are part of the same version family.
1238
- """
1239
- description: str | None = CharField(db_index=True, null=True)
1240
- """A description."""
1241
- type: TransformType = CharField(
1242
- max_length=20,
1243
- db_index=True,
1244
- default="pipeline",
1245
- )
1246
- """:class:`~lamindb.base.types.TransformType` (default `"pipeline"`)."""
1247
- source_code: str | None = TextField(null=True)
1248
- """Source code of the transform.
1249
-
1250
- .. versionchanged:: 0.75
1251
- The `source_code` field is no longer an artifact, but a text field.
1252
- """
1253
- # we have a unique constraint here but not on artifact because on artifact, we haven't yet
1254
- # settled how we model the same artifact in different storage locations
1255
- hash: str | None = CharField(
1256
- max_length=HASH_LENGTH, db_index=True, null=True, unique=True
1257
- )
1258
- """Hash of the source code."""
1259
- reference: str | None = CharField(max_length=255, db_index=True, null=True)
1260
- """Reference for the transform, e.g., a URL."""
1261
- reference_type: str | None = CharField(max_length=25, db_index=True, null=True)
1262
- """Reference type of the transform, e.g., 'url'."""
1263
- runs: Run
1264
- """Runs of this transform."""
1265
- ulabels: ULabel = models.ManyToManyField(
1266
- "ULabel", through="TransformULabel", related_name="transforms"
1267
- )
1268
- """ULabel annotations of this transform."""
1269
- predecessors: Transform = models.ManyToManyField(
1270
- "self", symmetrical=False, related_name="successors"
1271
- )
1272
- """Preceding transforms.
1273
-
1274
- These are auto-populated whenever an artifact or collection serves as a run
1275
- input, e.g., `artifact.run` and `artifact.transform` get populated & saved.
1276
-
1277
- The table provides a more convenient method to query for the predecessors that
1278
- bypasses querying the :class:`~lamindb.Run`.
1279
-
1280
- It also allows to manually add predecessors whose outputs are not tracked in a run.
1281
- """
1282
- successors: Transform
1283
- """Subsequent transforms.
1284
-
1285
- See :attr:`~lamindb.Transform.predecessors`.
1286
- """
1287
- output_artifacts: Artifact
1288
- """The artifacts generated by all runs of this transform.
1289
-
1290
- If you're looking for the outputs of a single run, see :attr:`lamindb.Run.output_artifacts`.
1291
- """
1292
- output_collections: Collection
1293
- """The collections generated by all runs of this transform.
1294
-
1295
- If you're looking for the outputs of a single run, see :attr:`lamindb.Run.output_collections`.
1296
- """
1297
- projects: Project
1298
- """Associated projects."""
1299
- references: Reference
1300
- """Associated references."""
1301
- created_at: datetime = DateTimeField(
1302
- editable=False, db_default=models.functions.Now(), db_index=True
1303
- )
1304
- """Time of creation of record."""
1305
- updated_at: datetime = DateTimeField(
1306
- editable=False, db_default=models.functions.Now(), db_index=True
1307
- )
1308
- """Time of last update to record."""
1309
- created_by: User = ForeignKey(
1310
- User, PROTECT, default=current_user_id, related_name="created_transforms"
1311
- )
1312
- """Creator of record."""
1313
- _template: Transform | None = ForeignKey(
1314
- "Transform", PROTECT, related_name="_derived_from", default=None, null=True
1315
- )
1316
- """Creating template."""
1317
-
1318
- @overload
1319
- def __init__(
1320
- self,
1321
- name: str,
1322
- key: str | None = None,
1323
- type: TransformType | None = None,
1324
- revises: Transform | None = None,
1325
- ): ...
1326
-
1327
- @overload
1328
- def __init__(
1329
- self,
1330
- *db_args,
1331
- ): ...
1332
-
1333
- def __init__(
1334
- self,
1335
- *args,
1336
- **kwargs,
1337
- ):
1338
- super().__init__(*args, **kwargs)
1339
-
1340
- @property
1341
- def name(self) -> str:
1342
- """Name of the transform.
1343
-
1344
- Splits `key` on `/` and returns the last element.
1345
- """
1346
- return self.key.split("/")[-1]
1347
-
1348
- @property
1349
- def latest_run(self) -> Run:
1350
- """The latest run of this transform."""
1351
- pass
1352
-
1353
- def view_lineage(self) -> None:
1354
- """View lineage of transforms."""
1355
- pass
1356
-
1357
-
1358
- class Param(Record, CanCurate, TracksRun, TracksUpdates):
1359
- """Parameters of runs & models."""
1360
-
1361
- class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta):
1362
- abstract = False
1363
-
1364
- _name_field: str = "name"
1365
-
1366
- name: str = CharField(max_length=100, db_index=True)
1367
- dtype: str | None = CharField(db_index=True, null=True)
1368
- """Data type ("num", "cat", "int", "float", "bool", "datetime").
1369
-
1370
- For categorical types, can define from which registry values are
1371
- sampled, e.g., `cat[ULabel]` or `cat[bionty.CellType]`.
1372
- """
1373
- type: Param | None = ForeignKey("self", PROTECT, null=True, related_name="records")
1374
- """Type of param (e.g., 'Pipeline', 'ModelTraining', 'PostProcessing').
1375
-
1376
- Allows to group features by type, e.g., all read outs, all metrics, etc.
1377
- """
1378
- records: Param
1379
- """Records of this type."""
1380
- is_type: bool = BooleanField(default=False, db_index=True, null=True)
1381
- """Distinguish types from instances of the type."""
1382
- _expect_many: bool = models.BooleanField(default=False, db_default=False)
1383
- """Indicates whether values for this param are expected to occur a single or multiple times for an artifact/run (default `False`).
1384
-
1385
- - if it's `False` (default), the values mean artifact/run-level values and a dtype of `datetime` means `datetime`
1386
- - if it's `True`, the values are from an aggregation, which this seems like an edge case but when characterizing a model ensemble trained with different parameters it could be relevant
1387
- """
1388
- schemas: Schema = models.ManyToManyField(
1389
- "Schema", through="SchemaParam", related_name="params"
1390
- )
1391
- """Feature sets linked to this feature."""
1392
- # backward fields
1393
- values: ParamValue
1394
- """Values for this parameter."""
1395
-
1396
- def __init__(self, *args, **kwargs):
1397
- from ._feature import process_init_feature_param
1398
- from .errors import ValidationError
1399
-
1400
- if len(args) == len(self._meta.concrete_fields):
1401
- super().__init__(*args, **kwargs)
1402
- return None
1403
-
1404
- dtype = kwargs.get("dtype", None)
1405
- kwargs = process_init_feature_param(args, kwargs, is_param=True)
1406
- super().__init__(*args, **kwargs)
1407
- dtype_str = kwargs.pop("dtype", None)
1408
- if not self._state.adding:
1409
- if not (
1410
- self.dtype.startswith("cat")
1411
- if dtype == "cat"
1412
- else self.dtype == dtype_str
1413
- ):
1414
- raise ValidationError(
1415
- f"Feature {self.name} already exists with dtype {self.dtype}, you passed {dtype_str}"
1416
- )
1417
-
1418
-
1419
- # FeatureValue behaves in many ways like a link in a LinkORM
1420
- # in particular, we don't want a _public field on it
1421
- # Also, we don't inherit from TracksRun because a ParamValue
1422
- # is typically created before a run is created and we want to
1423
- # avoid delete cycles (for Model params though it might be helpful)
1424
- class ParamValue(Record):
1425
- """Parameter values.
1426
-
1427
- Is largely analogous to `FeatureValue`.
1428
- """
1429
-
1430
- # we do not have a unique constraint on param & value because it leads to hashing errors
1431
- # for large dictionaries: https://lamin.ai/laminlabs/lamindata/transform/jgTrkoeuxAfs0000
1432
- # we do not hash values because we have `get_or_create` logic all over the place
1433
- # and also for checking whether the (param, value) combination exists
1434
- # there does not seem an issue with querying for a dict-like value
1435
- # https://lamin.ai/laminlabs/lamindata/transform/jgTrkoeuxAfs0001
1436
- _name_field: str = "value"
1437
-
1438
- param: Param = ForeignKey(Param, CASCADE, related_name="values")
1439
- """The dimension metadata."""
1440
- value: Any = (
1441
- models.JSONField()
1442
- ) # stores float, integer, boolean, datetime or dictionaries
1443
- """The JSON-like value."""
1444
- # it'd be confusing and hard to populate a run here because these
1445
- # values are typically created upon creating a run
1446
- # hence, ParamValue does _not_ inherit from TracksRun but manually
1447
- # adds created_at & created_by
1448
- # because ParamValue cannot be updated, we don't need updated_at
1449
- created_at: datetime = DateTimeField(
1450
- editable=False, db_default=models.functions.Now(), db_index=True
1451
- )
1452
- """Time of creation of record."""
1453
- created_by: User = ForeignKey(
1454
- User, PROTECT, default=current_user_id, related_name="+"
1455
- )
1456
- """Creator of record."""
1457
- hash: str = CharField(max_length=HASH_LENGTH, null=True, db_index=True)
1458
-
1459
- class Meta:
1460
- constraints = [
1461
- # For simple types, use direct value comparison
1462
- models.UniqueConstraint(
1463
- fields=["param", "value"],
1464
- name="unique_simple_param_value",
1465
- condition=Q(hash__isnull=True),
1466
- ),
1467
- # For complex types (dictionaries), use hash
1468
- models.UniqueConstraint(
1469
- fields=["param", "hash"],
1470
- name="unique_complex_param_value",
1471
- condition=Q(hash__isnull=False),
1472
- ),
1473
- ]
1474
-
1475
- @classmethod
1476
- def get_or_create(cls, param, value):
1477
- # Simple types: int, float, str, bool
1478
- if isinstance(value, (int, float, str, bool)):
1479
- try:
1480
- return cls.objects.create(param=param, value=value, hash=None), False
1481
- except IntegrityError:
1482
- return cls.objects.get(param=param, value=value), True
1483
-
1484
- # Complex types: dict, list
1485
- else:
1486
- hash = hash_dict(value)
1487
- try:
1488
- return cls.objects.create(param=param, value=value, hash=hash), False
1489
- except IntegrityError:
1490
- return cls.objects.get(param=param, hash=hash), True
1491
-
1492
-
1493
- class Run(Record):
1494
- """Runs of transforms.
1495
-
1496
- Args:
1497
- transform: `Transform` A :class:`~lamindb.Transform` record.
1498
- reference: `str | None = None` For instance, an external ID or a download URL.
1499
- reference_type: `str | None = None` For instance, `redun_id`, `nextflow_id` or `url`.
1500
-
1501
- See Also:
1502
- :meth:`~lamindb.core.Context.track`
1503
- Track global run & transform records for a notebook or pipeline.
1504
-
1505
- Examples:
1506
-
1507
- Create a run record:
1508
-
1509
- >>> ln.Transform(key="Cell Ranger", version="7.2.0", type="pipeline").save()
1510
- >>> transform = ln.Transform.get(key="Cell Ranger", version="7.2.0")
1511
- >>> run = ln.Run(transform)
1512
-
1513
- Create a global run context for a custom transform:
1514
-
1515
- >>> ln.track(transform=transform)
1516
- >>> ln.context.run # globally available run
1517
-
1518
- Track a global run context for a notebook or script:
1519
-
1520
- >>> ln.track() # Jupyter notebook metadata is automatically parsed
1521
- >>> ln.context.run
1522
- """
1523
-
1524
- _name_field: str = "started_at"
1525
-
1526
- params: ParamManager = ParamManagerRun # type: ignore
1527
- """Param manager.
1528
-
1529
- Guide: :ref:`track-run-parameters`
1530
-
1531
- Example::
1532
-
1533
- run.params.add_values({
1534
- "learning_rate": 0.01,
1535
- "input_dir": "s3://my-bucket/mydataset",
1536
- "downsample": True,
1537
- "preprocess_params": {
1538
- "normalization_type": "cool",
1539
- "subset_highlyvariable": True,
1540
- },
1541
- })
1542
- """
1543
-
1544
- id: int = models.BigAutoField(primary_key=True)
1545
- """Internal id, valid only in one DB instance."""
1546
- uid: str = CharField(
1547
- editable=False, unique=True, db_index=True, max_length=20, default=base62_20
1548
- )
1549
- """Universal id, valid across DB instances."""
1550
- name: str | None = CharField(max_length=150, null=True)
1551
- """A name."""
1552
- transform = ForeignKey(Transform, CASCADE, related_name="runs")
1553
- """The transform :class:`~lamindb.Transform` that is being run."""
1554
- started_at: datetime = DateTimeField(
1555
- editable=False, db_default=models.functions.Now(), db_index=True
1556
- )
1557
- """Start time of run."""
1558
- finished_at: datetime | None = DateTimeField(db_index=True, null=True, default=None)
1559
- """Finished time of run."""
1560
- # we don't want to make below a OneToOne because there could be the same trivial report
1561
- # generated for many different runs
1562
- report: Artifact | None = ForeignKey(
1563
- "Artifact", PROTECT, null=True, related_name="_report_of", default=None
1564
- )
1565
- """Report of run, e.g.. n html file."""
1566
- _logfile: Artifact | None = ForeignKey(
1567
- "Artifact", PROTECT, null=True, related_name="_logfile_of", default=None
1568
- )
1569
- """Report of run, e.g.. n html file."""
1570
- environment: Artifact | None = ForeignKey(
1571
- "Artifact", PROTECT, null=True, related_name="_environment_of", default=None
1572
- )
1573
- """Computational environment for the run.
1574
-
1575
- For instance, `Dockerfile`, `docker image`, `requirements.txt`, `environment.yml`, etc.
1576
- """
1577
- input_artifacts: Artifact
1578
- """The artifacts serving as input for this run.
1579
-
1580
- Related accessor: :attr:`~lamindb.Artifact.input_of_runs`.
1581
- """
1582
- output_artifacts: Artifact
1583
- """The artifacts generated by this run.
1584
-
1585
- Related accessor: via :attr:`~lamindb.Artifact.run`
1586
- """
1587
- input_collections: Collection
1588
- """The collections serving as input for this run."""
1589
- output_collections: Collection
1590
- """The collections generated by this run."""
1591
- _param_values: ParamValue = models.ManyToManyField(
1592
- ParamValue, through="RunParamValue", related_name="runs"
1593
- )
1594
- """Parameter values."""
1595
- reference: str | None = CharField(max_length=255, db_index=True, null=True)
1596
- """A reference like a URL or external ID (such as from a workflow manager)."""
1597
- reference_type: str | None = CharField(max_length=25, db_index=True, null=True)
1598
- """Type of reference such as a workflow manager execution ID."""
1599
- created_at: datetime = DateTimeField(
1600
- editable=False, db_default=models.functions.Now(), db_index=True
1601
- )
1602
- """Time of first creation. Mismatches ``started_at`` if the run is re-run."""
1603
- created_by: User = ForeignKey(
1604
- User, CASCADE, default=current_user_id, related_name="created_runs"
1605
- )
1606
- """Creator of run."""
1607
- ulabels: ULabel = models.ManyToManyField(
1608
- "ULabel", through="RunULabel", related_name="runs"
1609
- )
1610
- """ULabel annotations of this transform."""
1611
- initiated_by_run: Run | None = ForeignKey(
1612
- "Run", CASCADE, null=True, related_name="initiated_runs", default=None
1613
- )
1614
- """The run that triggered the current run.
1615
-
1616
- This is not a preceding run. The preceding runs ("predecessors") is the set
1617
- of runs that produced the output artifacts that serve as the inputs for the
1618
- present run.
1619
-
1620
- Be careful with using this field at this point.
1621
- """
1622
- initiated_runs: Run
1623
- """Runs that were initiated by this run."""
1624
- _is_consecutive: bool | None = BooleanField(null=True)
1625
- """Indicates whether code was consecutively executed. Is relevant for notebooks."""
1626
- _status_code: int = models.SmallIntegerField(default=0, db_index=True)
1627
- """Status code of the run.
1628
-
1629
- - 0: scheduled
1630
- - 1: started
1631
- - 2: errored
1632
- - 3: aborted
1633
- - 4: completed
1634
- """
1635
-
1636
- @overload
1637
- def __init__(
1638
- self,
1639
- transform: Transform,
1640
- reference: str | None = None,
1641
- reference_type: str | None = None,
1642
- ): ...
1643
-
1644
- @overload
1645
- def __init__(
1646
- self,
1647
- *db_args,
1648
- ): ...
1649
-
1650
- def __init__(
1651
- self,
1652
- *args,
1653
- **kwargs,
1654
- ):
1655
- super().__init__(*args, **kwargs)
1656
-
1657
-
1658
- class ULabel(Record, HasParents, CanCurate, TracksRun, TracksUpdates):
1659
- """Universal labels.
1660
-
1661
- Args:
1662
- name: `str` A name.
1663
- description: `str` A description.
1664
- reference: `str | None = None` For instance, an external ID or a URL.
1665
- reference_type: `str | None = None` For instance, `"url"`.
1666
-
1667
- A `ULabel` record provides the easiest way to annotate a dataset
1668
- with a label: `"My project"`, `"curated"`, or `"Batch X"`:
1669
-
1670
- >>> my_project = ULabel(name="My project")
1671
- >>> my_project.save()
1672
- >>> artifact.ulabels.add(my_project)
1673
-
1674
- Often, a ulabel is measured *within* a dataset. For instance, an artifact
1675
- might characterize 2 species of the Iris flower (`"setosa"` &
1676
- `"versicolor"`) measured by a `"species"` feature. Use the
1677
- :class:`~lamindb.Curator` flow to automatically parse, validate, and
1678
- annotate with labels that are contained in `DataFrame` or `AnnData`
1679
- artifacts.
1680
-
1681
- .. note::
1682
-
1683
- If you work with complex entities like cell lines, cell types, tissues,
1684
- etc., consider using the pre-defined biological registries in
1685
- :mod:`bionty` to label artifacts & collections.
1686
-
1687
- If you work with biological samples, likely, the only sustainable way of
1688
- tracking metadata, is to create a custom schema module.
1689
-
1690
- See Also:
1691
- :meth:`~lamindb.Feature`
1692
- Dimensions of measurement for artifacts & collections.
1693
- :attr:`~lamindb.Artifact.features`
1694
- Feature manager for an artifact.
1695
-
1696
- Examples:
1697
-
1698
- Create a new label:
1699
-
1700
- >>> train_split = ln.ULabel(name="train").save()
1701
-
1702
- Organize labels in a hierarchy:
1703
-
1704
- >>> split_type = ln.ULabel(name="Split", is_type=True).save()
1705
- >>> train_split = ln.ULabel(name="train", type="split_type").save()
1706
-
1707
- Label an artifact:
1708
-
1709
- >>> artifact.ulabels.add(ulabel)
1710
-
1711
- Query by `ULabel`:
1712
-
1713
- >>> ln.Artifact.filter(ulabels=train_split)
1714
- """
1715
-
1716
- class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta):
1717
- abstract = False
1718
-
1719
- _name_field: str = "name"
1720
-
1721
- id: int = models.AutoField(primary_key=True)
1722
- """Internal id, valid only in one DB instance."""
1723
- uid: str = CharField(
1724
- editable=False, unique=True, db_index=True, max_length=8, default=base62_8
1725
- )
1726
- """A universal random id, valid across DB instances."""
1727
- name: str = CharField(max_length=150, db_index=True)
1728
- """Name or title of ulabel."""
1729
- type: ULabel | None = ForeignKey("self", PROTECT, null=True, related_name="records")
1730
- """Type of ulabel, e.g., `"donor"`, `"split"`, etc.
1731
-
1732
- Allows to group ulabels by type, e.g., all donors, all split ulabels, etc.
1733
- """
1734
- records: ULabel
1735
- """Records of this type."""
1736
- is_type: bool = BooleanField(default=False, db_index=True, null=True)
1737
- """Distinguish types from instances of the type.
1738
-
1739
- For example, a ulabel "Project" would be a type, and the actual projects "Project 1", "Project 2", would be records of that `type`.
1740
- """
1741
- description: str | None = CharField(null=True, db_index=True)
1742
- """A description (optional)."""
1743
- reference: str | None = CharField(max_length=255, db_index=True, null=True)
1744
- """A reference like URL or external ID."""
1745
- reference_type: str | None = CharField(max_length=25, db_index=True, null=True)
1746
- """Type of reference such as a donor_id from Vendor X."""
1747
- parents: ULabel = models.ManyToManyField(
1748
- "self", symmetrical=False, related_name="children"
1749
- )
1750
- """Parent entities of this ulabel.
1751
-
1752
- For advanced use cases, you can build an ontology under a given `type`.
1753
-
1754
- Say, if you modeled `CellType` as a `ULabel`, you would introduce a type `CellType` and model the hiearchy of cell types under it.
1755
- """
1756
- children: ULabel
1757
- """Child entities of this ulabel.
1758
-
1759
- Reverse accessor for parents.
1760
- """
1761
- transforms: Transform
1762
- """Transforms annotated with this ulabel."""
1763
- runs: Transform
1764
- """Runs annotated with this ulabel."""
1765
- artifacts: Artifact
1766
- """Artifacts annotated with this ulabel."""
1767
- collections: Collection
1768
- """Collections annotated with this ulabel."""
1769
- projects: Project
1770
- """Associated projects."""
1771
-
1772
- @overload
1773
- def __init__(
1774
- self,
1775
- name: str,
1776
- type: ULabel | None = None,
1777
- is_type: bool = False,
1778
- description: str | None = None,
1779
- reference: str | None = None,
1780
- reference_type: str | None = None,
1781
- ): ...
1782
-
1783
- @overload
1784
- def __init__(
1785
- self,
1786
- *db_args,
1787
- ): ...
1788
-
1789
- def __init__(
1790
- self,
1791
- *args,
1792
- **kwargs,
1793
- ):
1794
- pass
1795
-
1796
-
1797
- class Feature(Record, CanCurate, TracksRun, TracksUpdates):
1798
- """Dataset dimensions.
1799
-
1800
- A feature represents a dimension of a dataset, such as a column in a
1801
- `DataFrame`. The `Feature` registry organizes metadata of features.
1802
-
1803
- The `Feature` registry helps you organize and query datasets based on their
1804
- features and corresponding label annotations. For instance, when working
1805
- with a "T cell" label, it could be measured through different features
1806
- such as `"cell_type_by_expert"` where an expert manually classified the
1807
- cell, or `"cell_type_by_model"` where a computational model made the
1808
- classification.
1809
-
1810
- The two most important metadata of a feature are its `name` and the `dtype`.
1811
- In addition to typical data types, LaminDB has a `"num"` `dtype` to
1812
- concisely denote the union of all numerical types.
1813
-
1814
- Args:
1815
- name: `str` Name of the feature, typically. column name.
1816
- dtype: `FeatureDtype | Registry | list[Registry] | FieldAttr` See :class:`~lamindb.base.types.FeatureDtype`.
1817
- For categorical types, can define from which registry values are
1818
- sampled, e.g., `ULabel` or `[ULabel, bionty.CellType]`.
1819
- unit: `str | None = None` Unit of measure, ideally SI (`"m"`, `"s"`, `"kg"`, etc.) or `"normalized"` etc.
1820
- description: `str | None = None` A description.
1821
- synonyms: `str | None = None` Bar-separated synonyms.
1822
- nullable: `bool = True` Whether the feature can have null-like values (`None`, `pd.NA`, `NaN`, etc.), see :attr:`~lamindb.Feature.nullable`.
1823
- default_value: `Any | None = None` Default value for the feature.
1824
- cat_filters: `dict[str, str] | None = None` Subset a registry by additional filters to define valid categories.
1825
-
1826
- Note:
1827
-
1828
- For more control, you can use :mod:`bionty` registries to manage simple
1829
- biological entities like genes, proteins & cell markers. Or you define
1830
- custom registries to manage high-level derived features like gene sets.
1831
-
1832
- See Also:
1833
- :meth:`~lamindb.Feature.from_df`
1834
- Create feature records from DataFrame.
1835
- :attr:`~lamindb.Artifact.features`
1836
- Feature manager of an artifact or collection.
1837
- :class:`~lamindb.ULabel`
1838
- Universal labels.
1839
- :class:`~lamindb.Schema`
1840
- Feature sets.
1841
-
1842
- Example:
1843
-
1844
- A simple `"str"` feature.
1845
-
1846
- >>> ln.Feature(
1847
- ... name="sample_note",
1848
- ... dtype="str",
1849
- ... ).save()
1850
-
1851
- A dtype `"cat[ULabel]"` can be more easily passed as below.
1852
-
1853
- >>> ln.Feature(
1854
- ... name="project",
1855
- ... dtype=ln.ULabel,
1856
- ... ).save()
1857
-
1858
- A dtype `"cat[ULabel|bionty.CellType]"` can be more easily passed as below.
1859
-
1860
- >>> ln.Feature(
1861
- ... name="cell_type",
1862
- ... dtype=[ln.ULabel, bt.CellType],
1863
- ... ).save()
1864
-
1865
- Hint:
1866
-
1867
- *Features* and *labels* denote two ways of using entities to organize data:
1868
-
1869
- 1. A feature qualifies *what* is measured, i.e., a numerical or categorical random variable
1870
- 2. A label *is* a measured value, i.e., a category
1871
-
1872
- Consider annotating a dataset by that it measured expression of 30k
1873
- genes: genes relate to the dataset as feature identifiers through a
1874
- feature set with 30k members. Now consider annotating the artifact by
1875
- whether that it measured the knock-out of 3 genes: here, the 3 genes act
1876
- as labels of the dataset.
1877
-
1878
- Re-shaping data can introduce ambiguity among features & labels. If this
1879
- happened, ask yourself what the joint measurement was: a feature
1880
- qualifies variables in a joint measurement. The canonical data matrix
1881
- lists jointly measured variables in the columns.
1882
-
1883
- """
1884
-
1885
- class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta):
1886
- abstract = False
1887
-
1888
- _name_field: str = "name"
1889
- _aux_fields: dict[str, tuple[str, type]] = {
1890
- "0": ("default_value", bool),
1891
- "1": ("nullable", bool),
1892
- }
1893
-
1894
- id: int = models.AutoField(primary_key=True)
1895
- """Internal id, valid only in one DB instance."""
1896
- uid: str = CharField(
1897
- editable=False, unique=True, db_index=True, max_length=12, default=base62_12
1898
- )
1899
- """Universal id, valid across DB instances."""
1900
- name: str = CharField(max_length=150, db_index=True, unique=True)
1901
- """Name of feature (hard unique constraint `unique=True`)."""
1902
- dtype: FeatureDtype | None = CharField(db_index=True, null=True)
1903
- """Data type (:class:`~lamindb.base.types.FeatureDtype`).
1904
-
1905
- For categorical types, can define from which registry values are
1906
- sampled, e.g., `'cat[ULabel]'` or `'cat[bionty.CellType]'`. Unions are also
1907
- allowed if the feature samples from two registries, e.g., `'cat[ULabel|bionty.CellType]'`
1908
- """
1909
- type: Feature | None = ForeignKey(
1910
- "self", PROTECT, null=True, related_name="records"
1911
- )
1912
- """Type of feature (e.g., 'Readout', 'Metric', 'Metadata', 'ExpertAnnotation', 'ModelPrediction').
1913
-
1914
- Allows to group features by type, e.g., all read outs, all metrics, etc.
1915
- """
1916
- records: Feature
1917
- """Records of this type."""
1918
- is_type: bool = BooleanField(default=False, db_index=True, null=True)
1919
- """Distinguish types from instances of the type."""
1920
- unit: str | None = CharField(max_length=30, db_index=True, null=True)
1921
- """Unit of measure, ideally SI (`m`, `s`, `kg`, etc.) or 'normalized' etc. (optional)."""
1922
- description: str | None = CharField(db_index=True, null=True)
1923
- """A description."""
1924
- array_rank: int = models.SmallIntegerField(default=0, db_index=True)
1925
- """Rank of feature.
1926
-
1927
- Number of indices of the array: 0 for scalar, 1 for vector, 2 for matrix.
1928
-
1929
- Is called `.ndim` in `numpy` and `pytorch` but shouldn't be confused with
1930
- the dimension of the feature space.
1931
- """
1932
- array_size: int = models.IntegerField(default=0, db_index=True)
1933
- """Number of elements of the feature.
1934
-
1935
- Total number of elements (product of shape components) of the array.
1936
-
1937
- - A number or string (a scalar): 1
1938
- - A 50-dimensional embedding: 50
1939
- - A 25 x 25 image: 625
1940
- """
1941
- array_shape: list[int] | None = JSONField(default=None, db_default=None, null=True)
1942
- """Shape of the feature.
1943
-
1944
- - A number or string (a scalar): [1]
1945
- - A 50-dimensional embedding: [50]
1946
- - A 25 x 25 image: [25, 25]
1947
-
1948
- Is stored as a list rather than a tuple because it's serialized as JSON.
1949
- """
1950
- proxy_dtype: FeatureDtype | None = CharField(default=None, null=True)
1951
- """Proxy data type.
1952
-
1953
- If the feature is an image it's often stored via a path to the image file. Hence, while the dtype might be
1954
- image with a certain shape, the proxy dtype would be str.
1955
- """
1956
- synonyms: str | None = TextField(null=True)
1957
- """Bar-separated (|) synonyms (optional)."""
1958
- # we define the below ManyToMany on the feature model because it parallels
1959
- # how other registries (like Gene, Protein, etc.) relate to Schema
1960
- # it makes the API more consistent
1961
- schemas: Schema = models.ManyToManyField(
1962
- "Schema", through="SchemaFeature", related_name="features"
1963
- )
1964
- """Feature sets linked to this feature."""
1965
- _expect_many: bool = models.BooleanField(default=True, db_default=True)
1966
- """Indicates whether values for this feature are expected to occur a single or multiple times for an artifact (default `True`).
1967
-
1968
- - if it's `True` (default), the values come from an observation-level aggregation and a dtype of `datetime` on the observation-level mean `set[datetime]` on the artifact-level
1969
- - if it's `False` it's an artifact-level value and datetime means datetime; this is an edge case because an arbitrary artifact would always be a set of arbitrary measurements that would need to be aggregated ("one just happens to measure a single cell line in that artifact")
1970
- """
1971
- _curation: dict[str, Any] = JSONField(default=None, db_default=None, null=True)
1972
- # backward fields
1973
- values: FeatureValue
1974
- """Values for this feature."""
1975
-
1976
- @overload
1977
- def __init__(
1978
- self,
1979
- name: str,
1980
- dtype: FeatureDtype | Registry | list[Registry] | FieldAttr,
1981
- type: Feature | None = None,
1982
- is_type: bool = False,
1983
- unit: str | None = None,
1984
- description: str | None = None,
1985
- synonyms: str | None = None,
1986
- nullable: bool = True,
1987
- default_value: str | None = None,
1988
- cat_filters: dict[str, str] | None = None,
1989
- ): ...
1990
-
1991
- @overload
1992
- def __init__(
1993
- self,
1994
- *db_args,
1995
- ): ...
1996
-
1997
- def __init__(
1998
- self,
1999
- *args,
2000
- **kwargs,
2001
- ):
2002
- pass
2003
-
2004
- @classmethod
2005
- def from_df(cls, df: pd.DataFrame, field: FieldAttr | None = None) -> RecordList:
2006
- """Create Feature records for columns."""
2007
- pass
2008
-
2009
- def save(self, *args, **kwargs) -> Feature:
2010
- """Save."""
2011
- pass
2012
-
2013
- @property
2014
- def default_value(self) -> Any:
2015
- """A default value that overwrites missing values (default `None`).
2016
-
2017
- This takes effect when you call `Curator.standardize()`.
2018
- """
2019
- if self._aux is not None and "af" in self._aux and "0" in self._aux["af"]:
2020
- return self._aux["af"]["0"]
2021
- else:
2022
- return None
2023
-
2024
- @default_value.setter
2025
- def default_value(self, value: bool) -> None:
2026
- if self._aux is None:
2027
- self._aux = {}
2028
- if "af" not in self._aux:
2029
- self._aux["af"] = {}
2030
- self._aux["af"]["0"] = value
2031
-
2032
- @property
2033
- def nullable(self) -> bool:
2034
- """Indicates whether the feature can have nullable values (default `True`).
2035
-
2036
- Example::
2037
-
2038
- import lamindb as ln
2039
- import pandas as pd
2040
-
2041
- disease = ln.Feature(name="disease", dtype=ln.ULabel, nullable=False).save()
2042
- schema = ln.Schema(features=[disease]).save()
2043
- dataset = {"disease": pd.Categorical([pd.NA, "asthma"])}
2044
- df = pd.DataFrame(dataset)
2045
- curator = ln.curators.DataFrameCurator(df, schema)
2046
- try:
2047
- curator.validate()
2048
- except ln.errors.ValidationError as e:
2049
- assert str(e).startswith("non-nullable series 'disease' contains null values")
2050
-
2051
- """
2052
- if self._aux is not None and "af" in self._aux and "1" in self._aux["af"]:
2053
- return self._aux["af"]["1"]
2054
- else:
2055
- return True
2056
-
2057
- @nullable.setter
2058
- def nullable(self, value: bool) -> None:
2059
- if self._aux is None:
2060
- self._aux = {}
2061
- if "af" not in self._aux:
2062
- self._aux["af"] = {}
2063
- self._aux["af"]["1"] = value
2064
-
2065
-
2066
- class FeatureValue(Record, TracksRun):
2067
- """Non-categorical features values.
2068
-
2069
- Categorical feature values are stored in their respective registries:
2070
- :class:`~lamindb.ULabel`, :class:`~bionty.CellType`, etc.
2071
-
2072
- Unlike for ULabel, in `FeatureValue`, values are grouped by features and
2073
- not by an ontological hierarchy.
2074
- """
2075
-
2076
- # we do not have a unique constraint on feature & value because it leads to hashing errors
2077
- # for large dictionaries: https://lamin.ai/laminlabs/lamindata/transform/jgTrkoeuxAfs0000
2078
- # we do not hash values because we have `get_or_create` logic all over the place
2079
- # and also for checking whether the (feature, value) combination exists
2080
- # there does not seem an issue with querying for a dict-like value
2081
- # https://lamin.ai/laminlabs/lamindata/transform/jgTrkoeuxAfs0001
2082
-
2083
- _name_field: str = "value"
2084
-
2085
- feature: Feature | None = ForeignKey(
2086
- Feature, CASCADE, null=True, related_name="values", default=None
2087
- )
2088
- """The dimension metadata."""
2089
- value: Any = models.JSONField()
2090
- """The JSON-like value."""
2091
- hash: str = CharField(max_length=HASH_LENGTH, null=True, db_index=True)
2092
- """Value hash."""
2093
-
2094
- class Meta(BasicRecord.Meta, TracksRun.Meta):
2095
- constraints = [
2096
- # For simple types, use direct value comparison
2097
- models.UniqueConstraint(
2098
- fields=["feature", "value"],
2099
- name="unique_simple_feature_value",
2100
- condition=Q(hash__isnull=True),
2101
- ),
2102
- # For complex types (dictionaries), use hash
2103
- models.UniqueConstraint(
2104
- fields=["feature", "hash"],
2105
- name="unique_complex_feature_value",
2106
- condition=Q(hash__isnull=False),
2107
- ),
2108
- ]
2109
-
2110
- @classmethod
2111
- def get_or_create(cls, feature, value):
2112
- # Simple types: int, float, str, bool
2113
- if isinstance(value, (int, float, str, bool)):
2114
- try:
2115
- return (
2116
- cls.objects.create(feature=feature, value=value, hash=None),
2117
- False,
2118
- )
2119
- except IntegrityError:
2120
- return cls.objects.get(feature=feature, value=value), True
2121
-
2122
- # Complex types: dict, list
2123
- else:
2124
- hash = hash_dict(value)
2125
- try:
2126
- return (
2127
- cls.objects.create(feature=feature, value=value, hash=hash),
2128
- False,
2129
- )
2130
- except IntegrityError:
2131
- return cls.objects.get(feature=feature, hash=hash), True
2132
-
2133
-
2134
- class Schema(Record, CanCurate, TracksRun):
2135
- """Schemas / feature sets.
2136
-
2137
- Stores references to dataset schemas: these are the sets of columns in a dataset
2138
- that correspond to :class:`~lamindb.Feature`, :class:`~bionty.Gene`, :class:`~bionty.Protein` or other
2139
- entities.
2140
-
2141
- .. dropdown:: Why does LaminDB model feature sets, not just features?
2142
-
2143
- 1. Performance: Imagine you measure the same panel of 20k transcripts in
2144
- 1M samples. By modeling the panel as a feature set, you can link all
2145
- your artifacts against one feature set and only need to store 1M
2146
- instead of 1M x 20k = 20B links.
2147
- 2. Interpretation: Model protein panels, gene panels, etc.
2148
- 3. Data integration: Feature sets provide the information that determines whether two datasets can be meaningfully concatenated.
2149
-
2150
- These reasons do not hold for label sets. Hence, LaminDB does not model label sets.
2151
-
2152
- Args:
2153
- features: `Iterable[Record] | None = None` An iterable of :class:`~lamindb.Feature`
2154
- records to hash, e.g., `[Feature(...), Feature(...)]`. Is turned into
2155
- a set upon instantiation. If you'd like to pass values, use
2156
- :meth:`~lamindb.Schema.from_values` or
2157
- :meth:`~lamindb.Schema.from_df`.
2158
- components: `dict[str, Schema] | None = None` A dictionary mapping component names to
2159
- their corresponding :class:`~lamindb.Schema` objects for composite schemas.
2160
- name: `str | None = None` A name.
2161
- description: `str | None = None` A description.
2162
- dtype: `str | None = None` The simple type. Defaults to
2163
- `None` for sets of :class:`~lamindb.Feature` records.
2164
- Otherwise defaults to `"num"` (e.g., for sets of :class:`~bionty.Gene`).
2165
- itype: `str | None = None` The schema identifier type (e.g. :class:`~lamindb.Feature`, :class:`~bionty.Gene`, ...).
2166
- type: `Schema | None = None` A type.
2167
- is_type: `bool = False` Distinguish types from instances of the type.
2168
- otype: `str | None = None` An object type to define the structure of a composite schema.
2169
- minimal_set: `bool = True` Whether the schema contains a minimal set of linked features.
2170
- ordered_set: `bool = False` Whether features are required to be ordered.
2171
- maximal_set: `bool = False` If `True`, no additional features are allowed.
2172
- slot: `str | None = None` The slot name when this schema is used as a component in a
2173
- composite schema.
2174
- coerce_dtype: `bool = False` When True, attempts to coerce values to the specified dtype
2175
- during validation, see :attr:`~lamindb.Schema.coerce_dtype`.
2176
-
2177
- Note:
2178
-
2179
- A feature set can be identified by the `hash` of its feature uids.
2180
- It's stored in the `.hash` field.
2181
-
2182
- A `slot` provides a string key to access feature sets. For instance, for the schema of an
2183
- `AnnData` object, it would be `'obs'` for `adata.obs`.
2184
-
2185
- See Also:
2186
- :meth:`~lamindb.Schema.from_values`
2187
- Create from values.
2188
- :meth:`~lamindb.Schema.from_df`
2189
- Create from dataframe columns.
2190
-
2191
- Examples:
2192
-
2193
- Create a schema (feature set) from df with types:
2194
-
2195
- >>> df = pd.DataFrame({"feat1": [1, 2], "feat2": [3.1, 4.2], "feat3": ["cond1", "cond2"]})
2196
- >>> schema = ln.Schema.from_df(df)
2197
-
2198
- Create a schema (feature set) from features:
2199
-
2200
- >>> features = [ln.Feature(name=feat, dtype="float").save() for feat in ["feat1", "feat2"]]
2201
- >>> schema = ln.Schema(features)
2202
-
2203
- Create a schema (feature set) from identifier values:
2204
-
2205
- >>> import bionty as bt
2206
- >>> schema = ln.Schema.from_values(adata.var["ensemble_id"], Gene.ensembl_gene_id, organism="mouse").save()
2207
-
2208
- """
2209
-
2210
- class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta):
2211
- abstract = False
2212
-
2213
- _name_field: str = "name"
2214
- _aux_fields: dict[str, tuple[str, type]] = {"0": ("coerce_dtype", bool)}
2215
-
2216
- id: int = models.AutoField(primary_key=True)
2217
- """Internal id, valid only in one DB instance."""
2218
- uid: str = CharField(editable=False, unique=True, db_index=True, max_length=20)
2219
- """A universal id (hash of the set of feature values)."""
2220
- name: str | None = CharField(max_length=150, null=True, db_index=True)
2221
- """A name."""
2222
- description: str | None = CharField(null=True, db_index=True)
2223
- """A description."""
2224
- n = IntegerField()
2225
- """Number of features in the set."""
2226
- dtype: str | None = CharField(max_length=64, null=True, editable=False)
2227
- """Data type, e.g., "num", "float", "int". Is `None` for :class:`~lamindb.Feature`.
2228
-
2229
- For :class:`~lamindb.Feature`, types are expected to be heterogeneous and defined on a per-feature level.
2230
- """
2231
- itype: str | None = CharField(
2232
- max_length=120, db_index=True, null=True, editable=False
2233
- )
2234
- """A registry that stores feature identifiers used in this schema, e.g., `'Feature'` or `'bionty.Gene'`.
2235
-
2236
- Depending on the registry, `.members` stores, e.g., `Feature` or `bionty.Gene` records.
2237
-
2238
- .. versionchanged:: 1.0.0
2239
- Was called `registry` before.
2240
- """
2241
- type: Schema | None = ForeignKey("self", PROTECT, null=True, related_name="records")
2242
- """Type of schema.
2243
-
2244
- Allows to group schemas by type, e.g., all meassurements evaluating gene expression vs. protein expression vs. multi modal.
2245
-
2246
- You can define types via `ln.Schema(name="ProteinPanel", is_type=True)`.
2247
-
2248
- Here are a few more examples for type names: `'ExpressionPanel'`, `'ProteinPanel'`, `'Multimodal'`, `'Metadata'`, `'Embedding'`.
2249
- """
2250
- records: Schema
2251
- """Records of this type."""
2252
- is_type: bool = BooleanField(default=False, db_index=True, null=True)
2253
- """Distinguish types from instances of the type."""
2254
- otype: str | None = CharField(max_length=64, db_index=True, null=True)
2255
- """Default Python object type, e.g., DataFrame, AnnData."""
2256
- hash: str | None = CharField(
2257
- max_length=HASH_LENGTH, db_index=True, null=True, editable=False
2258
- )
2259
- """A hash of the set of feature identifiers.
2260
-
2261
- For a composite schema, the hash of hashes.
2262
- """
2263
- minimal_set: bool = BooleanField(default=True, db_index=True, editable=False)
2264
- """Whether the schema contains a minimal set of linked features (default `True`).
2265
-
2266
- If `False`, no features are linked to this schema.
2267
-
2268
- If `True`, features are linked and considered as a minimally required set in validation.
2269
- """
2270
- ordered_set: bool = BooleanField(default=False, db_index=True, editable=False)
2271
- """Whether features are required to be ordered (default `False`)."""
2272
- maximal_set: bool = BooleanField(default=False, db_index=True, editable=False)
2273
- """If `False`, additional features are allowed (default `False`).
2274
-
2275
- If `True`, the the minimal set is a maximal set and no additional features are allowed.
2276
- """
2277
- components: Schema = ManyToManyField(
2278
- "self", through="SchemaComponent", symmetrical=False, related_name="composites"
2279
- )
2280
- """Components of this schema."""
2281
- composites: Schema
2282
- """The composite schemas that contains this schema as a component.
2283
-
2284
- For example, an `AnnData` composes multiple schemas: `var[DataFrameT]`, `obs[DataFrame]`, `obsm[Array]`, `uns[dict]`, etc.
2285
- """
2286
- features: Feature
2287
- """The features contained in the schema."""
2288
- params: Param
2289
- """The params contained in the schema."""
2290
- artifacts: Artifact
2291
- """The artifacts that measure a feature set that matches this schema."""
2292
- validated_artifacts: Artifact
2293
- """The artifacts that were validated against this schema with a :class:`~lamindb.curators.Curator`."""
2294
- projects: Project
2295
- """Associated projects."""
2296
- _curation: dict[str, Any] = JSONField(default=None, db_default=None, null=True)
2297
- # lamindb v2
2298
- # _itype: ContentType = models.ForeignKey(ContentType, on_delete=models.CASCADE)
2299
- # ""Index of the registry that stores the feature identifiers, e.g., `Feature` or `Gene`."""
2300
- # -- the following two fields are dynamically removed from the API for now
2301
- validated_by: Schema | None = ForeignKey(
2302
- "self", PROTECT, related_name="validated_schemas", default=None, null=True
2303
- )
2304
- # """The schema that validated this schema during curation.
2305
-
2306
- # When performing validation, the schema that enforced validation is often less concrete than what is validated.
2307
-
2308
- # For instance, the set of measured features might be a superset of the minimally required set of features.
2309
- # """
2310
- # validated_schemas: Schema
2311
- # """The schemas that were validated against this schema with a :class:`~lamindb.curators.Curator`."""
2312
- composite: Schema | None = ForeignKey(
2313
- "self", PROTECT, related_name="+", default=None, null=True
2314
- )
2315
- # The legacy foreign key
2316
- slot: str | None = CharField(max_length=100, db_index=True, null=True)
2317
- # The legacy slot
2318
-
2319
- @overload
2320
- def __init__(
2321
- self,
2322
- features: Iterable[Record] | None = None,
2323
- components: dict[str, Schema] | None = None,
2324
- name: str | None = None,
2325
- description: str | None = None,
2326
- dtype: str | None = None,
2327
- itype: str | Registry | FieldAttr | None = None,
2328
- type: Schema | None = None,
2329
- is_type: bool = False,
2330
- otype: str | None = None,
2331
- minimal_set: bool = True,
2332
- ordered_set: bool = False,
2333
- maximal_set: bool = False,
2334
- slot: str | None = None,
2335
- coerce_dtype: bool = False,
2336
- ): ...
2337
-
2338
- @overload
2339
- def __init__(
2340
- self,
2341
- *db_args,
2342
- ): ...
2343
-
2344
- def __init__(
2345
- self,
2346
- *args,
2347
- **kwargs,
2348
- ):
2349
- pass
2350
-
2351
- @classmethod
2352
- def from_values( # type: ignore
2353
- cls,
2354
- values: ListLike,
2355
- field: FieldAttr = Feature.name,
2356
- type: str | None = None,
2357
- name: str | None = None,
2358
- mute: bool = False,
2359
- organism: Record | str | None = None,
2360
- source: Record | None = None,
2361
- raise_validation_error: bool = True,
2362
- ) -> Schema:
2363
- """Create feature set for validated features.
2364
-
2365
- Args:
2366
- values: A list of values, like feature names or ids.
2367
- field: The field of a reference registry to map values.
2368
- type: The simple type.
2369
- Defaults to `None` if reference registry is :class:`~lamindb.Feature`,
2370
- defaults to `"float"` otherwise.
2371
- name: A name.
2372
- organism: An organism to resolve gene mapping.
2373
- source: A public ontology to resolve feature identifier mapping.
2374
- raise_validation_error: Whether to raise a validation error if some values are not valid.
2375
-
2376
- Raises:
2377
- ValidationError: If some values are not valid.
2378
-
2379
- Examples:
2380
-
2381
- >>> features = [ln.Feature(name=feat, dtype="str").save() for feat in ["feat11", "feat21"]]
2382
- >>> schema = ln.Schema.from_values(features)
2383
-
2384
- >>> genes = ["ENSG00000139618", "ENSG00000198786"]
2385
- >>> schema = ln.Schema.from_values(features, bt.Gene.ensembl_gene_id, "float")
2386
- """
2387
- pass
2388
-
2389
- @classmethod
2390
- def from_df(
2391
- cls,
2392
- df: pd.DataFrame,
2393
- field: FieldAttr = Feature.name,
2394
- name: str | None = None,
2395
- mute: bool = False,
2396
- organism: Record | str | None = None,
2397
- source: Record | None = None,
2398
- ) -> Schema | None:
2399
- """Create feature set for validated features."""
2400
- pass
2401
-
2402
- def save(self, *args, **kwargs) -> Schema:
2403
- """Save."""
2404
- pass
2405
-
2406
- @property
2407
- def members(self) -> QuerySet:
2408
- """A queryset for the individual records of the set."""
2409
- pass
2410
-
2411
- @property
2412
- def coerce_dtype(self) -> bool:
2413
- """Whether dtypes should be coerced during validation.
2414
-
2415
- For example, a `objects`-dtyped pandas column can be coerced to `categorical` and would pass validation if this is true.
2416
- """
2417
- if self._aux is not None and "af" in self._aux and "0" in self._aux["af"]:
2418
- return self._aux["af"]["0"]
2419
- else:
2420
- return False
2421
-
2422
- @coerce_dtype.setter
2423
- def coerce_dtype(self, value: bool) -> None:
2424
- if self._aux is None:
2425
- self._aux = {}
2426
- if "af" not in self._aux:
2427
- self._aux["af"] = {}
2428
- self._aux["af"]["0"] = value
2429
-
2430
- @property
2431
- @deprecated("itype")
2432
- def registry(self) -> str:
2433
- return self.itype
2434
-
2435
- @registry.setter
2436
- def registry(self, value) -> None:
2437
- self.itype = value
2438
-
2439
- def describe(self, return_str=False) -> None | str:
2440
- """Describe schema."""
2441
- message = str(self) + "\ncomponents:"
2442
- for component in self.components.all():
2443
- message += "\n " + str(component)
2444
- if return_str:
2445
- return message
2446
- else:
2447
- print(message)
2448
- return None
2449
-
2450
- def _get_component(self, slot: str) -> Schema:
2451
- return self.components.get(links_component__slot=slot)
2452
-
2453
-
2454
- class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2455
- # Note that this docstring has to be consistent with Curator.save_artifact()
2456
- """Datasets & models stored as files, folders, or arrays.
2457
-
2458
- Artifacts manage data in local or remote storage.
2459
-
2460
- Some artifacts are array-like, e.g., when stored as `.parquet`, `.h5ad`,
2461
- `.zarr`, or `.tiledb`.
2462
-
2463
- Args:
2464
- data: `UPathStr` A path to a local or remote folder or file.
2465
- kind: `Literal["dataset", "model"] | None = None` Distinguish models from datasets from other files & folders.
2466
- key: `str | None = None` A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a version family.
2467
- description: `str | None = None` A description.
2468
- revises: `Artifact | None = None` Previous version of the artifact. Is an alternative way to passing `key` to trigger a new version.
2469
- run: `Run | None = None` The run that creates the artifact.
2470
-
2471
- .. dropdown:: Typical storage formats & their API accessors
2472
-
2473
- Arrays:
2474
-
2475
- - Table: `.csv`, `.tsv`, `.parquet`, `.ipc` ⟷ `DataFrame`, `pyarrow.Table`
2476
- - Annotated matrix: `.h5ad`, `.h5mu`, `.zrad` ⟷ `AnnData`, `MuData`
2477
- - Generic array: HDF5 group, zarr group, TileDB store ⟷ HDF5, zarr, TileDB loaders
2478
-
2479
- Non-arrays:
2480
-
2481
- - Image: `.jpg`, `.png` ⟷ `np.ndarray`, ...
2482
- - Fastq: `.fastq` ⟷ /
2483
- - VCF: `.vcf` ⟷ /
2484
- - QC: `.html` ⟷ /
2485
-
2486
- You'll find these values in the `suffix` & `accessor` fields.
2487
-
2488
- LaminDB makes some default choices (e.g., serialize a `DataFrame` as a `.parquet` file).
2489
-
2490
- See Also:
2491
- :class:`~lamindb.Storage`
2492
- Storage locations for artifacts.
2493
- :class:`~lamindb.Collection`
2494
- Collections of artifacts.
2495
- :meth:`~lamindb.Artifact.from_df`
2496
- Create an artifact from a `DataFrame`.
2497
- :meth:`~lamindb.Artifact.from_anndata`
2498
- Create an artifact from an `AnnData`.
2499
-
2500
- Examples:
2501
-
2502
- Create an artifact by passing `key`:
2503
-
2504
- >>> artifact = ln.Artifact("./my_file.parquet", key="example_datasets/my_file.parquet").save()
2505
- >>> artifact = ln.Artifact("./my_folder", key="project1/my_folder").save()
2506
-
2507
- Calling `.save()` uploads the file to the default storage location of your lamindb instance.
2508
- (If it's a local instance, the "upload" is a mere copy operation.)
2509
-
2510
- If your artifact is already in the cloud, lamindb auto-populates the `key` field based on the S3 key and there is no upload:
2511
-
2512
- >>> artifact = ln.Artifact("s3://my_bucket/my_folder/my_file.csv").save()
2513
-
2514
- You can make a new version of the artifact with `key = "example_datasets/my_file.parquet"`
2515
-
2516
- >>> artifact_v2 = ln.Artifact("./my_file.parquet", key="example_datasets/my_file.parquet").save()
2517
- >>> artifact_v2.versions.df() # see all versions
2518
-
2519
- .. dropdown:: Why does the API look this way?
2520
-
2521
- It's inspired by APIs building on AWS S3.
2522
-
2523
- Both boto3 and quilt select a bucket (a storage location in LaminDB) and define a target path through a `key` argument.
2524
-
2525
- In `boto3 <https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/bucket/upload_file.html>`__::
2526
-
2527
- # signature: S3.Bucket.upload_file(filepath, key)
2528
- import boto3
2529
- s3 = boto3.resource('s3')
2530
- bucket = s3.Bucket('mybucket')
2531
- bucket.upload_file('/tmp/hello.txt', 'hello.txt')
2532
-
2533
- In `quilt3 <https://docs.quiltdata.com/api-reference/bucket>`__::
2534
-
2535
- # signature: quilt3.Bucket.put_file(key, filepath)
2536
- import quilt3
2537
- bucket = quilt3.Bucket('mybucket')
2538
- bucket.put_file('hello.txt', '/tmp/hello.txt')
2539
-
2540
- Sometimes you want to avoid mapping the artifact into a file hierarchy, and you can then _just_ populate `description` instead:
2541
-
2542
- >>> artifact = ln.Artifact("s3://my_bucket/my_folder", description="My folder").save()
2543
- >>> artifact = ln.Artifact("./my_local_folder", description="My local folder").save()
2544
-
2545
- Because you can then not use `key`-based versioning you have to pass `revises` to make a new artifact version:
2546
-
2547
- >>> artifact_v2 = ln.Artifact("./my_file.parquet", revises=old_artifact).save()
2548
-
2549
- If an artifact with the exact same hash already exists, `Artifact()` returns the existing artifact. In concurrent workloads where
2550
- the same artifact is created multiple times, `Artifact()` doesn't yet return the existing artifact but creates a new one; `.save()` however
2551
- detects the duplication and will return the existing artifact.
2552
-
2553
- """
2554
-
2555
- class Meta(Record.Meta, IsVersioned.Meta, TracksRun.Meta, TracksUpdates.Meta):
2556
- abstract = False
2557
-
2558
- _len_full_uid: int = 20
2559
- _len_stem_uid: int = 16
2560
-
2561
- params: ParamManager = ParamManagerArtifact # type: ignore
2562
- """Param manager.
2563
-
2564
- Example::
2565
-
2566
- artifact.params.add_values({
2567
- "hidden_size": 32,
2568
- "bottleneck_size": 16,
2569
- "batch_size": 32,
2570
- "preprocess_params": {
2571
- "normalization_type": "cool",
2572
- "subset_highlyvariable": True,
2573
- },
2574
- })
2575
- """
2576
-
2577
- features: FeatureManager = FeatureManager # type: ignore
2578
- """Feature manager.
2579
-
2580
- Features denote dataset dimensions, i.e., the variables that measure labels & numbers.
2581
-
2582
- Annotate with features & values::
2583
-
2584
- artifact.features.add_values({
2585
- "species": organism, # here, organism is an Organism record
2586
- "scientist": ['Barbara McClintock', 'Edgar Anderson'],
2587
- "temperature": 27.6,
2588
- "study": "Candidate marker study"
2589
- })
2590
-
2591
- Query for features & values::
2592
-
2593
- ln.Artifact.features.filter(scientist="Barbara McClintock")
2594
-
2595
- Features may or may not be part of the artifact content in storage. For
2596
- instance, the :class:`~lamindb.Curator` flow validates the columns of a
2597
- `DataFrame`-like artifact and annotates it with features corresponding to
2598
- these columns. `artifact.features.add_values`, by contrast, does not
2599
- validate the content of the artifact.
2600
- """
2601
-
2602
- @property
2603
- def labels(self) -> LabelManager:
2604
- """Label manager.
2605
-
2606
- To annotate with labels, you typically use the registry-specific accessors,
2607
- for instance :attr:`~lamindb.Artifact.ulabels`::
2608
-
2609
- candidate_marker_study = ln.ULabel(name="Candidate marker study").save()
2610
- artifact.ulabels.add(candidate_marker_study)
2611
-
2612
- Similarly, you query based on these accessors::
2613
-
2614
- ln.Artifact.filter(ulabels__name="Candidate marker study").all()
2615
-
2616
- Unlike the registry-specific accessors, the `.labels` accessor provides
2617
- a way of associating labels with features::
2618
-
2619
- study = ln.Feature(name="study", dtype="cat").save()
2620
- artifact.labels.add(candidate_marker_study, feature=study)
2621
-
2622
- Note that the above is equivalent to::
2623
-
2624
- artifact.features.add_values({"study": candidate_marker_study})
2625
- """
2626
- from lamindb.core._label_manager import LabelManager
2627
-
2628
- return LabelManager(self)
2629
-
2630
- id: int = models.AutoField(primary_key=True)
2631
- """Internal id, valid only in one DB instance."""
2632
- uid: str = CharField(
2633
- editable=False, unique=True, db_index=True, max_length=_len_full_uid
2634
- )
2635
- """A universal random id."""
2636
- key: str | None = CharField(db_index=True, null=True)
2637
- """A (virtual) relative file path within the artifact's storage location.
2638
-
2639
- Setting a `key` is useful to automatically group artifacts into a version family.
2640
-
2641
- LaminDB defaults to a virtual file path to make renaming of data in object storage easy.
2642
-
2643
- If you register existing files in a storage location, the `key` equals the
2644
- actual filepath on the underlying filesystem or object store.
2645
- """
2646
- description: str | None = CharField(db_index=True, null=True)
2647
- """A description."""
2648
- storage: Storage = ForeignKey(
2649
- Storage, PROTECT, related_name="artifacts", editable=False
2650
- )
2651
- """Storage location, e.g. an S3 or GCP bucket or a local directory."""
2652
- suffix: str = CharField(max_length=30, db_index=True, editable=False)
2653
- # Initially, we thought about having this be nullable to indicate folders
2654
- # But, for instance, .zarr is stored in a folder that ends with a .zarr suffix
2655
- """Path suffix or empty string if no canonical suffix exists.
2656
-
2657
- This is either a file suffix (`".csv"`, `".h5ad"`, etc.) or the empty string "".
2658
- """
2659
- kind: ArtifactKind | None = CharField(
2660
- max_length=20,
2661
- db_index=True,
2662
- null=True,
2663
- )
2664
- """:class:`~lamindb.base.types.ArtifactKind` (default `None`)."""
2665
- otype: str | None = CharField(
2666
- max_length=64, db_index=True, null=True, editable=False
2667
- )
2668
- """Default Python object type, e.g., DataFrame, AnnData."""
2669
- size: int | None = BigIntegerField(
2670
- null=True, db_index=True, default=None, editable=False
2671
- )
2672
- """Size in bytes.
2673
-
2674
- Examples: 1KB is 1e3 bytes, 1MB is 1e6, 1GB is 1e9, 1TB is 1e12 etc.
2675
- """
2676
- hash: str | None = CharField(
2677
- max_length=HASH_LENGTH, db_index=True, null=True, unique=True, editable=False
2678
- )
2679
- """Hash or pseudo-hash of artifact content.
2680
-
2681
- Useful to ascertain integrity and avoid duplication.
2682
- """
2683
- n_files: int | None = BigIntegerField(
2684
- null=True, db_index=True, default=None, editable=False
2685
- )
2686
- """Number of files for folder-like artifacts, `None` for file-like artifacts.
2687
-
2688
- Note that some arrays are also stored as folders, e.g., `.zarr` or `.tiledbsoma`.
2689
-
2690
- .. versionchanged:: 1.0
2691
- Renamed from `n_objects` to `n_files`.
2692
- """
2693
- n_observations: int | None = BigIntegerField(
2694
- null=True, db_index=True, default=None, editable=False
2695
- )
2696
- """Number of observations.
2697
-
2698
- Typically, this denotes the first array dimension.
2699
- """
2700
- _hash_type: str | None = CharField(
2701
- max_length=30, db_index=True, null=True, editable=False
2702
- )
2703
- """Type of hash."""
2704
- ulabels: ULabel = models.ManyToManyField(
2705
- ULabel, through="ArtifactULabel", related_name="artifacts"
2706
- )
2707
- """The ulabels measured in the artifact (:class:`~lamindb.ULabel`)."""
2708
- run: Run | None = ForeignKey(
2709
- Run,
2710
- PROTECT,
2711
- related_name="output_artifacts",
2712
- null=True,
2713
- default=None,
2714
- editable=False,
2715
- )
2716
- """Run that created the artifact."""
2717
- input_of_runs: Run = models.ManyToManyField(Run, related_name="input_artifacts")
2718
- """Runs that use this artifact as an input."""
2719
- # if the artifact is replicated or updated in a new run, we link the previous
2720
- # run in previous_runs
2721
- _previous_runs: Run = models.ManyToManyField(
2722
- "Run", related_name="_output_artifacts_with_later_updates"
2723
- )
2724
- """Sequence of runs that created or updated the record."""
2725
- collections: Collection
2726
- """The collections that this artifact is part of."""
2727
- schema: Schema | None = ForeignKey(
2728
- Schema,
2729
- PROTECT,
2730
- null=True,
2731
- default=None,
2732
- related_name="validated_artifacts",
2733
- )
2734
- """The schema that validated this artifact in a :class:`~lamindb.curators.Curator`."""
2735
- feature_sets: Schema = models.ManyToManyField(
2736
- Schema, related_name="artifacts", through="ArtifactSchema"
2737
- )
2738
- """The feature sets measured by the artifact."""
2739
- _feature_values: FeatureValue = models.ManyToManyField(
2740
- FeatureValue, through="ArtifactFeatureValue", related_name="artifacts"
2741
- )
2742
- """Non-categorical feature values for annotation."""
2743
- _param_values: ParamValue = models.ManyToManyField(
2744
- ParamValue, through="ArtifactParamValue", related_name="artifacts"
2745
- )
2746
- """Parameter values."""
2747
- _key_is_virtual: bool = BooleanField()
2748
- """Indicates whether `key` is virtual or part of an actual file path."""
2749
- # be mindful that below, passing related_name="+" leads to errors
2750
- _actions: Artifact = models.ManyToManyField(
2751
- "self", symmetrical=False, related_name="_action_targets"
2752
- )
2753
- """Actions to attach for the UI."""
2754
- created_by: User = ForeignKey(
2755
- "lamindb.User",
2756
- PROTECT,
2757
- default=current_user_id,
2758
- related_name="created_artifacts",
2759
- editable=False,
2760
- )
2761
- """Creator of record."""
2762
- _overwrite_versions: bool = BooleanField(default=None)
2763
- """Indicates whether to store or overwrite versions.
2764
-
2765
- It defaults to False for file-like artifacts and to True for folder-like artifacts.
2766
- """
2767
- projects: Project
2768
- """Associated projects."""
2769
- references: Reference
2770
- """Associated references."""
2771
-
2772
- @overload
2773
- def __init__(
2774
- self,
2775
- # we're not choosing the name "path" for this arg because
2776
- # it'd be confusing with `artifact.path`, which is not the same
2777
- # so "data" conveys better that this is input data that's ingested
2778
- # and will be moved to a target path at `artifact.path`
2779
- # also internally, we sometimes pass "data objects" like a DataFrame
2780
- # here; and we might refactor this but we might also keep that internal
2781
- # usage
2782
- data: UPathStr,
2783
- kind: ArtifactKind | None = None,
2784
- key: str | None = None,
2785
- description: str | None = None,
2786
- revises: Artifact | None = None,
2787
- run: Run | None = None,
2788
- ): ...
2789
-
2790
- @overload
2791
- def __init__(
2792
- self,
2793
- *db_args,
2794
- ): ...
2795
-
2796
- def __init__(
2797
- self,
2798
- *args,
2799
- **kwargs,
2800
- ):
2801
- pass
2802
-
2803
- @property
2804
- @deprecated("kind")
2805
- def type(self) -> str:
2806
- return self.kind
2807
-
2808
- @property
2809
- @deprecated("otype")
2810
- def _accessor(self) -> str:
2811
- return self.otype
2812
-
2813
- @property
2814
- def transform(self) -> Transform | None:
2815
- """Transform whose run created the artifact."""
2816
- return self.run.transform if self.run is not None else None
2817
-
2818
- @property
2819
- @deprecated("n_files")
2820
- def n_objects(self) -> int:
2821
- return self.n_files
2822
-
2823
- # add the below because this is what people will have in their code
2824
- # if they implement the recommended migration strategy
2825
- # - FeatureSet -> Schema
2826
- # - featureset -> schema
2827
- # - feature_set -> schema
2828
- # @property
2829
- # def schemas(self) -> QuerySet[Schema]:
2830
- # """Schemas linked to artifact via many-to-many relationship.
2831
-
2832
- # Is now mediating the private `.feature_sets` relationship during
2833
- # a transition period to better schema management.
2834
-
2835
- # .. versionchanged: 1.0
2836
- # Was previously called `.feature_sets`.
2837
-
2838
- # """
2839
- # return self.feature_sets
2840
-
2841
- @property
2842
- def path(self) -> Path:
2843
- """Path.
2844
-
2845
- File in cloud storage, here AWS S3:
2846
-
2847
- >>> artifact = ln.Artifact("s3://my-bucket/my-file.csv").save()
2848
- >>> artifact.path
2849
- S3QueryPath('s3://my-bucket/my-file.csv')
2850
-
2851
- File in local storage:
2852
-
2853
- >>> ln.Artifact("./myfile.csv", key="myfile").save()
2854
- >>> artifact = ln.Artifact.get(key="myfile")
2855
- >>> artifact.path
2856
- PosixPath('/home/runner/work/lamindb/lamindb/docs/guide/mydata/myfile.csv')
2857
- """
2858
- pass
2859
-
2860
- @classmethod
2861
- def from_df(
2862
- cls,
2863
- df: pd.DataFrame,
2864
- *,
2865
- key: str | None = None,
2866
- description: str | None = None,
2867
- run: Run | None = None,
2868
- revises: Artifact | None = None,
2869
- **kwargs,
2870
- ) -> Artifact:
2871
- """Create from `DataFrame`, validate & link features.
2872
-
2873
- Args:
2874
- df: A `DataFrame` object.
2875
- key: A relative path within default storage,
2876
- e.g., `"myfolder/myfile.parquet"`.
2877
- description: A description.
2878
- revises: An old version of the artifact.
2879
- run: The run that creates the artifact.
2880
-
2881
- See Also:
2882
- :meth:`~lamindb.Collection`
2883
- Track collections.
2884
- :class:`~lamindb.Feature`
2885
- Track features.
2886
-
2887
- Examples:
2888
- >>> df = ln.core.datasets.df_iris_in_meter_batch1()
2889
- >>> df.head()
2890
- sepal_length sepal_width petal_length petal_width iris_organism_code
2891
- 0 0.051 0.035 0.014 0.002 0
2892
- 1 0.049 0.030 0.014 0.002 0
2893
- 2 0.047 0.032 0.013 0.002 0
2894
- 3 0.046 0.031 0.015 0.002 0
2895
- 4 0.050 0.036 0.014 0.002 0
2896
- >>> artifact = ln.Artifact.from_df(df, description="Iris flower collection batch1")
2897
- >>> artifact.save()
2898
- """
2899
- pass
2900
-
2901
- @classmethod
2902
- def from_anndata(
2903
- cls,
2904
- adata: AnnData | UPathStr,
2905
- *,
2906
- key: str | None = None,
2907
- description: str | None = None,
2908
- run: Run | None = None,
2909
- revises: Artifact | None = None,
2910
- **kwargs,
2911
- ) -> Artifact:
2912
- """Create from ``AnnData``, validate & link features.
2913
-
2914
- Args:
2915
- adata: An `AnnData` object or a path of AnnData-like.
2916
- key: A relative path within default storage,
2917
- e.g., `"myfolder/myfile.h5ad"`.
2918
- description: A description.
2919
- revises: An old version of the artifact.
2920
- run: The run that creates the artifact.
2921
-
2922
- See Also:
2923
-
2924
- :meth:`~lamindb.Collection`
2925
- Track collections.
2926
- :class:`~lamindb.Feature`
2927
- Track features.
2928
-
2929
- Examples:
2930
- >>> import bionty as bt
2931
- >>> bt.settings.organism = "human"
2932
- >>> adata = ln.core.datasets.anndata_with_obs()
2933
- >>> artifact = ln.Artifact.from_anndata(adata, description="mini anndata with obs")
2934
- >>> artifact.save()
2935
- """
2936
- pass
2937
-
2938
- @classmethod
2939
- def from_mudata(
2940
- cls,
2941
- mdata: MuData,
2942
- *,
2943
- key: str | None = None,
2944
- description: str | None = None,
2945
- run: Run | None = None,
2946
- revises: Artifact | None = None,
2947
- **kwargs,
2948
- ) -> Artifact:
2949
- """Create from ``MuData``, validate & link features.
2950
-
2951
- Args:
2952
- mdata: An `MuData` object.
2953
- key: A relative path within default storage,
2954
- e.g., `"myfolder/myfile.h5mu"`.
2955
- description: A description.
2956
- revises: An old version of the artifact.
2957
- run: The run that creates the artifact.
2958
-
2959
- See Also:
2960
- :meth:`~lamindb.Collection`
2961
- Track collections.
2962
- :class:`~lamindb.Feature`
2963
- Track features.
2964
-
2965
- Examples:
2966
- >>> import bionty as bt
2967
- >>> bt.settings.organism = "human"
2968
- >>> mdata = ln.core.datasets.mudata_papalexi21_subset()
2969
- >>> artifact = ln.Artifact.from_mudata(mdata, description="a mudata object")
2970
- >>> artifact.save()
2971
- """
2972
- pass
2973
-
2974
- @classmethod
2975
- def from_tiledbsoma(
2976
- cls,
2977
- path: UPathStr,
2978
- *,
2979
- key: str | None = None,
2980
- description: str | None = None,
2981
- run: Run | None = None,
2982
- revises: Artifact | None = None,
2983
- **kwargs,
2984
- ) -> Artifact:
2985
- """Create from a tiledbsoma store.
2986
-
2987
- Args:
2988
- path: A tiledbsoma store with .tiledbsoma suffix.
2989
- key: A relative path within default storage,
2990
- e.g., `"myfolder/mystore.tiledbsoma"`.
2991
- description: A description.
2992
- revises: An old version of the artifact.
2993
- run: The run that creates the artifact.
2994
-
2995
- Examples:
2996
- >>> artifact = ln.Artifact.from_tiledbsoma("s3://mybucket/store.tiledbsoma", description="a tiledbsoma store")
2997
- >>> artifact.save()
2998
- """
2999
- pass
3000
-
3001
- @classmethod
3002
- def from_dir(
3003
- cls,
3004
- path: UPathStr,
3005
- *,
3006
- key: str | None = None,
3007
- run: Run | None = None,
3008
- ) -> list[Artifact]:
3009
- """Create a list of artifact objects from a directory.
3010
-
3011
- Hint:
3012
- If you have a high number of files (several 100k) and don't want to
3013
- track them individually, create a single :class:`~lamindb.Artifact` via
3014
- ``Artifact(path)`` for them. See, e.g., :doc:`docs:rxrx`.
3015
-
3016
- Args:
3017
- path: Source path of folder.
3018
- key: Key for storage destination. If `None` and
3019
- directory is in a registered location, the inferred `key` will
3020
- reflect the relative position. If `None` and directory is outside
3021
- of a registered storage location, the inferred key defaults to `path.name`.
3022
- run: A `Run` object.
3023
-
3024
- Examples:
3025
- >>> dir_path = ln.core.datasets.generate_cell_ranger_files("sample_001", ln.settings.storage)
3026
- >>> artifacts = ln.Artifact.from_dir(dir_path)
3027
- >>> ln.save(artifacts)
3028
- """
3029
- pass
3030
-
3031
- def replace(
3032
- self,
3033
- data: UPathStr | pd.DataFrame | AnnData | MuData,
3034
- run: Run | None = None,
3035
- format: str | None = None,
3036
- ) -> None:
3037
- """Replace artifact content.
3038
-
3039
- Args:
3040
- data: A file path.
3041
- run: The run that created the artifact gets
3042
- auto-linked if ``ln.track()`` was called.
3043
-
3044
- Examples:
3045
- Say we made a change to the content of an artifact, e.g., edited the image
3046
- `paradisi05_laminopathic_nuclei.jpg`.
3047
-
3048
- This is how we replace the old file in storage with the new file:
3049
-
3050
- >>> artifact.replace("paradisi05_laminopathic_nuclei.jpg")
3051
- >>> artifact.save()
3052
-
3053
- Note that this neither changes the storage key nor the filename.
3054
-
3055
- However, it will update the suffix if it changes.
3056
- """
3057
- pass
3058
-
3059
- def open(
3060
- self, mode: str = "r", is_run_input: bool | None = None
3061
- ) -> (
3062
- AnnDataAccessor
3063
- | BackedAccessor
3064
- | SOMACollection
3065
- | SOMAExperiment
3066
- | SOMAMeasurement
3067
- | PyArrowDataset
3068
- ):
3069
- """Return a cloud-backed data object.
3070
-
3071
- Works for `AnnData` (`.h5ad` and `.zarr`), generic `hdf5` and `zarr`,
3072
- `tiledbsoma` objects (`.tiledbsoma`), `pyarrow` compatible formats.
3073
-
3074
- Args:
3075
- mode: can only be `"w"` (write mode) for `tiledbsoma` stores,
3076
- otherwise should be always `"r"` (read-only mode).
3077
-
3078
- Notes:
3079
- For more info, see tutorial: :doc:`/arrays`.
3080
-
3081
- Examples:
3082
-
3083
- Read AnnData in backed mode from cloud:
3084
-
3085
- >>> artifact = ln.Artifact.get(key="lndb-storage/pbmc68k.h5ad")
3086
- >>> artifact.open()
3087
- AnnDataAccessor object with n_obs × n_vars = 70 × 765
3088
- constructed for the AnnData object pbmc68k.h5ad
3089
- ...
3090
- """
3091
- pass
3092
-
3093
- def load(self, is_run_input: bool | None = None, **kwargs) -> Any:
3094
- """Cache and load into memory.
3095
-
3096
- See all :mod:`~lamindb.core.loaders`.
3097
-
3098
- Examples:
3099
-
3100
- Load a `DataFrame`-like artifact:
3101
-
3102
- >>> artifact.load().head()
3103
- sepal_length sepal_width petal_length petal_width iris_organism_code
3104
- 0 0.051 0.035 0.014 0.002 0
3105
- 1 0.049 0.030 0.014 0.002 0
3106
- 2 0.047 0.032 0.013 0.002 0
3107
- 3 0.046 0.031 0.015 0.002 0
3108
- 4 0.050 0.036 0.014 0.002 0
3109
-
3110
- Load an `AnnData`-like artifact:
3111
-
3112
- >>> artifact.load()
3113
- AnnData object with n_obs × n_vars = 70 × 765
3114
-
3115
- Fall back to :meth:`~lamindb.Artifact.cache` if no in-memory representation is configured:
3116
-
3117
- >>> artifact.load()
3118
- PosixPath('/home/runner/work/lamindb/lamindb/docs/guide/mydata/.lamindb/jb7BY5UJoQVGMUOKiLcn.jpg')
3119
- """
3120
- pass
3121
-
3122
- def cache(self, is_run_input: bool | None = None) -> Path:
3123
- """Download cloud artifact to local cache.
3124
-
3125
- Follows synching logic: only caches an artifact if it's outdated in the local cache.
3126
-
3127
- Returns a path to a locally cached on-disk object (say a `.jpg` file).
3128
-
3129
- Examples:
3130
-
3131
- Sync file from cloud and return the local path of the cache:
3132
-
3133
- >>> artifact.cache()
3134
- PosixPath('/home/runner/work/Caches/lamindb/lamindb-ci/lndb-storage/pbmc68k.h5ad')
3135
- """
3136
- pass
3137
-
3138
- def delete(
3139
- self, permanent: bool | None = None, storage: bool | None = None
3140
- ) -> None:
3141
- """Trash or permanently delete.
3142
-
3143
- A first call to `.delete()` puts an artifact into the trash (sets `_branch_code` to `-1`).
3144
- A second call permanently deletes the artifact.
3145
- If it is a folder artifact with multiple versions, deleting a non-latest version
3146
- will not delete the underlying storage by default (if `storage=True` is not specified).
3147
- Deleting the latest version will delete all the versions for folder artifacts.
3148
-
3149
- FAQ: :doc:`docs:faq/storage`
3150
-
3151
- Args:
3152
- permanent: Permanently delete the artifact (skip trash).
3153
- storage: Indicate whether you want to delete the artifact in storage.
3154
-
3155
- Examples:
3156
-
3157
- For an `Artifact` object `artifact`, call:
3158
-
3159
- >>> artifact = ln.Artifact.filter(key="some.csv").one()
3160
- >>> artifact.delete() # delete a single file artifact
3161
-
3162
- >>> artifact = ln.Artifact.filter(key="some.tiledbsoma", is_latest=False).first()
3163
- >>> artifact.delete() # delete an old version, the data will not be deleted
3164
-
3165
- >>> artifact = ln.Artifact.filter(key="some.tiledbsoma", is_latest=True).one()
3166
- >>> artifact.delete() # delete all versions; the data will be deleted or you will be prompted for deletion.
3167
- """
3168
- pass
3169
-
3170
- def save(self, upload: bool | None = None, **kwargs) -> Artifact:
3171
- """Save to database & storage.
3172
-
3173
- Args:
3174
- upload: Trigger upload to cloud storage in instances with hybrid storage mode.
3175
-
3176
- Examples:
3177
- >>> artifact = ln.Artifact("./myfile.csv", description="myfile")
3178
- >>> artifact.save()
3179
- """
3180
- pass
3181
-
3182
- def restore(self) -> None:
3183
- """Restore from trash.
3184
-
3185
- Examples:
3186
-
3187
- For any `Artifact` object `artifact`, call:
3188
-
3189
- >>> artifact.restore()
3190
- """
3191
- pass
3192
-
3193
- def describe(self) -> None:
3194
- """Describe relations of record.
3195
-
3196
- Examples:
3197
- >>> artifact.describe()
3198
- """
3199
- pass
3200
-
3201
-
3202
- class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
3203
- """Collections of artifacts.
3204
-
3205
- Collections provide a simple way of versioning collections of artifacts.
3206
-
3207
- Args:
3208
- artifacts: `list[Artifact]` A list of artifacts.
3209
- key: `str` A file-path like key, analogous to the `key` parameter of `Artifact` and `Transform`.
3210
- description: `str | None = None` A description.
3211
- revises: `Collection | None = None` An old version of the collection.
3212
- run: `Run | None = None` The run that creates the collection.
3213
- meta: `Artifact | None = None` An artifact that defines metadata for the collection.
3214
- reference: `str | None = None` A simple reference, e.g. an external ID or a URL.
3215
- reference_type: `str | None = None` A way to indicate the type of the simple reference, e.g., `"url"`.
3216
-
3217
- See Also:
3218
- :class:`~lamindb.Artifact`
3219
-
3220
- Examples:
3221
-
3222
- Create a collection from a list of :class:`~lamindb.Artifact` objects:
3223
-
3224
- >>> collection = ln.Collection([artifact1, artifact2], key="my_project/my_collection")
3225
-
3226
- Create a collection that groups a data & a metadata artifact (e.g., here :doc:`docs:rxrx`):
3227
-
3228
- >>> collection = ln.Collection(data_artifact, key="my_project/my_collection", meta=metadata_artifact)
3229
-
3230
- """
3231
-
3232
- class Meta(Record.Meta, IsVersioned.Meta, TracksRun.Meta, TracksUpdates.Meta):
3233
- abstract = False
3234
-
3235
- _len_full_uid: int = 20
3236
- _len_stem_uid: int = 16
3237
- _name_field: str = "key"
3238
-
3239
- id: int = models.AutoField(primary_key=True)
3240
- """Internal id, valid only in one DB instance."""
3241
- uid: str = CharField(
3242
- editable=False,
3243
- unique=True,
3244
- db_index=True,
3245
- max_length=_len_full_uid,
3246
- default=base62_20,
3247
- )
3248
- """Universal id, valid across DB instances."""
3249
- key: str = CharField(db_index=True)
3250
- """Name or path-like key."""
3251
- # below is the only case in which we use a TextField
3252
- # for description; we do so because users had descriptions exceeding 255 chars
3253
- # in their instances
3254
- description: str | None = TextField(null=True, db_index=True)
3255
- """A description or title."""
3256
- hash: str | None = CharField(
3257
- max_length=HASH_LENGTH, db_index=True, null=True, unique=True
3258
- )
3259
- """Hash of collection content."""
3260
- reference: str | None = CharField(max_length=255, db_index=True, null=True)
3261
- """A reference like URL or external ID."""
3262
- # also for reference_type here, we allow an extra long max_length
3263
- reference_type: str | None = CharField(max_length=25, db_index=True, null=True)
3264
- """Type of reference, e.g., cellxgene Census collection_id."""
3265
- ulabels: ULabel = models.ManyToManyField(
3266
- "ULabel", through="CollectionULabel", related_name="collections"
3267
- )
3268
- """ULabels sampled in the collection (see :class:`~lamindb.Feature`)."""
3269
- run: Run | None = ForeignKey(
3270
- Run, PROTECT, related_name="output_collections", null=True, default=None
3271
- )
3272
- """:class:`~lamindb.Run` that created the `collection`."""
3273
- input_of_runs: Run = models.ManyToManyField(Run, related_name="input_collections")
3274
- """Runs that use this collection as an input."""
3275
- _previous_runs: Run = models.ManyToManyField(
3276
- "Run", related_name="_output_collections_with_later_updates"
3277
- )
3278
- """Sequence of runs that created or updated the record."""
3279
- artifacts: Artifact = models.ManyToManyField(
3280
- "Artifact", related_name="collections", through="CollectionArtifact"
3281
- )
3282
- """Artifacts in collection."""
3283
- meta_artifact: Artifact | None = OneToOneField(
3284
- "Artifact",
3285
- PROTECT,
3286
- null=True,
3287
- unique=True,
3288
- related_name="_meta_of_collection",
3289
- )
3290
- """An artifact that stores metadata that indexes a collection.
3291
-
3292
- It has a 1:1 correspondence with an artifact. If needed, you can access the
3293
- collection from the artifact via a private field:
3294
- `artifact._meta_of_collection`.
3295
- """
3296
- _actions: Artifact = models.ManyToManyField(Artifact, related_name="+")
3297
- """Actions to attach for the UI."""
3298
-
3299
- @overload
3300
- def __init__(
3301
- self,
3302
- artifacts: list[Artifact],
3303
- key: str,
3304
- description: str | None = None,
3305
- meta: Any | None = None,
3306
- reference: str | None = None,
3307
- reference_type: str | None = None,
3308
- run: Run | None = None,
3309
- revises: Collection | None = None,
3310
- ): ...
3311
-
3312
- @overload
3313
- def __init__(
3314
- self,
3315
- *db_args,
3316
- ): ...
3317
-
3318
- def __init__(
3319
- self,
3320
- *args,
3321
- **kwargs,
3322
- ):
3323
- pass
3324
-
3325
- def append(self, artifact: Artifact, run: Run | None = None) -> Collection:
3326
- """Add an artifact to the collection.
3327
-
3328
- Creates a new version of the collection.
3329
- This does not modify the original collection in-place, but returns a new version
3330
- of the original collection with the added artifact.
3331
-
3332
- Args:
3333
- artifact: An artifact to add to the collection.
3334
- run: The run that creates the new version of the collection.
3335
-
3336
- Examples:
3337
- >>> collection = ln.Collection(artifact, key="new collection")
3338
- >>> collection.save()
3339
- >>> collection = collection.append(another_artifact) # returns a new version
3340
- >>> collection.save() # save the new version
3341
-
3342
- .. versionadded:: 0.76.14
3343
- """
3344
- pass
3345
-
3346
- def open(self, is_run_input: bool | None = None) -> PyArrowDataset:
3347
- """Return a cloud-backed pyarrow Dataset.
3348
-
3349
- Works for `pyarrow` compatible formats.
3350
-
3351
- Notes:
3352
- For more info, see tutorial: :doc:`/arrays`.
3353
- """
3354
- pass
3355
-
3356
- def mapped(
3357
- self,
3358
- layers_keys: str | list[str] | None = None,
3359
- obs_keys: str | list[str] | None = None,
3360
- obsm_keys: str | list[str] | None = None,
3361
- obs_filter: dict[str, str | list[str]] | None = None,
3362
- join: Literal["inner", "outer"] | None = "inner",
3363
- encode_labels: bool | list[str] = True,
3364
- unknown_label: str | dict[str, str] | None = None,
3365
- cache_categories: bool = True,
3366
- parallel: bool = False,
3367
- dtype: str | None = None,
3368
- stream: bool = False,
3369
- is_run_input: bool | None = None,
3370
- ) -> MappedCollection:
3371
- """Return a map-style dataset.
3372
-
3373
- Returns a `pytorch map-style dataset
3374
- <https://pytorch.org/docs/stable/data.html#map-style-datasets>`__ by
3375
- virtually concatenating `AnnData` arrays.
3376
-
3377
- If your `AnnData` collection is in the cloud, move them into a local
3378
- cache first via :meth:`~lamindb.Collection.cache`.
3379
-
3380
- `__getitem__` of the `MappedCollection` object takes a single integer index
3381
- and returns a dictionary with the observation data sample for this index from
3382
- the `AnnData` objects in the collection. The dictionary has keys for `layers_keys`
3383
- (`.X` is in `"X"`), `obs_keys`, `obsm_keys` (under `f"obsm_{key}"`) and also `"_store_idx"`
3384
- for the index of the `AnnData` object containing this observation sample.
3385
-
3386
- .. note::
3387
-
3388
- For a guide, see :doc:`docs:scrna-mappedcollection`.
3389
-
3390
- This method currently only works for collections of `AnnData` artifacts.
3391
-
3392
- Args:
3393
- layers_keys: Keys from the ``.layers`` slot. ``layers_keys=None`` or ``"X"`` in the list
3394
- retrieves ``.X``.
3395
- obs_keys: Keys from the ``.obs`` slots.
3396
- obsm_keys: Keys from the ``.obsm`` slots.
3397
- obs_filter: Select only observations with these values for the given obs columns.
3398
- Should be a dictionary with obs column names as keys
3399
- and filtering values (a string or a list of strings) as values.
3400
- join: `"inner"` or `"outer"` virtual joins. If ``None`` is passed,
3401
- does not join.
3402
- encode_labels: Encode labels into integers.
3403
- Can be a list with elements from ``obs_keys``.
3404
- unknown_label: Encode this label to -1.
3405
- Can be a dictionary with keys from ``obs_keys`` if ``encode_labels=True``
3406
- or from ``encode_labels`` if it is a list.
3407
- cache_categories: Enable caching categories of ``obs_keys`` for faster access.
3408
- parallel: Enable sampling with multiple processes.
3409
- dtype: Convert numpy arrays from ``.X``, ``.layers`` and ``.obsm``
3410
- stream: Whether to stream data from the array backend.
3411
- is_run_input: Whether to track this collection as run input.
3412
-
3413
- Examples:
3414
- >>> import lamindb as ln
3415
- >>> from torch.utils.data import DataLoader
3416
- >>> ds = ln.Collection.get(description="my collection")
3417
- >>> mapped = collection.mapped(obs_keys=["cell_type", "batch"])
3418
- >>> dl = DataLoader(mapped, batch_size=128, shuffle=True)
3419
- """
3420
- pass
3421
-
3422
- def cache(self, is_run_input: bool | None = None) -> list[UPath]:
3423
- """Download cloud artifacts in collection to local cache.
3424
-
3425
- Follows synching logic: only caches outdated artifacts.
3426
-
3427
- Returns paths to locally cached on-disk artifacts.
3428
-
3429
- Args:
3430
- is_run_input: Whether to track this collection as run input.
3431
- """
3432
- pass
3433
-
3434
- def load(
3435
- self,
3436
- join: Literal["inner", "outer"] = "outer",
3437
- is_run_input: bool | None = None,
3438
- **kwargs,
3439
- ) -> Any:
3440
- """Stage and load to memory.
3441
-
3442
- Returns in-memory representation if possible such as a concatenated `DataFrame` or `AnnData` object.
3443
- """
3444
- pass
3445
-
3446
- def delete(self, permanent: bool | None = None) -> None:
3447
- """Delete collection.
3448
-
3449
- Args:
3450
- permanent: Whether to permanently delete the collection record (skips trash).
3451
-
3452
- Examples:
3453
-
3454
- For any `Collection` object `collection`, call:
3455
-
3456
- >>> collection.delete()
3457
- """
3458
- pass
3459
-
3460
- def save(self, using: str | None = None) -> Collection:
3461
- """Save the collection and underlying artifacts to database & storage.
3462
-
3463
- Args:
3464
- using: The database to which you want to save.
3465
-
3466
- Examples:
3467
- >>> collection = ln.Collection("./myfile.csv", name="myfile")
3468
- >>> collection.save()
3469
- """
3470
- pass
3471
-
3472
- def restore(self) -> None:
3473
- """Restore collection record from trash.
3474
-
3475
- Examples:
3476
-
3477
- For any `Collection` object `collection`, call:
3478
-
3479
- >>> collection.restore()
3480
- """
3481
- pass
3482
-
3483
- @property
3484
- def transform(self) -> Transform | None:
3485
- """Transform whose run created the collection."""
3486
- return self.run.transform if self.run is not None else None
3487
-
3488
- @property
3489
- def name(self) -> str:
3490
- """Name of the collection.
3491
-
3492
- Splits `key` on `/` and returns the last element.
3493
- """
3494
- return self.key.split("/")[-1]
3495
-
3496
- @property
3497
- def ordered_artifacts(self) -> QuerySet:
3498
- """Ordered `QuerySet` of `.artifacts`.
3499
-
3500
- Accessing the many-to-many field `collection.artifacts` directly gives
3501
- you non-deterministic order.
3502
-
3503
- Using the property `.ordered_artifacts` allows to iterate through a set
3504
- that's ordered in the order of creation.
3505
- """
3506
- pass
3507
-
3508
- @property
3509
- def data_artifact(self) -> Artifact | None:
3510
- """Access to a single data artifact.
3511
-
3512
- If the collection has a single data & metadata artifact, this allows access via::
3513
-
3514
- collection.data_artifact # first & only element of collection.artifacts
3515
- collection.meta_artifact # metadata
3516
-
3517
- """
3518
- pass
3519
-
3520
- def describe(self) -> None:
3521
- """Describe relations of record.
3522
-
3523
- Examples:
3524
- >>> artifact.describe()
3525
- """
3526
- pass
3527
-
3528
-
3529
- # -------------------------------------------------------------------------------------
3530
- # Project management
3531
-
3532
-
3533
- class Person(Record, CanCurate, TracksRun, TracksUpdates, ValidateFields):
3534
- """Persons.
3535
-
3536
- This registry is distinct from `User` and purely exists for project management.
3537
-
3538
- You'll soon be able to conveniently create persons from users.
3539
-
3540
- Example:
3541
- >>> person = Person(
3542
- ... name="Jane Doe",
3543
- ... email="jane.doe@example.com",
3544
- ... internal=True,
3545
- ... ).save()
3546
- """
3547
-
3548
- class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta):
3549
- abstract = False
3550
-
3551
- id: int = models.AutoField(primary_key=True)
3552
- """Internal id, valid only in one DB instance."""
3553
- uid: str = CharField(
3554
- editable=False, unique=True, max_length=8, db_index=True, default=base62_8
3555
- )
3556
- """Universal id, valid across DB instances."""
3557
- name: str = CharField(db_index=True)
3558
- """Name of the person (forename(s) lastname)."""
3559
- email: str | None = EmailField(null=True, default=None)
3560
- """Email of the person."""
3561
- external: bool = BooleanField(default=True, db_index=True)
3562
- """Whether the person is external to the organization."""
3563
-
3564
-
3565
- class Project(Record, CanCurate, TracksRun, TracksUpdates, ValidateFields):
3566
- """Projects.
3567
-
3568
- Example:
3569
- >>> project = Project(
3570
- ... name="My Project Name",
3571
- ... abbr="MPN",
3572
- ... url="https://example.com/my_project",
3573
- ... ).save()
3574
- """
3575
-
3576
- class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta):
3577
- abstract = False
3578
-
3579
- id: int = models.AutoField(primary_key=True)
3580
- """Internal id, valid only in one DB instance."""
3581
- uid: str = CharField(
3582
- editable=False, unique=True, max_length=12, db_index=True, default=base62_12
3583
- )
3584
- """Universal id, valid across DB instances."""
3585
- name: str = CharField(db_index=True)
3586
- """Title or name of the Project."""
3587
- type: Project | None = ForeignKey(
3588
- "self", PROTECT, null=True, related_name="records"
3589
- )
3590
- """Type of project (e.g., 'Program', 'Project', 'GithubIssue', 'Task')."""
3591
- records: Project
3592
- """Records of this type."""
3593
- is_type: bool = BooleanField(default=False, db_index=True, null=True)
3594
- """Distinguish types from instances of the type."""
3595
- abbr: str | None = CharField(max_length=32, db_index=True, null=True)
3596
- """An abbreviation."""
3597
- url: str | None = URLField(max_length=255, null=True, default=None)
3598
- """A URL."""
3599
- start_date: date | None = DateField(null=True, default=None)
3600
- """Date of start of the project."""
3601
- end_date: date | None = DateField(null=True, default=None)
3602
- """Date of start of the project."""
3603
- parents: Project = models.ManyToManyField(
3604
- "self", symmetrical=False, related_name="children"
3605
- )
3606
- """Parent projects, the super-projects owning this project."""
3607
- children: Project
3608
- """Child projects, the sub-projects owned by this project.
3609
-
3610
- Reverse accessor for `.parents`.
3611
- """
3612
- predecessors: Project = models.ManyToManyField(
3613
- "self", symmetrical=False, related_name="successors"
3614
- )
3615
- """The preceding projects required by this project."""
3616
- successors: Project
3617
- """The succeeding projects requiring this project.
3618
-
3619
- Reverse accessor for `.predecessors`.
3620
- """
3621
- people: Person = models.ManyToManyField(
3622
- Person, through="PersonProject", related_name="projects"
3623
- )
3624
- """People associated with this project."""
3625
- artifacts: Artifact = models.ManyToManyField(
3626
- Artifact, through="ArtifactProject", related_name="projects"
3627
- )
3628
- """Artifacts associated with this Project."""
3629
- transforms: Transform = models.ManyToManyField(
3630
- Transform, through="TransformProject", related_name="projects"
3631
- )
3632
- """Transforms associated with this project."""
3633
- ulabels: ULabel = models.ManyToManyField(
3634
- ULabel, through="ULabelProject", related_name="projects"
3635
- )
3636
- """Transforms associated with this project."""
3637
- features: ULabel = models.ManyToManyField(
3638
- Feature, through="FeatureProject", related_name="projects"
3639
- )
3640
- """Transforms associated with this project."""
3641
- schemas: ULabel = models.ManyToManyField(
3642
- Schema, through="SchemaProject", related_name="projects"
3643
- )
3644
- """Schemas associated with this project."""
3645
- collections: Collection = models.ManyToManyField(
3646
- Collection, through="CollectionProject", related_name="projects"
3647
- )
3648
- """Collections associated with this project."""
3649
- references: Reference = models.ManyToManyField("Reference", related_name="projects")
3650
- """References associated with this project."""
3651
- _status_code: int = models.SmallIntegerField(default=0, db_index=True)
3652
- """Status code."""
3653
-
3654
-
3655
- class Reference(Record, CanCurate, TracksRun, TracksUpdates, ValidateFields):
3656
- """References such as internal studies, papers, documents, or URLs.
3657
-
3658
- Example:
3659
- >>> reference = Reference(
3660
- ... name="A Paper Title",
3661
- ... abbr="APT",
3662
- ... url="https://doi.org/10.1000/xyz123",
3663
- ... pubmed_id=12345678,
3664
- ... doi="10.1000/xyz123",
3665
- ... description="Good paper.",
3666
- ... text="Some text I want to be searchable.",
3667
- ... date=date(2023, 11, 21),
3668
- ... ).save()
3669
- """
3670
-
3671
- class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta):
3672
- abstract = False
3673
-
3674
- id: int = models.AutoField(primary_key=True)
3675
- """Internal id, valid only in one DB instance."""
3676
- uid: str = CharField(
3677
- editable=False, unique=True, max_length=12, db_index=True, default=base62_12
3678
- )
3679
- """Universal id, valid across DB instances."""
3680
- name: str = CharField(db_index=True)
3681
- """Title or name of the reference document."""
3682
- abbr: str | None = CharField(
3683
- max_length=32,
3684
- db_index=True,
3685
- null=True,
3686
- )
3687
- """An abbreviation for the reference."""
3688
- type: Reference | None = ForeignKey(
3689
- "self", PROTECT, null=True, related_name="records"
3690
- )
3691
- """Type of reference (e.g., 'Study', 'Paper', 'Preprint').
3692
-
3693
- Allows to group reference by type, e.g., internal studies vs. all papers etc.
3694
- """
3695
- records: Reference
3696
- """Records of this type."""
3697
- is_type: bool = BooleanField(default=False, db_index=True, null=True)
3698
- """Distinguish types from instances of the type."""
3699
- url: str | None = URLField(null=True)
3700
- """URL linking to the reference."""
3701
- pubmed_id: int | None = BigIntegerField(null=True, db_index=True)
3702
- """A PudMmed ID."""
3703
- doi: str | None = CharField(
3704
- null=True,
3705
- db_index=True,
3706
- validators=[
3707
- RegexValidator(
3708
- regex=r"^(?:https?://(?:dx\.)?doi\.org/|doi:|DOI:)?10\.\d+/.*$",
3709
- message="Must be a DOI (e.g., 10.1000/xyz123 or https://doi.org/10.1000/xyz123)",
3710
- )
3711
- ],
3712
- )
3713
- """Digital Object Identifier (DOI) for the reference."""
3714
- description: str | None = CharField(null=True, db_index=True)
3715
- """Description of the reference."""
3716
- text: str | None = TextField(null=True)
3717
- """Abstract or full text of the reference to make it searchable."""
3718
- date: date | None = DateField(null=True, default=None)
3719
- """Date of creation or publication of the reference."""
3720
- authors: Person = models.ManyToManyField(Person, related_name="references")
3721
- """All people associated with this reference."""
3722
- artifacts: Artifact = models.ManyToManyField(
3723
- Artifact, through="ArtifactReference", related_name="references"
3724
- )
3725
- """Artifacts associated with this reference."""
3726
- transforms: Artifact = models.ManyToManyField(
3727
- Transform, through="TransformReference", related_name="references"
3728
- )
3729
- """Transforms associated with this reference."""
3730
- collections: Artifact = models.ManyToManyField(
3731
- Collection, through="CollectionReference", related_name="references"
3732
- )
3733
- """Collections associated with this reference."""
3734
-
3735
-
3736
- # -------------------------------------------------------------------------------------
3737
- # Data models
3738
-
3739
- from django.contrib.postgres.fields import JSONField # type: ignore
3740
- from django.core.exceptions import ValidationError
3741
- from django.db import models
3742
-
3743
-
3744
- class DataMixin(models.Model):
3745
- space: Space = ForeignKey(Space, PROTECT, default=1, db_default=1)
3746
- feature = ForeignKey(
3747
- Feature, null=True, blank=True, on_delete=models.CASCADE, related_name="+"
3748
- )
3749
- param = ForeignKey(
3750
- Param, null=True, blank=True, on_delete=models.CASCADE, related_name="+"
3751
- )
3752
- row = IntegerField(help_text="Use -1 for result data")
3753
-
3754
- # Value fields
3755
- value_int = models.BigIntegerField(null=True, blank=True)
3756
- value_float = models.FloatField(null=True, blank=True)
3757
- value_str = models.TextField(null=True, blank=True)
3758
- value_datetime = models.DateTimeField(null=True, blank=True)
3759
- value_ulabel = models.ForeignKey(
3760
- ULabel, null=True, blank=True, on_delete=models.CASCADE, related_name="+"
3761
- )
3762
- value_person = models.ForeignKey(
3763
- Person, null=True, blank=True, on_delete=models.CASCADE, related_name="+"
3764
- )
3765
- value_artifact = models.ForeignKey(
3766
- Artifact, null=True, blank=True, on_delete=models.CASCADE, related_name="+"
3767
- )
3768
- value_collection = models.ForeignKey(
3769
- Collection, null=True, blank=True, on_delete=models.CASCADE, related_name="+"
3770
- )
3771
- value_project = models.ForeignKey(
3772
- Project, null=True, blank=True, on_delete=models.CASCADE, related_name="+"
3773
- )
3774
- value_json = models.JSONField(null=True, blank=True)
3775
-
3776
- class Meta:
3777
- abstract = True
3778
-
3779
- def clean(self):
3780
- # Validate feature/param mutual exclusivity
3781
- if (self.feature is not None) == (self.param is not None):
3782
- raise ValidationError("Exactly one of feature or param must be set")
3783
-
3784
- # Validate value fields
3785
- values = [
3786
- self.value_int,
3787
- self.value_float,
3788
- self.value_str,
3789
- self.value_datetime,
3790
- self.value_ulabel,
3791
- self.value_artifact,
3792
- self.value_json,
3793
- ]
3794
- non_null_count = sum(1 for v in values if v is not None)
3795
-
3796
- if non_null_count != 1:
3797
- raise ValidationError("Exactly one value field must be set")
3798
-
3799
-
3800
- class RunData(BasicRecord, DataMixin):
3801
- run = models.ForeignKey("Run", on_delete=models.CASCADE, related_name="_rundata")
3802
-
3803
- class Meta:
3804
- constraints = [
3805
- models.CheckConstraint(
3806
- condition=(
3807
- models.Q(feature__isnull=False, param__isnull=True)
3808
- | models.Q(feature__isnull=True, param__isnull=False)
3809
- ),
3810
- name="run_data_feature_param_mutex",
3811
- ),
3812
- models.UniqueConstraint(
3813
- fields=["run", "row", "feature", "param"], name="run_data_unique"
3814
- ),
3815
- ]
3816
- indexes = [
3817
- models.Index(fields=["run", "row"]),
3818
- models.Index(fields=["feature"]),
3819
- models.Index(fields=["param"]),
3820
- ]
3821
-
3822
-
3823
- class FlexTable(Record, TracksRun, TracksUpdates):
3824
- uid: str = CharField(
3825
- editable=False, unique=True, max_length=12, db_index=True, default=base62_12
3826
- )
3827
- name = CharField()
3828
- schema: Schema | None = ForeignKey(
3829
- Schema, null=True, on_delete=models.SET_NULL, related_name="_tidytables"
3830
- )
3831
- type: FlexTable | None = ForeignKey(
3832
- "self", PROTECT, null=True, related_name="records"
3833
- )
3834
- """Type of tidy table, e.g., `Cell`, `SampleSheet`, etc."""
3835
- records: ULabel
3836
- """Records of this type."""
3837
- is_type: bool = BooleanField(default=False, db_index=True, null=True)
3838
- """Distinguish types from instances of the type."""
3839
- description: str = CharField(null=True, db_index=True)
3840
- """A description."""
3841
- projects: Project = ManyToManyField(Project, related_name="_tidytables")
3842
- ulabels: Project = ManyToManyField(ULabel, related_name="_tidytables")
3843
-
3844
- class Meta:
3845
- indexes = [models.Index(fields=["uid"]), models.Index(fields=["name"])]
3846
-
3847
-
3848
- class FlexTableData(BasicRecord, DataMixin):
3849
- tidytable = models.ForeignKey(
3850
- FlexTable, on_delete=models.CASCADE, related_name="data"
3851
- )
3852
-
3853
- class Meta:
3854
- constraints = [
3855
- models.CheckConstraint(
3856
- condition=(
3857
- models.Q(feature__isnull=False, param__isnull=True)
3858
- | models.Q(feature__isnull=True, param__isnull=False)
3859
- ),
3860
- name="tidy_table_data_feature_param_mutex",
3861
- ),
3862
- models.UniqueConstraint(
3863
- fields=["tidytable", "row", "feature", "param"],
3864
- name="tidy_table_data_unique",
3865
- ),
3866
- ]
3867
- indexes = [
3868
- models.Index(fields=["tidytable", "row"]),
3869
- models.Index(fields=["feature"]),
3870
- models.Index(fields=["param"]),
3871
- ]
3872
-
3873
-
3874
- # -------------------------------------------------------------------------------------
3875
- # Link models
3876
-
3877
-
3878
- class LinkORM:
3879
- pass
3880
-
3881
-
3882
- class SchemaFeature(BasicRecord, LinkORM):
3883
- id: int = models.BigAutoField(primary_key=True)
3884
- schema: Schema = ForeignKey(Schema, CASCADE, related_name="links_feature")
3885
- feature: Feature = ForeignKey(Feature, PROTECT, related_name="links_schema")
3886
-
3887
- class Meta:
3888
- unique_together = ("schema", "feature")
3889
-
3890
-
3891
- class SchemaParam(BasicRecord, LinkORM):
3892
- id: int = models.BigAutoField(primary_key=True)
3893
- schema: Schema = ForeignKey(Schema, CASCADE, related_name="+")
3894
- param: Param = ForeignKey(Param, PROTECT, related_name="+")
3895
-
3896
- class Meta:
3897
- unique_together = ("schema", "param")
3898
-
3899
-
3900
- class ArtifactSchema(BasicRecord, LinkORM, TracksRun):
3901
- id: int = models.BigAutoField(primary_key=True)
3902
- artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="_links_schema")
3903
- schema: Schema = ForeignKey(Schema, PROTECT, related_name="_links_artifact")
3904
- slot: str | None = CharField(null=True)
3905
- feature_ref_is_semantic: bool | None = BooleanField(null=True)
3906
-
3907
- class Meta:
3908
- unique_together = (("artifact", "schema"), ("artifact", "slot"))
3909
-
3910
-
3911
- class SchemaComponent(BasicRecord, LinkORM, TracksRun):
3912
- id: int = models.BigAutoField(primary_key=True)
3913
- composite: Schema = ForeignKey(Schema, CASCADE, related_name="links_composite")
3914
- component: Schema = ForeignKey(Schema, PROTECT, related_name="links_component")
3915
- slot: str | None = CharField(null=True)
3916
-
3917
- class Meta:
3918
- unique_together = (("composite", "component"), ("composite", "slot"))
3919
-
3920
-
3921
- class CollectionArtifact(BasicRecord, LinkORM, TracksRun):
3922
- id: int = models.BigAutoField(primary_key=True)
3923
- collection: Collection = ForeignKey(
3924
- Collection, CASCADE, related_name="links_artifact"
3925
- )
3926
- artifact: Artifact = ForeignKey(Artifact, PROTECT, related_name="links_collection")
3927
-
3928
- class Meta:
3929
- unique_together = ("collection", "artifact")
3930
-
3931
-
3932
- class ArtifactULabel(BasicRecord, LinkORM, TracksRun):
3933
- id: int = models.BigAutoField(primary_key=True)
3934
- artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="links_ulabel")
3935
- ulabel: ULabel = ForeignKey(ULabel, PROTECT, related_name="links_artifact")
3936
- feature: Feature | None = ForeignKey(
3937
- Feature, PROTECT, null=True, related_name="links_artifactulabel", default=None
3938
- )
3939
- label_ref_is_name: bool | None = BooleanField(null=True)
3940
- feature_ref_is_name: bool | None = BooleanField(null=True)
3941
-
3942
- class Meta:
3943
- # can have the same label linked to the same artifact if the feature is
3944
- # different
3945
- unique_together = ("artifact", "ulabel", "feature")
3946
-
3947
-
3948
- class TransformULabel(BasicRecord, LinkORM, TracksRun):
3949
- id: int = models.BigAutoField(primary_key=True)
3950
- transform: Transform = ForeignKey(Transform, CASCADE, related_name="links_ulabel")
3951
- ulabel: ULabel = ForeignKey(ULabel, PROTECT, related_name="links_transform")
3952
-
3953
- class Meta:
3954
- unique_together = ("transform", "ulabel")
3955
-
3956
-
3957
- class RunULabel(BasicRecord, LinkORM):
3958
- id: int = models.BigAutoField(primary_key=True)
3959
- run: Run = ForeignKey(Run, CASCADE, related_name="links_ulabel")
3960
- ulabel: ULabel = ForeignKey(ULabel, PROTECT, related_name="links_run")
3961
- created_at: datetime = DateTimeField(
3962
- editable=False, db_default=models.functions.Now(), db_index=True
3963
- )
3964
- """Time of creation of record."""
3965
- created_by: User = ForeignKey(
3966
- "lamindb.User", PROTECT, default=current_user_id, related_name="+"
3967
- )
3968
- """Creator of record."""
3969
-
3970
- class Meta:
3971
- unique_together = ("run", "ulabel")
3972
-
3973
-
3974
- class CollectionULabel(BasicRecord, LinkORM, TracksRun):
3975
- id: int = models.BigAutoField(primary_key=True)
3976
- collection: Collection = ForeignKey(
3977
- Collection, CASCADE, related_name="links_ulabel"
3978
- )
3979
- ulabel: ULabel = ForeignKey(ULabel, PROTECT, related_name="links_collection")
3980
- feature: Feature | None = ForeignKey(
3981
- Feature, PROTECT, null=True, related_name="links_collectionulabel", default=None
3982
- )
3983
- label_ref_is_name: bool | None = BooleanField(null=True)
3984
- feature_ref_is_name: bool | None = BooleanField(null=True)
3985
-
3986
- class Meta:
3987
- unique_together = ("collection", "ulabel")
3988
-
3989
-
3990
- class ArtifactFeatureValue(BasicRecord, LinkORM, TracksRun):
3991
- id: int = models.BigAutoField(primary_key=True)
3992
- artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="+")
3993
- # we follow the lower() case convention rather than snake case for link models
3994
- featurevalue = ForeignKey(FeatureValue, PROTECT, related_name="+")
3995
-
3996
- class Meta:
3997
- unique_together = ("artifact", "featurevalue")
3998
-
3999
-
4000
- class RunParamValue(BasicRecord, LinkORM):
4001
- id: int = models.BigAutoField(primary_key=True)
4002
- run: Run = ForeignKey(Run, CASCADE, related_name="+")
4003
- # we follow the lower() case convention rather than snake case for link models
4004
- paramvalue: ParamValue = ForeignKey(ParamValue, PROTECT, related_name="+")
4005
- created_at: datetime = DateTimeField(
4006
- editable=False, db_default=models.functions.Now(), db_index=True
4007
- )
4008
- """Time of creation of record."""
4009
- created_by: User = ForeignKey(
4010
- "lamindb.User", PROTECT, default=current_user_id, related_name="+"
4011
- )
4012
- """Creator of record."""
4013
-
4014
- class Meta:
4015
- unique_together = ("run", "paramvalue")
4016
-
4017
-
4018
- class ArtifactParamValue(BasicRecord, LinkORM, TracksRun):
4019
- id: int = models.BigAutoField(primary_key=True)
4020
- artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="+")
4021
- # we follow the lower() case convention rather than snake case for link models
4022
- paramvalue: ParamValue = ForeignKey(ParamValue, PROTECT, related_name="+")
4023
-
4024
- class Meta:
4025
- unique_together = ("artifact", "paramvalue")
4026
-
4027
-
4028
- # -------------------------------------------------------------------------------------
4029
- # Link models for project management
4030
-
4031
-
4032
- class ArtifactProject(BasicRecord, LinkORM, TracksRun):
4033
- id: int = models.BigAutoField(primary_key=True)
4034
- artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="links_project")
4035
- project: Project = ForeignKey(Project, PROTECT, related_name="links_artifact")
4036
- feature: Feature | None = ForeignKey(
4037
- Feature,
4038
- PROTECT,
4039
- null=True,
4040
- default=None,
4041
- related_name="links_artifactproject",
4042
- )
4043
- label_ref_is_name: bool | None = BooleanField(null=True, default=None)
4044
- feature_ref_is_name: bool | None = BooleanField(null=True, default=None)
4045
-
4046
- class Meta:
4047
- # can have the same label linked to the same artifact if the feature is different
4048
- unique_together = ("artifact", "project", "feature")
4049
-
4050
-
4051
- class TransformProject(BasicRecord, LinkORM, TracksRun):
4052
- id: int = models.BigAutoField(primary_key=True)
4053
- transform: Transform = ForeignKey(Transform, CASCADE, related_name="links_project")
4054
- project: Project = ForeignKey(Project, PROTECT, related_name="links_transform")
4055
-
4056
- class Meta:
4057
- unique_together = ("transform", "project")
4058
-
4059
-
4060
- class CollectionProject(BasicRecord, LinkORM, TracksRun):
4061
- id: int = models.BigAutoField(primary_key=True)
4062
- collection: Collection = ForeignKey(
4063
- Collection, CASCADE, related_name="links_project"
4064
- )
4065
- project: Project = ForeignKey(Project, PROTECT, related_name="links_collection")
4066
-
4067
- class Meta:
4068
- unique_together = ("collection", "project")
4069
-
4070
-
4071
- class ULabelProject(BasicRecord, LinkORM, TracksRun):
4072
- id: int = models.BigAutoField(primary_key=True)
4073
- ulabel: Transform = ForeignKey(ULabel, CASCADE, related_name="links_project")
4074
- project: Project = ForeignKey(Project, PROTECT, related_name="links_ulabel")
4075
-
4076
- class Meta:
4077
- unique_together = ("ulabel", "project")
4078
-
4079
-
4080
- class PersonProject(BasicRecord, LinkORM, TracksRun):
4081
- id: int = models.BigAutoField(primary_key=True)
4082
- person: Transform = ForeignKey(Person, CASCADE, related_name="links_project")
4083
- project: Project = ForeignKey(Project, PROTECT, related_name="links_person")
4084
- role: str | None = CharField(null=True, default=None)
4085
-
4086
- class Meta:
4087
- unique_together = ("person", "project")
4088
-
4089
-
4090
- class FeatureProject(BasicRecord, LinkORM, TracksRun):
4091
- id: int = models.BigAutoField(primary_key=True)
4092
- feature: Feature = ForeignKey(Feature, CASCADE, related_name="links_project")
4093
- project: Project = ForeignKey(Project, PROTECT, related_name="links_feature")
4094
-
4095
- class Meta:
4096
- unique_together = ("feature", "project")
4097
-
4098
-
4099
- class SchemaProject(BasicRecord, LinkORM, TracksRun):
4100
- id: int = models.BigAutoField(primary_key=True)
4101
- schema: Schema = ForeignKey(Schema, CASCADE, related_name="links_project")
4102
- project: Project = ForeignKey(Project, PROTECT, related_name="links_schema")
4103
-
4104
- class Meta:
4105
- unique_together = ("schema", "project")
4106
-
4107
-
4108
- class ArtifactReference(BasicRecord, LinkORM, TracksRun):
4109
- id: int = models.BigAutoField(primary_key=True)
4110
- artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="links_reference")
4111
- reference: Reference = ForeignKey(Reference, PROTECT, related_name="links_artifact")
4112
- feature: Feature | None = ForeignKey(
4113
- Feature,
4114
- PROTECT,
4115
- null=True,
4116
- default=None,
4117
- related_name="links_artifactreference",
4118
- )
4119
- label_ref_is_name: bool | None = BooleanField(null=True, default=None)
4120
- feature_ref_is_name: bool | None = BooleanField(null=True, default=None)
4121
-
4122
- class Meta:
4123
- # can have the same label linked to the same artifact if the feature is different
4124
- unique_together = ("artifact", "reference", "feature")
4125
-
4126
-
4127
- class TransformReference(BasicRecord, LinkORM, TracksRun):
4128
- id: int = models.BigAutoField(primary_key=True)
4129
- transform: Transform = ForeignKey(
4130
- Transform, CASCADE, related_name="links_reference"
4131
- )
4132
- reference: Reference = ForeignKey(
4133
- Reference, PROTECT, related_name="links_transform"
4134
- )
4135
-
4136
- class Meta:
4137
- unique_together = ("transform", "reference")
4138
-
4139
-
4140
- class CollectionReference(BasicRecord, LinkORM, TracksRun):
4141
- id: int = models.BigAutoField(primary_key=True)
4142
- collection: Collection = ForeignKey(
4143
- Collection, CASCADE, related_name="links_reference"
4144
- )
4145
- reference: Reference = ForeignKey(
4146
- Reference, PROTECT, related_name="links_collection"
4147
- )
4148
-
4149
- class Meta:
4150
- unique_together = ("collection", "reference")
4151
-
4152
-
4153
- class Migration(BasicRecord):
4154
- app = CharField(max_length=255)
4155
- name = CharField(max_length=255)
4156
- applied: datetime = DateTimeField()
4157
-
4158
- class Meta:
4159
- db_table = "django_migrations"
4160
- managed = False
4161
-
4162
-
4163
- # -------------------------------------------------------------------------------------
4164
- # Low-level logic needed in lamindb-setup
4165
-
4166
- # Below is needed within lnschema-core because lamindb-setup already performs
4167
- # some logging
4168
-
4169
-
4170
- def format_field_value(value: datetime | str | Any) -> Any:
4171
- from datetime import datetime
4172
-
4173
- if isinstance(value, datetime):
4174
- return value.strftime("%Y-%m-%d %H:%M:%S %Z")
4175
-
4176
- if isinstance(value, str):
4177
- try:
4178
- value = datetime.fromisoformat(value)
4179
- value = value.strftime("%Y-%m-%d %H:%M:%S %Z")
4180
- except ValueError:
4181
- pass
4182
- return f"'{value}'"
4183
- else:
4184
- return value
4185
-
4186
-
4187
- class RegistryInfo:
4188
- def __init__(self, registry: Registry):
4189
- self.registry = registry
4190
-
4191
- def _get_type_for_field(self, field_name: str) -> str:
4192
- field = self.registry._meta.get_field(field_name)
4193
- related_model_name = (
4194
- field.related_model.__name__
4195
- if hasattr(field, "related_model") and field.related_model
4196
- else None
4197
- )
4198
- return related_model_name if related_model_name else field.get_internal_type()
4199
-
4200
- def _get_base_class_fields(self) -> list[str]:
4201
- return [
4202
- field.name
4203
- for base in self.registry.__bases__
4204
- if hasattr(base, "_meta")
4205
- for field in base._meta.get_fields()
4206
- ]
4207
-
4208
- def _reorder_fields_by_class(self, fields_to_order: list[Field]) -> list[Field]:
4209
- """Reorders the fields so that base class fields come last."""
4210
- non_base_class_fields = [
4211
- field
4212
- for field in fields_to_order
4213
- if field.name not in self._get_base_class_fields()
4214
- ]
4215
- found_base_class_fields = [
4216
- field
4217
- for field in fields_to_order
4218
- if field.name in self._get_base_class_fields()
4219
- ]
4220
- return non_base_class_fields + found_base_class_fields
4221
-
4222
- def get_simple_fields(self, return_str: bool = False) -> Any:
4223
- simple_fields = [
4224
- field
4225
- for field in self.registry._meta.get_fields()
4226
- if not (
4227
- isinstance(field, ManyToOneRel)
4228
- or isinstance(field, ManyToManyRel)
4229
- or isinstance(field, ManyToManyField)
4230
- or isinstance(field, ForeignKey)
4231
- or field.name.startswith("_")
4232
- or field.name == "id"
4233
- )
4234
- ]
4235
- simple_fields = self._reorder_fields_by_class(simple_fields)
4236
- if not return_str:
4237
- return simple_fields
4238
- else:
4239
- repr_str = f" {colors.italic('Simple fields')}\n"
4240
- if simple_fields:
4241
- repr_str += "".join(
4242
- [
4243
- f" .{field_name.name}: {self._get_type_for_field(field_name.name)}\n"
4244
- for field_name in simple_fields
4245
- ]
4246
- )
4247
- return repr_str
4248
-
4249
    def get_relational_fields(self, return_str: bool = False):
        """Collect the registry's relational fields, grouped by module.

        Splits fields into those defined in lamindb core versus those
        contributed by external modules (e.g. ``bionty``), keeping
        class-specific fields ahead of reverse relations.

        Args:
            return_str: If ``True``, return a formatted string; otherwise
                return ``(core_module_fields, external_modules_fields_by_modules)``
                where the second item maps module name -> list of fields.
        """
        # we ignore ManyToOneRel because it leads to so much clutter in the API
        # also note that our general guideline is to have related_name="+"
        # for ForeignKey fields
        relational_fields = (ManyToOneRel, ManyToManyRel, ManyToManyField, ForeignKey)

        # Fields declared directly on this model (forward FK/M2M only).
        class_specific_relational_fields = [
            field
            for field in self.registry._meta.fields + self.registry._meta.many_to_many
            if isinstance(field, relational_fields)
            and not field.name.startswith(("links_", "_"))
        ]

        # get_fields() additionally includes reverse relations from other models.
        non_class_specific_relational_fields = [
            field
            for field in self.registry._meta.get_fields()
            if isinstance(field, relational_fields)
            and not field.name.startswith(("links_", "_"))
        ]
        non_class_specific_relational_fields = self._reorder_fields_by_class(
            non_class_specific_relational_fields
        )

        # Ensure that class specific fields (e.g. Artifact) come before non-class specific fields (e.g. collection)
        filtered_non_class_specific = [
            field
            for field in non_class_specific_relational_fields
            if field not in class_specific_relational_fields
        ]
        ordered_relational_fields = (
            class_specific_relational_fields + filtered_non_class_specific
        )

        core_module_fields = []
        external_modules_fields = []
        for field in ordered_relational_fields:
            # NOTE(review): brittle — extracts the dotted model path from the
            # field's repr, e.g. "<ManyToManyRel: bionty.gene>" -> "bionty.gene".
            field_name = repr(field).split(": ")[1][:-1]
            # A single dot and no "lamindb" means the related model lives in
            # an external module (e.g. "bionty.gene").
            if field_name.count(".") == 1 and "lamindb" not in field_name:
                external_modules_fields.append(field)
            else:
                core_module_fields.append(field)

        def _get_related_field_type(field) -> str:
            # Prefer the related model's qualified name; fall back to the
            # generic type lookup when the cleaned name is empty.
            field_type = (
                field.related_model.__get_name_with_module__()
                .replace(
                    "Artifact", ""
                )  # some fields have an unnecessary 'Artifact' in their name
                .replace(
                    "Collection", ""
                )  # some fields have an unnecessary 'Collection' in their name
            )
            return (
                self._get_type_for_field(field.name)
                if not field_type.strip()
                else field_type
            )

        core_module_fields_formatted = [
            f"    .{field.name}: {_get_related_field_type(field)}\n"
            for field in core_module_fields
        ]
        external_modules_fields_formatted = [
            f"    .{field.name}: {_get_related_field_type(field)}\n"
            for field in external_modules_fields
        ]

        if not return_str:
            # Group external fields by their module prefix (e.g. "bionty").
            external_modules_fields_by_modules = defaultdict(list)
            for field_str, field in zip(
                external_modules_fields_formatted, external_modules_fields
            ):
                field_type = field_str.split(":")[1].split()[0]
                module_name = field_type.split(".")[0]
                external_modules_fields_by_modules[module_name].append(field)
            return core_module_fields, external_modules_fields_by_modules
        else:
            repr_str = ""

            # Non-external relational fields
            if core_module_fields:
                repr_str += f"  {colors.italic('Relational fields')}\n"
                repr_str += "".join(core_module_fields_formatted)

            # External relational fields
            external_modules = set()
            for field in external_modules_fields_formatted:
                field_type = field.split(":")[1].split()[0]
                external_modules.add(field_type.split(".")[0])

            if external_modules:
                # We want Bionty to show up before other modules
                external_modules = (
                    ["bionty"] + sorted(external_modules - {"bionty"})  # type: ignore
                    if "bionty" in external_modules
                    else sorted(external_modules)
                )
                for ext_module in external_modules:
                    # NOTE(review): substring match — a module name appearing
                    # anywhere in the formatted line counts as a hit.
                    ext_module_fields = [
                        field
                        for field in external_modules_fields_formatted
                        if ext_module in field
                    ]

                    if ext_module_fields:
                        repr_str += (
                            f"  {colors.italic(f'{ext_module.capitalize()} fields')}\n"
                        )
                        repr_str += "".join(ext_module_fields)

            return repr_str
4360
-
4361
-
4362
def registry_repr(cls):
    """Shows fields."""
    info = RegistryInfo(cls)
    # Header line plus the two field sections, with trailing newlines trimmed.
    sections = [
        f"{colors.green(cls.__name__)}\n",
        info.get_simple_fields(return_str=True),
        info.get_relational_fields(return_str=True),
    ]
    return "".join(sections).rstrip("\n")
4370
-
4371
-
4372
def record_repr(
    self: Record, include_foreign_keys: bool = True, exclude_field_names=None
) -> str:
    """Render a record as ``ClassName(field=value, ...)``.

    Args:
        include_foreign_keys: If ``True``, append ``<fk>_id`` entries for
            every foreign-key field.
        exclude_field_names: Field names to omit; defaults to
            ``["id", "updated_at", "source_code"]``.
    """
    if exclude_field_names is None:
        exclude_field_names = ["id", "updated_at", "source_code"]
    # Plain (non-FK) fields first, minus the excluded ones.
    field_names = []
    for field in self._meta.fields:
        if not isinstance(field, ForeignKey) and field.name not in exclude_field_names:
            field_names.append(field.name)
    if include_foreign_keys:
        for field in self._meta.fields:
            if isinstance(field, ForeignKey):
                field_names.append(f"{field.name}_id")
    # Push created_at to the end and pull uid to the front for readability.
    if "created_at" in field_names:
        field_names.remove("created_at")
        field_names.append("created_at")
    if field_names[0] != "uid" and "uid" in field_names:
        field_names.remove("uid")
        field_names.insert(0, "uid")
    fields_str = {}
    for name in field_names:
        if name.startswith("_") or not hasattr(self, name):
            continue
        value = getattr(self, name)
        # Force strip the time component of the version
        if name == "version" and value:
            fields_str[name] = f"'{str(value).split()[0]}'"
        else:
            fields_str[name] = format_field_value(value)
    rendered = [
        f"{name}={fields_str[name]}"
        for name in fields_str
        if fields_str[name] is not None
    ]
    return f"{self.__class__.__name__}({', '.join(rendered)})"
4407
-
4408
-
4409
- # below is code to further format the repr of a record
4410
- #
4411
- # def format_repr(
4412
- # record: Record, exclude_field_names: str | list[str] | None = None
4413
- # ) -> str:
4414
- # if isinstance(exclude_field_names, str):
4415
- # exclude_field_names = [exclude_field_names]
4416
- # exclude_field_names_init = ["id", "created_at", "updated_at"]
4417
- # if exclude_field_names is not None:
4418
- # exclude_field_names_init += exclude_field_names
4419
- # return record.__repr__(
4420
- # include_foreign_keys=False, exclude_field_names=exclude_field_names_init
4421
- # )
4422
-
4423
-
4424
# Install record_repr for both repr() and str() so records render
# identically in interactive sessions and in string formatting.
Record.__repr__ = record_repr  # type: ignore
Record.__str__ = record_repr  # type: ignore
4426
-
4427
-
4428
def deferred_attribute__repr__(self):
    """Render a deferred field attribute as ``FieldAttr(Model.field)``."""
    model_name = self.field.model.__name__
    return "FieldAttr({}.{})".format(model_name, self.field.name)
4430
-
4431
-
4432
# Make Django's DeferredAttribute descriptors print as FieldAttr(Model.field).
FieldAttr.__repr__ = deferred_attribute__repr__  # type: ignore
# backward compatibility
CanValidate = CanCurate
FeatureSet = Schema