lamindb 1.1.0__py3-none-any.whl → 1.2a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +31 -26
- lamindb/_finish.py +9 -1
- lamindb/_tracked.py +26 -3
- lamindb/_view.py +2 -3
- lamindb/base/__init__.py +1 -1
- lamindb/base/ids.py +1 -10
- lamindb/base/users.py +1 -4
- lamindb/core/__init__.py +7 -65
- lamindb/core/_context.py +41 -10
- lamindb/core/_mapped_collection.py +4 -2
- lamindb/core/_settings.py +6 -6
- lamindb/core/_sync_git.py +1 -1
- lamindb/core/_track_environment.py +2 -1
- lamindb/core/datasets/_small.py +3 -3
- lamindb/core/loaders.py +22 -9
- lamindb/core/storage/_anndata_accessor.py +8 -3
- lamindb/core/storage/_backed_access.py +14 -7
- lamindb/core/storage/_pyarrow_dataset.py +24 -9
- lamindb/core/storage/_tiledbsoma.py +6 -4
- lamindb/core/storage/_zarr.py +32 -11
- lamindb/core/storage/objects.py +59 -26
- lamindb/core/storage/paths.py +16 -13
- lamindb/curators/__init__.py +173 -145
- lamindb/errors.py +1 -1
- lamindb/integrations/_vitessce.py +4 -4
- lamindb/migrations/0089_subsequent_runs.py +159 -0
- lamindb/migrations/0090_runproject_project_runs.py +73 -0
- lamindb/migrations/{0088_squashed.py → 0090_squashed.py} +245 -177
- lamindb/models/__init__.py +79 -0
- lamindb/{core → models}/_describe.py +3 -3
- lamindb/{core → models}/_django.py +8 -5
- lamindb/{core → models}/_feature_manager.py +103 -87
- lamindb/{_from_values.py → models/_from_values.py} +5 -2
- lamindb/{core/versioning.py → models/_is_versioned.py} +94 -6
- lamindb/{core → models}/_label_manager.py +10 -17
- lamindb/{core/relations.py → models/_relations.py} +8 -1
- lamindb/models/artifact.py +2601 -0
- lamindb/{_can_curate.py → models/can_curate.py} +349 -180
- lamindb/models/collection.py +683 -0
- lamindb/models/core.py +135 -0
- lamindb/models/feature.py +643 -0
- lamindb/models/flextable.py +163 -0
- lamindb/{_parents.py → models/has_parents.py} +55 -49
- lamindb/models/project.py +384 -0
- lamindb/{_query_manager.py → models/query_manager.py} +10 -8
- lamindb/{_query_set.py → models/query_set.py} +52 -30
- lamindb/models/record.py +1757 -0
- lamindb/models/run.py +563 -0
- lamindb/{_save.py → models/save.py} +18 -8
- lamindb/models/schema.py +732 -0
- lamindb/models/transform.py +360 -0
- lamindb/models/ulabel.py +249 -0
- {lamindb-1.1.0.dist-info → lamindb-1.2a2.dist-info}/METADATA +5 -5
- lamindb-1.2a2.dist-info/RECORD +94 -0
- lamindb/_artifact.py +0 -1361
- lamindb/_collection.py +0 -440
- lamindb/_feature.py +0 -316
- lamindb/_is_versioned.py +0 -40
- lamindb/_record.py +0 -1065
- lamindb/_run.py +0 -60
- lamindb/_schema.py +0 -347
- lamindb/_storage.py +0 -15
- lamindb/_transform.py +0 -170
- lamindb/_ulabel.py +0 -56
- lamindb/_utils.py +0 -9
- lamindb/base/validation.py +0 -63
- lamindb/core/_data.py +0 -491
- lamindb/core/fields.py +0 -12
- lamindb/models.py +0 -4435
- lamindb-1.1.0.dist-info/RECORD +0 -95
- {lamindb-1.1.0.dist-info → lamindb-1.2a2.dist-info}/LICENSE +0 -0
- {lamindb-1.1.0.dist-info → lamindb-1.2a2.dist-info}/WHEEL +0 -0
lamindb/models.py
DELETED
@@ -1,4435 +0,0 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
|
3
|
-
import sys
|
4
|
-
from collections import defaultdict
|
5
|
-
from datetime import date, datetime # noqa: TC003
|
6
|
-
from itertools import chain
|
7
|
-
from typing import (
|
8
|
-
TYPE_CHECKING,
|
9
|
-
Any,
|
10
|
-
Literal,
|
11
|
-
NamedTuple,
|
12
|
-
overload,
|
13
|
-
)
|
14
|
-
|
15
|
-
from django.core.validators import RegexValidator
|
16
|
-
from django.db import IntegrityError, models
|
17
|
-
from django.db.models import CASCADE, PROTECT, Field, Q
|
18
|
-
from django.db.models.base import ModelBase
|
19
|
-
from django.db.models.fields.related import (
|
20
|
-
ManyToManyField,
|
21
|
-
ManyToManyRel,
|
22
|
-
ManyToOneRel,
|
23
|
-
)
|
24
|
-
from lamin_utils import colors
|
25
|
-
from lamindb_setup import _check_instance_setup
|
26
|
-
from lamindb_setup.core.hashing import HASH_LENGTH, hash_dict
|
27
|
-
|
28
|
-
from lamindb.base import deprecated, doc_args
|
29
|
-
from lamindb.base.fields import (
|
30
|
-
BigIntegerField,
|
31
|
-
BooleanField,
|
32
|
-
CharField,
|
33
|
-
DateField,
|
34
|
-
DateTimeField,
|
35
|
-
EmailField,
|
36
|
-
ForeignKey,
|
37
|
-
IntegerField,
|
38
|
-
JSONField,
|
39
|
-
OneToOneField,
|
40
|
-
TextField,
|
41
|
-
URLField,
|
42
|
-
)
|
43
|
-
|
44
|
-
from .base.ids import base62_8, base62_12, base62_20
|
45
|
-
from .base.types import (
|
46
|
-
ArtifactKind,
|
47
|
-
FeatureDtype,
|
48
|
-
FieldAttr,
|
49
|
-
ListLike,
|
50
|
-
StrField,
|
51
|
-
TransformType,
|
52
|
-
)
|
53
|
-
from .base.users import current_user_id
|
54
|
-
|
55
|
-
if TYPE_CHECKING:
|
56
|
-
from collections.abc import Iterable
|
57
|
-
from pathlib import Path
|
58
|
-
|
59
|
-
import numpy as np
|
60
|
-
import pandas as pd
|
61
|
-
from anndata import AnnData
|
62
|
-
from lamin_utils._inspect import InspectResult
|
63
|
-
from lamindb_setup.core.types import UPathStr
|
64
|
-
from mudata import MuData
|
65
|
-
from pyarrow.dataset import Dataset as PyArrowDataset
|
66
|
-
from tiledbsoma import Collection as SOMACollection
|
67
|
-
from tiledbsoma import Experiment as SOMAExperiment
|
68
|
-
from tiledbsoma import Measurement as SOMAMeasurement
|
69
|
-
from upath import UPath
|
70
|
-
|
71
|
-
from lamindb.core import LabelManager, MappedCollection, QuerySet, RecordList
|
72
|
-
from lamindb.core.storage import AnnDataAccessor, BackedAccessor
|
73
|
-
|
74
|
-
|
75
|
-
_TRACKING_READY: bool | None = None
|
76
|
-
|
77
|
-
|
78
|
-
class IsVersioned(models.Model):
|
79
|
-
"""Base class for versioned models."""
|
80
|
-
|
81
|
-
class Meta:
|
82
|
-
abstract = True
|
83
|
-
|
84
|
-
_len_stem_uid: int
|
85
|
-
|
86
|
-
version: str | None = CharField(max_length=30, null=True, db_index=True)
|
87
|
-
"""Version (default `None`).
|
88
|
-
|
89
|
-
Defines version of a family of records characterized by the same `stem_uid`.
|
90
|
-
|
91
|
-
Consider using `semantic versioning <https://semver.org>`__
|
92
|
-
with `Python versioning <https://peps.python.org/pep-0440/>`__.
|
93
|
-
"""
|
94
|
-
is_latest: bool = BooleanField(default=True, db_index=True)
|
95
|
-
"""Boolean flag that indicates whether a record is the latest in its version family."""
|
96
|
-
|
97
|
-
@overload
|
98
|
-
def __init__(self): ...
|
99
|
-
|
100
|
-
@overload
|
101
|
-
def __init__(
|
102
|
-
self,
|
103
|
-
*db_args,
|
104
|
-
): ...
|
105
|
-
|
106
|
-
def __init__(
|
107
|
-
self,
|
108
|
-
*args,
|
109
|
-
**kwargs,
|
110
|
-
):
|
111
|
-
self._revises = kwargs.pop("revises") if "revises" in kwargs else None
|
112
|
-
super().__init__(*args, **kwargs)
|
113
|
-
|
114
|
-
@property
|
115
|
-
def stem_uid(self) -> str:
|
116
|
-
"""Universal id characterizing the version family.
|
117
|
-
|
118
|
-
The full uid of a record is obtained via concatenating the stem uid and version information::
|
119
|
-
|
120
|
-
stem_uid = random_base62(n_char) # a random base62 sequence of length 12 (transform) or 16 (artifact, collection)
|
121
|
-
version_uid = "0000" # an auto-incrementing 4-digit base62 number
|
122
|
-
uid = f"{stem_uid}{version_uid}" # concatenate the stem_uid & version_uid
|
123
|
-
|
124
|
-
"""
|
125
|
-
return self.uid[: self._len_stem_uid] # type: ignore
|
126
|
-
|
127
|
-
@property
|
128
|
-
def versions(self) -> QuerySet:
|
129
|
-
"""Lists all records of the same version family.
|
130
|
-
|
131
|
-
>>> new_artifact = ln.Artifact(df2, revises=artifact).save()
|
132
|
-
>>> new_artifact.versions()
|
133
|
-
"""
|
134
|
-
db = self._state.db
|
135
|
-
if db is not None and db != "default":
|
136
|
-
return self.__class__.using(db).filter(uid__startswith=self.stem_uid) # type: ignore
|
137
|
-
else:
|
138
|
-
return self.__class__.filter(uid__startswith=self.stem_uid) # type: ignore
|
139
|
-
|
140
|
-
def _add_to_version_family(self, revises: IsVersioned, version: str | None = None):
|
141
|
-
"""Add current record to a version family.
|
142
|
-
|
143
|
-
Args:
|
144
|
-
revises: a record that belongs to the version family.
|
145
|
-
version: semantic version of the record.
|
146
|
-
"""
|
147
|
-
pass
|
148
|
-
|
149
|
-
|
150
|
-
def current_run() -> Run | None:
|
151
|
-
global _TRACKING_READY
|
152
|
-
|
153
|
-
if not _TRACKING_READY:
|
154
|
-
_TRACKING_READY = _check_instance_setup()
|
155
|
-
if _TRACKING_READY:
|
156
|
-
import lamindb
|
157
|
-
|
158
|
-
# also see get_run() in core._data
|
159
|
-
run = lamindb._tracked.get_current_tracked_run()
|
160
|
-
if run is None:
|
161
|
-
run = lamindb.context.run
|
162
|
-
return run
|
163
|
-
else:
|
164
|
-
return None
|
165
|
-
|
166
|
-
|
167
|
-
class TracksRun(models.Model):
|
168
|
-
"""Base class tracking latest run, creating user, and `created_at` timestamp."""
|
169
|
-
|
170
|
-
class Meta:
|
171
|
-
abstract = True
|
172
|
-
|
173
|
-
created_at: datetime = DateTimeField(
|
174
|
-
editable=False, db_default=models.functions.Now(), db_index=True
|
175
|
-
)
|
176
|
-
"""Time of creation of record."""
|
177
|
-
created_by: User = ForeignKey(
|
178
|
-
"lamindb.User",
|
179
|
-
PROTECT,
|
180
|
-
editable=False,
|
181
|
-
default=current_user_id,
|
182
|
-
related_name="+",
|
183
|
-
)
|
184
|
-
"""Creator of record."""
|
185
|
-
run: Run | None = ForeignKey(
|
186
|
-
"lamindb.Run", PROTECT, null=True, default=current_run, related_name="+"
|
187
|
-
)
|
188
|
-
"""Last run that created or updated the record."""
|
189
|
-
|
190
|
-
@overload
|
191
|
-
def __init__(self): ...
|
192
|
-
|
193
|
-
@overload
|
194
|
-
def __init__(
|
195
|
-
self,
|
196
|
-
*db_args,
|
197
|
-
): ...
|
198
|
-
|
199
|
-
def __init__(
|
200
|
-
self,
|
201
|
-
*args,
|
202
|
-
**kwargs,
|
203
|
-
):
|
204
|
-
super().__init__(*args, **kwargs)
|
205
|
-
|
206
|
-
|
207
|
-
class TracksUpdates(models.Model):
|
208
|
-
"""Base class tracking previous runs and `updated_at` timestamp."""
|
209
|
-
|
210
|
-
class Meta:
|
211
|
-
abstract = True
|
212
|
-
|
213
|
-
updated_at: datetime = DateTimeField(
|
214
|
-
editable=False, db_default=models.functions.Now(), db_index=True
|
215
|
-
)
|
216
|
-
"""Time of last update to record."""
|
217
|
-
|
218
|
-
@overload
|
219
|
-
def __init__(self): ...
|
220
|
-
|
221
|
-
@overload
|
222
|
-
def __init__(
|
223
|
-
self,
|
224
|
-
*db_args,
|
225
|
-
): ...
|
226
|
-
|
227
|
-
def __init__(
|
228
|
-
self,
|
229
|
-
*args,
|
230
|
-
**kwargs,
|
231
|
-
):
|
232
|
-
super().__init__(*args, **kwargs)
|
233
|
-
|
234
|
-
|
235
|
-
class CanCurate:
|
236
|
-
"""Base class providing :class:`~lamindb.core.Record`-based validation."""
|
237
|
-
|
238
|
-
@classmethod
|
239
|
-
def inspect(
|
240
|
-
cls,
|
241
|
-
values: ListLike,
|
242
|
-
field: str | StrField | None = None,
|
243
|
-
*,
|
244
|
-
mute: bool = False,
|
245
|
-
organism: str | Record | None = None,
|
246
|
-
source: Record | None = None,
|
247
|
-
strict_source: bool = False,
|
248
|
-
) -> InspectResult:
|
249
|
-
"""Inspect if values are mappable to a field.
|
250
|
-
|
251
|
-
Being mappable means that an exact match exists.
|
252
|
-
|
253
|
-
Args:
|
254
|
-
values: Values that will be checked against the field.
|
255
|
-
field: The field of values. Examples are `'ontology_id'` to map
|
256
|
-
against the source ID or `'name'` to map against the ontologies
|
257
|
-
field names.
|
258
|
-
mute: Whether to mute logging.
|
259
|
-
organism: An Organism name or record.
|
260
|
-
source: A `bionty.Source` record that specifies the version to inspect against.
|
261
|
-
strict_source: Determines the validation behavior against records in the registry.
|
262
|
-
- If `False`, validation will include all records in the registry, ignoring the specified source.
|
263
|
-
- If `True`, validation will only include records in the registry that are linked to the specified source.
|
264
|
-
Note: this parameter won't affect validation against bionty/public sources.
|
265
|
-
|
266
|
-
See Also:
|
267
|
-
:meth:`~lamindb.core.CanCurate.validate`
|
268
|
-
|
269
|
-
Examples:
|
270
|
-
>>> import bionty as bt
|
271
|
-
>>> bt.settings.organism = "human"
|
272
|
-
>>> ln.save(bt.Gene.from_values(["A1CF", "A1BG", "BRCA2"], field="symbol"))
|
273
|
-
>>> gene_symbols = ["A1CF", "A1BG", "FANCD1", "FANCD20"]
|
274
|
-
>>> result = bt.Gene.inspect(gene_symbols, field=bt.Gene.symbol)
|
275
|
-
>>> result.validated
|
276
|
-
['A1CF', 'A1BG']
|
277
|
-
>>> result.non_validated
|
278
|
-
['FANCD1', 'FANCD20']
|
279
|
-
"""
|
280
|
-
pass
|
281
|
-
|
282
|
-
@classmethod
|
283
|
-
def validate(
|
284
|
-
cls,
|
285
|
-
values: ListLike,
|
286
|
-
field: str | StrField | None = None,
|
287
|
-
*,
|
288
|
-
mute: bool = False,
|
289
|
-
organism: str | Record | None = None,
|
290
|
-
source: Record | None = None,
|
291
|
-
strict_source: bool = False,
|
292
|
-
) -> np.ndarray:
|
293
|
-
"""Validate values against existing values of a string field.
|
294
|
-
|
295
|
-
Note this is strict_source validation, only asserts exact matches.
|
296
|
-
|
297
|
-
Args:
|
298
|
-
values: Values that will be validated against the field.
|
299
|
-
field: The field of values.
|
300
|
-
Examples are `'ontology_id'` to map against the source ID
|
301
|
-
or `'name'` to map against the ontologies field names.
|
302
|
-
mute: Whether to mute logging.
|
303
|
-
organism: An Organism name or record.
|
304
|
-
source: A `bionty.Source` record that specifies the version to validate against.
|
305
|
-
strict_source: Determines the validation behavior against records in the registry.
|
306
|
-
- If `False`, validation will include all records in the registry, ignoring the specified source.
|
307
|
-
- If `True`, validation will only include records in the registry that are linked to the specified source.
|
308
|
-
Note: this parameter won't affect validation against bionty/public sources.
|
309
|
-
|
310
|
-
Returns:
|
311
|
-
A vector of booleans indicating if an element is validated.
|
312
|
-
|
313
|
-
See Also:
|
314
|
-
:meth:`~lamindb.core.CanCurate.inspect`
|
315
|
-
|
316
|
-
Examples:
|
317
|
-
>>> import bionty as bt
|
318
|
-
>>> bt.settings.organism = "human"
|
319
|
-
>>> ln.save(bt.Gene.from_values(["A1CF", "A1BG", "BRCA2"], field="symbol"))
|
320
|
-
>>> gene_symbols = ["A1CF", "A1BG", "FANCD1", "FANCD20"]
|
321
|
-
>>> bt.Gene.validate(gene_symbols, field=bt.Gene.symbol)
|
322
|
-
array([ True, True, False, False])
|
323
|
-
"""
|
324
|
-
pass
|
325
|
-
|
326
|
-
def from_values(
|
327
|
-
cls,
|
328
|
-
values: ListLike,
|
329
|
-
field: StrField | None = None,
|
330
|
-
create: bool = False,
|
331
|
-
organism: Record | str | None = None,
|
332
|
-
source: Record | None = None,
|
333
|
-
mute: bool = False,
|
334
|
-
) -> RecordList:
|
335
|
-
"""Bulk create validated records by parsing values for an identifier such as a name or an id).
|
336
|
-
|
337
|
-
Args:
|
338
|
-
values: A list of values for an identifier, e.g.
|
339
|
-
`["name1", "name2"]`.
|
340
|
-
field: A `Record` field to look up, e.g., `bt.CellMarker.name`.
|
341
|
-
create: Whether to create records if they don't exist.
|
342
|
-
organism: A `bionty.Organism` name or record.
|
343
|
-
source: A `bionty.Source` record to validate against to create records for.
|
344
|
-
mute: Whether to mute logging.
|
345
|
-
|
346
|
-
Returns:
|
347
|
-
A list of validated records. For bionty registries. Also returns knowledge-coupled records.
|
348
|
-
|
349
|
-
Notes:
|
350
|
-
For more info, see tutorial: :doc:`docs:bio-registries`.
|
351
|
-
|
352
|
-
Examples:
|
353
|
-
|
354
|
-
Bulk create from non-validated values will log warnings & returns empty list:
|
355
|
-
|
356
|
-
>>> ulabels = ln.ULabel.from_values(["benchmark", "prediction", "test"], field="name")
|
357
|
-
>>> assert len(ulabels) == 0
|
358
|
-
|
359
|
-
Bulk create records from validated values returns the corresponding existing records:
|
360
|
-
|
361
|
-
>>> ln.save([ln.ULabel(name=name) for name in ["benchmark", "prediction", "test"]])
|
362
|
-
>>> ulabels = ln.ULabel.from_values(["benchmark", "prediction", "test"], field="name")
|
363
|
-
>>> assert len(ulabels) == 3
|
364
|
-
|
365
|
-
Bulk create records from public reference:
|
366
|
-
|
367
|
-
>>> import bionty as bt
|
368
|
-
>>> records = bt.CellType.from_values(["T cell", "B cell"], field="name")
|
369
|
-
>>> records
|
370
|
-
"""
|
371
|
-
pass
|
372
|
-
|
373
|
-
@classmethod
|
374
|
-
def standardize(
|
375
|
-
cls,
|
376
|
-
values: Iterable,
|
377
|
-
field: str | StrField | None = None,
|
378
|
-
*,
|
379
|
-
return_field: str | StrField | None = None,
|
380
|
-
return_mapper: bool = False,
|
381
|
-
case_sensitive: bool = False,
|
382
|
-
mute: bool = False,
|
383
|
-
public_aware: bool = True,
|
384
|
-
keep: Literal["first", "last", False] = "first",
|
385
|
-
synonyms_field: str = "synonyms",
|
386
|
-
organism: str | Record | None = None,
|
387
|
-
source: Record | None = None,
|
388
|
-
strict_source: bool = False,
|
389
|
-
) -> list[str] | dict[str, str]:
|
390
|
-
"""Maps input synonyms to standardized names.
|
391
|
-
|
392
|
-
Args:
|
393
|
-
values: Identifiers that will be standardized.
|
394
|
-
field: The field representing the standardized names.
|
395
|
-
return_field: The field to return. Defaults to field.
|
396
|
-
return_mapper: If `True`, returns `{input_value: standardized_name}`.
|
397
|
-
case_sensitive: Whether the mapping is case sensitive.
|
398
|
-
mute: Whether to mute logging.
|
399
|
-
public_aware: Whether to standardize from Bionty reference. Defaults to `True` for Bionty registries.
|
400
|
-
keep: When a synonym maps to multiple names, determines which duplicates to mark as `pd.DataFrame.duplicated`:
|
401
|
-
- `"first"`: returns the first mapped standardized name
|
402
|
-
- `"last"`: returns the last mapped standardized name
|
403
|
-
- `False`: returns all mapped standardized name.
|
404
|
-
|
405
|
-
When `keep` is `False`, the returned list of standardized names will contain nested lists in case of duplicates.
|
406
|
-
|
407
|
-
When a field is converted into return_field, keep marks which matches to keep when multiple return_field values map to the same field value.
|
408
|
-
synonyms_field: A field containing the concatenated synonyms.
|
409
|
-
organism: An Organism name or record.
|
410
|
-
source: A `bionty.Source` record that specifies the version to validate against.
|
411
|
-
strict_source: Determines the validation behavior against records in the registry.
|
412
|
-
- If `False`, validation will include all records in the registry, ignoring the specified source.
|
413
|
-
- If `True`, validation will only include records in the registry that are linked to the specified source.
|
414
|
-
Note: this parameter won't affect validation against bionty/public sources.
|
415
|
-
|
416
|
-
Returns:
|
417
|
-
If `return_mapper` is `False`: a list of standardized names. Otherwise,
|
418
|
-
a dictionary of mapped values with mappable synonyms as keys and
|
419
|
-
standardized names as values.
|
420
|
-
|
421
|
-
See Also:
|
422
|
-
:meth:`~lamindb.core.CanCurate.add_synonym`
|
423
|
-
Add synonyms.
|
424
|
-
:meth:`~lamindb.core.CanCurate.remove_synonym`
|
425
|
-
Remove synonyms.
|
426
|
-
|
427
|
-
Examples:
|
428
|
-
>>> import bionty as bt
|
429
|
-
>>> bt.settings.organism = "human"
|
430
|
-
>>> ln.save(bt.Gene.from_values(["A1CF", "A1BG", "BRCA2"], field="symbol"))
|
431
|
-
>>> gene_synonyms = ["A1CF", "A1BG", "FANCD1", "FANCD20"]
|
432
|
-
>>> standardized_names = bt.Gene.standardize(gene_synonyms)
|
433
|
-
>>> standardized_names
|
434
|
-
['A1CF', 'A1BG', 'BRCA2', 'FANCD20']
|
435
|
-
"""
|
436
|
-
pass
|
437
|
-
|
438
|
-
def add_synonym(
|
439
|
-
self,
|
440
|
-
synonym: str | ListLike,
|
441
|
-
force: bool = False,
|
442
|
-
save: bool | None = None,
|
443
|
-
):
|
444
|
-
"""Add synonyms to a record.
|
445
|
-
|
446
|
-
Args:
|
447
|
-
synonym: The synonyms to add to the record.
|
448
|
-
force: Whether to add synonyms even if they are already synonyms of other records.
|
449
|
-
save: Whether to save the record to the database.
|
450
|
-
|
451
|
-
See Also:
|
452
|
-
:meth:`~lamindb.core.CanCurate.remove_synonym`
|
453
|
-
Remove synonyms.
|
454
|
-
|
455
|
-
Examples:
|
456
|
-
>>> import bionty as bt
|
457
|
-
>>> bt.CellType.from_source(name="T cell").save()
|
458
|
-
>>> lookup = bt.CellType.lookup()
|
459
|
-
>>> record = lookup.t_cell
|
460
|
-
>>> record.synonyms
|
461
|
-
'T-cell|T lymphocyte|T-lymphocyte'
|
462
|
-
>>> record.add_synonym("T cells")
|
463
|
-
>>> record.synonyms
|
464
|
-
'T cells|T-cell|T-lymphocyte|T lymphocyte'
|
465
|
-
"""
|
466
|
-
pass
|
467
|
-
|
468
|
-
def remove_synonym(self, synonym: str | ListLike):
|
469
|
-
"""Remove synonyms from a record.
|
470
|
-
|
471
|
-
Args:
|
472
|
-
synonym: The synonym values to remove.
|
473
|
-
|
474
|
-
See Also:
|
475
|
-
:meth:`~lamindb.core.CanCurate.add_synonym`
|
476
|
-
Add synonyms
|
477
|
-
|
478
|
-
Examples:
|
479
|
-
>>> import bionty as bt
|
480
|
-
>>> bt.CellType.from_source(name="T cell").save()
|
481
|
-
>>> lookup = bt.CellType.lookup()
|
482
|
-
>>> record = lookup.t_cell
|
483
|
-
>>> record.synonyms
|
484
|
-
'T-cell|T lymphocyte|T-lymphocyte'
|
485
|
-
>>> record.remove_synonym("T-cell")
|
486
|
-
'T lymphocyte|T-lymphocyte'
|
487
|
-
"""
|
488
|
-
pass
|
489
|
-
|
490
|
-
def set_abbr(self, value: str):
|
491
|
-
"""Set value for abbr field and add to synonyms.
|
492
|
-
|
493
|
-
Args:
|
494
|
-
value: A value for an abbreviation.
|
495
|
-
|
496
|
-
See Also:
|
497
|
-
:meth:`~lamindb.core.CanCurate.add_synonym`
|
498
|
-
|
499
|
-
Examples:
|
500
|
-
>>> import bionty as bt
|
501
|
-
>>> bt.ExperimentalFactor.from_source(name="single-cell RNA sequencing").save()
|
502
|
-
>>> scrna = bt.ExperimentalFactor.get(name="single-cell RNA sequencing")
|
503
|
-
>>> scrna.abbr
|
504
|
-
None
|
505
|
-
>>> scrna.synonyms
|
506
|
-
'single-cell RNA-seq|single-cell transcriptome sequencing|scRNA-seq|single cell RNA sequencing'
|
507
|
-
>>> scrna.set_abbr("scRNA")
|
508
|
-
>>> scrna.abbr
|
509
|
-
'scRNA'
|
510
|
-
>>> scrna.synonyms
|
511
|
-
'scRNA|single-cell RNA-seq|single cell RNA sequencing|single-cell transcriptome sequencing|scRNA-seq'
|
512
|
-
>>> scrna.save()
|
513
|
-
"""
|
514
|
-
pass
|
515
|
-
|
516
|
-
|
517
|
-
class HasParents:
|
518
|
-
"""Base class for hierarchical registries (ontologies)."""
|
519
|
-
|
520
|
-
def view_parents(
|
521
|
-
self,
|
522
|
-
field: StrField | None = None,
|
523
|
-
with_children: bool = False,
|
524
|
-
distance: int = 5,
|
525
|
-
):
|
526
|
-
"""View parents in an ontology.
|
527
|
-
|
528
|
-
Args:
|
529
|
-
field: Field to display on graph
|
530
|
-
with_children: Whether to also show children.
|
531
|
-
distance: Maximum distance still shown.
|
532
|
-
|
533
|
-
Ontological hierarchies: :class:`~lamindb.ULabel` (project & sub-project), :class:`~bionty.CellType` (cell type & subtype).
|
534
|
-
|
535
|
-
Examples:
|
536
|
-
>>> import bionty as bt
|
537
|
-
>>> bt.Tissue.from_source(name="subsegmental bronchus").save()
|
538
|
-
>>> record = bt.Tissue.get(name="respiratory tube")
|
539
|
-
>>> record.view_parents()
|
540
|
-
>>> tissue.view_parents(with_children=True)
|
541
|
-
"""
|
542
|
-
pass
|
543
|
-
|
544
|
-
def query_parents(self) -> QuerySet:
|
545
|
-
"""Query parents in an ontology."""
|
546
|
-
pass
|
547
|
-
|
548
|
-
def query_children(self) -> QuerySet:
|
549
|
-
"""Query children in an ontology."""
|
550
|
-
pass
|
551
|
-
|
552
|
-
|
553
|
-
class ValidateFields:
|
554
|
-
pass
|
555
|
-
|
556
|
-
|
557
|
-
RECORD_REGISTRY_EXAMPLE = """Example::
|
558
|
-
|
559
|
-
from lamindb import Record, fields
|
560
|
-
|
561
|
-
# sub-classing `Record` creates a new registry
|
562
|
-
class Experiment(Record):
|
563
|
-
name: str = fields.CharField()
|
564
|
-
|
565
|
-
# instantiating `Experiment` creates a record `experiment`
|
566
|
-
experiment = Experiment(name="my experiment")
|
567
|
-
|
568
|
-
# you can save the record to the database
|
569
|
-
experiment.save()
|
570
|
-
|
571
|
-
# `Experiment` refers to the registry, which you can query
|
572
|
-
df = Experiment.filter(name__startswith="my ").df()
|
573
|
-
"""
|
574
|
-
|
575
|
-
|
576
|
-
# this is the metaclass for Record
|
577
|
-
@doc_args(RECORD_REGISTRY_EXAMPLE)
|
578
|
-
class Registry(ModelBase):
|
579
|
-
"""Metaclass for :class:`~lamindb.core.Record`.
|
580
|
-
|
581
|
-
Each `Registry` *object* is a `Record` *class* and corresponds to a table in the metadata SQL database.
|
582
|
-
|
583
|
-
You work with `Registry` objects whenever you use *class methods* of `Record`.
|
584
|
-
|
585
|
-
You call any subclass of `Record` a "registry" and their objects "records". A `Record` object corresponds to a row in the SQL table.
|
586
|
-
|
587
|
-
If you want to create a new registry, you sub-class `Record`.
|
588
|
-
|
589
|
-
{}
|
590
|
-
|
591
|
-
Note: `Registry` inherits from Django's `ModelBase`.
|
592
|
-
"""
|
593
|
-
|
594
|
-
def __new__(cls, name, bases, attrs, **kwargs):
|
595
|
-
new_class = super().__new__(cls, name, bases, attrs, **kwargs)
|
596
|
-
return new_class
|
597
|
-
|
598
|
-
# below creates a sensible auto-complete behavior that differs across the
|
599
|
-
# class and instance level in Jupyter Editors it doesn't have any effect for
|
600
|
-
# static type analyzer like pylance used in VSCode
|
601
|
-
def __dir__(cls):
|
602
|
-
# this is needed to bring auto-complete on the class-level back
|
603
|
-
# https://laminlabs.slack.com/archives/C04FPE8V01W/p1717535625268849
|
604
|
-
# Filter class attributes, excluding instance methods
|
605
|
-
exclude_instance_methods = "sphinx" not in sys.modules
|
606
|
-
# https://laminlabs.slack.com/archives/C04FPE8V01W/p1721134595920959
|
607
|
-
|
608
|
-
def include_attribute(attr_name, attr_value):
|
609
|
-
if attr_name.startswith("__"):
|
610
|
-
return False
|
611
|
-
if exclude_instance_methods and callable(attr_value):
|
612
|
-
return isinstance(attr_value, (classmethod, staticmethod, type))
|
613
|
-
return True
|
614
|
-
|
615
|
-
# check also inherited attributes
|
616
|
-
if hasattr(cls, "mro"):
|
617
|
-
attrs = chain(*(c.__dict__.items() for c in cls.mro()))
|
618
|
-
else:
|
619
|
-
attrs = cls.__dict__.items()
|
620
|
-
|
621
|
-
result = []
|
622
|
-
for attr_name, attr_value in attrs:
|
623
|
-
if attr_name not in result and include_attribute(attr_name, attr_value):
|
624
|
-
result.append(attr_name)
|
625
|
-
|
626
|
-
# Add non-dunder attributes from Registry
|
627
|
-
for attr in dir(Registry):
|
628
|
-
if not attr.startswith("__") and attr not in result:
|
629
|
-
result.append(attr)
|
630
|
-
return result
|
631
|
-
|
632
|
-
def __repr__(cls) -> str:
|
633
|
-
return registry_repr(cls)
|
634
|
-
|
635
|
-
def lookup(
|
636
|
-
cls,
|
637
|
-
field: StrField | None = None,
|
638
|
-
return_field: StrField | None = None,
|
639
|
-
) -> NamedTuple:
|
640
|
-
"""Return an auto-complete object for a field.
|
641
|
-
|
642
|
-
Args:
|
643
|
-
field: The field to look up the values for. Defaults to first string field.
|
644
|
-
return_field: The field to return. If `None`, returns the whole record.
|
645
|
-
|
646
|
-
Returns:
|
647
|
-
A `NamedTuple` of lookup information of the field values with a
|
648
|
-
dictionary converter.
|
649
|
-
|
650
|
-
See Also:
|
651
|
-
:meth:`~lamindb.core.Record.search`
|
652
|
-
|
653
|
-
Examples:
|
654
|
-
>>> import bionty as bt
|
655
|
-
>>> bt.settings.organism = "human"
|
656
|
-
>>> bt.Gene.from_source(symbol="ADGB-DT").save()
|
657
|
-
>>> lookup = bt.Gene.lookup()
|
658
|
-
>>> lookup.adgb_dt
|
659
|
-
>>> lookup_dict = lookup.dict()
|
660
|
-
>>> lookup_dict['ADGB-DT']
|
661
|
-
>>> lookup_by_ensembl_id = bt.Gene.lookup(field="ensembl_gene_id")
|
662
|
-
>>> genes.ensg00000002745
|
663
|
-
>>> lookup_return_symbols = bt.Gene.lookup(field="ensembl_gene_id", return_field="symbol")
|
664
|
-
"""
|
665
|
-
pass
|
666
|
-
|
667
|
-
def filter(cls, *queries, **expressions) -> QuerySet:
|
668
|
-
"""Query records.
|
669
|
-
|
670
|
-
Args:
|
671
|
-
queries: One or multiple `Q` objects.
|
672
|
-
expressions: Fields and values passed as Django query expressions.
|
673
|
-
|
674
|
-
Returns:
|
675
|
-
A :class:`~lamindb.core.QuerySet`.
|
676
|
-
|
677
|
-
See Also:
|
678
|
-
- Guide: :doc:`docs:registries`
|
679
|
-
- Django documentation: `Queries <https://docs.djangoproject.com/en/stable/topics/db/queries/>`__
|
680
|
-
|
681
|
-
Examples:
|
682
|
-
>>> ln.ULabel(name="my label").save()
|
683
|
-
>>> ln.ULabel.filter(name__startswith="my").df()
|
684
|
-
"""
|
685
|
-
pass
|
686
|
-
|
687
|
-
def get(
|
688
|
-
cls,
|
689
|
-
idlike: int | str | None = None,
|
690
|
-
**expressions,
|
691
|
-
) -> Record:
|
692
|
-
"""Get a single record.
|
693
|
-
|
694
|
-
Args:
|
695
|
-
idlike: Either a uid stub, uid or an integer id.
|
696
|
-
expressions: Fields and values passed as Django query expressions.
|
697
|
-
|
698
|
-
Returns:
|
699
|
-
A record.
|
700
|
-
|
701
|
-
Raises:
|
702
|
-
:exc:`docs:lamindb.core.exceptions.DoesNotExist`: In case no matching record is found.
|
703
|
-
|
704
|
-
See Also:
|
705
|
-
- Guide: :doc:`docs:registries`
|
706
|
-
- Django documentation: `Queries <https://docs.djangoproject.com/en/stable/topics/db/queries/>`__
|
707
|
-
|
708
|
-
Examples:
|
709
|
-
>>> ulabel = ln.ULabel.get("FvtpPJLJ")
|
710
|
-
>>> ulabel = ln.ULabel.get(name="my-label")
|
711
|
-
"""
|
712
|
-
pass
|
713
|
-
|
714
|
-
def df(
|
715
|
-
cls,
|
716
|
-
include: str | list[str] | None = None,
|
717
|
-
features: bool | list[str] = False,
|
718
|
-
limit: int = 100,
|
719
|
-
) -> pd.DataFrame:
|
720
|
-
"""Convert to `pd.DataFrame`.
|
721
|
-
|
722
|
-
By default, shows all direct fields, except `updated_at`.
|
723
|
-
|
724
|
-
Use arguments `include` or `feature` to include other data.
|
725
|
-
|
726
|
-
Args:
|
727
|
-
include: Related fields to include as columns. Takes strings of
|
728
|
-
form `"ulabels__name"`, `"cell_types__name"`, etc. or a list
|
729
|
-
of such strings.
|
730
|
-
features: If `True`, map all features of the
|
731
|
-
:class:`~lamindb.Feature` registry onto the resulting
|
732
|
-
`DataFrame`. Only available for `Artifact`.
|
733
|
-
limit: Maximum number of rows to display from a Pandas DataFrame.
|
734
|
-
Defaults to 100 to reduce database load.
|
735
|
-
|
736
|
-
Examples:
|
737
|
-
|
738
|
-
Include the name of the creator in the `DataFrame`:
|
739
|
-
|
740
|
-
>>> ln.ULabel.df(include="created_by__name"])
|
741
|
-
|
742
|
-
Include display of features for `Artifact`:
|
743
|
-
|
744
|
-
>>> df = ln.Artifact.df(features=True)
|
745
|
-
>>> ln.view(df) # visualize with type annotations
|
746
|
-
|
747
|
-
Only include select features:
|
748
|
-
|
749
|
-
>>> df = ln.Artifact.df(features=["cell_type_by_expert", "cell_type_by_model"])
|
750
|
-
"""
|
751
|
-
pass
|
752
|
-
|
753
|
-
def search(
|
754
|
-
cls,
|
755
|
-
string: str,
|
756
|
-
*,
|
757
|
-
field: StrField | None = None,
|
758
|
-
limit: int | None = 20,
|
759
|
-
case_sensitive: bool = False,
|
760
|
-
) -> QuerySet:
|
761
|
-
"""Search.
|
762
|
-
|
763
|
-
Args:
|
764
|
-
string: The input string to match against the field ontology values.
|
765
|
-
field: The field or fields to search. Search all string fields by default.
|
766
|
-
limit: Maximum amount of top results to return.
|
767
|
-
case_sensitive: Whether the match is case sensitive.
|
768
|
-
|
769
|
-
Returns:
|
770
|
-
A sorted `DataFrame` of search results with a score in column `score`.
|
771
|
-
If `return_queryset` is `True`. `QuerySet`.
|
772
|
-
|
773
|
-
See Also:
|
774
|
-
:meth:`~lamindb.core.Record.filter`
|
775
|
-
:meth:`~lamindb.core.Record.lookup`
|
776
|
-
|
777
|
-
Examples:
|
778
|
-
>>> ulabels = ln.ULabel.from_values(["ULabel1", "ULabel2", "ULabel3"], field="name")
|
779
|
-
>>> ln.save(ulabels)
|
780
|
-
>>> ln.ULabel.search("ULabel2")
|
781
|
-
"""
|
782
|
-
pass
|
783
|
-
|
784
|
-
def using(
|
785
|
-
cls,
|
786
|
-
instance: str | None,
|
787
|
-
) -> QuerySet:
|
788
|
-
"""Use a non-default LaminDB instance.
|
789
|
-
|
790
|
-
Args:
|
791
|
-
instance: An instance identifier of form "account_handle/instance_name".
|
792
|
-
|
793
|
-
Examples:
|
794
|
-
>>> ln.ULabel.using("account_handle/instance_name").search("ULabel7", field="name")
|
795
|
-
uid score
|
796
|
-
name
|
797
|
-
ULabel7 g7Hk9b2v 100.0
|
798
|
-
ULabel5 t4Jm6s0q 75.0
|
799
|
-
ULabel6 r2Xw8p1z 75.0
|
800
|
-
"""
|
801
|
-
pass
|
802
|
-
|
803
|
-
def __get_module_name__(cls) -> str:
|
804
|
-
schema_module_name = cls.__module__.split(".")[0]
|
805
|
-
module_name = schema_module_name.replace("lnschema_", "")
|
806
|
-
if module_name == "lamindb":
|
807
|
-
module_name = "core"
|
808
|
-
return module_name
|
809
|
-
|
810
|
-
@deprecated("__get_module_name__")
|
811
|
-
def __get_schema_name__(cls) -> str:
|
812
|
-
return cls.__get_module_name__()
|
813
|
-
|
814
|
-
def __get_name_with_module__(cls) -> str:
|
815
|
-
module_name = cls.__get_module_name__()
|
816
|
-
if module_name == "core":
|
817
|
-
module_prefix = ""
|
818
|
-
else:
|
819
|
-
module_prefix = f"{module_name}."
|
820
|
-
return f"{module_prefix}{cls.__name__}"
|
821
|
-
|
822
|
-
@deprecated("__get_name_with_module__")
|
823
|
-
def __get_name_with_schema__(cls) -> str:
|
824
|
-
return cls.__get_name_with_module__()
|
825
|
-
|
826
|
-
|
827
|
-
class BasicRecord(models.Model, metaclass=Registry):
|
828
|
-
"""Basic metadata record.
|
829
|
-
|
830
|
-
It has the same methods as Record, but doesn't have the additional fields.
|
831
|
-
|
832
|
-
It's mainly used for LinkORMs and similar.
|
833
|
-
"""
|
834
|
-
|
835
|
-
class Meta:
|
836
|
-
abstract = True
|
837
|
-
|
838
|
-
|
839
|
-
class Space(BasicRecord):
|
840
|
-
"""Spaces."""
|
841
|
-
|
842
|
-
id: int = models.SmallAutoField(primary_key=True)
|
843
|
-
"""Internal id, valid only in one DB instance."""
|
844
|
-
name: str = models.CharField(max_length=100, db_index=True)
|
845
|
-
"""Name of space."""
|
846
|
-
uid: str = CharField(
|
847
|
-
editable=False,
|
848
|
-
unique=True,
|
849
|
-
max_length=12,
|
850
|
-
default="00000000",
|
851
|
-
db_default="00000000",
|
852
|
-
db_index=True,
|
853
|
-
)
|
854
|
-
"""Universal id."""
|
855
|
-
description: str | None = CharField(null=True)
|
856
|
-
"""Description of space."""
|
857
|
-
created_at: datetime = DateTimeField(
|
858
|
-
editable=False, db_default=models.functions.Now(), db_index=True
|
859
|
-
)
|
860
|
-
"""Time of creation of record."""
|
861
|
-
created_by: User = ForeignKey(
|
862
|
-
"User", CASCADE, default=None, related_name="+", null=True
|
863
|
-
)
|
864
|
-
"""Creator of run."""
|
865
|
-
|
866
|
-
|
867
|
-
@doc_args(RECORD_REGISTRY_EXAMPLE)
|
868
|
-
class Record(BasicRecord, metaclass=Registry):
|
869
|
-
"""Metadata record.
|
870
|
-
|
871
|
-
Every `Record` is a data model that comes with a registry in form of a SQL
|
872
|
-
table in your database.
|
873
|
-
|
874
|
-
Sub-classing `Record` creates a new registry while instantiating a `Record`
|
875
|
-
creates a new record.
|
876
|
-
|
877
|
-
{}
|
878
|
-
|
879
|
-
`Record`'s metaclass is :class:`~lamindb.core.Registry`.
|
880
|
-
|
881
|
-
`Record` inherits from Django's `Model` class. Why does LaminDB call it `Record`
|
882
|
-
and not `Model`? The term `Record` can't lead to confusion with statistical,
|
883
|
-
machine learning or biological models.
|
884
|
-
"""
|
885
|
-
|
886
|
-
_branch_code: int = models.SmallIntegerField(db_index=True, default=1, db_default=1)
|
887
|
-
"""Whether record is on a branch, in archive or in trash.
|
888
|
-
|
889
|
-
This dictates whether a record appears in queries & searches.
|
890
|
-
|
891
|
-
Coding is as follows:
|
892
|
-
|
893
|
-
- 3: template (hidden in queries & searches)
|
894
|
-
- 2: draft (hidden in queries & searches)
|
895
|
-
- 1: default (visible in queries & searches)
|
896
|
-
- 0: archive (hidden, meant to be kept)
|
897
|
-
- -1: trash (hidden, scheduled for deletion)
|
898
|
-
|
899
|
-
Any integer higher than >3 codes a branch that's involved in a pull request.
|
900
|
-
"""
|
901
|
-
space: Space = ForeignKey(Space, PROTECT, default=1, db_default=1)
|
902
|
-
"""The space in which the record lives."""
|
903
|
-
_aux: dict[str, Any] | None = JSONField(default=None, db_default=None, null=True)
|
904
|
-
"""Auxiliary field for dictionary-like metadata."""
|
905
|
-
|
906
|
-
def save(self, *args, **kwargs) -> Record:
|
907
|
-
"""Save.
|
908
|
-
|
909
|
-
Always saves to the default database.
|
910
|
-
"""
|
911
|
-
# we need this here because we're using models also from plain
|
912
|
-
# django outside of lamindb
|
913
|
-
super().save(*args, **kwargs)
|
914
|
-
return self
|
915
|
-
|
916
|
-
def delete(self) -> None:
|
917
|
-
"""Delete."""
|
918
|
-
pass
|
919
|
-
|
920
|
-
class Meta:
|
921
|
-
abstract = True
|
922
|
-
|
923
|
-
|
924
|
-
class FeatureManager:
|
925
|
-
"""Feature manager."""
|
926
|
-
|
927
|
-
pass
|
928
|
-
|
929
|
-
|
930
|
-
class ParamManager:
|
931
|
-
"""Param manager."""
|
932
|
-
|
933
|
-
pass
|
934
|
-
|
935
|
-
|
936
|
-
class ParamManagerArtifact(ParamManager):
|
937
|
-
"""Param manager."""
|
938
|
-
|
939
|
-
pass
|
940
|
-
|
941
|
-
|
942
|
-
class ParamManagerRun(ParamManager):
|
943
|
-
"""Param manager."""
|
944
|
-
|
945
|
-
pass
|
946
|
-
|
947
|
-
|
948
|
-
# -------------------------------------------------------------------------------------
|
949
|
-
# A note on required fields at the Record level
|
950
|
-
#
|
951
|
-
# As Django does most of its validation on the Form-level, it doesn't offer functionality
|
952
|
-
# for validating the integrity of an Record object upon instantation (similar to pydantic)
|
953
|
-
#
|
954
|
-
# For required fields, we define them as commonly done on the SQL level together
|
955
|
-
# with a validator in Record (validate_required_fields)
|
956
|
-
#
|
957
|
-
# This goes against the Django convention, but goes with the SQLModel convention
|
958
|
-
# (Optional fields can be null on the SQL level, non-optional fields cannot)
|
959
|
-
#
|
960
|
-
# Due to Django's convention where CharFieldAttr has pre-configured (null=False, default=""), marking
|
961
|
-
# a required field necessitates passing `default=None`. Without the validator it would trigger
|
962
|
-
# an error at the SQL-level, with it, it triggers it at instantiation
|
963
|
-
|
964
|
-
# -------------------------------------------------------------------------------------
|
965
|
-
# A note on class and instance methods of core Record
|
966
|
-
#
|
967
|
-
# All of these are defined and tested within lamindb, in files starting with _{orm_name}.py
|
968
|
-
|
969
|
-
# -------------------------------------------------------------------------------------
|
970
|
-
# A note on maximal lengths of char fields
|
971
|
-
#
|
972
|
-
# 100 characters:
|
973
|
-
# "Raindrops pitter-pattered on the windowpane, blurring the"
|
974
|
-
# "city lights outside, curled up with a mug."
|
975
|
-
# A good maximal length for a name (title).
|
976
|
-
#
|
977
|
-
# 150 characters: We choose this for name maximal length because some users like long names.
|
978
|
-
#
|
979
|
-
# 255 characters:
|
980
|
-
# "In creating a precise 255-character paragraph, one engages in"
|
981
|
-
# "a dance of words, where clarity meets brevity. Every syllable counts,"
|
982
|
-
# "illustrating the skill in compact expression, ensuring the essence of the"
|
983
|
-
# "message shines through within the exacting limit."
|
984
|
-
# This is a good maximal length for a description field.
|
985
|
-
|
986
|
-
|
987
|
-
class User(BasicRecord, CanCurate):
|
988
|
-
"""Users.
|
989
|
-
|
990
|
-
All data in this registry is synched from `lamin.ai` to ensure a universal
|
991
|
-
user identity. There is no need to manually create records.
|
992
|
-
|
993
|
-
Examples:
|
994
|
-
|
995
|
-
Query a user by handle:
|
996
|
-
|
997
|
-
>>> user = ln.User.get(handle="testuser1")
|
998
|
-
>>> user
|
999
|
-
"""
|
1000
|
-
|
1001
|
-
_name_field: str = "handle"
|
1002
|
-
|
1003
|
-
id: int = models.AutoField(primary_key=True)
|
1004
|
-
"""Internal id, valid only in one DB instance."""
|
1005
|
-
uid: str = CharField(editable=False, unique=True, db_index=True, max_length=8)
|
1006
|
-
"""Universal id, valid across DB instances."""
|
1007
|
-
handle: str = CharField(max_length=30, unique=True, db_index=True)
|
1008
|
-
"""Universal handle, valid across DB instances (required)."""
|
1009
|
-
name: str | None = CharField(max_length=150, db_index=True, null=True)
|
1010
|
-
"""Name (optional).""" # has to match hub specification, where it's also optional
|
1011
|
-
created_artifacts: Artifact
|
1012
|
-
"""Artifacts created by user."""
|
1013
|
-
created_transforms: Transform
|
1014
|
-
"""Transforms created by user."""
|
1015
|
-
created_runs: Run
|
1016
|
-
"""Runs created by user."""
|
1017
|
-
created_at: datetime = DateTimeField(
|
1018
|
-
editable=False, db_default=models.functions.Now(), db_index=True
|
1019
|
-
)
|
1020
|
-
"""Time of creation of record."""
|
1021
|
-
updated_at: datetime = DateTimeField(
|
1022
|
-
editable=False, db_default=models.functions.Now(), db_index=True
|
1023
|
-
)
|
1024
|
-
"""Time of last update to record."""
|
1025
|
-
|
1026
|
-
@overload
|
1027
|
-
def __init__(
|
1028
|
-
self,
|
1029
|
-
handle: str,
|
1030
|
-
email: str,
|
1031
|
-
name: str | None,
|
1032
|
-
): ...
|
1033
|
-
|
1034
|
-
@overload
|
1035
|
-
def __init__(
|
1036
|
-
self,
|
1037
|
-
*db_args,
|
1038
|
-
): ...
|
1039
|
-
|
1040
|
-
def __init__(
|
1041
|
-
self,
|
1042
|
-
*args,
|
1043
|
-
**kwargs,
|
1044
|
-
):
|
1045
|
-
super().__init__(*args, **kwargs)
|
1046
|
-
|
1047
|
-
|
1048
|
-
class Storage(Record, TracksRun, TracksUpdates):
|
1049
|
-
"""Storage locations.
|
1050
|
-
|
1051
|
-
A storage location is either a directory/folder (local or in the cloud) or
|
1052
|
-
an entire S3/GCP bucket.
|
1053
|
-
|
1054
|
-
A LaminDB instance can manage and link multiple storage locations. But any
|
1055
|
-
storage location is managed by *at most one* LaminDB instance.
|
1056
|
-
|
1057
|
-
.. dropdown:: Managed vs. linked storage locations
|
1058
|
-
|
1059
|
-
The LaminDB instance can update & delete artifacts in managed storage
|
1060
|
-
locations but merely read artifacts in linked storage locations.
|
1061
|
-
|
1062
|
-
When you transfer artifacts from another instance, the default is to
|
1063
|
-
only copy metadata into the target instance, but merely link the data.
|
1064
|
-
|
1065
|
-
The `instance_uid` field indicates the managing LaminDB instance of a
|
1066
|
-
storage location.
|
1067
|
-
|
1068
|
-
When you delete a LaminDB instance, you'll be warned about data in managed
|
1069
|
-
storage locations while data in linked storage locations is ignored.
|
1070
|
-
|
1071
|
-
See Also:
|
1072
|
-
:attr:`~lamindb.core.Settings.storage`
|
1073
|
-
Default storage.
|
1074
|
-
:attr:`~lamindb.setup.core.StorageSettings`
|
1075
|
-
Storage settings.
|
1076
|
-
|
1077
|
-
Examples:
|
1078
|
-
|
1079
|
-
Configure the default storage location upon initiation of a LaminDB instance::
|
1080
|
-
|
1081
|
-
lamin init --storage ./mydata # or "s3://my-bucket" or "gs://my-bucket"
|
1082
|
-
|
1083
|
-
View the default storage location:
|
1084
|
-
|
1085
|
-
>>> ln.settings.storage
|
1086
|
-
PosixPath('/home/runner/work/lamindb/lamindb/docs/guide/mydata')
|
1087
|
-
|
1088
|
-
Dynamically change the default storage:
|
1089
|
-
|
1090
|
-
>>> ln.settings.storage = "./storage_2" # or a cloud bucket
|
1091
|
-
"""
|
1092
|
-
|
1093
|
-
class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta):
|
1094
|
-
abstract = False
|
1095
|
-
|
1096
|
-
_name_field: str = "root"
|
1097
|
-
|
1098
|
-
id: int = models.AutoField(primary_key=True)
|
1099
|
-
"""Internal id, valid only in one DB instance."""
|
1100
|
-
uid: str = CharField(
|
1101
|
-
editable=False, unique=True, max_length=12, default=base62_12, db_index=True
|
1102
|
-
)
|
1103
|
-
"""Universal id, valid across DB instances."""
|
1104
|
-
# we are very conservative here with 255 characters
|
1105
|
-
root: str = CharField(db_index=True, unique=True)
|
1106
|
-
"""Root path of storage. n s3 path. local path, etc. (required)."""
|
1107
|
-
description: str | None = CharField(db_index=True, null=True)
|
1108
|
-
"""A description of what the storage location is used for (optional)."""
|
1109
|
-
type: str = CharField(max_length=30, db_index=True)
|
1110
|
-
"""Can be "local" vs. "s3" vs. "gs"."""
|
1111
|
-
region: str | None = CharField(max_length=64, db_index=True, null=True)
|
1112
|
-
"""Cloud storage region, if applicable."""
|
1113
|
-
instance_uid: str | None = CharField(max_length=12, db_index=True, null=True)
|
1114
|
-
"""Instance that manages this storage location."""
|
1115
|
-
artifacts: Artifact
|
1116
|
-
"""Artifacts contained in this storage location."""
|
1117
|
-
|
1118
|
-
@overload
|
1119
|
-
def __init__(
|
1120
|
-
self,
|
1121
|
-
root: str,
|
1122
|
-
type: str,
|
1123
|
-
region: str | None,
|
1124
|
-
): ...
|
1125
|
-
|
1126
|
-
@overload
|
1127
|
-
def __init__(
|
1128
|
-
self,
|
1129
|
-
*db_args,
|
1130
|
-
): ...
|
1131
|
-
|
1132
|
-
def __init__(
|
1133
|
-
self,
|
1134
|
-
*args,
|
1135
|
-
**kwargs,
|
1136
|
-
):
|
1137
|
-
super().__init__(*args, **kwargs)
|
1138
|
-
|
1139
|
-
@property
|
1140
|
-
def path(self) -> Path | UPath:
|
1141
|
-
"""Bucket or folder path.
|
1142
|
-
|
1143
|
-
Cloud storage bucket:
|
1144
|
-
|
1145
|
-
>>> ln.Storage("s3://my-bucket").save()
|
1146
|
-
|
1147
|
-
Directory/folder in cloud storage:
|
1148
|
-
|
1149
|
-
>>> ln.Storage("s3://my-bucket/my-directory").save()
|
1150
|
-
|
1151
|
-
Local directory/folder:
|
1152
|
-
|
1153
|
-
>>> ln.Storage("./my-directory").save()
|
1154
|
-
"""
|
1155
|
-
pass
|
1156
|
-
|
1157
|
-
|
1158
|
-
# does not inherit from TracksRun because the Transform
|
1159
|
-
# is needed to define a run
|
1160
|
-
class Transform(Record, IsVersioned):
|
1161
|
-
"""Data transformations.
|
1162
|
-
|
1163
|
-
A "transform" can refer to a Python function, a script, a notebook, or a
|
1164
|
-
pipeline. If you execute a transform, you generate a run
|
1165
|
-
(:class:`~lamindb.Run`). A run has inputs and outputs.
|
1166
|
-
|
1167
|
-
A pipeline is typically created with a workflow tool (Nextflow, Snakemake,
|
1168
|
-
Prefect, Flyte, MetaFlow, redun, Airflow, ...) and stored in a versioned
|
1169
|
-
repository.
|
1170
|
-
|
1171
|
-
Transforms are versioned so that a given transform version maps on a given
|
1172
|
-
source code version.
|
1173
|
-
|
1174
|
-
.. dropdown:: Can I sync transforms to git?
|
1175
|
-
|
1176
|
-
If you switch on
|
1177
|
-
:attr:`~lamindb.core.Settings.sync_git_repo` a script-like transform is
|
1178
|
-
synched to its hashed state in a git repository upon calling `ln.track()`.
|
1179
|
-
|
1180
|
-
>>> ln.settings.sync_git_repo = "https://github.com/laminlabs/lamindb"
|
1181
|
-
>>> ln.track()
|
1182
|
-
|
1183
|
-
The definition of transforms and runs is consistent the OpenLineage
|
1184
|
-
specification where a :class:`~lamindb.Transform` record would be called a
|
1185
|
-
"job" and a :class:`~lamindb.Run` record a "run".
|
1186
|
-
|
1187
|
-
Args:
|
1188
|
-
name: `str` A name or title.
|
1189
|
-
key: `str | None = None` A short name or path-like semantic key.
|
1190
|
-
type: `TransformType | None = "pipeline"` See :class:`~lamindb.base.types.TransformType`.
|
1191
|
-
revises: `Transform | None = None` An old version of the transform.
|
1192
|
-
|
1193
|
-
See Also:
|
1194
|
-
:meth:`~lamindb.core.Context.track`
|
1195
|
-
Globally track a script, notebook or pipeline run.
|
1196
|
-
:class:`~lamindb.Run`
|
1197
|
-
Executions of transforms.
|
1198
|
-
|
1199
|
-
Notes:
|
1200
|
-
- :doc:`docs:track`
|
1201
|
-
- :doc:`docs:data-flow`
|
1202
|
-
- :doc:`docs:redun`
|
1203
|
-
- :doc:`docs:nextflow`
|
1204
|
-
- :doc:`docs:snakemake`
|
1205
|
-
|
1206
|
-
Examples:
|
1207
|
-
|
1208
|
-
Create a transform for a pipeline:
|
1209
|
-
|
1210
|
-
>>> transform = ln.Transform(key="Cell Ranger", version="7.2.0", type="pipeline").save()
|
1211
|
-
|
1212
|
-
Create a transform from a notebook:
|
1213
|
-
|
1214
|
-
>>> ln.track()
|
1215
|
-
|
1216
|
-
View predecessors of a transform:
|
1217
|
-
|
1218
|
-
>>> transform.view_lineage()
|
1219
|
-
"""
|
1220
|
-
|
1221
|
-
class Meta(Record.Meta, IsVersioned.Meta):
|
1222
|
-
abstract = False
|
1223
|
-
|
1224
|
-
_len_stem_uid: int = 12
|
1225
|
-
_len_full_uid: int = 16
|
1226
|
-
_name_field: str = "key"
|
1227
|
-
|
1228
|
-
id: int = models.AutoField(primary_key=True)
|
1229
|
-
"""Internal id, valid only in one DB instance."""
|
1230
|
-
uid: str = CharField(
|
1231
|
-
editable=False, unique=True, db_index=True, max_length=_len_full_uid
|
1232
|
-
)
|
1233
|
-
"""Universal id."""
|
1234
|
-
key: str | None = CharField(db_index=True, null=True)
|
1235
|
-
"""A name or "/"-separated path-like string.
|
1236
|
-
|
1237
|
-
All transforms with the same key are part of the same version family.
|
1238
|
-
"""
|
1239
|
-
description: str | None = CharField(db_index=True, null=True)
|
1240
|
-
"""A description."""
|
1241
|
-
type: TransformType = CharField(
|
1242
|
-
max_length=20,
|
1243
|
-
db_index=True,
|
1244
|
-
default="pipeline",
|
1245
|
-
)
|
1246
|
-
""":class:`~lamindb.base.types.TransformType` (default `"pipeline"`)."""
|
1247
|
-
source_code: str | None = TextField(null=True)
|
1248
|
-
"""Source code of the transform.
|
1249
|
-
|
1250
|
-
.. versionchanged:: 0.75
|
1251
|
-
The `source_code` field is no longer an artifact, but a text field.
|
1252
|
-
"""
|
1253
|
-
# we have a unique constraint here but not on artifact because on artifact, we haven't yet
|
1254
|
-
# settled how we model the same artifact in different storage locations
|
1255
|
-
hash: str | None = CharField(
|
1256
|
-
max_length=HASH_LENGTH, db_index=True, null=True, unique=True
|
1257
|
-
)
|
1258
|
-
"""Hash of the source code."""
|
1259
|
-
reference: str | None = CharField(max_length=255, db_index=True, null=True)
|
1260
|
-
"""Reference for the transform, e.g., a URL."""
|
1261
|
-
reference_type: str | None = CharField(max_length=25, db_index=True, null=True)
|
1262
|
-
"""Reference type of the transform, e.g., 'url'."""
|
1263
|
-
runs: Run
|
1264
|
-
"""Runs of this transform."""
|
1265
|
-
ulabels: ULabel = models.ManyToManyField(
|
1266
|
-
"ULabel", through="TransformULabel", related_name="transforms"
|
1267
|
-
)
|
1268
|
-
"""ULabel annotations of this transform."""
|
1269
|
-
predecessors: Transform = models.ManyToManyField(
|
1270
|
-
"self", symmetrical=False, related_name="successors"
|
1271
|
-
)
|
1272
|
-
"""Preceding transforms.
|
1273
|
-
|
1274
|
-
These are auto-populated whenever an artifact or collection serves as a run
|
1275
|
-
input, e.g., `artifact.run` and `artifact.transform` get populated & saved.
|
1276
|
-
|
1277
|
-
The table provides a more convenient method to query for the predecessors that
|
1278
|
-
bypasses querying the :class:`~lamindb.Run`.
|
1279
|
-
|
1280
|
-
It also allows to manually add predecessors whose outputs are not tracked in a run.
|
1281
|
-
"""
|
1282
|
-
successors: Transform
|
1283
|
-
"""Subsequent transforms.
|
1284
|
-
|
1285
|
-
See :attr:`~lamindb.Transform.predecessors`.
|
1286
|
-
"""
|
1287
|
-
output_artifacts: Artifact
|
1288
|
-
"""The artifacts generated by all runs of this transform.
|
1289
|
-
|
1290
|
-
If you're looking for the outputs of a single run, see :attr:`lamindb.Run.output_artifacts`.
|
1291
|
-
"""
|
1292
|
-
output_collections: Collection
|
1293
|
-
"""The collections generated by all runs of this transform.
|
1294
|
-
|
1295
|
-
If you're looking for the outputs of a single run, see :attr:`lamindb.Run.output_collections`.
|
1296
|
-
"""
|
1297
|
-
projects: Project
|
1298
|
-
"""Associated projects."""
|
1299
|
-
references: Reference
|
1300
|
-
"""Associated references."""
|
1301
|
-
created_at: datetime = DateTimeField(
|
1302
|
-
editable=False, db_default=models.functions.Now(), db_index=True
|
1303
|
-
)
|
1304
|
-
"""Time of creation of record."""
|
1305
|
-
updated_at: datetime = DateTimeField(
|
1306
|
-
editable=False, db_default=models.functions.Now(), db_index=True
|
1307
|
-
)
|
1308
|
-
"""Time of last update to record."""
|
1309
|
-
created_by: User = ForeignKey(
|
1310
|
-
User, PROTECT, default=current_user_id, related_name="created_transforms"
|
1311
|
-
)
|
1312
|
-
"""Creator of record."""
|
1313
|
-
_template: Transform | None = ForeignKey(
|
1314
|
-
"Transform", PROTECT, related_name="_derived_from", default=None, null=True
|
1315
|
-
)
|
1316
|
-
"""Creating template."""
|
1317
|
-
|
1318
|
-
@overload
|
1319
|
-
def __init__(
|
1320
|
-
self,
|
1321
|
-
name: str,
|
1322
|
-
key: str | None = None,
|
1323
|
-
type: TransformType | None = None,
|
1324
|
-
revises: Transform | None = None,
|
1325
|
-
): ...
|
1326
|
-
|
1327
|
-
@overload
|
1328
|
-
def __init__(
|
1329
|
-
self,
|
1330
|
-
*db_args,
|
1331
|
-
): ...
|
1332
|
-
|
1333
|
-
def __init__(
|
1334
|
-
self,
|
1335
|
-
*args,
|
1336
|
-
**kwargs,
|
1337
|
-
):
|
1338
|
-
super().__init__(*args, **kwargs)
|
1339
|
-
|
1340
|
-
@property
|
1341
|
-
def name(self) -> str:
|
1342
|
-
"""Name of the transform.
|
1343
|
-
|
1344
|
-
Splits `key` on `/` and returns the last element.
|
1345
|
-
"""
|
1346
|
-
return self.key.split("/")[-1]
|
1347
|
-
|
1348
|
-
@property
|
1349
|
-
def latest_run(self) -> Run:
|
1350
|
-
"""The latest run of this transform."""
|
1351
|
-
pass
|
1352
|
-
|
1353
|
-
def view_lineage(self) -> None:
|
1354
|
-
"""View lineage of transforms."""
|
1355
|
-
pass
|
1356
|
-
|
1357
|
-
|
1358
|
-
class Param(Record, CanCurate, TracksRun, TracksUpdates):
|
1359
|
-
"""Parameters of runs & models."""
|
1360
|
-
|
1361
|
-
class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta):
|
1362
|
-
abstract = False
|
1363
|
-
|
1364
|
-
_name_field: str = "name"
|
1365
|
-
|
1366
|
-
name: str = CharField(max_length=100, db_index=True)
|
1367
|
-
dtype: str | None = CharField(db_index=True, null=True)
|
1368
|
-
"""Data type ("num", "cat", "int", "float", "bool", "datetime").
|
1369
|
-
|
1370
|
-
For categorical types, can define from which registry values are
|
1371
|
-
sampled, e.g., `cat[ULabel]` or `cat[bionty.CellType]`.
|
1372
|
-
"""
|
1373
|
-
type: Param | None = ForeignKey("self", PROTECT, null=True, related_name="records")
|
1374
|
-
"""Type of param (e.g., 'Pipeline', 'ModelTraining', 'PostProcessing').
|
1375
|
-
|
1376
|
-
Allows to group features by type, e.g., all read outs, all metrics, etc.
|
1377
|
-
"""
|
1378
|
-
records: Param
|
1379
|
-
"""Records of this type."""
|
1380
|
-
is_type: bool = BooleanField(default=False, db_index=True, null=True)
|
1381
|
-
"""Distinguish types from instances of the type."""
|
1382
|
-
_expect_many: bool = models.BooleanField(default=False, db_default=False)
|
1383
|
-
"""Indicates whether values for this param are expected to occur a single or multiple times for an artifact/run (default `False`).
|
1384
|
-
|
1385
|
-
- if it's `False` (default), the values mean artifact/run-level values and a dtype of `datetime` means `datetime`
|
1386
|
-
- if it's `True`, the values are from an aggregation, which this seems like an edge case but when characterizing a model ensemble trained with different parameters it could be relevant
|
1387
|
-
"""
|
1388
|
-
schemas: Schema = models.ManyToManyField(
|
1389
|
-
"Schema", through="SchemaParam", related_name="params"
|
1390
|
-
)
|
1391
|
-
"""Feature sets linked to this feature."""
|
1392
|
-
# backward fields
|
1393
|
-
values: ParamValue
|
1394
|
-
"""Values for this parameter."""
|
1395
|
-
|
1396
|
-
def __init__(self, *args, **kwargs):
|
1397
|
-
from ._feature import process_init_feature_param
|
1398
|
-
from .errors import ValidationError
|
1399
|
-
|
1400
|
-
if len(args) == len(self._meta.concrete_fields):
|
1401
|
-
super().__init__(*args, **kwargs)
|
1402
|
-
return None
|
1403
|
-
|
1404
|
-
dtype = kwargs.get("dtype", None)
|
1405
|
-
kwargs = process_init_feature_param(args, kwargs, is_param=True)
|
1406
|
-
super().__init__(*args, **kwargs)
|
1407
|
-
dtype_str = kwargs.pop("dtype", None)
|
1408
|
-
if not self._state.adding:
|
1409
|
-
if not (
|
1410
|
-
self.dtype.startswith("cat")
|
1411
|
-
if dtype == "cat"
|
1412
|
-
else self.dtype == dtype_str
|
1413
|
-
):
|
1414
|
-
raise ValidationError(
|
1415
|
-
f"Feature {self.name} already exists with dtype {self.dtype}, you passed {dtype_str}"
|
1416
|
-
)
|
1417
|
-
|
1418
|
-
|
1419
|
-
# FeatureValue behaves in many ways like a link in a LinkORM
|
1420
|
-
# in particular, we don't want a _public field on it
|
1421
|
-
# Also, we don't inherit from TracksRun because a ParamValue
|
1422
|
-
# is typically created before a run is created and we want to
|
1423
|
-
# avoid delete cycles (for Model params though it might be helpful)
|
1424
|
-
class ParamValue(Record):
|
1425
|
-
"""Parameter values.
|
1426
|
-
|
1427
|
-
Is largely analogous to `FeatureValue`.
|
1428
|
-
"""
|
1429
|
-
|
1430
|
-
# we do not have a unique constraint on param & value because it leads to hashing errors
|
1431
|
-
# for large dictionaries: https://lamin.ai/laminlabs/lamindata/transform/jgTrkoeuxAfs0000
|
1432
|
-
# we do not hash values because we have `get_or_create` logic all over the place
|
1433
|
-
# and also for checking whether the (param, value) combination exists
|
1434
|
-
# there does not seem an issue with querying for a dict-like value
|
1435
|
-
# https://lamin.ai/laminlabs/lamindata/transform/jgTrkoeuxAfs0001
|
1436
|
-
_name_field: str = "value"
|
1437
|
-
|
1438
|
-
param: Param = ForeignKey(Param, CASCADE, related_name="values")
|
1439
|
-
"""The dimension metadata."""
|
1440
|
-
value: Any = (
|
1441
|
-
models.JSONField()
|
1442
|
-
) # stores float, integer, boolean, datetime or dictionaries
|
1443
|
-
"""The JSON-like value."""
|
1444
|
-
# it'd be confusing and hard to populate a run here because these
|
1445
|
-
# values are typically created upon creating a run
|
1446
|
-
# hence, ParamValue does _not_ inherit from TracksRun but manually
|
1447
|
-
# adds created_at & created_by
|
1448
|
-
# because ParamValue cannot be updated, we don't need updated_at
|
1449
|
-
created_at: datetime = DateTimeField(
|
1450
|
-
editable=False, db_default=models.functions.Now(), db_index=True
|
1451
|
-
)
|
1452
|
-
"""Time of creation of record."""
|
1453
|
-
created_by: User = ForeignKey(
|
1454
|
-
User, PROTECT, default=current_user_id, related_name="+"
|
1455
|
-
)
|
1456
|
-
"""Creator of record."""
|
1457
|
-
hash: str = CharField(max_length=HASH_LENGTH, null=True, db_index=True)
|
1458
|
-
|
1459
|
-
class Meta:
|
1460
|
-
constraints = [
|
1461
|
-
# For simple types, use direct value comparison
|
1462
|
-
models.UniqueConstraint(
|
1463
|
-
fields=["param", "value"],
|
1464
|
-
name="unique_simple_param_value",
|
1465
|
-
condition=Q(hash__isnull=True),
|
1466
|
-
),
|
1467
|
-
# For complex types (dictionaries), use hash
|
1468
|
-
models.UniqueConstraint(
|
1469
|
-
fields=["param", "hash"],
|
1470
|
-
name="unique_complex_param_value",
|
1471
|
-
condition=Q(hash__isnull=False),
|
1472
|
-
),
|
1473
|
-
]
|
1474
|
-
|
1475
|
-
@classmethod
|
1476
|
-
def get_or_create(cls, param, value):
|
1477
|
-
# Simple types: int, float, str, bool
|
1478
|
-
if isinstance(value, (int, float, str, bool)):
|
1479
|
-
try:
|
1480
|
-
return cls.objects.create(param=param, value=value, hash=None), False
|
1481
|
-
except IntegrityError:
|
1482
|
-
return cls.objects.get(param=param, value=value), True
|
1483
|
-
|
1484
|
-
# Complex types: dict, list
|
1485
|
-
else:
|
1486
|
-
hash = hash_dict(value)
|
1487
|
-
try:
|
1488
|
-
return cls.objects.create(param=param, value=value, hash=hash), False
|
1489
|
-
except IntegrityError:
|
1490
|
-
return cls.objects.get(param=param, hash=hash), True
|
1491
|
-
|
1492
|
-
|
1493
|
-
class Run(Record):
|
1494
|
-
"""Runs of transforms.
|
1495
|
-
|
1496
|
-
Args:
|
1497
|
-
transform: `Transform` A :class:`~lamindb.Transform` record.
|
1498
|
-
reference: `str | None = None` For instance, an external ID or a download URL.
|
1499
|
-
reference_type: `str | None = None` For instance, `redun_id`, `nextflow_id` or `url`.
|
1500
|
-
|
1501
|
-
See Also:
|
1502
|
-
:meth:`~lamindb.core.Context.track`
|
1503
|
-
Track global run & transform records for a notebook or pipeline.
|
1504
|
-
|
1505
|
-
Examples:
|
1506
|
-
|
1507
|
-
Create a run record:
|
1508
|
-
|
1509
|
-
>>> ln.Transform(key="Cell Ranger", version="7.2.0", type="pipeline").save()
|
1510
|
-
>>> transform = ln.Transform.get(key="Cell Ranger", version="7.2.0")
|
1511
|
-
>>> run = ln.Run(transform)
|
1512
|
-
|
1513
|
-
Create a global run context for a custom transform:
|
1514
|
-
|
1515
|
-
>>> ln.track(transform=transform)
|
1516
|
-
>>> ln.context.run # globally available run
|
1517
|
-
|
1518
|
-
Track a global run context for a notebook or script:
|
1519
|
-
|
1520
|
-
>>> ln.track() # Jupyter notebook metadata is automatically parsed
|
1521
|
-
>>> ln.context.run
|
1522
|
-
"""
|
1523
|
-
|
1524
|
-
_name_field: str = "started_at"
|
1525
|
-
|
1526
|
-
params: ParamManager = ParamManagerRun # type: ignore
|
1527
|
-
"""Param manager.
|
1528
|
-
|
1529
|
-
Guide: :ref:`track-run-parameters`
|
1530
|
-
|
1531
|
-
Example::
|
1532
|
-
|
1533
|
-
run.params.add_values({
|
1534
|
-
"learning_rate": 0.01,
|
1535
|
-
"input_dir": "s3://my-bucket/mydataset",
|
1536
|
-
"downsample": True,
|
1537
|
-
"preprocess_params": {
|
1538
|
-
"normalization_type": "cool",
|
1539
|
-
"subset_highlyvariable": True,
|
1540
|
-
},
|
1541
|
-
})
|
1542
|
-
"""
|
1543
|
-
|
1544
|
-
id: int = models.BigAutoField(primary_key=True)
|
1545
|
-
"""Internal id, valid only in one DB instance."""
|
1546
|
-
uid: str = CharField(
|
1547
|
-
editable=False, unique=True, db_index=True, max_length=20, default=base62_20
|
1548
|
-
)
|
1549
|
-
"""Universal id, valid across DB instances."""
|
1550
|
-
name: str | None = CharField(max_length=150, null=True)
|
1551
|
-
"""A name."""
|
1552
|
-
transform = ForeignKey(Transform, CASCADE, related_name="runs")
|
1553
|
-
"""The transform :class:`~lamindb.Transform` that is being run."""
|
1554
|
-
started_at: datetime = DateTimeField(
|
1555
|
-
editable=False, db_default=models.functions.Now(), db_index=True
|
1556
|
-
)
|
1557
|
-
"""Start time of run."""
|
1558
|
-
finished_at: datetime | None = DateTimeField(db_index=True, null=True, default=None)
|
1559
|
-
"""Finished time of run."""
|
1560
|
-
# we don't want to make below a OneToOne because there could be the same trivial report
|
1561
|
-
# generated for many different runs
|
1562
|
-
report: Artifact | None = ForeignKey(
|
1563
|
-
"Artifact", PROTECT, null=True, related_name="_report_of", default=None
|
1564
|
-
)
|
1565
|
-
"""Report of run, e.g.. n html file."""
|
1566
|
-
_logfile: Artifact | None = ForeignKey(
|
1567
|
-
"Artifact", PROTECT, null=True, related_name="_logfile_of", default=None
|
1568
|
-
)
|
1569
|
-
"""Report of run, e.g.. n html file."""
|
1570
|
-
environment: Artifact | None = ForeignKey(
|
1571
|
-
"Artifact", PROTECT, null=True, related_name="_environment_of", default=None
|
1572
|
-
)
|
1573
|
-
"""Computational environment for the run.
|
1574
|
-
|
1575
|
-
For instance, `Dockerfile`, `docker image`, `requirements.txt`, `environment.yml`, etc.
|
1576
|
-
"""
|
1577
|
-
input_artifacts: Artifact
|
1578
|
-
"""The artifacts serving as input for this run.
|
1579
|
-
|
1580
|
-
Related accessor: :attr:`~lamindb.Artifact.input_of_runs`.
|
1581
|
-
"""
|
1582
|
-
output_artifacts: Artifact
|
1583
|
-
"""The artifacts generated by this run.
|
1584
|
-
|
1585
|
-
Related accessor: via :attr:`~lamindb.Artifact.run`
|
1586
|
-
"""
|
1587
|
-
input_collections: Collection
|
1588
|
-
"""The collections serving as input for this run."""
|
1589
|
-
output_collections: Collection
|
1590
|
-
"""The collections generated by this run."""
|
1591
|
-
_param_values: ParamValue = models.ManyToManyField(
|
1592
|
-
ParamValue, through="RunParamValue", related_name="runs"
|
1593
|
-
)
|
1594
|
-
"""Parameter values."""
|
1595
|
-
reference: str | None = CharField(max_length=255, db_index=True, null=True)
|
1596
|
-
"""A reference like a URL or external ID (such as from a workflow manager)."""
|
1597
|
-
reference_type: str | None = CharField(max_length=25, db_index=True, null=True)
|
1598
|
-
"""Type of reference such as a workflow manager execution ID."""
|
1599
|
-
created_at: datetime = DateTimeField(
|
1600
|
-
editable=False, db_default=models.functions.Now(), db_index=True
|
1601
|
-
)
|
1602
|
-
"""Time of first creation. Mismatches ``started_at`` if the run is re-run."""
|
1603
|
-
created_by: User = ForeignKey(
|
1604
|
-
User, CASCADE, default=current_user_id, related_name="created_runs"
|
1605
|
-
)
|
1606
|
-
"""Creator of run."""
|
1607
|
-
ulabels: ULabel = models.ManyToManyField(
|
1608
|
-
"ULabel", through="RunULabel", related_name="runs"
|
1609
|
-
)
|
1610
|
-
"""ULabel annotations of this transform."""
|
1611
|
-
initiated_by_run: Run | None = ForeignKey(
|
1612
|
-
"Run", CASCADE, null=True, related_name="initiated_runs", default=None
|
1613
|
-
)
|
1614
|
-
"""The run that triggered the current run.
|
1615
|
-
|
1616
|
-
This is not a preceding run. The preceding runs ("predecessors") is the set
|
1617
|
-
of runs that produced the output artifacts that serve as the inputs for the
|
1618
|
-
present run.
|
1619
|
-
|
1620
|
-
Be careful with using this field at this point.
|
1621
|
-
"""
|
1622
|
-
initiated_runs: Run
|
1623
|
-
"""Runs that were initiated by this run."""
|
1624
|
-
_is_consecutive: bool | None = BooleanField(null=True)
|
1625
|
-
"""Indicates whether code was consecutively executed. Is relevant for notebooks."""
|
1626
|
-
_status_code: int = models.SmallIntegerField(default=0, db_index=True)
|
1627
|
-
"""Status code of the run.
|
1628
|
-
|
1629
|
-
- 0: scheduled
|
1630
|
-
- 1: started
|
1631
|
-
- 2: errored
|
1632
|
-
- 3: aborted
|
1633
|
-
- 4: completed
|
1634
|
-
"""
|
1635
|
-
|
1636
|
-
@overload
|
1637
|
-
def __init__(
|
1638
|
-
self,
|
1639
|
-
transform: Transform,
|
1640
|
-
reference: str | None = None,
|
1641
|
-
reference_type: str | None = None,
|
1642
|
-
): ...
|
1643
|
-
|
1644
|
-
@overload
|
1645
|
-
def __init__(
|
1646
|
-
self,
|
1647
|
-
*db_args,
|
1648
|
-
): ...
|
1649
|
-
|
1650
|
-
def __init__(
|
1651
|
-
self,
|
1652
|
-
*args,
|
1653
|
-
**kwargs,
|
1654
|
-
):
|
1655
|
-
super().__init__(*args, **kwargs)
|
1656
|
-
|
1657
|
-
|
1658
|
-
class ULabel(Record, HasParents, CanCurate, TracksRun, TracksUpdates):
|
1659
|
-
"""Universal labels.
|
1660
|
-
|
1661
|
-
Args:
|
1662
|
-
name: `str` A name.
|
1663
|
-
description: `str` A description.
|
1664
|
-
reference: `str | None = None` For instance, an external ID or a URL.
|
1665
|
-
reference_type: `str | None = None` For instance, `"url"`.
|
1666
|
-
|
1667
|
-
A `ULabel` record provides the easiest way to annotate a dataset
|
1668
|
-
with a label: `"My project"`, `"curated"`, or `"Batch X"`:
|
1669
|
-
|
1670
|
-
>>> my_project = ULabel(name="My project")
|
1671
|
-
>>> my_project.save()
|
1672
|
-
>>> artifact.ulabels.add(my_project)
|
1673
|
-
|
1674
|
-
Often, a ulabel is measured *within* a dataset. For instance, an artifact
|
1675
|
-
might characterize 2 species of the Iris flower (`"setosa"` &
|
1676
|
-
`"versicolor"`) measured by a `"species"` feature. Use the
|
1677
|
-
:class:`~lamindb.Curator` flow to automatically parse, validate, and
|
1678
|
-
annotate with labels that are contained in `DataFrame` or `AnnData`
|
1679
|
-
artifacts.
|
1680
|
-
|
1681
|
-
.. note::
|
1682
|
-
|
1683
|
-
If you work with complex entities like cell lines, cell types, tissues,
|
1684
|
-
etc., consider using the pre-defined biological registries in
|
1685
|
-
:mod:`bionty` to label artifacts & collections.
|
1686
|
-
|
1687
|
-
If you work with biological samples, likely, the only sustainable way of
|
1688
|
-
tracking metadata, is to create a custom schema module.
|
1689
|
-
|
1690
|
-
See Also:
|
1691
|
-
:meth:`~lamindb.Feature`
|
1692
|
-
Dimensions of measurement for artifacts & collections.
|
1693
|
-
:attr:`~lamindb.Artifact.features`
|
1694
|
-
Feature manager for an artifact.
|
1695
|
-
|
1696
|
-
Examples:
|
1697
|
-
|
1698
|
-
Create a new label:
|
1699
|
-
|
1700
|
-
>>> train_split = ln.ULabel(name="train").save()
|
1701
|
-
|
1702
|
-
Organize labels in a hierarchy:
|
1703
|
-
|
1704
|
-
>>> split_type = ln.ULabel(name="Split", is_type=True).save()
|
1705
|
-
>>> train_split = ln.ULabel(name="train", type="split_type").save()
|
1706
|
-
|
1707
|
-
Label an artifact:
|
1708
|
-
|
1709
|
-
>>> artifact.ulabels.add(ulabel)
|
1710
|
-
|
1711
|
-
Query by `ULabel`:
|
1712
|
-
|
1713
|
-
>>> ln.Artifact.filter(ulabels=train_split)
|
1714
|
-
"""
|
1715
|
-
|
1716
|
-
class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta):
|
1717
|
-
abstract = False
|
1718
|
-
|
1719
|
-
_name_field: str = "name"
|
1720
|
-
|
1721
|
-
id: int = models.AutoField(primary_key=True)
|
1722
|
-
"""Internal id, valid only in one DB instance."""
|
1723
|
-
uid: str = CharField(
|
1724
|
-
editable=False, unique=True, db_index=True, max_length=8, default=base62_8
|
1725
|
-
)
|
1726
|
-
"""A universal random id, valid across DB instances."""
|
1727
|
-
name: str = CharField(max_length=150, db_index=True)
|
1728
|
-
"""Name or title of ulabel."""
|
1729
|
-
type: ULabel | None = ForeignKey("self", PROTECT, null=True, related_name="records")
|
1730
|
-
"""Type of ulabel, e.g., `"donor"`, `"split"`, etc.
|
1731
|
-
|
1732
|
-
Allows to group ulabels by type, e.g., all donors, all split ulabels, etc.
|
1733
|
-
"""
|
1734
|
-
records: ULabel
|
1735
|
-
"""Records of this type."""
|
1736
|
-
is_type: bool = BooleanField(default=False, db_index=True, null=True)
|
1737
|
-
"""Distinguish types from instances of the type.
|
1738
|
-
|
1739
|
-
For example, a ulabel "Project" would be a type, and the actual projects "Project 1", "Project 2", would be records of that `type`.
|
1740
|
-
"""
|
1741
|
-
description: str | None = CharField(null=True, db_index=True)
|
1742
|
-
"""A description (optional)."""
|
1743
|
-
reference: str | None = CharField(max_length=255, db_index=True, null=True)
|
1744
|
-
"""A reference like URL or external ID."""
|
1745
|
-
reference_type: str | None = CharField(max_length=25, db_index=True, null=True)
|
1746
|
-
"""Type of reference such as a donor_id from Vendor X."""
|
1747
|
-
parents: ULabel = models.ManyToManyField(
|
1748
|
-
"self", symmetrical=False, related_name="children"
|
1749
|
-
)
|
1750
|
-
"""Parent entities of this ulabel.
|
1751
|
-
|
1752
|
-
For advanced use cases, you can build an ontology under a given `type`.
|
1753
|
-
|
1754
|
-
Say, if you modeled `CellType` as a `ULabel`, you would introduce a type `CellType` and model the hiearchy of cell types under it.
|
1755
|
-
"""
|
1756
|
-
children: ULabel
|
1757
|
-
"""Child entities of this ulabel.
|
1758
|
-
|
1759
|
-
Reverse accessor for parents.
|
1760
|
-
"""
|
1761
|
-
transforms: Transform
|
1762
|
-
"""Transforms annotated with this ulabel."""
|
1763
|
-
runs: Transform
|
1764
|
-
"""Runs annotated with this ulabel."""
|
1765
|
-
artifacts: Artifact
|
1766
|
-
"""Artifacts annotated with this ulabel."""
|
1767
|
-
collections: Collection
|
1768
|
-
"""Collections annotated with this ulabel."""
|
1769
|
-
projects: Project
|
1770
|
-
"""Associated projects."""
|
1771
|
-
|
1772
|
-
@overload
|
1773
|
-
def __init__(
|
1774
|
-
self,
|
1775
|
-
name: str,
|
1776
|
-
type: ULabel | None = None,
|
1777
|
-
is_type: bool = False,
|
1778
|
-
description: str | None = None,
|
1779
|
-
reference: str | None = None,
|
1780
|
-
reference_type: str | None = None,
|
1781
|
-
): ...
|
1782
|
-
|
1783
|
-
@overload
|
1784
|
-
def __init__(
|
1785
|
-
self,
|
1786
|
-
*db_args,
|
1787
|
-
): ...
|
1788
|
-
|
1789
|
-
def __init__(
|
1790
|
-
self,
|
1791
|
-
*args,
|
1792
|
-
**kwargs,
|
1793
|
-
):
|
1794
|
-
pass
|
1795
|
-
|
1796
|
-
|
1797
|
-
class Feature(Record, CanCurate, TracksRun, TracksUpdates):
|
1798
|
-
"""Dataset dimensions.
|
1799
|
-
|
1800
|
-
A feature represents a dimension of a dataset, such as a column in a
|
1801
|
-
`DataFrame`. The `Feature` registry organizes metadata of features.
|
1802
|
-
|
1803
|
-
The `Feature` registry helps you organize and query datasets based on their
|
1804
|
-
features and corresponding label annotations. For instance, when working
|
1805
|
-
with a "T cell" label, it could be measured through different features
|
1806
|
-
such as `"cell_type_by_expert"` where an expert manually classified the
|
1807
|
-
cell, or `"cell_type_by_model"` where a computational model made the
|
1808
|
-
classification.
|
1809
|
-
|
1810
|
-
The two most important metadata of a feature are its `name` and the `dtype`.
|
1811
|
-
In addition to typical data types, LaminDB has a `"num"` `dtype` to
|
1812
|
-
concisely denote the union of all numerical types.
|
1813
|
-
|
1814
|
-
Args:
|
1815
|
-
name: `str` Name of the feature, typically. column name.
|
1816
|
-
dtype: `FeatureDtype | Registry | list[Registry] | FieldAttr` See :class:`~lamindb.base.types.FeatureDtype`.
|
1817
|
-
For categorical types, can define from which registry values are
|
1818
|
-
sampled, e.g., `ULabel` or `[ULabel, bionty.CellType]`.
|
1819
|
-
unit: `str | None = None` Unit of measure, ideally SI (`"m"`, `"s"`, `"kg"`, etc.) or `"normalized"` etc.
|
1820
|
-
description: `str | None = None` A description.
|
1821
|
-
synonyms: `str | None = None` Bar-separated synonyms.
|
1822
|
-
nullable: `bool = True` Whether the feature can have null-like values (`None`, `pd.NA`, `NaN`, etc.), see :attr:`~lamindb.Feature.nullable`.
|
1823
|
-
default_value: `Any | None = None` Default value for the feature.
|
1824
|
-
cat_filters: `dict[str, str] | None = None` Subset a registry by additional filters to define valid categories.
|
1825
|
-
|
1826
|
-
Note:
|
1827
|
-
|
1828
|
-
For more control, you can use :mod:`bionty` registries to manage simple
|
1829
|
-
biological entities like genes, proteins & cell markers. Or you define
|
1830
|
-
custom registries to manage high-level derived features like gene sets.
|
1831
|
-
|
1832
|
-
See Also:
|
1833
|
-
:meth:`~lamindb.Feature.from_df`
|
1834
|
-
Create feature records from DataFrame.
|
1835
|
-
:attr:`~lamindb.Artifact.features`
|
1836
|
-
Feature manager of an artifact or collection.
|
1837
|
-
:class:`~lamindb.ULabel`
|
1838
|
-
Universal labels.
|
1839
|
-
:class:`~lamindb.Schema`
|
1840
|
-
Feature sets.
|
1841
|
-
|
1842
|
-
Example:
|
1843
|
-
|
1844
|
-
A simple `"str"` feature.
|
1845
|
-
|
1846
|
-
>>> ln.Feature(
|
1847
|
-
... name="sample_note",
|
1848
|
-
... dtype="str",
|
1849
|
-
... ).save()
|
1850
|
-
|
1851
|
-
A dtype `"cat[ULabel]"` can be more easily passed as below.
|
1852
|
-
|
1853
|
-
>>> ln.Feature(
|
1854
|
-
... name="project",
|
1855
|
-
... dtype=ln.ULabel,
|
1856
|
-
... ).save()
|
1857
|
-
|
1858
|
-
A dtype `"cat[ULabel|bionty.CellType]"` can be more easily passed as below.
|
1859
|
-
|
1860
|
-
>>> ln.Feature(
|
1861
|
-
... name="cell_type",
|
1862
|
-
... dtype=[ln.ULabel, bt.CellType],
|
1863
|
-
... ).save()
|
1864
|
-
|
1865
|
-
Hint:
|
1866
|
-
|
1867
|
-
*Features* and *labels* denote two ways of using entities to organize data:
|
1868
|
-
|
1869
|
-
1. A feature qualifies *what* is measured, i.e., a numerical or categorical random variable
|
1870
|
-
2. A label *is* a measured value, i.e., a category
|
1871
|
-
|
1872
|
-
Consider annotating a dataset by that it measured expression of 30k
|
1873
|
-
genes: genes relate to the dataset as feature identifiers through a
|
1874
|
-
feature set with 30k members. Now consider annotating the artifact by
|
1875
|
-
whether that it measured the knock-out of 3 genes: here, the 3 genes act
|
1876
|
-
as labels of the dataset.
|
1877
|
-
|
1878
|
-
Re-shaping data can introduce ambiguity among features & labels. If this
|
1879
|
-
happened, ask yourself what the joint measurement was: a feature
|
1880
|
-
qualifies variables in a joint measurement. The canonical data matrix
|
1881
|
-
lists jointly measured variables in the columns.
|
1882
|
-
|
1883
|
-
"""
|
1884
|
-
|
1885
|
-
class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta):
|
1886
|
-
abstract = False
|
1887
|
-
|
1888
|
-
_name_field: str = "name"
|
1889
|
-
_aux_fields: dict[str, tuple[str, type]] = {
|
1890
|
-
"0": ("default_value", bool),
|
1891
|
-
"1": ("nullable", bool),
|
1892
|
-
}
|
1893
|
-
|
1894
|
-
id: int = models.AutoField(primary_key=True)
|
1895
|
-
"""Internal id, valid only in one DB instance."""
|
1896
|
-
uid: str = CharField(
|
1897
|
-
editable=False, unique=True, db_index=True, max_length=12, default=base62_12
|
1898
|
-
)
|
1899
|
-
"""Universal id, valid across DB instances."""
|
1900
|
-
name: str = CharField(max_length=150, db_index=True, unique=True)
|
1901
|
-
"""Name of feature (hard unique constraint `unique=True`)."""
|
1902
|
-
dtype: FeatureDtype | None = CharField(db_index=True, null=True)
|
1903
|
-
"""Data type (:class:`~lamindb.base.types.FeatureDtype`).
|
1904
|
-
|
1905
|
-
For categorical types, can define from which registry values are
|
1906
|
-
sampled, e.g., `'cat[ULabel]'` or `'cat[bionty.CellType]'`. Unions are also
|
1907
|
-
allowed if the feature samples from two registries, e.g., `'cat[ULabel|bionty.CellType]'`
|
1908
|
-
"""
|
1909
|
-
type: Feature | None = ForeignKey(
|
1910
|
-
"self", PROTECT, null=True, related_name="records"
|
1911
|
-
)
|
1912
|
-
"""Type of feature (e.g., 'Readout', 'Metric', 'Metadata', 'ExpertAnnotation', 'ModelPrediction').
|
1913
|
-
|
1914
|
-
Allows to group features by type, e.g., all read outs, all metrics, etc.
|
1915
|
-
"""
|
1916
|
-
records: Feature
|
1917
|
-
"""Records of this type."""
|
1918
|
-
is_type: bool = BooleanField(default=False, db_index=True, null=True)
|
1919
|
-
"""Distinguish types from instances of the type."""
|
1920
|
-
unit: str | None = CharField(max_length=30, db_index=True, null=True)
|
1921
|
-
"""Unit of measure, ideally SI (`m`, `s`, `kg`, etc.) or 'normalized' etc. (optional)."""
|
1922
|
-
description: str | None = CharField(db_index=True, null=True)
|
1923
|
-
"""A description."""
|
1924
|
-
array_rank: int = models.SmallIntegerField(default=0, db_index=True)
|
1925
|
-
"""Rank of feature.
|
1926
|
-
|
1927
|
-
Number of indices of the array: 0 for scalar, 1 for vector, 2 for matrix.
|
1928
|
-
|
1929
|
-
Is called `.ndim` in `numpy` and `pytorch` but shouldn't be confused with
|
1930
|
-
the dimension of the feature space.
|
1931
|
-
"""
|
1932
|
-
array_size: int = models.IntegerField(default=0, db_index=True)
|
1933
|
-
"""Number of elements of the feature.
|
1934
|
-
|
1935
|
-
Total number of elements (product of shape components) of the array.
|
1936
|
-
|
1937
|
-
- A number or string (a scalar): 1
|
1938
|
-
- A 50-dimensional embedding: 50
|
1939
|
-
- A 25 x 25 image: 625
|
1940
|
-
"""
|
1941
|
-
array_shape: list[int] | None = JSONField(default=None, db_default=None, null=True)
|
1942
|
-
"""Shape of the feature.
|
1943
|
-
|
1944
|
-
- A number or string (a scalar): [1]
|
1945
|
-
- A 50-dimensional embedding: [50]
|
1946
|
-
- A 25 x 25 image: [25, 25]
|
1947
|
-
|
1948
|
-
Is stored as a list rather than a tuple because it's serialized as JSON.
|
1949
|
-
"""
|
1950
|
-
proxy_dtype: FeatureDtype | None = CharField(default=None, null=True)
|
1951
|
-
"""Proxy data type.
|
1952
|
-
|
1953
|
-
If the feature is an image it's often stored via a path to the image file. Hence, while the dtype might be
|
1954
|
-
image with a certain shape, the proxy dtype would be str.
|
1955
|
-
"""
|
1956
|
-
synonyms: str | None = TextField(null=True)
|
1957
|
-
"""Bar-separated (|) synonyms (optional)."""
|
1958
|
-
# we define the below ManyToMany on the feature model because it parallels
|
1959
|
-
# how other registries (like Gene, Protein, etc.) relate to Schema
|
1960
|
-
# it makes the API more consistent
|
1961
|
-
schemas: Schema = models.ManyToManyField(
|
1962
|
-
"Schema", through="SchemaFeature", related_name="features"
|
1963
|
-
)
|
1964
|
-
"""Feature sets linked to this feature."""
|
1965
|
-
_expect_many: bool = models.BooleanField(default=True, db_default=True)
|
1966
|
-
"""Indicates whether values for this feature are expected to occur a single or multiple times for an artifact (default `True`).
|
1967
|
-
|
1968
|
-
- if it's `True` (default), the values come from an observation-level aggregation and a dtype of `datetime` on the observation-level mean `set[datetime]` on the artifact-level
|
1969
|
-
- if it's `False` it's an artifact-level value and datetime means datetime; this is an edge case because an arbitrary artifact would always be a set of arbitrary measurements that would need to be aggregated ("one just happens to measure a single cell line in that artifact")
|
1970
|
-
"""
|
1971
|
-
_curation: dict[str, Any] = JSONField(default=None, db_default=None, null=True)
|
1972
|
-
# backward fields
|
1973
|
-
values: FeatureValue
|
1974
|
-
"""Values for this feature."""
|
1975
|
-
|
1976
|
-
@overload
|
1977
|
-
def __init__(
|
1978
|
-
self,
|
1979
|
-
name: str,
|
1980
|
-
dtype: FeatureDtype | Registry | list[Registry] | FieldAttr,
|
1981
|
-
type: Feature | None = None,
|
1982
|
-
is_type: bool = False,
|
1983
|
-
unit: str | None = None,
|
1984
|
-
description: str | None = None,
|
1985
|
-
synonyms: str | None = None,
|
1986
|
-
nullable: bool = True,
|
1987
|
-
default_value: str | None = None,
|
1988
|
-
cat_filters: dict[str, str] | None = None,
|
1989
|
-
): ...
|
1990
|
-
|
1991
|
-
@overload
|
1992
|
-
def __init__(
|
1993
|
-
self,
|
1994
|
-
*db_args,
|
1995
|
-
): ...
|
1996
|
-
|
1997
|
-
def __init__(
|
1998
|
-
self,
|
1999
|
-
*args,
|
2000
|
-
**kwargs,
|
2001
|
-
):
|
2002
|
-
pass
|
2003
|
-
|
2004
|
-
@classmethod
|
2005
|
-
def from_df(cls, df: pd.DataFrame, field: FieldAttr | None = None) -> RecordList:
|
2006
|
-
"""Create Feature records for columns."""
|
2007
|
-
pass
|
2008
|
-
|
2009
|
-
def save(self, *args, **kwargs) -> Feature:
|
2010
|
-
"""Save."""
|
2011
|
-
pass
|
2012
|
-
|
2013
|
-
@property
|
2014
|
-
def default_value(self) -> Any:
|
2015
|
-
"""A default value that overwrites missing values (default `None`).
|
2016
|
-
|
2017
|
-
This takes effect when you call `Curator.standardize()`.
|
2018
|
-
"""
|
2019
|
-
if self._aux is not None and "af" in self._aux and "0" in self._aux["af"]:
|
2020
|
-
return self._aux["af"]["0"]
|
2021
|
-
else:
|
2022
|
-
return None
|
2023
|
-
|
2024
|
-
@default_value.setter
|
2025
|
-
def default_value(self, value: bool) -> None:
|
2026
|
-
if self._aux is None:
|
2027
|
-
self._aux = {}
|
2028
|
-
if "af" not in self._aux:
|
2029
|
-
self._aux["af"] = {}
|
2030
|
-
self._aux["af"]["0"] = value
|
2031
|
-
|
2032
|
-
@property
|
2033
|
-
def nullable(self) -> bool:
|
2034
|
-
"""Indicates whether the feature can have nullable values (default `True`).
|
2035
|
-
|
2036
|
-
Example::
|
2037
|
-
|
2038
|
-
import lamindb as ln
|
2039
|
-
import pandas as pd
|
2040
|
-
|
2041
|
-
disease = ln.Feature(name="disease", dtype=ln.ULabel, nullable=False).save()
|
2042
|
-
schema = ln.Schema(features=[disease]).save()
|
2043
|
-
dataset = {"disease": pd.Categorical([pd.NA, "asthma"])}
|
2044
|
-
df = pd.DataFrame(dataset)
|
2045
|
-
curator = ln.curators.DataFrameCurator(df, schema)
|
2046
|
-
try:
|
2047
|
-
curator.validate()
|
2048
|
-
except ln.errors.ValidationError as e:
|
2049
|
-
assert str(e).startswith("non-nullable series 'disease' contains null values")
|
2050
|
-
|
2051
|
-
"""
|
2052
|
-
if self._aux is not None and "af" in self._aux and "1" in self._aux["af"]:
|
2053
|
-
return self._aux["af"]["1"]
|
2054
|
-
else:
|
2055
|
-
return True
|
2056
|
-
|
2057
|
-
@nullable.setter
|
2058
|
-
def nullable(self, value: bool) -> None:
|
2059
|
-
if self._aux is None:
|
2060
|
-
self._aux = {}
|
2061
|
-
if "af" not in self._aux:
|
2062
|
-
self._aux["af"] = {}
|
2063
|
-
self._aux["af"]["1"] = value
|
2064
|
-
|
2065
|
-
|
2066
|
-
class FeatureValue(Record, TracksRun):
|
2067
|
-
"""Non-categorical features values.
|
2068
|
-
|
2069
|
-
Categorical feature values are stored in their respective registries:
|
2070
|
-
:class:`~lamindb.ULabel`, :class:`~bionty.CellType`, etc.
|
2071
|
-
|
2072
|
-
Unlike for ULabel, in `FeatureValue`, values are grouped by features and
|
2073
|
-
not by an ontological hierarchy.
|
2074
|
-
"""
|
2075
|
-
|
2076
|
-
# we do not have a unique constraint on feature & value because it leads to hashing errors
|
2077
|
-
# for large dictionaries: https://lamin.ai/laminlabs/lamindata/transform/jgTrkoeuxAfs0000
|
2078
|
-
# we do not hash values because we have `get_or_create` logic all over the place
|
2079
|
-
# and also for checking whether the (feature, value) combination exists
|
2080
|
-
# there does not seem an issue with querying for a dict-like value
|
2081
|
-
# https://lamin.ai/laminlabs/lamindata/transform/jgTrkoeuxAfs0001
|
2082
|
-
|
2083
|
-
_name_field: str = "value"
|
2084
|
-
|
2085
|
-
feature: Feature | None = ForeignKey(
|
2086
|
-
Feature, CASCADE, null=True, related_name="values", default=None
|
2087
|
-
)
|
2088
|
-
"""The dimension metadata."""
|
2089
|
-
value: Any = models.JSONField()
|
2090
|
-
"""The JSON-like value."""
|
2091
|
-
hash: str = CharField(max_length=HASH_LENGTH, null=True, db_index=True)
|
2092
|
-
"""Value hash."""
|
2093
|
-
|
2094
|
-
class Meta(BasicRecord.Meta, TracksRun.Meta):
|
2095
|
-
constraints = [
|
2096
|
-
# For simple types, use direct value comparison
|
2097
|
-
models.UniqueConstraint(
|
2098
|
-
fields=["feature", "value"],
|
2099
|
-
name="unique_simple_feature_value",
|
2100
|
-
condition=Q(hash__isnull=True),
|
2101
|
-
),
|
2102
|
-
# For complex types (dictionaries), use hash
|
2103
|
-
models.UniqueConstraint(
|
2104
|
-
fields=["feature", "hash"],
|
2105
|
-
name="unique_complex_feature_value",
|
2106
|
-
condition=Q(hash__isnull=False),
|
2107
|
-
),
|
2108
|
-
]
|
2109
|
-
|
2110
|
-
@classmethod
|
2111
|
-
def get_or_create(cls, feature, value):
|
2112
|
-
# Simple types: int, float, str, bool
|
2113
|
-
if isinstance(value, (int, float, str, bool)):
|
2114
|
-
try:
|
2115
|
-
return (
|
2116
|
-
cls.objects.create(feature=feature, value=value, hash=None),
|
2117
|
-
False,
|
2118
|
-
)
|
2119
|
-
except IntegrityError:
|
2120
|
-
return cls.objects.get(feature=feature, value=value), True
|
2121
|
-
|
2122
|
-
# Complex types: dict, list
|
2123
|
-
else:
|
2124
|
-
hash = hash_dict(value)
|
2125
|
-
try:
|
2126
|
-
return (
|
2127
|
-
cls.objects.create(feature=feature, value=value, hash=hash),
|
2128
|
-
False,
|
2129
|
-
)
|
2130
|
-
except IntegrityError:
|
2131
|
-
return cls.objects.get(feature=feature, hash=hash), True
|
2132
|
-
|
2133
|
-
|
2134
|
-
class Schema(Record, CanCurate, TracksRun):
|
2135
|
-
"""Schemas / feature sets.
|
2136
|
-
|
2137
|
-
Stores references to dataset schemas: these are the sets of columns in a dataset
|
2138
|
-
that correspond to :class:`~lamindb.Feature`, :class:`~bionty.Gene`, :class:`~bionty.Protein` or other
|
2139
|
-
entities.
|
2140
|
-
|
2141
|
-
.. dropdown:: Why does LaminDB model feature sets, not just features?
|
2142
|
-
|
2143
|
-
1. Performance: Imagine you measure the same panel of 20k transcripts in
|
2144
|
-
1M samples. By modeling the panel as a feature set, you can link all
|
2145
|
-
your artifacts against one feature set and only need to store 1M
|
2146
|
-
instead of 1M x 20k = 20B links.
|
2147
|
-
2. Interpretation: Model protein panels, gene panels, etc.
|
2148
|
-
3. Data integration: Feature sets provide the information that determines whether two datasets can be meaningfully concatenated.
|
2149
|
-
|
2150
|
-
These reasons do not hold for label sets. Hence, LaminDB does not model label sets.
|
2151
|
-
|
2152
|
-
Args:
|
2153
|
-
features: `Iterable[Record] | None = None` An iterable of :class:`~lamindb.Feature`
|
2154
|
-
records to hash, e.g., `[Feature(...), Feature(...)]`. Is turned into
|
2155
|
-
a set upon instantiation. If you'd like to pass values, use
|
2156
|
-
:meth:`~lamindb.Schema.from_values` or
|
2157
|
-
:meth:`~lamindb.Schema.from_df`.
|
2158
|
-
components: `dict[str, Schema] | None = None` A dictionary mapping component names to
|
2159
|
-
their corresponding :class:`~lamindb.Schema` objects for composite schemas.
|
2160
|
-
name: `str | None = None` A name.
|
2161
|
-
description: `str | None = None` A description.
|
2162
|
-
dtype: `str | None = None` The simple type. Defaults to
|
2163
|
-
`None` for sets of :class:`~lamindb.Feature` records.
|
2164
|
-
Otherwise defaults to `"num"` (e.g., for sets of :class:`~bionty.Gene`).
|
2165
|
-
itype: `str | None = None` The schema identifier type (e.g. :class:`~lamindb.Feature`, :class:`~bionty.Gene`, ...).
|
2166
|
-
type: `Schema | None = None` A type.
|
2167
|
-
is_type: `bool = False` Distinguish types from instances of the type.
|
2168
|
-
otype: `str | None = None` An object type to define the structure of a composite schema.
|
2169
|
-
minimal_set: `bool = True` Whether the schema contains a minimal set of linked features.
|
2170
|
-
ordered_set: `bool = False` Whether features are required to be ordered.
|
2171
|
-
maximal_set: `bool = False` If `True`, no additional features are allowed.
|
2172
|
-
slot: `str | None = None` The slot name when this schema is used as a component in a
|
2173
|
-
composite schema.
|
2174
|
-
coerce_dtype: `bool = False` When True, attempts to coerce values to the specified dtype
|
2175
|
-
during validation, see :attr:`~lamindb.Schema.coerce_dtype`.
|
2176
|
-
|
2177
|
-
Note:
|
2178
|
-
|
2179
|
-
A feature set can be identified by the `hash` of its feature uids.
|
2180
|
-
It's stored in the `.hash` field.
|
2181
|
-
|
2182
|
-
A `slot` provides a string key to access feature sets. For instance, for the schema of an
|
2183
|
-
`AnnData` object, it would be `'obs'` for `adata.obs`.
|
2184
|
-
|
2185
|
-
See Also:
|
2186
|
-
:meth:`~lamindb.Schema.from_values`
|
2187
|
-
Create from values.
|
2188
|
-
:meth:`~lamindb.Schema.from_df`
|
2189
|
-
Create from dataframe columns.
|
2190
|
-
|
2191
|
-
Examples:
|
2192
|
-
|
2193
|
-
Create a schema (feature set) from df with types:
|
2194
|
-
|
2195
|
-
>>> df = pd.DataFrame({"feat1": [1, 2], "feat2": [3.1, 4.2], "feat3": ["cond1", "cond2"]})
|
2196
|
-
>>> schema = ln.Schema.from_df(df)
|
2197
|
-
|
2198
|
-
Create a schema (feature set) from features:
|
2199
|
-
|
2200
|
-
>>> features = [ln.Feature(name=feat, dtype="float").save() for feat in ["feat1", "feat2"]]
|
2201
|
-
>>> schema = ln.Schema(features)
|
2202
|
-
|
2203
|
-
Create a schema (feature set) from identifier values:
|
2204
|
-
|
2205
|
-
>>> import bionty as bt
|
2206
|
-
>>> schema = ln.Schema.from_values(adata.var["ensemble_id"], Gene.ensembl_gene_id, organism="mouse").save()
|
2207
|
-
|
2208
|
-
"""
|
2209
|
-
|
2210
|
-
class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta):
|
2211
|
-
abstract = False
|
2212
|
-
|
2213
|
-
_name_field: str = "name"
|
2214
|
-
_aux_fields: dict[str, tuple[str, type]] = {"0": ("coerce_dtype", bool)}
|
2215
|
-
|
2216
|
-
id: int = models.AutoField(primary_key=True)
|
2217
|
-
"""Internal id, valid only in one DB instance."""
|
2218
|
-
uid: str = CharField(editable=False, unique=True, db_index=True, max_length=20)
|
2219
|
-
"""A universal id (hash of the set of feature values)."""
|
2220
|
-
name: str | None = CharField(max_length=150, null=True, db_index=True)
|
2221
|
-
"""A name."""
|
2222
|
-
description: str | None = CharField(null=True, db_index=True)
|
2223
|
-
"""A description."""
|
2224
|
-
n = IntegerField()
|
2225
|
-
"""Number of features in the set."""
|
2226
|
-
dtype: str | None = CharField(max_length=64, null=True, editable=False)
|
2227
|
-
"""Data type, e.g., "num", "float", "int". Is `None` for :class:`~lamindb.Feature`.
|
2228
|
-
|
2229
|
-
For :class:`~lamindb.Feature`, types are expected to be heterogeneous and defined on a per-feature level.
|
2230
|
-
"""
|
2231
|
-
itype: str | None = CharField(
|
2232
|
-
max_length=120, db_index=True, null=True, editable=False
|
2233
|
-
)
|
2234
|
-
"""A registry that stores feature identifiers used in this schema, e.g., `'Feature'` or `'bionty.Gene'`.
|
2235
|
-
|
2236
|
-
Depending on the registry, `.members` stores, e.g., `Feature` or `bionty.Gene` records.
|
2237
|
-
|
2238
|
-
.. versionchanged:: 1.0.0
|
2239
|
-
Was called `registry` before.
|
2240
|
-
"""
|
2241
|
-
type: Schema | None = ForeignKey("self", PROTECT, null=True, related_name="records")
|
2242
|
-
"""Type of schema.
|
2243
|
-
|
2244
|
-
Allows to group schemas by type, e.g., all meassurements evaluating gene expression vs. protein expression vs. multi modal.
|
2245
|
-
|
2246
|
-
You can define types via `ln.Schema(name="ProteinPanel", is_type=True)`.
|
2247
|
-
|
2248
|
-
Here are a few more examples for type names: `'ExpressionPanel'`, `'ProteinPanel'`, `'Multimodal'`, `'Metadata'`, `'Embedding'`.
|
2249
|
-
"""
|
2250
|
-
records: Schema
|
2251
|
-
"""Records of this type."""
|
2252
|
-
is_type: bool = BooleanField(default=False, db_index=True, null=True)
|
2253
|
-
"""Distinguish types from instances of the type."""
|
2254
|
-
otype: str | None = CharField(max_length=64, db_index=True, null=True)
|
2255
|
-
"""Default Python object type, e.g., DataFrame, AnnData."""
|
2256
|
-
hash: str | None = CharField(
|
2257
|
-
max_length=HASH_LENGTH, db_index=True, null=True, editable=False
|
2258
|
-
)
|
2259
|
-
"""A hash of the set of feature identifiers.
|
2260
|
-
|
2261
|
-
For a composite schema, the hash of hashes.
|
2262
|
-
"""
|
2263
|
-
minimal_set: bool = BooleanField(default=True, db_index=True, editable=False)
|
2264
|
-
"""Whether the schema contains a minimal set of linked features (default `True`).
|
2265
|
-
|
2266
|
-
If `False`, no features are linked to this schema.
|
2267
|
-
|
2268
|
-
If `True`, features are linked and considered as a minimally required set in validation.
|
2269
|
-
"""
|
2270
|
-
ordered_set: bool = BooleanField(default=False, db_index=True, editable=False)
|
2271
|
-
"""Whether features are required to be ordered (default `False`)."""
|
2272
|
-
maximal_set: bool = BooleanField(default=False, db_index=True, editable=False)
|
2273
|
-
"""If `False`, additional features are allowed (default `False`).
|
2274
|
-
|
2275
|
-
If `True`, the the minimal set is a maximal set and no additional features are allowed.
|
2276
|
-
"""
|
2277
|
-
components: Schema = ManyToManyField(
|
2278
|
-
"self", through="SchemaComponent", symmetrical=False, related_name="composites"
|
2279
|
-
)
|
2280
|
-
"""Components of this schema."""
|
2281
|
-
composites: Schema
|
2282
|
-
"""The composite schemas that contains this schema as a component.
|
2283
|
-
|
2284
|
-
For example, an `AnnData` composes multiple schemas: `var[DataFrameT]`, `obs[DataFrame]`, `obsm[Array]`, `uns[dict]`, etc.
|
2285
|
-
"""
|
2286
|
-
features: Feature
|
2287
|
-
"""The features contained in the schema."""
|
2288
|
-
params: Param
|
2289
|
-
"""The params contained in the schema."""
|
2290
|
-
artifacts: Artifact
|
2291
|
-
"""The artifacts that measure a feature set that matches this schema."""
|
2292
|
-
validated_artifacts: Artifact
|
2293
|
-
"""The artifacts that were validated against this schema with a :class:`~lamindb.curators.Curator`."""
|
2294
|
-
projects: Project
|
2295
|
-
"""Associated projects."""
|
2296
|
-
_curation: dict[str, Any] = JSONField(default=None, db_default=None, null=True)
|
2297
|
-
# lamindb v2
|
2298
|
-
# _itype: ContentType = models.ForeignKey(ContentType, on_delete=models.CASCADE)
|
2299
|
-
# ""Index of the registry that stores the feature identifiers, e.g., `Feature` or `Gene`."""
|
2300
|
-
# -- the following two fields are dynamically removed from the API for now
|
2301
|
-
validated_by: Schema | None = ForeignKey(
|
2302
|
-
"self", PROTECT, related_name="validated_schemas", default=None, null=True
|
2303
|
-
)
|
2304
|
-
# """The schema that validated this schema during curation.
|
2305
|
-
|
2306
|
-
# When performing validation, the schema that enforced validation is often less concrete than what is validated.
|
2307
|
-
|
2308
|
-
# For instance, the set of measured features might be a superset of the minimally required set of features.
|
2309
|
-
# """
|
2310
|
-
# validated_schemas: Schema
|
2311
|
-
# """The schemas that were validated against this schema with a :class:`~lamindb.curators.Curator`."""
|
2312
|
-
composite: Schema | None = ForeignKey(
|
2313
|
-
"self", PROTECT, related_name="+", default=None, null=True
|
2314
|
-
)
|
2315
|
-
# The legacy foreign key
|
2316
|
-
slot: str | None = CharField(max_length=100, db_index=True, null=True)
|
2317
|
-
# The legacy slot
|
2318
|
-
|
2319
|
-
@overload
|
2320
|
-
def __init__(
|
2321
|
-
self,
|
2322
|
-
features: Iterable[Record] | None = None,
|
2323
|
-
components: dict[str, Schema] | None = None,
|
2324
|
-
name: str | None = None,
|
2325
|
-
description: str | None = None,
|
2326
|
-
dtype: str | None = None,
|
2327
|
-
itype: str | Registry | FieldAttr | None = None,
|
2328
|
-
type: Schema | None = None,
|
2329
|
-
is_type: bool = False,
|
2330
|
-
otype: str | None = None,
|
2331
|
-
minimal_set: bool = True,
|
2332
|
-
ordered_set: bool = False,
|
2333
|
-
maximal_set: bool = False,
|
2334
|
-
slot: str | None = None,
|
2335
|
-
coerce_dtype: bool = False,
|
2336
|
-
): ...
|
2337
|
-
|
2338
|
-
@overload
|
2339
|
-
def __init__(
|
2340
|
-
self,
|
2341
|
-
*db_args,
|
2342
|
-
): ...
|
2343
|
-
|
2344
|
-
def __init__(
|
2345
|
-
self,
|
2346
|
-
*args,
|
2347
|
-
**kwargs,
|
2348
|
-
):
|
2349
|
-
pass
|
2350
|
-
|
2351
|
-
@classmethod
|
2352
|
-
def from_values( # type: ignore
|
2353
|
-
cls,
|
2354
|
-
values: ListLike,
|
2355
|
-
field: FieldAttr = Feature.name,
|
2356
|
-
type: str | None = None,
|
2357
|
-
name: str | None = None,
|
2358
|
-
mute: bool = False,
|
2359
|
-
organism: Record | str | None = None,
|
2360
|
-
source: Record | None = None,
|
2361
|
-
raise_validation_error: bool = True,
|
2362
|
-
) -> Schema:
|
2363
|
-
"""Create feature set for validated features.
|
2364
|
-
|
2365
|
-
Args:
|
2366
|
-
values: A list of values, like feature names or ids.
|
2367
|
-
field: The field of a reference registry to map values.
|
2368
|
-
type: The simple type.
|
2369
|
-
Defaults to `None` if reference registry is :class:`~lamindb.Feature`,
|
2370
|
-
defaults to `"float"` otherwise.
|
2371
|
-
name: A name.
|
2372
|
-
organism: An organism to resolve gene mapping.
|
2373
|
-
source: A public ontology to resolve feature identifier mapping.
|
2374
|
-
raise_validation_error: Whether to raise a validation error if some values are not valid.
|
2375
|
-
|
2376
|
-
Raises:
|
2377
|
-
ValidationError: If some values are not valid.
|
2378
|
-
|
2379
|
-
Examples:
|
2380
|
-
|
2381
|
-
>>> features = [ln.Feature(name=feat, dtype="str").save() for feat in ["feat11", "feat21"]]
|
2382
|
-
>>> schema = ln.Schema.from_values(features)
|
2383
|
-
|
2384
|
-
>>> genes = ["ENSG00000139618", "ENSG00000198786"]
|
2385
|
-
>>> schema = ln.Schema.from_values(features, bt.Gene.ensembl_gene_id, "float")
|
2386
|
-
"""
|
2387
|
-
pass
|
2388
|
-
|
2389
|
-
@classmethod
|
2390
|
-
def from_df(
|
2391
|
-
cls,
|
2392
|
-
df: pd.DataFrame,
|
2393
|
-
field: FieldAttr = Feature.name,
|
2394
|
-
name: str | None = None,
|
2395
|
-
mute: bool = False,
|
2396
|
-
organism: Record | str | None = None,
|
2397
|
-
source: Record | None = None,
|
2398
|
-
) -> Schema | None:
|
2399
|
-
"""Create feature set for validated features."""
|
2400
|
-
pass
|
2401
|
-
|
2402
|
-
def save(self, *args, **kwargs) -> Schema:
|
2403
|
-
"""Save."""
|
2404
|
-
pass
|
2405
|
-
|
2406
|
-
@property
|
2407
|
-
def members(self) -> QuerySet:
|
2408
|
-
"""A queryset for the individual records of the set."""
|
2409
|
-
pass
|
2410
|
-
|
2411
|
-
@property
|
2412
|
-
def coerce_dtype(self) -> bool:
|
2413
|
-
"""Whether dtypes should be coerced during validation.
|
2414
|
-
|
2415
|
-
For example, a `objects`-dtyped pandas column can be coerced to `categorical` and would pass validation if this is true.
|
2416
|
-
"""
|
2417
|
-
if self._aux is not None and "af" in self._aux and "0" in self._aux["af"]:
|
2418
|
-
return self._aux["af"]["0"]
|
2419
|
-
else:
|
2420
|
-
return False
|
2421
|
-
|
2422
|
-
@coerce_dtype.setter
|
2423
|
-
def coerce_dtype(self, value: bool) -> None:
|
2424
|
-
if self._aux is None:
|
2425
|
-
self._aux = {}
|
2426
|
-
if "af" not in self._aux:
|
2427
|
-
self._aux["af"] = {}
|
2428
|
-
self._aux["af"]["0"] = value
|
2429
|
-
|
2430
|
-
@property
|
2431
|
-
@deprecated("itype")
|
2432
|
-
def registry(self) -> str:
|
2433
|
-
return self.itype
|
2434
|
-
|
2435
|
-
@registry.setter
|
2436
|
-
def registry(self, value) -> None:
|
2437
|
-
self.itype = value
|
2438
|
-
|
2439
|
-
def describe(self, return_str=False) -> None | str:
|
2440
|
-
"""Describe schema."""
|
2441
|
-
message = str(self) + "\ncomponents:"
|
2442
|
-
for component in self.components.all():
|
2443
|
-
message += "\n " + str(component)
|
2444
|
-
if return_str:
|
2445
|
-
return message
|
2446
|
-
else:
|
2447
|
-
print(message)
|
2448
|
-
return None
|
2449
|
-
|
2450
|
-
def _get_component(self, slot: str) -> Schema:
|
2451
|
-
return self.components.get(links_component__slot=slot)
|
2452
|
-
|
2453
|
-
|
2454
|
-
class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
|
2455
|
-
# Note that this docstring has to be consistent with Curator.save_artifact()
|
2456
|
-
"""Datasets & models stored as files, folders, or arrays.
|
2457
|
-
|
2458
|
-
Artifacts manage data in local or remote storage.
|
2459
|
-
|
2460
|
-
Some artifacts are array-like, e.g., when stored as `.parquet`, `.h5ad`,
|
2461
|
-
`.zarr`, or `.tiledb`.
|
2462
|
-
|
2463
|
-
Args:
|
2464
|
-
data: `UPathStr` A path to a local or remote folder or file.
|
2465
|
-
kind: `Literal["dataset", "model"] | None = None` Distinguish models from datasets from other files & folders.
|
2466
|
-
key: `str | None = None` A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a version family.
|
2467
|
-
description: `str | None = None` A description.
|
2468
|
-
revises: `Artifact | None = None` Previous version of the artifact. Is an alternative way to passing `key` to trigger a new version.
|
2469
|
-
run: `Run | None = None` The run that creates the artifact.
|
2470
|
-
|
2471
|
-
.. dropdown:: Typical storage formats & their API accessors
|
2472
|
-
|
2473
|
-
Arrays:
|
2474
|
-
|
2475
|
-
- Table: `.csv`, `.tsv`, `.parquet`, `.ipc` ⟷ `DataFrame`, `pyarrow.Table`
|
2476
|
-
- Annotated matrix: `.h5ad`, `.h5mu`, `.zrad` ⟷ `AnnData`, `MuData`
|
2477
|
-
- Generic array: HDF5 group, zarr group, TileDB store ⟷ HDF5, zarr, TileDB loaders
|
2478
|
-
|
2479
|
-
Non-arrays:
|
2480
|
-
|
2481
|
-
- Image: `.jpg`, `.png` ⟷ `np.ndarray`, ...
|
2482
|
-
- Fastq: `.fastq` ⟷ /
|
2483
|
-
- VCF: `.vcf` ⟷ /
|
2484
|
-
- QC: `.html` ⟷ /
|
2485
|
-
|
2486
|
-
You'll find these values in the `suffix` & `accessor` fields.
|
2487
|
-
|
2488
|
-
LaminDB makes some default choices (e.g., serialize a `DataFrame` as a `.parquet` file).
|
2489
|
-
|
2490
|
-
See Also:
|
2491
|
-
:class:`~lamindb.Storage`
|
2492
|
-
Storage locations for artifacts.
|
2493
|
-
:class:`~lamindb.Collection`
|
2494
|
-
Collections of artifacts.
|
2495
|
-
:meth:`~lamindb.Artifact.from_df`
|
2496
|
-
Create an artifact from a `DataFrame`.
|
2497
|
-
:meth:`~lamindb.Artifact.from_anndata`
|
2498
|
-
Create an artifact from an `AnnData`.
|
2499
|
-
|
2500
|
-
Examples:
|
2501
|
-
|
2502
|
-
Create an artifact by passing `key`:
|
2503
|
-
|
2504
|
-
>>> artifact = ln.Artifact("./my_file.parquet", key="example_datasets/my_file.parquet").save()
|
2505
|
-
>>> artifact = ln.Artifact("./my_folder", key="project1/my_folder").save()
|
2506
|
-
|
2507
|
-
Calling `.save()` uploads the file to the default storage location of your lamindb instance.
|
2508
|
-
(If it's a local instance, the "upload" is a mere copy operation.)
|
2509
|
-
|
2510
|
-
If your artifact is already in the cloud, lamindb auto-populates the `key` field based on the S3 key and there is no upload:
|
2511
|
-
|
2512
|
-
>>> artifact = ln.Artifact("s3://my_bucket/my_folder/my_file.csv").save()
|
2513
|
-
|
2514
|
-
You can make a new version of the artifact with `key = "example_datasets/my_file.parquet"`
|
2515
|
-
|
2516
|
-
>>> artifact_v2 = ln.Artifact("./my_file.parquet", key="example_datasets/my_file.parquet").save()
|
2517
|
-
>>> artifact_v2.versions.df() # see all versions
|
2518
|
-
|
2519
|
-
.. dropdown:: Why does the API look this way?
|
2520
|
-
|
2521
|
-
It's inspired by APIs building on AWS S3.
|
2522
|
-
|
2523
|
-
Both boto3 and quilt select a bucket (a storage location in LaminDB) and define a target path through a `key` argument.
|
2524
|
-
|
2525
|
-
In `boto3 <https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/bucket/upload_file.html>`__::
|
2526
|
-
|
2527
|
-
# signature: S3.Bucket.upload_file(filepath, key)
|
2528
|
-
import boto3
|
2529
|
-
s3 = boto3.resource('s3')
|
2530
|
-
bucket = s3.Bucket('mybucket')
|
2531
|
-
bucket.upload_file('/tmp/hello.txt', 'hello.txt')
|
2532
|
-
|
2533
|
-
In `quilt3 <https://docs.quiltdata.com/api-reference/bucket>`__::
|
2534
|
-
|
2535
|
-
# signature: quilt3.Bucket.put_file(key, filepath)
|
2536
|
-
import quilt3
|
2537
|
-
bucket = quilt3.Bucket('mybucket')
|
2538
|
-
bucket.put_file('hello.txt', '/tmp/hello.txt')
|
2539
|
-
|
2540
|
-
Sometimes you want to avoid mapping the artifact into a file hierarchy, and you can then _just_ populate `description` instead:
|
2541
|
-
|
2542
|
-
>>> artifact = ln.Artifact("s3://my_bucket/my_folder", description="My folder").save()
|
2543
|
-
>>> artifact = ln.Artifact("./my_local_folder", description="My local folder").save()
|
2544
|
-
|
2545
|
-
Because you can then not use `key`-based versioning you have to pass `revises` to make a new artifact version:
|
2546
|
-
|
2547
|
-
>>> artifact_v2 = ln.Artifact("./my_file.parquet", revises=old_artifact).save()
|
2548
|
-
|
2549
|
-
If an artifact with the exact same hash already exists, `Artifact()` returns the existing artifact. In concurrent workloads where
|
2550
|
-
the same artifact is created multiple times, `Artifact()` doesn't yet return the existing artifact but creates a new one; `.save()` however
|
2551
|
-
detects the duplication and will return the existing artifact.
|
2552
|
-
|
2553
|
-
"""
|
2554
|
-
|
2555
|
-
class Meta(Record.Meta, IsVersioned.Meta, TracksRun.Meta, TracksUpdates.Meta):
|
2556
|
-
abstract = False
|
2557
|
-
|
2558
|
-
_len_full_uid: int = 20
|
2559
|
-
_len_stem_uid: int = 16
|
2560
|
-
|
2561
|
-
params: ParamManager = ParamManagerArtifact # type: ignore
|
2562
|
-
"""Param manager.
|
2563
|
-
|
2564
|
-
Example::
|
2565
|
-
|
2566
|
-
artifact.params.add_values({
|
2567
|
-
"hidden_size": 32,
|
2568
|
-
"bottleneck_size": 16,
|
2569
|
-
"batch_size": 32,
|
2570
|
-
"preprocess_params": {
|
2571
|
-
"normalization_type": "cool",
|
2572
|
-
"subset_highlyvariable": True,
|
2573
|
-
},
|
2574
|
-
})
|
2575
|
-
"""
|
2576
|
-
|
2577
|
-
features: FeatureManager = FeatureManager # type: ignore
|
2578
|
-
"""Feature manager.
|
2579
|
-
|
2580
|
-
Features denote dataset dimensions, i.e., the variables that measure labels & numbers.
|
2581
|
-
|
2582
|
-
Annotate with features & values::
|
2583
|
-
|
2584
|
-
artifact.features.add_values({
|
2585
|
-
"species": organism, # here, organism is an Organism record
|
2586
|
-
"scientist": ['Barbara McClintock', 'Edgar Anderson'],
|
2587
|
-
"temperature": 27.6,
|
2588
|
-
"study": "Candidate marker study"
|
2589
|
-
})
|
2590
|
-
|
2591
|
-
Query for features & values::
|
2592
|
-
|
2593
|
-
ln.Artifact.features.filter(scientist="Barbara McClintock")
|
2594
|
-
|
2595
|
-
Features may or may not be part of the artifact content in storage. For
|
2596
|
-
instance, the :class:`~lamindb.Curator` flow validates the columns of a
|
2597
|
-
`DataFrame`-like artifact and annotates it with features corresponding to
|
2598
|
-
these columns. `artifact.features.add_values`, by contrast, does not
|
2599
|
-
validate the content of the artifact.
|
2600
|
-
"""
|
2601
|
-
|
2602
|
-
@property
|
2603
|
-
def labels(self) -> LabelManager:
|
2604
|
-
"""Label manager.
|
2605
|
-
|
2606
|
-
To annotate with labels, you typically use the registry-specific accessors,
|
2607
|
-
for instance :attr:`~lamindb.Artifact.ulabels`::
|
2608
|
-
|
2609
|
-
candidate_marker_study = ln.ULabel(name="Candidate marker study").save()
|
2610
|
-
artifact.ulabels.add(candidate_marker_study)
|
2611
|
-
|
2612
|
-
Similarly, you query based on these accessors::
|
2613
|
-
|
2614
|
-
ln.Artifact.filter(ulabels__name="Candidate marker study").all()
|
2615
|
-
|
2616
|
-
Unlike the registry-specific accessors, the `.labels` accessor provides
|
2617
|
-
a way of associating labels with features::
|
2618
|
-
|
2619
|
-
study = ln.Feature(name="study", dtype="cat").save()
|
2620
|
-
artifact.labels.add(candidate_marker_study, feature=study)
|
2621
|
-
|
2622
|
-
Note that the above is equivalent to::
|
2623
|
-
|
2624
|
-
artifact.features.add_values({"study": candidate_marker_study})
|
2625
|
-
"""
|
2626
|
-
from lamindb.core._label_manager import LabelManager
|
2627
|
-
|
2628
|
-
return LabelManager(self)
|
2629
|
-
|
2630
|
-
id: int = models.AutoField(primary_key=True)
|
2631
|
-
"""Internal id, valid only in one DB instance."""
|
2632
|
-
uid: str = CharField(
|
2633
|
-
editable=False, unique=True, db_index=True, max_length=_len_full_uid
|
2634
|
-
)
|
2635
|
-
"""A universal random id."""
|
2636
|
-
key: str | None = CharField(db_index=True, null=True)
|
2637
|
-
"""A (virtual) relative file path within the artifact's storage location.
|
2638
|
-
|
2639
|
-
Setting a `key` is useful to automatically group artifacts into a version family.
|
2640
|
-
|
2641
|
-
LaminDB defaults to a virtual file path to make renaming of data in object storage easy.
|
2642
|
-
|
2643
|
-
If you register existing files in a storage location, the `key` equals the
|
2644
|
-
actual filepath on the underyling filesytem or object store.
|
2645
|
-
"""
|
2646
|
-
description: str | None = CharField(db_index=True, null=True)
|
2647
|
-
"""A description."""
|
2648
|
-
storage: Storage = ForeignKey(
|
2649
|
-
Storage, PROTECT, related_name="artifacts", editable=False
|
2650
|
-
)
|
2651
|
-
"""Storage location, e.g. an S3 or GCP bucket or a local directory."""
|
2652
|
-
suffix: str = CharField(max_length=30, db_index=True, editable=False)
|
2653
|
-
# Initially, we thought about having this be nullable to indicate folders
|
2654
|
-
# But, for instance, .zarr is stored in a folder that ends with a .zarr suffix
|
2655
|
-
"""Path suffix or empty string if no canonical suffix exists.
|
2656
|
-
|
2657
|
-
This is either a file suffix (`".csv"`, `".h5ad"`, etc.) or the empty string "".
|
2658
|
-
"""
|
2659
|
-
kind: ArtifactKind | None = CharField(
|
2660
|
-
max_length=20,
|
2661
|
-
db_index=True,
|
2662
|
-
null=True,
|
2663
|
-
)
|
2664
|
-
""":class:`~lamindb.base.types.ArtifactKind` (default `None`)."""
|
2665
|
-
otype: str | None = CharField(
|
2666
|
-
max_length=64, db_index=True, null=True, editable=False
|
2667
|
-
)
|
2668
|
-
"""Default Python object type, e.g., DataFrame, AnnData."""
|
2669
|
-
size: int | None = BigIntegerField(
|
2670
|
-
null=True, db_index=True, default=None, editable=False
|
2671
|
-
)
|
2672
|
-
"""Size in bytes.
|
2673
|
-
|
2674
|
-
Examples: 1KB is 1e3 bytes, 1MB is 1e6, 1GB is 1e9, 1TB is 1e12 etc.
|
2675
|
-
"""
|
2676
|
-
hash: str | None = CharField(
|
2677
|
-
max_length=HASH_LENGTH, db_index=True, null=True, unique=True, editable=False
|
2678
|
-
)
|
2679
|
-
"""Hash or pseudo-hash of artifact content.
|
2680
|
-
|
2681
|
-
Useful to ascertain integrity and avoid duplication.
|
2682
|
-
"""
|
2683
|
-
n_files: int | None = BigIntegerField(
|
2684
|
-
null=True, db_index=True, default=None, editable=False
|
2685
|
-
)
|
2686
|
-
"""Number of files for folder-like artifacts, `None` for file-like artifacts.
|
2687
|
-
|
2688
|
-
Note that some arrays are also stored as folders, e.g., `.zarr` or `.tiledbsoma`.
|
2689
|
-
|
2690
|
-
.. versionchanged:: 1.0
|
2691
|
-
Renamed from `n_objects` to `n_files`.
|
2692
|
-
"""
|
2693
|
-
n_observations: int | None = BigIntegerField(
|
2694
|
-
null=True, db_index=True, default=None, editable=False
|
2695
|
-
)
|
2696
|
-
"""Number of observations.
|
2697
|
-
|
2698
|
-
Typically, this denotes the first array dimension.
|
2699
|
-
"""
|
2700
|
-
_hash_type: str | None = CharField(
|
2701
|
-
max_length=30, db_index=True, null=True, editable=False
|
2702
|
-
)
|
2703
|
-
"""Type of hash."""
|
2704
|
-
ulabels: ULabel = models.ManyToManyField(
|
2705
|
-
ULabel, through="ArtifactULabel", related_name="artifacts"
|
2706
|
-
)
|
2707
|
-
"""The ulabels measured in the artifact (:class:`~lamindb.ULabel`)."""
|
2708
|
-
run: Run | None = ForeignKey(
|
2709
|
-
Run,
|
2710
|
-
PROTECT,
|
2711
|
-
related_name="output_artifacts",
|
2712
|
-
null=True,
|
2713
|
-
default=None,
|
2714
|
-
editable=False,
|
2715
|
-
)
|
2716
|
-
"""Run that created the artifact."""
|
2717
|
-
input_of_runs: Run = models.ManyToManyField(Run, related_name="input_artifacts")
|
2718
|
-
"""Runs that use this artifact as an input."""
|
2719
|
-
# if the artifact is replicated or updated in a new run, we link the previous
|
2720
|
-
# run in previous_runs
|
2721
|
-
_previous_runs: Run = models.ManyToManyField(
|
2722
|
-
"Run", related_name="_output_artifacts_with_later_updates"
|
2723
|
-
)
|
2724
|
-
"""Sequence of runs that created or updated the record."""
|
2725
|
-
collections: Collection
|
2726
|
-
"""The collections that this artifact is part of."""
|
2727
|
-
schema: Schema | None = ForeignKey(
|
2728
|
-
Schema,
|
2729
|
-
PROTECT,
|
2730
|
-
null=True,
|
2731
|
-
default=None,
|
2732
|
-
related_name="validated_artifacts",
|
2733
|
-
)
|
2734
|
-
"""The schema that validated this artifact in a :class:`~lamindb.curators.Curator`."""
|
2735
|
-
feature_sets: Schema = models.ManyToManyField(
|
2736
|
-
Schema, related_name="artifacts", through="ArtifactSchema"
|
2737
|
-
)
|
2738
|
-
"""The feature sets measured by the artifact."""
|
2739
|
-
_feature_values: FeatureValue = models.ManyToManyField(
|
2740
|
-
FeatureValue, through="ArtifactFeatureValue", related_name="artifacts"
|
2741
|
-
)
|
2742
|
-
"""Non-categorical feature values for annotation."""
|
2743
|
-
_param_values: ParamValue = models.ManyToManyField(
|
2744
|
-
ParamValue, through="ArtifactParamValue", related_name="artifacts"
|
2745
|
-
)
|
2746
|
-
"""Parameter values."""
|
2747
|
-
_key_is_virtual: bool = BooleanField()
|
2748
|
-
"""Indicates whether `key` is virtual or part of an actual file path."""
|
2749
|
-
# be mindful that below, passing related_name="+" leads to errors
|
2750
|
-
_actions: Artifact = models.ManyToManyField(
|
2751
|
-
"self", symmetrical=False, related_name="_action_targets"
|
2752
|
-
)
|
2753
|
-
"""Actions to attach for the UI."""
|
2754
|
-
created_by: User = ForeignKey(
|
2755
|
-
"lamindb.User",
|
2756
|
-
PROTECT,
|
2757
|
-
default=current_user_id,
|
2758
|
-
related_name="created_artifacts",
|
2759
|
-
editable=False,
|
2760
|
-
)
|
2761
|
-
"""Creator of record."""
|
2762
|
-
_overwrite_versions: bool = BooleanField(default=None)
|
2763
|
-
"""Indicates whether to store or overwrite versions.
|
2764
|
-
|
2765
|
-
It defaults to False for file-like artifacts and to True for folder-like artifacts.
|
2766
|
-
"""
|
2767
|
-
projects: Project
|
2768
|
-
"""Associated projects."""
|
2769
|
-
references: Reference
|
2770
|
-
"""Associated references."""
|
2771
|
-
|
2772
|
-
@overload
|
2773
|
-
def __init__(
|
2774
|
-
self,
|
2775
|
-
# we're not choosing the name "path" for this arg because
|
2776
|
-
# it'd be confusing with `artifact.path`, which is not the same
|
2777
|
-
# so "data" conveys better that this is input data that's ingested
|
2778
|
-
# and will be moved to a target path at `artifact.path`
|
2779
|
-
# also internally, we sometimes pass "data objects" like a DataFrame
|
2780
|
-
# here; and we might refactor this but we might also keep that internal
|
2781
|
-
# usage
|
2782
|
-
data: UPathStr,
|
2783
|
-
kind: ArtifactKind | None = None,
|
2784
|
-
key: str | None = None,
|
2785
|
-
description: str | None = None,
|
2786
|
-
revises: Artifact | None = None,
|
2787
|
-
run: Run | None = None,
|
2788
|
-
): ...
|
2789
|
-
|
2790
|
-
@overload
|
2791
|
-
def __init__(
|
2792
|
-
self,
|
2793
|
-
*db_args,
|
2794
|
-
): ...
|
2795
|
-
|
2796
|
-
def __init__(
|
2797
|
-
self,
|
2798
|
-
*args,
|
2799
|
-
**kwargs,
|
2800
|
-
):
|
2801
|
-
pass
|
2802
|
-
|
2803
|
-
@property
|
2804
|
-
@deprecated("kind")
|
2805
|
-
def type(self) -> str:
|
2806
|
-
return self.kind
|
2807
|
-
|
2808
|
-
@property
|
2809
|
-
@deprecated("otype")
|
2810
|
-
def _accessor(self) -> str:
|
2811
|
-
return self.otype
|
2812
|
-
|
2813
|
-
@property
|
2814
|
-
def transform(self) -> Transform | None:
|
2815
|
-
"""Transform whose run created the artifact."""
|
2816
|
-
return self.run.transform if self.run is not None else None
|
2817
|
-
|
2818
|
-
@property
|
2819
|
-
@deprecated("n_files")
|
2820
|
-
def n_objects(self) -> int:
|
2821
|
-
return self.n_files
|
2822
|
-
|
2823
|
-
# add the below because this is what people will have in their code
|
2824
|
-
# if they implement the recommended migration strategy
|
2825
|
-
# - FeatureSet -> Schema
|
2826
|
-
# - featureset -> schema
|
2827
|
-
# - feature_set -> schema
|
2828
|
-
# @property
|
2829
|
-
# def schemas(self) -> QuerySet[Schema]:
|
2830
|
-
# """Schemas linked to artifact via many-to-many relationship.
|
2831
|
-
|
2832
|
-
# Is now mediating the private `.feature_sets` relationship during
|
2833
|
-
# a transition period to better schema management.
|
2834
|
-
|
2835
|
-
# .. versionchanged: 1.0
|
2836
|
-
# Was previously called `.feature_sets`.
|
2837
|
-
|
2838
|
-
# """
|
2839
|
-
# return self.feature_sets
|
2840
|
-
|
2841
|
-
@property
|
2842
|
-
def path(self) -> Path:
|
2843
|
-
"""Path.
|
2844
|
-
|
2845
|
-
File in cloud storage, here AWS S3:
|
2846
|
-
|
2847
|
-
>>> artifact = ln.Artifact("s3://my-bucket/my-file.csv").save()
|
2848
|
-
>>> artifact.path
|
2849
|
-
S3QueryPath('s3://my-bucket/my-file.csv')
|
2850
|
-
|
2851
|
-
File in local storage:
|
2852
|
-
|
2853
|
-
>>> ln.Artifact("./myfile.csv", key="myfile").save()
|
2854
|
-
>>> artifact = ln.Artifact.get(key="myfile")
|
2855
|
-
>>> artifact.path
|
2856
|
-
PosixPath('/home/runner/work/lamindb/lamindb/docs/guide/mydata/myfile.csv')
|
2857
|
-
"""
|
2858
|
-
pass
|
2859
|
-
|
2860
|
-
@classmethod
|
2861
|
-
def from_df(
|
2862
|
-
cls,
|
2863
|
-
df: pd.DataFrame,
|
2864
|
-
*,
|
2865
|
-
key: str | None = None,
|
2866
|
-
description: str | None = None,
|
2867
|
-
run: Run | None = None,
|
2868
|
-
revises: Artifact | None = None,
|
2869
|
-
**kwargs,
|
2870
|
-
) -> Artifact:
|
2871
|
-
"""Create from `DataFrame`, validate & link features.
|
2872
|
-
|
2873
|
-
Args:
|
2874
|
-
df: A `DataFrame` object.
|
2875
|
-
key: A relative path within default storage,
|
2876
|
-
e.g., `"myfolder/myfile.parquet"`.
|
2877
|
-
description: A description.
|
2878
|
-
revises: An old version of the artifact.
|
2879
|
-
run: The run that creates the artifact.
|
2880
|
-
|
2881
|
-
See Also:
|
2882
|
-
:meth:`~lamindb.Collection`
|
2883
|
-
Track collections.
|
2884
|
-
:class:`~lamindb.Feature`
|
2885
|
-
Track features.
|
2886
|
-
|
2887
|
-
Examples:
|
2888
|
-
>>> df = ln.core.datasets.df_iris_in_meter_batch1()
|
2889
|
-
>>> df.head()
|
2890
|
-
sepal_length sepal_width petal_length petal_width iris_organism_code
|
2891
|
-
0 0.051 0.035 0.014 0.002 0
|
2892
|
-
1 0.049 0.030 0.014 0.002 0
|
2893
|
-
2 0.047 0.032 0.013 0.002 0
|
2894
|
-
3 0.046 0.031 0.015 0.002 0
|
2895
|
-
4 0.050 0.036 0.014 0.002 0
|
2896
|
-
>>> artifact = ln.Artifact.from_df(df, description="Iris flower collection batch1")
|
2897
|
-
>>> artifact.save()
|
2898
|
-
"""
|
2899
|
-
pass
|
2900
|
-
|
2901
|
-
@classmethod
|
2902
|
-
def from_anndata(
|
2903
|
-
cls,
|
2904
|
-
adata: AnnData | UPathStr,
|
2905
|
-
*,
|
2906
|
-
key: str | None = None,
|
2907
|
-
description: str | None = None,
|
2908
|
-
run: Run | None = None,
|
2909
|
-
revises: Artifact | None = None,
|
2910
|
-
**kwargs,
|
2911
|
-
) -> Artifact:
|
2912
|
-
"""Create from ``AnnData``, validate & link features.
|
2913
|
-
|
2914
|
-
Args:
|
2915
|
-
adata: An `AnnData` object or a path of AnnData-like.
|
2916
|
-
key: A relative path within default storage,
|
2917
|
-
e.g., `"myfolder/myfile.h5ad"`.
|
2918
|
-
description: A description.
|
2919
|
-
revises: An old version of the artifact.
|
2920
|
-
run: The run that creates the artifact.
|
2921
|
-
|
2922
|
-
See Also:
|
2923
|
-
|
2924
|
-
:meth:`~lamindb.Collection`
|
2925
|
-
Track collections.
|
2926
|
-
:class:`~lamindb.Feature`
|
2927
|
-
Track features.
|
2928
|
-
|
2929
|
-
Examples:
|
2930
|
-
>>> import bionty as bt
|
2931
|
-
>>> bt.settings.organism = "human"
|
2932
|
-
>>> adata = ln.core.datasets.anndata_with_obs()
|
2933
|
-
>>> artifact = ln.Artifact.from_anndata(adata, description="mini anndata with obs")
|
2934
|
-
>>> artifact.save()
|
2935
|
-
"""
|
2936
|
-
pass
|
2937
|
-
|
2938
|
-
@classmethod
|
2939
|
-
def from_mudata(
|
2940
|
-
cls,
|
2941
|
-
mdata: MuData,
|
2942
|
-
*,
|
2943
|
-
key: str | None = None,
|
2944
|
-
description: str | None = None,
|
2945
|
-
run: Run | None = None,
|
2946
|
-
revises: Artifact | None = None,
|
2947
|
-
**kwargs,
|
2948
|
-
) -> Artifact:
|
2949
|
-
"""Create from ``MuData``, validate & link features.
|
2950
|
-
|
2951
|
-
Args:
|
2952
|
-
mdata: An `MuData` object.
|
2953
|
-
key: A relative path within default storage,
|
2954
|
-
e.g., `"myfolder/myfile.h5mu"`.
|
2955
|
-
description: A description.
|
2956
|
-
revises: An old version of the artifact.
|
2957
|
-
run: The run that creates the artifact.
|
2958
|
-
|
2959
|
-
See Also:
|
2960
|
-
:meth:`~lamindb.Collection`
|
2961
|
-
Track collections.
|
2962
|
-
:class:`~lamindb.Feature`
|
2963
|
-
Track features.
|
2964
|
-
|
2965
|
-
Examples:
|
2966
|
-
>>> import bionty as bt
|
2967
|
-
>>> bt.settings.organism = "human"
|
2968
|
-
>>> mdata = ln.core.datasets.mudata_papalexi21_subset()
|
2969
|
-
>>> artifact = ln.Artifact.from_mudata(mdata, description="a mudata object")
|
2970
|
-
>>> artifact.save()
|
2971
|
-
"""
|
2972
|
-
pass
|
2973
|
-
|
2974
|
-
@classmethod
|
2975
|
-
def from_tiledbsoma(
|
2976
|
-
cls,
|
2977
|
-
path: UPathStr,
|
2978
|
-
*,
|
2979
|
-
key: str | None = None,
|
2980
|
-
description: str | None = None,
|
2981
|
-
run: Run | None = None,
|
2982
|
-
revises: Artifact | None = None,
|
2983
|
-
**kwargs,
|
2984
|
-
) -> Artifact:
|
2985
|
-
"""Create from a tiledbsoma store.
|
2986
|
-
|
2987
|
-
Args:
|
2988
|
-
path: A tiledbsoma store with .tiledbsoma suffix.
|
2989
|
-
key: A relative path within default storage,
|
2990
|
-
e.g., `"myfolder/mystore.tiledbsoma"`.
|
2991
|
-
description: A description.
|
2992
|
-
revises: An old version of the artifact.
|
2993
|
-
run: The run that creates the artifact.
|
2994
|
-
|
2995
|
-
Examples:
|
2996
|
-
>>> artifact = ln.Artifact.from_tiledbsoma("s3://mybucket/store.tiledbsoma", description="a tiledbsoma store")
|
2997
|
-
>>> artifact.save()
|
2998
|
-
"""
|
2999
|
-
pass
|
3000
|
-
|
3001
|
-
@classmethod
|
3002
|
-
def from_dir(
|
3003
|
-
cls,
|
3004
|
-
path: UPathStr,
|
3005
|
-
*,
|
3006
|
-
key: str | None = None,
|
3007
|
-
run: Run | None = None,
|
3008
|
-
) -> list[Artifact]:
|
3009
|
-
"""Create a list of artifact objects from a directory.
|
3010
|
-
|
3011
|
-
Hint:
|
3012
|
-
If you have a high number of files (several 100k) and don't want to
|
3013
|
-
track them individually, create a single :class:`~lamindb.Artifact` via
|
3014
|
-
``Artifact(path)`` for them. See, e.g., :doc:`docs:rxrx`.
|
3015
|
-
|
3016
|
-
Args:
|
3017
|
-
path: Source path of folder.
|
3018
|
-
key: Key for storage destination. If `None` and
|
3019
|
-
directory is in a registered location, the inferred `key` will
|
3020
|
-
reflect the relative position. If `None` and directory is outside
|
3021
|
-
of a registered storage location, the inferred key defaults to `path.name`.
|
3022
|
-
run: A `Run` object.
|
3023
|
-
|
3024
|
-
Examples:
|
3025
|
-
>>> dir_path = ln.core.datasets.generate_cell_ranger_files("sample_001", ln.settings.storage)
|
3026
|
-
>>> artifacts = ln.Artifact.from_dir(dir_path)
|
3027
|
-
>>> ln.save(artifacts)
|
3028
|
-
"""
|
3029
|
-
pass
|
3030
|
-
|
3031
|
-
def replace(
|
3032
|
-
self,
|
3033
|
-
data: UPathStr | pd.DataFrame | AnnData | MuData,
|
3034
|
-
run: Run | None = None,
|
3035
|
-
format: str | None = None,
|
3036
|
-
) -> None:
|
3037
|
-
"""Replace artifact content.
|
3038
|
-
|
3039
|
-
Args:
|
3040
|
-
data: A file path.
|
3041
|
-
run: The run that created the artifact gets
|
3042
|
-
auto-linked if ``ln.track()`` was called.
|
3043
|
-
|
3044
|
-
Examples:
|
3045
|
-
Say we made a change to the content of an artifact, e.g., edited the image
|
3046
|
-
`paradisi05_laminopathic_nuclei.jpg`.
|
3047
|
-
|
3048
|
-
This is how we replace the old file in storage with the new file:
|
3049
|
-
|
3050
|
-
>>> artifact.replace("paradisi05_laminopathic_nuclei.jpg")
|
3051
|
-
>>> artifact.save()
|
3052
|
-
|
3053
|
-
Note that this neither changes the storage key nor the filename.
|
3054
|
-
|
3055
|
-
However, it will update the suffix if it changes.
|
3056
|
-
"""
|
3057
|
-
pass
|
3058
|
-
|
3059
|
-
def open(
|
3060
|
-
self, mode: str = "r", is_run_input: bool | None = None
|
3061
|
-
) -> (
|
3062
|
-
AnnDataAccessor
|
3063
|
-
| BackedAccessor
|
3064
|
-
| SOMACollection
|
3065
|
-
| SOMAExperiment
|
3066
|
-
| SOMAMeasurement
|
3067
|
-
| PyArrowDataset
|
3068
|
-
):
|
3069
|
-
"""Return a cloud-backed data object.
|
3070
|
-
|
3071
|
-
Works for `AnnData` (`.h5ad` and `.zarr`), generic `hdf5` and `zarr`,
|
3072
|
-
`tiledbsoma` objects (`.tiledbsoma`), `pyarrow` compatible formats.
|
3073
|
-
|
3074
|
-
Args:
|
3075
|
-
mode: can only be `"w"` (write mode) for `tiledbsoma` stores,
|
3076
|
-
otherwise should be always `"r"` (read-only mode).
|
3077
|
-
|
3078
|
-
Notes:
|
3079
|
-
For more info, see tutorial: :doc:`/arrays`.
|
3080
|
-
|
3081
|
-
Examples:
|
3082
|
-
|
3083
|
-
Read AnnData in backed mode from cloud:
|
3084
|
-
|
3085
|
-
>>> artifact = ln.Artifact.get(key="lndb-storage/pbmc68k.h5ad")
|
3086
|
-
>>> artifact.open()
|
3087
|
-
AnnDataAccessor object with n_obs × n_vars = 70 × 765
|
3088
|
-
constructed for the AnnData object pbmc68k.h5ad
|
3089
|
-
...
|
3090
|
-
"""
|
3091
|
-
pass
|
3092
|
-
|
3093
|
-
def load(self, is_run_input: bool | None = None, **kwargs) -> Any:
|
3094
|
-
"""Cache and load into memory.
|
3095
|
-
|
3096
|
-
See all :mod:`~lamindb.core.loaders`.
|
3097
|
-
|
3098
|
-
Examples:
|
3099
|
-
|
3100
|
-
Load a `DataFrame`-like artifact:
|
3101
|
-
|
3102
|
-
>>> artifact.load().head()
|
3103
|
-
sepal_length sepal_width petal_length petal_width iris_organism_code
|
3104
|
-
0 0.051 0.035 0.014 0.002 0
|
3105
|
-
1 0.049 0.030 0.014 0.002 0
|
3106
|
-
2 0.047 0.032 0.013 0.002 0
|
3107
|
-
3 0.046 0.031 0.015 0.002 0
|
3108
|
-
4 0.050 0.036 0.014 0.002 0
|
3109
|
-
|
3110
|
-
Load an `AnnData`-like artifact:
|
3111
|
-
|
3112
|
-
>>> artifact.load()
|
3113
|
-
AnnData object with n_obs × n_vars = 70 × 765
|
3114
|
-
|
3115
|
-
Fall back to :meth:`~lamindb.Artifact.cache` if no in-memory representation is configured:
|
3116
|
-
|
3117
|
-
>>> artifact.load()
|
3118
|
-
PosixPath('/home/runner/work/lamindb/lamindb/docs/guide/mydata/.lamindb/jb7BY5UJoQVGMUOKiLcn.jpg')
|
3119
|
-
"""
|
3120
|
-
pass
|
3121
|
-
|
3122
|
-
def cache(self, is_run_input: bool | None = None) -> Path:
|
3123
|
-
"""Download cloud artifact to local cache.
|
3124
|
-
|
3125
|
-
Follows synching logic: only caches an artifact if it's outdated in the local cache.
|
3126
|
-
|
3127
|
-
Returns a path to a locally cached on-disk object (say a `.jpg` file).
|
3128
|
-
|
3129
|
-
Examples:
|
3130
|
-
|
3131
|
-
Sync file from cloud and return the local path of the cache:
|
3132
|
-
|
3133
|
-
>>> artifact.cache()
|
3134
|
-
PosixPath('/home/runner/work/Caches/lamindb/lamindb-ci/lndb-storage/pbmc68k.h5ad')
|
3135
|
-
"""
|
3136
|
-
pass
|
3137
|
-
|
3138
|
-
def delete(
|
3139
|
-
self, permanent: bool | None = None, storage: bool | None = None
|
3140
|
-
) -> None:
|
3141
|
-
"""Trash or permanently delete.
|
3142
|
-
|
3143
|
-
A first call to `.delete()` puts an artifact into the trash (sets `_branch_code` to `-1`).
|
3144
|
-
A second call permanently deletes the artifact.
|
3145
|
-
If it is a folder artifact with multiple versions, deleting a non-latest version
|
3146
|
-
will not delete the underlying storage by default (if `storage=True` is not specified).
|
3147
|
-
Deleting the latest version will delete all the versions for folder artifacts.
|
3148
|
-
|
3149
|
-
FAQ: :doc:`docs:faq/storage`
|
3150
|
-
|
3151
|
-
Args:
|
3152
|
-
permanent: Permanently delete the artifact (skip trash).
|
3153
|
-
storage: Indicate whether you want to delete the artifact in storage.
|
3154
|
-
|
3155
|
-
Examples:
|
3156
|
-
|
3157
|
-
For an `Artifact` object `artifact`, call:
|
3158
|
-
|
3159
|
-
>>> artifact = ln.Artifact.filter(key="some.csv").one()
|
3160
|
-
>>> artifact.delete() # delete a single file artifact
|
3161
|
-
|
3162
|
-
>>> artifact = ln.Artifact.filter(key="some.tiledbsoma". is_latest=False).first()
|
3163
|
-
>>> artiact.delete() # delete an old version, the data will not be deleted
|
3164
|
-
|
3165
|
-
>>> artifact = ln.Artifact.filter(key="some.tiledbsoma". is_latest=True).one()
|
3166
|
-
>>> artiact.delete() # delete all versions, the data will be deleted or prompted for deletion.
|
3167
|
-
"""
|
3168
|
-
pass
|
3169
|
-
|
3170
|
-
def save(self, upload: bool | None = None, **kwargs) -> Artifact:
    """Save to database & storage.

    Args:
        upload: Trigger upload to cloud storage in instances with hybrid storage mode.

    Returns:
        The saved artifact (enables method chaining).

    Examples:
        >>> artifact = ln.Artifact("./myfile.csv", description="myfile")
        >>> artifact.save()
    """
    pass
|
3181
|
-
|
3182
|
-
def restore(self) -> None:
    """Restore from trash.

    Examples:

        For any `Artifact` object `artifact`, call:

        >>> artifact.restore()
    """
    pass
|
3192
|
-
|
3193
|
-
def describe(self) -> None:
    """Describe relations of record.

    Prints a human-readable summary; returns nothing.

    Examples:
        >>> artifact.describe()
    """
    pass
|
3200
|
-
|
3201
|
-
|
3202
|
-
class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
    """Collections of artifacts.

    Collections provide a simple way of versioning collections of artifacts.

    Args:
        artifacts: `list[Artifact]` A list of artifacts.
        key: `str` A file-path like key, analogous to the `key` parameter of `Artifact` and `Transform`.
        description: `str | None = None` A description.
        revises: `Collection | None = None` An old version of the collection.
        run: `Run | None = None` The run that creates the collection.
        meta: `Artifact | None = None` An artifact that defines metadata for the collection.
        reference: `str | None = None` A simple reference, e.g. an external ID or a URL.
        reference_type: `str | None = None` A way to indicate the type of the simple reference, e.g. `"url"`.

    See Also:
        :class:`~lamindb.Artifact`

    Examples:

        Create a collection from a list of :class:`~lamindb.Artifact` objects:

        >>> collection = ln.Collection([artifact1, artifact2], key="my_project/my_collection")

        Create a collection that groups a data & a metadata artifact (e.g., here :doc:`docs:rxrx`):

        >>> collection = ln.Collection(data_artifact, key="my_project/my_collection", meta=metadata_artifact)

    """

    class Meta(Record.Meta, IsVersioned.Meta, TracksRun.Meta, TracksUpdates.Meta):
        abstract = False

    _len_full_uid: int = 20
    _len_stem_uid: int = 16
    _name_field: str = "key"

    id: int = models.AutoField(primary_key=True)
    """Internal id, valid only in one DB instance."""
    uid: str = CharField(
        editable=False,
        unique=True,
        db_index=True,
        max_length=_len_full_uid,
        default=base62_20,
    )
    """Universal id, valid across DB instances."""
    key: str = CharField(db_index=True)
    """Name or path-like key."""
    # below is the only case in which we use a TextField
    # for description; we do so because users had descriptions exceeding 255 chars
    # in their instances
    description: str | None = TextField(null=True, db_index=True)
    """A description or title."""
    hash: str | None = CharField(
        max_length=HASH_LENGTH, db_index=True, null=True, unique=True
    )
    """Hash of collection content."""
    reference: str | None = CharField(max_length=255, db_index=True, null=True)
    """A reference like URL or external ID."""
    # also for reference_type here, we allow an extra long max_length
    reference_type: str | None = CharField(max_length=25, db_index=True, null=True)
    """Type of reference, e.g., cellxgene Census collection_id."""
    ulabels: ULabel = models.ManyToManyField(
        "ULabel", through="CollectionULabel", related_name="collections"
    )
    """ULabels sampled in the collection (see :class:`~lamindb.Feature`)."""
    run: Run | None = ForeignKey(
        Run, PROTECT, related_name="output_collections", null=True, default=None
    )
    """:class:`~lamindb.Run` that created the `collection`."""
    input_of_runs: Run = models.ManyToManyField(Run, related_name="input_collections")
    """Runs that use this collection as an input."""
    _previous_runs: Run = models.ManyToManyField(
        "Run", related_name="_output_collections_with_later_updates"
    )
    """Sequence of runs that created or updated the record."""
    artifacts: Artifact = models.ManyToManyField(
        "Artifact", related_name="collections", through="CollectionArtifact"
    )
    """Artifacts in collection."""
    meta_artifact: Artifact | None = OneToOneField(
        "Artifact",
        PROTECT,
        null=True,
        unique=True,
        related_name="_meta_of_collection",
    )
    """An artifact that stores metadata that indexes a collection.

    It has a 1:1 correspondence with an artifact. If needed, you can access the
    collection from the artifact via a private field:
    `artifact._meta_of_collection`.
    """
    _actions: Artifact = models.ManyToManyField(Artifact, related_name="+")
    """Actions to attach for the UI."""

    @overload
    def __init__(
        self,
        artifacts: list[Artifact],
        key: str,
        description: str | None = None,
        meta: Any | None = None,
        reference: str | None = None,
        reference_type: str | None = None,
        run: Run | None = None,
        revises: Collection | None = None,
    ): ...

    @overload
    def __init__(
        self,
        *db_args,
    ): ...

    def __init__(
        self,
        *args,
        **kwargs,
    ):
        pass

    def append(self, artifact: Artifact, run: Run | None = None) -> Collection:
        """Add an artifact to the collection.

        Creates a new version of the collection.
        This does not modify the original collection in-place, but returns a new version
        of the original collection with the added artifact.

        Args:
            artifact: An artifact to add to the collection.
            run: The run that creates the new version of the collection.

        Examples:
            >>> collection = ln.Collection(artifact, key="new collection")
            >>> collection.save()
            >>> collection = collection.append(another_artifact) # returns a new version
            >>> collection.save() # save the new version

        .. versionadded:: 0.76.14
        """
        pass

    def open(self, is_run_input: bool | None = None) -> PyArrowDataset:
        """Return a cloud-backed pyarrow Dataset.

        Works for `pyarrow` compatible formats.

        Notes:
            For more info, see tutorial: :doc:`/arrays`.
        """
        pass

    def mapped(
        self,
        layers_keys: str | list[str] | None = None,
        obs_keys: str | list[str] | None = None,
        obsm_keys: str | list[str] | None = None,
        obs_filter: dict[str, str | list[str]] | None = None,
        join: Literal["inner", "outer"] | None = "inner",
        encode_labels: bool | list[str] = True,
        unknown_label: str | dict[str, str] | None = None,
        cache_categories: bool = True,
        parallel: bool = False,
        dtype: str | None = None,
        stream: bool = False,
        is_run_input: bool | None = None,
    ) -> MappedCollection:
        """Return a map-style dataset.

        Returns a `pytorch map-style dataset
        <https://pytorch.org/docs/stable/data.html#map-style-datasets>`__ by
        virtually concatenating `AnnData` arrays.

        If your `AnnData` collection is in the cloud, move them into a local
        cache first via :meth:`~lamindb.Collection.cache`.

        `__getitem__` of the `MappedCollection` object takes a single integer index
        and returns a dictionary with the observation data sample for this index from
        the `AnnData` objects in the collection. The dictionary has keys for `layers_keys`
        (`.X` is in `"X"`), `obs_keys`, `obsm_keys` (under `f"obsm_{key}"`) and also `"_store_idx"`
        for the index of the `AnnData` object containing this observation sample.

        .. note::

            For a guide, see :doc:`docs:scrna-mappedcollection`.

            This method currently only works for collections of `AnnData` artifacts.

        Args:
            layers_keys: Keys from the ``.layers`` slot. ``layers_keys=None`` or ``"X"`` in the list
                retrieves ``.X``.
            obs_keys: Keys from the ``.obs`` slots.
            obsm_keys: Keys from the ``.obsm`` slots.
            obs_filter: Select only observations with these values for the given obs columns.
                Should be a dictionary with obs column names as keys
                and filtering values (a string or a list of strings) as values.
            join: `"inner"` or `"outer"` virtual joins. If ``None`` is passed,
                does not join.
            encode_labels: Encode labels into integers.
                Can be a list with elements from ``obs_keys``.
            unknown_label: Encode this label to -1.
                Can be a dictionary with keys from ``obs_keys`` if ``encode_labels=True``
                or from ``encode_labels`` if it is a list.
            cache_categories: Enable caching categories of ``obs_keys`` for faster access.
            parallel: Enable sampling with multiple processes.
            dtype: Convert numpy arrays from ``.X``, ``.layers`` and ``.obsm``
            stream: Whether to stream data from the array backend.
            is_run_input: Whether to track this collection as run input.

        Examples:
            >>> import lamindb as ln
            >>> from torch.utils.data import DataLoader
            >>> collection = ln.Collection.get(description="my collection")
            >>> mapped = collection.mapped(obs_keys=["cell_type", "batch"])
            >>> dl = DataLoader(mapped, batch_size=128, shuffle=True)
        """
        pass

    def cache(self, is_run_input: bool | None = None) -> list[UPath]:
        """Download cloud artifacts in collection to local cache.

        Follows synching logic: only caches outdated artifacts.

        Returns paths to locally cached on-disk artifacts.

        Args:
            is_run_input: Whether to track this collection as run input.
        """
        pass

    def load(
        self,
        join: Literal["inner", "outer"] = "outer",
        is_run_input: bool | None = None,
        **kwargs,
    ) -> Any:
        """Stage and load to memory.

        Returns in-memory representation if possible such as a concatenated `DataFrame` or `AnnData` object.
        """
        pass

    def delete(self, permanent: bool | None = None) -> None:
        """Delete collection.

        Args:
            permanent: Whether to permanently delete the collection record (skips trash).

        Examples:

            For any `Collection` object `collection`, call:

            >>> collection.delete()
        """
        pass

    def save(self, using: str | None = None) -> Collection:
        """Save the collection and underlying artifacts to database & storage.

        Args:
            using: The database to which you want to save.

        Examples:
            >>> collection = ln.Collection("./myfile.csv", key="myfile")
            >>> collection.save()
        """
        pass

    def restore(self) -> None:
        """Restore collection record from trash.

        Examples:

            For any `Collection` object `collection`, call:

            >>> collection.restore()
        """
        pass

    @property
    def transform(self) -> Transform | None:
        """Transform whose run created the collection."""
        return self.run.transform if self.run is not None else None

    @property
    def name(self) -> str:
        """Name of the collection.

        Splits `key` on `/` and returns the last element.
        """
        return self.key.split("/")[-1]

    @property
    def ordered_artifacts(self) -> QuerySet:
        """Ordered `QuerySet` of `.artifacts`.

        Accessing the many-to-many field `collection.artifacts` directly gives
        you non-deterministic order.

        Using the property `.ordered_artifacts` allows to iterate through a set
        that's ordered in the order of creation.
        """
        pass

    @property
    def data_artifact(self) -> Artifact | None:
        """Access to a single data artifact.

        If the collection has a single data & metadata artifact, this allows access via::

           collection.data_artifact  # first & only element of collection.artifacts
           collection.meta_artifact  # metadata

        """
        pass

    def describe(self) -> None:
        """Describe relations of record.

        Examples:
            >>> collection.describe()
        """
        pass
|
3527
|
-
|
3528
|
-
|
3529
|
-
# -------------------------------------------------------------------------------------
|
3530
|
-
# Project management
|
3531
|
-
|
3532
|
-
|
3533
|
-
class Person(Record, CanCurate, TracksRun, TracksUpdates, ValidateFields):
    """Persons.

    This registry is distinct from `User` and purely exists for project management.

    You'll soon be able to conveniently create persons from users.

    Example:
        >>> person = Person(
        ...     name="Jane Doe",
        ...     email="jane.doe@example.com",
        ...     external=False,
        ... ).save()
    """

    class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta):
        abstract = False

    id: int = models.AutoField(primary_key=True)
    """Internal id, valid only in one DB instance."""
    uid: str = CharField(
        editable=False, unique=True, max_length=8, db_index=True, default=base62_8
    )
    """Universal id, valid across DB instances."""
    name: str = CharField(db_index=True)
    """Name of the person (forename(s) lastname)."""
    email: str | None = EmailField(null=True, default=None)
    """Email of the person."""
    external: bool = BooleanField(default=True, db_index=True)
    """Whether the person is external to the organization."""
|
3563
|
-
|
3564
|
-
|
3565
|
-
class Project(Record, CanCurate, TracksRun, TracksUpdates, ValidateFields):
    """Projects.

    Example:
        >>> project = Project(
        ...     name="My Project Name",
        ...     abbr="MPN",
        ...     url="https://example.com/my_project",
        ... ).save()
    """

    class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta):
        abstract = False

    id: int = models.AutoField(primary_key=True)
    """Internal id, valid only in one DB instance."""
    uid: str = CharField(
        editable=False, unique=True, max_length=12, db_index=True, default=base62_12
    )
    """Universal id, valid across DB instances."""
    name: str = CharField(db_index=True)
    """Title or name of the Project."""
    type: Project | None = ForeignKey(
        "self", PROTECT, null=True, related_name="records"
    )
    """Type of project (e.g., 'Program', 'Project', 'GithubIssue', 'Task')."""
    records: Project
    """Records of this type."""
    is_type: bool = BooleanField(default=False, db_index=True, null=True)
    """Distinguish types from instances of the type."""
    abbr: str | None = CharField(max_length=32, db_index=True, null=True)
    """An abbreviation."""
    url: str | None = URLField(max_length=255, null=True, default=None)
    """A URL."""
    start_date: date | None = DateField(null=True, default=None)
    """Date of start of the project."""
    end_date: date | None = DateField(null=True, default=None)
    """Date of end of the project."""
    parents: Project = models.ManyToManyField(
        "self", symmetrical=False, related_name="children"
    )
    """Parent projects, the super-projects owning this project."""
    children: Project
    """Child projects, the sub-projects owned by this project.

    Reverse accessor for `.parents`.
    """
    predecessors: Project = models.ManyToManyField(
        "self", symmetrical=False, related_name="successors"
    )
    """The preceding projects required by this project."""
    successors: Project
    """The succeeding projects requiring this project.

    Reverse accessor for `.predecessors`.
    """
    people: Person = models.ManyToManyField(
        Person, through="PersonProject", related_name="projects"
    )
    """People associated with this project."""
    artifacts: Artifact = models.ManyToManyField(
        Artifact, through="ArtifactProject", related_name="projects"
    )
    """Artifacts associated with this Project."""
    transforms: Transform = models.ManyToManyField(
        Transform, through="TransformProject", related_name="projects"
    )
    """Transforms associated with this project."""
    ulabels: ULabel = models.ManyToManyField(
        ULabel, through="ULabelProject", related_name="projects"
    )
    """ULabels associated with this project."""
    features: Feature = models.ManyToManyField(
        Feature, through="FeatureProject", related_name="projects"
    )
    """Features associated with this project."""
    schemas: Schema = models.ManyToManyField(
        Schema, through="SchemaProject", related_name="projects"
    )
    """Schemas associated with this project."""
    collections: Collection = models.ManyToManyField(
        Collection, through="CollectionProject", related_name="projects"
    )
    """Collections associated with this project."""
    references: Reference = models.ManyToManyField("Reference", related_name="projects")
    """References associated with this project."""
    _status_code: int = models.SmallIntegerField(default=0, db_index=True)
    """Status code."""
|
3653
|
-
|
3654
|
-
|
3655
|
-
class Reference(Record, CanCurate, TracksRun, TracksUpdates, ValidateFields):
    """References such as internal studies, papers, documents, or URLs.

    Example:
        >>> reference = Reference(
        ...     name="A Paper Title",
        ...     abbr="APT",
        ...     url="https://doi.org/10.1000/xyz123",
        ...     pubmed_id=12345678,
        ...     doi="10.1000/xyz123",
        ...     description="Good paper.",
        ...     text="Some text I want to be searchable.",
        ...     date=date(2023, 11, 21),
        ... ).save()
    """

    class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta):
        abstract = False

    id: int = models.AutoField(primary_key=True)
    """Internal id, valid only in one DB instance."""
    uid: str = CharField(
        editable=False, unique=True, max_length=12, db_index=True, default=base62_12
    )
    """Universal id, valid across DB instances."""
    name: str = CharField(db_index=True)
    """Title or name of the reference document."""
    abbr: str | None = CharField(
        max_length=32,
        db_index=True,
        null=True,
    )
    """An abbreviation for the reference."""
    type: Reference | None = ForeignKey(
        "self", PROTECT, null=True, related_name="records"
    )
    """Type of reference (e.g., 'Study', 'Paper', 'Preprint').

    Allows to group reference by type, e.g., internal studies vs. all papers etc.
    """
    records: Reference
    """Records of this type."""
    is_type: bool = BooleanField(default=False, db_index=True, null=True)
    """Distinguish types from instances of the type."""
    url: str | None = URLField(null=True)
    """URL linking to the reference."""
    pubmed_id: int | None = BigIntegerField(null=True, db_index=True)
    """A PubMed ID."""
    doi: str | None = CharField(
        null=True,
        db_index=True,
        validators=[
            RegexValidator(
                regex=r"^(?:https?://(?:dx\.)?doi\.org/|doi:|DOI:)?10\.\d+/.*$",
                message="Must be a DOI (e.g., 10.1000/xyz123 or https://doi.org/10.1000/xyz123)",
            )
        ],
    )
    """Digital Object Identifier (DOI) for the reference."""
    description: str | None = CharField(null=True, db_index=True)
    """Description of the reference."""
    text: str | None = TextField(null=True)
    """Abstract or full text of the reference to make it searchable."""
    date: date | None = DateField(null=True, default=None)
    """Date of creation or publication of the reference."""
    authors: Person = models.ManyToManyField(Person, related_name="references")
    """All people associated with this reference."""
    artifacts: Artifact = models.ManyToManyField(
        Artifact, through="ArtifactReference", related_name="references"
    )
    """Artifacts associated with this reference."""
    transforms: Transform = models.ManyToManyField(
        Transform, through="TransformReference", related_name="references"
    )
    """Transforms associated with this reference."""
    collections: Collection = models.ManyToManyField(
        Collection, through="CollectionReference", related_name="references"
    )
    """Collections associated with this reference."""
|
3734
|
-
|
3735
|
-
|
3736
|
-
# -------------------------------------------------------------------------------------
|
3737
|
-
# Data models
|
3738
|
-
|
3739
|
-
from django.contrib.postgres.fields import JSONField # type: ignore
|
3740
|
-
from django.core.exceptions import ValidationError
|
3741
|
-
from django.db import models
|
3742
|
-
|
3743
|
-
|
3744
|
-
class DataMixin(models.Model):
    """Abstract mixin holding one typed value attached to a feature or a param.

    Each row stores exactly one value in one of the ``value_*`` fields and is
    attached to exactly one of ``feature`` / ``param``; both invariants are
    enforced in :meth:`clean`.
    """

    space: Space = ForeignKey(Space, PROTECT, default=1, db_default=1)
    feature = ForeignKey(
        Feature, null=True, blank=True, on_delete=models.CASCADE, related_name="+"
    )
    param = ForeignKey(
        Param, null=True, blank=True, on_delete=models.CASCADE, related_name="+"
    )
    row = IntegerField(help_text="Use -1 for result data")

    # Value fields
    value_int = models.BigIntegerField(null=True, blank=True)
    value_float = models.FloatField(null=True, blank=True)
    value_str = models.TextField(null=True, blank=True)
    value_datetime = models.DateTimeField(null=True, blank=True)
    value_ulabel = models.ForeignKey(
        ULabel, null=True, blank=True, on_delete=models.CASCADE, related_name="+"
    )
    value_person = models.ForeignKey(
        Person, null=True, blank=True, on_delete=models.CASCADE, related_name="+"
    )
    value_artifact = models.ForeignKey(
        Artifact, null=True, blank=True, on_delete=models.CASCADE, related_name="+"
    )
    value_collection = models.ForeignKey(
        Collection, null=True, blank=True, on_delete=models.CASCADE, related_name="+"
    )
    value_project = models.ForeignKey(
        Project, null=True, blank=True, on_delete=models.CASCADE, related_name="+"
    )
    value_json = models.JSONField(null=True, blank=True)

    class Meta:
        abstract = True

    def clean(self):
        """Validate that exactly one target and exactly one value are set.

        Raises:
            ValidationError: If neither or both of ``feature``/``param`` are
                set, or if the number of non-null ``value_*`` fields is not 1.
        """
        # Validate feature/param mutual exclusivity
        if (self.feature is not None) == (self.param is not None):
            raise ValidationError("Exactly one of feature or param must be set")

        # Validate value fields.
        # Fix: the list previously omitted value_person, value_collection and
        # value_project, so a row setting only one of those fields was wrongly
        # rejected with "Exactly one value field must be set".
        values = [
            self.value_int,
            self.value_float,
            self.value_str,
            self.value_datetime,
            self.value_ulabel,
            self.value_person,
            self.value_artifact,
            self.value_collection,
            self.value_project,
            self.value_json,
        ]
        non_null_count = sum(1 for v in values if v is not None)

        if non_null_count != 1:
            raise ValidationError("Exactly one value field must be set")
|
3798
|
-
|
3799
|
-
|
3800
|
-
class RunData(BasicRecord, DataMixin):
    """Flexible per-run data rows; one value per (run, row, feature-or-param)."""

    run = models.ForeignKey("Run", on_delete=models.CASCADE, related_name="_rundata")

    class Meta:
        constraints = [
            # DB-level counterpart of DataMixin.clean(): exactly one of
            # feature/param must be set.
            models.CheckConstraint(
                condition=(
                    models.Q(feature__isnull=False, param__isnull=True)
                    | models.Q(feature__isnull=True, param__isnull=False)
                ),
                name="run_data_feature_param_mutex",
            ),
            models.UniqueConstraint(
                fields=["run", "row", "feature", "param"], name="run_data_unique"
            ),
        ]
        indexes = [
            models.Index(fields=["run", "row"]),
            models.Index(fields=["feature"]),
            models.Index(fields=["param"]),
        ]
|
3821
|
-
|
3822
|
-
|
3823
|
-
class FlexTable(Record, TracksRun, TracksUpdates):
    """A flexible, schema-optional table whose rows live in `FlexTableData`."""

    uid: str = CharField(
        editable=False, unique=True, max_length=12, db_index=True, default=base62_12
    )
    name = CharField()
    schema: Schema | None = ForeignKey(
        Schema, null=True, on_delete=models.SET_NULL, related_name="_tidytables"
    )
    type: FlexTable | None = ForeignKey(
        "self", PROTECT, null=True, related_name="records"
    )
    """Type of tidy table, e.g., `Cell`, `SampleSheet`, etc."""
    records: ULabel
    """Records of this type."""
    is_type: bool = BooleanField(default=False, db_index=True, null=True)
    """Distinguish types from instances of the type."""
    description: str = CharField(null=True, db_index=True)
    """A description."""
    # NOTE(review): the annotation `Project` on `ulabels` looks like a
    # copy-paste from `projects` — the field itself links ULabel; confirm.
    projects: Project = ManyToManyField(Project, related_name="_tidytables")
    ulabels: Project = ManyToManyField(ULabel, related_name="_tidytables")

    class Meta:
        indexes = [models.Index(fields=["uid"]), models.Index(fields=["name"])]
|
3846
|
-
|
3847
|
-
|
3848
|
-
class FlexTableData(BasicRecord, DataMixin):
    """Data rows of a `FlexTable`; one value per (table, row, feature-or-param)."""

    tidytable = models.ForeignKey(
        FlexTable, on_delete=models.CASCADE, related_name="data"
    )

    class Meta:
        constraints = [
            # DB-level counterpart of DataMixin.clean(): exactly one of
            # feature/param must be set.
            models.CheckConstraint(
                condition=(
                    models.Q(feature__isnull=False, param__isnull=True)
                    | models.Q(feature__isnull=True, param__isnull=False)
                ),
                name="tidy_table_data_feature_param_mutex",
            ),
            models.UniqueConstraint(
                fields=["tidytable", "row", "feature", "param"],
                name="tidy_table_data_unique",
            ),
        ]
        indexes = [
            models.Index(fields=["tidytable", "row"]),
            models.Index(fields=["feature"]),
            models.Index(fields=["param"]),
        ]
|
3872
|
-
|
3873
|
-
|
3874
|
-
# -------------------------------------------------------------------------------------
|
3875
|
-
# Link models
|
3876
|
-
|
3877
|
-
|
3878
|
-
class LinkORM:
    """Marker base class for many-to-many link (through) models."""

    pass
|
3880
|
-
|
3881
|
-
|
3882
|
-
class SchemaFeature(BasicRecord, LinkORM):
    """Link model between `Schema` and `Feature`."""

    id: int = models.BigAutoField(primary_key=True)
    schema: Schema = ForeignKey(Schema, CASCADE, related_name="links_feature")
    feature: Feature = ForeignKey(Feature, PROTECT, related_name="links_schema")

    class Meta:
        unique_together = ("schema", "feature")
|
3889
|
-
|
3890
|
-
|
3891
|
-
class SchemaParam(BasicRecord, LinkORM):
    """Link model between `Schema` and `Param`."""

    id: int = models.BigAutoField(primary_key=True)
    schema: Schema = ForeignKey(Schema, CASCADE, related_name="+")
    param: Param = ForeignKey(Param, PROTECT, related_name="+")

    class Meta:
        unique_together = ("schema", "param")
|
3898
|
-
|
3899
|
-
|
3900
|
-
class ArtifactSchema(BasicRecord, LinkORM, TracksRun):
    """Link model between `Artifact` and `Schema`, optionally scoped by slot."""

    id: int = models.BigAutoField(primary_key=True)
    artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="_links_schema")
    schema: Schema = ForeignKey(Schema, PROTECT, related_name="_links_artifact")
    slot: str | None = CharField(null=True)
    feature_ref_is_semantic: bool | None = BooleanField(null=True)

    class Meta:
        unique_together = (("artifact", "schema"), ("artifact", "slot"))
|
3909
|
-
|
3910
|
-
|
3911
|
-
class SchemaComponent(BasicRecord, LinkORM, TracksRun):
    """Link model nesting a component `Schema` inside a composite `Schema`."""

    id: int = models.BigAutoField(primary_key=True)
    composite: Schema = ForeignKey(Schema, CASCADE, related_name="links_composite")
    component: Schema = ForeignKey(Schema, PROTECT, related_name="links_component")
    slot: str | None = CharField(null=True)

    class Meta:
        unique_together = (("composite", "component"), ("composite", "slot"))
|
3919
|
-
|
3920
|
-
|
3921
|
-
class CollectionArtifact(BasicRecord, LinkORM, TracksRun):
    """Link model between `Collection` and `Artifact`."""

    id: int = models.BigAutoField(primary_key=True)
    collection: Collection = ForeignKey(
        Collection, CASCADE, related_name="links_artifact"
    )
    artifact: Artifact = ForeignKey(Artifact, PROTECT, related_name="links_collection")

    class Meta:
        unique_together = ("collection", "artifact")
|
3930
|
-
|
3931
|
-
|
3932
|
-
class ArtifactULabel(BasicRecord, LinkORM, TracksRun):
    """Link model between `Artifact` and `ULabel`, optionally scoped by feature."""

    id: int = models.BigAutoField(primary_key=True)
    artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="links_ulabel")
    ulabel: ULabel = ForeignKey(ULabel, PROTECT, related_name="links_artifact")
    feature: Feature | None = ForeignKey(
        Feature, PROTECT, null=True, related_name="links_artifactulabel", default=None
    )
    label_ref_is_name: bool | None = BooleanField(null=True)
    feature_ref_is_name: bool | None = BooleanField(null=True)

    class Meta:
        # can have the same label linked to the same artifact if the feature is
        # different
        unique_together = ("artifact", "ulabel", "feature")
|
3946
|
-
|
3947
|
-
|
3948
|
-
class TransformULabel(BasicRecord, LinkORM, TracksRun):
    """Link table between Transform and ULabel."""

    id: int = models.BigAutoField(primary_key=True)
    transform: Transform = ForeignKey(Transform, CASCADE, related_name="links_ulabel")
    ulabel: ULabel = ForeignKey(ULabel, PROTECT, related_name="links_transform")

    class Meta:
        unique_together = ("transform", "ulabel")
|
3955
|
-
|
3956
|
-
|
3957
|
-
class RunULabel(BasicRecord, LinkORM):
    """Link table between Run and ULabel.

    Declares ``created_at``/``created_by`` inline because, unlike most link
    models here, it does not inherit them from ``TracksRun``.
    """

    id: int = models.BigAutoField(primary_key=True)
    run: Run = ForeignKey(Run, CASCADE, related_name="links_ulabel")
    ulabel: ULabel = ForeignKey(ULabel, PROTECT, related_name="links_run")
    created_at: datetime = DateTimeField(
        editable=False, db_default=models.functions.Now(), db_index=True
    )
    """Time of creation of record."""
    created_by: User = ForeignKey(
        "lamindb.User", PROTECT, default=current_user_id, related_name="+"
    )
    """Creator of record."""

    class Meta:
        unique_together = ("run", "ulabel")
|
3972
|
-
|
3973
|
-
|
3974
|
-
class CollectionULabel(BasicRecord, LinkORM, TracksRun):
    """Link table between Collection and ULabel, optionally qualified by a Feature."""

    id: int = models.BigAutoField(primary_key=True)
    collection: Collection = ForeignKey(
        Collection, CASCADE, related_name="links_ulabel"
    )
    ulabel: ULabel = ForeignKey(ULabel, PROTECT, related_name="links_collection")
    feature: Feature | None = ForeignKey(
        Feature, PROTECT, null=True, related_name="links_collectionulabel", default=None
    )
    label_ref_is_name: bool | None = BooleanField(null=True)
    feature_ref_is_name: bool | None = BooleanField(null=True)

    class Meta:
        # NOTE(review): unlike ArtifactULabel, feature is not part of the
        # uniqueness constraint here
        unique_together = ("collection", "ulabel")
|
3988
|
-
|
3989
|
-
|
3990
|
-
class ArtifactFeatureValue(BasicRecord, LinkORM, TracksRun):
    """Link table between Artifact and FeatureValue."""

    id: int = models.BigAutoField(primary_key=True)
    artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="+")
    # we follow the lower() case convention rather than snake case for link models
    featurevalue = ForeignKey(FeatureValue, PROTECT, related_name="+")

    class Meta:
        unique_together = ("artifact", "featurevalue")
|
3998
|
-
|
3999
|
-
|
4000
|
-
class RunParamValue(BasicRecord, LinkORM):
    """Link table between Run and ParamValue.

    Declares ``created_at``/``created_by`` inline because it does not inherit
    them from ``TracksRun``.
    """

    id: int = models.BigAutoField(primary_key=True)
    run: Run = ForeignKey(Run, CASCADE, related_name="+")
    # we follow the lower() case convention rather than snake case for link models
    paramvalue: ParamValue = ForeignKey(ParamValue, PROTECT, related_name="+")
    created_at: datetime = DateTimeField(
        editable=False, db_default=models.functions.Now(), db_index=True
    )
    """Time of creation of record."""
    created_by: User = ForeignKey(
        "lamindb.User", PROTECT, default=current_user_id, related_name="+"
    )
    """Creator of record."""

    class Meta:
        unique_together = ("run", "paramvalue")
|
4016
|
-
|
4017
|
-
|
4018
|
-
class ArtifactParamValue(BasicRecord, LinkORM, TracksRun):
    """Link table between Artifact and ParamValue."""

    id: int = models.BigAutoField(primary_key=True)
    artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="+")
    # we follow the lower() case convention rather than snake case for link models
    paramvalue: ParamValue = ForeignKey(ParamValue, PROTECT, related_name="+")

    class Meta:
        unique_together = ("artifact", "paramvalue")
|
4026
|
-
|
4027
|
-
|
4028
|
-
# -------------------------------------------------------------------------------------
|
4029
|
-
# Link models for project management
|
4030
|
-
|
4031
|
-
|
4032
|
-
class ArtifactProject(BasicRecord, LinkORM, TracksRun):
    """Link table between Artifact and Project, optionally qualified by a Feature."""

    id: int = models.BigAutoField(primary_key=True)
    artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="links_project")
    project: Project = ForeignKey(Project, PROTECT, related_name="links_artifact")
    feature: Feature | None = ForeignKey(
        Feature,
        PROTECT,
        null=True,
        default=None,
        related_name="links_artifactproject",
    )
    label_ref_is_name: bool | None = BooleanField(null=True, default=None)
    feature_ref_is_name: bool | None = BooleanField(null=True, default=None)

    class Meta:
        # can have the same label linked to the same artifact if the feature is different
        unique_together = ("artifact", "project", "feature")
|
4049
|
-
|
4050
|
-
|
4051
|
-
class TransformProject(BasicRecord, LinkORM, TracksRun):
    """Link table between Transform and Project."""

    id: int = models.BigAutoField(primary_key=True)
    transform: Transform = ForeignKey(Transform, CASCADE, related_name="links_project")
    project: Project = ForeignKey(Project, PROTECT, related_name="links_transform")

    class Meta:
        unique_together = ("transform", "project")
|
4058
|
-
|
4059
|
-
|
4060
|
-
class CollectionProject(BasicRecord, LinkORM, TracksRun):
    """Link table between Collection and Project."""

    id: int = models.BigAutoField(primary_key=True)
    collection: Collection = ForeignKey(
        Collection, CASCADE, related_name="links_project"
    )
    project: Project = ForeignKey(Project, PROTECT, related_name="links_collection")

    class Meta:
        unique_together = ("collection", "project")
|
4069
|
-
|
4070
|
-
|
4071
|
-
class ULabelProject(BasicRecord, LinkORM, TracksRun):
    """Link table between ULabel and Project."""

    id: int = models.BigAutoField(primary_key=True)
    # fix: the annotation previously read `Transform`, but the FK targets ULabel
    ulabel: ULabel = ForeignKey(ULabel, CASCADE, related_name="links_project")
    project: Project = ForeignKey(Project, PROTECT, related_name="links_ulabel")

    class Meta:
        unique_together = ("ulabel", "project")
|
4078
|
-
|
4079
|
-
|
4080
|
-
class PersonProject(BasicRecord, LinkORM, TracksRun):
    """Link table between Person and Project with an optional role string."""

    id: int = models.BigAutoField(primary_key=True)
    # fix: the annotation previously read `Transform`, but the FK targets Person
    person: Person = ForeignKey(Person, CASCADE, related_name="links_project")
    project: Project = ForeignKey(Project, PROTECT, related_name="links_person")
    role: str | None = CharField(null=True, default=None)

    class Meta:
        unique_together = ("person", "project")
|
4088
|
-
|
4089
|
-
|
4090
|
-
class FeatureProject(BasicRecord, LinkORM, TracksRun):
    """Link table between Feature and Project."""

    id: int = models.BigAutoField(primary_key=True)
    feature: Feature = ForeignKey(Feature, CASCADE, related_name="links_project")
    project: Project = ForeignKey(Project, PROTECT, related_name="links_feature")

    class Meta:
        unique_together = ("feature", "project")
|
4097
|
-
|
4098
|
-
|
4099
|
-
class SchemaProject(BasicRecord, LinkORM, TracksRun):
    """Link table between Schema and Project."""

    id: int = models.BigAutoField(primary_key=True)
    schema: Schema = ForeignKey(Schema, CASCADE, related_name="links_project")
    project: Project = ForeignKey(Project, PROTECT, related_name="links_schema")

    class Meta:
        unique_together = ("schema", "project")
|
4106
|
-
|
4107
|
-
|
4108
|
-
class ArtifactReference(BasicRecord, LinkORM, TracksRun):
    """Link table between Artifact and Reference, optionally qualified by a Feature."""

    id: int = models.BigAutoField(primary_key=True)
    artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="links_reference")
    reference: Reference = ForeignKey(Reference, PROTECT, related_name="links_artifact")
    feature: Feature | None = ForeignKey(
        Feature,
        PROTECT,
        null=True,
        default=None,
        related_name="links_artifactreference",
    )
    label_ref_is_name: bool | None = BooleanField(null=True, default=None)
    feature_ref_is_name: bool | None = BooleanField(null=True, default=None)

    class Meta:
        # can have the same label linked to the same artifact if the feature is different
        unique_together = ("artifact", "reference", "feature")
|
4125
|
-
|
4126
|
-
|
4127
|
-
class TransformReference(BasicRecord, LinkORM, TracksRun):
    """Link table between Transform and Reference."""

    id: int = models.BigAutoField(primary_key=True)
    transform: Transform = ForeignKey(
        Transform, CASCADE, related_name="links_reference"
    )
    reference: Reference = ForeignKey(
        Reference, PROTECT, related_name="links_transform"
    )

    class Meta:
        unique_together = ("transform", "reference")
|
4138
|
-
|
4139
|
-
|
4140
|
-
class CollectionReference(BasicRecord, LinkORM, TracksRun):
    """Link table between Collection and Reference."""

    id: int = models.BigAutoField(primary_key=True)
    collection: Collection = ForeignKey(
        Collection, CASCADE, related_name="links_reference"
    )
    reference: Reference = ForeignKey(
        Reference, PROTECT, related_name="links_collection"
    )

    class Meta:
        unique_together = ("collection", "reference")
|
4151
|
-
|
4152
|
-
|
4153
|
-
class Migration(BasicRecord):
    """Read-only ORM view onto Django's ``django_migrations`` bookkeeping table."""

    app = CharField(max_length=255)
    name = CharField(max_length=255)
    applied: datetime = DateTimeField()

    class Meta:
        db_table = "django_migrations"
        # unmanaged: Django must never create, migrate, or delete this table
        managed = False
|
4161
|
-
|
4162
|
-
|
4163
|
-
# -------------------------------------------------------------------------------------
|
4164
|
-
# Low-level logic needed in lamindb-setup
|
4165
|
-
|
4166
|
-
# Below is needed within lnschema-core because lamindb-setup already performs
|
4167
|
-
# some logging
|
4168
|
-
|
4169
|
-
|
4170
|
-
def format_field_value(value: datetime | str | Any) -> Any:
    """Format a field value for display in a record repr.

    Datetimes are rendered as ``YYYY-MM-DD HH:MM:SS TZ`` (unquoted); strings
    are returned wrapped in single quotes, with ISO-format date strings first
    normalized to the same datetime layout; any other value passes through
    unchanged.
    """
    from datetime import datetime

    if isinstance(value, datetime):
        return value.strftime("%Y-%m-%d %H:%M:%S %Z")
    if not isinstance(value, str):
        return value
    try:
        # normalize ISO date strings to the same layout used for datetimes
        value = datetime.fromisoformat(value).strftime("%Y-%m-%d %H:%M:%S %Z")
    except ValueError:
        pass
    return f"'{value}'"
|
4185
|
-
|
4186
|
-
|
4187
|
-
class RegistryInfo:
    """Introspects a registry's Django ``_meta`` to render its fields for display."""

    def __init__(self, registry: Registry):
        self.registry = registry

    def _get_type_for_field(self, field_name: str) -> str:
        # prefer the related model's class name; fall back to the Django
        # internal type (e.g. "CharField") for non-relational fields
        field = self.registry._meta.get_field(field_name)
        related_model_name = (
            field.related_model.__name__
            if hasattr(field, "related_model") and field.related_model
            else None
        )
        return related_model_name if related_model_name else field.get_internal_type()

    def _get_base_class_fields(self) -> list[str]:
        # names of all fields contributed by concrete base classes
        return [
            field.name
            for base in self.registry.__bases__
            if hasattr(base, "_meta")
            for field in base._meta.get_fields()
        ]

    def _reorder_fields_by_class(self, fields_to_order: list[Field]) -> list[Field]:
        """Reorders the fields so that base class fields come last."""
        non_base_class_fields = [
            field
            for field in fields_to_order
            if field.name not in self._get_base_class_fields()
        ]
        found_base_class_fields = [
            field
            for field in fields_to_order
            if field.name in self._get_base_class_fields()
        ]
        return non_base_class_fields + found_base_class_fields

    def get_simple_fields(self, return_str: bool = False) -> Any:
        """Return non-relational, public fields; as a list or formatted string."""
        simple_fields = [
            field
            for field in self.registry._meta.get_fields()
            if not (
                isinstance(field, ManyToOneRel)
                or isinstance(field, ManyToManyRel)
                or isinstance(field, ManyToManyField)
                or isinstance(field, ForeignKey)
                or field.name.startswith("_")
                or field.name == "id"
            )
        ]
        simple_fields = self._reorder_fields_by_class(simple_fields)
        if not return_str:
            return simple_fields
        else:
            repr_str = f" {colors.italic('Simple fields')}\n"
            if simple_fields:
                repr_str += "".join(
                    [
                        f" .{field_name.name}: {self._get_type_for_field(field_name.name)}\n"
                        for field_name in simple_fields
                    ]
                )
            return repr_str

    def get_relational_fields(self, return_str: bool = False):
        """Return relational fields split into core-module and external-module groups.

        With ``return_str=False`` returns ``(core_fields, external_by_module)``;
        otherwise a formatted string.
        """
        # we ignore ManyToOneRel because it leads to so much clutter in the API
        # also note that our general guideline is to have related_name="+"
        # for ForeignKey fields
        relational_fields = (ManyToOneRel, ManyToManyRel, ManyToManyField, ForeignKey)

        class_specific_relational_fields = [
            field
            for field in self.registry._meta.fields + self.registry._meta.many_to_many
            if isinstance(field, relational_fields)
            and not field.name.startswith(("links_", "_"))
        ]

        non_class_specific_relational_fields = [
            field
            for field in self.registry._meta.get_fields()
            if isinstance(field, relational_fields)
            and not field.name.startswith(("links_", "_"))
        ]
        non_class_specific_relational_fields = self._reorder_fields_by_class(
            non_class_specific_relational_fields
        )

        # Ensure that class specific fields (e.g. Artifact) come before non-class specific fields (e.g. collection)
        filtered_non_class_specific = [
            field
            for field in non_class_specific_relational_fields
            if field not in class_specific_relational_fields
        ]
        ordered_relational_fields = (
            class_specific_relational_fields + filtered_non_class_specific
        )

        core_module_fields = []
        external_modules_fields = []
        for field in ordered_relational_fields:
            # NOTE(review): parses the module path out of the field's repr();
            # fragile if Django ever changes the repr format
            field_name = repr(field).split(": ")[1][:-1]
            if field_name.count(".") == 1 and "lamindb" not in field_name:
                external_modules_fields.append(field)
            else:
                core_module_fields.append(field)

        def _get_related_field_type(field) -> str:
            # display name for the related model, falling back to the field
            # type when stripping leaves an empty string
            field_type = (
                field.related_model.__get_name_with_module__()
                .replace(
                    "Artifact", ""
                )  # some fields have an unnecessary 'Artifact' in their name
                .replace(
                    "Collection", ""
                )  # some fields have an unnecessary 'Collection' in their name
            )
            return (
                self._get_type_for_field(field.name)
                if not field_type.strip()
                else field_type
            )

        core_module_fields_formatted = [
            f" .{field.name}: {_get_related_field_type(field)}\n"
            for field in core_module_fields
        ]
        external_modules_fields_formatted = [
            f" .{field.name}: {_get_related_field_type(field)}\n"
            for field in external_modules_fields
        ]

        if not return_str:
            # group external fields by their module prefix (e.g. "bionty")
            external_modules_fields_by_modules = defaultdict(list)
            for field_str, field in zip(
                external_modules_fields_formatted, external_modules_fields
            ):
                field_type = field_str.split(":")[1].split()[0]
                module_name = field_type.split(".")[0]
                external_modules_fields_by_modules[module_name].append(field)
            return core_module_fields, external_modules_fields_by_modules
        else:
            repr_str = ""

            # Non-external relational fields
            if core_module_fields:
                repr_str += f" {colors.italic('Relational fields')}\n"
                repr_str += "".join(core_module_fields_formatted)

            # External relational fields
            external_modules = set()
            for field in external_modules_fields_formatted:
                field_type = field.split(":")[1].split()[0]
                external_modules.add(field_type.split(".")[0])

            if external_modules:
                # We want Bionty to show up before other modules
                external_modules = (
                    ["bionty"] + sorted(external_modules - {"bionty"})  # type: ignore
                    if "bionty" in external_modules
                    else sorted(external_modules)
                )
                for ext_module in external_modules:
                    ext_module_fields = [
                        field
                        for field in external_modules_fields_formatted
                        if ext_module in field
                    ]

                    if ext_module_fields:
                        repr_str += (
                            f" {colors.italic(f'{ext_module.capitalize()} fields')}\n"
                        )
                        repr_str += "".join(ext_module_fields)

            return repr_str
|
4360
|
-
|
4361
|
-
|
4362
|
-
def registry_repr(cls):
    """Shows fields."""
    info = RegistryInfo(cls)
    parts = [
        f"{colors.green(cls.__name__)}\n",
        info.get_simple_fields(return_str=True),
        info.get_relational_fields(return_str=True),
    ]
    return "".join(parts).rstrip("\n")
|
4370
|
-
|
4371
|
-
|
4372
|
-
def record_repr(
    self: Record, include_foreign_keys: bool = True, exclude_field_names=None
) -> str:
    """Build a ``ClassName(field=value, ...)`` repr for a record.

    Foreign keys are rendered via their ``*_id`` columns; ``uid`` is moved to
    the front and ``created_at`` to the back; ``None`` values are omitted.
    """
    excluded = (
        ["id", "updated_at", "source_code"]
        if exclude_field_names is None
        else exclude_field_names
    )
    field_names = [
        f.name
        for f in self._meta.fields
        if (not isinstance(f, ForeignKey) and f.name not in excluded)
    ]
    if include_foreign_keys:
        field_names += [
            f"{f.name}_id" for f in self._meta.fields if isinstance(f, ForeignKey)
        ]
    # created_at always goes last, uid always goes first
    if "created_at" in field_names:
        field_names.remove("created_at")
        field_names.append("created_at")
    if field_names[0] != "uid" and "uid" in field_names:
        field_names.remove("uid")
        field_names.insert(0, "uid")
    formatted = {}
    for name in field_names:
        if name.startswith("_") or not hasattr(self, name):
            continue
        value = getattr(self, name)
        if name == "version" and value:
            # force-strip the time component of the version
            formatted[name] = f"'{str(value).split()[0]}'"
        else:
            formatted[name] = format_field_value(value)
    joined = ", ".join(
        f"{name}={val}" for name, val in formatted.items() if val is not None
    )
    return f"{self.__class__.__name__}({joined})"
|
4407
|
-
|
4408
|
-
|
4409
|
-
# below is code to further format the repr of a record
|
4410
|
-
#
|
4411
|
-
# def format_repr(
|
4412
|
-
# record: Record, exclude_field_names: str | list[str] | None = None
|
4413
|
-
# ) -> str:
|
4414
|
-
# if isinstance(exclude_field_names, str):
|
4415
|
-
# exclude_field_names = [exclude_field_names]
|
4416
|
-
# exclude_field_names_init = ["id", "created_at", "updated_at"]
|
4417
|
-
# if exclude_field_names is not None:
|
4418
|
-
# exclude_field_names_init += exclude_field_names
|
4419
|
-
# return record.__repr__(
|
4420
|
-
# include_foreign_keys=False, exclude_field_names=exclude_field_names_init
|
4421
|
-
# )
|
4422
|
-
|
4423
|
-
|
4424
|
-
# install the custom repr on all records; __str__ mirrors __repr__
Record.__repr__ = record_repr  # type: ignore
Record.__str__ = record_repr  # type: ignore
|
4426
|
-
|
4427
|
-
|
4428
|
-
def deferred_attribute__repr__(self):
    """Render a deferred field reference as ``FieldAttr(Model.field)``."""
    model_name = self.field.model.__name__
    return f"FieldAttr({model_name}.{self.field.name})"
|
4430
|
-
|
4431
|
-
|
4432
|
-
# make Django's DeferredAttribute render as FieldAttr(Model.field)
FieldAttr.__repr__ = deferred_attribute__repr__  # type: ignore
# backward compatibility: legacy names kept so old imports keep working
CanValidate = CanCurate
FeatureSet = Schema
|