lamindb 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +33 -26
- lamindb/_finish.py +9 -1
- lamindb/_tracked.py +26 -3
- lamindb/_view.py +2 -3
- lamindb/base/__init__.py +1 -1
- lamindb/base/ids.py +1 -10
- lamindb/base/users.py +1 -4
- lamindb/core/__init__.py +7 -65
- lamindb/core/_compat.py +60 -0
- lamindb/core/_context.py +50 -22
- lamindb/core/_mapped_collection.py +4 -2
- lamindb/core/_settings.py +6 -6
- lamindb/core/_sync_git.py +1 -1
- lamindb/core/_track_environment.py +2 -1
- lamindb/core/datasets/_small.py +3 -3
- lamindb/core/loaders.py +43 -20
- lamindb/core/storage/_anndata_accessor.py +8 -3
- lamindb/core/storage/_backed_access.py +14 -7
- lamindb/core/storage/_pyarrow_dataset.py +24 -9
- lamindb/core/storage/_tiledbsoma.py +8 -6
- lamindb/core/storage/_zarr.py +104 -25
- lamindb/core/storage/objects.py +63 -28
- lamindb/core/storage/paths.py +16 -13
- lamindb/core/types.py +10 -0
- lamindb/curators/__init__.py +176 -149
- lamindb/errors.py +1 -1
- lamindb/integrations/_vitessce.py +4 -4
- lamindb/migrations/0089_subsequent_runs.py +159 -0
- lamindb/migrations/0090_runproject_project_runs.py +73 -0
- lamindb/migrations/{0088_squashed.py → 0090_squashed.py} +245 -177
- lamindb/models/__init__.py +79 -0
- lamindb/{core → models}/_describe.py +3 -3
- lamindb/{core → models}/_django.py +8 -5
- lamindb/{core → models}/_feature_manager.py +103 -87
- lamindb/{_from_values.py → models/_from_values.py} +5 -2
- lamindb/{core/versioning.py → models/_is_versioned.py} +94 -6
- lamindb/{core → models}/_label_manager.py +10 -17
- lamindb/{core/relations.py → models/_relations.py} +8 -1
- lamindb/models/artifact.py +2602 -0
- lamindb/{_can_curate.py → models/can_curate.py} +349 -180
- lamindb/models/collection.py +683 -0
- lamindb/models/core.py +135 -0
- lamindb/models/feature.py +643 -0
- lamindb/models/flextable.py +163 -0
- lamindb/{_parents.py → models/has_parents.py} +55 -49
- lamindb/models/project.py +384 -0
- lamindb/{_query_manager.py → models/query_manager.py} +10 -8
- lamindb/{_query_set.py → models/query_set.py} +64 -32
- lamindb/models/record.py +1762 -0
- lamindb/models/run.py +563 -0
- lamindb/{_save.py → models/save.py} +18 -8
- lamindb/models/schema.py +732 -0
- lamindb/models/transform.py +360 -0
- lamindb/models/ulabel.py +249 -0
- {lamindb-1.1.0.dist-info → lamindb-1.2.0.dist-info}/METADATA +6 -6
- lamindb-1.2.0.dist-info/RECORD +95 -0
- lamindb/_artifact.py +0 -1361
- lamindb/_collection.py +0 -440
- lamindb/_feature.py +0 -316
- lamindb/_is_versioned.py +0 -40
- lamindb/_record.py +0 -1065
- lamindb/_run.py +0 -60
- lamindb/_schema.py +0 -347
- lamindb/_storage.py +0 -15
- lamindb/_transform.py +0 -170
- lamindb/_ulabel.py +0 -56
- lamindb/_utils.py +0 -9
- lamindb/base/validation.py +0 -63
- lamindb/core/_data.py +0 -491
- lamindb/core/fields.py +0 -12
- lamindb/models.py +0 -4435
- lamindb-1.1.0.dist-info/RECORD +0 -95
- {lamindb-1.1.0.dist-info → lamindb-1.2.0.dist-info}/LICENSE +0 -0
- {lamindb-1.1.0.dist-info → lamindb-1.2.0.dist-info}/WHEEL +0 -0
lamindb/curators/__init__.py
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
"""Curators.
|
2
2
|
|
3
|
+
.. versionadded:: 1.1.0
|
4
|
+
|
3
5
|
.. autosummary::
|
4
6
|
:toctree: .
|
5
7
|
|
@@ -7,12 +9,23 @@
|
|
7
9
|
DataFrameCurator
|
8
10
|
AnnDataCurator
|
9
11
|
|
12
|
+
CatManager:
|
13
|
+
|
14
|
+
.. autosummary::
|
15
|
+
:toctree: .
|
16
|
+
|
17
|
+
CatManager
|
18
|
+
DataFrameCatManager
|
19
|
+
AnnDataCatManager
|
20
|
+
MuDataCatManager
|
21
|
+
TiledbsomaCatManager
|
22
|
+
CurateLookup
|
23
|
+
|
10
24
|
"""
|
11
25
|
|
12
26
|
from __future__ import annotations
|
13
27
|
|
14
28
|
import copy
|
15
|
-
import random
|
16
29
|
import re
|
17
30
|
from importlib import resources
|
18
31
|
from itertools import chain
|
@@ -38,14 +51,10 @@ if TYPE_CHECKING:
|
|
38
51
|
|
39
52
|
from lamindb.base.types import FieldAttr
|
40
53
|
from lamindb.models import Record
|
41
|
-
from lamindb._feature import parse_dtype, parse_dtype_single_cat
|
42
54
|
from lamindb.base.types import FieldAttr # noqa
|
43
|
-
from lamindb.core._data import add_labels
|
44
|
-
from lamindb.core._feature_manager import parse_staged_feature_sets_from_anndata
|
45
55
|
from lamindb.core._settings import settings
|
46
56
|
from lamindb.models import (
|
47
57
|
Artifact,
|
48
|
-
CanCurate,
|
49
58
|
Collection,
|
50
59
|
Feature,
|
51
60
|
Record,
|
@@ -53,9 +62,11 @@ from lamindb.models import (
|
|
53
62
|
Schema,
|
54
63
|
ULabel,
|
55
64
|
)
|
65
|
+
from lamindb.models._feature_manager import parse_staged_feature_sets_from_anndata
|
66
|
+
from lamindb.models.artifact import add_labels, data_is_anndata
|
67
|
+
from lamindb.models.feature import parse_dtype, parse_dtype_single_cat
|
68
|
+
from lamindb.models._from_values import _format_values
|
56
69
|
|
57
|
-
from .._artifact import data_is_anndata
|
58
|
-
from .._from_values import _format_values
|
59
70
|
from ..errors import InvalidArgument, ValidationError
|
60
71
|
|
61
72
|
if TYPE_CHECKING:
|
@@ -66,7 +77,7 @@ if TYPE_CHECKING:
|
|
66
77
|
from mudata import MuData
|
67
78
|
from spatialdata import SpatialData
|
68
79
|
|
69
|
-
from lamindb.
|
80
|
+
from lamindb.models.query_set import RecordList
|
70
81
|
|
71
82
|
|
72
83
|
def strip_ansi_codes(text):
|
@@ -139,13 +150,19 @@ class CurateLookup:
|
|
139
150
|
" → categories.alveolar_type_1_fibroblast_cell\n\n"
|
140
151
|
"To look up public ontologies, use .lookup(public=True)"
|
141
152
|
)
|
142
|
-
else: #
|
153
|
+
else: # pragma: no cover
|
143
154
|
return colors.warning("No fields are found!")
|
144
155
|
|
145
156
|
|
146
157
|
CAT_MANAGER_DOCSTRING = """Manage categoricals by updating registries."""
|
147
158
|
|
148
159
|
|
160
|
+
SLOTS_DOCSTRING = """Curator objects by slot.
|
161
|
+
|
162
|
+
.. versionadded:: 1.1.1
|
163
|
+
"""
|
164
|
+
|
165
|
+
|
149
166
|
VALIDATE_DOCSTRING = """Validate dataset.
|
150
167
|
|
151
168
|
Raises:
|
@@ -170,6 +187,8 @@ class Curator:
|
|
170
187
|
|
171
188
|
A `Curator` object makes it easy to validate, standardize & annotate datasets.
|
172
189
|
|
190
|
+
.. versionadded:: 1.1.0
|
191
|
+
|
173
192
|
See:
|
174
193
|
- :class:`~lamindb.curators.DataFrameCurator`
|
175
194
|
- :class:`~lamindb.curators.AnnDataCurator`
|
@@ -189,7 +208,7 @@ class Curator:
|
|
189
208
|
@doc_args(VALIDATE_DOCSTRING)
|
190
209
|
def validate(self) -> bool | str:
|
191
210
|
"""{}""" # noqa: D415
|
192
|
-
pass #
|
211
|
+
pass # pragma: no cover
|
193
212
|
|
194
213
|
@doc_args(SAVE_ARTIFACT_DOCSTRING)
|
195
214
|
def save_artifact(
|
@@ -212,6 +231,8 @@ class DataFrameCurator(Curator):
|
|
212
231
|
|
213
232
|
See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.
|
214
233
|
|
234
|
+
.. versionadded:: 1.1.0
|
235
|
+
|
215
236
|
Args:
|
216
237
|
dataset: The DataFrame-like object to validate & annotate.
|
217
238
|
schema: A `Schema` object that defines the validation constraints.
|
@@ -222,9 +243,9 @@ class DataFrameCurator(Curator):
|
|
222
243
|
import bionty as bt
|
223
244
|
|
224
245
|
# define valid labels
|
225
|
-
|
226
|
-
ln.ULabel(name="DMSO", type=
|
227
|
-
ln.ULabel(name="IFNG", type=
|
246
|
+
perturbation = ln.ULabel(name="Perturbation", is_type=True).save()
|
247
|
+
ln.ULabel(name="DMSO", type=perturbation).save()
|
248
|
+
ln.ULabel(name="IFNG", type=perturbation).save()
|
228
249
|
bt.CellType.from_source(name="B cell").save()
|
229
250
|
bt.CellType.from_source(name="T cell").save()
|
230
251
|
|
@@ -232,7 +253,7 @@ class DataFrameCurator(Curator):
|
|
232
253
|
schema = ln.Schema(
|
233
254
|
name="small_dataset1_obs_level_metadata",
|
234
255
|
features=[
|
235
|
-
ln.Feature(name="
|
256
|
+
ln.Feature(name="perturbation", dtype="cat[ULabel[Perturbation]]").save(),
|
236
257
|
ln.Feature(name="sample_note", dtype=str).save(),
|
237
258
|
ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save(),
|
238
259
|
ln.Feature(name="cell_type_by_model", dtype=bt.CellType).save(),
|
@@ -252,10 +273,10 @@ class DataFrameCurator(Curator):
|
|
252
273
|
schema: Schema,
|
253
274
|
) -> None:
|
254
275
|
super().__init__(dataset=dataset, schema=schema)
|
276
|
+
categoricals = {}
|
255
277
|
if schema.n > 0:
|
256
278
|
# populate features
|
257
279
|
pandera_columns = {}
|
258
|
-
categoricals = {}
|
259
280
|
for feature in schema.features.all():
|
260
281
|
pandera_dtype = (
|
261
282
|
feature.dtype if not feature.dtype.startswith("cat") else "category"
|
@@ -268,13 +289,13 @@ class DataFrameCurator(Curator):
|
|
268
289
|
self._pandera_schema = pandera.DataFrameSchema(
|
269
290
|
pandera_columns, coerce=schema.coerce_dtype
|
270
291
|
)
|
271
|
-
# now deal with detailed validation of categoricals
|
272
|
-
self._cat_manager = DataFrameCatManager(
|
273
|
-
self._dataset,
|
274
|
-
categoricals=categoricals,
|
275
|
-
)
|
276
292
|
else:
|
277
293
|
assert schema.itype is not None # noqa: S101
|
294
|
+
self._cat_manager = DataFrameCatManager(
|
295
|
+
self._dataset,
|
296
|
+
columns=parse_dtype_single_cat(schema.itype, is_itype=True)["field"],
|
297
|
+
categoricals=categoricals,
|
298
|
+
)
|
278
299
|
|
279
300
|
@property
|
280
301
|
@doc_args(CAT_MANAGER_DOCSTRING)
|
@@ -285,16 +306,29 @@ class DataFrameCurator(Curator):
|
|
285
306
|
def standardize(self) -> None:
|
286
307
|
"""Standardize the dataset.
|
287
308
|
|
288
|
-
- Adds missing columns
|
289
|
-
- Fills missing values
|
309
|
+
- Adds missing columns for features
|
310
|
+
- Fills missing values for features with default values
|
290
311
|
"""
|
291
312
|
for feature in self._schema.members:
|
292
313
|
if feature.name not in self._dataset.columns:
|
293
|
-
if feature.default_value is not None:
|
294
|
-
|
314
|
+
if feature.default_value is not None or feature.nullable:
|
315
|
+
fill_value = (
|
316
|
+
feature.default_value
|
317
|
+
if feature.default_value is not None
|
318
|
+
else pd.NA
|
319
|
+
)
|
320
|
+
if feature.dtype.startswith("cat"):
|
321
|
+
self._dataset[feature.name] = pd.Categorical(
|
322
|
+
[fill_value] * len(self._dataset)
|
323
|
+
)
|
324
|
+
else:
|
325
|
+
self._dataset[feature.name] = fill_value
|
326
|
+
logger.important(
|
327
|
+
f"added column {feature.name} with fill value {fill_value}"
|
328
|
+
)
|
295
329
|
else:
|
296
330
|
raise ValidationError(
|
297
|
-
f"Missing column {feature.name} cannot be added because
|
331
|
+
f"Missing column {feature.name} cannot be added because is not nullable and has no default value"
|
298
332
|
)
|
299
333
|
else:
|
300
334
|
if feature.default_value is not None:
|
@@ -312,46 +346,29 @@ class DataFrameCurator(Curator):
|
|
312
346
|
feature.default_value
|
313
347
|
)
|
314
348
|
|
349
|
+
def _cat_manager_validate(self) -> None:
|
350
|
+
self._cat_manager.validate()
|
351
|
+
if self._cat_manager._is_validated:
|
352
|
+
self._is_validated = True
|
353
|
+
else:
|
354
|
+
self._is_validated = False
|
355
|
+
raise ValidationError(self._cat_manager._validate_category_error_messages)
|
356
|
+
|
315
357
|
@doc_args(VALIDATE_DOCSTRING)
|
316
358
|
def validate(self) -> None:
|
317
359
|
"""{}""" # noqa: D415
|
318
360
|
if self._schema.n > 0:
|
319
|
-
self._cat_manager.validate()
|
320
361
|
try:
|
362
|
+
# first validate through pandera
|
321
363
|
self._pandera_schema.validate(self._dataset)
|
322
|
-
|
323
|
-
|
324
|
-
else:
|
325
|
-
self._is_validated = False
|
326
|
-
raise ValidationError(
|
327
|
-
self._cat_manager._validate_category_error_messages
|
328
|
-
)
|
364
|
+
# then validate lamindb categoricals
|
365
|
+
self._cat_manager_validate()
|
329
366
|
except pandera.errors.SchemaError as err:
|
330
367
|
self._is_validated = False
|
331
368
|
# .exconly() doesn't exist on SchemaError
|
332
369
|
raise ValidationError(str(err)) from err
|
333
370
|
else:
|
334
|
-
|
335
|
-
registry: CanCurate = result["registry"]
|
336
|
-
inspector = registry.inspect(
|
337
|
-
self._dataset.columns,
|
338
|
-
result["field"],
|
339
|
-
mute=True,
|
340
|
-
)
|
341
|
-
if len(inspector.non_validated) > 0:
|
342
|
-
# also check public ontology
|
343
|
-
if hasattr(registry, "public"):
|
344
|
-
registry.from_values(
|
345
|
-
inspector.non_validated, result["field"], mute=True
|
346
|
-
).save()
|
347
|
-
inspector = registry.inspect(
|
348
|
-
inspector.non_validated, result["field"], mute=True
|
349
|
-
)
|
350
|
-
if len(inspector.non_validated) > 0:
|
351
|
-
self._is_validated = False
|
352
|
-
raise ValidationError(
|
353
|
-
f"Invalid identifiers for {self._schema.itype}: {inspector.non_validated}"
|
354
|
-
)
|
371
|
+
self._cat_manager_validate()
|
355
372
|
|
356
373
|
@doc_args(SAVE_ARTIFACT_DOCSTRING)
|
357
374
|
def save_artifact(
|
@@ -385,6 +402,8 @@ class AnnDataCurator(Curator):
|
|
385
402
|
|
386
403
|
See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.
|
387
404
|
|
405
|
+
.. versionadded:: 1.1.0
|
406
|
+
|
388
407
|
Args:
|
389
408
|
dataset: The AnnData-like object to validate & annotate.
|
390
409
|
schema: A `Schema` object that defines the validation constraints.
|
@@ -395,9 +414,9 @@ class AnnDataCurator(Curator):
|
|
395
414
|
import bionty as bt
|
396
415
|
|
397
416
|
# define valid labels
|
398
|
-
|
399
|
-
ln.ULabel(name="DMSO", type=
|
400
|
-
ln.ULabel(name="IFNG", type=
|
417
|
+
perturbation = ln.ULabel(name="Perturbation", is_type=True).save()
|
418
|
+
ln.ULabel(name="DMSO", type=perturbation).save()
|
419
|
+
ln.ULabel(name="IFNG", type=perturbation).save()
|
401
420
|
bt.CellType.from_source(name="B cell").save()
|
402
421
|
bt.CellType.from_source(name="T cell").save()
|
403
422
|
|
@@ -405,9 +424,9 @@ class AnnDataCurator(Curator):
|
|
405
424
|
obs_schema = ln.Schema(
|
406
425
|
name="small_dataset1_obs_level_metadata",
|
407
426
|
features=[
|
408
|
-
ln.Feature(name="
|
427
|
+
ln.Feature(name="perturbation", dtype="cat[ULabel[Perturbation]]").save(),
|
409
428
|
ln.Feature(name="sample_note", dtype=str).save(),
|
410
|
-
ln.Feature(name="cell_type_by_expert", dtype=bt.CellType
|
429
|
+
ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save(),
|
411
430
|
ln.Feature(name="cell_type_by_model", dtype=bt.CellType").save(),
|
412
431
|
],
|
413
432
|
).save()
|
@@ -416,7 +435,7 @@ class AnnDataCurator(Curator):
|
|
416
435
|
var_schema = ln.Schema(
|
417
436
|
name="scRNA_seq_var_schema",
|
418
437
|
itype=bt.Gene.ensembl_gene_id,
|
419
|
-
dtype=
|
438
|
+
dtype=int,
|
420
439
|
).save()
|
421
440
|
|
422
441
|
# define composite schema
|
@@ -443,31 +462,55 @@ class AnnDataCurator(Curator):
|
|
443
462
|
raise InvalidArgument("dataset must be AnnData-like.")
|
444
463
|
if schema.otype != "AnnData":
|
445
464
|
raise InvalidArgument("Schema otype must be 'AnnData'.")
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
465
|
+
# TODO: also support slots other than obs and var
|
466
|
+
self._slots = {
|
467
|
+
slot: DataFrameCurator(
|
468
|
+
(
|
469
|
+
self._dataset.__getattribute__(slot).T
|
470
|
+
if slot == "var"
|
471
|
+
else self._dataset.__getattribute__(slot)
|
472
|
+
),
|
473
|
+
slot_schema,
|
474
|
+
)
|
475
|
+
for slot, slot_schema in schema.slots.items()
|
476
|
+
if slot in {"obs", "var"}
|
477
|
+
}
|
478
|
+
|
479
|
+
@property
|
480
|
+
@doc_args(SLOTS_DOCSTRING)
|
481
|
+
def slots(self) -> dict[str, DataFrameCurator]:
|
482
|
+
"""{}""" # noqa: D415
|
483
|
+
return self._slots
|
452
484
|
|
453
485
|
@doc_args(VALIDATE_DOCSTRING)
|
454
486
|
def validate(self) -> None:
|
455
487
|
"""{}""" # noqa: D415
|
456
|
-
self.
|
457
|
-
|
458
|
-
self._is_validated = True
|
488
|
+
for _, curator in self._slots.items():
|
489
|
+
curator.validate()
|
459
490
|
|
460
491
|
@doc_args(SAVE_ARTIFACT_DOCSTRING)
|
461
|
-
def save_artifact(
|
492
|
+
def save_artifact(
|
493
|
+
self,
|
494
|
+
*,
|
495
|
+
key: str | None = None,
|
496
|
+
description: str | None = None,
|
497
|
+
revises: Artifact | None = None,
|
498
|
+
run: Run | None = None,
|
499
|
+
):
|
462
500
|
"""{}""" # noqa: D415
|
463
501
|
if not self._is_validated:
|
464
|
-
self.validate()
|
465
|
-
result = parse_dtype_single_cat(self._var_curator._schema.itype, is_itype=True)
|
502
|
+
self.validate()
|
466
503
|
return save_artifact( # type: ignore
|
467
504
|
self._dataset,
|
468
505
|
description=description,
|
469
|
-
fields=self.
|
470
|
-
columns_field=
|
506
|
+
fields=self.slots["obs"]._cat_manager.categoricals,
|
507
|
+
columns_field=(
|
508
|
+
parse_dtype_single_cat(self.slots["var"]._schema.itype, is_itype=True)[
|
509
|
+
"field"
|
510
|
+
]
|
511
|
+
if "var" in self._slots
|
512
|
+
else None
|
513
|
+
),
|
471
514
|
key=key,
|
472
515
|
artifact=self._artifact,
|
473
516
|
revises=revises,
|
@@ -497,8 +540,8 @@ class CatManager:
|
|
497
540
|
|
498
541
|
If you find non-validated values, you have several options:
|
499
542
|
|
500
|
-
- new values found in the data can be registered using :meth:`~lamindb.
|
501
|
-
- non-validated values can be accessed using :meth:`~lamindb.
|
543
|
+
- new values found in the data can be registered using :meth:`~lamindb.curators.DataFrameCatManager.add_new_from`
|
544
|
+
- non-validated values can be accessed using :meth:`~lamindb.curators.DataFrameCatManager.non_validated` and addressed manually
|
502
545
|
"""
|
503
546
|
|
504
547
|
def __init__(
|
@@ -577,7 +620,7 @@ class CatManager:
|
|
577
620
|
Returns:
|
578
621
|
None
|
579
622
|
"""
|
580
|
-
pass #
|
623
|
+
pass # pragma: no cover
|
581
624
|
|
582
625
|
@doc_args(SAVE_ARTIFACT_DOCSTRING)
|
583
626
|
def save_artifact(
|
@@ -869,7 +912,7 @@ class AnnDataCatManager(CatManager):
|
|
869
912
|
def __init__(
|
870
913
|
self,
|
871
914
|
data: ad.AnnData | Artifact,
|
872
|
-
var_index: FieldAttr,
|
915
|
+
var_index: FieldAttr | None = None,
|
873
916
|
categoricals: dict[str, FieldAttr] | None = None,
|
874
917
|
obs_columns: FieldAttr = Feature.name,
|
875
918
|
verbosity: str = "hint",
|
@@ -938,15 +981,16 @@ class AnnDataCatManager(CatManager):
|
|
938
981
|
validated_only: bool = True,
|
939
982
|
):
|
940
983
|
"""Save variable records."""
|
941
|
-
|
942
|
-
|
943
|
-
|
944
|
-
|
945
|
-
|
946
|
-
|
947
|
-
|
948
|
-
|
949
|
-
|
984
|
+
if self.var_index is not None:
|
985
|
+
update_registry(
|
986
|
+
values=list(self._adata.var.index),
|
987
|
+
field=self.var_index,
|
988
|
+
key="var_index",
|
989
|
+
validated_only=validated_only,
|
990
|
+
organism=self._organism,
|
991
|
+
source=self._sources.get("var_index"),
|
992
|
+
exclude=self._exclude.get("var_index"),
|
993
|
+
)
|
950
994
|
|
951
995
|
def add_new_from(self, key: str, **kwargs):
|
952
996
|
"""Add validated & new categories.
|
@@ -982,15 +1026,19 @@ class AnnDataCatManager(CatManager):
|
|
982
1026
|
|
983
1027
|
# add all validated records to the current instance
|
984
1028
|
self._save_from_var_index(validated_only=True)
|
985
|
-
|
986
|
-
|
987
|
-
|
988
|
-
|
989
|
-
|
990
|
-
|
991
|
-
|
992
|
-
|
993
|
-
|
1029
|
+
if self.var_index is not None:
|
1030
|
+
validated_var, non_validated_var = validate_categories(
|
1031
|
+
self._adata.var.index,
|
1032
|
+
field=self._var_field,
|
1033
|
+
key="var_index",
|
1034
|
+
source=self._sources.get("var_index"),
|
1035
|
+
hint_print=".add_new_from_var_index()",
|
1036
|
+
exclude=self._exclude.get("var_index"),
|
1037
|
+
organism=self._organism, # type: ignore
|
1038
|
+
)
|
1039
|
+
else:
|
1040
|
+
validated_var = True
|
1041
|
+
non_validated_var = []
|
994
1042
|
validated_obs = self._obs_df_curator.validate()
|
995
1043
|
self._non_validated = self._obs_df_curator._non_validated # type: ignore
|
996
1044
|
if len(non_validated_var) > 0:
|
@@ -1031,11 +1079,6 @@ class AnnDataCatManager(CatManager):
|
|
1031
1079
|
class MuDataCatManager(CatManager):
|
1032
1080
|
"""Curation flow for a ``MuData`` object.
|
1033
1081
|
|
1034
|
-
See also :class:`~lamindb.Curator`.
|
1035
|
-
|
1036
|
-
Note that if genes or other measurements are removed from the MuData object,
|
1037
|
-
the object should be recreated using :meth:`~lamindb.Curator.from_mudata`.
|
1038
|
-
|
1039
1082
|
Args:
|
1040
1083
|
mdata: The MuData object to curate.
|
1041
1084
|
var_index: The registry field for mapping the ``.var`` index for each modality.
|
@@ -1289,8 +1332,6 @@ def _maybe_curation_keys_not_present(nonval_keys: list[str], name: str):
|
|
1289
1332
|
class TiledbsomaCatManager(CatManager):
|
1290
1333
|
"""Curation flow for `tiledbsoma.Experiment`.
|
1291
1334
|
|
1292
|
-
See also :class:`~lamindb.Curator`.
|
1293
|
-
|
1294
1335
|
Args:
|
1295
1336
|
experiment_uri: A local or cloud path to a `tiledbsoma.Experiment`.
|
1296
1337
|
var_index: The registry fields for mapping the `.var` indices for measurements.
|
@@ -1696,7 +1737,7 @@ class TiledbsomaCatManager(CatManager):
|
|
1696
1737
|
Returns:
|
1697
1738
|
A saved artifact record.
|
1698
1739
|
"""
|
1699
|
-
from lamindb.
|
1740
|
+
from lamindb.models.artifact import add_labels
|
1700
1741
|
|
1701
1742
|
if not self._is_validated:
|
1702
1743
|
self.validate()
|
@@ -1833,11 +1874,11 @@ class SpatialDataCatManager(CatManager):
|
|
1833
1874
|
exclude=exclude,
|
1834
1875
|
)
|
1835
1876
|
if isinstance(sdata, Artifact):
|
1836
|
-
# TODO: load() doesn't yet work
|
1837
1877
|
self._sdata = sdata.load()
|
1838
1878
|
else:
|
1839
1879
|
self._sdata = self._dataset
|
1840
1880
|
self._sample_metadata_key = sample_metadata_key
|
1881
|
+
self._write_path = None
|
1841
1882
|
self._var_fields = var_index
|
1842
1883
|
self._verify_accessor_exists(self._var_fields.keys())
|
1843
1884
|
self._categoricals = categoricals
|
@@ -2119,26 +2160,14 @@ class SpatialDataCatManager(CatManager):
|
|
2119
2160
|
try:
|
2120
2161
|
settings.verbosity = "warning"
|
2121
2162
|
|
2122
|
-
|
2123
|
-
|
2124
|
-
|
2125
|
-
|
2126
|
-
|
2127
|
-
|
2128
|
-
|
2129
|
-
|
2130
|
-
# Create the Artifact and associate Artifact metadata
|
2131
|
-
self._artifact = Artifact(
|
2132
|
-
write_path,
|
2133
|
-
description=description,
|
2134
|
-
key=key,
|
2135
|
-
revises=revises,
|
2136
|
-
run=run,
|
2137
|
-
)
|
2138
|
-
# According to Tim it is not easy to calculate the number of observations.
|
2139
|
-
# We would have to write custom code to iterate over labels (which might not even exist at that point)
|
2140
|
-
self._artifact.otype = "spatialdata"
|
2141
|
-
self._artifact.save()
|
2163
|
+
self._artifact = Artifact.from_spatialdata(
|
2164
|
+
self._sdata,
|
2165
|
+
key=key,
|
2166
|
+
description=description,
|
2167
|
+
revises=revises,
|
2168
|
+
run=run,
|
2169
|
+
)
|
2170
|
+
self._artifact.save()
|
2142
2171
|
|
2143
2172
|
# Link schemas
|
2144
2173
|
feature_kwargs = check_registry_organism(
|
@@ -2156,7 +2185,7 @@ class SpatialDataCatManager(CatManager):
|
|
2156
2185
|
"""Add Schemas from SpatialData."""
|
2157
2186
|
if obs_fields is None:
|
2158
2187
|
obs_fields = {}
|
2159
|
-
assert host.otype == "
|
2188
|
+
assert host.otype == "SpatialData" # noqa: S101
|
2160
2189
|
|
2161
2190
|
feature_sets = {}
|
2162
2191
|
|
@@ -2784,7 +2813,7 @@ class DoseHandler:
|
|
2784
2813
|
return cls.UNIT_MAP.get(unit, unit)
|
2785
2814
|
|
2786
2815
|
@classmethod
|
2787
|
-
def validate_values(cls, values: pd.Series) -> list:
|
2816
|
+
def validate_values(cls, values: pd.Series) -> list[str]:
|
2788
2817
|
"""Validate pert_dose values with strict case checking."""
|
2789
2818
|
errors = []
|
2790
2819
|
|
@@ -2828,7 +2857,7 @@ class TimeHandler:
|
|
2828
2857
|
return unit[0].lower()
|
2829
2858
|
|
2830
2859
|
@classmethod
|
2831
|
-
def validate_values(cls, values: pd.Series) -> list:
|
2860
|
+
def validate_values(cls, values: pd.Series) -> list[str]:
|
2832
2861
|
"""Validate pert_time values."""
|
2833
2862
|
errors = []
|
2834
2863
|
|
@@ -3168,10 +3197,7 @@ def check_registry_organism(registry: Record, organism: str | None = None) -> di
|
|
3168
3197
|
import bionty as bt
|
3169
3198
|
|
3170
3199
|
if organism is None and bt.settings.organism is None:
|
3171
|
-
|
3172
|
-
f"{registry.__name__} registry requires an organism!\n"
|
3173
|
-
" → please pass an organism name via organism="
|
3174
|
-
)
|
3200
|
+
return {}
|
3175
3201
|
return {"organism": organism or bt.settings.organism.name}
|
3176
3202
|
return {}
|
3177
3203
|
|
@@ -3185,8 +3211,8 @@ def validate_categories(
|
|
3185
3211
|
exclude: str | list | None = None,
|
3186
3212
|
hint_print: str | None = None,
|
3187
3213
|
curator: CatManager | None = None,
|
3188
|
-
) -> tuple[bool, list]:
|
3189
|
-
"""Validate ontology terms
|
3214
|
+
) -> tuple[bool, list[str]]:
|
3215
|
+
"""Validate ontology terms using LaminDB registries.
|
3190
3216
|
|
3191
3217
|
Args:
|
3192
3218
|
values: The values to validate.
|
@@ -3198,8 +3224,8 @@ def validate_categories(
|
|
3198
3224
|
standardize: Whether to standardize the values.
|
3199
3225
|
hint_print: The hint to print that suggests fixing non-validated values.
|
3200
3226
|
"""
|
3201
|
-
from lamindb._from_values import _format_values
|
3202
3227
|
from lamindb.core._settings import settings
|
3228
|
+
from lamindb.models._from_values import _format_values
|
3203
3229
|
|
3204
3230
|
model_field = f"{field.field.model.__name__}.{field.field.name}"
|
3205
3231
|
|
@@ -3263,7 +3289,7 @@ def validate_categories(
|
|
3263
3289
|
warning_message += f" {colors.yellow(f'{len(syn_mapper)} synonym{s}')} found: {colors.yellow(syn_mapper_print)}\n → curate synonyms via {colors.cyan(hint_msg)}"
|
3264
3290
|
if n_non_validated > len(syn_mapper):
|
3265
3291
|
if syn_mapper:
|
3266
|
-
warning_message += " for remaining terms:\n"
|
3292
|
+
warning_message += "\n for remaining terms:\n"
|
3267
3293
|
warning_message += f" → fix typos, remove non-existent values, or save terms via {colors.cyan(non_validated_hint_print)}"
|
3268
3294
|
|
3269
3295
|
if logger.indent == "":
|
@@ -3334,7 +3360,7 @@ def validate_categories_in_df(
|
|
3334
3360
|
def save_artifact(
|
3335
3361
|
data: pd.DataFrame | ad.AnnData | MuData,
|
3336
3362
|
fields: dict[str, FieldAttr] | dict[str, dict[str, FieldAttr]],
|
3337
|
-
columns_field: FieldAttr | dict[str, FieldAttr],
|
3363
|
+
columns_field: FieldAttr | dict[str, FieldAttr] | None = None,
|
3338
3364
|
description: str | None = None,
|
3339
3365
|
organism: str | None = None,
|
3340
3366
|
key: str | None = None,
|
@@ -3360,8 +3386,7 @@ def save_artifact(
|
|
3360
3386
|
Returns:
|
3361
3387
|
The saved Artifact.
|
3362
3388
|
"""
|
3363
|
-
from ..
|
3364
|
-
from ..core._data import add_labels
|
3389
|
+
from ..models.artifact import add_labels, data_is_anndata, data_is_mudata
|
3365
3390
|
|
3366
3391
|
if artifact is None:
|
3367
3392
|
if data_is_anndata(data):
|
@@ -3383,7 +3408,7 @@ def save_artifact(
|
|
3383
3408
|
artifact.schema = schema
|
3384
3409
|
artifact.save()
|
3385
3410
|
|
3386
|
-
if organism is not None:
|
3411
|
+
if organism is not None and columns_field is not None:
|
3387
3412
|
feature_kwargs = check_registry_organism(
|
3388
3413
|
(
|
3389
3414
|
list(columns_field.values())[0].field.model
|
@@ -3422,7 +3447,7 @@ def save_artifact(
|
|
3422
3447
|
filter_kwargs_current = get_current_filter_kwargs(registry, filter_kwargs)
|
3423
3448
|
df = data if isinstance(data, pd.DataFrame) else data.obs
|
3424
3449
|
# multi-value columns are separated by "|"
|
3425
|
-
if df[key].str.contains("|").any():
|
3450
|
+
if not df[key].isna().all() and df[key].str.contains("|").any():
|
3426
3451
|
values = df[key].str.split("|").explode().unique()
|
3427
3452
|
else:
|
3428
3453
|
values = df[key].unique()
|
@@ -3520,8 +3545,8 @@ def update_registry(
|
|
3520
3545
|
exclude: Values to exclude from inspect.
|
3521
3546
|
kwargs: Additional keyword arguments to pass to the registry model to create new records.
|
3522
3547
|
"""
|
3523
|
-
from lamindb._save import save as ln_save
|
3524
3548
|
from lamindb.core._settings import settings
|
3549
|
+
from lamindb.models.save import save as ln_save
|
3525
3550
|
|
3526
3551
|
registry = field.field.model
|
3527
3552
|
filter_kwargs = check_registry_organism(registry, organism)
|
@@ -3609,7 +3634,7 @@ def log_saved_labels(
|
|
3609
3634
|
validated_only: bool = True,
|
3610
3635
|
) -> None:
|
3611
3636
|
"""Log the saved labels."""
|
3612
|
-
from .._from_values import _format_values
|
3637
|
+
from ..models._from_values import _format_values
|
3613
3638
|
|
3614
3639
|
model_field = colors.italic(model_field)
|
3615
3640
|
for k, labels in labels_saved.items():
|
@@ -3655,12 +3680,14 @@ def _save_organism(name: str):
|
|
3655
3680
|
return organism
|
3656
3681
|
|
3657
3682
|
|
3658
|
-
def _ref_is_name(field: FieldAttr) -> bool | None:
|
3683
|
+
def _ref_is_name(field: FieldAttr | None) -> bool | None:
|
3659
3684
|
"""Check if the reference field is a name field."""
|
3660
|
-
from ..
|
3685
|
+
from ..models.can_curate import get_name_field
|
3661
3686
|
|
3662
|
-
|
3663
|
-
|
3687
|
+
if field is not None:
|
3688
|
+
name_field = get_name_field(field.field.model)
|
3689
|
+
return field.field.name == name_field
|
3690
|
+
return None
|
3664
3691
|
|
3665
3692
|
|
3666
3693
|
# backward compat constructors ------------------
|
@@ -3709,7 +3736,7 @@ def from_anndata(
|
|
3709
3736
|
@classmethod # type: ignore
|
3710
3737
|
def from_mudata(
|
3711
3738
|
cls,
|
3712
|
-
mdata: MuData,
|
3739
|
+
mdata: MuData | UPathStr,
|
3713
3740
|
var_index: dict[str, dict[str, FieldAttr]],
|
3714
3741
|
categoricals: dict[str, FieldAttr] | None = None,
|
3715
3742
|
verbosity: str = "hint",
|
@@ -3749,7 +3776,7 @@ def from_tiledbsoma(
|
|
3749
3776
|
@classmethod # type: ignore
|
3750
3777
|
def from_spatialdata(
|
3751
3778
|
cls,
|
3752
|
-
sdata,
|
3779
|
+
sdata: SpatialData | UPathStr,
|
3753
3780
|
var_index: dict[str, FieldAttr],
|
3754
3781
|
categoricals: dict[str, dict[str, FieldAttr]] | None = None,
|
3755
3782
|
organism: str | None = None,
|