lamindb 0.77.2__py3-none-any.whl → 1.0rc1__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
- lamindb/__init__.py +39 -32
- lamindb/_artifact.py +95 -64
- lamindb/_can_curate.py +19 -10
- lamindb/_collection.py +51 -49
- lamindb/_feature.py +9 -9
- lamindb/_finish.py +99 -86
- lamindb/_from_values.py +20 -17
- lamindb/_is_versioned.py +2 -1
- lamindb/_parents.py +23 -16
- lamindb/_query_manager.py +3 -3
- lamindb/_query_set.py +85 -18
- lamindb/_record.py +121 -46
- lamindb/_run.py +3 -3
- lamindb/_save.py +14 -8
- lamindb/{_feature_set.py → _schema.py} +34 -31
- lamindb/_storage.py +2 -1
- lamindb/_transform.py +51 -23
- lamindb/_ulabel.py +17 -8
- lamindb/_view.py +15 -14
- lamindb/base/__init__.py +24 -0
- lamindb/base/fields.py +281 -0
- lamindb/base/ids.py +103 -0
- lamindb/base/types.py +51 -0
- lamindb/base/users.py +30 -0
- lamindb/base/validation.py +67 -0
- lamindb/core/__init__.py +19 -14
- lamindb/core/_context.py +297 -228
- lamindb/core/_data.py +44 -49
- lamindb/core/_describe.py +41 -31
- lamindb/core/_django.py +59 -44
- lamindb/core/_feature_manager.py +192 -168
- lamindb/core/_label_manager.py +22 -22
- lamindb/core/_mapped_collection.py +17 -14
- lamindb/core/_settings.py +1 -12
- lamindb/core/_sync_git.py +56 -9
- lamindb/core/_track_environment.py +1 -1
- lamindb/core/datasets/_core.py +5 -6
- lamindb/core/exceptions.py +0 -7
- lamindb/core/fields.py +1 -1
- lamindb/core/loaders.py +18 -2
- lamindb/core/{schema.py → relations.py} +22 -19
- lamindb/core/storage/_anndata_accessor.py +1 -2
- lamindb/core/storage/_backed_access.py +2 -1
- lamindb/core/storage/_tiledbsoma.py +40 -13
- lamindb/core/storage/objects.py +1 -1
- lamindb/core/storage/paths.py +13 -8
- lamindb/core/subsettings/__init__.py +0 -2
- lamindb/core/types.py +2 -23
- lamindb/core/versioning.py +11 -7
- lamindb/{_curate.py → curators/__init__.py} +700 -57
- lamindb/curators/_spatial.py +528 -0
- lamindb/integrations/_vitessce.py +1 -3
- lamindb/migrations/0052_squashed.py +1261 -0
- lamindb/migrations/0053_alter_featureset_hash_alter_paramvalue_created_by_and_more.py +57 -0
- lamindb/migrations/0054_alter_feature_previous_runs_and_more.py +35 -0
- lamindb/migrations/0055_artifact_type_artifactparamvalue_and_more.py +61 -0
- lamindb/migrations/0056_rename_ulabel_ref_is_name_artifactulabel_label_ref_is_name_and_more.py +22 -0
- lamindb/migrations/0057_link_models_latest_report_and_others.py +356 -0
- lamindb/migrations/0058_artifact__actions_collection__actions.py +22 -0
- lamindb/migrations/0059_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +31 -0
- lamindb/migrations/0060_alter_artifact__actions.py +22 -0
- lamindb/migrations/0061_alter_collection_meta_artifact_alter_run_environment_and_more.py +45 -0
- lamindb/migrations/0062_add_is_latest_field.py +32 -0
- lamindb/migrations/0063_populate_latest_field.py +45 -0
- lamindb/migrations/0064_alter_artifact_version_alter_collection_version_and_more.py +33 -0
- lamindb/migrations/0065_remove_collection_feature_sets_and_more.py +22 -0
- lamindb/migrations/0066_alter_artifact__feature_values_and_more.py +352 -0
- lamindb/migrations/0067_alter_featurevalue_unique_together_and_more.py +20 -0
- lamindb/migrations/0068_alter_artifactulabel_unique_together_and_more.py +20 -0
- lamindb/migrations/0069_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +1294 -0
- lamindb/migrations/0069_squashed.py +1770 -0
- lamindb/migrations/0070_lamindbv1_migrate_data.py +78 -0
- lamindb/migrations/0071_lamindbv1_migrate_schema.py +741 -0
- lamindb/migrations/0072_remove_user__branch_code_remove_user_aux_and_more.py +148 -0
- lamindb/migrations/0073_merge_ourprojects.py +945 -0
- lamindb/migrations/0074_lamindbv1_part4.py +374 -0
- lamindb/migrations/0075_lamindbv1_part5.py +276 -0
- lamindb/migrations/0076_lamindbv1_part6.py +621 -0
- lamindb/migrations/0077_lamindbv1_part6b.py +228 -0
- lamindb/migrations/0078_lamindbv1_part6c.py +468 -0
- lamindb/migrations/0079_alter_rundata_value_json_and_more.py +36 -0
- lamindb/migrations/__init__.py +0 -0
- lamindb/models.py +4064 -0
- {lamindb-0.77.2.dist-info → lamindb-1.0rc1.dist-info}/METADATA +15 -20
- lamindb-1.0rc1.dist-info/RECORD +100 -0
- {lamindb-0.77.2.dist-info → lamindb-1.0rc1.dist-info}/WHEEL +1 -1
- lamindb/core/subsettings/_transform_settings.py +0 -21
- lamindb-0.77.2.dist-info/RECORD +0 -63
- {lamindb-0.77.2.dist-info → lamindb-1.0rc1.dist-info}/LICENSE +0 -0
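The renames in this listing (`lamindb/_feature_set.py → lamindb/_schema.py`, `lamindb/core/schema.py → lamindb/core/relations.py`, `lamindb/_curate.py → lamindb/curators/__init__.py`) and the new `lamindb/models.py` and `lamindb/base/` modules correspond to the import changes visible in the file diff below, where registry classes are no longer imported from `lnschema_core.models` but from `lamindb.models`. A minimal sketch of how downstream code could absorb that move; it assumes only the two import paths shown in this diff and is not part of lamindb itself:

```python
# Illustrative compatibility shim, not lamindb code: prefer the 1.0rc1 layout
# (lamindb.models) and fall back to the pre-1.0 layout (lnschema_core.models).
try:
    # lamindb >= 1.0rc1: registry classes live in lamindb.models (see diff below)
    from lamindb.models import Artifact, Feature, Record, Run, ULabel
except ImportError:
    # lamindb <= 0.77.x: the same classes were imported from lnschema_core.models
    from lnschema_core.models import Artifact, Feature, Record, Run, ULabel
```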
lamindb/core/_feature_manager.py
CHANGED
```diff
@@ -14,35 +14,17 @@ from anndata import AnnData
 from django.contrib.postgres.aggregates import ArrayAgg
 from django.db import connections
 from django.db.models import Aggregate
-from lamin_utils import
+from lamin_utils import logger
 from lamindb_setup.core.hashing import hash_set
 from lamindb_setup.core.upath import create_path
-from lnschema_core.models import (
-    Artifact,
-    Collection,
-    Feature,
-    FeatureManager,
-    FeatureValue,
-    LinkORM,
-    Param,
-    ParamManager,
-    ParamManagerArtifact,
-    ParamManagerRun,
-    ParamValue,
-    Record,
-    Run,
-    ULabel,
-)
 from rich.table import Column, Table
 from rich.text import Text
 
 from lamindb._feature import (
-    FEATURE_DTYPES,
     convert_pandas_dtype_to_lamin_dtype,
     suggest_categorical_for_str_iterable,
 )
-from lamindb.
-from lamindb._from_values import _print_values
+from lamindb._from_values import _format_values
 from lamindb._record import (
     REGISTRY_UNIQUE_FIELD,
     get_name_field,
@@ -50,8 +32,25 @@ from lamindb._record import (
     transfer_to_default_db,
 )
 from lamindb._save import save
+from lamindb._schema import DICT_KEYS_TYPE, Schema
 from lamindb.core.exceptions import DoesNotExist, ValidationError
 from lamindb.core.storage import LocalPathClasses
+from lamindb.models import (
+    Artifact,
+    Collection,
+    Feature,
+    FeatureManager,
+    FeatureValue,
+    LinkORM,
+    Param,
+    ParamManager,
+    ParamManagerArtifact,
+    ParamManagerRun,
+    ParamValue,
+    Record,
+    Run,
+    ULabel,
+)
 
 from ._describe import (
     NAME_WIDTH,
@@ -63,15 +62,15 @@ from ._describe import (
 from ._django import get_artifact_with_related
 from ._label_manager import _get_labels, describe_labels
 from ._settings import settings
-from .
+from .relations import (
     dict_related_model_to_related_name,
 )
 
 if TYPE_CHECKING:
-    from lnschema_core.types import FieldAttr
     from rich.tree import Tree
 
     from lamindb._query_set import QuerySet
+    from lamindb.base.types import FieldAttr
 
 
 def get_host_id_field(host: Artifact | Collection) -> str:
@@ -84,7 +83,7 @@ def get_host_id_field(host: Artifact | Collection) -> str:
 
 def get_accessor_by_registry_(host: Artifact | Collection) -> dict:
     dictionary = {
-        field.related_model.
+        field.related_model.__get_name_with_module__(): field.name
         for field in host._meta.related_objects
     }
     dictionary["Feature"] = "features"
@@ -92,25 +91,25 @@ def get_accessor_by_registry_(host: Artifact | Collection) -> dict:
     return dictionary
 
 
-def
+def get_schema_by_slot_(host: Artifact | Collection) -> dict:
     if isinstance(host, Collection):
         return {}
     # if the host is not yet saved
     if host._state.adding:
-        if hasattr(host, "
-            return host.
+        if hasattr(host, "_staged__schemas_m2m"):
+            return host._staged__schemas_m2m
         else:
             return {}
     host_db = host._state.db
     host_id_field = get_host_id_field(host)
     kwargs = {host_id_field: host.id}
     # otherwise, we need a query
-
-    host.
+    links_schema = (
+        host._schemas_m2m.through.objects.using(host_db)
         .filter(**kwargs)
-        .select_related("
+        .select_related("schema")
     )
-    return {fsl.slot: fsl.
+    return {fsl.slot: fsl.schema for fsl in links_schema}
 
 
 def get_label_links(
@@ -126,11 +125,11 @@ def get_label_links(
     return link_records
 
 
-def
+def get_schema_links(host: Artifact | Collection) -> QuerySet:
     host_id_field = get_host_id_field(host)
     kwargs = {host_id_field: host.id}
-
-    return
+    links_schema = host._schemas_m2m.through.objects.filter(**kwargs)
+    return links_schema
 
 
 def get_link_attr(link: LinkORM | type[LinkORM], data: Artifact | Collection) -> str:
@@ -270,25 +269,27 @@ def _get_non_categoricals(
     return non_categoricals
 
 
-def
+def _get_schemas_postgres(
     self: Artifact | Collection,
     related_data: dict | None = None,
 ) -> dict:
     if not related_data:
-        artifact_meta = get_artifact_with_related(self,
+        artifact_meta = get_artifact_with_related(self, include_schema=True)
         related_data = artifact_meta.get("related_data", {})
 
-    fs_data = related_data.get("
+    fs_data = related_data.get("schemas", {}) if related_data else {}
     return fs_data
 
 
-def _create_feature_table(
+def _create_feature_table(
+    name: str, registry_str: str, data: list, show_header: bool = False
+) -> Table:
     """Create a Rich table for a feature group."""
     table = Table(
         Column(name, style="", no_wrap=True, width=NAME_WIDTH),
         Column(registry_str, style="dim", no_wrap=True, width=TYPE_WIDTH),
         Column("", width=VALUES_WIDTH, no_wrap=True),
-        show_header=
+        show_header=show_header,
         box=None,
         pad_edge=False,
     )
@@ -324,36 +325,36 @@ def describe_features(
         return dictionary if to_dict else tree
 
     # feature sets
-
+    schema_data: dict[str, tuple[str, list[str]]] = {}
     feature_data: dict[str, tuple[str, list[str]]] = {}
     if not print_params and not to_dict:
         if self.id is not None and connections[self._state.db].vendor == "postgresql":
-            fs_data =
+            fs_data = _get_schemas_postgres(self, related_data=related_data)
             for fs_id, (slot, data) in fs_data.items():
                 for registry_str, feature_names in data.items():
-
-
+                    schema = Schema.objects.using(self._state.db).get(id=fs_id)
+                    schema_data[slot] = (schema, feature_names)
                     for feature_name in feature_names:
                         feature_data[feature_name] = (slot, registry_str)
         else:
-            for slot,
-                features =
+            for slot, schema in get_schema_by_slot_(self).items():
+                features = schema.members
                 # features.first() is a lot slower than features[0] here
                 name_field = get_name_field(features[0])
                 feature_names = list(features.values_list(name_field, flat=True)[:20])
-
+                schema_data[slot] = (schema, feature_names)
                 for feature_name in feature_names:
-                    feature_data[feature_name] = (slot,
+                    feature_data[feature_name] = (slot, schema.itype)
 
-    internal_feature_names:
+    internal_feature_names: dict[str, str] = {}
     if isinstance(self, Artifact):
-
-        internal_feature_names =
-        if len(
-            for
-                internal_feature_names
-
-        )
+        _schemas_m2m = self._schemas_m2m.filter(itype="Feature").all()
+        internal_feature_names = {}
+        if len(_schemas_m2m) > 0:
+            for schema in _schemas_m2m:
+                internal_feature_names.update(
+                    dict(schema.members.values_list("name", "dtype"))
+                )
 
     # categorical feature values
     # Get the categorical data using the appropriate method
@@ -388,7 +389,7 @@ def describe_features(
 
         # Format message
         printed_values = (
-
+            _format_values(sorted(values), n=10, quotes=False)
             if not is_list_type or not feature_dtype.startswith("list")
             else sorted(values)
         )
@@ -407,78 +408,99 @@ def describe_features(
     if to_dict:
         return dictionary
 
-    # Dataset section
-
-
-    ] = {}  # internal features from the `Feature` registry that contain labels
+    # Dataset features section
+    # internal features that contain labels (only `Feature` features contain labels)
+    internal_feature_labels_slot: dict[str, list] = {}
     for feature_name, feature_row in internal_feature_labels.items():
         slot, _ = feature_data.get(feature_name)
-
-
-
-    for slot, (
-        if slot in
-
+        internal_feature_labels_slot.setdefault(slot, []).append(feature_row)
+
+    int_features_tree_children = []
+    for slot, (schema, feature_names) in schema_data.items():
+        if slot in internal_feature_labels_slot:
+            # add internal Feature features with labels
+            feature_rows = internal_feature_labels_slot[slot]
+            # add internal Feature features without labels
+            feature_rows += [
+                (
+                    feature_name,
+                    Text(str(internal_feature_names.get(feature_name)), style="dim"),
+                    "",
+                )
+                for feature_name in feature_names
+                if feature_name and feature_name not in internal_feature_labels
+            ]
         else:
+            # add internal non-Feature features without labels
             feature_rows = [
-                (
+                (
+                    feature_name,
+                    Text(
+                        str(
+                            internal_feature_names.get(feature_name)
+                            if feature_name in internal_feature_names
+                            else schema.dtype
+                        ),
+                        style="dim",
+                    ),
+                    "",
+                )
                 for feature_name in feature_names
                 if feature_name
             ]
-
+        int_features_tree_children.append(
             _create_feature_table(
                 Text.assemble(
                     (slot, "violet"),
                     (" • ", "dim"),
-                    (str(
+                    (str(schema.n), "pink1"),
                 ),
-                Text.assemble((f"[{
+                Text.assemble((f"[{schema.itype}]", "pink1")),
                 feature_rows,
+                show_header=True,
             )
         )
     ## internal features from the non-`Feature` registry
-    if
+    if int_features_tree_children:
        dataset_tree = tree.add(
            Text.assemble(
-                ("Dataset", "bold bright_magenta"),
+                ("Dataset features", "bold bright_magenta"),
                ("/", "dim"),
-                (".
+                ("._schemas_m2m", "dim bold"),
            )
        )
-        for child in
+        for child in int_features_tree_children:
            dataset_tree.add(child)
 
-    #
-
-    features_tree_children = []
+    # Linked features
+    ext_features_tree_children = []
    if external_data:
-
+        ext_features_tree_children.append(
            _create_feature_table(
-
-                ("Params" if print_params else "Features", "green_yellow")
-                ),
+                "",
                "",
                external_data,
            )
        )
-
-
-
-
-
+    # ext_features_tree = None
+    ext_features_header = Text(
+        "Params" if print_params else "Linked features", style="bold dark_orange"
+    )
+    if ext_features_tree_children:
+        ext_features_tree = tree.add(ext_features_header)
+        for child in ext_features_tree_children:
+            ext_features_tree.add(child)
    if with_labels:
-
+        # avoid querying the db if the labels were queried already
+        labels_data = related_data.get("m2m") if related_data is not None else None
+        labels_tree = describe_labels(self, labels_data=labels_data, as_subtree=True)
        if labels_tree:
-
-            annotations_tree = tree.add(
-                Text("Annotations", style="bold dark_orange")
-            )
-            annotations_tree.add(labels_tree)
+            tree.add(labels_tree)
 
    return tree
 
 
-def
+def parse_staged__schemas_m2m_from_anndata(
    adata: AnnData,
    var_field: FieldAttr | None = None,
    obs_field: FieldAttr = Feature.name,
@@ -502,11 +524,11 @@ def parse_feature_sets_from_anndata(
        if adata.X is None
        else convert_pandas_dtype_to_lamin_dtype(adata.X.dtype)
    )
-
+    _schemas_m2m = {}
    if var_field is not None:
        logger.info("parsing feature names of X stored in slot 'var'")
        logger.indent = "   "
-
+        schema_var = Schema.from_values(
            data_parse.var.index,
            var_field,
            type=type,
@@ -514,28 +536,28 @@ def parse_feature_sets_from_anndata(
            organism=organism,
            raise_validation_error=False,
        )
-        if
-
-        logger.save(f"linked: {
+        if schema_var is not None:
+            _schemas_m2m["var"] = schema_var
+            logger.save(f"linked: {schema_var}")
        logger.indent = ""
-        if
+        if schema_var is None:
            logger.warning("skip linking features to artifact in slot 'var'")
    if len(data_parse.obs.columns) > 0:
        logger.info("parsing feature names of slot 'obs'")
        logger.indent = "   "
-
+        schema_obs = Schema.from_df(
            df=data_parse.obs,
            field=obs_field,
            mute=mute,
            organism=organism,
        )
-        if
-
-        logger.save(f"linked: {
+        if schema_obs is not None:
+            _schemas_m2m["obs"] = schema_obs
+            logger.save(f"linked: {schema_obs}")
        logger.indent = ""
-        if
+        if schema_obs is None:
            logger.warning("skip linking features to artifact in slot 'obs'")
-    return
+    return _schemas_m2m
 
 
 def is_valid_datetime_str(date_string: str) -> bool | str:
@@ -598,12 +620,12 @@ def infer_feature_type_convert_json(
            return ("list[cat ? str]", value, message)
        elif first_element_type == Record:
            return (
-                f"list[cat[{first_element_type.
+                f"list[cat[{first_element_type.__get_name_with_module__()}]]",
                value,
                message,
            )
    elif isinstance(value, Record):
-        return (f"cat[{value.__class__.
+        return (f"cat[{value.__class__.__get_name_with_module__()}]", value, message)
    if not mute:
        logger.warning(f"cannot infer feature type of: {value}, returning '?")
    return "?", value, message
@@ -611,7 +633,7 @@ def infer_feature_type_convert_json(
 
 def __init__(self, host: Artifact | Collection | Run):
    self._host = host
-    self.
+    self._schema_by_slot_ = None
    self._accessor_by_registry_ = None
 
 
@@ -628,15 +650,15 @@ def get_values(self) -> dict[str, Any]:
 
 
 def __getitem__(self, slot) -> QuerySet:
-    if slot not in self.
+    if slot not in self._schema_by_slot:
        raise ValueError(
            f"No linked feature set for slot: {slot}\nDid you get validation"
            " warnings? Only features that match registered features get validated"
            " and linked."
        )
-
-    orm_name =
-    return getattr(
+    schema = self._schema_by_slot[slot]
+    orm_name = schema.itype
+    return getattr(schema, self._accessor_by_registry[orm_name]).all()
 
 
 def filter_base(cls, **expression):
@@ -726,11 +748,11 @@ def get(cls, **expression) -> Record:
 
 
 @property  # type: ignore
-def
+def _schema_by_slot(self):
    """Feature sets by slot."""
-    if self.
-        self.
-    return self.
+    if self._schema_by_slot_ is None:
+        self._schema_by_slot_ = get_schema_by_slot_(self._host)
+    return self._schema_by_slot_
 
 
 @property  # type: ignore
@@ -810,11 +832,11 @@ def _add_values(
    model_name = "Param" if is_param else "Feature"
    if is_param:
        if self._host.__class__ == Artifact:
-            if self._host.
+            if self._host.kind != "model":
                raise ValidationError("Can only set params for model-like artifacts.")
    else:
        if self._host.__class__ == Artifact:
-            if self._host.
+            if self._host.kind != "dataset" and self._host.kind is not None:
                raise ValidationError(
                    "Can only set features for dataset-like artifacts."
                )
@@ -890,7 +912,7 @@ def _add_values(
                        raise ValidationError(
                            f"Please save {record} before annotation."
                        )
-                    features_labels[record.__class__.
+                    features_labels[record.__class__.__get_name_with_module__()].append(
                        (feature, record)
                    )
            else:
@@ -934,7 +956,7 @@ def _add_values(
            links = [
                LinkORM(
                    **{
-                        f"{self._host.__class__.
+                        f"{self._host.__class__.__get_name_with_module__().lower()}_id": self._host.id,
                        valuefield_id: feature_value.id,
                    }
                )
@@ -1007,14 +1029,14 @@ def remove_values(
                link_models_on_models = {
                    getattr(
                        Artifact, obj.related_name
-                    ).through.
+                    ).through.__get_name_with_module__(): obj.related_model.__get_name_with_module__()
                    for obj in Artifact._meta.related_objects
-                    if obj.related_model.
+                    if obj.related_model.__get_name_with_module__() == feature_registry
                }
                link_attribute = {
                    obj.related_name
                    for obj in Artifact._meta.related_objects
-                    if obj.related_model.
+                    if obj.related_model.__get_name_with_module__() in link_models_on_models
                }.pop()
            getattr(self._host, link_attribute).filter(**filter_kwargs).all().delete()
        else:
@@ -1027,36 +1049,37 @@ def remove_values(
    # we can clean the FeatureValue registry periodically if we want to
 
 
-def
-    """
+def add_schema(self, schema: Schema, slot: str) -> None:
+    """Annotate artifact with a schema.
 
    Args:
-
-        slot: `str` The slot that marks where the
+        schema: `Schema` A schema record.
+        slot: `str` The slot that marks where the schema is stored in
            the artifact.
    """
+    # TODO: deprecate as soon as we have the Schema-based curators
    if self._host._state.adding:
        raise ValueError(
            "Please save the artifact or collection before adding a feature set!"
        )
    host_db = self._host._state.db
-
+    schema.save(using=host_db)
    host_id_field = get_host_id_field(self._host)
    kwargs = {
        host_id_field: self._host.id,
-        "
+        "schema": schema,
        "slot": slot,
    }
    link_record = (
-        self._host.
+        self._host._schemas_m2m.through.objects.using(host_db)
        .filter(**kwargs)
        .one_or_none()
    )
    if link_record is None:
-        self._host.
-    if slot in self.
+        self._host._schemas_m2m.through(**kwargs).save(using=host_db)
+    if slot in self._schema_by_slot:
        logger.debug(f"replaced existing {slot} feature set")
-    self.
+    self._schema_by_slot_[slot] = schema  # type: ignore
 
 
 def _add_set_from_df(
@@ -1067,18 +1090,18 @@ def _add_set_from_df(
 ):
    """Add feature set corresponding to column names of DataFrame."""
    if isinstance(self._host, Artifact):
-        assert self._host.
+        assert self._host.otype == "DataFrame"  # noqa: S101
    else:
        # Collection
-        assert self._host.artifact.
+        assert self._host.artifact.otype == "DataFrame"  # noqa: S101
    df = self._host.load()
-
+    schema = Schema.from_df(
        df=df,
        field=field,
        mute=mute,
        organism=organism,
    )
-    self._host.
+    self._host._staged__schemas_m2m = {"columns": schema}
    self._host.save()
 
 
@@ -1091,13 +1114,13 @@ def _add_set_from_anndata(
 ):
    """Add features from AnnData."""
    if isinstance(self._host, Artifact):
-        assert self._host.
+        assert self._host.otype == "AnnData"  # noqa: S101
    else:
        raise NotImplementedError()
 
    # parse and register features
    adata = self._host.load()
-
+    _schemas_m2m = parse_staged__schemas_m2m_from_anndata(
        adata,
        var_field=var_field,
        obs_field=obs_field,
@@ -1106,7 +1129,7 @@ def _add_set_from_anndata(
    )
 
    # link feature sets
-    self._host.
+    self._host._staged__schemas_m2m = _schemas_m2m
    self._host.save()
 
 
@@ -1121,18 +1144,18 @@ def _add_set_from_mudata(
    if obs_fields is None:
        obs_fields = {}
    if isinstance(self._host, Artifact):
-        assert self._host.
+        assert self._host.otype == "MuData"  # noqa: S101
    else:
        raise NotImplementedError()
 
    # parse and register features
    mdata = self._host.load()
-
+    _schemas_m2m = {}
    obs_features = Feature.from_values(mdata.obs.columns)
    if len(obs_features) > 0:
-
+        _schemas_m2m["obs"] = Schema(features=obs_features)
    for modality, field in var_fields.items():
-        modality_fs =
+        modality_fs = parse_staged__schemas_m2m_from_anndata(
            mdata[modality],
            var_field=field,
            obs_field=obs_fields.get(modality, Feature.name),
@@ -1140,22 +1163,22 @@ def _add_set_from_mudata(
            organism=organism,
        )
        for k, v in modality_fs.items():
-
+            _schemas_m2m[f"['{modality}'].{k}"] = v
 
-    def
+    def unify_staged__schemas_m2m_by_hash(_schemas_m2m):
        unique_values = {}
 
-        for key, value in
+        for key, value in _schemas_m2m.items():
            value_hash = value.hash  # Assuming each value has a .hash attribute
            if value_hash in unique_values:
-
+                _schemas_m2m[key] = unique_values[value_hash]
            else:
                unique_values[value_hash] = value
 
-        return
+        return _schemas_m2m
 
    # link feature sets
-    self._host.
+    self._host._staged__schemas_m2m = unify_staged__schemas_m2m_by_hash(_schemas_m2m)
    self._host.save()
 
 
@@ -1165,8 +1188,8 @@ def _add_from(self, data: Artifact | Collection, transfer_logs: dict = None):
    if transfer_logs is None:
        transfer_logs = {"mapped": [], "transferred": [], "run": None}
    using_key = settings._using_key
-    for slot,
-        members =
+    for slot, schema in data.features._schema_by_slot.items():
+        members = schema.members
        if len(members) == 0:
            continue
        registry = members[0].__class__
@@ -1202,20 +1225,18 @@ def _add_from(self, data: Artifact | Collection, transfer_logs: dict = None):
        save(new_members)
 
        # create a new feature set from feature values using the same uid
-
-
-        )
-        if feature_set_self is None:
+        schema_self = Schema.from_values(member_uids, field=getattr(registry, field))
+        if schema_self is None:
            if hasattr(registry, "organism_id"):
                logger.warning(
-                    f"
+                    f"Schema is not transferred, check if organism is set correctly: {schema}"
                )
            continue
-        # make sure the uid matches if
-        if
-
-            logger.info(f"saving {slot}
-        self._host.features.
+        # make sure the uid matches if schema is composed of same features
+        if schema_self.hash == schema.hash:
+            schema_self.uid = schema.uid
+        logger.info(f"saving {slot} schema: {schema_self}")
+        self._host.features.add_schema(schema_self, slot)
 
 
 def make_external(self, feature: Feature) -> None:
@@ -1227,8 +1248,8 @@ def make_external(self, feature: Feature) -> None:
    """
    if not isinstance(feature, Feature):
        raise TypeError("feature must be a Feature record!")
-
-    for fs in
+    _schemas_m2m = Schema.filter(features=feature).all()
+    for fs in _schemas_m2m:
        f = Feature.filter(uid=feature.uid).all()
        features_updated = fs.members.difference(f)
        if len(features_updated) > 0:
@@ -1238,13 +1259,14 @@ def make_external(self, feature: Feature) -> None:
            fs.n = len(features_updated)
            fs.save()
            # delete the link between the feature and the feature set
-
-                feature_id=feature.id,
+            Schema.features.through.objects.filter(
+                feature_id=feature.id, schema_id=fs.id
            ).delete()
-        # if no members are left in the
+        # if no members are left in the schema, delete it
        if len(features_updated) == 0:
            logger.warning(f"deleting empty feature set: {fs}")
            fs.artifacts.set([])
+            fs._artifacts_m2m.set([])
            fs.delete()
 
 
@@ -1254,10 +1276,12 @@ FeatureManager.__repr__ = __repr__
 ParamManager.__repr__ = __repr__
 FeatureManager.__getitem__ = __getitem__
 FeatureManager.get_values = get_values
-FeatureManager.
+FeatureManager._schema_by_slot = _schema_by_slot
+FeatureManager._feature_set_by_slot = _schema_by_slot
 FeatureManager._accessor_by_registry = _accessor_by_registry
 FeatureManager.add_values = add_values_features
-FeatureManager.
+FeatureManager.add_schema = add_schema
+FeatureManager.add_feature_set = add_schema  # backward compat, will raise warning soon
 FeatureManager._add_set_from_df = _add_set_from_df
 FeatureManager._add_set_from_anndata = _add_set_from_anndata
 FeatureManager._add_set_from_mudata = _add_set_from_mudata
```