lamindb 1.0.5__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +17 -6
- lamindb/_artifact.py +202 -87
- lamindb/_can_curate.py +27 -8
- lamindb/_collection.py +86 -52
- lamindb/_feature.py +177 -41
- lamindb/_finish.py +21 -7
- lamindb/_from_values.py +83 -98
- lamindb/_parents.py +4 -4
- lamindb/_query_set.py +78 -18
- lamindb/_record.py +170 -53
- lamindb/_run.py +4 -4
- lamindb/_save.py +42 -11
- lamindb/_schema.py +135 -38
- lamindb/_storage.py +1 -1
- lamindb/_tracked.py +129 -0
- lamindb/_transform.py +21 -8
- lamindb/_ulabel.py +5 -14
- lamindb/base/users.py +1 -4
- lamindb/base/validation.py +2 -6
- lamindb/core/__init__.py +13 -14
- lamindb/core/_context.py +14 -9
- lamindb/core/_data.py +29 -25
- lamindb/core/_describe.py +1 -1
- lamindb/core/_django.py +1 -1
- lamindb/core/_feature_manager.py +53 -43
- lamindb/core/_label_manager.py +4 -4
- lamindb/core/_mapped_collection.py +24 -9
- lamindb/core/_track_environment.py +2 -1
- lamindb/core/datasets/__init__.py +6 -1
- lamindb/core/datasets/_core.py +12 -11
- lamindb/core/datasets/_small.py +67 -21
- lamindb/core/exceptions.py +1 -90
- lamindb/core/loaders.py +21 -15
- lamindb/core/relations.py +6 -4
- lamindb/core/storage/_anndata_accessor.py +49 -3
- lamindb/core/storage/_backed_access.py +12 -7
- lamindb/core/storage/_pyarrow_dataset.py +40 -15
- lamindb/core/storage/_tiledbsoma.py +56 -12
- lamindb/core/storage/paths.py +30 -24
- lamindb/core/subsettings/_creation_settings.py +4 -16
- lamindb/curators/__init__.py +2193 -846
- lamindb/curators/_cellxgene_schemas/__init__.py +26 -0
- lamindb/curators/_cellxgene_schemas/schema_versions.yml +104 -0
- lamindb/errors.py +96 -0
- lamindb/integrations/_vitessce.py +3 -3
- lamindb/migrations/0069_squashed.py +76 -75
- lamindb/migrations/0075_lamindbv1_part5.py +4 -5
- lamindb/migrations/0082_alter_feature_dtype.py +21 -0
- lamindb/migrations/0083_alter_feature_is_type_alter_flextable_is_type_and_more.py +94 -0
- lamindb/migrations/0084_alter_schemafeature_feature_and_more.py +35 -0
- lamindb/migrations/0085_alter_feature_is_type_alter_flextable_is_type_and_more.py +63 -0
- lamindb/migrations/0086_various.py +95 -0
- lamindb/migrations/0087_rename__schemas_m2m_artifact_feature_sets_and_more.py +41 -0
- lamindb/migrations/0088_schema_components.py +273 -0
- lamindb/migrations/0088_squashed.py +4372 -0
- lamindb/models.py +475 -168
- {lamindb-1.0.5.dist-info → lamindb-1.1.1.dist-info}/METADATA +9 -7
- lamindb-1.1.1.dist-info/RECORD +95 -0
- lamindb/curators/_spatial.py +0 -528
- lamindb/migrations/0052_squashed.py +0 -1261
- lamindb/migrations/0053_alter_featureset_hash_alter_paramvalue_created_by_and_more.py +0 -57
- lamindb/migrations/0054_alter_feature_previous_runs_and_more.py +0 -35
- lamindb/migrations/0055_artifact_type_artifactparamvalue_and_more.py +0 -61
- lamindb/migrations/0056_rename_ulabel_ref_is_name_artifactulabel_label_ref_is_name_and_more.py +0 -22
- lamindb/migrations/0057_link_models_latest_report_and_others.py +0 -356
- lamindb/migrations/0058_artifact__actions_collection__actions.py +0 -22
- lamindb/migrations/0059_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -31
- lamindb/migrations/0060_alter_artifact__actions.py +0 -22
- lamindb/migrations/0061_alter_collection_meta_artifact_alter_run_environment_and_more.py +0 -45
- lamindb/migrations/0062_add_is_latest_field.py +0 -32
- lamindb/migrations/0063_populate_latest_field.py +0 -45
- lamindb/migrations/0064_alter_artifact_version_alter_collection_version_and_more.py +0 -33
- lamindb/migrations/0065_remove_collection_feature_sets_and_more.py +0 -22
- lamindb/migrations/0066_alter_artifact__feature_values_and_more.py +0 -352
- lamindb/migrations/0067_alter_featurevalue_unique_together_and_more.py +0 -20
- lamindb/migrations/0068_alter_artifactulabel_unique_together_and_more.py +0 -20
- lamindb/migrations/0069_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -1294
- lamindb-1.0.5.dist-info/RECORD +0 -102
- {lamindb-1.0.5.dist-info → lamindb-1.1.1.dist-info}/LICENSE +0 -0
- {lamindb-1.0.5.dist-info → lamindb-1.1.1.dist-info}/WHEEL +0 -0
lamindb/core/_data.py
CHANGED

@@ -21,6 +21,8 @@ from lamindb.models import (
     record_repr,
 )
 
+from .._tracked import get_current_tracked_run
+from ..errors import ValidationError
 from ._context import context
 from ._django import get_artifact_with_related, get_related_model
 from ._feature_manager import (
@@ -28,7 +30,6 @@ from ._feature_manager import (
     get_host_id_field,
     get_label_links,
 )
-from .exceptions import ValidationError
 from .relations import (
     dict_module_name_to_model_name,
     dict_related_model_to_related_name,
@@ -45,9 +46,12 @@ WARNING_RUN_TRANSFORM = "no run & transform got linked, call `ln.track()` & re-run"
 WARNING_NO_INPUT = "run input wasn't tracked, call `ln.track()` and re-run"
 
 
+# also see current_run() in core._data
 def get_run(run: Run | None) -> Run | None:
     if run is None:
-        run = context.run
+        run = get_current_tracked_run()
+        if run is None:
+            run = context.run
     if run is None and not settings.creation.artifact_silence_missing_run_warning:
         logger.warning(WARNING_RUN_TRANSFORM)
     # suppress run by passing False
@@ -56,26 +60,26 @@ def get_run(run: Run | None) -> Run | None:
     return run
 
 
-def save_staged__schemas_m2m(self: Artifact | Collection) -> None:
-    if hasattr(self, "_staged__schemas_m2m"):
+def save_staged_feature_sets(self: Artifact | Collection) -> None:
+    if hasattr(self, "_staged_feature_sets"):
         from lamindb.core._feature_manager import get_schema_by_slot_
 
-        existing_staged__schemas_m2m = get_schema_by_slot_(self)
-        saved_staged__schemas_m2m = {}
-        for key, schema in self._staged__schemas_m2m.items():
+        existing_staged_feature_sets = get_schema_by_slot_(self)
+        saved_staged_feature_sets = {}
+        for key, schema in self._staged_feature_sets.items():
             if isinstance(schema, Schema) and schema._state.adding:
                 schema.save()
-                saved_staged__schemas_m2m[key] = schema
-            if key in existing_staged__schemas_m2m:
+                saved_staged_feature_sets[key] = schema
+            if key in existing_staged_feature_sets:
                 # remove existing feature set on the same slot
-                self._schemas_m2m.remove(existing_staged__schemas_m2m[key])
-        if len(saved_staged__schemas_m2m) > 0:
-            s = "s" if len(saved_staged__schemas_m2m) > 1 else ""
+                self.feature_sets.remove(existing_staged_feature_sets[key])
+        if len(saved_staged_feature_sets) > 0:
+            s = "s" if len(saved_staged_feature_sets) > 1 else ""
             display_schema_keys = ",".join(
-                f"'{key}'" for key in saved_staged__schemas_m2m.keys()
+                f"'{key}'" for key in saved_staged_feature_sets.keys()
             )
             logger.save(
-                f"saved {len(saved_staged__schemas_m2m)} feature set{s} for slot{s}:"
+                f"saved {len(saved_staged_feature_sets)} feature set{s} for slot{s}:"
                 f" {display_schema_keys}"
             )
@@ -84,16 +88,16 @@ def save_schema_links(self: Artifact | Collection) -> None:
     from lamindb._save import bulk_create
 
     Data = self.__class__
-    if hasattr(self, "_staged__schemas_m2m"):
+    if hasattr(self, "_staged_feature_sets"):
         links = []
         host_id_field = get_host_id_field(self)
-        for slot, schema in self._staged__schemas_m2m.items():
+        for slot, schema in self._staged_feature_sets.items():
            kwargs = {
                host_id_field: self.id,
                "schema_id": schema.id,
                "slot": slot,
            }
-            links.append(Data._schemas_m2m.through(**kwargs))
+            links.append(Data.feature_sets.through(**kwargs))
        bulk_create(links, ignore_conflicts=True)
 
 
@@ -182,7 +186,7 @@ def _describe_sqlite(self: Artifact | Collection, print_types: bool = False):
     if isinstance(self, (Collection, Artifact)):
         many_to_many_fields.append("input_of_runs")
     if isinstance(self, Artifact):
-        many_to_many_fields.append("_schemas_m2m")
+        many_to_many_fields.append("feature_sets")
     self = (
         self.__class__.objects.using(self._state.db)
         .prefetch_related(*many_to_many_fields)
@@ -335,10 +339,10 @@ def add_labels(
     else:
         validate_feature(feature, records)  # type:ignore
         records_by_registry = defaultdict(list)
-        _schemas_m2m = self._schemas_m2m.filter(itype="Feature").all()
+        feature_sets = self.feature_sets.filter(itype="Feature").all()
         internal_features = set()  # type: ignore
-        if len(_schemas_m2m) > 0:
-            for schema in _schemas_m2m:
+        if len(feature_sets) > 0:
+            for schema in feature_sets:
                 internal_features = internal_features.union(
                     set(schema.members.values_list("name", flat=True))
                 )  # type: ignore
@@ -357,7 +361,7 @@ def add_labels(
                 f"Feature {feature.name} needs dtype='cat' for label annotation, currently has dtype='{feature.dtype}'"
             )
         if feature.dtype == "cat":
-            feature.dtype = f"cat[{registry_name}]"
+            feature.dtype = f"cat[{registry_name}]"  # type: ignore
             feature.save()
         elif registry_name not in feature.dtype:
             new_dtype = feature.dtype.rstrip("]") + f"|{registry_name}]"
@@ -386,13 +390,13 @@ def _track_run_input(
     is_run_input: bool | Run | None = None,
     run: Run | None = None,
 ):
-    # this is an internal hack right now for project-flow, but we can allow this
-    # for the user in the future
     if isinstance(is_run_input, Run):
         run = is_run_input
         is_run_input = True
     elif run is None:
-        run = context.run
+        run = get_current_tracked_run()
+        if run is None:
+            run = context.run
     # consider that data is an iterable of Data
     data_iter: Iterable[Artifact] | Iterable[Collection] = (
         [data] if isinstance(data, (Artifact, Collection)) else data
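Note on the hunks above: the two new `get_current_tracked_run()` call sites hook `get_run` and `_track_run_input` into the new `lamindb/_tracked.py` module (+129 lines in this release), so a run opened by a tracked function takes precedence over the global `context.run`. A minimal sketch of that resolution order, assuming `_tracked` keeps the active run in a `contextvars.ContextVar` (the variable name is an assumption):

# sketch, not part of the diff: run resolution as wired up above;
# assumes _tracked stores the innermost tracked run in a ContextVar
from contextvars import ContextVar

_current_tracked_run: ContextVar = ContextVar("current_tracked_run", default=None)

def get_current_tracked_run():
    # run of the innermost tracked function, if any
    return _current_tracked_run.get()

def resolve_run(run=None, context_run=None):
    # mirrors the new get_run(): explicit argument > tracked run > context.run
    if run is None:
        run = get_current_tracked_run()
        if run is None:
            run = context_run
    return run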
lamindb/core/_describe.py
CHANGED

@@ -76,7 +76,7 @@ def describe_header(self: Artifact | Collection | Run) -> Tree:
     if self._branch_code == 0:
         logger.warning("This artifact is hidden.")
     elif self._branch_code == -1:
-        logger.warning("This artifact is the trash.")
+        logger.warning("This artifact is in the trash.")
     # initialize tree
     suffix = self.suffix if hasattr(self, "suffix") and self.suffix else ""
     accessor = self.otype if hasattr(self, "otype") and self.otype else ""
lamindb/core/_django.py
CHANGED

@@ -105,7 +105,7 @@ def get_artifact_with_related(
 
     if include_schema:
         annotations["schemas"] = Subquery(
-            model._schemas_m2m.through.objects.filter(artifact=OuterRef("pk"))
+            model.feature_sets.through.objects.filter(artifact=OuterRef("pk"))
             .annotate(
                 data=JSONObject(
                     id=F("id"),
lamindb/core/_feature_manager.py
CHANGED

@@ -33,8 +33,8 @@ from lamindb._record import (
 )
 from lamindb._save import save
 from lamindb._schema import DICT_KEYS_TYPE, Schema
-from lamindb.core.exceptions import DoesNotExist, ValidationError
 from lamindb.core.storage import LocalPathClasses
+from lamindb.errors import DoesNotExist, ValidationError
 from lamindb.models import (
     Artifact,
     Collection,
@@ -96,8 +96,8 @@ def get_schema_by_slot_(host: Artifact | Collection) -> dict:
         return {}
     # if the host is not yet saved
     if host._state.adding:
-        if hasattr(host, "_staged__schemas_m2m"):
-            return host._staged__schemas_m2m
+        if hasattr(host, "_staged_feature_sets"):
+            return host._staged_feature_sets
         else:
             return {}
     host_db = host._state.db
@@ -105,7 +105,7 @@ def get_schema_by_slot_(host: Artifact | Collection) -> dict:
     kwargs = {host_id_field: host.id}
     # otherwise, we need a query
     links_schema = (
-        host._schemas_m2m.through.objects.using(host_db)
+        host.feature_sets.through.objects.using(host_db)
         .filter(**kwargs)
         .select_related("schema")
     )
@@ -118,7 +118,7 @@ def get_label_links(
     host_id_field = get_host_id_field(host)
     kwargs = {host_id_field: host.id, "feature_id": feature.id}
     link_records = (
-        getattr(host, host.features._accessor_by_registry[registry])
+        getattr(host, host.features._accessor_by_registry[registry])  # type: ignore
         .through.objects.using(host._state.db)
         .filter(**kwargs)
     )
@@ -128,14 +128,14 @@
 def get_schema_links(host: Artifact | Collection) -> QuerySet:
     host_id_field = get_host_id_field(host)
     kwargs = {host_id_field: host.id}
-    links_schema = host._schemas_m2m.through.objects.filter(**kwargs)
+    links_schema = host.feature_sets.through.objects.filter(**kwargs)
     return links_schema
 
 
 def get_link_attr(link: LinkORM | type[LinkORM], data: Artifact | Collection) -> str:
     link_model_name = link.__class__.__name__
     if link_model_name in {"Registry", "ModelBase"}:  # we passed the type of the link
-        link_model_name = link.__name__
+        link_model_name = link.__name__  # type: ignore
     return link_model_name.replace(data.__class__.__name__, "").lower()
@@ -348,10 +348,10 @@ def describe_features(
 
     internal_feature_names: dict[str, str] = {}
     if isinstance(self, Artifact):
-        _schemas_m2m = self._schemas_m2m.filter(itype="Feature").all()
+        feature_sets = self.feature_sets.filter(itype="Feature").all()
         internal_feature_names = {}
-        if len(_schemas_m2m) > 0:
-            for schema in _schemas_m2m:
+        if len(feature_sets) > 0:
+            for schema in feature_sets:
                 internal_feature_names.update(
                     dict(schema.members.values_list("name", "dtype"))
                 )
@@ -500,7 +500,7 @@
     return tree
 
 
-def parse_staged__schemas_m2m_from_anndata(
+def parse_staged_feature_sets_from_anndata(
     adata: AnnData,
     var_field: FieldAttr | None = None,
     obs_field: FieldAttr = Feature.name,
@@ -524,7 +524,7 @@ def parse_staged__schemas_m2m_from_anndata(
         if adata.X is None
         else convert_pandas_dtype_to_lamin_dtype(adata.X.dtype)
     )
-    _schemas_m2m = {}
+    feature_sets = {}
     if var_field is not None:
         logger.info("parsing feature names of X stored in slot 'var'")
         logger.indent = "   "
@@ -537,7 +537,7 @@
             raise_validation_error=False,
         )
         if schema_var is not None:
-            _schemas_m2m["var"] = schema_var
+            feature_sets["var"] = schema_var
             logger.save(f"linked: {schema_var}")
         logger.indent = ""
         if schema_var is None:
@@ -552,12 +552,12 @@
             organism=organism,
         )
         if schema_obs is not None:
-            _schemas_m2m["obs"] = schema_obs
+            feature_sets["obs"] = schema_obs
             logger.save(f"linked: {schema_obs}")
         logger.indent = ""
         if schema_obs is None:
             logger.warning("skip linking features to artifact in slot 'obs'")
-    return _schemas_m2m
+    return feature_sets
 
 
 def is_valid_datetime_str(date_string: str) -> bool | str:
@@ -818,6 +818,8 @@ def _add_values(
         feature_param_field: The field of a reference registry to map keys of the
             dictionary.
     """
+    from .._tracked import get_current_tracked_run
+
     # rename to distinguish from the values inside the dict
     features_values = values
     keys = features_values.keys()
@@ -849,12 +851,20 @@ def _add_values(
            (key, infer_feature_type_convert_json(key, features_values[key]))
            for key in not_validated_keys
        ]
-        hint = "\n".join(
-            [
-                f"  ln.{model_name}(name='{key}', dtype='{dtype}').save(){message}"
-                for key, (dtype, _, message) in not_validated_keys_dtype_message
-            ]
-        )
+        run = get_current_tracked_run()
+        if run is not None:
+            name = f"{run.transform.type}[{run.transform.key}]"
+            type_hint = f"""  {model_name.lower()}_type = ln.{model_name}(name='{name}', is_type=True).save()"""
+            elements = [type_hint]
+            type_kwarg = f", type={model_name.lower()}_type"
+        else:
+            elements = []
+            type_kwarg = ""
+        elements += [
+            f"  ln.{model_name}(name='{key}', dtype='{dtype}'{type_kwarg}).save(){message}"
+            for key, (dtype, _, message) in not_validated_keys_dtype_message
+        ]
+        hint = "\n".join(elements)
        msg = (
            f"These keys could not be validated: {not_validated_keys.tolist()}\n"
            f"Here is how to create a {model_name.lower()}:\n\n{hint}"
@@ -928,7 +938,7 @@ def _add_values(
            validated_values = values_array[validated]
            if validated.sum() != len(values):
                not_validated_values += values_array[~validated].tolist()
-            label_records = ULabel.from_values(validated_values, field="name")
+            label_records = ULabel.from_values(validated_values, field="name")  # type: ignore
            features_labels["ULabel"] += [
                (feature, label_record) for label_record in label_records
            ]
@@ -1012,8 +1022,8 @@ def remove_values(
    if isinstance(feature, str):
        feature = Feature.get(name=feature)
    filter_kwargs = {"feature": feature}
-    if feature.dtype.startswith("cat["):
-        feature_registry = feature.dtype.replace("cat[", "").replace("]", "")
+    if feature.dtype.startswith("cat["):  # type: ignore
+        feature_registry = feature.dtype.replace("cat[", "").replace("]", "")  # type: ignore
        if value is not None:
            assert isinstance(value, Record)  # noqa: S101
            # the below uses our convention for field names in link models
@@ -1071,12 +1081,12 @@ def add_schema(self, schema: Schema, slot: str) -> None:
        "slot": slot,
    }
    link_record = (
-        self._host._schemas_m2m.through.objects.using(host_db)
+        self._host.feature_sets.through.objects.using(host_db)
        .filter(**kwargs)
        .one_or_none()
    )
    if link_record is None:
-        self._host._schemas_m2m.through(**kwargs).save(using=host_db)
+        self._host.feature_sets.through(**kwargs).save(using=host_db)
    if slot in self._schema_by_slot:
        logger.debug(f"replaced existing {slot} feature set")
    self._schema_by_slot_[slot] = schema  # type: ignore
@@ -1101,7 +1111,7 @@ def _add_set_from_df(
        mute=mute,
        organism=organism,
    )
-    self._host._staged__schemas_m2m = {"columns": schema}
+    self._host._staged_feature_sets = {"columns": schema}
    self._host.save()
 
 
@@ -1120,7 +1130,7 @@ def _add_set_from_anndata(
 
    # parse and register features
    adata = self._host.load()
-    _schemas_m2m = parse_staged__schemas_m2m_from_anndata(
+    feature_sets = parse_staged_feature_sets_from_anndata(
        adata,
        var_field=var_field,
        obs_field=obs_field,
@@ -1129,7 +1139,7 @@
    )
 
    # link feature sets
-    self._host._staged__schemas_m2m = _schemas_m2m
+    self._host._staged_feature_sets = feature_sets
    self._host.save()
 
 
@@ -1150,12 +1160,12 @@ def _add_set_from_mudata(
 
    # parse and register features
    mdata = self._host.load()
-    _schemas_m2m = {}
-    obs_features = Feature.from_values(mdata.obs.columns)
+    feature_sets = {}
+    obs_features = Feature.from_values(mdata.obs.columns)  # type: ignore
    if len(obs_features) > 0:
-        _schemas_m2m["obs"] = Schema(features=obs_features)
+        feature_sets["obs"] = Schema(features=obs_features)
    for modality, field in var_fields.items():
-        modality_fs = parse_staged__schemas_m2m_from_anndata(
+        modality_fs = parse_staged_feature_sets_from_anndata(
            mdata[modality],
            var_field=field,
            obs_field=obs_fields.get(modality, Feature.name),
@@ -1163,22 +1173,22 @@ def _add_set_from_mudata(
            organism=organism,
        )
        for k, v in modality_fs.items():
-            _schemas_m2m[f"['{modality}'].{k}"] = v
+            feature_sets[f"['{modality}'].{k}"] = v
 
-    def unify_staged__schemas_m2m_by_hash(_schemas_m2m):
+    def unify_staged_feature_sets_by_hash(feature_sets):
        unique_values = {}
 
-        for key, value in _schemas_m2m.items():
+        for key, value in feature_sets.items():
            value_hash = value.hash  # Assuming each value has a .hash attribute
            if value_hash in unique_values:
-                _schemas_m2m[key] = unique_values[value_hash]
+                feature_sets[key] = unique_values[value_hash]
            else:
                unique_values[value_hash] = value
 
-        return _schemas_m2m
+        return feature_sets
 
    # link feature sets
-    self._host._staged__schemas_m2m = unify_staged__schemas_m2m_by_hash(_schemas_m2m)
+    self._host._staged_feature_sets = unify_staged_feature_sets_by_hash(feature_sets)
    self._host.save()
 
 
@@ -1188,7 +1198,7 @@ def _add_from(self, data: Artifact | Collection, transfer_logs: dict = None):
    if transfer_logs is None:
        transfer_logs = {"mapped": [], "transferred": [], "run": None}
    using_key = settings._using_key
-    for slot, schema in data.features._schema_by_slot.items():
+    for slot, schema in data.features._schema_by_slot.items():  # type: ignore
        members = schema.members
        if len(members) == 0:
            continue
@@ -1248,8 +1258,8 @@ def make_external(self, feature: Feature) -> None:
    """
    if not isinstance(feature, Feature):
        raise TypeError("feature must be a Feature record!")
-    _schemas_m2m = Schema.filter(features=feature).all()
-    for fs in _schemas_m2m:
+    feature_sets = Schema.filter(features=feature).all()
+    for fs in feature_sets:
        f = Feature.filter(uid=feature.uid).all()
        features_updated = fs.members.difference(f)
        if len(features_updated) > 0:
@@ -1266,10 +1276,10 @@ def make_external(self, feature: Feature) -> None:
        if len(features_updated) == 0:
            logger.warning(f"deleting empty feature set: {fs}")
            fs.artifacts.set([])
-            fs._artifacts_m2m.set([])
            fs.delete()
 
 
+# mypy: ignore-errors
 FeatureManager.__init__ = __init__
 ParamManager.__init__ = __init__
 FeatureManager.__repr__ = __repr__
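The reworked hint in `_add_values` above now suggests registering a type record first when code runs inside a tracked transform, passing it via the new `type=` keyword. A runnable sketch of the text the new hint renders (the transform type, key, and failing feature name below are made-up values):

# sketch, not part of the diff: what the improved validation hint prints;
# transform type/key and the feature name are assumed values
model_name = "Feature"
transform_type, transform_key = "script", "analysis.py"  # assumed
name = f"{transform_type}[{transform_key}]"
elements = [f"  {model_name.lower()}_type = ln.{model_name}(name='{name}', is_type=True).save()"]
type_kwarg = f", type={model_name.lower()}_type"
elements += [f"  ln.{model_name}(name='temperature', dtype='float'{type_kwarg}).save()"]
print("\n".join(elements))
# prints:
#   feature_type = ln.Feature(name='script[analysis.py]', is_type=True).save()
#   ln.Feature(name='temperature', dtype='float', type=feature_type).save()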
lamindb/core/_label_manager.py
CHANGED

@@ -35,7 +35,7 @@ if TYPE_CHECKING:
     from lamindb._query_set import QuerySet
     from lamindb.models import Artifact, Collection, Record
 
-EXCLUDE_LABELS = {"_schemas_m2m"}
+EXCLUDE_LABELS = {"feature_sets"}
 
 
 def _get_labels(
@@ -106,7 +106,7 @@ def describe_labels(
         pad_edge=False,
     )
     for related_name, labels in labels_data.items():
-        if not labels or related_name == "_schemas_m2m":
+        if not labels or related_name == "feature_sets":
             continue
         if isinstance(labels, dict):  # postgres, labels are a dict[id, name]
             print_values = _format_values(labels.values(), n=10, quotes=False)
@@ -286,12 +286,12 @@ class LabelManager:
             )
             for feature in new_features:
                 transfer_to_default_db(
-                    feature,
+                    feature,  # type: ignore
                     using_key,
                     transfer_logs=transfer_logs,
                     transfer_fk=False,
                 )
-            save(new_features)
+            save(new_features)  # type: ignore
         if hasattr(self._host, related_name):
             for feature_name, feature_labels in labels_by_features.items():
                 if feature_name is not None:
lamindb/core/_mapped_collection.py
CHANGED

@@ -27,7 +27,8 @@ if TYPE_CHECKING:
 class _Connect:
     def __init__(self, storage):
         if isinstance(storage, UPath):
-            self.conn, self.store = registry.open("h5py", storage)
+            # force no external compression even for files with .gz extension. REMOVE LATER
+            self.conn, self.store = registry.open("h5py", storage, compression=None)
             self.to_close = True
         else:
             self.conn, self.store = None, storage
@@ -87,7 +88,7 @@ class MappedCollection:
         obs_keys: Keys from the ``.obs`` slots.
         obs_filter: Select only observations with these values for the given obs columns.
             Should be a dictionary with obs column names as keys
-            and filtering values (a string or a tuple of strings) as values.
+            and filtering values (a string or a list of strings) as values.
         join: `"inner"` or `"outer"` virtual joins. If ``None`` is passed,
             does not join.
         encode_labels: Encode labels into integers.
@@ -106,7 +107,7 @@ class MappedCollection:
         layers_keys: str | list[str] | None = None,
         obs_keys: str | list[str] | None = None,
         obsm_keys: str | list[str] | None = None,
-        obs_filter: dict[str, str | tuple[str, ...]] | None = None,
+        obs_filter: dict[str, str | list[str]] | None = None,
         join: Literal["inner", "outer"] | None = "inner",
         encode_labels: bool | list[str] = True,
         unknown_label: str | dict[str, str] | None = None,
@@ -184,9 +185,14 @@ class MappedCollection:
         if self.filtered:
             indices_storage_mask = None
             for obs_filter_key, obs_filter_values in obs_filter.items():
-                obs_filter_mask = np.isin(
-                    self._get_labels(store, obs_filter_key), obs_filter_values
-                )
+                if isinstance(obs_filter_values, tuple):
+                    obs_filter_values = list(obs_filter_values)
+                elif not isinstance(obs_filter_values, list):
+                    obs_filter_values = [obs_filter_values]
+                obs_labels = self._get_labels(store, obs_filter_key)
+                obs_filter_mask = np.isin(obs_labels, obs_filter_values)
+                if pd.isna(obs_filter_values).any():
+                    obs_filter_mask |= pd.isna(obs_labels)
                 if indices_storage_mask is None:
                     indices_storage_mask = obs_filter_mask
                 else:
@@ -241,7 +247,8 @@ class MappedCollection:
             if parallel:
                 conn, storage = None, path
             else:
-                conn, storage = registry.open("h5py", path)
+                # force no external compression even for files with .gz extension. REMOVE LATER
+                conn, storage = registry.open("h5py", path, compression=None)
         else:
             conn, storage = registry.open("zarr", path)
         self.conns.append(conn)
@@ -296,7 +303,7 @@ class MappedCollection:
             self.var_joint = reduce(pd.Index.intersection, self.var_list)
             if len(self.var_joint) == 0:
                 raise ValueError(
-                    "The provided AnnData objects don't have shared varibales.\n"
+                    "The provided AnnData objects don't have shared variables.\n"
                     "Use join='outer'."
                 )
             self.var_indices = [
@@ -389,7 +396,7 @@ class MappedCollection:
             else:
                 cats = None
             label_idx = self._get_obs_idx(store, obs_idx, label, cats)
-            if label in self.encoders:
+            if label in self.encoders and label_idx is not np.nan:
                 label_idx = self.encoders[label][label_idx]
             out[label] = label_idx
         return out
@@ -453,6 +460,8 @@ class MappedCollection:
             label = labels[idx]
         else:
             label = labels["codes"][idx]
+            if label == -1:
+                return np.nan
         if categories is not None:
             cats = categories
         else:
@@ -589,7 +598,13 @@ class MappedCollection:
         cats = self._get_categories(storage, label_key)
         if cats is not None:
             cats = _decode(cats) if isinstance(cats[0], bytes) else cats
+            # NaN is coded as -1
+            nans = labels == -1
             labels = cats[labels]
+            # detect and replace nans
+            if nans.any():
+                labels[nans] = np.nan
+
         return labels
 
     def close(self):
lamindb/core/_track_environment.py
CHANGED

@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import subprocess
+import sys
 from typing import TYPE_CHECKING
 
 import lamindb_setup as ln_setup
@@ -17,7 +18,7 @@ def track_environment(run: Run) -> None:
     try:
         with open(filepath, "w") as f:
             result = subprocess.run(
-                ["pip", "freeze"],
+                [sys.executable, "-m", "pip", "freeze"],
                 stdout=f,
             )
     except OSError as e:
lamindb/core/datasets/__init__.py
CHANGED

@@ -85,4 +85,9 @@ from ._core import (
     schmidt22_perturbseq,
 )
 from ._fake import fake_bio_notebook_titles
-from ._small import anndata_with_obs, small_dataset1, small_dataset2
+from ._small import (
+    anndata_with_obs,
+    small_dataset1,
+    small_dataset2,
+    small_dataset3_cellxgene,
+)
lamindb/core/datasets/_core.py
CHANGED

@@ -18,7 +18,8 @@ if TYPE_CHECKING:
 def file_fcs() -> Path:
     """Example FCS artifact."""
     filepath, _ = urlretrieve(
-        "https://lamindb-
+        "https://lamindb-dev-datasets.s3.amazonaws.com/.lamindb/DBNEczSgBui0bbzBXMGH.fcs",
+        "example.fcs",
     )
     return Path(filepath)
 
@@ -48,8 +49,8 @@ def file_fcs_alpert19(populate_registries: bool = False) -> Path:  # pragma: no cover
                 bt.CellMarker.public().inspect(std, "name").validated, "name"
             )
         )
-        ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
-        ln.Feature(name="organism", dtype=[bt.Organism]).save()
+        ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()  # type: ignore
+        ln.Feature(name="organism", dtype=[bt.Organism]).save()  # type: ignore
         ln.settings.verbosity = verbosity
     return Path(filepath)
 
@@ -84,8 +85,8 @@ def file_tsv_rnaseq_nfcore_salmon_merged_gene_counts(
 
         verbosity = ln.settings.verbosity
         ln.settings.verbosity = "error"
-        ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
-        ln.Feature(name="organism", dtype=[bt.Organism]).save()
+        ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()  # type: ignore
+        ln.Feature(name="organism", dtype=[bt.Organism]).save()  # type: ignore
         bt.ExperimentalFactor.from_source(ontology_id="EFO:0008896").save()
         ln.settings.verbosity = verbosity
 
@@ -207,7 +208,7 @@ def anndata_mouse_sc_lymph_node(
         # cell types
         ln.save(bt.CellType.from_values(["CL:0000115", "CL:0000738"], "ontology_id"))
         # assays
-        ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
+        ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()  # type: ignore
         bt.ExperimentalFactor.from_source(ontology_id="EFO:0008913").save()
         # genes
         validated = bt.Gene.public(organism="mouse").validate(
@@ -330,11 +331,11 @@ def anndata_human_immune_cells(
         ln.save(bt.CellType.from_values(adata.obs.cell_type, field="name"))
         ln.save(bt.ExperimentalFactor.from_values(adata.obs.assay, field="name"))
         ln.save(bt.Tissue.from_values(adata.obs.tissue, field="name"))
-        ln.Feature(name="cell_type", dtype=[bt.CellType]).save()
-        ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
-        ln.Feature(name="tissue", dtype=[bt.Tissue]).save()
-        ln.Feature(name="organism", dtype=[bt.Organism]).save()
-        ln.Feature(name="donor", dtype=[ln.ULabel]).save()
+        ln.Feature(name="cell_type", dtype=[bt.CellType]).save()  # type: ignore
+        ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()  # type: ignore
+        ln.Feature(name="tissue", dtype=[bt.Tissue]).save()  # type: ignore
+        ln.Feature(name="organism", dtype=[bt.Organism]).save()  # type: ignore
+        ln.Feature(name="donor", dtype=[ln.ULabel]).save()  # type: ignore
        bt.ExperimentalFactor.from_source(ontology_id="EFO:0008913").save()
        ln.save([ln.ULabel(name=name) for name in adata.obs.donor.unique()])
        ln.settings.verbosity = verbosity
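In `file_fcs` above, the download now passes an explicit second argument to `urlretrieve`, which pins the local filename instead of letting `urlretrieve` pick a random temporary path. The relevant behavior in isolation (placeholder URL, not the dataset's real location):

# sketch, not part of the diff: urlretrieve's second argument pins the local filename
from urllib.request import urlretrieve

filepath, _ = urlretrieve(
    "https://example.com/data.fcs",  # placeholder URL
    "example.fcs",                   # without this, a random temp path is used
)
print(filepath)  # example.fcs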