lamindb 1.2a2__py3-none-any.whl → 1.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +3 -1
- lamindb/_view.py +2 -2
- lamindb/base/types.py +50 -11
- lamindb/core/_compat.py +60 -0
- lamindb/core/_context.py +15 -12
- lamindb/core/datasets/__init__.py +1 -0
- lamindb/core/datasets/_core.py +23 -0
- lamindb/core/datasets/_small.py +16 -2
- lamindb/core/loaders.py +22 -12
- lamindb/core/storage/_tiledbsoma.py +2 -2
- lamindb/core/storage/_zarr.py +84 -26
- lamindb/core/storage/objects.py +45 -44
- lamindb/core/types.py +11 -1
- lamindb/curators/__init__.py +1430 -1665
- lamindb/curators/_cellxgene_schemas/__init__.py +190 -18
- lamindb/curators/_cellxgene_schemas/schema_versions.csv +43 -0
- lamindb/models/_feature_manager.py +86 -42
- lamindb/models/_from_values.py +110 -119
- lamindb/models/_label_manager.py +17 -10
- lamindb/models/artifact.py +170 -102
- lamindb/models/can_curate.py +200 -231
- lamindb/models/feature.py +76 -47
- lamindb/models/project.py +69 -7
- lamindb/models/query_set.py +12 -2
- lamindb/models/record.py +77 -50
- lamindb/models/run.py +20 -7
- lamindb/models/schema.py +7 -15
- {lamindb-1.2a2.dist-info → lamindb-1.3.1.dist-info}/METADATA +8 -7
- {lamindb-1.2a2.dist-info → lamindb-1.3.1.dist-info}/RECORD +31 -30
- lamindb/curators/_cellxgene_schemas/schema_versions.yml +0 -104
- {lamindb-1.2a2.dist-info → lamindb-1.3.1.dist-info}/LICENSE +0 -0
- {lamindb-1.2a2.dist-info → lamindb-1.3.1.dist-info}/WHEEL +0 -0
@@ -1,26 +1,198 @@
|
|
1
|
-
from pathlib import Path
|
2
|
-
|
3
1
|
import pandas as pd
|
4
|
-
|
2
|
+
from lamin_utils import logger
|
3
|
+
from lamindb_setup.core.upath import UPath
|
4
|
+
|
5
|
+
from lamindb.base.types import FieldAttr
|
6
|
+
from lamindb.models import Record, ULabel
|
7
|
+
from lamindb.models._from_values import _format_values
|
8
|
+
|
9
|
+
# Set of names considered reserved.
# NOTE(review): presumably these are obs/uns keys that the CELLxGENE schema
# reserves and that user data must not define — confirm against the schema
# documentation and the callers of this constant.
RESERVED_NAMES = {
    "ethnicity",
    "ethnicity_ontology_term_id",
    "X_normalization",
    "default_field",
    "layer_descriptions",
    "tags",
    "versions",
    "contributors",
    "preprint_doi",
    "project_description",
    "project_links",
    "project_name",
    "publication_doi",
}
|
24
|
+
|
25
|
+
|
26
|
+
def _get_cxg_categoricals() -> dict[str, FieldAttr]:
    """Return the mapping from CellxGene categorical column names to registry fields.

    Most entities appear twice: once under their plain name (mapped to the
    registry's ``name`` field) and once under ``<name>_ontology_term_id``
    (mapped to the registry's ``ontology_id`` field). ``suspension_type`` and
    ``tissue_type`` only have a name entry, mapped to ``ULabel.name``.

    ``bionty`` is imported inside the function rather than at module level.
    """
    import bionty as bt

    categoricals: dict[str, FieldAttr] = {
        "assay": bt.ExperimentalFactor.name,
        "assay_ontology_term_id": bt.ExperimentalFactor.ontology_id,
        "cell_type": bt.CellType.name,
        "cell_type_ontology_term_id": bt.CellType.ontology_id,
        "development_stage": bt.DevelopmentalStage.name,
        "development_stage_ontology_term_id": bt.DevelopmentalStage.ontology_id,
        "disease": bt.Disease.name,
        "disease_ontology_term_id": bt.Disease.ontology_id,
        # "donor_id": "str",  via pandera
        "self_reported_ethnicity": bt.Ethnicity.name,
        "self_reported_ethnicity_ontology_term_id": bt.Ethnicity.ontology_id,
        "sex": bt.Phenotype.name,
        "sex_ontology_term_id": bt.Phenotype.ontology_id,
        "suspension_type": ULabel.name,
        "tissue": bt.Tissue.name,
        "tissue_ontology_term_id": bt.Tissue.ontology_id,
        "tissue_type": ULabel.name,
        "organism": bt.Organism.name,
        "organism_ontology_term_id": bt.Organism.ontology_id,
    }
    return categoricals
|
50
|
+
|
51
|
+
|
52
|
+
def _restrict_obs_fields(
|
53
|
+
obs: pd.DataFrame, obs_fields: dict[str, FieldAttr]
|
54
|
+
) -> dict[str, FieldAttr]:
|
55
|
+
"""Restrict the obs fields only available obs fields.
|
56
|
+
|
57
|
+
To simplify the curation, we only validate against either name or ontology_id.
|
58
|
+
If both are available, we validate against ontology_id.
|
59
|
+
If none are available, we validate against name.
|
60
|
+
"""
|
61
|
+
obs_fields_unique = {k: v for k, v in obs_fields.items() if k in obs.columns}
|
62
|
+
for name, field in obs_fields.items():
|
63
|
+
if name.endswith("_ontology_term_id"):
|
64
|
+
continue
|
65
|
+
# if both the ontology id and the name are present, only validate on the ontology_id
|
66
|
+
if name in obs.columns and f"{name}_ontology_term_id" in obs.columns:
|
67
|
+
obs_fields_unique.pop(name)
|
68
|
+
# if the neither name nor ontology id are present, validate on the name
|
69
|
+
# this will raise error downstream, we just use name to be more readable
|
70
|
+
if name not in obs.columns and f"{name}_ontology_term_id" not in obs.columns:
|
71
|
+
obs_fields_unique[name] = field
|
72
|
+
|
73
|
+
# Only retain obs_fields_unique that have keys in adata.obs.columns
|
74
|
+
available_obs_fields = {
|
75
|
+
k: v for k, v in obs_fields_unique.items() if k in obs.columns
|
76
|
+
}
|
5
77
|
|
78
|
+
return available_obs_fields
|
6
79
|
|
7
|
-
def _read_schema_versions(ontology_versions: Path) -> dict[str, pd.DataFrame]:
|
8
|
-
data = yaml.safe_load(open(ontology_versions))
|
9
|
-
schema_versions = data["schema-version"]
|
10
80
|
|
11
|
-
|
12
|
-
|
81
|
+
def _add_defaults_to_obs(obs: pd.DataFrame, defaults: dict[str, str]) -> None:
|
82
|
+
"""Add default columns and values to obs DataFrame."""
|
83
|
+
added_defaults: dict = {}
|
84
|
+
for name, default in defaults.items():
|
85
|
+
if name not in obs.columns and f"{name}_ontology_term_id" not in obs.columns:
|
86
|
+
obs[name] = default
|
87
|
+
added_defaults[name] = default
|
88
|
+
logger.important(
|
89
|
+
f"added default value '{default}' to the adata.obs['{name}']"
|
90
|
+
)
|
91
|
+
|
92
|
+
|
93
|
+
def _create_sources(
    categoricals: dict[str, FieldAttr], schema_version: str, organism: str
) -> dict[str, Record]:
    """Creates a sources dictionary that can be passed to AnnDataCatManager.

    The pinned ontology sources are read from ``schema_versions.csv`` next to
    this module and restricted to the rows of ``schema_version``.

    Args:
        categoricals: Mapping of key to registry field. Only fields whose
            model lives in the ``bionty`` module get a source entry.
        schema_version: The schema version to look up in the CSV.
        organism: The organism used to pick a row when an entity has more
            than one pinned source for the schema version.

    Returns:
        A dict mapping each bionty-backed key, plus ``"var_index"`` (the
        ``Gene`` source), to the matching source record. The value is
        ``None`` if no matching source record was found.

    Raises:
        ValueError: If ``schema_version`` has no rows in the CSV.
    """
    import bionty as bt

    all_sources_df = pd.read_csv(UPath(__file__).parent / "schema_versions.csv")
    sources_df = all_sources_df[all_sources_df.schema_version == schema_version]
    if sources_df.empty:
        # List the versions from the unfiltered table: the filtered frame is
        # empty at this point and would never show any valid version.
        raise ValueError(
            f"Invalid schema_version: {schema_version}\n"
            f"Valid versions are: {_format_values(all_sources_df.schema_version.unique())}"
        )

    def _fetch_bionty_source(entity: str, organism: str) -> Record | None:  # type: ignore
        """Fetch the Bionty source of the pinned ontology."""
        entity_sources = sources_df.loc[(sources_df.entity == entity)].copy()
        if entity_sources.empty:
            # No source is pinned for this entity in the selected schema version.
            return None
        if len(entity_sources) == 1:
            row = entity_sources.iloc[0]  # for sources with organism "all"
        else:
            # `.iloc[0]` raises IndexError if no row matches `organism`.
            row = entity_sources[entity_sources.organism == organism].iloc[0]
        source = bt.Source.filter(
            organism=row.organism,
            entity=f"bionty.{entity}",
            name=row.source,
            version=row.version,
        ).one_or_none()
        if source is None:
            logger.error(
                f"Could not find source: {entity}\n"
                " → consider running `bionty.core.sync_public_sources()`"
            )
        return source

    key_to_source: dict[str, bt.Source] = {}
    for key, field in categoricals.items():
        if field.field.model.__get_module_name__() == "bionty":
            entity = field.field.model.__name__
            key_to_source[key] = _fetch_bionty_source(entity, organism)
    key_to_source["var_index"] = _fetch_bionty_source("Gene", organism)

    return key_to_source
|
136
|
+
|
137
|
+
|
138
|
+
def _init_categoricals_additional_values() -> None:
    """Add additional values from CellxGene schema to the DB.

    The presence of the ``SuspensionType`` ULabel type is used as the marker
    that the records were already created; in that case nothing is done.
    Otherwise the function saves, in order: a ``Disease`` record copied from
    the PATO ``PATO:0000461`` phenotype, the "na"/"unknown" placeholder
    records, and the ``TissueType`` and ``SuspensionType`` ULabel types with
    their member labels.
    """
    import bionty as bt

    # Note: if you add another control below, be mindful to change the if condition that
    # triggers whether creating these records is re-considered
    controls_were_created = (
        ULabel.filter(name="SuspensionType", is_type=True).one_or_none() is not None
    )
    if controls_were_created:
        return

    logger.important("Creating control labels in the CellxGene schema.")

    # "normal" in Disease
    normal = bt.Phenotype.from_source(
        ontology_id="PATO:0000461",
        source=bt.Source.get(name="pato", version="2024-03-28"),
    )
    bt.Disease(
        uid=normal.uid,
        name=normal.name,
        ontology_id=normal.ontology_id,
        description=normal.description,
        source=normal.source,  # not sure
    ).save()

    # na, unknown
    placeholder_records = [
        (bt.Ethnicity, "na"),
        (bt.Ethnicity, "unknown"),
        (bt.DevelopmentalStage, "unknown"),
        (bt.Phenotype, "unknown"),
        (bt.CellType, "unknown"),
    ]
    for model, name in placeholder_records:
        model(
            ontology_id=name, name=name, description="From CellxGene schema."
        ).save()

    # tissue_type, then suspension_type: (type name, type description, member labels)
    label_types = [
        (
            "TissueType",
            'From CellxGene schema. Is "tissue", "organoid", or "cell culture".',
            ["tissue", "organoid", "cell culture"],
        ),
        (
            "SuspensionType",
            'From CellxGene schema. This MUST be "cell", "nucleus", or "na".',
            ["cell", "nucleus", "na"],
        ),
    ]
    for type_name, type_description, member_names in label_types:
        label_type = ULabel(
            name=type_name,
            is_type=True,
            description=type_description,
        ).save()
        for name in member_names:
            ULabel(
                name=name, type=label_type, description="From CellxGene schema."
            ).save()
|
@@ -0,0 +1,43 @@
|
|
1
|
+
schema_version,entity,organism,source,version
|
2
|
+
4.0.0,CellType,all,cl,2023-08-24
|
3
|
+
4.0.0,ExperimentalFactor,all,efo,3.57.0
|
4
|
+
4.0.0,Ethnicity,human,hancestro,3.0
|
5
|
+
4.0.0,DevelopmentalStage,human,hsapdv,2020-03-10
|
6
|
+
4.0.0,DevelopmentalStage,mouse,mmusdv,2020-03-10
|
7
|
+
4.0.0,Disease,all,mondo,2023-08-02
|
8
|
+
4.0.0,Organism,all,ncbitaxon,2023-06-20
|
9
|
+
4.0.0,Phenotype,all,pato,2023-05-18
|
10
|
+
4.0.0,Tissue,all,uberon,2023-09-05
|
11
|
+
5.0.0,CellType,all,cl,2024-01-04
|
12
|
+
5.0.0,ExperimentalFactor,all,efo,3.62.0
|
13
|
+
5.0.0,Ethnicity,human,hancestro,3.0
|
14
|
+
5.0.0,DevelopmentalStage,human,hsapdv,2020-03-10
|
15
|
+
5.0.0,DevelopmentalStage,mouse,mmusdv,2020-03-10
|
16
|
+
5.0.0,Disease,all,mondo,2024-01-03
|
17
|
+
5.0.0,Organism,all,ncbitaxon,2023-06-20
|
18
|
+
5.0.0,Phenotype,all,pato,2023-05-18
|
19
|
+
5.0.0,Tissue,all,uberon,2024-01-18
|
20
|
+
5.0.0,Gene,human,ensembl,release-110
|
21
|
+
5.0.0,Gene,mouse,ensembl,release-110
|
22
|
+
5.1.0,CellType,all,cl,2024-04-05
|
23
|
+
5.1.0,ExperimentalFactor,all,efo,3.65.0
|
24
|
+
5.1.0,Ethnicity,human,hancestro,3.0
|
25
|
+
5.1.0,DevelopmentalStage,human,hsapdv,2020-03-10
|
26
|
+
5.1.0,DevelopmentalStage,mouse,mmusdv,2020-03-10
|
27
|
+
5.1.0,Disease,all,mondo,2024-05-08
|
28
|
+
5.1.0,Organism,all,ncbitaxon,2023-06-20
|
29
|
+
5.1.0,Phenotype,all,pato,2023-05-18
|
30
|
+
5.1.0,Tissue,all,uberon,2024-03-22
|
31
|
+
5.1.0,Gene,human,ensembl,release-110
|
32
|
+
5.1.0,Gene,mouse,ensembl,release-110
|
33
|
+
5.2.0,CellType,all,cl,2024-08-16
|
34
|
+
5.2.0,ExperimentalFactor,all,efo,3.69.0
|
35
|
+
5.2.0,Ethnicity,human,hancestro,3.0
|
36
|
+
5.2.0,DevelopmentalStage,human,hsapdv,2024-05-28
|
37
|
+
5.2.0,DevelopmentalStage,mouse,mmusdv,2024-05-28
|
38
|
+
5.2.0,Disease,all,mondo,2024-08-06
|
39
|
+
5.2.0,Organism,all,ncbitaxon,2023-06-20
|
40
|
+
5.2.0,Phenotype,all,pato,2023-05-18
|
41
|
+
5.2.0,Tissue,all,uberon,2024-08-07
|
42
|
+
5.2.0,Gene,human,ensembl,release-110
|
43
|
+
5.2.0,Gene,mouse,ensembl,release-110
|
@@ -5,7 +5,7 @@ from collections import defaultdict
|
|
5
5
|
from collections.abc import Iterable
|
6
6
|
from datetime import date, datetime
|
7
7
|
from itertools import compress
|
8
|
-
from typing import TYPE_CHECKING, Any
|
8
|
+
from typing import TYPE_CHECKING, Any, MutableMapping
|
9
9
|
|
10
10
|
import anndata as ad
|
11
11
|
import numpy as np
|
@@ -24,7 +24,7 @@ from lamindb.core.storage import LocalPathClasses
|
|
24
24
|
from lamindb.errors import DoesNotExist, ValidationError
|
25
25
|
from lamindb.models._from_values import _format_values
|
26
26
|
from lamindb.models.feature import (
|
27
|
-
|
27
|
+
serialize_pandas_dtype,
|
28
28
|
suggest_categorical_for_str_iterable,
|
29
29
|
)
|
30
30
|
from lamindb.models.record import (
|
@@ -201,7 +201,11 @@ def _get_categoricals(
|
|
201
201
|
if hasattr(link, "feature_id") and link.feature_id is not None:
|
202
202
|
feature = Feature.objects.using(self._state.db).get(id=link.feature_id)
|
203
203
|
link_attr = get_link_attr(link, self)
|
204
|
-
|
204
|
+
label = getattr(link, link_attr)
|
205
|
+
name_attr = (
|
206
|
+
"name" if hasattr(label, "name") else label.__class__._name_field
|
207
|
+
)
|
208
|
+
label_name = getattr(label, name_attr)
|
205
209
|
result[(feature.name, feature.dtype)].add(label_name)
|
206
210
|
|
207
211
|
return dict(result)
|
@@ -481,6 +485,7 @@ def parse_staged_feature_sets_from_anndata(
|
|
481
485
|
adata: AnnData,
|
482
486
|
var_field: FieldAttr | None = None,
|
483
487
|
obs_field: FieldAttr = Feature.name,
|
488
|
+
uns_field: FieldAttr | None = None,
|
484
489
|
mute: bool = False,
|
485
490
|
organism: str | Record | None = None,
|
486
491
|
) -> dict:
|
@@ -497,15 +502,9 @@ def parse_staged_feature_sets_from_anndata(
|
|
497
502
|
data_parse = ad.read_h5ad(filepath, backed="r")
|
498
503
|
type = "float"
|
499
504
|
else:
|
500
|
-
type = (
|
501
|
-
"float"
|
502
|
-
if adata.X is None
|
503
|
-
else convert_pandas_dtype_to_lamin_dtype(adata.X.dtype)
|
504
|
-
)
|
505
|
+
type = "float" if adata.X is None else serialize_pandas_dtype(adata.X.dtype)
|
505
506
|
feature_sets = {}
|
506
507
|
if var_field is not None:
|
507
|
-
logger.info("parsing feature names of X stored in slot 'var'")
|
508
|
-
logger.indent = " "
|
509
508
|
schema_var = Schema.from_values(
|
510
509
|
data_parse.var.index,
|
511
510
|
var_field,
|
@@ -516,13 +515,7 @@ def parse_staged_feature_sets_from_anndata(
|
|
516
515
|
)
|
517
516
|
if schema_var is not None:
|
518
517
|
feature_sets["var"] = schema_var
|
519
|
-
|
520
|
-
logger.indent = ""
|
521
|
-
if schema_var is None:
|
522
|
-
logger.warning("skip linking features to artifact in slot 'var'")
|
523
|
-
if len(data_parse.obs.columns) > 0:
|
524
|
-
logger.info("parsing feature names of slot 'obs'")
|
525
|
-
logger.indent = " "
|
518
|
+
if obs_field is not None and len(data_parse.obs.columns) > 0:
|
526
519
|
schema_obs = Schema.from_df(
|
527
520
|
df=data_parse.obs,
|
528
521
|
field=obs_field,
|
@@ -531,10 +524,13 @@ def parse_staged_feature_sets_from_anndata(
|
|
531
524
|
)
|
532
525
|
if schema_obs is not None:
|
533
526
|
feature_sets["obs"] = schema_obs
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
|
527
|
+
if uns_field is not None and len(data_parse.uns) > 0:
|
528
|
+
validated_features = Feature.from_values( # type: ignore
|
529
|
+
data_parse.uns.keys(), field=uns_field, organism=organism
|
530
|
+
)
|
531
|
+
if len(validated_features) > 0:
|
532
|
+
schema_uns = Schema(validated_features, dtype=None, otype="dict")
|
533
|
+
feature_sets["uns"] = schema_uns
|
538
534
|
return feature_sets
|
539
535
|
|
540
536
|
|
@@ -571,7 +567,7 @@ def infer_feature_type_convert_json(
|
|
571
567
|
return "cat ? str", value, message
|
572
568
|
elif isinstance(value, Iterable) and not isinstance(value, (str, bytes)):
|
573
569
|
if isinstance(value, (pd.Series, np.ndarray, pd.Categorical)):
|
574
|
-
dtype =
|
570
|
+
dtype = serialize_pandas_dtype(value.dtype)
|
575
571
|
if dtype == "str":
|
576
572
|
# ndarray doesn't know categorical, so there was no conscious choice
|
577
573
|
# offer both options
|
@@ -844,7 +840,7 @@ def _add_values(
|
|
844
840
|
)
|
845
841
|
validated = registry.validate(keys, field=feature_param_field, mute=True)
|
846
842
|
keys_array = np.array(keys)
|
847
|
-
|
843
|
+
keys_array[validated]
|
848
844
|
if validated.sum() != len(keys):
|
849
845
|
not_validated_keys = keys_array[~validated]
|
850
846
|
not_validated_keys_dtype_message = [
|
@@ -870,10 +866,7 @@ def _add_values(
|
|
870
866
|
f"Here is how to create a {model_name.lower()}:\n\n{hint}"
|
871
867
|
)
|
872
868
|
raise ValidationError(msg)
|
873
|
-
|
874
|
-
validated_keys,
|
875
|
-
field=feature_param_field,
|
876
|
-
)
|
869
|
+
|
877
870
|
# figure out which of the values go where
|
878
871
|
features_labels = defaultdict(list)
|
879
872
|
_feature_values = []
|
@@ -933,12 +926,14 @@ def _add_values(
|
|
933
926
|
if "ULabel" not in feature.dtype:
|
934
927
|
feature.dtype += "[ULabel]"
|
935
928
|
feature.save()
|
936
|
-
validated = ULabel.validate(values, field=
|
929
|
+
validated = ULabel.validate(values, field=ULabel.name, mute=True)
|
937
930
|
values_array = np.array(values)
|
938
931
|
validated_values = values_array[validated]
|
939
932
|
if validated.sum() != len(values):
|
940
933
|
not_validated_values += values_array[~validated].tolist()
|
941
|
-
label_records = ULabel.from_values(
|
934
|
+
label_records = ULabel.from_values(
|
935
|
+
validated_values, field=ULabel.name, mute=True
|
936
|
+
) # type: ignore
|
942
937
|
features_labels["ULabel"] += [
|
943
938
|
(feature, label_record) for label_record in label_records
|
944
939
|
]
|
@@ -1116,6 +1111,7 @@ def _add_set_from_anndata(
|
|
1116
1111
|
self,
|
1117
1112
|
var_field: FieldAttr | None = None,
|
1118
1113
|
obs_field: FieldAttr | None = Feature.name,
|
1114
|
+
uns_field: FieldAttr | None = None,
|
1119
1115
|
mute: bool = False,
|
1120
1116
|
organism: str | Record | None = None,
|
1121
1117
|
):
|
@@ -1128,6 +1124,7 @@ def _add_set_from_anndata(
|
|
1128
1124
|
adata,
|
1129
1125
|
var_field=var_field,
|
1130
1126
|
obs_field=obs_field,
|
1127
|
+
uns_field=uns_field,
|
1131
1128
|
mute=mute,
|
1132
1129
|
organism=organism,
|
1133
1130
|
)
|
@@ -1137,10 +1134,25 @@ def _add_set_from_anndata(
|
|
1137
1134
|
self._host.save()
|
1138
1135
|
|
1139
1136
|
|
1137
|
+
def _unify_staged_feature_sets_by_hash(
|
1138
|
+
feature_sets: MutableMapping[str, Schema],
|
1139
|
+
):
|
1140
|
+
unique_values: dict[str, Any] = {}
|
1141
|
+
|
1142
|
+
for key, value in feature_sets.items():
|
1143
|
+
value_hash = value.hash # Assuming each value has a .hash attribute
|
1144
|
+
if value_hash in unique_values:
|
1145
|
+
feature_sets[key] = unique_values[value_hash]
|
1146
|
+
else:
|
1147
|
+
unique_values[value_hash] = value
|
1148
|
+
|
1149
|
+
return feature_sets
|
1150
|
+
|
1151
|
+
|
1140
1152
|
def _add_set_from_mudata(
|
1141
1153
|
self,
|
1142
|
-
var_fields: dict[str, FieldAttr],
|
1143
|
-
obs_fields: dict[str, FieldAttr] = None,
|
1154
|
+
var_fields: dict[str, FieldAttr] | None = None,
|
1155
|
+
obs_fields: dict[str, FieldAttr] | None = None,
|
1144
1156
|
mute: bool = False,
|
1145
1157
|
organism: str | Record | None = None,
|
1146
1158
|
):
|
@@ -1152,6 +1164,7 @@ def _add_set_from_mudata(
|
|
1152
1164
|
# parse and register features
|
1153
1165
|
mdata = self._host.load()
|
1154
1166
|
feature_sets = {}
|
1167
|
+
|
1155
1168
|
obs_features = Feature.from_values(mdata.obs.columns) # type: ignore
|
1156
1169
|
if len(obs_features) > 0:
|
1157
1170
|
feature_sets["obs"] = Schema(features=obs_features)
|
@@ -1166,20 +1179,50 @@ def _add_set_from_mudata(
|
|
1166
1179
|
for k, v in modality_fs.items():
|
1167
1180
|
feature_sets[f"['{modality}'].{k}"] = v
|
1168
1181
|
|
1169
|
-
|
1170
|
-
|
1182
|
+
# link feature sets
|
1183
|
+
self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(feature_sets)
|
1184
|
+
self._host.save()
|
1185
|
+
|
1171
1186
|
|
1172
|
-
|
1173
|
-
|
1174
|
-
|
1175
|
-
|
1176
|
-
|
1177
|
-
|
1187
|
+
def _add_set_from_spatialdata(
    self,
    sample_metadata_key: str,
    sample_metadata_field: FieldAttr = Feature.name,
    var_fields: dict[str, FieldAttr] | None = None,
    obs_fields: dict[str, FieldAttr] | None = None,
    mute: bool = False,
    organism: str | Record | None = None,
):
    """Add features from SpatialData.

    Loads the host artifact, builds one schema from the columns of the sample
    metadata stored under ``sample_metadata_key`` and one set of schemas per
    table listed in ``var_fields``, then stages them on the host and saves it.

    Args:
        sample_metadata_key: Attribute key holding the sample metadata; also
            used as the slot name of the resulting schema.
        sample_metadata_field: Field used to look up the sample features.
        var_fields: Mapping of table key to the field used for that table's
            ``var``. Only tables listed here are parsed.
        obs_fields: Mapping of table key to the field used for that table's
            ``obs``; tables without an entry fall back to ``Feature.name``.
        mute: Forwarded to the AnnData parsing.
        organism: Forwarded to the AnnData parsing.
    """
    obs_fields = obs_fields or {}
    var_fields = var_fields or {}
    assert self._host.otype == "SpatialData"  # noqa: S101

    # parse and register features
    sdata = self._host.load()
    staged = {}

    # sample features
    sample_columns = sdata.get_attrs(
        key=sample_metadata_key, return_as="df", flatten=True
    ).columns
    sample_features = Feature.from_values(
        sample_columns,
        field=sample_metadata_field,
    )  # type: ignore
    if len(sample_features) > 0:
        staged[sample_metadata_key] = Schema(features=sample_features)

    # table features
    for table, field in var_fields.items():
        parsed = parse_staged_feature_sets_from_anndata(
            sdata[table],
            var_field=field,
            obs_field=obs_fields.get(table, Feature.name),
            mute=mute,
            organism=organism,
        )
        staged.update(
            {f"['{table}'].{slot}": schema for slot, schema in parsed.items()}
        )

    # link feature sets
    self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(staged)
    self._host.save()
|
1184
1227
|
|
1185
1228
|
|
@@ -1205,7 +1248,7 @@ def _add_from(self, data: Artifact | Collection, transfer_logs: dict = None):
|
|
1205
1248
|
# create records from ontology_id
|
1206
1249
|
if hasattr(registry, "_ontology_id_field") and len(member_uids) > 0:
|
1207
1250
|
# create from bionty
|
1208
|
-
members_records = registry.from_values(member_uids, field=field)
|
1251
|
+
members_records = registry.from_values(member_uids, field=field, mute=True)
|
1209
1252
|
save([r for r in members_records if r._state.adding])
|
1210
1253
|
validated = registry.validate(member_uids, field=field, mute=True)
|
1211
1254
|
new_members_uids = list(compress(member_uids, ~validated))
|
@@ -1311,6 +1354,7 @@ FeatureManager._accessor_by_registry = _accessor_by_registry
|
|
1311
1354
|
FeatureManager._add_set_from_df = _add_set_from_df
|
1312
1355
|
FeatureManager._add_set_from_anndata = _add_set_from_anndata
|
1313
1356
|
FeatureManager._add_set_from_mudata = _add_set_from_mudata
|
1357
|
+
FeatureManager._add_set_from_spatialdata = _add_set_from_spatialdata
|
1314
1358
|
FeatureManager._add_from = _add_from
|
1315
1359
|
FeatureManager.filter = filter
|
1316
1360
|
FeatureManager.get = get
|