lamindb 1.2.0__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +1 -1
- lamindb/core/_context.py +6 -0
- lamindb/core/datasets/__init__.py +1 -0
- lamindb/core/datasets/_core.py +23 -0
- lamindb/core/datasets/_small.py +16 -2
- lamindb/core/storage/objects.py +1 -2
- lamindb/curators/__init__.py +1269 -1513
- lamindb/curators/_cellxgene_schemas/__init__.py +190 -18
- lamindb/curators/_cellxgene_schemas/schema_versions.csv +43 -0
- lamindb/models/_feature_manager.py +65 -14
- lamindb/models/_from_values.py +113 -78
- lamindb/models/artifact.py +138 -95
- lamindb/models/can_curate.py +185 -216
- lamindb/models/feature.py +32 -2
- lamindb/models/project.py +69 -7
- lamindb/models/record.py +43 -25
- lamindb/models/run.py +18 -1
- lamindb/models/schema.py +0 -8
- {lamindb-1.2.0.dist-info → lamindb-1.3.0.dist-info}/METADATA +6 -5
- {lamindb-1.2.0.dist-info → lamindb-1.3.0.dist-info}/RECORD +22 -22
- lamindb/curators/_cellxgene_schemas/schema_versions.yml +0 -104
- {lamindb-1.2.0.dist-info → lamindb-1.3.0.dist-info}/LICENSE +0 -0
- {lamindb-1.2.0.dist-info → lamindb-1.3.0.dist-info}/WHEEL +0 -0
@@ -1,26 +1,198 @@
|
|
1
|
-
from pathlib import Path
|
2
|
-
|
3
1
|
import pandas as pd
|
4
|
-
|
2
|
+
from lamin_utils import logger
|
3
|
+
from lamindb_setup.core.upath import UPath
|
4
|
+
|
5
|
+
from lamindb.base.types import FieldAttr
|
6
|
+
from lamindb.models import Record, ULabel
|
7
|
+
from lamindb.models._from_values import _format_values
|
8
|
+
|
9
|
+
# Names that this module treats as reserved.
# NOTE(review): presumably these are column/key names that the CellxGene schema
# does not allow in user-provided data -- confirm against the code that consumes
# this set (the consumer is not visible in this file section).
RESERVED_NAMES = {
    "ethnicity",
    "ethnicity_ontology_term_id",
    "X_normalization",
    "default_field",
    "layer_descriptions",
    "tags",
    "versions",
    "contributors",
    "preprint_doi",
    "project_description",
    "project_links",
    "project_name",
    "publication_doi",
}
|
24
|
+
|
25
|
+
|
26
|
+
def _get_cxg_categoricals() -> dict[str, FieldAttr]:
    """Return the mapping from CellxGene obs column names to registry fields.

    Most entities contribute two entries: ``<key>`` mapped to the registry's
    ``name`` field and ``<key>_ontology_term_id`` mapped to its ``ontology_id``
    field. ``suspension_type`` and ``tissue_type`` map to ``ULabel.name`` only.
    """
    import bionty as bt

    categoricals: dict[str, FieldAttr] = {}

    def _add_name_and_ontology_id(key: str, registry) -> None:
        # Insert the name entry first so the resulting key order stays stable.
        categoricals[key] = registry.name
        categoricals[f"{key}_ontology_term_id"] = registry.ontology_id

    _add_name_and_ontology_id("assay", bt.ExperimentalFactor)
    _add_name_and_ontology_id("cell_type", bt.CellType)
    _add_name_and_ontology_id("development_stage", bt.DevelopmentalStage)
    _add_name_and_ontology_id("disease", bt.Disease)
    # "donor_id": "str", via pandera
    _add_name_and_ontology_id("self_reported_ethnicity", bt.Ethnicity)
    _add_name_and_ontology_id("sex", bt.Phenotype)
    categoricals["suspension_type"] = ULabel.name
    _add_name_and_ontology_id("tissue", bt.Tissue)
    categoricals["tissue_type"] = ULabel.name
    _add_name_and_ontology_id("organism", bt.Organism)

    return categoricals
|
50
|
+
|
51
|
+
|
52
|
+
def _restrict_obs_fields(
|
53
|
+
obs: pd.DataFrame, obs_fields: dict[str, FieldAttr]
|
54
|
+
) -> dict[str, FieldAttr]:
|
55
|
+
"""Restrict the obs fields only available obs fields.
|
56
|
+
|
57
|
+
To simplify the curation, we only validate against either name or ontology_id.
|
58
|
+
If both are available, we validate against ontology_id.
|
59
|
+
If none are available, we validate against name.
|
60
|
+
"""
|
61
|
+
obs_fields_unique = {k: v for k, v in obs_fields.items() if k in obs.columns}
|
62
|
+
for name, field in obs_fields.items():
|
63
|
+
if name.endswith("_ontology_term_id"):
|
64
|
+
continue
|
65
|
+
# if both the ontology id and the name are present, only validate on the ontology_id
|
66
|
+
if name in obs.columns and f"{name}_ontology_term_id" in obs.columns:
|
67
|
+
obs_fields_unique.pop(name)
|
68
|
+
# if the neither name nor ontology id are present, validate on the name
|
69
|
+
# this will raise error downstream, we just use name to be more readable
|
70
|
+
if name not in obs.columns and f"{name}_ontology_term_id" not in obs.columns:
|
71
|
+
obs_fields_unique[name] = field
|
72
|
+
|
73
|
+
# Only retain obs_fields_unique that have keys in adata.obs.columns
|
74
|
+
available_obs_fields = {
|
75
|
+
k: v for k, v in obs_fields_unique.items() if k in obs.columns
|
76
|
+
}
|
5
77
|
|
78
|
+
return available_obs_fields
|
6
79
|
|
7
|
-
def _read_schema_versions(ontology_versions: Path) -> dict[str, pd.DataFrame]:
|
8
|
-
data = yaml.safe_load(open(ontology_versions))
|
9
|
-
schema_versions = data["schema-version"]
|
10
80
|
|
11
|
-
|
12
|
-
|
81
|
+
def _add_defaults_to_obs(obs: pd.DataFrame, defaults: dict[str, str]) -> None:
|
82
|
+
"""Add default columns and values to obs DataFrame."""
|
83
|
+
added_defaults: dict = {}
|
84
|
+
for name, default in defaults.items():
|
85
|
+
if name not in obs.columns and f"{name}_ontology_term_id" not in obs.columns:
|
86
|
+
obs[name] = default
|
87
|
+
added_defaults[name] = default
|
88
|
+
logger.important(
|
89
|
+
f"added default value '{default}' to the adata.obs['{name}']"
|
90
|
+
)
|
91
|
+
|
92
|
+
|
93
|
+
def _create_sources(
    categoricals: dict[str, FieldAttr], schema_version: str, organism: str
) -> dict[str, Record]:
    """Create a sources dictionary that can be passed to AnnDataCatManager.

    Reads the pinned ontology versions from ``schema_versions.csv`` located next
    to this module, restricts them to ``schema_version`` and looks up the
    matching ``bionty.Source`` record for every categorical whose registry
    lives in the ``bionty`` module, plus the ``Gene`` source under the key
    ``"var_index"``.

    Args:
        categoricals: Maps a key to a registry field; only fields of bionty
            registries get a source.
        schema_version: The schema version to select from the CSV.
        organism: The organism used to pick a row when an entity has several
            organism-specific rows.

    Returns:
        A mapping from key to source record. A value is ``None`` when no source
        could be resolved for that key.

    Raises:
        ValueError: If ``schema_version`` has no rows in ``schema_versions.csv``.
    """
    import bionty as bt

    all_sources_df = pd.read_csv(UPath(__file__).parent / "schema_versions.csv")
    sources_df = all_sources_df[all_sources_df.schema_version == schema_version]
    if sources_df.empty:
        # List the versions from the unfiltered table: the filtered one is
        # empty here and would always produce an empty list.
        raise ValueError(
            f"Invalid schema_version: {schema_version}\n"
            f"Valid versions are: {_format_values(all_sources_df.schema_version.unique())}"
        )

    def _fetch_bionty_source(entity: str, organism: str) -> Record | None:  # type: ignore
        """Fetch the Bionty source of the pinned ontology."""
        entity_sources = sources_df.loc[sources_df.entity == entity]
        if entity_sources.empty:
            # The schema version pins no source for this entity.
            return None
        if len(entity_sources) == 1:
            row = entity_sources.iloc[0]  # for sources with organism "all"
        else:
            organism_sources = entity_sources[entity_sources.organism == organism]
            if organism_sources.empty:
                # Avoid an opaque IndexError from `.iloc[0]` on an empty frame.
                logger.error(
                    f"Could not find source: {entity}\n"
                    "    → consider running `bionty.core.sync_all_sources_to_latest()` and re-connect to your instance"
                )
                return None
            row = organism_sources.iloc[0]
        source = bt.Source.filter(
            organism=row.organism,
            entity=f"bionty.{entity}",
            name=row.source,
            version=row.version,
        ).one_or_none()
        if source is None:
            logger.error(
                f"Could not find source: {entity}\n"
                "    → consider running `bionty.core.sync_all_sources_to_latest()` and re-connect to your instance"
            )
        return source

    key_to_source: dict[str, bt.Source] = {}
    for key, field in categoricals.items():
        if field.field.model.__get_module_name__() == "bionty":
            entity = field.field.model.__name__
            key_to_source[key] = _fetch_bionty_source(entity, organism)
    key_to_source["var_index"] = _fetch_bionty_source("Gene", organism)

    return key_to_source
|
136
|
+
|
137
|
+
|
138
|
+
def _init_categoricals_additional_values() -> None:
    """Add additional values from CellxGene schema to the DB.

    Returns immediately when a ``ULabel`` type named "SuspensionType" already
    exists. Otherwise saves, in this order: a "normal" ``Disease`` record copied
    from the PATO phenotype, the "na"/"unknown" placeholder records, the
    "TissueType" label type with its members, and the "SuspensionType" label
    type with its members.
    """
    import bionty as bt

    # Note: "SuspensionType" is the only record consulted to decide whether the
    # controls exist. If you add another control below, be mindful to change
    # this condition accordingly.
    existing_marker = ULabel.filter(name="SuspensionType", is_type=True).one_or_none()
    if existing_marker is not None:
        return

    logger.important("Creating control labels in the CellxGene schema.")

    # "normal" in Disease
    pato_source = bt.Source.get(name="pato", version="2024-03-28")
    normal = bt.Phenotype.from_source(ontology_id="PATO:0000461", source=pato_source)
    normal_disease = bt.Disease(
        uid=normal.uid,
        name=normal.name,
        ontology_id=normal.ontology_id,
        description=normal.description,
        source=normal.source,  # not sure
    )
    normal_disease.save()

    # na, unknown
    placeholders = (
        (bt.Ethnicity, "na"),
        (bt.Ethnicity, "unknown"),
        (bt.DevelopmentalStage, "unknown"),
        (bt.Phenotype, "unknown"),
        (bt.CellType, "unknown"),
    )
    for registry, label in placeholders:
        registry(
            ontology_id=label, name=label, description="From CellxGene schema."
        ).save()

    # tissue_type
    tissue_type = ULabel(
        name="TissueType",
        is_type=True,
        description='From CellxGene schema. Is "tissue", "organoid", or "cell culture".',
    ).save()
    for member in ("tissue", "organoid", "cell culture"):
        ULabel(
            name=member, type=tissue_type, description="From CellxGene schema."
        ).save()

    # suspension_type
    suspension_type = ULabel(
        name="SuspensionType",
        is_type=True,
        description='From CellxGene schema. This MUST be "cell", "nucleus", or "na".',
    ).save()
    for member in ("cell", "nucleus", "na"):
        ULabel(
            name=member, type=suspension_type, description="From CellxGene schema."
        ).save()
|
@@ -0,0 +1,43 @@
|
|
1
|
+
schema_version,entity,organism,source,version
|
2
|
+
4.0.0,CellType,all,cl,2023-08-24
|
3
|
+
4.0.0,ExperimentalFactor,all,efo,3.57.0
|
4
|
+
4.0.0,Ethnicity,human,hancestro,3.0
|
5
|
+
4.0.0,DevelopmentalStage,human,hsapdv,2020-03-10
|
6
|
+
4.0.0,DevelopmentalStage,mouse,mmusdv,2020-03-10
|
7
|
+
4.0.0,Disease,all,mondo,2023-08-02
|
8
|
+
4.0.0,Organism,all,ncbitaxon,2023-06-20
|
9
|
+
4.0.0,Phenotype,all,pato,2023-05-18
|
10
|
+
4.0.0,Tissue,all,uberon,2023-09-05
|
11
|
+
5.0.0,CellType,all,cl,2024-01-04
|
12
|
+
5.0.0,ExperimentalFactor,all,efo,3.62.0
|
13
|
+
5.0.0,Ethnicity,human,hancestro,3.0
|
14
|
+
5.0.0,DevelopmentalStage,human,hsapdv,2020-03-10
|
15
|
+
5.0.0,DevelopmentalStage,mouse,mmusdv,2020-03-10
|
16
|
+
5.0.0,Disease,all,mondo,2024-01-03
|
17
|
+
5.0.0,Organism,all,ncbitaxon,2023-06-20
|
18
|
+
5.0.0,Phenotype,all,pato,2023-05-18
|
19
|
+
5.0.0,Tissue,all,uberon,2024-01-18
|
20
|
+
5.0.0,Gene,human,ensembl,release-110
|
21
|
+
5.0.0,Gene,mouse,ensembl,release-110
|
22
|
+
5.1.0,CellType,all,cl,2024-04-05
|
23
|
+
5.1.0,ExperimentalFactor,all,efo,3.65.0
|
24
|
+
5.1.0,Ethnicity,human,hancestro,3.0
|
25
|
+
5.1.0,DevelopmentalStage,human,hsapdv,2020-03-10
|
26
|
+
5.1.0,DevelopmentalStage,mouse,mmusdv,2020-03-10
|
27
|
+
5.1.0,Disease,all,mondo,2024-05-08
|
28
|
+
5.1.0,Organism,all,ncbitaxon,2023-06-20
|
29
|
+
5.1.0,Phenotype,all,pato,2023-05-18
|
30
|
+
5.1.0,Tissue,all,uberon,2024-03-22
|
31
|
+
5.1.0,Gene,human,ensembl,release-110
|
32
|
+
5.1.0,Gene,mouse,ensembl,release-110
|
33
|
+
5.2.0,CellType,all,cl,2024-08-16
|
34
|
+
5.2.0,ExperimentalFactor,all,efo,3.69.0
|
35
|
+
5.2.0,Ethnicity,human,hancestro,3.0
|
36
|
+
5.2.0,DevelopmentalStage,human,hsapdv,2024-05-28
|
37
|
+
5.2.0,DevelopmentalStage,mouse,mmusdv,2024-05-28
|
38
|
+
5.2.0,Disease,all,mondo,2024-08-06
|
39
|
+
5.2.0,Organism,all,ncbitaxon,2023-06-20
|
40
|
+
5.2.0,Phenotype,all,pato,2023-05-18
|
41
|
+
5.2.0,Tissue,all,uberon,2024-08-07
|
42
|
+
5.2.0,Gene,human,ensembl,release-110
|
43
|
+
5.2.0,Gene,mouse,ensembl,release-110
|
@@ -5,7 +5,7 @@ from collections import defaultdict
|
|
5
5
|
from collections.abc import Iterable
|
6
6
|
from datetime import date, datetime
|
7
7
|
from itertools import compress
|
8
|
-
from typing import TYPE_CHECKING, Any
|
8
|
+
from typing import TYPE_CHECKING, Any, MutableMapping
|
9
9
|
|
10
10
|
import anndata as ad
|
11
11
|
import numpy as np
|
@@ -201,7 +201,11 @@ def _get_categoricals(
|
|
201
201
|
if hasattr(link, "feature_id") and link.feature_id is not None:
|
202
202
|
feature = Feature.objects.using(self._state.db).get(id=link.feature_id)
|
203
203
|
link_attr = get_link_attr(link, self)
|
204
|
-
|
204
|
+
label = getattr(link, link_attr)
|
205
|
+
name_attr = (
|
206
|
+
"name" if hasattr(label, "name") else label.__class__._name_field
|
207
|
+
)
|
208
|
+
label_name = getattr(label, name_attr)
|
205
209
|
result[(feature.name, feature.dtype)].add(label_name)
|
206
210
|
|
207
211
|
return dict(result)
|
@@ -1137,10 +1141,25 @@ def _add_set_from_anndata(
|
|
1137
1141
|
self._host.save()
|
1138
1142
|
|
1139
1143
|
|
1144
|
+
def _unify_staged_feature_sets_by_hash(
|
1145
|
+
feature_sets: MutableMapping[str, Schema],
|
1146
|
+
):
|
1147
|
+
unique_values: dict[str, Any] = {}
|
1148
|
+
|
1149
|
+
for key, value in feature_sets.items():
|
1150
|
+
value_hash = value.hash # Assuming each value has a .hash attribute
|
1151
|
+
if value_hash in unique_values:
|
1152
|
+
feature_sets[key] = unique_values[value_hash]
|
1153
|
+
else:
|
1154
|
+
unique_values[value_hash] = value
|
1155
|
+
|
1156
|
+
return feature_sets
|
1157
|
+
|
1158
|
+
|
1140
1159
|
def _add_set_from_mudata(
|
1141
1160
|
self,
|
1142
|
-
var_fields: dict[str, FieldAttr],
|
1143
|
-
obs_fields: dict[str, FieldAttr] = None,
|
1161
|
+
var_fields: dict[str, FieldAttr] | None = None,
|
1162
|
+
obs_fields: dict[str, FieldAttr] | None = None,
|
1144
1163
|
mute: bool = False,
|
1145
1164
|
organism: str | Record | None = None,
|
1146
1165
|
):
|
@@ -1152,6 +1171,7 @@ def _add_set_from_mudata(
|
|
1152
1171
|
# parse and register features
|
1153
1172
|
mdata = self._host.load()
|
1154
1173
|
feature_sets = {}
|
1174
|
+
|
1155
1175
|
obs_features = Feature.from_values(mdata.obs.columns) # type: ignore
|
1156
1176
|
if len(obs_features) > 0:
|
1157
1177
|
feature_sets["obs"] = Schema(features=obs_features)
|
@@ -1166,20 +1186,50 @@ def _add_set_from_mudata(
|
|
1166
1186
|
for k, v in modality_fs.items():
|
1167
1187
|
feature_sets[f"['{modality}'].{k}"] = v
|
1168
1188
|
|
1169
|
-
|
1170
|
-
|
1189
|
+
# link feature sets
|
1190
|
+
self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(feature_sets)
|
1191
|
+
self._host.save()
|
1192
|
+
|
1171
1193
|
|
1172
|
-
|
1173
|
-
|
1174
|
-
|
1175
|
-
|
1176
|
-
|
1177
|
-
|
1194
|
+
def _add_set_from_spatialdata(
    self,
    sample_metadata_key: str,
    sample_metadata_field: FieldAttr = Feature.name,
    var_fields: dict[str, FieldAttr] | None = None,
    obs_fields: dict[str, FieldAttr] | None = None,
    mute: bool = False,
    organism: str | Record | None = None,
):
    """Add features from SpatialData.

    Loads the host object, builds one schema from the columns of the sample
    metadata stored under ``sample_metadata_key`` and further schemas for each
    table named in ``var_fields``, then stages the hash-unified result on the
    host and saves the host.

    Args:
        sample_metadata_key: Key passed to ``get_attrs`` to fetch the sample
            metadata; also the key under which its schema is staged.
        sample_metadata_field: Field used to look up the sample metadata columns.
        var_fields: Maps a table name to the field used for its var features.
            Tables not listed here are not parsed.
        obs_fields: Maps a table name to the field used for its obs features;
            ``Feature.name`` is used for tables that are not listed.
        mute: Forwarded to the AnnData feature-set parser.
        organism: Forwarded to the AnnData feature-set parser.
    """
    var_fields = var_fields or {}
    obs_fields = obs_fields or {}
    assert self._host.otype == "SpatialData"  # noqa: S101

    # parse and register features
    sdata = self._host.load()
    staged = {}

    # sample features
    sample_columns = sdata.get_attrs(
        key=sample_metadata_key, return_as="df", flatten=True
    ).columns
    sample_features = Feature.from_values(
        sample_columns,
        field=sample_metadata_field,
    )  # type: ignore
    if len(sample_features) > 0:
        staged[sample_metadata_key] = Schema(features=sample_features)

    # table features
    for table, var_field in var_fields.items():
        parsed = parse_staged_feature_sets_from_anndata(
            sdata[table],
            var_field=var_field,
            obs_field=obs_fields.get(table, Feature.name),
            mute=mute,
            organism=organism,
        )
        # Prefix each slot with the table name to keep keys unique per table.
        staged.update(
            {f"['{table}'].{slot}": schema for slot, schema in parsed.items()}
        )

    # link feature sets
    self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(staged)
    self._host.save()
|
1184
1234
|
|
1185
1235
|
|
@@ -1311,6 +1361,7 @@ FeatureManager._accessor_by_registry = _accessor_by_registry
|
|
1311
1361
|
FeatureManager._add_set_from_df = _add_set_from_df
|
1312
1362
|
FeatureManager._add_set_from_anndata = _add_set_from_anndata
|
1313
1363
|
FeatureManager._add_set_from_mudata = _add_set_from_mudata
|
1364
|
+
FeatureManager._add_set_from_spatialdata = _add_set_from_spatialdata
|
1314
1365
|
FeatureManager._add_from = _add_from
|
1315
1366
|
FeatureManager.filter = filter
|
1316
1367
|
FeatureManager.get = get
|