lamindb 1.2a2__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,26 +1,198 @@
1
- from pathlib import Path
2
-
3
1
  import pandas as pd
4
- import yaml # type: ignore
2
+ from lamin_utils import logger
3
+ from lamindb_setup.core.upath import UPath
4
+
5
+ from lamindb.base.types import FieldAttr
6
+ from lamindb.models import Record, ULabel
7
+ from lamindb.models._from_values import _format_values
8
+
9
# Names treated as reserved by this module.
# NOTE(review): presumably these are keys that the CELLxGENE schema does not
# allow in user-supplied metadata - confirm against the schema documentation.
RESERVED_NAMES = {
    "X_normalization",
    "contributors",
    "default_field",
    "ethnicity",
    "ethnicity_ontology_term_id",
    "layer_descriptions",
    "preprint_doi",
    "project_description",
    "project_links",
    "project_name",
    "publication_doi",
    "tags",
    "versions",
}
24
+
25
+
26
def _get_cxg_categoricals() -> dict[str, FieldAttr]:
    """Map CellxGene categorical obs column names to registry fields.

    Ontology-backed columns contribute two entries: ``<column>`` mapped to the
    registry's ``name`` field and ``<column>_ontology_term_id`` mapped to its
    ``ontology_id`` field. ``ULabel``-backed columns only contribute the
    ``<column>`` entry.
    """
    import bionty as bt

    # (column, registry, whether a "<column>_ontology_term_id" entry is added);
    # the order of this table is the insertion order of the returned dict
    column_specs = (
        ("assay", bt.ExperimentalFactor, True),
        ("cell_type", bt.CellType, True),
        ("development_stage", bt.DevelopmentalStage, True),
        ("disease", bt.Disease, True),
        # "donor_id": "str", via pandera
        ("self_reported_ethnicity", bt.Ethnicity, True),
        ("sex", bt.Phenotype, True),
        ("suspension_type", ULabel, False),
        ("tissue", bt.Tissue, True),
        ("tissue_type", ULabel, False),
        ("organism", bt.Organism, True),
    )

    categoricals: dict[str, FieldAttr] = {}
    for column, registry, has_ontology_id in column_specs:
        categoricals[column] = registry.name
        if has_ontology_id:
            categoricals[f"{column}_ontology_term_id"] = registry.ontology_id
    return categoricals
50
+
51
+
52
def _restrict_obs_fields(
    obs: pd.DataFrame, obs_fields: dict[str, FieldAttr]
) -> dict[str, FieldAttr]:
    """Restrict the obs fields to those available as columns of ``obs``.

    To simplify the curation, each entity is validated against either its name
    or its ontology id, never both:

    - if both ``<name>`` and ``<name>_ontology_term_id`` are columns of ``obs``,
      only the ``<name>_ontology_term_id`` entry is kept;
    - if only one of the two is a column of ``obs``, that entry is kept;
    - if neither is a column of ``obs``, no entry is returned for the entity.

    Args:
        obs: DataFrame whose columns decide which entries are kept.
        obs_fields: Mapping of candidate column names to registry fields.

    Returns:
        The retained entries of ``obs_fields``, in their original order.
    """
    columns = obs.columns

    def _is_shadowed_by_ontology_id(key: str) -> bool:
        # a plain name column is dropped when its ontology id column also exists
        return (
            not key.endswith("_ontology_term_id")
            and f"{key}_ontology_term_id" in columns
        )

    # Entries whose key is not a column are never returned. The previous
    # implementation added the name entry for entities with neither column
    # and then removed it again in its final column filter; that no-op is gone.
    return {
        key: field
        for key, field in obs_fields.items()
        if key in columns and not _is_shadowed_by_ontology_id(key)
    }
6
79
 
7
- def _read_schema_versions(ontology_versions: Path) -> dict[str, pd.DataFrame]:
8
- data = yaml.safe_load(open(ontology_versions))
9
- schema_versions = data["schema-version"]
10
80
 
11
- def _schema_to_df(schema_data):
12
- return pd.DataFrame(
81
def _add_defaults_to_obs(obs: pd.DataFrame, defaults: dict[str, str]) -> None:
    """Add default columns and values to the obs DataFrame in place.

    A default is only applied when neither ``<name>`` nor
    ``<name>_ontology_term_id`` is already a column of ``obs``. Each added
    column is reported through ``logger.important``.

    Args:
        obs: DataFrame that is modified in place.
        defaults: Mapping of column name to the value assigned to that column.
    """
    for name, default in defaults.items():
        # obs.columns is re-read on every iteration because columns added for
        # earlier defaults must be visible to later checks
        if name in obs.columns or f"{name}_ontology_term_id" in obs.columns:
            continue
        obs[name] = default
        logger.important(
            f"added default value '{default}' to the adata.obs['{name}']"
        )
91
+
92
+
93
def _create_sources(
    categoricals: dict[str, FieldAttr], schema_version: str, organism: str
) -> dict[str, Record]:
    """Creates a sources dictionary that can be passed to AnnDataCatManager.

    The pinned ontology versions are read from ``schema_versions.csv`` next to
    this module and restricted to the rows of ``schema_version``.

    Args:
        categoricals: Mapping of keys to registry fields; only fields whose
            model reports the module name ``"bionty"`` get a source.
        schema_version: A value of the ``schema_version`` column of the CSV.
        organism: Organism used to select a row when an entity has several.

    Returns:
        Mapping from each bionty-backed key, plus ``"var_index"`` (the ``Gene``
        source), to the matching ``bt.Source`` record, or ``None`` where no
        source could be determined.

    Raises:
        ValueError: If no row of the CSV has the given ``schema_version``.
        IndexError: If an entity has several rows but none for ``organism``.
    """
    import bionty as bt

    def _fetch_bionty_source(entity: str, organism: str) -> Record | None:  # type: ignore
        """Fetch the Bionty source of the pinned ontology."""
        entity_sources = sources_df.loc[(sources_df.entity == entity)].copy()
        if entity_sources.empty:
            # no pinned row for this entity in the selected schema version,
            # e.g. the CSV has no Gene rows for 4.0.0
            return None
        if len(entity_sources) == 1:
            row = entity_sources.iloc[0]  # for sources with organism "all"
        else:
            # .iloc[0] raises IndexError when no row matches `organism`
            row = entity_sources[entity_sources.organism == organism].iloc[0]
        source = bt.Source.filter(
            organism=row.organism,
            entity=f"bionty.{entity}",
            name=row.source,
            version=row.version,
        ).one_or_none()
        if source is None:
            logger.error(
                f"Could not find source: {entity}\n"
                " → consider running `bionty.core.sync_all_sources_to_latest()` and re-connect to your instance"
            )
        return source

    all_sources_df = pd.read_csv(UPath(__file__).parent / "schema_versions.csv")
    sources_df = all_sources_df[all_sources_df.schema_version == schema_version]
    if sources_df.empty:
        # report the versions of the unfiltered table: the filtered frame is
        # empty at this point and would always yield an empty list
        valid_versions = all_sources_df.schema_version.unique()
        raise ValueError(
            f"Invalid schema_version: {schema_version}\n"
            f"Valid versions are: {_format_values(valid_versions)}"
        )

    key_to_source: dict[str, bt.Source] = {}
    for key, field in categoricals.items():
        if field.field.model.__get_module_name__() == "bionty":
            entity = field.field.model.__name__
            key_to_source[key] = _fetch_bionty_source(entity, organism)
    key_to_source["var_index"] = _fetch_bionty_source("Gene", organism)

    return key_to_source
136
+
137
+
138
def _init_categoricals_additional_values() -> None:
    """Add additional values from CellxGene schema to the DB.

    Returns without creating anything when a ``ULabel`` type named
    "SuspensionType" already exists; that label is the last record created
    below and serves as the marker that the controls were created before.
    """
    import bionty as bt

    # Note: if you add another control below, be mindful to change the condition that
    # triggers whether creating these records is re-considered
    existing_marker = ULabel.filter(name="SuspensionType", is_type=True).one_or_none()
    if existing_marker is not None:
        return

    logger.important("Creating control labels in the CellxGene schema.")

    # "normal" in Disease
    pato_source = bt.Source.get(name="pato", version="2024-03-28")
    normal = bt.Phenotype.from_source(ontology_id="PATO:0000461", source=pato_source)
    normal_disease = bt.Disease(
        uid=normal.uid,
        name=normal.name,
        ontology_id=normal.ontology_id,
        description=normal.description,
        source=normal.source,  # not sure
    )
    normal_disease.save()

    # na, unknown
    placeholder_records = (
        (bt.Ethnicity, "na"),
        (bt.Ethnicity, "unknown"),
        (bt.DevelopmentalStage, "unknown"),
        (bt.Phenotype, "unknown"),
        (bt.CellType, "unknown"),
    )
    for registry, value in placeholder_records:
        registry(
            ontology_id=value, name=value, description="From CellxGene schema."
        ).save()

    # tissue_type, then suspension_type: (type name, type description, members)
    label_types = (
        (
            "TissueType",
            'From CellxGene schema. Is "tissue", "organoid", or "cell culture".',
            ("tissue", "organoid", "cell culture"),
        ),
        (
            "SuspensionType",
            'From CellxGene schema. This MUST be "cell", "nucleus", or "na".',
            ("cell", "nucleus", "na"),
        ),
    )
    for type_name, type_description, members in label_types:
        label_type = ULabel(
            name=type_name, is_type=True, description=type_description
        ).save()
        for member in members:
            ULabel(
                name=member, type=label_type, description="From CellxGene schema."
            ).save()
@@ -0,0 +1,43 @@
1
+ schema_version,entity,organism,source,version
2
+ 4.0.0,CellType,all,cl,2023-08-24
3
+ 4.0.0,ExperimentalFactor,all,efo,3.57.0
4
+ 4.0.0,Ethnicity,human,hancestro,3.0
5
+ 4.0.0,DevelopmentalStage,human,hsapdv,2020-03-10
6
+ 4.0.0,DevelopmentalStage,mouse,mmusdv,2020-03-10
7
+ 4.0.0,Disease,all,mondo,2023-08-02
8
+ 4.0.0,Organism,all,ncbitaxon,2023-06-20
9
+ 4.0.0,Phenotype,all,pato,2023-05-18
10
+ 4.0.0,Tissue,all,uberon,2023-09-05
11
+ 5.0.0,CellType,all,cl,2024-01-04
12
+ 5.0.0,ExperimentalFactor,all,efo,3.62.0
13
+ 5.0.0,Ethnicity,human,hancestro,3.0
14
+ 5.0.0,DevelopmentalStage,human,hsapdv,2020-03-10
15
+ 5.0.0,DevelopmentalStage,mouse,mmusdv,2020-03-10
16
+ 5.0.0,Disease,all,mondo,2024-01-03
17
+ 5.0.0,Organism,all,ncbitaxon,2023-06-20
18
+ 5.0.0,Phenotype,all,pato,2023-05-18
19
+ 5.0.0,Tissue,all,uberon,2024-01-18
20
+ 5.0.0,Gene,human,ensembl,release-110
21
+ 5.0.0,Gene,mouse,ensembl,release-110
22
+ 5.1.0,CellType,all,cl,2024-04-05
23
+ 5.1.0,ExperimentalFactor,all,efo,3.65.0
24
+ 5.1.0,Ethnicity,human,hancestro,3.0
25
+ 5.1.0,DevelopmentalStage,human,hsapdv,2020-03-10
26
+ 5.1.0,DevelopmentalStage,mouse,mmusdv,2020-03-10
27
+ 5.1.0,Disease,all,mondo,2024-05-08
28
+ 5.1.0,Organism,all,ncbitaxon,2023-06-20
29
+ 5.1.0,Phenotype,all,pato,2023-05-18
30
+ 5.1.0,Tissue,all,uberon,2024-03-22
31
+ 5.1.0,Gene,human,ensembl,release-110
32
+ 5.1.0,Gene,mouse,ensembl,release-110
33
+ 5.2.0,CellType,all,cl,2024-08-16
34
+ 5.2.0,ExperimentalFactor,all,efo,3.69.0
35
+ 5.2.0,Ethnicity,human,hancestro,3.0
36
+ 5.2.0,DevelopmentalStage,human,hsapdv,2024-05-28
37
+ 5.2.0,DevelopmentalStage,mouse,mmusdv,2024-05-28
38
+ 5.2.0,Disease,all,mondo,2024-08-06
39
+ 5.2.0,Organism,all,ncbitaxon,2023-06-20
40
+ 5.2.0,Phenotype,all,pato,2023-05-18
41
+ 5.2.0,Tissue,all,uberon,2024-08-07
42
+ 5.2.0,Gene,human,ensembl,release-110
43
+ 5.2.0,Gene,mouse,ensembl,release-110
@@ -5,7 +5,7 @@ from collections import defaultdict
5
5
  from collections.abc import Iterable
6
6
  from datetime import date, datetime
7
7
  from itertools import compress
8
- from typing import TYPE_CHECKING, Any
8
+ from typing import TYPE_CHECKING, Any, MutableMapping
9
9
 
10
10
  import anndata as ad
11
11
  import numpy as np
@@ -201,7 +201,11 @@ def _get_categoricals(
201
201
  if hasattr(link, "feature_id") and link.feature_id is not None:
202
202
  feature = Feature.objects.using(self._state.db).get(id=link.feature_id)
203
203
  link_attr = get_link_attr(link, self)
204
- label_name = getattr(link, link_attr).name
204
+ label = getattr(link, link_attr)
205
+ name_attr = (
206
+ "name" if hasattr(label, "name") else label.__class__._name_field
207
+ )
208
+ label_name = getattr(label, name_attr)
205
209
  result[(feature.name, feature.dtype)].add(label_name)
206
210
 
207
211
  return dict(result)
@@ -1137,10 +1141,25 @@ def _add_set_from_anndata(
1137
1141
  self._host.save()
1138
1142
 
1139
1143
 
1144
def _unify_staged_feature_sets_by_hash(
    feature_sets: MutableMapping[str, Schema],
):
    """Make entries with equal ``.hash`` share the first object seen.

    ``feature_sets`` is updated in place: every value whose ``.hash`` was
    already encountered under an earlier key is replaced by that earlier
    value. The same mapping object is returned.
    """
    first_by_hash: dict[str, Any] = {}

    for slot, schema in feature_sets.items():
        # setdefault stores the first object per hash and returns it afterwards
        canonical = first_by_hash.setdefault(schema.hash, schema)
        if canonical is not schema:
            # only existing keys are reassigned, so iterating is safe
            feature_sets[slot] = canonical

    return feature_sets
1157
+
1158
+
1140
1159
  def _add_set_from_mudata(
1141
1160
  self,
1142
- var_fields: dict[str, FieldAttr],
1143
- obs_fields: dict[str, FieldAttr] = None,
1161
+ var_fields: dict[str, FieldAttr] | None = None,
1162
+ obs_fields: dict[str, FieldAttr] | None = None,
1144
1163
  mute: bool = False,
1145
1164
  organism: str | Record | None = None,
1146
1165
  ):
@@ -1152,6 +1171,7 @@ def _add_set_from_mudata(
1152
1171
  # parse and register features
1153
1172
  mdata = self._host.load()
1154
1173
  feature_sets = {}
1174
+
1155
1175
  obs_features = Feature.from_values(mdata.obs.columns) # type: ignore
1156
1176
  if len(obs_features) > 0:
1157
1177
  feature_sets["obs"] = Schema(features=obs_features)
@@ -1166,20 +1186,50 @@ def _add_set_from_mudata(
1166
1186
  for k, v in modality_fs.items():
1167
1187
  feature_sets[f"['{modality}'].{k}"] = v
1168
1188
 
1169
- def unify_staged_feature_sets_by_hash(feature_sets):
1170
- unique_values = {}
1189
+ # link feature sets
1190
+ self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(feature_sets)
1191
+ self._host.save()
1192
+
1171
1193
 
1172
- for key, value in feature_sets.items():
1173
- value_hash = value.hash # Assuming each value has a .hash attribute
1174
- if value_hash in unique_values:
1175
- feature_sets[key] = unique_values[value_hash]
1176
- else:
1177
- unique_values[value_hash] = value
1194
def _add_set_from_spatialdata(
    self,
    sample_metadata_key: str,
    sample_metadata_field: FieldAttr = Feature.name,
    var_fields: dict[str, FieldAttr] | None = None,
    obs_fields: dict[str, FieldAttr] | None = None,
    mute: bool = False,
    organism: str | Record | None = None,
):
    """Add features from SpatialData.

    Loads the host artifact, builds one schema from the columns of the sample
    metadata found under ``sample_metadata_key`` and further schemas for every
    table named in ``var_fields``, then stages them on the host and saves it.

    Args:
        sample_metadata_key: Key passed to ``get_attrs`` of the loaded object;
            also the key under which the sample schema is staged.
        sample_metadata_field: Field used to look up the sample features.
        var_fields: Mapping of table name to the field for its ``var``;
            only tables listed here are parsed.
        obs_fields: Mapping of table name to the field for its ``obs``;
            tables without an entry fall back to ``Feature.name``.
        mute: Forwarded to ``parse_staged_feature_sets_from_anndata``.
        organism: Forwarded to ``parse_staged_feature_sets_from_anndata``.
    """
    obs_fields = obs_fields or {}
    var_fields = var_fields or {}
    assert self._host.otype == "SpatialData"  # noqa: S101

    # parse and register features
    sdata = self._host.load()
    staged = {}

    # sample features
    sample_attrs = sdata.get_attrs(
        key=sample_metadata_key, return_as="df", flatten=True
    )
    sample_features = Feature.from_values(
        sample_attrs.columns,
        field=sample_metadata_field,
    )  # type: ignore
    if len(sample_features) > 0:
        staged[sample_metadata_key] = Schema(features=sample_features)

    # table features
    for table_name, var_field in var_fields.items():
        parsed = parse_staged_feature_sets_from_anndata(
            sdata[table_name],
            var_field=var_field,
            obs_field=obs_fields.get(table_name, Feature.name),
            mute=mute,
            organism=organism,
        )
        staged.update(
            {f"['{table_name}'].{slot}": schema for slot, schema in parsed.items()}
        )

    # link feature sets
    self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(staged)
    self._host.save()
1184
1234
 
1185
1235
 
@@ -1311,6 +1361,7 @@ FeatureManager._accessor_by_registry = _accessor_by_registry
1311
1361
  FeatureManager._add_set_from_df = _add_set_from_df
1312
1362
  FeatureManager._add_set_from_anndata = _add_set_from_anndata
1313
1363
  FeatureManager._add_set_from_mudata = _add_set_from_mudata
1364
+ FeatureManager._add_set_from_spatialdata = _add_set_from_spatialdata
1314
1365
  FeatureManager._add_from = _add_from
1315
1366
  FeatureManager.filter = filter
1316
1367
  FeatureManager.get = get