lamindb 1.3.0__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -30,7 +30,7 @@ from __future__ import annotations
30
30
  import copy
31
31
  import re
32
32
  from itertools import chain
33
- from typing import TYPE_CHECKING, Any, Literal
33
+ from typing import TYPE_CHECKING, Any, Callable, Literal
34
34
 
35
35
  import anndata as ad
36
36
  import lamindb_setup as ln_setup
@@ -65,7 +65,7 @@ from lamindb.models.artifact import (
65
65
  data_is_mudata,
66
66
  data_is_spatialdata,
67
67
  )
68
- from lamindb.models.feature import parse_dtype, parse_dtype_single_cat
68
+ from lamindb.models.feature import parse_dtype, parse_cat_dtype
69
69
  from lamindb.models._from_values import _format_values
70
70
 
71
71
  from ..errors import InvalidArgument, ValidationError
@@ -106,16 +106,22 @@ class CatLookup:
106
106
  categoricals: dict[str, FieldAttr],
107
107
  slots: dict[str, FieldAttr] = None,
108
108
  public: bool = False,
109
+ organism: str | None = None,
110
+ sources: dict[str, Record] | None = None,
109
111
  ) -> None:
110
112
  slots = slots or {}
111
113
  self._categoricals = {**categoricals, **slots}
112
114
  self._public = public
115
+ self._organism = organism
116
+ self._sources = sources
113
117
 
114
118
  def __getattr__(self, name):
115
119
  if name in self._categoricals:
116
120
  registry = self._categoricals[name].field.model
117
121
  if self._public and hasattr(registry, "public"):
118
- return registry.public().lookup()
122
+ return registry.public(
123
+ organism=self._organism, source=self._sources.get(name)
124
+ ).lookup()
119
125
  else:
120
126
  return registry.lookup()
121
127
  raise AttributeError(
@@ -126,7 +132,9 @@ class CatLookup:
126
132
  if name in self._categoricals:
127
133
  registry = self._categoricals[name].field.model
128
134
  if self._public and hasattr(registry, "public"):
129
- return registry.public().lookup()
135
+ return registry.public(
136
+ organism=self._organism, source=self._sources.get(name)
137
+ ).lookup()
130
138
  else:
131
139
  return registry.lookup()
132
140
  raise AttributeError(
@@ -229,7 +237,7 @@ class Curator:
229
237
  """{}""" # noqa: D415
230
238
  # Note that this docstring has to be consistent with the Artifact()
231
239
  # constructor signature
232
- pass
240
+ pass # pragma: no cover
233
241
 
234
242
 
235
243
  class SlotsCurator(Curator):
@@ -295,6 +303,28 @@ class SlotsCurator(Curator):
295
303
  )
296
304
 
297
305
 
306
+ def check_dtype(expected_type) -> Callable:
307
+ """Creates a check function for Pandera that validates a column's dtype.
308
+
309
+ Args:
310
+ expected_type: String identifier for the expected type ('int', 'float', or 'num')
311
+
312
+ Returns:
313
+ A function that checks if a series has the expected dtype
314
+ """
315
+
316
+ def check_function(series):
317
+ if expected_type == "int":
318
+ is_valid = pd.api.types.is_integer_dtype(series.dtype)
319
+ elif expected_type == "float":
320
+ is_valid = pd.api.types.is_float_dtype(series.dtype)
321
+ elif expected_type == "num":
322
+ is_valid = pd.api.types.is_numeric_dtype(series.dtype)
323
+ return is_valid
324
+
325
+ return check_function
326
+
327
+
298
328
  class DataFrameCurator(Curator):
299
329
  # the example in the docstring is tested in test_curators_quickstart_example
300
330
  """Curator for `DataFrame`.
@@ -348,14 +378,33 @@ class DataFrameCurator(Curator):
348
378
  # populate features
349
379
  pandera_columns = {}
350
380
  for feature in schema.features.all():
351
- pandera_dtype = (
352
- feature.dtype if not feature.dtype.startswith("cat") else "category"
353
- )
354
- pandera_columns[feature.name] = pandera.Column(
355
- pandera_dtype,
356
- nullable=feature.nullable,
357
- coerce=feature.coerce_dtype,
358
- )
381
+ if feature.dtype in {"int", "float", "num"}:
382
+ dtype = (
383
+ self._dataset[feature.name].dtype
384
+ if feature.name in self._dataset.columns
385
+ else None
386
+ )
387
+ pandera_columns[feature.name] = pandera.Column(
388
+ dtype=None,
389
+ checks=pandera.Check(
390
+ check_dtype(feature.dtype),
391
+ element_wise=False,
392
+ error=f"Column '{feature.name}' failed dtype check for '{feature.dtype}': got {dtype}",
393
+ ),
394
+ nullable=feature.nullable,
395
+ coerce=feature.coerce_dtype,
396
+ )
397
+ else:
398
+ pandera_dtype = (
399
+ feature.dtype
400
+ if not feature.dtype.startswith("cat")
401
+ else "category"
402
+ )
403
+ pandera_columns[feature.name] = pandera.Column(
404
+ pandera_dtype,
405
+ nullable=feature.nullable,
406
+ coerce=feature.coerce_dtype,
407
+ )
359
408
  if feature.dtype.startswith("cat"):
360
409
  categoricals[feature.name] = parse_dtype(feature.dtype)[0]["field"]
361
410
  self._pandera_schema = pandera.DataFrameSchema(
@@ -365,7 +414,7 @@ class DataFrameCurator(Curator):
365
414
  assert schema.itype is not None # noqa: S101
366
415
  self._cat_manager = DataFrameCatManager(
367
416
  self._dataset,
368
- columns=parse_dtype_single_cat(schema.itype, is_itype=True)["field"],
417
+ columns=parse_cat_dtype(schema.itype, is_itype=True)["field"],
369
418
  categoricals=categoricals,
370
419
  )
371
420
 
@@ -454,7 +503,7 @@ class DataFrameCurator(Curator):
454
503
  """{}""" # noqa: D415
455
504
  if not self._is_validated:
456
505
  self.validate() # raises ValidationError if doesn't validate
457
- result = parse_dtype_single_cat(self._schema.itype, is_itype=True)
506
+ result = parse_cat_dtype(self._schema.itype, is_itype=True)
458
507
  return save_artifact( # type: ignore
459
508
  self._dataset,
460
509
  description=description,
@@ -545,7 +594,7 @@ class AnnDataCurator(SlotsCurator):
545
594
  slot_schema,
546
595
  )
547
596
  for slot, slot_schema in schema.slots.items()
548
- if slot in {"obs", "var"}
597
+ if slot in {"obs", "var", "uns"}
549
598
  }
550
599
 
551
600
  @doc_args(SAVE_ARTIFACT_DOCSTRING)
@@ -560,14 +609,16 @@ class AnnDataCurator(SlotsCurator):
560
609
  """{}""" # noqa: D415
561
610
  if not self._is_validated:
562
611
  self.validate()
612
+ if "obs" in self.slots:
613
+ categoricals = self.slots["obs"]._cat_manager.categoricals
614
+ else:
615
+ categoricals = {}
563
616
  return save_artifact( # type: ignore
564
617
  self._dataset,
565
618
  description=description,
566
- fields=self.slots["obs"]._cat_manager.categoricals,
619
+ fields=categoricals,
567
620
  index_field=(
568
- parse_dtype_single_cat(self.slots["var"]._schema.itype, is_itype=True)[
569
- "field"
570
- ]
621
+ parse_cat_dtype(self.slots["var"]._schema.itype, is_itype=True)["field"]
571
622
  if "var" in self._slots
572
623
  else None
573
624
  ),
@@ -595,7 +646,7 @@ def _assign_var_fields_categoricals_multimodal(
595
646
  categoricals[modality] = {}
596
647
 
597
648
  if slot_type == "var":
598
- var_field = parse_dtype_single_cat(slot_schema.itype, is_itype=True)["field"]
649
+ var_field = parse_cat_dtype(slot_schema.itype, is_itype=True)["field"]
599
650
  if modality is None:
600
651
  # This should rarely/never be used since tables should have different var fields
601
652
  var_fields[slot] = var_field # pragma: no cover
@@ -870,10 +921,16 @@ class CatManager:
870
921
  # shared until here
871
922
  self._categoricals = categoricals or {}
872
923
  self._non_validated = None
873
- self._organism = organism
874
924
  self._sources = sources or {}
875
925
  self._columns_field = columns_field
876
926
  self._validate_category_error_messages: str = ""
927
+ # make sure to only fetch organism once at the beginning
928
+ if organism:
929
+ self._organism = organism
930
+ else:
931
+ fields = list(self._categoricals.values()) + [columns_field]
932
+ organisms = {get_organism_kwargs(field).get("organism") for field in fields}
933
+ self._organism = organisms.pop() if len(organisms) > 0 else None
877
934
 
878
935
  @property
879
936
  def non_validated(self) -> dict[str, list[str]]:
@@ -918,7 +975,7 @@ class CatManager:
918
975
  Returns:
919
976
  The boolean `True` if the dataset is validated. Otherwise, a string with the error message.
920
977
  """
921
- pass
978
+ pass # pragma: no cover
922
979
 
923
980
  def standardize(self, key: str) -> None:
924
981
  """Replace synonyms with standardized values.
@@ -943,31 +1000,24 @@ class CatManager:
943
1000
  run: Run | None = None,
944
1001
  ) -> Artifact:
945
1002
  """{}""" # noqa: D415
946
- from lamindb.core._settings import settings
947
-
1003
+ # Make sure all labels are saved in the current instance
948
1004
  if not self._is_validated:
949
1005
  self.validate() # returns True or False
950
1006
  if not self._is_validated: # need to raise error manually
951
1007
  raise ValidationError("Dataset does not validate. Please curate.")
952
1008
 
953
- # Make sure all labels are saved in the current instance
954
- verbosity = settings.verbosity
955
- try:
956
- settings.verbosity = "warning"
957
- self._artifact = save_artifact( # type: ignore
958
- self._dataset,
959
- key=key,
960
- description=description,
961
- fields=self.categoricals,
962
- index_field=self._columns_field,
963
- artifact=self._artifact,
964
- revises=revises,
965
- run=run,
966
- schema=None,
967
- organism=self._organism,
968
- )
969
- finally:
970
- settings.verbosity = verbosity
1009
+ self._artifact = save_artifact( # type: ignore
1010
+ self._dataset,
1011
+ key=key,
1012
+ description=description,
1013
+ fields=self.categoricals,
1014
+ index_field=self._columns_field,
1015
+ artifact=self._artifact,
1016
+ revises=revises,
1017
+ run=run,
1018
+ schema=None,
1019
+ organism=self._organism,
1020
+ )
971
1021
 
972
1022
  return self._artifact
973
1023
 
@@ -984,8 +1034,6 @@ class DataFrameCatManager(CatManager):
984
1034
  organism: str | None = None,
985
1035
  sources: dict[str, Record] | None = None,
986
1036
  ) -> None:
987
- from lamindb.core._settings import settings
988
-
989
1037
  if organism is not None and not isinstance(organism, str):
990
1038
  raise ValueError("organism must be a string such as 'human' or 'mouse'!")
991
1039
 
@@ -1010,6 +1058,8 @@ class DataFrameCatManager(CatManager):
1010
1058
  categoricals=self._categoricals,
1011
1059
  slots={"columns": self._columns_field},
1012
1060
  public=public,
1061
+ organism=self._organism,
1062
+ sources=self._sources,
1013
1063
  )
1014
1064
 
1015
1065
  def _save_columns(self, validated_only: bool = True) -> None:
@@ -1018,18 +1068,18 @@ class DataFrameCatManager(CatManager):
1018
1068
  update_registry(
1019
1069
  values=list(self.categoricals.keys()),
1020
1070
  field=self._columns_field,
1021
- key="columns",
1071
+ key="columns" if isinstance(self._dataset, pd.DataFrame) else "keys",
1022
1072
  validated_only=False,
1023
1073
  source=self._sources.get("columns"),
1024
1074
  )
1025
1075
 
1026
1076
  # Save the rest of the columns based on validated_only
1027
- additional_columns = set(self._dataset.columns) - set(self.categoricals.keys())
1077
+ additional_columns = set(self._dataset.keys()) - set(self.categoricals.keys())
1028
1078
  if additional_columns:
1029
1079
  update_registry(
1030
1080
  values=list(additional_columns),
1031
1081
  field=self._columns_field,
1032
- key="columns",
1082
+ key="columns" if isinstance(self._dataset, pd.DataFrame) else "keys",
1033
1083
  validated_only=validated_only,
1034
1084
  df=self._dataset, # Get the Feature type from df
1035
1085
  source=self._sources.get("columns"),
@@ -1037,7 +1087,7 @@ class DataFrameCatManager(CatManager):
1037
1087
 
1038
1088
  @deprecated(new_name="is run by default")
1039
1089
  def add_new_from_columns(self, organism: str | None = None, **kwargs):
1040
- pass
1090
+ pass # pragma: no cover
1041
1091
 
1042
1092
  def validate(self) -> bool:
1043
1093
  """Validate variables and categorical observations.
@@ -1093,7 +1143,7 @@ class DataFrameCatManager(CatManager):
1093
1143
  else:
1094
1144
  if key not in avail_keys:
1095
1145
  if key in self._categoricals:
1096
- logger.info(f"No unstandardized values found for {key!r}")
1146
+ logger.warning(f"No non-standardized values found for {key!r}")
1097
1147
  else:
1098
1148
  raise KeyError(
1099
1149
  f"{key!r} is not a valid key, available keys are: {_format_values(avail_keys)}!"
@@ -1173,7 +1223,9 @@ class AnnDataCatManager(CatManager):
1173
1223
  sources: dict[str, Record] | None = None,
1174
1224
  ) -> None:
1175
1225
  if isinstance(var_index, str):
1176
- raise TypeError("var_index parameter has to be a bionty field")
1226
+ raise TypeError(
1227
+ "var_index parameter has to be a field, e.g. Gene.ensembl_gene_id"
1228
+ )
1177
1229
 
1178
1230
  if not data_is_anndata(data):
1179
1231
  raise TypeError("data has to be an AnnData object")
@@ -1223,6 +1275,8 @@ class AnnDataCatManager(CatManager):
1223
1275
  categoricals=self._obs_fields,
1224
1276
  slots={"columns": self._columns_field, "var_index": self._var_field},
1225
1277
  public=public,
1278
+ organism=self._organism,
1279
+ sources=self._sources,
1226
1280
  )
1227
1281
 
1228
1282
  def _save_from_var_index(
@@ -1433,6 +1487,8 @@ class MuDataCatManager(CatManager):
1433
1487
  **{f"{k}_var_index": v for k, v in self._var_fields.items()},
1434
1488
  },
1435
1489
  public=public,
1490
+ organism=self._organism,
1491
+ sources=self._sources,
1436
1492
  )
1437
1493
 
1438
1494
  @deprecated(new_name="is run by default")
@@ -1442,7 +1498,7 @@ class MuDataCatManager(CatManager):
1442
1498
  column_names: list[str] | None = None,
1443
1499
  **kwargs,
1444
1500
  ):
1445
- pass
1501
+ pass # pragma: no cover
1446
1502
 
1447
1503
  def add_new_from_var_index(self, modality: str, **kwargs):
1448
1504
  """Update variable records.
@@ -1487,13 +1543,7 @@ class MuDataCatManager(CatManager):
1487
1543
  def validate(self) -> bool:
1488
1544
  """Validate categories."""
1489
1545
  # add all validated records to the current instance
1490
- verbosity = settings.verbosity
1491
- try:
1492
- settings.verbosity = "error"
1493
- self._update_registry_all()
1494
- finally:
1495
- settings.verbosity = verbosity
1496
-
1546
+ self._update_registry_all()
1497
1547
  self._non_validated = {} # type: ignore
1498
1548
 
1499
1549
  obs_validated = True
@@ -1684,6 +1734,8 @@ class SpatialDataCatManager(CatManager):
1684
1734
  categoricals=cat_values_dict,
1685
1735
  slots={"accessors": cat_values_dict.keys()},
1686
1736
  public=public,
1737
+ organism=self._organism,
1738
+ sources=self._sources,
1687
1739
  )
1688
1740
 
1689
1741
  def _update_registry_all(self) -> None:
@@ -1799,12 +1851,7 @@ class SpatialDataCatManager(CatManager):
1799
1851
  Whether the SpatialData object is validated.
1800
1852
  """
1801
1853
  # add all validated records to the current instance
1802
- verbosity = settings.verbosity
1803
- try:
1804
- settings.verbosity = "error"
1805
- self._update_registry_all()
1806
- finally:
1807
- settings.verbosity = verbosity
1854
+ self._update_registry_all()
1808
1855
 
1809
1856
  self._non_validated = {} # type: ignore
1810
1857
 
@@ -1957,15 +2004,12 @@ class TiledbsomaCatManager(CatManager):
1957
2004
 
1958
2005
  # register obs columns' names
1959
2006
  register_columns = list(self._obs_fields.keys())
1960
- organism = configure_organism(
1961
- self._columns_field.field.model, self._organism
1962
- ).get("organism")
1963
2007
  update_registry(
1964
2008
  values=register_columns,
1965
2009
  field=self._columns_field,
1966
2010
  key="columns",
1967
2011
  validated_only=False,
1968
- organism=organism,
2012
+ organism=self._organism,
1969
2013
  source=self._sources.get("columns"),
1970
2014
  )
1971
2015
  additional_columns = [k for k in valid_obs_keys if k not in register_columns]
@@ -1979,7 +2023,7 @@ class TiledbsomaCatManager(CatManager):
1979
2023
  field=self._columns_field,
1980
2024
  key="columns",
1981
2025
  validated_only=True,
1982
- organism=organism,
2026
+ organism=self._organism,
1983
2027
  source=self._sources.get("columns"),
1984
2028
  )
1985
2029
 
@@ -1999,22 +2043,19 @@ class TiledbsomaCatManager(CatManager):
1999
2043
  var_ms_values = (
2000
2044
  var_ms.read(column_names=[key]).concat()[key].to_pylist()
2001
2045
  )
2002
- organism = configure_organism(field.field.model, self._organism).get(
2003
- "organism"
2004
- )
2005
2046
  update_registry(
2006
2047
  values=var_ms_values,
2007
2048
  field=field,
2008
2049
  key=var_ms_key,
2009
2050
  validated_only=True,
2010
- organism=organism,
2051
+ organism=self._organism,
2011
2052
  source=self._sources.get(var_ms_key),
2012
2053
  )
2013
2054
  _, non_val = validate_categories(
2014
2055
  values=var_ms_values,
2015
2056
  field=field,
2016
2057
  key=var_ms_key,
2017
- organism=organism,
2058
+ organism=self._organism,
2018
2059
  source=self._sources.get(var_ms_key),
2019
2060
  )
2020
2061
  if len(non_val) > 0:
@@ -2031,22 +2072,19 @@ class TiledbsomaCatManager(CatManager):
2031
2072
  values = pa.compute.unique(
2032
2073
  obs.read(column_names=[key]).concat()[key]
2033
2074
  ).to_pylist()
2034
- organism = configure_organism(field.field.model, self._organism).get(
2035
- "organism"
2036
- )
2037
2075
  update_registry(
2038
2076
  values=values,
2039
2077
  field=field,
2040
2078
  key=key,
2041
2079
  validated_only=True,
2042
- organism=organism,
2080
+ organism=self._organism,
2043
2081
  source=self._sources.get(key),
2044
2082
  )
2045
2083
  _, non_val = validate_categories(
2046
2084
  values=values,
2047
2085
  field=field,
2048
2086
  key=key,
2049
- organism=organism,
2087
+ organism=self._organism,
2050
2088
  source=self._sources.get(key),
2051
2089
  )
2052
2090
  if len(non_val) > 0:
@@ -2095,15 +2133,12 @@ class TiledbsomaCatManager(CatManager):
2095
2133
  values, field = self._non_validated_values_field(k)
2096
2134
  if len(values) == 0:
2097
2135
  continue
2098
- organism = configure_organism(field.field.model, self._organism).get(
2099
- "organism"
2100
- )
2101
2136
  update_registry(
2102
2137
  values=values,
2103
2138
  field=field,
2104
2139
  key=k,
2105
2140
  validated_only=False,
2106
- organism=organism,
2141
+ organism=self._organism,
2107
2142
  source=self._sources.get(k),
2108
2143
  **kwargs,
2109
2144
  )
@@ -2138,6 +2173,8 @@ class TiledbsomaCatManager(CatManager):
2138
2173
  categoricals=self._obs_fields,
2139
2174
  slots={"columns": self._columns_field, **self._var_fields_flat},
2140
2175
  public=public,
2176
+ organism=self._organism,
2177
+ sources=self._sources,
2141
2178
  )
2142
2179
 
2143
2180
  def standardize(self, key: str):
@@ -2173,16 +2210,11 @@ class TiledbsomaCatManager(CatManager):
2173
2210
  else:
2174
2211
  slot = lambda experiment: experiment.obs
2175
2212
  slot_key = k
2176
- # errors if public ontology and the model has no organism
2177
- # has to be fixed in bionty
2178
- organism = configure_organism(field.field.model, self._organism).get(
2179
- "organism"
2180
- )
2181
2213
  syn_mapper = standardize_categories(
2182
2214
  values=values,
2183
2215
  field=field,
2184
2216
  source=self._sources.get(k),
2185
- organism=organism,
2217
+ organism=self._organism,
2186
2218
  )
2187
2219
  if (n_syn_mapper := len(syn_mapper)) == 0:
2188
2220
  continue
@@ -2259,9 +2291,6 @@ class TiledbsomaCatManager(CatManager):
2259
2291
 
2260
2292
  feature_sets = {}
2261
2293
  if len(self._obs_fields) > 0:
2262
- organism = configure_organism(
2263
- self._columns_field.field.model, self._organism
2264
- ).get("organism")
2265
2294
  empty_dict = {field.name: [] for field in self._obs_pa_schema} # type: ignore
2266
2295
  mock_df = pa.Table.from_pydict(
2267
2296
  empty_dict, schema=self._obs_pa_schema
@@ -2271,17 +2300,14 @@ class TiledbsomaCatManager(CatManager):
2271
2300
  df=mock_df,
2272
2301
  field=self._columns_field,
2273
2302
  mute=True,
2274
- organism=organism,
2303
+ organism=self._organism,
2275
2304
  )
2276
2305
  for ms in self._var_fields:
2277
2306
  var_key, var_field = self._var_fields[ms]
2278
- organism = configure_organism(var_field.field.model, self._organism).get(
2279
- "organism"
2280
- )
2281
2307
  feature_sets[f"{ms}__var"] = Schema.from_values(
2282
2308
  values=self._validated_values[f"{ms}__{var_key}"],
2283
2309
  field=var_field,
2284
- organism=organism,
2310
+ organism=self._organism,
2285
2311
  raise_validation_error=False,
2286
2312
  )
2287
2313
  artifact._staged_feature_sets = feature_sets
@@ -2291,11 +2317,10 @@ class TiledbsomaCatManager(CatManager):
2291
2317
  for key, field in self._obs_fields.items():
2292
2318
  feature = features.get(key)
2293
2319
  registry = field.field.model
2294
- organism = configure_organism(field.field.model, self._organism).get(
2295
- "organism"
2296
- )
2297
2320
  labels = registry.from_values(
2298
- values=self._validated_values[key], field=field, organism=organism
2321
+ values=self._validated_values[key],
2322
+ field=field,
2323
+ organism=self._organism,
2299
2324
  )
2300
2325
  if len(labels) == 0:
2301
2326
  continue
@@ -2722,10 +2747,11 @@ class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
2722
2747
  import wetlab as wl
2723
2748
 
2724
2749
  sources = {}
2725
- if "cell_line" in adata.obs.columns:
2726
- sources["cell_line"] = bt.Source.filter(
2727
- entity="bionty.CellLine", name="depmap"
2728
- ).first()
2750
+ # # do not yet specify cell_line source
2751
+ # if "cell_line" in adata.obs.columns:
2752
+ # sources["cell_line"] = bt.Source.filter(
2753
+ # entity="bionty.CellLine", name="depmap"
2754
+ # ).first()
2729
2755
  if "pert_compound" in adata.obs.columns:
2730
2756
  with logger.mute():
2731
2757
  chebi_source = bt.Source.filter(
@@ -2908,35 +2934,43 @@ def get_current_filter_kwargs(registry: type[Record], kwargs: dict) -> dict:
2908
2934
  source = kwargs.get("source")
2909
2935
  organism = kwargs.get("organism")
2910
2936
  filter_kwargs = kwargs.copy()
2911
- try:
2912
- verbosity = settings.verbosity
2913
- settings.verbosity = "error"
2914
- if isinstance(organism, Record) and organism._state.db != "default":
2915
- if db is None or db == "default":
2916
- organism_default = copy.copy(organism)
2917
- # save the organism record in the default database
2918
- organism_default.save()
2919
- filter_kwargs["organism"] = organism_default
2920
- if isinstance(source, Record) and source._state.db != "default":
2921
- if db is None or db == "default":
2922
- source_default = copy.copy(source)
2923
- # save the source record in the default database
2924
- source_default.save()
2925
- filter_kwargs["source"] = source_default
2926
- finally:
2927
- settings.verbosity = verbosity
2937
+
2938
+ if isinstance(organism, Record) and organism._state.db != "default":
2939
+ if db is None or db == "default":
2940
+ organism_default = copy.copy(organism)
2941
+ # save the organism record in the default database
2942
+ organism_default.save()
2943
+ filter_kwargs["organism"] = organism_default
2944
+ if isinstance(source, Record) and source._state.db != "default":
2945
+ if db is None or db == "default":
2946
+ source_default = copy.copy(source)
2947
+ # save the source record in the default database
2948
+ source_default.save()
2949
+ filter_kwargs["source"] = source_default
2950
+
2928
2951
  return filter_kwargs
2929
2952
 
2930
2953
 
2931
- def configure_organism(registry: Record, organism: str | None = None) -> dict[str, str]:
2954
+ def get_organism_kwargs(
2955
+ field: FieldAttr, organism: str | None = None
2956
+ ) -> dict[str, str]:
2932
2957
  """Check if a registry needs an organism and return the organism name."""
2933
- from ..models._from_values import _is_organism_required
2934
-
2935
- if _is_organism_required(registry):
2958
+ registry = field.field.model
2959
+ if registry.__base__.__name__ == "BioRecord":
2936
2960
  import bionty as bt
2961
+ from bionty._organism import is_organism_required
2937
2962
 
2938
- if organism is not None or bt.settings.organism is not None:
2939
- return {"organism": organism or bt.settings.organism.name}
2963
+ from ..models._from_values import get_organism_record_from_field
2964
+
2965
+ if is_organism_required(registry):
2966
+ if organism is not None or bt.settings.organism is not None:
2967
+ return {"organism": organism or bt.settings.organism.name}
2968
+ else:
2969
+ organism_record = get_organism_record_from_field(
2970
+ field, organism=organism
2971
+ )
2972
+ if organism_record is not None:
2973
+ return {"organism": organism_record.name}
2940
2974
  return {}
2941
2975
 
2942
2976
 
@@ -2969,17 +3003,16 @@ def validate_categories(
2969
3003
 
2970
3004
  registry = field.field.model
2971
3005
 
2972
- # {"organism": organism_name}
2973
- kwargs = configure_organism(registry, organism)
2974
- kwargs.update({"source": source} if source else {})
2975
- kwargs_current = get_current_filter_kwargs(registry, kwargs)
3006
+ kwargs_current = get_current_filter_kwargs(
3007
+ registry, {"organism": organism, "source": source}
3008
+ )
2976
3009
 
2977
3010
  # inspect values from the default instance
2978
3011
  inspect_result = registry.inspect(values, field=field, mute=True, **kwargs_current)
2979
3012
  non_validated = inspect_result.non_validated
2980
3013
  syn_mapper = inspect_result.synonyms_mapper
2981
3014
 
2982
- # inspect the non-validated values from public (bionty only)
3015
+ # inspect the non-validated values from public (BioRecord only)
2983
3016
  values_validated = []
2984
3017
  if hasattr(registry, "public"):
2985
3018
  public_records = registry.from_values(
@@ -3134,18 +3167,6 @@ def save_artifact(
3134
3167
  )
3135
3168
  artifact.save()
3136
3169
 
3137
- if organism is not None and index_field is not None:
3138
- feature_kwargs = configure_organism(
3139
- (
3140
- list(index_field.values())[0].field.model
3141
- if isinstance(index_field, dict)
3142
- else index_field.field.model
3143
- ),
3144
- organism,
3145
- )
3146
- else:
3147
- feature_kwargs = {}
3148
-
3149
3170
  def _add_labels(
3150
3171
  data: pd.DataFrame | ScverseDataStructures,
3151
3172
  artifact: Artifact,
@@ -3156,19 +3177,15 @@ def save_artifact(
3156
3177
  for key, field in fields.items():
3157
3178
  feature = features.get(key)
3158
3179
  registry = field.field.model
3159
- filter_kwargs = configure_organism(registry, organism)
3160
- filter_kwargs_current = get_current_filter_kwargs(registry, filter_kwargs)
3180
+ # we don't need source here because all records are already in the DB
3181
+ filter_kwargs = get_current_filter_kwargs(registry, {"organism": organism})
3161
3182
  df = data if isinstance(data, pd.DataFrame) else data.obs
3162
3183
  # multi-value columns are separated by "|"
3163
3184
  if not df[key].isna().all() and df[key].str.contains("|").any():
3164
3185
  values = df[key].str.split("|").explode().unique()
3165
3186
  else:
3166
3187
  values = df[key].unique()
3167
- labels = registry.from_values(
3168
- values,
3169
- field=field,
3170
- **filter_kwargs_current,
3171
- )
3188
+ labels = registry.from_values(values, field=field, **filter_kwargs)
3172
3189
  if len(labels) == 0:
3173
3190
  continue
3174
3191
  label_ref_is_name = None
@@ -3185,20 +3202,26 @@ def save_artifact(
3185
3202
 
3186
3203
  match artifact.otype:
3187
3204
  case "DataFrame":
3188
- artifact.features._add_set_from_df(field=index_field, **feature_kwargs) # type: ignore
3205
+ artifact.features._add_set_from_df(field=index_field, organism=organism) # type: ignore
3189
3206
  _add_labels(
3190
3207
  data, artifact, fields, feature_ref_is_name=_ref_is_name(index_field)
3191
3208
  )
3192
3209
  case "AnnData":
3210
+ if schema is not None and "uns" in schema.slots:
3211
+ uns_field = parse_cat_dtype(schema.slots["uns"].itype, is_itype=True)[
3212
+ "field"
3213
+ ]
3214
+ else:
3215
+ uns_field = None
3193
3216
  artifact.features._add_set_from_anndata( # type: ignore
3194
- var_field=index_field, **feature_kwargs
3217
+ var_field=index_field, uns_field=uns_field, organism=organism
3195
3218
  )
3196
3219
  _add_labels(
3197
3220
  data, artifact, fields, feature_ref_is_name=_ref_is_name(index_field)
3198
3221
  )
3199
3222
  case "MuData":
3200
3223
  artifact.features._add_set_from_mudata( # type: ignore
3201
- var_fields=index_field, **feature_kwargs
3224
+ var_fields=index_field, organism=organism
3202
3225
  )
3203
3226
  for modality, modality_fields in fields.items():
3204
3227
  column_field_modality = index_field.get(modality)
@@ -3228,7 +3251,7 @@ def save_artifact(
3228
3251
  artifact.features._add_set_from_spatialdata( # type: ignore
3229
3252
  sample_metadata_key=kwargs.get("sample_metadata_key", "sample"),
3230
3253
  var_fields=index_field,
3231
- **feature_kwargs,
3254
+ organism=organism,
3232
3255
  )
3233
3256
  sample_metadata_key = kwargs.get("sample_metadata_key", "sample")
3234
3257
  for accessor, accessor_fields in fields.items():
@@ -3305,77 +3328,63 @@ def update_registry(
3305
3328
  from lamindb.models.save import save as ln_save
3306
3329
 
3307
3330
  registry = field.field.model
3308
- filter_kwargs = configure_organism(registry, organism)
3309
- filter_kwargs.update({"source": source} if source else {})
3331
+ filter_kwargs = get_current_filter_kwargs(
3332
+ registry, {"organism": organism, "source": source}
3333
+ )
3310
3334
  values = [i for i in values if isinstance(i, str) and i]
3311
3335
  if not values:
3312
3336
  return
3313
3337
 
3314
- verbosity = settings.verbosity
3315
- try:
3316
- settings.verbosity = "error"
3317
- labels_saved: dict = {"from public": [], "new": []}
3338
+ labels_saved: dict = {"from public": [], "new": []}
3318
3339
 
3319
- # inspect the default instance and save validated records from public
3320
- filter_kwargs_current = get_current_filter_kwargs(registry, filter_kwargs)
3321
- existing_and_public_records = registry.from_values(
3322
- list(values), field=field, **filter_kwargs_current
3323
- )
3324
- existing_and_public_labels = [
3325
- getattr(r, field.field.name) for r in existing_and_public_records
3326
- ]
3327
- # public records that are not already in the database
3328
- public_records = [r for r in existing_and_public_records if r._state.adding]
3329
- # here we check to only save the public records if they are from the specified source
3330
- # we check the uid because r.source and source can be from different instances
3331
- if source:
3332
- public_records = [r for r in public_records if r.source.uid == source.uid]
3333
- if len(public_records) > 0:
3334
- settings.verbosity = "info"
3335
- logger.info(f"saving validated records of '{key}'")
3336
- settings.verbosity = "error"
3337
- ln_save(public_records)
3338
- labels_saved["from public"] = [
3339
- getattr(r, field.field.name) for r in public_records
3340
- ]
3341
- # non-validated records from the default instance
3342
- non_validated_labels = [
3343
- i for i in values if i not in existing_and_public_labels
3340
+ # inspect the default instance and save validated records from public
3341
+ existing_and_public_records = registry.from_values(
3342
+ list(values), field=field, **filter_kwargs, mute=True
3343
+ )
3344
+ existing_and_public_labels = [
3345
+ getattr(r, field.field.name) for r in existing_and_public_records
3346
+ ]
3347
+ # public records that are not already in the database
3348
+ public_records = [r for r in existing_and_public_records if r._state.adding]
3349
+ # here we check to only save the public records if they are from the specified source
3350
+ # we check the uid because r.source and source can be from different instances
3351
+ if source:
3352
+ public_records = [r for r in public_records if r.source.uid == source.uid]
3353
+ if len(public_records) > 0:
3354
+ logger.info(f"saving validated records of '{key}'")
3355
+ ln_save(public_records)
3356
+ labels_saved["from public"] = [
3357
+ getattr(r, field.field.name) for r in public_records
3344
3358
  ]
3359
+ # non-validated records from the default instance
3360
+ non_validated_labels = [i for i in values if i not in existing_and_public_labels]
3361
+
3362
+ # save non-validated/new records
3363
+ labels_saved["new"] = non_validated_labels
3364
+ if not validated_only:
3365
+ non_validated_records: RecordList[Any] = [] # type: ignore
3366
+ if df is not None and registry == Feature:
3367
+ nonval_columns = Feature.inspect(df.columns, mute=True).non_validated
3368
+ non_validated_records = Feature.from_df(df.loc[:, nonval_columns])
3369
+ else:
3370
+ if (
3371
+ organism
3372
+ and hasattr(registry, "organism")
3373
+ and registry._meta.get_field("organism").is_relation
3374
+ ):
3375
+ # make sure organism record is saved to the current instance
3376
+ create_kwargs["organism"] = _save_organism(name=organism)
3345
3377
 
3346
- # save non-validated/new records
3347
- labels_saved["new"] = non_validated_labels
3348
- if not validated_only:
3349
- non_validated_records: RecordList[Any] = [] # type: ignore
3350
- if df is not None and registry == Feature:
3351
- nonval_columns = Feature.inspect(df.columns, mute=True).non_validated
3352
- non_validated_records = Feature.from_df(df.loc[:, nonval_columns])
3353
- else:
3354
- if "organism" in filter_kwargs:
3355
- # make sure organism record is saved to the current instance
3356
- filter_kwargs["organism"] = _save_organism(name=organism)
3357
- init_kwargs = {}
3358
- for value in labels_saved["new"]:
3359
- init_kwargs[field.field.name] = value
3360
- if registry == Feature:
3361
- init_kwargs["dtype"] = "cat" if dtype is None else dtype
3362
- non_validated_records.append(
3363
- registry(
3364
- **init_kwargs,
3365
- **{k: v for k, v in filter_kwargs.items() if k != "source"},
3366
- **{
3367
- k: v for k, v in create_kwargs.items() if k != "sources"
3368
- },
3369
- )
3370
- )
3371
- ln_save(non_validated_records)
3372
-
3373
- # save parent labels for ulabels, for example a parent label "project" for label "project001"
3374
- if registry == ULabel and field.field.name == "name":
3375
- save_ulabels_type(values, field=field, key=key)
3378
+ for value in labels_saved["new"]:
3379
+ init_kwargs = {field.field.name: value}
3380
+ if registry == Feature:
3381
+ init_kwargs["dtype"] = "cat" if dtype is None else dtype
3382
+ non_validated_records.append(registry(**init_kwargs, **create_kwargs))
3383
+ ln_save(non_validated_records)
3376
3384
 
3377
- finally:
3378
- settings.verbosity = verbosity
3385
+ # save parent labels for ulabels, for example a parent label "project" for label "project001"
3386
+ if registry == ULabel and field.field.name == "name":
3387
+ save_ulabels_type(values, field=field, key=key)
3379
3388
 
3380
3389
  log_saved_labels(
3381
3390
  labels_saved,
@@ -3433,8 +3442,9 @@ def _save_organism(name: str):
3433
3442
  organism = bt.Organism.from_source(name=name)
3434
3443
  if organism is None:
3435
3444
  raise ValidationError(
3436
- f'Organism "{name}" not found\n'
3437
- f' → please save it: bt.Organism(name="{name}").save()'
3445
+ f'Organism "{name}" not found from public reference\n'
3446
+ f' → please save it from a different source: bt.Organism.from_source(name="{name}", source).save()'
3447
+ f' → or manually save it without source: bt.Organism(name="{name}").save()'
3438
3448
  )
3439
3449
  organism.save()
3440
3450
  return organism