lamindb 1.0.5__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. lamindb/__init__.py +17 -6
  2. lamindb/_artifact.py +202 -87
  3. lamindb/_can_curate.py +27 -8
  4. lamindb/_collection.py +86 -52
  5. lamindb/_feature.py +177 -41
  6. lamindb/_finish.py +21 -7
  7. lamindb/_from_values.py +83 -98
  8. lamindb/_parents.py +4 -4
  9. lamindb/_query_set.py +78 -18
  10. lamindb/_record.py +170 -53
  11. lamindb/_run.py +4 -4
  12. lamindb/_save.py +42 -11
  13. lamindb/_schema.py +135 -38
  14. lamindb/_storage.py +1 -1
  15. lamindb/_tracked.py +129 -0
  16. lamindb/_transform.py +21 -8
  17. lamindb/_ulabel.py +5 -14
  18. lamindb/base/users.py +1 -4
  19. lamindb/base/validation.py +2 -6
  20. lamindb/core/__init__.py +13 -14
  21. lamindb/core/_context.py +14 -9
  22. lamindb/core/_data.py +29 -25
  23. lamindb/core/_describe.py +1 -1
  24. lamindb/core/_django.py +1 -1
  25. lamindb/core/_feature_manager.py +53 -43
  26. lamindb/core/_label_manager.py +4 -4
  27. lamindb/core/_mapped_collection.py +24 -9
  28. lamindb/core/_track_environment.py +2 -1
  29. lamindb/core/datasets/__init__.py +6 -1
  30. lamindb/core/datasets/_core.py +12 -11
  31. lamindb/core/datasets/_small.py +67 -21
  32. lamindb/core/exceptions.py +1 -90
  33. lamindb/core/loaders.py +21 -15
  34. lamindb/core/relations.py +6 -4
  35. lamindb/core/storage/_anndata_accessor.py +49 -3
  36. lamindb/core/storage/_backed_access.py +12 -7
  37. lamindb/core/storage/_pyarrow_dataset.py +40 -15
  38. lamindb/core/storage/_tiledbsoma.py +56 -12
  39. lamindb/core/storage/paths.py +30 -24
  40. lamindb/core/subsettings/_creation_settings.py +4 -16
  41. lamindb/curators/__init__.py +2193 -846
  42. lamindb/curators/_cellxgene_schemas/__init__.py +26 -0
  43. lamindb/curators/_cellxgene_schemas/schema_versions.yml +104 -0
  44. lamindb/errors.py +96 -0
  45. lamindb/integrations/_vitessce.py +3 -3
  46. lamindb/migrations/0069_squashed.py +76 -75
  47. lamindb/migrations/0075_lamindbv1_part5.py +4 -5
  48. lamindb/migrations/0082_alter_feature_dtype.py +21 -0
  49. lamindb/migrations/0083_alter_feature_is_type_alter_flextable_is_type_and_more.py +94 -0
  50. lamindb/migrations/0084_alter_schemafeature_feature_and_more.py +35 -0
  51. lamindb/migrations/0085_alter_feature_is_type_alter_flextable_is_type_and_more.py +63 -0
  52. lamindb/migrations/0086_various.py +95 -0
  53. lamindb/migrations/0087_rename__schemas_m2m_artifact_feature_sets_and_more.py +41 -0
  54. lamindb/migrations/0088_schema_components.py +273 -0
  55. lamindb/migrations/0088_squashed.py +4372 -0
  56. lamindb/models.py +475 -168
  57. {lamindb-1.0.5.dist-info → lamindb-1.1.1.dist-info}/METADATA +9 -7
  58. lamindb-1.1.1.dist-info/RECORD +95 -0
  59. lamindb/curators/_spatial.py +0 -528
  60. lamindb/migrations/0052_squashed.py +0 -1261
  61. lamindb/migrations/0053_alter_featureset_hash_alter_paramvalue_created_by_and_more.py +0 -57
  62. lamindb/migrations/0054_alter_feature_previous_runs_and_more.py +0 -35
  63. lamindb/migrations/0055_artifact_type_artifactparamvalue_and_more.py +0 -61
  64. lamindb/migrations/0056_rename_ulabel_ref_is_name_artifactulabel_label_ref_is_name_and_more.py +0 -22
  65. lamindb/migrations/0057_link_models_latest_report_and_others.py +0 -356
  66. lamindb/migrations/0058_artifact__actions_collection__actions.py +0 -22
  67. lamindb/migrations/0059_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -31
  68. lamindb/migrations/0060_alter_artifact__actions.py +0 -22
  69. lamindb/migrations/0061_alter_collection_meta_artifact_alter_run_environment_and_more.py +0 -45
  70. lamindb/migrations/0062_add_is_latest_field.py +0 -32
  71. lamindb/migrations/0063_populate_latest_field.py +0 -45
  72. lamindb/migrations/0064_alter_artifact_version_alter_collection_version_and_more.py +0 -33
  73. lamindb/migrations/0065_remove_collection_feature_sets_and_more.py +0 -22
  74. lamindb/migrations/0066_alter_artifact__feature_values_and_more.py +0 -352
  75. lamindb/migrations/0067_alter_featurevalue_unique_together_and_more.py +0 -20
  76. lamindb/migrations/0068_alter_artifactulabel_unique_together_and_more.py +0 -20
  77. lamindb/migrations/0069_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -1294
  78. lamindb-1.0.5.dist-info/RECORD +0 -102
  79. {lamindb-1.0.5.dist-info → lamindb-1.1.1.dist-info}/LICENSE +0 -0
  80. {lamindb-1.0.5.dist-info → lamindb-1.1.1.dist-info}/WHEEL +0 -0
lamindb/core/_data.py CHANGED
@@ -21,6 +21,8 @@ from lamindb.models import (
      record_repr,
  )
  
+ from .._tracked import get_current_tracked_run
+ from ..errors import ValidationError
  from ._context import context
  from ._django import get_artifact_with_related, get_related_model
  from ._feature_manager import (
@@ -28,7 +30,6 @@ from ._feature_manager import (
      get_host_id_field,
      get_label_links,
  )
- from .exceptions import ValidationError
  from .relations import (
      dict_module_name_to_model_name,
      dict_related_model_to_related_name,
@@ -45,9 +46,12 @@ WARNING_RUN_TRANSFORM = "no run & transform got linked, call `ln.track()` & re-r
  WARNING_NO_INPUT = "run input wasn't tracked, call `ln.track()` and re-run"
  
  
+ # also see current_run() in core._data
  def get_run(run: Run | None) -> Run | None:
      if run is None:
-         run = context.run
+         run = get_current_tracked_run()
+         if run is None:
+             run = context.run
      if run is None and not settings.creation.artifact_silence_missing_run_warning:
          logger.warning(WARNING_RUN_TRANSFORM)
      # suppress run by passing False
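The behavioral change in this hunk: a run opened by the new `@ln.tracked()` decorator (introduced in this release via `lamindb/_tracked.py`) now takes precedence over the global `ln.track()` context when no run is passed explicitly. A minimal sketch of that resolution order, with the two lookups stubbed out as plain arguments rather than the real `get_current_tracked_run()` and `context.run`:

```python
# hedged sketch of the new precedence order in get_run()
def resolve_run(run=None, tracked_run=None, context_run=None):
    if run is None:
        run = tracked_run  # a run opened by @ln.tracked() wins first
        if run is None:
            run = context_run  # then fall back to the global ln.track() context
    return run

assert resolve_run(tracked_run="decorated", context_run="global") == "decorated"
assert resolve_run(context_run="global") == "global"
assert resolve_run(run="explicit", tracked_run="decorated") == "explicit"
```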
@@ -56,26 +60,26 @@ def get_run(run: Run | None) -> Run | None:
      return run
  
  
- def save_staged__schemas_m2m(self: Artifact | Collection) -> None:
-     if hasattr(self, "_staged__schemas_m2m"):
+ def save_staged_feature_sets(self: Artifact | Collection) -> None:
+     if hasattr(self, "_staged_feature_sets"):
          from lamindb.core._feature_manager import get_schema_by_slot_
  
-         existing_staged__schemas_m2m = get_schema_by_slot_(self)
-         saved_staged__schemas_m2m = {}
-         for key, schema in self._staged__schemas_m2m.items():
+         existing_staged_feature_sets = get_schema_by_slot_(self)
+         saved_staged_feature_sets = {}
+         for key, schema in self._staged_feature_sets.items():
              if isinstance(schema, Schema) and schema._state.adding:
                  schema.save()
-                 saved_staged__schemas_m2m[key] = schema
-             if key in existing_staged__schemas_m2m:
+                 saved_staged_feature_sets[key] = schema
+             if key in existing_staged_feature_sets:
                  # remove existing feature set on the same slot
-                 self._schemas_m2m.remove(existing_staged__schemas_m2m[key])
-         if len(saved_staged__schemas_m2m) > 0:
-             s = "s" if len(saved_staged__schemas_m2m) > 1 else ""
+                 self.feature_sets.remove(existing_staged_feature_sets[key])
+         if len(saved_staged_feature_sets) > 0:
+             s = "s" if len(saved_staged_feature_sets) > 1 else ""
              display_schema_keys = ",".join(
-                 f"'{key}'" for key in saved_staged__schemas_m2m.keys()
+                 f"'{key}'" for key in saved_staged_feature_sets.keys()
              )
              logger.save(
-                 f"saved {len(saved_staged__schemas_m2m)} feature set{s} for slot{s}:"
+                 f"saved {len(saved_staged_feature_sets)} feature set{s} for slot{s}:"
                  f" {display_schema_keys}"
              )
@@ -84,16 +88,16 @@ def save_schema_links(self: Artifact | Collection) -> None:
      from lamindb._save import bulk_create
  
      Data = self.__class__
-     if hasattr(self, "_staged__schemas_m2m"):
+     if hasattr(self, "_staged_feature_sets"):
          links = []
          host_id_field = get_host_id_field(self)
-         for slot, schema in self._staged__schemas_m2m.items():
+         for slot, schema in self._staged_feature_sets.items():
              kwargs = {
                  host_id_field: self.id,
                  "schema_id": schema.id,
                  "slot": slot,
              }
-             links.append(Data._schemas_m2m.through(**kwargs))
+             links.append(Data.feature_sets.through(**kwargs))
          bulk_create(links, ignore_conflicts=True)
@@ -182,7 +186,7 @@ def _describe_sqlite(self: Artifact | Collection, print_types: bool = False):
      if isinstance(self, (Collection, Artifact)):
          many_to_many_fields.append("input_of_runs")
      if isinstance(self, Artifact):
-         many_to_many_fields.append("_schemas_m2m")
+         many_to_many_fields.append("feature_sets")
      self = (
          self.__class__.objects.using(self._state.db)
          .prefetch_related(*many_to_many_fields)
@@ -335,10 +339,10 @@ add_labels(
      else:
          validate_feature(feature, records)  # type:ignore
          records_by_registry = defaultdict(list)
-         _schemas_m2m = self._schemas_m2m.filter(itype="Feature").all()
+         feature_sets = self.feature_sets.filter(itype="Feature").all()
          internal_features = set()  # type: ignore
-         if len(_schemas_m2m) > 0:
-             for schema in _schemas_m2m:
+         if len(feature_sets) > 0:
+             for schema in feature_sets:
                  internal_features = internal_features.union(
                      set(schema.members.values_list("name", flat=True))
                  )  # type: ignore
@@ -357,7 +361,7 @@ add_labels(
                  f"Feature {feature.name} needs dtype='cat' for label annotation, currently has dtype='{feature.dtype}'"
              )
          if feature.dtype == "cat":
-             feature.dtype = f"cat[{registry_name}]"
+             feature.dtype = f"cat[{registry_name}]"  # type: ignore
              feature.save()
          elif registry_name not in feature.dtype:
              new_dtype = feature.dtype.rstrip("]") + f"|{registry_name}]"
@@ -386,13 +390,13 @@ _track_run_input(
      is_run_input: bool | Run | None = None,
      run: Run | None = None,
  ):
-     # this is an internal hack right now for project-flow, but we can allow this
-     # for the user in the future
      if isinstance(is_run_input, Run):
          run = is_run_input
          is_run_input = True
      elif run is None:
-         run = context.run
+         run = get_current_tracked_run()
+         if run is None:
+             run = context.run
      # consider that data is an iterable of Data
      data_iter: Iterable[Artifact] | Iterable[Collection] = (
          [data] if isinstance(data, (Artifact, Collection)) else data
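`_track_run_input` resolves the run the same way, and with the internal-hack comment removed, passing a `Run` through `is_run_input` is no longer flagged as internal. A hedged usage sketch (the artifact key and run lookup are illustrative, not from this diff):

```python
import lamindb as ln

artifact = ln.Artifact.get(key="datasets/example.parquet")  # hypothetical key
run = ln.Run.filter().first()  # any existing run, for illustration

# passing a Run instead of a bool links the artifact as an input of that run
df = artifact.load(is_run_input=run)
```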
lamindb/core/_describe.py CHANGED
@@ -76,7 +76,7 @@ def describe_header(self: Artifact | Collection | Run) -> Tree:
      if self._branch_code == 0:
          logger.warning("This artifact is hidden.")
      elif self._branch_code == -1:
-         logger.warning("This artifact is the trash.")
+         logger.warning("This artifact is in the trash.")
      # initialize tree
      suffix = self.suffix if hasattr(self, "suffix") and self.suffix else ""
      accessor = self.otype if hasattr(self, "otype") and self.otype else ""
lamindb/core/_django.py CHANGED
@@ -105,7 +105,7 @@ def get_artifact_with_related(
  
      if include_schema:
          annotations["schemas"] = Subquery(
-             model._schemas_m2m.through.objects.filter(artifact=OuterRef("pk"))
+             model.feature_sets.through.objects.filter(artifact=OuterRef("pk"))
              .annotate(
                  data=JSONObject(
                      id=F("id"),
lamindb/core/_feature_manager.py CHANGED
@@ -33,8 +33,8 @@ from lamindb._record import (
  )
  from lamindb._save import save
  from lamindb._schema import DICT_KEYS_TYPE, Schema
- from lamindb.core.exceptions import DoesNotExist, ValidationError
  from lamindb.core.storage import LocalPathClasses
+ from lamindb.errors import DoesNotExist, ValidationError
  from lamindb.models import (
      Artifact,
      Collection,
@@ -96,8 +96,8 @@ def get_schema_by_slot_(host: Artifact | Collection) -> dict:
          return {}
      # if the host is not yet saved
      if host._state.adding:
-         if hasattr(host, "_staged__schemas_m2m"):
-             return host._staged__schemas_m2m
+         if hasattr(host, "_staged_feature_sets"):
+             return host._staged_feature_sets
          else:
              return {}
      host_db = host._state.db
@@ -105,7 +105,7 @@ def get_schema_by_slot_(host: Artifact | Collection) -> dict:
      kwargs = {host_id_field: host.id}
      # otherwise, we need a query
      links_schema = (
-         host._schemas_m2m.through.objects.using(host_db)
+         host.feature_sets.through.objects.using(host_db)
          .filter(**kwargs)
          .select_related("schema")
      )
@@ -118,7 +118,7 @@ def get_label_links(
      host_id_field = get_host_id_field(host)
      kwargs = {host_id_field: host.id, "feature_id": feature.id}
      link_records = (
-         getattr(host, host.features._accessor_by_registry[registry])
+         getattr(host, host.features._accessor_by_registry[registry])  # type: ignore
          .through.objects.using(host._state.db)
          .filter(**kwargs)
      )
@@ -128,14 +128,14 @@
  def get_schema_links(host: Artifact | Collection) -> QuerySet:
      host_id_field = get_host_id_field(host)
      kwargs = {host_id_field: host.id}
-     links_schema = host._schemas_m2m.through.objects.filter(**kwargs)
+     links_schema = host.feature_sets.through.objects.filter(**kwargs)
      return links_schema
  
  
  def get_link_attr(link: LinkORM | type[LinkORM], data: Artifact | Collection) -> str:
      link_model_name = link.__class__.__name__
      if link_model_name in {"Registry", "ModelBase"}:  # we passed the type of the link
-         link_model_name = link.__name__
+         link_model_name = link.__name__  # type: ignore
      return link_model_name.replace(data.__class__.__name__, "").lower()
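`get_link_attr` leans on a naming convention: a link model is named `<HostClass><Label>`, so stripping the host class name and lowercasing what remains yields the field name on the link record. The convention in isolation (class names illustrative):

```python
def link_attr(link_model_name: str, host_class_name: str) -> str:
    # e.g. link model "ArtifactULabel" on an Artifact host -> field "ulabel"
    return link_model_name.replace(host_class_name, "").lower()

assert link_attr("ArtifactULabel", "Artifact") == "ulabel"
assert link_attr("CollectionArtifact", "Collection") == "artifact"
```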
  
  
@@ -348,10 +348,10 @@ describe_features(
  
      internal_feature_names: dict[str, str] = {}
      if isinstance(self, Artifact):
-         _schemas_m2m = self._schemas_m2m.filter(itype="Feature").all()
+         feature_sets = self.feature_sets.filter(itype="Feature").all()
          internal_feature_names = {}
-         if len(_schemas_m2m) > 0:
-             for schema in _schemas_m2m:
+         if len(feature_sets) > 0:
+             for schema in feature_sets:
                  internal_feature_names.update(
                      dict(schema.members.values_list("name", "dtype"))
                  )
@@ -500,7 +500,7 @@ describe_features(
      return tree
  
  
- def parse_staged__schemas_m2m_from_anndata(
+ def parse_staged_feature_sets_from_anndata(
      adata: AnnData,
      var_field: FieldAttr | None = None,
      obs_field: FieldAttr = Feature.name,
@@ -524,7 +524,7 @@
          if adata.X is None
          else convert_pandas_dtype_to_lamin_dtype(adata.X.dtype)
      )
-     _schemas_m2m = {}
+     feature_sets = {}
      if var_field is not None:
          logger.info("parsing feature names of X stored in slot 'var'")
          logger.indent = " "
@@ -537,7 +537,7 @@
              raise_validation_error=False,
          )
          if schema_var is not None:
-             _schemas_m2m["var"] = schema_var
+             feature_sets["var"] = schema_var
              logger.save(f"linked: {schema_var}")
          logger.indent = ""
          if schema_var is None:
@@ -552,12 +552,12 @@
              organism=organism,
          )
          if schema_obs is not None:
-             _schemas_m2m["obs"] = schema_obs
+             feature_sets["obs"] = schema_obs
              logger.save(f"linked: {schema_obs}")
          logger.indent = ""
          if schema_obs is None:
              logger.warning("skip linking features to artifact in slot 'obs'")
-     return _schemas_m2m
+     return feature_sets
  
  
  def is_valid_datetime_str(date_string: str) -> bool | str:
@@ -818,6 +818,8 @@ def _add_values(
          feature_param_field: The field of a reference registry to map keys of the
              dictionary.
      """
+     from .._tracked import get_current_tracked_run
+ 
      # rename to distinguish from the values inside the dict
      features_values = values
      keys = features_values.keys()
@@ -849,12 +851,20 @@
              (key, infer_feature_type_convert_json(key, features_values[key]))
              for key in not_validated_keys
          ]
-         hint = "\n".join(
-             [
-                 f" ln.{model_name}(name='{key}', dtype='{dtype}').save(){message}"
-                 for key, (dtype, _, message) in not_validated_keys_dtype_message
-             ]
-         )
+         run = get_current_tracked_run()
+         if run is not None:
+             name = f"{run.transform.type}[{run.transform.key}]"
+             type_hint = f""" {model_name.lower()}_type = ln.{model_name}(name='{name}', is_type=True).save()"""
+             elements = [type_hint]
+             type_kwarg = f", type={model_name.lower()}_type"
+         else:
+             elements = []
+             type_kwarg = ""
+         elements += [
+             f" ln.{model_name}(name='{key}', dtype='{dtype}'{type_kwarg}).save(){message}"
+             for key, (dtype, _, message) in not_validated_keys_dtype_message
+         ]
+         hint = "\n".join(elements)
          msg = (
              f"These keys could not be validated: {not_validated_keys.tolist()}\n"
              f"Here is how to create a {model_name.lower()}:\n\n{hint}"
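The user-visible effect of this hunk is in the validation-error hint: when a tracked run is active, the suggested fix now first creates a type record named after the transform and passes it via `type=`. Reconstructed from the f-strings above, with illustrative values:

```python
# what the emitted hint roughly looks like inside a tracked script run
feature_type = ln.Feature(name='script[analysis.py]', is_type=True).save()
ln.Feature(name='temperature', dtype='float', type=feature_type).save()
```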
@@ -928,7 +938,7 @@
              validated_values = values_array[validated]
              if validated.sum() != len(values):
                  not_validated_values += values_array[~validated].tolist()
-             label_records = ULabel.from_values(validated_values, field="name")
+             label_records = ULabel.from_values(validated_values, field="name")  # type: ignore
              features_labels["ULabel"] += [
                  (feature, label_record) for label_record in label_records
              ]
@@ -1012,8 +1022,8 @@ remove_values(
      if isinstance(feature, str):
          feature = Feature.get(name=feature)
      filter_kwargs = {"feature": feature}
-     if feature.dtype.startswith("cat["):
-         feature_registry = feature.dtype.replace("cat[", "").replace("]", "")
+     if feature.dtype.startswith("cat["):  # type: ignore
+         feature_registry = feature.dtype.replace("cat[", "").replace("]", "")  # type: ignore
          if value is not None:
              assert isinstance(value, Record)  # noqa: S101
              # the below uses our convention for field names in link models
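The dtype strings parsed here follow the `cat[<Registry>]` convention, so extracting the registry name is plain string stripping (the diff uses `replace`, which is equivalent for these inputs). A tiny self-contained sketch:

```python
def registry_from_dtype(dtype: str) -> str | None:
    # "cat[ULabel]" -> "ULabel"; anything else -> None
    if dtype.startswith("cat[") and dtype.endswith("]"):
        return dtype[len("cat[") : -1]
    return None

assert registry_from_dtype("cat[ULabel]") == "ULabel"
assert registry_from_dtype("float") is None
```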
@@ -1071,12 +1081,12 @@ add_schema(self, schema: Schema, slot: str) -> None:
          "slot": slot,
      }
      link_record = (
-         self._host._schemas_m2m.through.objects.using(host_db)
+         self._host.feature_sets.through.objects.using(host_db)
          .filter(**kwargs)
          .one_or_none()
      )
      if link_record is None:
-         self._host._schemas_m2m.through(**kwargs).save(using=host_db)
+         self._host.feature_sets.through(**kwargs).save(using=host_db)
      if slot in self._schema_by_slot:
          logger.debug(f"replaced existing {slot} feature set")
      self._schema_by_slot_[slot] = schema  # type: ignore
@@ -1101,7 +1111,7 @@ _add_set_from_df(
          mute=mute,
          organism=organism,
      )
-     self._host._staged__schemas_m2m = {"columns": schema}
+     self._host._staged_feature_sets = {"columns": schema}
      self._host.save()
@@ -1120,7 +1130,7 @@ _add_set_from_anndata(
  
      # parse and register features
      adata = self._host.load()
-     _schemas_m2m = parse_staged__schemas_m2m_from_anndata(
+     feature_sets = parse_staged_feature_sets_from_anndata(
          adata,
          var_field=var_field,
          obs_field=obs_field,
@@ -1129,7 +1139,7 @@
      )
  
      # link feature sets
-     self._host._staged__schemas_m2m = _schemas_m2m
+     self._host._staged_feature_sets = feature_sets
      self._host.save()
  
  
@@ -1150,12 +1160,12 @@ _add_set_from_mudata(
  
      # parse and register features
      mdata = self._host.load()
-     _schemas_m2m = {}
-     obs_features = Feature.from_values(mdata.obs.columns)
+     feature_sets = {}
+     obs_features = Feature.from_values(mdata.obs.columns)  # type: ignore
      if len(obs_features) > 0:
-         _schemas_m2m["obs"] = Schema(features=obs_features)
+         feature_sets["obs"] = Schema(features=obs_features)
      for modality, field in var_fields.items():
-         modality_fs = parse_staged__schemas_m2m_from_anndata(
+         modality_fs = parse_staged_feature_sets_from_anndata(
              mdata[modality],
              var_field=field,
              obs_field=obs_fields.get(modality, Feature.name),
@@ -1163,22 +1173,22 @@
              organism=organism,
          )
          for k, v in modality_fs.items():
-             _schemas_m2m[f"['{modality}'].{k}"] = v
+             feature_sets[f"['{modality}'].{k}"] = v
  
-     def unify_staged__schemas_m2m_by_hash(_schemas_m2m):
+     def unify_staged_feature_sets_by_hash(feature_sets):
          unique_values = {}
  
-         for key, value in _schemas_m2m.items():
+         for key, value in feature_sets.items():
              value_hash = value.hash  # Assuming each value has a .hash attribute
              if value_hash in unique_values:
-                 _schemas_m2m[key] = unique_values[value_hash]
+                 feature_sets[key] = unique_values[value_hash]
              else:
                  unique_values[value_hash] = value
  
-         return _schemas_m2m
+         return feature_sets
  
      # link feature sets
-     self._host._staged__schemas_m2m = unify_staged__schemas_m2m_by_hash(_schemas_m2m)
+     self._host._staged_feature_sets = unify_staged_feature_sets_by_hash(feature_sets)
      self._host.save()
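`unify_staged_feature_sets_by_hash` is a generic dedupe-by-content-hash pattern: keep the first object seen per hash and point later slots at it, so identical feature sets across modalities share one record. The same idea in a runnable toy form (`FakeSchema` is a stand-in for `Schema`):

```python
from dataclasses import dataclass

@dataclass
class FakeSchema:
    hash: str  # stand-in for Schema.hash

def unify_by_hash(slots: dict) -> dict:
    seen = {}
    for key, value in slots.items():
        seen.setdefault(value.hash, value)  # first object per hash wins
        slots[key] = seen[value.hash]       # later slots reuse it
    return slots

a, b = FakeSchema("abc"), FakeSchema("abc")
slots = unify_by_hash({"var": a, "['rna'].var": b})
assert slots["var"] is slots["['rna'].var"]  # deduplicated to one object
```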
  
  
@@ -1188,7 +1198,7 @@ _add_from(self, data: Artifact | Collection, transfer_logs: dict = None):
      if transfer_logs is None:
          transfer_logs = {"mapped": [], "transferred": [], "run": None}
      using_key = settings._using_key
-     for slot, schema in data.features._schema_by_slot.items():
+     for slot, schema in data.features._schema_by_slot.items():  # type: ignore
          members = schema.members
          if len(members) == 0:
              continue
@@ -1248,8 +1258,8 @@ make_external(self, feature: Feature) -> None:
      """
      if not isinstance(feature, Feature):
          raise TypeError("feature must be a Feature record!")
-     _schemas_m2m = Schema.filter(features=feature).all()
-     for fs in _schemas_m2m:
+     feature_sets = Schema.filter(features=feature).all()
+     for fs in feature_sets:
          f = Feature.filter(uid=feature.uid).all()
          features_updated = fs.members.difference(f)
          if len(features_updated) > 0:
@@ -1266,10 +1276,10 @@
          if len(features_updated) == 0:
              logger.warning(f"deleting empty feature set: {fs}")
              fs.artifacts.set([])
-             fs._artifacts_m2m.set([])
              fs.delete()
  
  
+ # mypy: ignore-errors
  FeatureManager.__init__ = __init__
  ParamManager.__init__ = __init__
  FeatureManager.__repr__ = __repr__
lamindb/core/_label_manager.py CHANGED
@@ -35,7 +35,7 @@ if TYPE_CHECKING:
      from lamindb._query_set import QuerySet
      from lamindb.models import Artifact, Collection, Record
  
- EXCLUDE_LABELS = {"_schemas_m2m"}
+ EXCLUDE_LABELS = {"feature_sets"}
  
  
  def _get_labels(
@@ -106,7 +106,7 @@ describe_labels(
          pad_edge=False,
      )
      for related_name, labels in labels_data.items():
-         if not labels or related_name == "_schemas_m2m":
+         if not labels or related_name == "feature_sets":
              continue
          if isinstance(labels, dict):  # postgres, labels are a dict[id, name]
              print_values = _format_values(labels.values(), n=10, quotes=False)
@@ -286,12 +286,12 @@ class LabelManager:
              )
              for feature in new_features:
                  transfer_to_default_db(
-                     feature,
+                     feature,  # type: ignore
                      using_key,
                      transfer_logs=transfer_logs,
                      transfer_fk=False,
                  )
-             save(new_features)
+             save(new_features)  # type: ignore
          if hasattr(self._host, related_name):
              for feature_name, feature_labels in labels_by_features.items():
                  if feature_name is not None:
lamindb/core/_mapped_collection.py CHANGED
@@ -27,7 +27,8 @@ if TYPE_CHECKING:
  class _Connect:
      def __init__(self, storage):
          if isinstance(storage, UPath):
-             self.conn, self.store = registry.open("h5py", storage)
+             # force no external compression even for files with .gz extension. REMOVE LATER
+             self.conn, self.store = registry.open("h5py", storage, compression=None)
              self.to_close = True
          else:
              self.conn, self.store = None, storage
@@ -87,7 +88,7 @@ class MappedCollection:
      obs_keys: Keys from the ``.obs`` slots.
      obs_filter: Select only observations with these values for the given obs columns.
          Should be a dictionary with obs column names as keys
-         and filtering values (a string or a tuple of strings) as values.
+         and filtering values (a string or a list of strings) as values.
      join: `"inner"` or `"outer"` virtual joins. If ``None`` is passed,
          does not join.
      encode_labels: Encode labels into integers.
@@ -106,7 +107,7 @@
          layers_keys: str | list[str] | None = None,
          obs_keys: str | list[str] | None = None,
          obsm_keys: str | list[str] | None = None,
-         obs_filter: dict[str, str | tuple[str, ...]] | None = None,
+         obs_filter: dict[str, str | list[str]] | None = None,
          join: Literal["inner", "outer"] | None = "inner",
          encode_labels: bool | list[str] = True,
          unknown_label: str | dict[str, str] | None = None,
@@ -184,9 +185,14 @@
          if self.filtered:
              indices_storage_mask = None
              for obs_filter_key, obs_filter_values in obs_filter.items():
-                 obs_filter_mask = np.isin(
-                     self._get_labels(store, obs_filter_key), obs_filter_values
-                 )
+                 if isinstance(obs_filter_values, tuple):
+                     obs_filter_values = list(obs_filter_values)
+                 elif not isinstance(obs_filter_values, list):
+                     obs_filter_values = [obs_filter_values]
+                 obs_labels = self._get_labels(store, obs_filter_key)
+                 obs_filter_mask = np.isin(obs_labels, obs_filter_values)
+                 if pd.isna(obs_filter_values).any():
+                     obs_filter_mask |= pd.isna(obs_labels)
                  if indices_storage_mask is None:
                      indices_storage_mask = obs_filter_mask
                  else:
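After this change, `obs_filter` values may be a single string, a list, or still a tuple (coerced to a list), and missing values can be matched by including NaN. A hedged usage sketch, assuming a saved collection whose artifacts carry an `obs` column `cell_type`:

```python
import lamindb as ln

collection = ln.Collection.get(key="my-collection")  # hypothetical key
mapped = collection.mapped(
    obs_keys=["cell_type"],
    # single strings and lists both work now; NaN selects unannotated cells
    obs_filter={"cell_type": ["T cell", "B cell", float("nan")]},
)
```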
@@ -241,7 +247,8 @@
              if parallel:
                  conn, storage = None, path
              else:
-                 conn, storage = registry.open("h5py", path)
+                 # force no external compression even for files with .gz extension. REMOVE LATER
+                 conn, storage = registry.open("h5py", path, compression=None)
          else:
              conn, storage = registry.open("zarr", path)
          self.conns.append(conn)
@@ -296,7 +303,7 @@
          self.var_joint = reduce(pd.Index.intersection, self.var_list)
          if len(self.var_joint) == 0:
              raise ValueError(
-                 "The provided AnnData objects don't have shared varibales.\n"
+                 "The provided AnnData objects don't have shared variables.\n"
                  "Use join='outer'."
              )
          self.var_indices = [
@@ -389,7 +396,7 @@
              else:
                  cats = None
              label_idx = self._get_obs_idx(store, obs_idx, label, cats)
-             if label in self.encoders:
+             if label in self.encoders and label_idx is not np.nan:
                  label_idx = self.encoders[label][label_idx]
              out[label] = label_idx
          return out
@@ -453,6 +460,8 @@
              label = labels[idx]
          else:
              label = labels["codes"][idx]
+             if label == -1:
+                 return np.nan
          if categories is not None:
              cats = categories
          else:
@@ -589,7 +598,13 @@
          cats = self._get_categories(storage, label_key)
          if cats is not None:
              cats = _decode(cats) if isinstance(cats[0], bytes) else cats
+             # NaN is coded as -1
+             nans = labels == -1
              labels = cats[labels]
+             # detect and replace nans
+             if nans.any():
+                 labels[nans] = np.nan
+ 
          return labels
  
      def close(self):
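Both NaN hunks follow the pandas/HDF5 convention that categorical codes store missing values as `-1`, which would otherwise silently index the last category. A minimal NumPy sketch of the corrected decode step:

```python
import numpy as np

codes = np.array([0, 2, -1, 1])  # -1 encodes a missing value
cats = np.array(["B cell", "T cell", "NK cell"], dtype=object)

nans = codes == -1     # remember missing positions before indexing
labels = cats[codes]   # -1 would wrongly pick the last category here...
labels[nans] = np.nan  # ...so overwrite those positions with NaN
print(labels)          # ['B cell' 'NK cell' nan 'T cell']
```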
lamindb/core/_track_environment.py CHANGED
@@ -1,6 +1,7 @@
  from __future__ import annotations
  
  import subprocess
+ import sys
  from typing import TYPE_CHECKING
  
  import lamindb_setup as ln_setup
@@ -17,7 +18,7 @@ def track_environment(run: Run) -> None:
      try:
          with open(filepath, "w") as f:
              result = subprocess.run(
-                 ["pip", "freeze"],
+                 [sys.executable, "-m", "pip", "freeze"],
                  stdout=f,
              )
      except OSError as e:
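Invoking pip as `sys.executable -m pip` pins the freeze to the interpreter actually running lamindb instead of whichever `pip` shadows it on `PATH`, which matters in multi-environment setups. The pattern in isolation:

```python
import subprocess
import sys

# capture the active interpreter's packages, not those of a stray PATH pip
with open("run_env_pip.txt", "w") as f:
    subprocess.run([sys.executable, "-m", "pip", "freeze"], stdout=f, check=False)
```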
lamindb/core/datasets/__init__.py CHANGED
@@ -85,4 +85,9 @@ from ._core import (
      schmidt22_perturbseq,
  )
  from ._fake import fake_bio_notebook_titles
- from ._small import anndata_with_obs, small_dataset1, small_dataset2
+ from ._small import (
+     anndata_with_obs,
+     small_dataset1,
+     small_dataset2,
+     small_dataset3_cellxgene,
+ )
lamindb/core/datasets/_core.py CHANGED
@@ -18,7 +18,8 @@ if TYPE_CHECKING:
  def file_fcs() -> Path:
      """Example FCS artifact."""
      filepath, _ = urlretrieve(
-         "https://lamindb-test.s3.amazonaws.com/example.fcs", "example.fcs"
+         "https://lamindb-dev-datasets.s3.amazonaws.com/.lamindb/DBNEczSgBui0bbzBXMGH.fcs",
+         "example.fcs",
      )
      return Path(filepath)
  
@@ -48,8 +49,8 @@ def file_fcs_alpert19(populate_registries: bool = False) -> Path:  # pragma: no
              bt.CellMarker.public().inspect(std, "name").validated, "name"
          )
      )
-     ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
-     ln.Feature(name="organism", dtype=[bt.Organism]).save()
+     ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()  # type: ignore
+     ln.Feature(name="organism", dtype=[bt.Organism]).save()  # type: ignore
      ln.settings.verbosity = verbosity
      return Path(filepath)
  
@@ -84,8 +85,8 @@ def file_tsv_rnaseq_nfcore_salmon_merged_gene_counts(
  
      verbosity = ln.settings.verbosity
      ln.settings.verbosity = "error"
-     ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
-     ln.Feature(name="organism", dtype=[bt.Organism]).save()
+     ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()  # type: ignore
+     ln.Feature(name="organism", dtype=[bt.Organism]).save()  # type: ignore
      bt.ExperimentalFactor.from_source(ontology_id="EFO:0008896").save()
      ln.settings.verbosity = verbosity
  
@@ -207,7 +208,7 @@ def anndata_mouse_sc_lymph_node(
      # cell types
      ln.save(bt.CellType.from_values(["CL:0000115", "CL:0000738"], "ontology_id"))
      # assays
-     ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
+     ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()  # type: ignore
      bt.ExperimentalFactor.from_source(ontology_id="EFO:0008913").save()
      # genes
      validated = bt.Gene.public(organism="mouse").validate(
@@ -330,11 +331,11 @@ def anndata_human_immune_cells(
      ln.save(bt.CellType.from_values(adata.obs.cell_type, field="name"))
      ln.save(bt.ExperimentalFactor.from_values(adata.obs.assay, field="name"))
      ln.save(bt.Tissue.from_values(adata.obs.tissue, field="name"))
-     ln.Feature(name="cell_type", dtype=[bt.CellType]).save()
-     ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
-     ln.Feature(name="tissue", dtype=[bt.Tissue]).save()
-     ln.Feature(name="organism", dtype=[bt.Organism]).save()
-     ln.Feature(name="donor", dtype=[ln.ULabel]).save()
+     ln.Feature(name="cell_type", dtype=[bt.CellType]).save()  # type: ignore
+     ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()  # type: ignore
+     ln.Feature(name="tissue", dtype=[bt.Tissue]).save()  # type: ignore
+     ln.Feature(name="organism", dtype=[bt.Organism]).save()  # type: ignore
+     ln.Feature(name="donor", dtype=[ln.ULabel]).save()  # type: ignore
      bt.ExperimentalFactor.from_source(ontology_id="EFO:0008913").save()
      ln.save([ln.ULabel(name=name) for name in adata.obs.donor.unique()])
      ln.settings.verbosity = verbosity