lamindb 1.0.5__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. lamindb/__init__.py +17 -6
  2. lamindb/_artifact.py +202 -87
  3. lamindb/_can_curate.py +27 -8
  4. lamindb/_collection.py +86 -52
  5. lamindb/_feature.py +177 -41
  6. lamindb/_finish.py +21 -7
  7. lamindb/_from_values.py +83 -98
  8. lamindb/_parents.py +4 -4
  9. lamindb/_query_set.py +78 -18
  10. lamindb/_record.py +170 -53
  11. lamindb/_run.py +4 -4
  12. lamindb/_save.py +42 -11
  13. lamindb/_schema.py +135 -38
  14. lamindb/_storage.py +1 -1
  15. lamindb/_tracked.py +129 -0
  16. lamindb/_transform.py +21 -8
  17. lamindb/_ulabel.py +5 -14
  18. lamindb/base/users.py +1 -4
  19. lamindb/base/validation.py +2 -6
  20. lamindb/core/__init__.py +13 -14
  21. lamindb/core/_context.py +14 -9
  22. lamindb/core/_data.py +29 -25
  23. lamindb/core/_describe.py +1 -1
  24. lamindb/core/_django.py +1 -1
  25. lamindb/core/_feature_manager.py +53 -43
  26. lamindb/core/_label_manager.py +4 -4
  27. lamindb/core/_mapped_collection.py +24 -9
  28. lamindb/core/_track_environment.py +2 -1
  29. lamindb/core/datasets/__init__.py +6 -1
  30. lamindb/core/datasets/_core.py +12 -11
  31. lamindb/core/datasets/_small.py +67 -21
  32. lamindb/core/exceptions.py +1 -90
  33. lamindb/core/loaders.py +21 -15
  34. lamindb/core/relations.py +6 -4
  35. lamindb/core/storage/_anndata_accessor.py +49 -3
  36. lamindb/core/storage/_backed_access.py +12 -7
  37. lamindb/core/storage/_pyarrow_dataset.py +40 -15
  38. lamindb/core/storage/_tiledbsoma.py +56 -12
  39. lamindb/core/storage/paths.py +30 -24
  40. lamindb/core/subsettings/_creation_settings.py +4 -16
  41. lamindb/curators/__init__.py +2193 -846
  42. lamindb/curators/_cellxgene_schemas/__init__.py +26 -0
  43. lamindb/curators/_cellxgene_schemas/schema_versions.yml +104 -0
  44. lamindb/errors.py +96 -0
  45. lamindb/integrations/_vitessce.py +3 -3
  46. lamindb/migrations/0069_squashed.py +76 -75
  47. lamindb/migrations/0075_lamindbv1_part5.py +4 -5
  48. lamindb/migrations/0082_alter_feature_dtype.py +21 -0
  49. lamindb/migrations/0083_alter_feature_is_type_alter_flextable_is_type_and_more.py +94 -0
  50. lamindb/migrations/0084_alter_schemafeature_feature_and_more.py +35 -0
  51. lamindb/migrations/0085_alter_feature_is_type_alter_flextable_is_type_and_more.py +63 -0
  52. lamindb/migrations/0086_various.py +95 -0
  53. lamindb/migrations/0087_rename__schemas_m2m_artifact_feature_sets_and_more.py +41 -0
  54. lamindb/migrations/0088_schema_components.py +273 -0
  55. lamindb/migrations/0088_squashed.py +4372 -0
  56. lamindb/models.py +475 -168
  57. {lamindb-1.0.5.dist-info → lamindb-1.1.1.dist-info}/METADATA +9 -7
  58. lamindb-1.1.1.dist-info/RECORD +95 -0
  59. lamindb/curators/_spatial.py +0 -528
  60. lamindb/migrations/0052_squashed.py +0 -1261
  61. lamindb/migrations/0053_alter_featureset_hash_alter_paramvalue_created_by_and_more.py +0 -57
  62. lamindb/migrations/0054_alter_feature_previous_runs_and_more.py +0 -35
  63. lamindb/migrations/0055_artifact_type_artifactparamvalue_and_more.py +0 -61
  64. lamindb/migrations/0056_rename_ulabel_ref_is_name_artifactulabel_label_ref_is_name_and_more.py +0 -22
  65. lamindb/migrations/0057_link_models_latest_report_and_others.py +0 -356
  66. lamindb/migrations/0058_artifact__actions_collection__actions.py +0 -22
  67. lamindb/migrations/0059_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -31
  68. lamindb/migrations/0060_alter_artifact__actions.py +0 -22
  69. lamindb/migrations/0061_alter_collection_meta_artifact_alter_run_environment_and_more.py +0 -45
  70. lamindb/migrations/0062_add_is_latest_field.py +0 -32
  71. lamindb/migrations/0063_populate_latest_field.py +0 -45
  72. lamindb/migrations/0064_alter_artifact_version_alter_collection_version_and_more.py +0 -33
  73. lamindb/migrations/0065_remove_collection_feature_sets_and_more.py +0 -22
  74. lamindb/migrations/0066_alter_artifact__feature_values_and_more.py +0 -352
  75. lamindb/migrations/0067_alter_featurevalue_unique_together_and_more.py +0 -20
  76. lamindb/migrations/0068_alter_artifactulabel_unique_together_and_more.py +0 -20
  77. lamindb/migrations/0069_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -1294
  78. lamindb-1.0.5.dist-info/RECORD +0 -102
  79. {lamindb-1.0.5.dist-info → lamindb-1.1.1.dist-info}/LICENSE +0 -0
  80. {lamindb-1.0.5.dist-info → lamindb-1.1.1.dist-info}/WHEEL +0 -0
lamindb/core/_data.py CHANGED
@@ -21,6 +21,8 @@ from lamindb.models import (
      record_repr,
  )
  
+ from .._tracked import get_current_tracked_run
+ from ..errors import ValidationError
  from ._context import context
  from ._django import get_artifact_with_related, get_related_model
  from ._feature_manager import (
@@ -28,7 +30,6 @@ from ._feature_manager import (
      get_host_id_field,
      get_label_links,
  )
- from .exceptions import ValidationError
  from .relations import (
      dict_module_name_to_model_name,
      dict_related_model_to_related_name,
@@ -45,9 +46,12 @@ WARNING_RUN_TRANSFORM = "no run & transform got linked, call `ln.track()` & re-r
  WARNING_NO_INPUT = "run input wasn't tracked, call `ln.track()` and re-run"
  
  
+ # also see current_run() in core._data
  def get_run(run: Run | None) -> Run | None:
      if run is None:
-         run = context.run
+         run = get_current_tracked_run()
+         if run is None:
+             run = context.run
      if run is None and not settings.creation.artifact_silence_missing_run_warning:
          logger.warning(WARNING_RUN_TRANSFORM)
      # suppress run by passing False
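The behavioral change in this hunk: a run opened by the new `@ln.tracked()` decorator (introduced in this release via `lamindb/_tracked.py`) now takes precedence over the global `ln.track()` context when no run is passed explicitly. A minimal sketch of that resolution order, with the two lookups stubbed out as plain arguments rather than the real `get_current_tracked_run()` and `context.run`:

```python
# hedged sketch of the new precedence order in get_run()
def resolve_run(run=None, tracked_run=None, context_run=None):
    if run is None:
        run = tracked_run  # a run opened by @ln.tracked() wins first
        if run is None:
            run = context_run  # then fall back to the global ln.track() context
    return run

assert resolve_run(tracked_run="decorated", context_run="global") == "decorated"
assert resolve_run(context_run="global") == "global"
assert resolve_run(run="explicit", tracked_run="decorated") == "explicit"
```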
@@ -56,26 +60,26 @@ def get_run(run: Run | None) -> Run | None:
      return run
  
  
- def save_staged__schemas_m2m(self: Artifact | Collection) -> None:
-     if hasattr(self, "_staged__schemas_m2m"):
+ def save_staged_feature_sets(self: Artifact | Collection) -> None:
+     if hasattr(self, "_staged_feature_sets"):
          from lamindb.core._feature_manager import get_schema_by_slot_
  
-         existing_staged__schemas_m2m = get_schema_by_slot_(self)
-         saved_staged__schemas_m2m = {}
-         for key, schema in self._staged__schemas_m2m.items():
+         existing_staged_feature_sets = get_schema_by_slot_(self)
+         saved_staged_feature_sets = {}
+         for key, schema in self._staged_feature_sets.items():
              if isinstance(schema, Schema) and schema._state.adding:
                  schema.save()
-                 saved_staged__schemas_m2m[key] = schema
-             if key in existing_staged__schemas_m2m:
+                 saved_staged_feature_sets[key] = schema
+             if key in existing_staged_feature_sets:
                  # remove existing feature set on the same slot
-                 self._schemas_m2m.remove(existing_staged__schemas_m2m[key])
-         if len(saved_staged__schemas_m2m) > 0:
-             s = "s" if len(saved_staged__schemas_m2m) > 1 else ""
+                 self.feature_sets.remove(existing_staged_feature_sets[key])
+         if len(saved_staged_feature_sets) > 0:
+             s = "s" if len(saved_staged_feature_sets) > 1 else ""
              display_schema_keys = ",".join(
-                 f"'{key}'" for key in saved_staged__schemas_m2m.keys()
+                 f"'{key}'" for key in saved_staged_feature_sets.keys()
              )
              logger.save(
-                 f"saved {len(saved_staged__schemas_m2m)} feature set{s} for slot{s}:"
+                 f"saved {len(saved_staged_feature_sets)} feature set{s} for slot{s}:"
                  f" {display_schema_keys}"
              )
@@ -84,16 +88,16 @@ def save_schema_links(self: Artifact | Collection) -> None:
      from lamindb._save import bulk_create
  
      Data = self.__class__
-     if hasattr(self, "_staged__schemas_m2m"):
+     if hasattr(self, "_staged_feature_sets"):
          links = []
          host_id_field = get_host_id_field(self)
-         for slot, schema in self._staged__schemas_m2m.items():
+         for slot, schema in self._staged_feature_sets.items():
              kwargs = {
                  host_id_field: self.id,
                  "schema_id": schema.id,
                  "slot": slot,
              }
-             links.append(Data._schemas_m2m.through(**kwargs))
+             links.append(Data.feature_sets.through(**kwargs))
          bulk_create(links, ignore_conflicts=True)
@@ -182,7 +186,7 @@ def _describe_sqlite(self: Artifact | Collection, print_types: bool = False):
      if isinstance(self, (Collection, Artifact)):
          many_to_many_fields.append("input_of_runs")
      if isinstance(self, Artifact):
-         many_to_many_fields.append("_schemas_m2m")
+         many_to_many_fields.append("feature_sets")
      self = (
          self.__class__.objects.using(self._state.db)
          .prefetch_related(*many_to_many_fields)
@@ -335,10 +339,10 @@ add_labels(
      else:
          validate_feature(feature, records)  # type:ignore
          records_by_registry = defaultdict(list)
-         _schemas_m2m = self._schemas_m2m.filter(itype="Feature").all()
+         feature_sets = self.feature_sets.filter(itype="Feature").all()
          internal_features = set()  # type: ignore
-         if len(_schemas_m2m) > 0:
-             for schema in _schemas_m2m:
+         if len(feature_sets) > 0:
+             for schema in feature_sets:
                  internal_features = internal_features.union(
                      set(schema.members.values_list("name", flat=True))
                  )  # type: ignore
@@ -357,7 +361,7 @@ add_labels(
                  f"Feature {feature.name} needs dtype='cat' for label annotation, currently has dtype='{feature.dtype}'"
              )
          if feature.dtype == "cat":
-             feature.dtype = f"cat[{registry_name}]"
+             feature.dtype = f"cat[{registry_name}]"  # type: ignore
              feature.save()
          elif registry_name not in feature.dtype:
              new_dtype = feature.dtype.rstrip("]") + f"|{registry_name}]"
@@ -386,13 +390,13 @@ _track_run_input(
      is_run_input: bool | Run | None = None,
      run: Run | None = None,
  ):
-     # this is an internal hack right now for project-flow, but we can allow this
-     # for the user in the future
      if isinstance(is_run_input, Run):
          run = is_run_input
          is_run_input = True
      elif run is None:
-         run = context.run
+         run = get_current_tracked_run()
+         if run is None:
+             run = context.run
      # consider that data is an iterable of Data
      data_iter: Iterable[Artifact] | Iterable[Collection] = (
          [data] if isinstance(data, (Artifact, Collection)) else data
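`_track_run_input` resolves the run the same way, and with the internal-hack comment removed, passing a `Run` through `is_run_input` is no longer flagged as internal. A hedged usage sketch (the artifact key and run lookup are illustrative, not from this diff):

```python
import lamindb as ln

artifact = ln.Artifact.get(key="datasets/example.parquet")  # hypothetical key
run = ln.Run.filter().first()  # any existing run, for illustration

# passing a Run instead of a bool links the artifact as an input of that run
df = artifact.load(is_run_input=run)
```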
lamindb/core/_describe.py CHANGED
@@ -76,7 +76,7 @@ def describe_header(self: Artifact | Collection | Run) -> Tree:
      if self._branch_code == 0:
          logger.warning("This artifact is hidden.")
      elif self._branch_code == -1:
-         logger.warning("This artifact is the trash.")
+         logger.warning("This artifact is in the trash.")
      # initialize tree
      suffix = self.suffix if hasattr(self, "suffix") and self.suffix else ""
      accessor = self.otype if hasattr(self, "otype") and self.otype else ""
lamindb/core/_django.py CHANGED
@@ -105,7 +105,7 @@ def get_artifact_with_related(
  
      if include_schema:
          annotations["schemas"] = Subquery(
-             model._schemas_m2m.through.objects.filter(artifact=OuterRef("pk"))
+             model.feature_sets.through.objects.filter(artifact=OuterRef("pk"))
              .annotate(
                  data=JSONObject(
                      id=F("id"),
lamindb/core/_feature_manager.py CHANGED
@@ -33,8 +33,8 @@ from lamindb._record import (
  )
  from lamindb._save import save
  from lamindb._schema import DICT_KEYS_TYPE, Schema
- from lamindb.core.exceptions import DoesNotExist, ValidationError
  from lamindb.core.storage import LocalPathClasses
+ from lamindb.errors import DoesNotExist, ValidationError
  from lamindb.models import (
      Artifact,
      Collection,
@@ -96,8 +96,8 @@ def get_schema_by_slot_(host: Artifact | Collection) -> dict:
          return {}
      # if the host is not yet saved
      if host._state.adding:
-         if hasattr(host, "_staged__schemas_m2m"):
-             return host._staged__schemas_m2m
+         if hasattr(host, "_staged_feature_sets"):
+             return host._staged_feature_sets
          else:
              return {}
      host_db = host._state.db
@@ -105,7 +105,7 @@ def get_schema_by_slot_(host: Artifact | Collection) -> dict:
      kwargs = {host_id_field: host.id}
      # otherwise, we need a query
      links_schema = (
-         host._schemas_m2m.through.objects.using(host_db)
+         host.feature_sets.through.objects.using(host_db)
          .filter(**kwargs)
          .select_related("schema")
      )
@@ -118,7 +118,7 @@ def get_label_links(
      host_id_field = get_host_id_field(host)
      kwargs = {host_id_field: host.id, "feature_id": feature.id}
      link_records = (
-         getattr(host, host.features._accessor_by_registry[registry])
+         getattr(host, host.features._accessor_by_registry[registry])  # type: ignore
          .through.objects.using(host._state.db)
          .filter(**kwargs)
      )
@@ -128,14 +128,14 @@
  def get_schema_links(host: Artifact | Collection) -> QuerySet:
      host_id_field = get_host_id_field(host)
      kwargs = {host_id_field: host.id}
-     links_schema = host._schemas_m2m.through.objects.filter(**kwargs)
+     links_schema = host.feature_sets.through.objects.filter(**kwargs)
      return links_schema
  
  
  def get_link_attr(link: LinkORM | type[LinkORM], data: Artifact | Collection) -> str:
      link_model_name = link.__class__.__name__
      if link_model_name in {"Registry", "ModelBase"}:  # we passed the type of the link
-         link_model_name = link.__name__
+         link_model_name = link.__name__  # type: ignore
      return link_model_name.replace(data.__class__.__name__, "").lower()
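`get_link_attr` leans on a naming convention: a link model is named `<HostClass><Label>`, so stripping the host class name and lowercasing what remains yields the field name on the link record. The convention in isolation (class names illustrative):

```python
def link_attr(link_model_name: str, host_class_name: str) -> str:
    # e.g. link model "ArtifactULabel" on an Artifact host -> field "ulabel"
    return link_model_name.replace(host_class_name, "").lower()

assert link_attr("ArtifactULabel", "Artifact") == "ulabel"
assert link_attr("CollectionArtifact", "Collection") == "artifact"
```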
  
  
@@ -348,10 +348,10 @@ describe_features(
  
      internal_feature_names: dict[str, str] = {}
      if isinstance(self, Artifact):
-         _schemas_m2m = self._schemas_m2m.filter(itype="Feature").all()
+         feature_sets = self.feature_sets.filter(itype="Feature").all()
          internal_feature_names = {}
-         if len(_schemas_m2m) > 0:
-             for schema in _schemas_m2m:
+         if len(feature_sets) > 0:
+             for schema in feature_sets:
                  internal_feature_names.update(
                      dict(schema.members.values_list("name", "dtype"))
                  )
@@ -500,7 +500,7 @@ describe_features(
      return tree
  
  
- def parse_staged__schemas_m2m_from_anndata(
+ def parse_staged_feature_sets_from_anndata(
      adata: AnnData,
      var_field: FieldAttr | None = None,
      obs_field: FieldAttr = Feature.name,
@@ -524,7 +524,7 @@
          if adata.X is None
          else convert_pandas_dtype_to_lamin_dtype(adata.X.dtype)
      )
-     _schemas_m2m = {}
+     feature_sets = {}
      if var_field is not None:
          logger.info("parsing feature names of X stored in slot 'var'")
          logger.indent = " "
@@ -537,7 +537,7 @@
              raise_validation_error=False,
          )
          if schema_var is not None:
-             _schemas_m2m["var"] = schema_var
+             feature_sets["var"] = schema_var
              logger.save(f"linked: {schema_var}")
          logger.indent = ""
          if schema_var is None:
@@ -552,12 +552,12 @@
              organism=organism,
          )
          if schema_obs is not None:
-             _schemas_m2m["obs"] = schema_obs
+             feature_sets["obs"] = schema_obs
              logger.save(f"linked: {schema_obs}")
          logger.indent = ""
          if schema_obs is None:
              logger.warning("skip linking features to artifact in slot 'obs'")
-     return _schemas_m2m
+     return feature_sets
  
  
  def is_valid_datetime_str(date_string: str) -> bool | str:
@@ -818,6 +818,8 @@ def _add_values(
          feature_param_field: The field of a reference registry to map keys of the
              dictionary.
      """
+     from .._tracked import get_current_tracked_run
+ 
      # rename to distinguish from the values inside the dict
      features_values = values
      keys = features_values.keys()
@@ -849,12 +851,20 @@
              (key, infer_feature_type_convert_json(key, features_values[key]))
              for key in not_validated_keys
          ]
-         hint = "\n".join(
-             [
-                 f" ln.{model_name}(name='{key}', dtype='{dtype}').save(){message}"
-                 for key, (dtype, _, message) in not_validated_keys_dtype_message
-             ]
-         )
+         run = get_current_tracked_run()
+         if run is not None:
+             name = f"{run.transform.type}[{run.transform.key}]"
+             type_hint = f""" {model_name.lower()}_type = ln.{model_name}(name='{name}', is_type=True).save()"""
+             elements = [type_hint]
+             type_kwarg = f", type={model_name.lower()}_type"
+         else:
+             elements = []
+             type_kwarg = ""
+         elements += [
+             f" ln.{model_name}(name='{key}', dtype='{dtype}'{type_kwarg}).save(){message}"
+             for key, (dtype, _, message) in not_validated_keys_dtype_message
+         ]
+         hint = "\n".join(elements)
          msg = (
              f"These keys could not be validated: {not_validated_keys.tolist()}\n"
              f"Here is how to create a {model_name.lower()}:\n\n{hint}"
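The user-visible effect of this hunk is in the validation-error hint: when a tracked run is active, the suggested fix now first creates a type record named after the transform and passes it via `type=`. Reconstructed from the f-strings above, with illustrative values:

```python
# what the emitted hint roughly looks like inside a tracked script run
feature_type = ln.Feature(name='script[analysis.py]', is_type=True).save()
ln.Feature(name='temperature', dtype='float', type=feature_type).save()
```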
@@ -928,7 +938,7 @@
              validated_values = values_array[validated]
              if validated.sum() != len(values):
                  not_validated_values += values_array[~validated].tolist()
-             label_records = ULabel.from_values(validated_values, field="name")
+             label_records = ULabel.from_values(validated_values, field="name")  # type: ignore
              features_labels["ULabel"] += [
                  (feature, label_record) for label_record in label_records
              ]
@@ -1012,8 +1022,8 @@ remove_values(
      if isinstance(feature, str):
          feature = Feature.get(name=feature)
      filter_kwargs = {"feature": feature}
-     if feature.dtype.startswith("cat["):
-         feature_registry = feature.dtype.replace("cat[", "").replace("]", "")
+     if feature.dtype.startswith("cat["):  # type: ignore
+         feature_registry = feature.dtype.replace("cat[", "").replace("]", "")  # type: ignore
          if value is not None:
              assert isinstance(value, Record)  # noqa: S101
              # the below uses our convention for field names in link models
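The dtype strings parsed here follow the `cat[<Registry>]` convention, so extracting the registry name is plain string stripping (the diff uses `replace`, which is equivalent for these inputs). A tiny self-contained sketch:

```python
def registry_from_dtype(dtype: str) -> str | None:
    # "cat[ULabel]" -> "ULabel"; anything else -> None
    if dtype.startswith("cat[") and dtype.endswith("]"):
        return dtype[len("cat[") : -1]
    return None

assert registry_from_dtype("cat[ULabel]") == "ULabel"
assert registry_from_dtype("float") is None
```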
@@ -1071,12 +1081,12 @@ add_schema(self, schema: Schema, slot: str) -> None:
          "slot": slot,
      }
      link_record = (
-         self._host._schemas_m2m.through.objects.using(host_db)
+         self._host.feature_sets.through.objects.using(host_db)
          .filter(**kwargs)
          .one_or_none()
      )
      if link_record is None:
-         self._host._schemas_m2m.through(**kwargs).save(using=host_db)
+         self._host.feature_sets.through(**kwargs).save(using=host_db)
      if slot in self._schema_by_slot:
          logger.debug(f"replaced existing {slot} feature set")
      self._schema_by_slot_[slot] = schema  # type: ignore
@@ -1101,7 +1111,7 @@ _add_set_from_df(
          mute=mute,
          organism=organism,
      )
-     self._host._staged__schemas_m2m = {"columns": schema}
+     self._host._staged_feature_sets = {"columns": schema}
      self._host.save()
@@ -1120,7 +1130,7 @@ _add_set_from_anndata(
  
      # parse and register features
      adata = self._host.load()
-     _schemas_m2m = parse_staged__schemas_m2m_from_anndata(
+     feature_sets = parse_staged_feature_sets_from_anndata(
          adata,
          var_field=var_field,
          obs_field=obs_field,
@@ -1129,7 +1139,7 @@
      )
  
      # link feature sets
-     self._host._staged__schemas_m2m = _schemas_m2m
+     self._host._staged_feature_sets = feature_sets
      self._host.save()
  
  
@@ -1150,12 +1160,12 @@ _add_set_from_mudata(
  
      # parse and register features
      mdata = self._host.load()
-     _schemas_m2m = {}
-     obs_features = Feature.from_values(mdata.obs.columns)
+     feature_sets = {}
+     obs_features = Feature.from_values(mdata.obs.columns)  # type: ignore
      if len(obs_features) > 0:
-         _schemas_m2m["obs"] = Schema(features=obs_features)
+         feature_sets["obs"] = Schema(features=obs_features)
      for modality, field in var_fields.items():
-         modality_fs = parse_staged__schemas_m2m_from_anndata(
+         modality_fs = parse_staged_feature_sets_from_anndata(
              mdata[modality],
              var_field=field,
              obs_field=obs_fields.get(modality, Feature.name),
@@ -1163,22 +1173,22 @@
              organism=organism,
          )
          for k, v in modality_fs.items():
-             _schemas_m2m[f"['{modality}'].{k}"] = v
+             feature_sets[f"['{modality}'].{k}"] = v
  
-     def unify_staged__schemas_m2m_by_hash(_schemas_m2m):
+     def unify_staged_feature_sets_by_hash(feature_sets):
          unique_values = {}
  
-         for key, value in _schemas_m2m.items():
+         for key, value in feature_sets.items():
              value_hash = value.hash  # Assuming each value has a .hash attribute
              if value_hash in unique_values:
-                 _schemas_m2m[key] = unique_values[value_hash]
+                 feature_sets[key] = unique_values[value_hash]
              else:
                  unique_values[value_hash] = value
  
-         return _schemas_m2m
+         return feature_sets
  
      # link feature sets
-     self._host._staged__schemas_m2m = unify_staged__schemas_m2m_by_hash(_schemas_m2m)
+     self._host._staged_feature_sets = unify_staged_feature_sets_by_hash(feature_sets)
      self._host.save()
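`unify_staged_feature_sets_by_hash` is a generic dedupe-by-content-hash pattern: keep the first object seen per hash and point later slots at it, so identical feature sets across modalities share one record. The same idea in a runnable toy form (`FakeSchema` is a stand-in for `Schema`):

```python
from dataclasses import dataclass

@dataclass
class FakeSchema:
    hash: str  # stand-in for Schema.hash

def unify_by_hash(slots: dict) -> dict:
    seen = {}
    for key, value in slots.items():
        seen.setdefault(value.hash, value)  # first object per hash wins
        slots[key] = seen[value.hash]       # later slots reuse it
    return slots

a, b = FakeSchema("abc"), FakeSchema("abc")
slots = unify_by_hash({"var": a, "['rna'].var": b})
assert slots["var"] is slots["['rna'].var"]  # deduplicated to one object
```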
  
  
@@ -1188,7 +1198,7 @@ _add_from(self, data: Artifact | Collection, transfer_logs: dict = None):
      if transfer_logs is None:
          transfer_logs = {"mapped": [], "transferred": [], "run": None}
      using_key = settings._using_key
-     for slot, schema in data.features._schema_by_slot.items():
+     for slot, schema in data.features._schema_by_slot.items():  # type: ignore
          members = schema.members
          if len(members) == 0:
              continue
@@ -1248,8 +1258,8 @@ make_external(self, feature: Feature) -> None:
      """
      if not isinstance(feature, Feature):
          raise TypeError("feature must be a Feature record!")
-     _schemas_m2m = Schema.filter(features=feature).all()
-     for fs in _schemas_m2m:
+     feature_sets = Schema.filter(features=feature).all()
+     for fs in feature_sets:
          f = Feature.filter(uid=feature.uid).all()
          features_updated = fs.members.difference(f)
          if len(features_updated) > 0:
@@ -1266,10 +1276,10 @@
          if len(features_updated) == 0:
              logger.warning(f"deleting empty feature set: {fs}")
              fs.artifacts.set([])
-             fs._artifacts_m2m.set([])
              fs.delete()
  
  
+ # mypy: ignore-errors
  FeatureManager.__init__ = __init__
  ParamManager.__init__ = __init__
  FeatureManager.__repr__ = __repr__
lamindb/core/_label_manager.py CHANGED
@@ -35,7 +35,7 @@ if TYPE_CHECKING:
      from lamindb._query_set import QuerySet
      from lamindb.models import Artifact, Collection, Record
  
- EXCLUDE_LABELS = {"_schemas_m2m"}
+ EXCLUDE_LABELS = {"feature_sets"}
  
  
  def _get_labels(
@@ -106,7 +106,7 @@ describe_labels(
          pad_edge=False,
      )
      for related_name, labels in labels_data.items():
-         if not labels or related_name == "_schemas_m2m":
+         if not labels or related_name == "feature_sets":
              continue
          if isinstance(labels, dict):  # postgres, labels are a dict[id, name]
              print_values = _format_values(labels.values(), n=10, quotes=False)
@@ -286,12 +286,12 @@ class LabelManager:
              )
              for feature in new_features:
                  transfer_to_default_db(
-                     feature,
+                     feature,  # type: ignore
                      using_key,
                      transfer_logs=transfer_logs,
                      transfer_fk=False,
                  )
-             save(new_features)
+             save(new_features)  # type: ignore
          if hasattr(self._host, related_name):
              for feature_name, feature_labels in labels_by_features.items():
                  if feature_name is not None:
lamindb/core/_mapped_collection.py CHANGED
@@ -27,7 +27,8 @@ if TYPE_CHECKING:
  class _Connect:
      def __init__(self, storage):
          if isinstance(storage, UPath):
-             self.conn, self.store = registry.open("h5py", storage)
+             # force no external compression even for files with .gz extension. REMOVE LATER
+             self.conn, self.store = registry.open("h5py", storage, compression=None)
              self.to_close = True
          else:
              self.conn, self.store = None, storage
@@ -87,7 +88,7 @@ class MappedCollection:
      obs_keys: Keys from the ``.obs`` slots.
      obs_filter: Select only observations with these values for the given obs columns.
          Should be a dictionary with obs column names as keys
-         and filtering values (a string or a tuple of strings) as values.
+         and filtering values (a string or a list of strings) as values.
      join: `"inner"` or `"outer"` virtual joins. If ``None`` is passed,
          does not join.
      encode_labels: Encode labels into integers.
@@ -106,7 +107,7 @@
          layers_keys: str | list[str] | None = None,
          obs_keys: str | list[str] | None = None,
          obsm_keys: str | list[str] | None = None,
-         obs_filter: dict[str, str | tuple[str, ...]] | None = None,
+         obs_filter: dict[str, str | list[str]] | None = None,
          join: Literal["inner", "outer"] | None = "inner",
          encode_labels: bool | list[str] = True,
          unknown_label: str | dict[str, str] | None = None,
@@ -184,9 +185,14 @@
          if self.filtered:
              indices_storage_mask = None
              for obs_filter_key, obs_filter_values in obs_filter.items():
-                 obs_filter_mask = np.isin(
-                     self._get_labels(store, obs_filter_key), obs_filter_values
-                 )
+                 if isinstance(obs_filter_values, tuple):
+                     obs_filter_values = list(obs_filter_values)
+                 elif not isinstance(obs_filter_values, list):
+                     obs_filter_values = [obs_filter_values]
+                 obs_labels = self._get_labels(store, obs_filter_key)
+                 obs_filter_mask = np.isin(obs_labels, obs_filter_values)
+                 if pd.isna(obs_filter_values).any():
+                     obs_filter_mask |= pd.isna(obs_labels)
                  if indices_storage_mask is None:
                      indices_storage_mask = obs_filter_mask
                  else:
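After this change, `obs_filter` values may be a single string, a list, or still a tuple (coerced to a list), and missing values can be matched by including NaN. A hedged usage sketch, assuming a saved collection whose artifacts carry an `obs` column `cell_type`:

```python
import lamindb as ln

collection = ln.Collection.get(key="my-collection")  # hypothetical key
mapped = collection.mapped(
    obs_keys=["cell_type"],
    # single strings and lists both work now; NaN selects unannotated cells
    obs_filter={"cell_type": ["T cell", "B cell", float("nan")]},
)
```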
@@ -241,7 +247,8 @@
              if parallel:
                  conn, storage = None, path
              else:
-                 conn, storage = registry.open("h5py", path)
+                 # force no external compression even for files with .gz extension. REMOVE LATER
+                 conn, storage = registry.open("h5py", path, compression=None)
          else:
              conn, storage = registry.open("zarr", path)
          self.conns.append(conn)
@@ -296,7 +303,7 @@
          self.var_joint = reduce(pd.Index.intersection, self.var_list)
          if len(self.var_joint) == 0:
              raise ValueError(
-                 "The provided AnnData objects don't have shared varibales.\n"
+                 "The provided AnnData objects don't have shared variables.\n"
                  "Use join='outer'."
              )
          self.var_indices = [
@@ -389,7 +396,7 @@
              else:
                  cats = None
              label_idx = self._get_obs_idx(store, obs_idx, label, cats)
-             if label in self.encoders:
+             if label in self.encoders and label_idx is not np.nan:
                  label_idx = self.encoders[label][label_idx]
              out[label] = label_idx
          return out
@@ -453,6 +460,8 @@
              label = labels[idx]
          else:
              label = labels["codes"][idx]
+             if label == -1:
+                 return np.nan
          if categories is not None:
              cats = categories
          else:
@@ -589,7 +598,13 @@
          cats = self._get_categories(storage, label_key)
          if cats is not None:
              cats = _decode(cats) if isinstance(cats[0], bytes) else cats
+             # NaN is coded as -1
+             nans = labels == -1
              labels = cats[labels]
+             # detect and replace nans
+             if nans.any():
+                 labels[nans] = np.nan
+ 
          return labels
  
      def close(self):
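Both NaN hunks follow the pandas/HDF5 convention that categorical codes store missing values as `-1`, which would otherwise silently index the last category. A minimal NumPy sketch of the corrected decode step:

```python
import numpy as np

codes = np.array([0, 2, -1, 1])  # -1 encodes a missing value
cats = np.array(["B cell", "T cell", "NK cell"], dtype=object)

nans = codes == -1     # remember missing positions before indexing
labels = cats[codes]   # -1 would wrongly pick the last category here...
labels[nans] = np.nan  # ...so overwrite those positions with NaN
print(labels)          # ['B cell' 'NK cell' nan 'T cell']
```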
lamindb/core/_track_environment.py CHANGED
@@ -1,6 +1,7 @@
  from __future__ import annotations
  
  import subprocess
+ import sys
  from typing import TYPE_CHECKING
  
  import lamindb_setup as ln_setup
@@ -17,7 +18,7 @@ def track_environment(run: Run) -> None:
      try:
          with open(filepath, "w") as f:
              result = subprocess.run(
-                 ["pip", "freeze"],
+                 [sys.executable, "-m", "pip", "freeze"],
                  stdout=f,
              )
      except OSError as e:
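Invoking pip as `sys.executable -m pip` pins the freeze to the interpreter actually running lamindb instead of whichever `pip` shadows it on `PATH`, which matters in multi-environment setups. The pattern in isolation:

```python
import subprocess
import sys

# capture the active interpreter's packages, not those of a stray PATH pip
with open("run_env_pip.txt", "w") as f:
    subprocess.run([sys.executable, "-m", "pip", "freeze"], stdout=f, check=False)
```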
lamindb/core/datasets/__init__.py CHANGED
@@ -85,4 +85,9 @@ from ._core import (
      schmidt22_perturbseq,
  )
  from ._fake import fake_bio_notebook_titles
- from ._small import anndata_with_obs, small_dataset1, small_dataset2
+ from ._small import (
+     anndata_with_obs,
+     small_dataset1,
+     small_dataset2,
+     small_dataset3_cellxgene,
+ )
lamindb/core/datasets/_core.py CHANGED
@@ -18,7 +18,8 @@ if TYPE_CHECKING:
  def file_fcs() -> Path:
      """Example FCS artifact."""
      filepath, _ = urlretrieve(
-         "https://lamindb-test.s3.amazonaws.com/example.fcs", "example.fcs"
+         "https://lamindb-dev-datasets.s3.amazonaws.com/.lamindb/DBNEczSgBui0bbzBXMGH.fcs",
+         "example.fcs",
      )
      return Path(filepath)
  
@@ -48,8 +49,8 @@ def file_fcs_alpert19(populate_registries: bool = False) -> Path:  # pragma: no
              bt.CellMarker.public().inspect(std, "name").validated, "name"
          )
      )
-     ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
-     ln.Feature(name="organism", dtype=[bt.Organism]).save()
+     ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()  # type: ignore
+     ln.Feature(name="organism", dtype=[bt.Organism]).save()  # type: ignore
      ln.settings.verbosity = verbosity
      return Path(filepath)
  
@@ -84,8 +85,8 @@ def file_tsv_rnaseq_nfcore_salmon_merged_gene_counts(
  
      verbosity = ln.settings.verbosity
      ln.settings.verbosity = "error"
-     ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
-     ln.Feature(name="organism", dtype=[bt.Organism]).save()
+     ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()  # type: ignore
+     ln.Feature(name="organism", dtype=[bt.Organism]).save()  # type: ignore
      bt.ExperimentalFactor.from_source(ontology_id="EFO:0008896").save()
      ln.settings.verbosity = verbosity
  
@@ -207,7 +208,7 @@ def anndata_mouse_sc_lymph_node(
      # cell types
      ln.save(bt.CellType.from_values(["CL:0000115", "CL:0000738"], "ontology_id"))
      # assays
-     ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
+     ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()  # type: ignore
      bt.ExperimentalFactor.from_source(ontology_id="EFO:0008913").save()
      # genes
      validated = bt.Gene.public(organism="mouse").validate(
@@ -330,11 +331,11 @@ def anndata_human_immune_cells(
      ln.save(bt.CellType.from_values(adata.obs.cell_type, field="name"))
      ln.save(bt.ExperimentalFactor.from_values(adata.obs.assay, field="name"))
      ln.save(bt.Tissue.from_values(adata.obs.tissue, field="name"))
-     ln.Feature(name="cell_type", dtype=[bt.CellType]).save()
-     ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
-     ln.Feature(name="tissue", dtype=[bt.Tissue]).save()
-     ln.Feature(name="organism", dtype=[bt.Organism]).save()
-     ln.Feature(name="donor", dtype=[ln.ULabel]).save()
+     ln.Feature(name="cell_type", dtype=[bt.CellType]).save()  # type: ignore
+     ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()  # type: ignore
+     ln.Feature(name="tissue", dtype=[bt.Tissue]).save()  # type: ignore
+     ln.Feature(name="organism", dtype=[bt.Organism]).save()  # type: ignore
+     ln.Feature(name="donor", dtype=[ln.ULabel]).save()  # type: ignore
      bt.ExperimentalFactor.from_source(ontology_id="EFO:0008913").save()
      ln.save([ln.ULabel(name=name) for name in adata.obs.donor.unique()])
      ln.settings.verbosity = verbosity