lamindb 1.0.5__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. lamindb/__init__.py +14 -5
  2. lamindb/_artifact.py +150 -53
  3. lamindb/_can_curate.py +27 -8
  4. lamindb/_collection.py +85 -51
  5. lamindb/_feature.py +177 -41
  6. lamindb/_finish.py +12 -6
  7. lamindb/_from_values.py +83 -98
  8. lamindb/_parents.py +4 -4
  9. lamindb/_query_set.py +59 -17
  10. lamindb/_record.py +171 -53
  11. lamindb/_run.py +4 -4
  12. lamindb/_save.py +33 -10
  13. lamindb/_schema.py +135 -38
  14. lamindb/_storage.py +1 -1
  15. lamindb/_tracked.py +106 -0
  16. lamindb/_transform.py +21 -8
  17. lamindb/_ulabel.py +5 -14
  18. lamindb/base/validation.py +2 -6
  19. lamindb/core/__init__.py +13 -14
  20. lamindb/core/_context.py +7 -7
  21. lamindb/core/_data.py +29 -25
  22. lamindb/core/_describe.py +1 -1
  23. lamindb/core/_django.py +1 -1
  24. lamindb/core/_feature_manager.py +53 -43
  25. lamindb/core/_label_manager.py +4 -4
  26. lamindb/core/_mapped_collection.py +20 -7
  27. lamindb/core/datasets/__init__.py +6 -1
  28. lamindb/core/datasets/_core.py +12 -11
  29. lamindb/core/datasets/_small.py +66 -20
  30. lamindb/core/exceptions.py +1 -90
  31. lamindb/core/loaders.py +6 -12
  32. lamindb/core/relations.py +6 -4
  33. lamindb/core/storage/_anndata_accessor.py +41 -0
  34. lamindb/core/storage/_backed_access.py +2 -2
  35. lamindb/core/storage/_pyarrow_dataset.py +25 -15
  36. lamindb/core/storage/_tiledbsoma.py +56 -12
  37. lamindb/core/storage/paths.py +27 -21
  38. lamindb/core/subsettings/_creation_settings.py +4 -16
  39. lamindb/curators/__init__.py +2168 -833
  40. lamindb/curators/_cellxgene_schemas/__init__.py +26 -0
  41. lamindb/curators/_cellxgene_schemas/schema_versions.yml +104 -0
  42. lamindb/errors.py +96 -0
  43. lamindb/integrations/_vitessce.py +3 -3
  44. lamindb/migrations/0069_squashed.py +76 -75
  45. lamindb/migrations/0075_lamindbv1_part5.py +4 -5
  46. lamindb/migrations/0082_alter_feature_dtype.py +21 -0
  47. lamindb/migrations/0083_alter_feature_is_type_alter_flextable_is_type_and_more.py +94 -0
  48. lamindb/migrations/0084_alter_schemafeature_feature_and_more.py +35 -0
  49. lamindb/migrations/0085_alter_feature_is_type_alter_flextable_is_type_and_more.py +63 -0
  50. lamindb/migrations/0086_various.py +95 -0
  51. lamindb/migrations/0087_rename__schemas_m2m_artifact_feature_sets_and_more.py +41 -0
  52. lamindb/migrations/0088_schema_components.py +273 -0
  53. lamindb/migrations/0088_squashed.py +4372 -0
  54. lamindb/models.py +420 -153
  55. {lamindb-1.0.5.dist-info → lamindb-1.1.0.dist-info}/METADATA +9 -7
  56. lamindb-1.1.0.dist-info/RECORD +95 -0
  57. lamindb/curators/_spatial.py +0 -528
  58. lamindb/migrations/0052_squashed.py +0 -1261
  59. lamindb/migrations/0053_alter_featureset_hash_alter_paramvalue_created_by_and_more.py +0 -57
  60. lamindb/migrations/0054_alter_feature_previous_runs_and_more.py +0 -35
  61. lamindb/migrations/0055_artifact_type_artifactparamvalue_and_more.py +0 -61
  62. lamindb/migrations/0056_rename_ulabel_ref_is_name_artifactulabel_label_ref_is_name_and_more.py +0 -22
  63. lamindb/migrations/0057_link_models_latest_report_and_others.py +0 -356
  64. lamindb/migrations/0058_artifact__actions_collection__actions.py +0 -22
  65. lamindb/migrations/0059_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -31
  66. lamindb/migrations/0060_alter_artifact__actions.py +0 -22
  67. lamindb/migrations/0061_alter_collection_meta_artifact_alter_run_environment_and_more.py +0 -45
  68. lamindb/migrations/0062_add_is_latest_field.py +0 -32
  69. lamindb/migrations/0063_populate_latest_field.py +0 -45
  70. lamindb/migrations/0064_alter_artifact_version_alter_collection_version_and_more.py +0 -33
  71. lamindb/migrations/0065_remove_collection_feature_sets_and_more.py +0 -22
  72. lamindb/migrations/0066_alter_artifact__feature_values_and_more.py +0 -352
  73. lamindb/migrations/0067_alter_featurevalue_unique_together_and_more.py +0 -20
  74. lamindb/migrations/0068_alter_artifactulabel_unique_together_and_more.py +0 -20
  75. lamindb/migrations/0069_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -1294
  76. lamindb-1.0.5.dist-info/RECORD +0 -102
  77. {lamindb-1.0.5.dist-info → lamindb-1.1.0.dist-info}/LICENSE +0 -0
  78. {lamindb-1.0.5.dist-info → lamindb-1.1.0.dist-info}/WHEEL +0 -0
lamindb/_collection.py CHANGED
@@ -15,33 +15,35 @@ from lamin_utils import logger
 from lamindb_setup.core._docs import doc_args
 from lamindb_setup.core.hashing import hash_set
 
-from lamindb.models import (
-    Collection,
-    CollectionArtifact,
-    Schema,
-)
-
 from ._parents import view_lineage
-from ._record import init_self_from_db, update_attributes
+from ._record import _get_record_kwargs, init_self_from_db, update_attributes
 from ._utils import attach_func_to_class_method
 from .core._data import (
     _track_run_input,
     describe,
     get_run,
     save_schema_links,
-    save_staged__schemas_m2m,
+    save_staged_feature_sets,
 )
 from .core._mapped_collection import MappedCollection
-from .core._settings import settings
+from .core.storage._pyarrow_dataset import _is_pyarrow_dataset, _open_pyarrow_dataset
 from .core.versioning import process_revises
-from .models import Artifact, Run
+from .errors import FieldValidationError
+from .models import (
+    Artifact,
+    Collection,
+    CollectionArtifact,
+    Run,
+    Schema,
+)
 
 if TYPE_CHECKING:
     from collections.abc import Iterable
 
-    from lamindb.core.storage import UPath
+    from pyarrow.dataset import Dataset as PyArrowDataset
 
     from ._query_set import QuerySet
+    from .core.storage import UPath
 
 
 class CollectionFeatureManager:
@@ -50,15 +52,15 @@ class CollectionFeatureManager:
     def __init__(self, collection: Collection):
         self._collection = collection
 
-    def _get_staged__schemas_m2m_union(self) -> dict[str, Schema]:
-        links_schema_artifact = Artifact._schemas_m2m.through.objects.filter(
+    def _get_staged_feature_sets_union(self) -> dict[str, Schema]:
+        links_schema_artifact = Artifact.feature_sets.through.objects.filter(
             artifact_id__in=self._collection.artifacts.values_list("id", flat=True)
         )
-        _schemas_m2m_by_slots = defaultdict(list)
+        feature_sets_by_slots = defaultdict(list)
         for link in links_schema_artifact:
-            _schemas_m2m_by_slots[link.slot].append(link.schema_id)
-        _schemas_m2m_union = {}
-        for slot, schema_ids_slot in _schemas_m2m_by_slots.items():
+            feature_sets_by_slots[link.slot].append(link.schema_id)
+        feature_sets_union = {}
+        for slot, schema_ids_slot in feature_sets_by_slots.items():
             schema_1 = Schema.get(id=schema_ids_slot[0])
             related_name = schema_1._get_related_name()
             features_registry = getattr(Schema, related_name).field.model
@@ -73,8 +75,8 @@ class CollectionFeatureManager:
                 .distinct()
             )
             features = features_registry.filter(id__in=feature_ids)
-            _schemas_m2m_union[slot] = Schema(features, dtype=schema_1.dtype)
-        return _schemas_m2m_union
+            feature_sets_union[slot] = Schema(features, dtype=schema_1.dtype)
+        return feature_sets_union
 
 
 def __init__(
@@ -92,23 +94,16 @@ def __init__(
     artifacts: Artifact | Iterable[Artifact] = (
         kwargs.pop("artifacts") if len(args) == 0 else args[0]
     )
-    meta_artifact: Artifact | None = (
-        kwargs.pop("meta_artifact") if "meta_artifact" in kwargs else None
-    )
-    key: str | None = kwargs.pop("key") if "key" in kwargs else None
-    description: str | None = (
-        kwargs.pop("description") if "description" in kwargs else None
-    )
-    reference: str | None = kwargs.pop("reference") if "reference" in kwargs else None
-    reference_type: str | None = (
-        kwargs.pop("reference_type") if "reference_type" in kwargs else None
-    )
-    run: Run | None = kwargs.pop("run") if "run" in kwargs else None
-    revises: Collection | None = kwargs.pop("revises") if "revises" in kwargs else None
-    version: str | None = kwargs.pop("version") if "version" in kwargs else None
-    _branch_code: int | None = (
-        kwargs.pop("_branch_code") if "_branch_code" in kwargs else 1
-    )
+    meta_artifact: Artifact | None = kwargs.pop("meta_artifact", None)
+    tmp_key: str | None = kwargs.pop("key", None)
+    description: str | None = kwargs.pop("description", None)
+    reference: str | None = kwargs.pop("reference", None)
+    reference_type: str | None = kwargs.pop("reference_type", None)
+    run: Run | None = kwargs.pop("run", None)
+    revises: Collection | None = kwargs.pop("revises", None)
+    version: str | None = kwargs.pop("version", None)
+    _branch_code: int | None = kwargs.pop("_branch_code", 1)
+    key: str
     if "name" in kwargs:
         key = kwargs.pop("name")
         warnings.warn(
@@ -116,9 +111,16 @@ def __init__(
             FutureWarning,
             stacklevel=2,
         )
+    else:
+        key = tmp_key
     if not len(kwargs) == 0:
-        raise ValueError(
-            f"Only artifacts, key, run, description, reference, reference_type can be passed, you passed: {kwargs}"
+        valid_keywords = ", ".join([val[0] for val in _get_record_kwargs(Collection)])
+        raise FieldValidationError(
+            f"Only {valid_keywords} can be passed, you passed: {kwargs}"
+        )
+    if revises is None:
+        revises = (
+            Collection.filter(key=key, is_latest=True).order_by("-created_at").first()
         )
     provisional_uid, version, key, description, revises = process_revises(
         revises, version, key, description, Collection
@@ -162,11 +164,8 @@ def __init__(
         init_self_from_db(collection, existing_collection)
         update_attributes(collection, {"description": description, "key": key})
     else:
-        kwargs = {}
-        search_names_setting = settings.creation.search_names
-        if revises is not None and key == revises.key:
-            settings.creation.search_names = False
-        super(Collection, collection).__init__(
+        _skip_validation = revises is not None and key == revises.key
+        super(Collection, collection).__init__(  # type: ignore
             uid=provisional_uid,
             key=key,
             description=description,
@@ -178,9 +177,8 @@ def __init__(
             version=version,
             _branch_code=_branch_code,
             revises=revises,
-            **kwargs,
+            _skip_validation=_skip_validation,
         )
-        settings.creation.search_names = search_names_setting
     collection._artifacts = artifacts
     # register provenance
     if revises is not None:
@@ -190,8 +188,9 @@ def __init__(
 
 # docstring handled through attach_func_to_class_method
 def append(self, artifact: Artifact, run: Run | None = None) -> Collection:
-    return Collection(
+    return Collection(  # type: ignore
         self.artifacts.all().list() + [artifact],
+        # key is automatically taken from revises.key
         description=self.description,
         revises=self,
         run=run,
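Since `append` re-creates the collection with `revises=self`, each call yields a new, unsaved version whose key is inherited from the revised collection. A minimal usage sketch (both keys are hypothetical):

    import lamindb as ln

    collection = ln.Collection.get(key="scrna-batches")  # hypothetical key
    new_artifact = ln.Artifact.get(key="batch4.h5ad")  # hypothetical key
    # returns a new, unsaved Collection that revises the previous one
    collection_v2 = collection.append(new_artifact).save()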
@@ -218,13 +217,46 @@ def from_artifacts(artifacts: Iterable[Artifact]) -> tuple[str, dict[str, str]]:
     return hash
 
 
+# docstring handled through attach_func_to_class_method
+def open(self, is_run_input: bool | None = None) -> PyArrowDataset:
+    if self._state.adding:
+        artifacts = self._artifacts
+        logger.warning("the collection isn't saved, consider calling `.save()`")
+    else:
+        artifacts = self.ordered_artifacts.all()
+    paths = [artifact.path for artifact in artifacts]
+    # this checks that the filesystem is the same for all paths
+    # this is a requirement of pyarrow.dataset.dataset
+    fs = paths[0].fs
+    for path in paths[1:]:
+        # this assumes that the filesystems are cached by fsspec
+        if path.fs is not fs:
+            raise ValueError(
+                "The collection has artifacts with different filesystems, this is not supported."
+            )
+    if not _is_pyarrow_dataset(paths):
+        suffixes = {path.suffix for path in paths}
+        suffixes_str = ", ".join(suffixes)
+        err_msg = "This collection is not compatible with pyarrow.dataset.dataset(), "
+        err_msg += (
+            f"the artifacts have incompatible file types: {suffixes_str}"
+            if len(suffixes) > 1
+            else f"the file type {suffixes_str} is not supported by pyarrow."
+        )
+        raise ValueError(err_msg)
+    dataset = _open_pyarrow_dataset(paths)
+    # track only if successful
+    _track_run_input(self, is_run_input)
+    return dataset
+
+
 # docstring handled through attach_func_to_class_method
 def mapped(
     self,
     layers_keys: str | list[str] | None = None,
     obs_keys: str | list[str] | None = None,
     obsm_keys: str | list[str] | None = None,
-    obs_filter: dict[str, str | tuple[str, ...]] | None = None,
+    obs_filter: dict[str, str | list[str]] | None = None,
     join: Literal["inner", "outer"] | None = "inner",
     encode_labels: bool | list[str] = True,
     unknown_label: str | dict[str, str] | None = None,
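`open()` is new in 1.1.0: it exposes a collection as a single `pyarrow.dataset.Dataset` spanning all artifact paths, provided they share one filesystem and have pyarrow-compatible file types. A minimal usage sketch, assuming a saved collection of parquet artifacts (the key is hypothetical):

    import lamindb as ln

    collection = ln.Collection.get(key="daily-events")  # hypothetical key
    dataset = collection.open()  # run input is tracked only if opening succeeds
    df = dataset.to_table().to_pandas()  # materialize via pyarrow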
@@ -237,12 +269,12 @@ def mapped(
     path_list = []
     if self._state.adding:
         artifacts = self._artifacts
-        logger.warning("The collection isn't saved, consider calling `.save()`")
+        logger.warning("the collection isn't saved, consider calling `.save()`")
     else:
         artifacts = self.ordered_artifacts.all()
     for artifact in artifacts:
         if artifact.suffix not in {".h5ad", ".zarr"}:
-            logger.warning(f"Ignoring artifact with suffix {artifact.suffix}")
+            logger.warning(f"ignoring artifact with suffix {artifact.suffix}")
             continue
         elif not stream:
             path_list.append(artifact.cache())
@@ -335,14 +367,14 @@ def save(self, using: str | None = None) -> Collection:
     if self.meta_artifact is not None:
         self.meta_artifact.save()
     # we don't need to save feature sets again
-    save_staged__schemas_m2m(self)
+    save_staged_feature_sets(self)
     super(Collection, self).save()
     # we don't allow updating the collection of artifacts
     # if users want to update the set of artifacts, they
     # have to create a new collection
     if hasattr(self, "_artifacts"):
         links = [
-            CollectionArtifact(collection_id=self.id, artifact_id=artifact.id)
+            CollectionArtifact(collection_id=self.id, artifact_id=artifact.id)  # type: ignore
             for artifact in self._artifacts
         ]
         # the below seems to preserve the order of the list in the
@@ -380,6 +412,7 @@ def data_artifact(self) -> Artifact | None:
 METHOD_NAMES = [
     "__init__",
     "append",
+    "open",
     "mapped",
     "cache",
     "load",
@@ -400,6 +433,7 @@ if ln_setup._TESTING:
 for name in METHOD_NAMES:
     attach_func_to_class_method(name, Collection, globals())
 
+# mypy: ignore-errors
 Collection.ordered_artifacts = ordered_artifacts
 Collection.data_artifact = data_artifact
 Collection.describe = describe
lamindb/_feature.py CHANGED
@@ -1,16 +1,20 @@
 from __future__ import annotations
 
+import importlib
 from typing import TYPE_CHECKING, Any, get_args
 
 import lamindb_setup as ln_setup
 import pandas as pd
+from django.db.models.query_utils import DeferredAttribute
 from lamin_utils import logger
+from lamindb_setup._init_instance import get_schema_module_name
 from lamindb_setup.core._docs import doc_args
 from pandas.api.types import CategoricalDtype, is_string_dtype
 
+from lamindb._record import _get_record_kwargs
 from lamindb.base.types import FeatureDtype
-from lamindb.core.exceptions import ValidationError
-from lamindb.models import Artifact, Feature, Record
+from lamindb.errors import FieldValidationError, ValidationError
+from lamindb.models import Artifact, Feature, Record, Registry
 
 from ._query_set import RecordList
 from ._utils import attach_func_to_class_method
@@ -27,21 +31,133 @@ if TYPE_CHECKING:
 FEATURE_DTYPES = set(get_args(FeatureDtype))
 
 
-def get_dtype_str_from_dtype(dtype: Any) -> str:
-    if not isinstance(dtype, list) and dtype.__name__ in FEATURE_DTYPES:
+def parse_dtype_single_cat(
+    dtype_str: str,
+    related_registries: dict[str, Record] | None = None,
+    is_itype: bool = False,
+) -> dict:
+    assert isinstance(dtype_str, str)  # noqa: S101
+    if related_registries is None:
+        related_registries = dict_module_name_to_model_name(Artifact)
+    split_result = dtype_str.split("[")
+    # has sub type
+    sub_type_str = ""
+    if len(split_result) == 2:
+        registry_str = split_result[0]
+        assert "]" in split_result[1]  # noqa: S101
+        sub_type_field_split = split_result[1].split("].")
+        if len(sub_type_field_split) == 1:
+            sub_type_str = sub_type_field_split[0].strip("]")
+            field_str = ""
+        else:
+            sub_type_str = sub_type_field_split[0]
+            field_str = sub_type_field_split[1]
+    elif len(split_result) == 1:
+        registry_field_split = split_result[0].split(".")
+        if (
+            len(registry_field_split) == 2 and registry_field_split[1][0].isupper()
+        ) or len(registry_field_split) == 3:
+            # bionty.CellType or bionty.CellType.name
+            registry_str = f"{registry_field_split[0]}.{registry_field_split[1]}"
+            field_str = (
+                "" if len(registry_field_split) == 2 else registry_field_split[2]
+            )
+        else:
+            # ULabel or ULabel.name
+            registry_str = registry_field_split[0]
+            field_str = (
+                "" if len(registry_field_split) == 1 else registry_field_split[1]
+            )
+    if not is_itype:
+        if registry_str not in related_registries:
+            raise ValidationError(
+                f"'{registry_str}' is an invalid dtype, has to be registry, e.g. ULabel or bionty.CellType"
+            )
+        registry = related_registries[registry_str]
+    else:
+        if "." in registry_str:
+            registry_str_split = registry_str.split(".")
+            assert len(registry_str_split) == 2, registry_str  # noqa: S101
+            module_name, class_name = registry_str_split
+            module_name = get_schema_module_name(module_name)
+        else:
+            module_name, class_name = "lamindb", registry_str
+        module = importlib.import_module(module_name)
+        registry = getattr(module, class_name)
+    if sub_type_str != "":
+        pass
+        # validate that the subtype is a record in the registry with is_type = True
+    if field_str != "":
+        pass
+        # validate that field_str is an actual field of the module
+    else:
+        field_str = registry._name_field if hasattr(registry, "_name_field") else "name"
+    return {
+        "registry": registry,  # should be typed as CanCurate
+        "registry_str": registry_str,
+        "subtype_str": sub_type_str,
+        "field_str": field_str,
+        "field": getattr(registry, field_str),
+    }
+
+
+def parse_dtype(dtype_str: str, is_param: bool = False) -> list[dict[str, str]]:
+    allowed_dtypes = FEATURE_DTYPES
+    if is_param:
+        allowed_dtypes.add("dict")
+    is_composed_cat = dtype_str.startswith("cat[") and dtype_str.endswith("]")
+    result = []
+    if is_composed_cat:
+        related_registries = dict_module_name_to_model_name(Artifact)
+        registries_str = dtype_str.replace("cat[", "")[:-1]  # strip last ]
+        if registries_str != "":
+            registry_str_list = registries_str.split("|")
+            for cat_single_dtype_str in registry_str_list:
+                single_result = parse_dtype_single_cat(
+                    cat_single_dtype_str, related_registries
+                )
+                result.append(single_result)
+    elif dtype_str not in allowed_dtypes:
+        raise ValueError(
+            f"dtype is '{dtype_str}' but has to be one of {FEATURE_DTYPES}!"
+        )
+    return result
+
+
+def get_dtype_str_from_dtype(dtype: Any, is_itype: bool = False) -> str:
+    if (
+        not isinstance(dtype, list)
+        and hasattr(dtype, "__name__")
+        and dtype.__name__ in FEATURE_DTYPES
+    ):
         dtype_str = dtype.__name__
     else:
-        error_message = "dtype has to be of type Record or list[Record]"
-        if isinstance(dtype, Record):
+        error_message = (
+            "dtype has to be a record, a record field, or a list of records, not {}"
+        )
+        if isinstance(dtype, Registry):
+            dtype = [dtype]
+        elif isinstance(dtype, DeferredAttribute):
             dtype = [dtype]
         elif not isinstance(dtype, list):
-            raise ValueError(error_message)
-        registries_str = ""
-        for registry in dtype:
-            if not hasattr(registry, "__get_name_with_module__"):
-                raise ValueError(error_message)
-            registries_str += registry.__get_name_with_module__() + "|"
-        dtype_str = f'cat[{registries_str.rstrip("|")}]'
+            raise ValueError(error_message.format(dtype))
+        dtype_str = ""
+        for single_dtype in dtype:
+            if not isinstance(single_dtype, Registry) and not isinstance(
+                single_dtype, DeferredAttribute
+            ):
+                raise ValueError(error_message.format(single_dtype))
+            if isinstance(single_dtype, Registry):
+                dtype_str += single_dtype.__get_name_with_module__() + "|"
+            else:
+                dtype_str += (
+                    single_dtype.field.model.__get_name_with_module__()
+                    + f".{single_dtype.field.name}"
+                    + "|"
+                )
+        dtype_str = dtype_str.rstrip("|")
+        if not is_itype:
+            dtype_str = f"cat[{dtype_str}]"
     return dtype_str
 
 
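`get_dtype_str_from_dtype` and `parse_dtype` are inverses over the `cat[...]` serialization: a registry contributes its module-qualified name, a registry field additionally appends the field name, and components are joined with `|`. A minimal round-trip sketch of the functions above (assumes the bionty plugin is installed):

    import bionty as bt
    import lamindb as ln
    from lamindb._feature import get_dtype_str_from_dtype, parse_dtype

    dtype_str = get_dtype_str_from_dtype([ln.ULabel, bt.CellType.name])
    # -> "cat[ULabel|bionty.CellType.name]"

    # one dict per categorical component, with keys
    # registry, registry_str, subtype_str, field_str, field
    components = parse_dtype(dtype_str)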
@@ -63,44 +179,64 @@ def convert_pandas_dtype_to_lamin_dtype(pandas_dtype: ExtensionDtype) -> str:
     return dtype
 
 
-def __init__(self, *args, **kwargs):
-    if len(args) == len(self._meta.concrete_fields):
-        super(Feature, self).__init__(*args, **kwargs)
-        return None
+def process_init_feature_param(args, kwargs, is_param: bool = False):
     # now we proceed with the user-facing constructor
     if len(args) != 0:
         raise ValueError("Only keyword args allowed")
-    dtype: type | str = kwargs.pop("dtype") if "dtype" in kwargs else None
-    # cast type
-    if dtype is None:
-        raise ValueError(f"Please pass dtype, one of {FEATURE_DTYPES}")
-    elif dtype is not None:
+    name: str = kwargs.pop("name", None)
+    dtype: type | str | None = kwargs.pop("dtype", None)
+    is_type: bool = kwargs.pop("is_type", None)
+    type_: Feature | str | None = kwargs.pop("type", None)
+    description: str | None = kwargs.pop("description", None)
+    if kwargs:
+        valid_keywords = ", ".join([val[0] for val in _get_record_kwargs(Feature)])
+        raise FieldValidationError(f"Only {valid_keywords} are valid keyword arguments")
+    kwargs["name"] = name
+    kwargs["type"] = type_
+    kwargs["is_type"] = is_type
+    if not is_param:
+        kwargs["description"] = description
+    # cast dtype
+    if dtype is None and not is_type:
+        raise ValidationError(
+            f"Please pass dtype, one of {FEATURE_DTYPES} or a composed categorical dtype"
+        )
+    dtype_str = None
+    if dtype is not None:
         if not isinstance(dtype, str):
             dtype_str = get_dtype_str_from_dtype(dtype)
         else:
             dtype_str = dtype
-        # add validation that a registry actually exists
-        if dtype_str not in FEATURE_DTYPES and not dtype_str.startswith("cat"):
-            raise ValueError(
-                f"dtype is {dtype_str} but has to be one of {FEATURE_DTYPES}!"
-            )
-        if dtype_str != "cat" and dtype_str.startswith("cat"):
-            registries_str = dtype_str.replace("cat[", "").rstrip("]")
-            if registries_str != "":
-                registry_str_list = registries_str.split("|")
-                for registry_str in registry_str_list:
-                    if registry_str not in dict_module_name_to_model_name(Artifact):
-                        raise ValueError(
-                            f"'{registry_str}' is an invalid dtype, pass, e.g. `[ln.ULabel, bt.CellType]` or similar"
-                        )
-        kwargs["dtype"] = dtype_str
+        parse_dtype(dtype_str, is_param=is_param)
+        kwargs["dtype"] = dtype_str
+    return kwargs
+
+
+def __init__(self, *args, **kwargs):
+    if len(args) == len(self._meta.concrete_fields):
+        super(Feature, self).__init__(*args, **kwargs)
+        return None
+    dtype = kwargs.get("dtype", None)
+    default_value = kwargs.pop("default_value", None)
+    nullable = kwargs.pop("nullable", None)
+    cat_filters = kwargs.pop("cat_filters", None)
+    kwargs = process_init_feature_param(args, kwargs)
     super(Feature, self).__init__(*args, **kwargs)
+    self.default_value = default_value
+    self.nullable = nullable
+    dtype_str = kwargs.pop("dtype", None)
+    if cat_filters:
+        assert "|" not in dtype_str  # noqa: S101
+        assert "]]" not in dtype_str  # noqa: S101
+        fill_in = ", ".join(f"{key}='{value}'" for (key, value) in cat_filters.items())
+        dtype_str = dtype_str.replace("]", f"[{fill_in}]]")
+    self.dtype = dtype_str
     if not self._state.adding:
         if not (
-            self.dtype.startswith("cat") if dtype == "cat" else self.dtype == dtype
+            self.dtype.startswith("cat") if dtype == "cat" else self.dtype == dtype_str
         ):
             raise ValidationError(
-                f"Feature {self.name} already exists with dtype {self.dtype}, you passed {dtype}"
+                f"Feature {self.name} already exists with dtype {self.dtype}, you passed {dtype_str}"
             )
 
 
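Per the constructor above, `default_value` and `nullable` are set as plain attributes, while `cat_filters` is folded into the dtype string, turning e.g. `cat[ULabel]` into `cat[ULabel[type='Medium']]`. A minimal sketch (feature name and filter value are hypothetical):

    import lamindb as ln

    medium = ln.Feature(
        name="cell_medium",  # hypothetical name
        dtype=ln.ULabel,
        cat_filters={"type": "Medium"},  # hypothetical filter, serialized into dtype
        nullable=False,
    ).save()
    # per the replace() call above:
    assert medium.dtype == "cat[ULabel[type='Medium']]"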
@@ -138,7 +274,7 @@ def categoricals_from_df(df: pd.DataFrame) -> dict:
 def from_df(cls, df: pd.DataFrame, field: FieldAttr | None = None) -> RecordList:
     """{}"""  # noqa: D415
     field = Feature.name if field is None else field
-    registry = field.field.model
+    registry = field.field.model  # type: ignore
     if registry != Feature:
         raise ValueError("field must be a Feature FieldAttr!")
     categoricals = categoricals_from_df(df)
@@ -149,7 +285,7 @@ def from_df(cls, df: pd.DataFrame, field: FieldAttr | None = None) -> RecordList
         else:
             dtypes[name] = convert_pandas_dtype_to_lamin_dtype(col.dtype)
     with logger.mute():  # silence the warning "loaded record with exact same name "
-        features = [Feature(name=name, dtype=dtype) for name, dtype in dtypes.items()]
+        features = [Feature(name=name, dtype=dtype) for name, dtype in dtypes.items()]  # type: ignore
     assert len(features) == len(df.columns)  # noqa: S101
     return RecordList(features)
 
lamindb/_finish.py CHANGED
@@ -96,7 +96,7 @@ def save_run_logs(run: Run, save_run: bool = False) -> None:
     if logs_path.exists():
         if run.report is not None:
             logger.important("overwriting run.report")
-        artifact = Artifact(
+        artifact = Artifact(  # type: ignore
             logs_path,
             description=f"log streams of run {run.uid}",
             _branch_code=0,
@@ -159,7 +159,7 @@ def notebook_to_report(notebook_path: Path, output_path: Path) -> None:
     output_path.write_text(html, encoding="utf-8")
 
 
-def notebook_to_script(
+def notebook_to_script(  # type: ignore
     transform: Transform, notebook_path: Path, script_path: Path | None = None
 ) -> None | str:
     import jupytext
@@ -207,8 +207,13 @@ def clean_r_notebook_html(file_path: Path) -> tuple[str | None, Path]:
 
 
 def check_filepath_recently_saved(filepath: Path, is_finish_retry: bool) -> bool:
-    recently_saved_time = 3 if not is_finish_retry else 20
+    # the recently_saved_time needs to be very low for the first check
+    # because an accidental save (e.g. via auto-save) might otherwise lead
+    # to upload of an outdated notebook
+    # also see implementation for R notebooks below
+    offset_saved_time = 0.3 if not is_finish_retry else 20
     for retry in range(30):
+        recently_saved_time = offset_saved_time + retry  # sleep time is 1 sec
         if get_seconds_since_modified(filepath) > recently_saved_time:
             if retry == 0:
                 prefix = f"{LEVEL_TO_COLORS[20]}{LEVEL_TO_ICONS[20]}{RESET_COLOR}"
@@ -316,7 +321,8 @@ def save_context_core(
             f"no html report found; to attach one, create an .html export for your {filepath.suffix} file and then run: lamin save {filepath}"
         )
     if report_path is not None and is_r_notebook and not from_cli:  # R notebooks
-        recently_saved_time = 3 if not is_retry else 20
+        # see comment above in check_filepath_recently_saved
+        recently_saved_time = 0.3 if not is_retry else 20
         if get_seconds_since_modified(report_path) > recently_saved_time:
             # the automated retry solution of Jupyter notebooks does not work in RStudio because the execution of the notebook cell
             # seems to block the event loop of the frontend
@@ -365,7 +371,7 @@ def save_context_core(
     artifact = ln.Artifact.filter(hash=hash, _branch_code=0).one_or_none()
     new_env_artifact = artifact is None
     if new_env_artifact:
-        artifact = ln.Artifact(
+        artifact = ln.Artifact(  # type: ignore
             env_path,
             description="requirements.txt",
             _branch_code=0,
@@ -411,7 +417,7 @@ def save_context_core(
         else:
             logger.important("report is already saved")
     else:
-        report_file = ln.Artifact(
+        report_file = ln.Artifact(  # type: ignore
             report_path,
             description=f"Report of run {run.uid}",
             _branch_code=0,  # hidden file