PyPI - lamindb - Versions diffs - 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl - Mend

lamindb 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (74) hide show

lamindb/__init__.py +33 -26
lamindb/_finish.py +9 -1
lamindb/_tracked.py +26 -3
lamindb/_view.py +2 -3
lamindb/base/__init__.py +1 -1
lamindb/base/ids.py +1 -10
lamindb/base/users.py +1 -4
lamindb/core/__init__.py +7 -65
lamindb/core/_compat.py +60 -0
lamindb/core/_context.py +50 -22
lamindb/core/_mapped_collection.py +4 -2
lamindb/core/_settings.py +6 -6
lamindb/core/_sync_git.py +1 -1
lamindb/core/_track_environment.py +2 -1
lamindb/core/datasets/_small.py +3 -3
lamindb/core/loaders.py +43 -20
lamindb/core/storage/_anndata_accessor.py +8 -3
lamindb/core/storage/_backed_access.py +14 -7
lamindb/core/storage/_pyarrow_dataset.py +24 -9
lamindb/core/storage/_tiledbsoma.py +8 -6
lamindb/core/storage/_zarr.py +104 -25
lamindb/core/storage/objects.py +63 -28
lamindb/core/storage/paths.py +16 -13
lamindb/core/types.py +10 -0
lamindb/curators/__init__.py +176 -149
lamindb/errors.py +1 -1
lamindb/integrations/_vitessce.py +4 -4
lamindb/migrations/0089_subsequent_runs.py +159 -0
lamindb/migrations/0090_runproject_project_runs.py +73 -0
lamindb/migrations/{0088_squashed.py → 0090_squashed.py} +245 -177
lamindb/models/__init__.py +79 -0
lamindb/{core → models}/_describe.py +3 -3
lamindb/{core → models}/_django.py +8 -5
lamindb/{core → models}/_feature_manager.py +103 -87
lamindb/{_from_values.py → models/_from_values.py} +5 -2
lamindb/{core/versioning.py → models/_is_versioned.py} +94 -6
lamindb/{core → models}/_label_manager.py +10 -17
lamindb/{core/relations.py → models/_relations.py} +8 -1
lamindb/models/artifact.py +2602 -0
lamindb/{_can_curate.py → models/can_curate.py} +349 -180
lamindb/models/collection.py +683 -0
lamindb/models/core.py +135 -0
lamindb/models/feature.py +643 -0
lamindb/models/flextable.py +163 -0
lamindb/{_parents.py → models/has_parents.py} +55 -49
lamindb/models/project.py +384 -0
lamindb/{_query_manager.py → models/query_manager.py} +10 -8
lamindb/{_query_set.py → models/query_set.py} +64 -32
lamindb/models/record.py +1762 -0
lamindb/models/run.py +563 -0
lamindb/{_save.py → models/save.py} +18 -8
lamindb/models/schema.py +732 -0
lamindb/models/transform.py +360 -0
lamindb/models/ulabel.py +249 -0
{lamindb-1.1.0.dist-info → lamindb-1.2.0.dist-info}/METADATA +6 -6
lamindb-1.2.0.dist-info/RECORD +95 -0
lamindb/_artifact.py +0 -1361
lamindb/_collection.py +0 -440
lamindb/_feature.py +0 -316
lamindb/_is_versioned.py +0 -40
lamindb/_record.py +0 -1065
lamindb/_run.py +0 -60
lamindb/_schema.py +0 -347
lamindb/_storage.py +0 -15
lamindb/_transform.py +0 -170
lamindb/_ulabel.py +0 -56
lamindb/_utils.py +0 -9
lamindb/base/validation.py +0 -63
lamindb/core/_data.py +0 -491
lamindb/core/fields.py +0 -12
lamindb/models.py +0 -4435
lamindb-1.1.0.dist-info/RECORD +0 -95
{lamindb-1.1.0.dist-info → lamindb-1.2.0.dist-info}/LICENSE +0 -0
{lamindb-1.1.0.dist-info → lamindb-1.2.0.dist-info}/WHEEL +0 -0

lamindb/base/validation.py DELETED Viewed

@@ -1,63 +0,0 @@
-from typing import TYPE_CHECKING, Literal, Union, get_args, get_origin, get_type_hints
-from lamin_utils import colors
-from lamindb.errors import FieldValidationError
-if TYPE_CHECKING:
-    from .models import Record
-def validate_literal_fields(record: "Record", kwargs) -> None:
-    """Validate all Literal-typed fields of a record against the passed values.
-    Only values present in `kwargs` are checked; fields that are absent or
-    `None` are skipped.
-    Args:
-        record: record being validated; the type hints of its class define
-            which fields are Literal-typed and which values they allow
-        kwargs: field values keyed by field name
-    Raises:
-        FieldValidationError: If any field value is not in its Literal's allowed values
-    """
-    # check is based on string to avoid circular imports
-    if record.__class__.__name__ == "Feature":
-        # the FeatureDtype is more complicated than a simple literal
-        # because it allows constructs like cat[ULabel] etc.
-        # NOTE(review): an earlier comment also mentioned User, Storage & Source
-        # as problematic at startup, but only Feature is skipped here
-        return None
-    try:
-        type_hints = get_type_hints(record.__class__)
-    except TypeError:
-        # for 3.9, get_type_hints errors with | in type hints
-        return None
-    errors = []
-    for field_name, field_type in type_hints.items():
-        # Handle both plain Literal and Union/Optional Literal types
-        origin = get_origin(field_type)
-        if origin is Union:
-            # For Optional/Union types, find the Literal type if it exists
-            literal_type = next(
-                (t for t in get_args(field_type) if get_origin(t) is Literal), None
-            )
-        else:
-            # For plain types, check if it's a Literal
-            literal_type = field_type if origin is Literal else None
-        # Skip if no Literal type found
-        if literal_type is None:
-            continue
-        value = kwargs.get(field_name)
-        if value is None:
-            continue
-        # keep the tuple: membership then relies on equality only, so an
-        # unhashable value is reported as invalid instead of raising TypeError
-        valid_values = get_args(literal_type)
-        if value not in valid_values:
-            # stringify before sorting/joining so that non-string Literal
-            # values (ints, bools, ...) cannot break the error message itself
-            valid_values_str = ", ".join(sorted({str(v) for v in valid_values}))
-            errors.append(
-                f"{field_name}: {colors.yellow(value)} is not a valid value"
-                f"\n    → Valid values are: {colors.green(valid_values_str)}"
-            )
-    if errors:
-        raise FieldValidationError("\n  " + "".join(f"{error}\n  " for error in errors))

lamindb/core/_data.py DELETED Viewed

@@ -1,491 +0,0 @@
-from __future__ import annotations
-from collections import defaultdict
-from typing import TYPE_CHECKING
-from django.db import connections
-from lamin_utils import colors, logger
-from lamindb_setup.core._docs import doc_args
-from lamindb._query_set import QuerySet
-from lamindb.core._settings import settings
-from lamindb.models import (
-    Artifact,
-    Collection,
-    Feature,
-    Record,
-    Run,
-    Schema,
-    ULabel,
-    format_field_value,
-    record_repr,
-)
-from .._tracked import get_current_tracked_run
-from ..errors import ValidationError
-from ._context import context
-from ._django import get_artifact_with_related, get_related_model
-from ._feature_manager import (
-    add_label_feature_links,
-    get_host_id_field,
-    get_label_links,
-)
-from .relations import (
-    dict_module_name_to_model_name,
-    dict_related_model_to_related_name,
-)
-if TYPE_CHECKING:
-    from collections.abc import Iterable
-    from lamindb.base.types import StrField
-# logged by `get_run()` when no explicit, tracked, or context run can be resolved
-WARNING_RUN_TRANSFORM = "no run & transform got linked, call `ln.track()` & re-run"
-# logged by `_track_run_input()` when input tracking is enabled but no run exists
-WARNING_NO_INPUT = "run input wasn't tracked, call `ln.track()` and re-run"
-# also see current_run() in core._data
-def get_run(run: Run | None) -> Run | None:
-    """Resolve the run to attach to a record.
-    Args:
-        run: An explicit run, `None` to fall back to the tracked/context run,
-            or a falsy non-None value (e.g. `False`) to suppress the run.
-    Returns:
-        The resolved run, or `None` if no run is available or it was suppressed.
-    """
-    if run is not None:
-        # suppress run by passing False
-        return run if run else None
-    resolved = get_current_tracked_run()
-    if resolved is None:
-        resolved = context.run
-    if resolved is None and not settings.creation.artifact_silence_missing_run_warning:
-        logger.warning(WARNING_RUN_TRANSFORM)
-    return resolved
-def save_staged_feature_sets(self: Artifact | Collection) -> None:
-    """Save staged schemas that are still unsaved and unlink replaced ones.
-    Does nothing if the record has no `_staged_feature_sets` attribute. For
-    every staged slot, a schema already linked on the same slot is removed
-    from `self.feature_sets`.
-    """
-    if not hasattr(self, "_staged_feature_sets"):
-        return None
-    from lamindb.core._feature_manager import get_schema_by_slot_
-    linked_by_slot = get_schema_by_slot_(self)
-    newly_saved_slots = []
-    for slot, staged in self._staged_feature_sets.items():
-        if isinstance(staged, Schema) and staged._state.adding:
-            staged.save()
-            newly_saved_slots.append(slot)
-        if slot in linked_by_slot:
-            # remove existing feature set on the same slot
-            self.feature_sets.remove(linked_by_slot[slot])
-    if not newly_saved_slots:
-        return None
-    n_saved = len(newly_saved_slots)
-    plural = "s" if n_saved > 1 else ""
-    quoted_slots = ",".join(f"'{slot}'" for slot in newly_saved_slots)
-    logger.save(
-        f"saved {n_saved} feature set{plural} for slot{plural}:"
-        f" {quoted_slots}"
-    )
-def save_schema_links(self: Artifact | Collection) -> None:
-    """Bulk-create the link rows between this record and its staged schemas.
-    One link per staged slot is built through the `feature_sets` through-model
-    of the record's class; conflicts with existing rows are ignored.
-    """
-    from lamindb._save import bulk_create
-    if not hasattr(self, "_staged_feature_sets"):
-        return None
-    host_id_field = get_host_id_field(self)
-    links = [
-        self.__class__.feature_sets.through(
-            **{host_id_field: self.id, "schema_id": schema.id, "slot": slot}
-        )
-        for slot, schema in self._staged_feature_sets.items()
-    ]
-    bulk_create(links, ignore_conflicts=True)
-def format_provenance(self, fk_data, print_types):
-    """Render one line per foreign-key entry in `fk_data` that has a truthy "name".
-    Args:
-        fk_data: Mapping of field name to a dict that may hold a "name" entry.
-        print_types: If truthy, append the related model's class name to each field.
-    Returns:
-        The concatenated lines as a single string (empty if nothing qualifies).
-    """
-    def type_suffix(attr):
-        # the related model is only looked up when types are requested
-        if not print_types:
-            return ""
-        return f": {get_related_model(self.__class__, attr).__name__}"
-    lines = []
-    for field_name, value in fk_data.items():
-        name = value.get("name")
-        if not name:
-            continue
-        lines.append(
-            f"    .{field_name}{type_suffix(field_name)} = {format_field_value(name)}\n"
-        )
-    return "".join(lines)
-def format_input_of_runs(self, print_types):
-    """Render the `started_at` values of all runs that used this record as input.
-    Returns an empty string for a record without an id or without such runs.
-    """
-    if self.id is None or not self.input_of_runs.exists():
-        return ""
-    started = ", ".join(
-        format_field_value(run.started_at) for run in self.input_of_runs.all()
-    )
-    suffix = ": Run" if print_types else ""  # type: ignore
-    return f"    .input_of_runs{suffix} = {started}\n"
-def _describe_postgres(self: Artifact | Collection, print_types: bool = False):
-    """Build the description tree from the combined related-data query.
-    Args:
-        print_types: Accepted for signature parity with `_describe_sqlite`;
-            not used by this function.
-    Returns:
-        The result of `describe_features` for this record.
-    """
-    from ._describe import describe_general
-    from ._feature_manager import describe_features
-    # a header string used to be assembled here but was never used; it has
-    # been removed (it also relied on `str.lstrip`, which strips a character
-    # set, not a prefix)
-    if self.__class__.__name__ == "Artifact":
-        result = get_artifact_with_related(
-            self,
-            include_feature_link=True,
-            include_fk=True,
-            include_m2m=True,
-            include_schema=True,
-        )
-    else:
-        result = get_artifact_with_related(self, include_fk=True, include_m2m=True)
-    related_data = result.get("related_data", {})
-    # TODO: fk_data = related_data.get("fk", {})
-    tree = describe_general(self)
-    return describe_features(
-        self,
-        tree=tree,
-        related_data=related_data,
-        with_labels=True,
-        # params are only printed for records whose `kind` is "model"
-        print_params=hasattr(self, "kind") and self.kind == "model",
-    )
-def _describe_sqlite(self: Artifact | Collection, print_types: bool = False):
-    """Build the description tree using plain ORM queries.
-    For a saved record, the record is re-fetched once with its foreign keys
-    selected and its many-to-many relations prefetched.
-    Args:
-        print_types: Accepted for signature parity with `_describe_postgres`;
-            not used by this function.
-    Returns:
-        The result of `describe_features` for this record.
-    """
-    from ._describe import describe_general
-    from ._feature_manager import describe_features
-    if not self._state.adding:
-        foreign_key_fields = [f.name for f in self._meta.fields if f.is_relation]
-        many_to_many_fields = []
-        if isinstance(self, (Collection, Artifact)):
-            many_to_many_fields.append("input_of_runs")
-        if isinstance(self, Artifact):
-            many_to_many_fields.append("feature_sets")
-        # fetch once with both foreign keys and m-2-m relations loaded; two
-        # separate fetches would discard the select_related result of the first
-        self = (
-            self.__class__.objects.using(self._state.db)
-            .select_related(*foreign_key_fields)
-            .prefetch_related(*many_to_many_fields)
-            .get(id=self.id)
-        )
-    tree = describe_general(self)
-    return describe_features(
-        self,
-        tree=tree,
-        with_labels=True,
-        # must match `_describe_postgres`: params are printed for kind "model"
-        # (the previous comparison against "kind" could never be true)
-        print_params=hasattr(self, "kind") and self.kind == "model",
-    )
-@doc_args(Artifact.describe.__doc__)
-def describe(self: Artifact | Collection, print_types: bool = False):
-    """{}"""  # noqa: D415
-    from ._describe import print_rich_tree
-    # saved records on a postgres connection use the combined related-data
-    # query; everything else (unsaved records, other vendors) uses plain ORM queries
-    on_postgres = (
-        not self._state.adding
-        and connections[self._state.db].vendor == "postgresql"
-    )
-    build_tree = _describe_postgres if on_postgres else _describe_sqlite
-    print_rich_tree(build_tree(self, print_types=print_types))
-def validate_feature(feature: Feature, records: list[Record]) -> None:
-    """Check that `feature` is a saved `Feature` record.
-    Args:
-        feature: The feature to validate.
-        records: Label records; only used to suggest a categorical dtype in
-            the error message.
-    Raises:
-        TypeError: If `feature` is not a `Feature`.
-        ValidationError: If `feature` has not been saved yet.
-    """
-    if not isinstance(feature, Feature):
-        raise TypeError("feature has to be of type Feature")
-    if feature._state.adding:
-        # sorted so that the suggested dtype is deterministic across runs
-        registries = sorted(
-            {record.__class__.__get_name_with_module__() for record in records}
-        )
-        registries_str = "|".join(registries)
-        # the categorical spec is passed as `dtype`, matching the other
-        # messages in this module
-        msg = f"ln.Feature(name='{feature.name}', dtype='cat[{registries_str}]').save()"
-        raise ValidationError(f"Feature not validated. If it looks correct: {msg}")
-def get_labels(
-    self,
-    feature: Feature,
-    mute: bool = False,
-    flat_names: bool = False,
-) -> QuerySet | dict[str, QuerySet] | list:
-    """{}"""  # noqa: D415
-    # returns a single query set when the feature maps to exactly one resolved
-    # registry, a dict keyed by registry name otherwise, or a flat list of
-    # names when `flat_names` is set
-    if not isinstance(feature, Feature):
-        raise TypeError("feature has to be of type Feature")
-    dtype = feature.dtype
-    if dtype is None or not dtype.startswith("cat["):
-        raise ValueError("feature does not have linked labels")
-    registries_to_check = dtype.replace("cat[", "").rstrip("]").split("|")
-    if not mute and len(registries_to_check) > 1:
-        logger.warning("labels come from multiple registries!")
-    # return an empty query set if self.id is still None
-    if self.id is None:
-        return QuerySet(self.__class__)
-    accessor_by_registry = self.features._accessor_by_registry
-    qs_by_registry = {}
-    for registry in registries_to_check:
-        # currently need to distinguish between ULabel and non-ULabel, because
-        # we only have the feature information for Label
-        if registry == "ULabel":
-            label_ids = [
-                link.ulabel_id for link in get_label_links(self, registry, feature)
-            ]
-            qs_by_registry[registry] = ULabel.objects.using(self._state.db).filter(
-                id__in=label_ids
-            )
-        elif registry in accessor_by_registry:
-            qs_by_registry[registry] = getattr(
-                self, accessor_by_registry[registry]
-            ).all()
-    if flat_names:
-        # returns a flat list of names
-        from lamindb._record import get_name_field
-        names = []
-        for query_set in qs_by_registry.values():
-            names.extend(query_set.list(get_name_field(query_set)))
-        return names
-    if len(registries_to_check) == 1:
-        only_registry = registries_to_check[0]
-        if only_registry in qs_by_registry:
-            return qs_by_registry[only_registry]
-    return qs_by_registry
-def add_labels(
-    self,
-    records: Record | list[Record] | QuerySet | Iterable,
-    feature: Feature | None = None,
-    *,
-    field: StrField | None = None,
-    feature_ref_is_name: bool | None = None,
-    label_ref_is_name: bool | None = None,
-    from_curator: bool = False,
-) -> None:
-    """{}"""  # noqa: D415
-    # labels can only be linked to a host record that has already been saved
-    if self._state.adding:
-        raise ValueError("Please save the artifact/collection before adding a label!")
-    # normalize `records` to a plain list
-    if isinstance(records, (QuerySet, QuerySet.__base__)):  # need to have both
-        records = records.list()
-    if isinstance(records, (str, Record)):
-        records = [records]
-    if not isinstance(records, list):  # avoids warning for pd Series
-        records = list(records)
-    # create records from values
-    if len(records) == 0:
-        return None
-    # only the first element is inspected to decide whether values are strings
-    if isinstance(records[0], str):  # type: ignore
-        records_validated = []
-        # feature is needed if we want to create records from values
-        if feature is None:
-            raise ValueError(
-                "Please pass a feature, e.g., via: label = ln.ULabel(name='my_label',"
-                " feature=ln.Feature(name='my_feature'))"
-            )
-        if feature.dtype.startswith("cat["):
-            orm_dict = dict_module_name_to_model_name(Artifact)
-            for reg in feature.dtype.replace("cat[", "").rstrip("]").split("|"):
-                # NOTE(review): `.get` returns None for a registry name missing
-                # from the mapping, which would fail on `.from_values` — confirm
-                registry = orm_dict.get(reg)
-                records_validated += registry.from_values(records, field=field)
-        # feature doesn't have registries and therefore can't create records from values
-        # ask users to pass records
-        if len(records_validated) == 0:
-            raise ValueError(
-                "Please pass a record (a `Record` object), not a string, e.g., via:"
-                " label"
-                f" = ln.ULabel(name='{records[0]}')"  # type: ignore
-            )
-        records = records_validated
-    # every label record must be saved before it can be linked
-    for record in records:
-        if record._state.adding:
-            raise ValidationError(
-                f"{record} not validated. If it looks correct: record.save()"
-            )
-    if feature is None:
-        # no feature: link labels directly through the matching related manager
-        d = dict_related_model_to_related_name(self.__class__)
-        # strategy: group records by registry to reduce number of transactions
-        records_by_related_name: dict = {}
-        for record in records:
-            related_name = d.get(record.__class__.__get_name_with_module__())
-            if related_name is None:
-                raise ValueError(f"Can't add labels to {record.__class__} record!")
-            if related_name not in records_by_related_name:
-                records_by_related_name[related_name] = []
-            records_by_related_name[related_name].append(record)
-        # note: the loop variable rebinds the outer `records` name
-        for related_name, records in records_by_related_name.items():
-            getattr(self, related_name).add(*records)
-    else:
-        validate_feature(feature, records)  # type:ignore
-        records_by_registry = defaultdict(list)
-        # collect the names of features that are members of linked schemas
-        # with itype "Feature"; these are treated as internal features below
-        feature_sets = self.feature_sets.filter(itype="Feature").all()
-        internal_features = set()  # type: ignore
-        if len(feature_sets) > 0:
-            for schema in feature_sets:
-                internal_features = internal_features.union(
-                    set(schema.members.values_list("name", flat=True))
-                )  # type: ignore
-        for record in records:
-            records_by_registry[record.__class__.__get_name_with_module__()].append(
-                record
-            )
-        # note: the loop variable rebinds the outer `records` name
-        for registry_name, records in records_by_registry.items():
-            # internal features may only be annotated when called from a curator
-            if not from_curator and feature.name in internal_features:
-                raise ValidationError(
-                    "Cannot manually annotate internal feature with label. Please use ln.Curator"
-                )
-            if registry_name not in feature.dtype:
-                if not feature.dtype.startswith("cat"):
-                    raise ValidationError(
-                        f"Feature {feature.name} needs dtype='cat' for label annotation, currently has dtype='{feature.dtype}'"
-                    )
-                if feature.dtype == "cat":
-                    # a bare "cat" dtype is narrowed to the first registry seen and persisted
-                    feature.dtype = f"cat[{registry_name}]"  # type: ignore
-                    feature.save()
-                # this condition repeats the enclosing check; dtype is unchanged
-                # on this path, so the branch is always taken when reached
-                elif registry_name not in feature.dtype:
-                    new_dtype = feature.dtype.rstrip("]") + f"|{registry_name}]"
-                    raise ValidationError(
-                        f"Label type {registry_name} is not valid for Feature(name='{feature.name}', dtype='{feature.dtype}'), consider updating to dtype='{new_dtype}'"
-                    )
-            # registries without an accessor on the feature manager are skipped
-            if registry_name not in self.features._accessor_by_registry:
-                logger.warning(f"skipping {registry_name}")
-                continue
-            if len(records) == 0:
-                continue
-            # pair every label with the feature and hand over to the link helper
-            features_labels = {
-                registry_name: [(feature, label_record) for label_record in records]
-            }
-            add_label_feature_links(
-                self.features,
-                features_labels,
-                feature_ref_is_name=feature_ref_is_name,
-                label_ref_is_name=label_ref_is_name,
-            )
-def _track_run_input(
-    data: Artifact | Collection | Iterable[Artifact] | Iterable[Collection],
-    is_run_input: bool | Run | None = None,
-    run: Run | None = None,
-):
-    """Link artifacts or collections as inputs of a run.
-    Args:
-        data: A single artifact/collection or an iterable of them.
-        is_run_input: `True`/`False` forces or suppresses tracking, a `Run`
-            forces tracking against that run, `None` defers to
-            `settings.track_run_inputs`.
-        run: The run to link against; if `None` (and `is_run_input` is not a
-            `Run`), the tracked run or `context.run` is used.
-    Raises:
-        ValueError: If tracking is requested but no run is available.
-    """
-    if isinstance(is_run_input, Run):
-        run = is_run_input
-        is_run_input = True
-    elif run is None:
-        run = get_current_tracked_run()
-        if run is None:
-            run = context.run
-    # consider that data is an iterable of Data
-    data_iter: Iterable[Artifact] | Iterable[Collection] = (
-        [data] if isinstance(data, (Artifact, Collection)) else data
-    )
-    track_run_input = False
-    input_data = []
-    # initialized up front so later branches never hit an unbound name
-    input_data_ids = []
-    data_class_name = None
-    if run is not None:
-        # avoid cycles: data can't be both input and output
-        def is_valid_input(data: Artifact | Collection):
-            is_valid = False
-            if data._state.db == "default":
-                # things are OK if the record is on the default db
-                is_valid = True
-            elif data._state.db is None:
-                # if a record is not yet saved, it can't be an input
-                # we silently ignore because what likely happens is that
-                # the user works with an object that's about to be saved
-                # in the current Python session
-                is_valid = False
-            else:
-                # record is on another db
-                # we have to save the record into the current db with
-                # the run being attached to a transfer transform
-                logger.important(
-                    f"completing transfer to track {data.__class__.__name__}('{data.uid[:8]}') as input"
-                )
-                data.save()
-                is_valid = True
-            return (
-                data.run_id != run.id
-                and not data._state.adding  # this seems duplicated with data._state.db is None
-                and is_valid
-            )
-        input_data = [data for data in data_iter if is_valid_input(data)]
-        input_data_ids = [data.id for data in input_data]
-    if input_data:
-        # the first element decides whether artifact or collection links are created
-        data_class_name = input_data[0].__class__.__name__.lower()
-    # let us first look at the case in which the user does not
-    # provide a boolean value for `is_run_input`
-    # hence, we need to determine whether we actually want to
-    # track a run or not
-    if is_run_input is None:
-        # we don't have a run record
-        if run is None:
-            if settings.track_run_inputs:
-                logger.warning(WARNING_NO_INPUT)
-        # assume we have a run record and non-cyclic candidate input data
-        elif input_data:
-            if settings.track_run_inputs:
-                transform_note = ""
-                if len(input_data) == 1:
-                    if input_data[0].transform is not None:
-                        transform_note = (
-                            ", adding parent transform"
-                            f" {input_data[0].transform.id}"
-                        )
-                logger.info(
-                    f"adding {data_class_name} ids {input_data_ids} as inputs for run"
-                    f" {run.id}{transform_note}"
-                )
-                track_run_input = True
-            else:
-                logger.hint(
-                    "track these data as a run input by passing `is_run_input=True`"
-                )
-    else:
-        track_run_input = is_run_input
-    if not track_run_input:
-        return None
-    if run is None:
-        raise ValueError("No run context set. Call `ln.track()`.")
-    if not input_data:
-        # tracking was forced but every candidate was filtered out (unsaved,
-        # or produced by this very run): there is nothing to link; this
-        # path used to fail with UnboundLocalError on `data_class_name`
-        return None
-    # avoid adding the same run twice
-    run.save()
-    if data_class_name == "artifact":
-        LinkORM = run.input_artifacts.through
-        links = [
-            LinkORM(run_id=run.id, artifact_id=data_id)
-            for data_id in input_data_ids
-        ]
-    else:
-        LinkORM = run.input_collections.through
-        links = [
-            LinkORM(run_id=run.id, collection_id=data_id)
-            for data_id in input_data_ids
-        ]
-    LinkORM.objects.bulk_create(links, ignore_conflicts=True)
-    # generalize below for more than one data batch
-    if len(input_data) == 1:
-        if input_data[0].transform is not None:
-            run.transform.predecessors.add(input_data[0].transform)

lamindb/core/fields.py DELETED Viewed

@@ -1,12 +0,0 @@
-"""Fields.
-The field accessor of a :class:`~lamindb.core.Record`:
-.. autosummary::
-   :toctree: .
-   FieldAttr
-"""
-from lamindb.base.types import FieldAttr  # noqa: F401

lamindb 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

lamindb 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl