PyPI - lamindb - Versions diffs - 1.6.2__py3-none-any.whl → 1.7.0__py3-none-any.whl - Mend

lamindb 1.6.2__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62) hide show

lamindb/__init__.py +1 -3
lamindb/_finish.py +32 -16
lamindb/base/types.py +6 -4
lamindb/core/_context.py +127 -57
lamindb/core/_mapped_collection.py +1 -1
lamindb/core/_settings.py +44 -4
lamindb/core/_track_environment.py +5 -2
lamindb/core/loaders.py +1 -1
lamindb/core/storage/_anndata_accessor.py +1 -1
lamindb/core/storage/_tiledbsoma.py +14 -8
lamindb/core/storage/_valid_suffixes.py +0 -1
lamindb/core/storage/_zarr.py +1 -1
lamindb/core/storage/objects.py +13 -8
lamindb/core/storage/paths.py +9 -6
lamindb/core/types.py +1 -1
lamindb/curators/_legacy.py +2 -1
lamindb/curators/core.py +106 -105
lamindb/errors.py +9 -0
lamindb/examples/fixtures/__init__.py +0 -0
lamindb/examples/fixtures/sheets.py +224 -0
lamindb/migrations/0103_remove_writelog_migration_state_and_more.py +1 -1
lamindb/migrations/0105_record_unique_name.py +20 -0
lamindb/migrations/0106_transfer_data_migration.py +25 -0
lamindb/migrations/0107_add_schema_to_record.py +68 -0
lamindb/migrations/0108_remove_record_sheet_remove_sheetproject_sheet_and_more.py +30 -0
lamindb/migrations/0109_record_input_of_runs_alter_record_run_and_more.py +123 -0
lamindb/migrations/0110_rename_values_artifacts_record_linked_artifacts.py +17 -0
lamindb/migrations/0111_remove_record__sort_order.py +148 -0
lamindb/migrations/0112_alter_recordartifact_feature_and_more.py +105 -0
lamindb/migrations/0113_lower_case_branch_and_space_names.py +62 -0
lamindb/migrations/0114_alter_run__status_code.py +24 -0
lamindb/migrations/0115_alter_space_uid.py +52 -0
lamindb/migrations/{0104_squashed.py → 0115_squashed.py} +261 -257
lamindb/models/__init__.py +4 -3
lamindb/models/_describe.py +88 -31
lamindb/models/_feature_manager.py +627 -658
lamindb/models/_label_manager.py +1 -3
lamindb/models/artifact.py +214 -99
lamindb/models/collection.py +7 -1
lamindb/models/feature.py +288 -60
lamindb/models/has_parents.py +3 -3
lamindb/models/project.py +32 -15
lamindb/models/query_manager.py +7 -1
lamindb/models/query_set.py +118 -41
lamindb/models/record.py +140 -94
lamindb/models/run.py +42 -42
lamindb/models/save.py +102 -16
lamindb/models/schema.py +41 -8
lamindb/models/sqlrecord.py +105 -40
lamindb/models/storage.py +278 -0
lamindb/models/transform.py +10 -2
lamindb/models/ulabel.py +9 -1
lamindb/py.typed +0 -0
lamindb/setup/__init__.py +2 -1
lamindb/setup/_switch.py +16 -0
lamindb/setup/errors/__init__.py +4 -0
lamindb/setup/types/__init__.py +4 -0
{lamindb-1.6.2.dist-info → lamindb-1.7.0.dist-info}/METADATA +5 -5
{lamindb-1.6.2.dist-info → lamindb-1.7.0.dist-info}/RECORD +61 -44
lamindb/models/core.py +0 -135
{lamindb-1.6.2.dist-info → lamindb-1.7.0.dist-info}/LICENSE +0 -0
{lamindb-1.6.2.dist-info → lamindb-1.7.0.dist-info}/WHEEL +0 -0

lamindb/models/query_set.py CHANGED Viewed

@@ -19,7 +19,7 @@ from ..errors import DoesNotExist
 from ._is_versioned import IsVersioned
 from .can_curate import CanCurate, _inspect, _standardize, _validate
 from .query_manager import _lookup, _search
-from .sqlrecord import SQLRecord
+from .sqlrecord import Registry, SQLRecord
 if TYPE_CHECKING:
     from lamindb.base.types import ListLike, StrField
@@ -62,8 +62,12 @@ def get_keys_from_df(data: list, registry: SQLRecord) -> list[str]:
     return keys
-def one_helper(self, does_not_exist_msg: str | None = None):
-    if len(self) == 0:
+def one_helper(self: QuerySet | SQLRecordList, does_not_exist_msg: str | None = None):
+    if isinstance(self, SQLRecord):
+        not_exists = len(self) == 0
+    else:
+        not_exists = not self.exists()  # type: ignore
+    if not_exists:
         raise DoesNotExist(does_not_exist_msg)
     elif len(self) > 1:
         raise MultipleResultsFound(self)
@@ -142,20 +146,19 @@ def process_expressions(queryset: QuerySet, expressions: dict) -> dict:
     )
     if issubclass(queryset.model, SQLRecord):
-        # branch_id is set to 0 unless expressions contains id or uid
+        # branch_id is set to 1 unless expressions contains id or uid
         if not (
             "id" in expressions
             or "uid" in expressions
             or "uid__startswith" in expressions
         ):
-            branch_id = "branch_id"
-            if not any(e.startswith(branch_id) for e in expressions):
-                expressions[branch_id] = 1  # default branch_id
+            if not any(e.startswith("branch_id") for e in expressions):
+                expressions["branch_id"] = 1  # default branch_id
             # if branch_id is None, do not apply a filter
             # otherwise, it would mean filtering for NULL values, which doesn't make
             # sense for a non-NULLABLE column
-            elif branch_id in expressions and expressions[branch_id] is None:
-                expressions.pop(branch_id)
+            elif "branch_id" in expressions and expressions["branch_id"] is None:
+                expressions.pop("branch_id")
     if queryset._db is not None:
         # only check for database mismatch if there is a defined database on the
         # queryset
@@ -257,7 +260,7 @@ class SQLRecordList(UserList, Generic[T]):
 def get_basic_field_names(
     qs: QuerySet,
     include: list[str],
-    features_input: bool | list[str],
+    features_input: bool | list[str] | str,
 ) -> list[str]:
     exclude_field_names = ["updated_at"]
     field_names = [
@@ -300,24 +303,54 @@ def get_basic_field_names(
 def get_feature_annotate_kwargs(
-    features: bool | list[str] | None,
+    registry: Registry,
+    features: bool | list[str] | str | None,
+    qs: QuerySet | None = None,
 ) -> tuple[dict[str, Any], list[str], QuerySet]:
     from lamindb.models import (
         Artifact,
         Feature,
+        Record,
+        RecordJson,
     )
+    if registry not in {Artifact, Record}:
+        raise ValueError(
+            f"features=True is only applicable for Artifact and Record, not {registry.__name__}"
+        )
+    if features == "queryset":
+        ids_list = qs.values_list("id", flat=True)
+        feature_names = []
+        for obj in registry._meta.related_objects:
+            if not hasattr(getattr(registry, obj.related_name), "through"):
+                continue
+            links = getattr(registry, obj.related_name).through.filter(
+                **{registry.__name__.lower() + "_id__in": ids_list}
+            )
+            feature_names_for_link_model = links.values_list("feature__name", flat=True)
+            feature_names += feature_names_for_link_model
+        if registry is Record:
+            # this request is not strictly necessary, but it makes the resulting reshaped
+            # dataframe consistent
+            feature_names += RecordJson.filter(record_id__in=ids_list).values_list(
+                "feature__name", flat=True
+            )
+        features = list(set(feature_names))  # remove duplicates
     feature_qs = Feature.filter()
     if isinstance(features, list):
         feature_qs = feature_qs.filter(name__in=features)
         feature_names = features
     else:  # features is True -- only consider categorical features from ULabel and non-categorical features
         feature_qs = feature_qs.filter(
-            Q(~Q(dtype__startswith="cat[")) | Q(dtype__startswith="cat[ULabel")
+            Q(~Q(dtype__startswith="cat["))
+            | Q(dtype__startswith="cat[ULabel")
+            | Q(dtype__startswith="cat[Record")
         )
         feature_names = feature_qs.list("name")
         logger.important(
-            f"queried for all categorical features with dtype 'cat[ULabel...'] and non-categorical features: ({len(feature_names)}) {feature_names}"
+            f"queried for all categorical features with dtype ULabel or Record and non-categorical features: ({len(feature_names)}) {feature_names}"
         )
     # Get the categorical features
     cat_feature_types = {
@@ -328,18 +361,28 @@ def get_feature_annotate_kwargs(
     # Get relationships of labels and features
     link_models_on_models = {
         getattr(
-            Artifact, obj.related_name
+            registry, obj.related_name
         ).through.__get_name_with_module__(): obj.related_model.__get_name_with_module__()
-        for obj in Artifact._meta.related_objects
+        for obj in registry._meta.related_objects
         if obj.related_model.__get_name_with_module__() in cat_feature_types
     }
-    link_models_on_models["ArtifactULabel"] = "ULabel"
+    if registry is Artifact:
+        link_models_on_models["ArtifactULabel"] = "ULabel"
+    else:
+        link_models_on_models["RecordRecord"] = "Record"
     link_attributes_on_models = {
         obj.related_name: link_models_on_models[
             obj.related_model.__get_name_with_module__()
         ]
-        for obj in Artifact._meta.related_objects
-        if obj.related_model.__get_name_with_module__() in link_models_on_models
+        for obj in registry._meta.related_objects
+        if (
+            obj.related_model.__get_name_with_module__() in link_models_on_models
+            and (
+                not obj.related_name.startswith("links_record")
+                if registry is Record
+                else True
+            )
+        )
     }
     # Prepare Django's annotate for features
     annotate_kwargs = {}
@@ -347,17 +390,22 @@ def get_feature_annotate_kwargs(
         annotate_kwargs[f"{link_attr}__feature__name"] = F(
             f"{link_attr}__feature__name"
         )
-        field_name = (
-            feature_type.split(".")[1] if "." in feature_type else feature_type
-        ).lower()
+        if registry is Artifact:
+            field_name = (
+                feature_type.split(".")[1] if "." in feature_type else feature_type
+            ).lower()
+        else:
+            field_name = "value"
         annotate_kwargs[f"{link_attr}__{field_name}__name"] = F(
             f"{link_attr}__{field_name}__name"
         )
-    annotate_kwargs["_feature_values__feature__name"] = F(
-        "_feature_values__feature__name"
+    json_values_attribute = "_feature_values" if registry is Artifact else "values_json"
+    annotate_kwargs[f"{json_values_attribute}__feature__name"] = F(
+        f"{json_values_attribute}__feature__name"
+    )
+    annotate_kwargs[f"{json_values_attribute}__value"] = F(
+        f"{json_values_attribute}__value"
     )
-    annotate_kwargs["_feature_values__value"] = F("_feature_values__value")
     return annotate_kwargs, feature_names, feature_qs
@@ -412,7 +460,9 @@ def analyze_lookup_cardinality(
     return result
-def reorder_subset_columns_in_df(df: pd.DataFrame, column_order: list[str], position=3):
+def reorder_subset_columns_in_df(
+    df: pd.DataFrame, column_order: list[str], position=3
+) -> pd.DataFrame:
     valid_columns = [col for col in column_order if col in df.columns]
     all_cols = df.columns.tolist()
     remaining_cols = [col for col in all_cols if col not in valid_columns]
@@ -423,6 +473,7 @@ def reorder_subset_columns_in_df(df: pd.DataFrame, column_order: list[str], posi
 # https://lamin.ai/laminlabs/lamindata/transform/BblTiuKxsb2g0003
 # https://claude.ai/chat/6ea2498c-944d-4e7a-af08-29e5ddf637d2
 def reshape_annotate_result(
+    registry: Registry,
     df: pd.DataFrame,
     field_names: list[str],
     cols_from_include: dict[str, str] | None,
@@ -438,29 +489,38 @@ def reshape_annotate_result(
             e.g., {'ulabels__name': 'many', 'created_by__name': 'one'}
         feature_names: Feature names.
     """
+    from lamindb.models import Artifact
     cols_from_include = cols_from_include or {}
+    json_values_attribute = "_feature_values" if registry is Artifact else "values_json"
     # initialize result with basic fields, need a copy as we're modifying it
     # will give us warnings otherwise
     result = df[field_names].copy()
     # process features if requested
     if feature_names:
-        # handle feature_values
-        feature_cols = ["_feature_values__feature__name", "_feature_values__value"]
+        # handle json values
+        feature_cols = [
+            f"{json_values_attribute}__feature__name",
+            f"{json_values_attribute}__value",
+        ]
         if all(col in df.columns for col in feature_cols):
             # Create two separate dataframes - one for dict values and one for non-dict values
-            is_dict = df["_feature_values__value"].apply(lambda x: isinstance(x, dict))
+            is_dict = df[f"{json_values_attribute}__value"].apply(
+                lambda x: isinstance(x, dict)
+            )
             dict_df, non_dict_df = df[is_dict], df[~is_dict]
             # Process non-dict values using set aggregation
             non_dict_features = non_dict_df.groupby(
-                ["id", "_feature_values__feature__name"]
-            )["_feature_values__value"].agg(set)
+                ["id", f"{json_values_attribute}__feature__name"]
+            )[f"{json_values_attribute}__value"].agg(set)
             # Process dict values using first aggregation
-            dict_features = dict_df.groupby(["id", "_feature_values__feature__name"])[
-                "_feature_values__value"
-            ].agg("first")
+            dict_features = dict_df.groupby(
+                ["id", f"{json_values_attribute}__feature__name"]
+            )[f"{json_values_attribute}__value"].agg("first")
             # Combine the results
             combined_features = pd.concat([non_dict_features, dict_features])
@@ -474,10 +534,11 @@ def reshape_annotate_result(
                 )
         # handle categorical features
+        links_prefix = "links_" if registry is Artifact else ("links_", "values_")
         links_features = [
             col
             for col in df.columns
-            if "feature__name" in col and col.startswith("links_")
+            if "feature__name" in col and col.startswith(links_prefix)
         ]
         if links_features:
@@ -501,6 +562,20 @@ def reshape_annotate_result(
                 result[feature.name] = result[feature.name].apply(
                     extract_single_element
                 )
+                if feature.dtype.startswith("cat"):
+                    try:
+                        # Try to convert to category - this will fail if complex objects remain
+                        result[feature.name] = result[feature.name].astype("category")
+                    except (TypeError, ValueError):
+                        # If conversion fails, the column still contains complex objects
+                        pass
+                if feature.dtype.startswith("datetime"):
+                    try:
+                        # Try to convert to category - this will fail if complex objects remain
+                        result[feature.name] = pd.to_datetime(result[feature.name])
+                    except (TypeError, ValueError):
+                        # If conversion fails, the column still contains complex objects
+                        pass
         # sort columns
         result = reorder_subset_columns_in_df(result, feature_names)
@@ -520,12 +595,14 @@ def process_links_features(
     """Process links_XXX feature columns."""
     # this loops over different entities that might be linked under a feature
     for feature_col in feature_cols:
-        prefix = re.match(r"links_(.+?)__feature__name", feature_col).group(1)
+        links_attribute = "links_" if feature_col.startswith("links_") else "values_"
+        regex = f"{links_attribute}(.+?)__feature__name"
+        prefix = re.match(regex, feature_col).group(1)
         value_cols = [
             col
             for col in df.columns
-            if col.startswith(f"links_{prefix}__")
+            if col.startswith(f"{links_attribute}{prefix}__")
             and col.endswith("__name")
             and "feature__name" not in col
         ]
@@ -598,7 +675,7 @@ class BasicQuerySet(models.QuerySet):
     def df(
         self,
         include: str | list[str] | None = None,
-        features: bool | list[str] | None = None,
+        features: bool | list[str] | str | None = None,
     ) -> pd.DataFrame:
         """{}"""  # noqa: D415
         time = datetime.now(timezone.utc)
@@ -617,7 +694,7 @@ class BasicQuerySet(models.QuerySet):
         feature_qs = None
         if features:
             feature_annotate_kwargs, feature_names, feature_qs = (
-                get_feature_annotate_kwargs(features)
+                get_feature_annotate_kwargs(self.model, features, self)
             )
             time = logger.debug("finished feature_annotate_kwargs", time=time)
             annotate_kwargs.update(feature_annotate_kwargs)
@@ -652,7 +729,7 @@ class BasicQuerySet(models.QuerySet):
         cols_from_include = analyze_lookup_cardinality(self.model, include_input)  # type: ignore
         time = logger.debug("finished analyze_lookup_cardinality", time=time)
         df_reshaped = reshape_annotate_result(
-            df, field_names, cols_from_include, feature_names, feature_qs
+            self.model, df, field_names, cols_from_include, feature_names, feature_qs
         )
         time = logger.debug("finished reshape_annotate_result", time=time)
         pk_name = self.model._meta.pk.name
@@ -710,7 +787,7 @@ class BasicQuerySet(models.QuerySet):
             >>> ULabel.filter(name="benchmark").one_or_none()
             >>> ULabel.filter(name="non existing label").one_or_none()
         """
-        if len(self) == 0:
+        if not self.exists():
             return None
         elif len(self) == 1:
             return self[0]

lamindb 1.6.2__py3-none-any.whl → 1.7.0__py3-none-any.whl

lamindb 1.6.2__py3-none-any.whl → 1.7.0__py3-none-any.whl