lamindb 1.6.2__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +1 -3
- lamindb/_finish.py +32 -16
- lamindb/base/types.py +6 -4
- lamindb/core/_context.py +127 -57
- lamindb/core/_mapped_collection.py +1 -1
- lamindb/core/_settings.py +44 -4
- lamindb/core/_track_environment.py +5 -2
- lamindb/core/loaders.py +1 -1
- lamindb/core/storage/_anndata_accessor.py +1 -1
- lamindb/core/storage/_tiledbsoma.py +14 -8
- lamindb/core/storage/_valid_suffixes.py +0 -1
- lamindb/core/storage/_zarr.py +1 -1
- lamindb/core/storage/objects.py +13 -8
- lamindb/core/storage/paths.py +9 -6
- lamindb/core/types.py +1 -1
- lamindb/curators/_legacy.py +2 -1
- lamindb/curators/core.py +106 -105
- lamindb/errors.py +9 -0
- lamindb/examples/fixtures/__init__.py +0 -0
- lamindb/examples/fixtures/sheets.py +224 -0
- lamindb/migrations/0103_remove_writelog_migration_state_and_more.py +1 -1
- lamindb/migrations/0105_record_unique_name.py +20 -0
- lamindb/migrations/0106_transfer_data_migration.py +25 -0
- lamindb/migrations/0107_add_schema_to_record.py +68 -0
- lamindb/migrations/0108_remove_record_sheet_remove_sheetproject_sheet_and_more.py +30 -0
- lamindb/migrations/0109_record_input_of_runs_alter_record_run_and_more.py +123 -0
- lamindb/migrations/0110_rename_values_artifacts_record_linked_artifacts.py +17 -0
- lamindb/migrations/0111_remove_record__sort_order.py +148 -0
- lamindb/migrations/0112_alter_recordartifact_feature_and_more.py +105 -0
- lamindb/migrations/0113_lower_case_branch_and_space_names.py +62 -0
- lamindb/migrations/0114_alter_run__status_code.py +24 -0
- lamindb/migrations/0115_alter_space_uid.py +52 -0
- lamindb/migrations/{0104_squashed.py → 0115_squashed.py} +261 -257
- lamindb/models/__init__.py +4 -3
- lamindb/models/_describe.py +88 -31
- lamindb/models/_feature_manager.py +627 -658
- lamindb/models/_label_manager.py +1 -3
- lamindb/models/artifact.py +214 -99
- lamindb/models/collection.py +7 -1
- lamindb/models/feature.py +288 -60
- lamindb/models/has_parents.py +3 -3
- lamindb/models/project.py +32 -15
- lamindb/models/query_manager.py +7 -1
- lamindb/models/query_set.py +118 -41
- lamindb/models/record.py +140 -94
- lamindb/models/run.py +42 -42
- lamindb/models/save.py +102 -16
- lamindb/models/schema.py +41 -8
- lamindb/models/sqlrecord.py +105 -40
- lamindb/models/storage.py +278 -0
- lamindb/models/transform.py +10 -2
- lamindb/models/ulabel.py +9 -1
- lamindb/py.typed +0 -0
- lamindb/setup/__init__.py +2 -1
- lamindb/setup/_switch.py +16 -0
- lamindb/setup/errors/__init__.py +4 -0
- lamindb/setup/types/__init__.py +4 -0
- {lamindb-1.6.2.dist-info → lamindb-1.7.0.dist-info}/METADATA +5 -5
- {lamindb-1.6.2.dist-info → lamindb-1.7.0.dist-info}/RECORD +61 -44
- lamindb/models/core.py +0 -135
- {lamindb-1.6.2.dist-info → lamindb-1.7.0.dist-info}/LICENSE +0 -0
- {lamindb-1.6.2.dist-info → lamindb-1.7.0.dist-info}/WHEEL +0 -0
lamindb/models/query_set.py
CHANGED
@@ -19,7 +19,7 @@ from ..errors import DoesNotExist
|
|
19
19
|
from ._is_versioned import IsVersioned
|
20
20
|
from .can_curate import CanCurate, _inspect, _standardize, _validate
|
21
21
|
from .query_manager import _lookup, _search
|
22
|
-
from .sqlrecord import SQLRecord
|
22
|
+
from .sqlrecord import Registry, SQLRecord
|
23
23
|
|
24
24
|
if TYPE_CHECKING:
|
25
25
|
from lamindb.base.types import ListLike, StrField
|
@@ -62,8 +62,12 @@ def get_keys_from_df(data: list, registry: SQLRecord) -> list[str]:
|
|
62
62
|
return keys
|
63
63
|
|
64
64
|
|
65
|
-
def one_helper(self, does_not_exist_msg: str | None = None):
|
66
|
-
if
|
65
|
+
def one_helper(self: QuerySet | SQLRecordList, does_not_exist_msg: str | None = None):
|
66
|
+
if isinstance(self, SQLRecord):
|
67
|
+
not_exists = len(self) == 0
|
68
|
+
else:
|
69
|
+
not_exists = not self.exists() # type: ignore
|
70
|
+
if not_exists:
|
67
71
|
raise DoesNotExist(does_not_exist_msg)
|
68
72
|
elif len(self) > 1:
|
69
73
|
raise MultipleResultsFound(self)
|
@@ -142,20 +146,19 @@ def process_expressions(queryset: QuerySet, expressions: dict) -> dict:
|
|
142
146
|
)
|
143
147
|
|
144
148
|
if issubclass(queryset.model, SQLRecord):
|
145
|
-
# branch_id is set to
|
149
|
+
# branch_id is set to 1 unless expressions contains id or uid
|
146
150
|
if not (
|
147
151
|
"id" in expressions
|
148
152
|
or "uid" in expressions
|
149
153
|
or "uid__startswith" in expressions
|
150
154
|
):
|
151
|
-
|
152
|
-
|
153
|
-
expressions[branch_id] = 1 # default branch_id
|
155
|
+
if not any(e.startswith("branch_id") for e in expressions):
|
156
|
+
expressions["branch_id"] = 1 # default branch_id
|
154
157
|
# if branch_id is None, do not apply a filter
|
155
158
|
# otherwise, it would mean filtering for NULL values, which doesn't make
|
156
159
|
# sense for a non-NULLABLE column
|
157
|
-
elif branch_id in expressions and expressions[branch_id] is None:
|
158
|
-
expressions.pop(branch_id)
|
160
|
+
elif "branch_id" in expressions and expressions["branch_id"] is None:
|
161
|
+
expressions.pop("branch_id")
|
159
162
|
if queryset._db is not None:
|
160
163
|
# only check for database mismatch if there is a defined database on the
|
161
164
|
# queryset
|
@@ -257,7 +260,7 @@ class SQLRecordList(UserList, Generic[T]):
|
|
257
260
|
def get_basic_field_names(
|
258
261
|
qs: QuerySet,
|
259
262
|
include: list[str],
|
260
|
-
features_input: bool | list[str],
|
263
|
+
features_input: bool | list[str] | str,
|
261
264
|
) -> list[str]:
|
262
265
|
exclude_field_names = ["updated_at"]
|
263
266
|
field_names = [
|
@@ -300,24 +303,54 @@ def get_basic_field_names(
|
|
300
303
|
|
301
304
|
|
302
305
|
def get_feature_annotate_kwargs(
|
303
|
-
|
306
|
+
registry: Registry,
|
307
|
+
features: bool | list[str] | str | None,
|
308
|
+
qs: QuerySet | None = None,
|
304
309
|
) -> tuple[dict[str, Any], list[str], QuerySet]:
|
305
310
|
from lamindb.models import (
|
306
311
|
Artifact,
|
307
312
|
Feature,
|
313
|
+
Record,
|
314
|
+
RecordJson,
|
308
315
|
)
|
309
316
|
|
317
|
+
if registry not in {Artifact, Record}:
|
318
|
+
raise ValueError(
|
319
|
+
f"features=True is only applicable for Artifact and Record, not {registry.__name__}"
|
320
|
+
)
|
321
|
+
|
322
|
+
if features == "queryset":
|
323
|
+
ids_list = qs.values_list("id", flat=True)
|
324
|
+
feature_names = []
|
325
|
+
for obj in registry._meta.related_objects:
|
326
|
+
if not hasattr(getattr(registry, obj.related_name), "through"):
|
327
|
+
continue
|
328
|
+
links = getattr(registry, obj.related_name).through.filter(
|
329
|
+
**{registry.__name__.lower() + "_id__in": ids_list}
|
330
|
+
)
|
331
|
+
feature_names_for_link_model = links.values_list("feature__name", flat=True)
|
332
|
+
feature_names += feature_names_for_link_model
|
333
|
+
if registry is Record:
|
334
|
+
# this request is not strictly necessary, but it makes the resulting reshaped
|
335
|
+
# dataframe consistent
|
336
|
+
feature_names += RecordJson.filter(record_id__in=ids_list).values_list(
|
337
|
+
"feature__name", flat=True
|
338
|
+
)
|
339
|
+
features = list(set(feature_names)) # remove duplicates
|
340
|
+
|
310
341
|
feature_qs = Feature.filter()
|
311
342
|
if isinstance(features, list):
|
312
343
|
feature_qs = feature_qs.filter(name__in=features)
|
313
344
|
feature_names = features
|
314
345
|
else: # features is True -- only consider categorical features from ULabel and non-categorical features
|
315
346
|
feature_qs = feature_qs.filter(
|
316
|
-
Q(~Q(dtype__startswith="cat["))
|
347
|
+
Q(~Q(dtype__startswith="cat["))
|
348
|
+
| Q(dtype__startswith="cat[ULabel")
|
349
|
+
| Q(dtype__startswith="cat[Record")
|
317
350
|
)
|
318
351
|
feature_names = feature_qs.list("name")
|
319
352
|
logger.important(
|
320
|
-
f"queried for all categorical features with dtype
|
353
|
+
f"queried for all categorical features with dtype ULabel or Record and non-categorical features: ({len(feature_names)}) {feature_names}"
|
321
354
|
)
|
322
355
|
# Get the categorical features
|
323
356
|
cat_feature_types = {
|
@@ -328,18 +361,28 @@ def get_feature_annotate_kwargs(
|
|
328
361
|
# Get relationships of labels and features
|
329
362
|
link_models_on_models = {
|
330
363
|
getattr(
|
331
|
-
|
364
|
+
registry, obj.related_name
|
332
365
|
).through.__get_name_with_module__(): obj.related_model.__get_name_with_module__()
|
333
|
-
for obj in
|
366
|
+
for obj in registry._meta.related_objects
|
334
367
|
if obj.related_model.__get_name_with_module__() in cat_feature_types
|
335
368
|
}
|
336
|
-
|
369
|
+
if registry is Artifact:
|
370
|
+
link_models_on_models["ArtifactULabel"] = "ULabel"
|
371
|
+
else:
|
372
|
+
link_models_on_models["RecordRecord"] = "Record"
|
337
373
|
link_attributes_on_models = {
|
338
374
|
obj.related_name: link_models_on_models[
|
339
375
|
obj.related_model.__get_name_with_module__()
|
340
376
|
]
|
341
|
-
for obj in
|
342
|
-
if
|
377
|
+
for obj in registry._meta.related_objects
|
378
|
+
if (
|
379
|
+
obj.related_model.__get_name_with_module__() in link_models_on_models
|
380
|
+
and (
|
381
|
+
not obj.related_name.startswith("links_record")
|
382
|
+
if registry is Record
|
383
|
+
else True
|
384
|
+
)
|
385
|
+
)
|
343
386
|
}
|
344
387
|
# Prepare Django's annotate for features
|
345
388
|
annotate_kwargs = {}
|
@@ -347,17 +390,22 @@ def get_feature_annotate_kwargs(
|
|
347
390
|
annotate_kwargs[f"{link_attr}__feature__name"] = F(
|
348
391
|
f"{link_attr}__feature__name"
|
349
392
|
)
|
350
|
-
|
351
|
-
|
352
|
-
|
393
|
+
if registry is Artifact:
|
394
|
+
field_name = (
|
395
|
+
feature_type.split(".")[1] if "." in feature_type else feature_type
|
396
|
+
).lower()
|
397
|
+
else:
|
398
|
+
field_name = "value"
|
353
399
|
annotate_kwargs[f"{link_attr}__{field_name}__name"] = F(
|
354
400
|
f"{link_attr}__{field_name}__name"
|
355
401
|
)
|
356
|
-
|
357
|
-
annotate_kwargs["
|
358
|
-
"
|
402
|
+
json_values_attribute = "_feature_values" if registry is Artifact else "values_json"
|
403
|
+
annotate_kwargs[f"{json_values_attribute}__feature__name"] = F(
|
404
|
+
f"{json_values_attribute}__feature__name"
|
405
|
+
)
|
406
|
+
annotate_kwargs[f"{json_values_attribute}__value"] = F(
|
407
|
+
f"{json_values_attribute}__value"
|
359
408
|
)
|
360
|
-
annotate_kwargs["_feature_values__value"] = F("_feature_values__value")
|
361
409
|
return annotate_kwargs, feature_names, feature_qs
|
362
410
|
|
363
411
|
|
@@ -412,7 +460,9 @@ def analyze_lookup_cardinality(
|
|
412
460
|
return result
|
413
461
|
|
414
462
|
|
415
|
-
def reorder_subset_columns_in_df(
|
463
|
+
def reorder_subset_columns_in_df(
|
464
|
+
df: pd.DataFrame, column_order: list[str], position=3
|
465
|
+
) -> pd.DataFrame:
|
416
466
|
valid_columns = [col for col in column_order if col in df.columns]
|
417
467
|
all_cols = df.columns.tolist()
|
418
468
|
remaining_cols = [col for col in all_cols if col not in valid_columns]
|
@@ -423,6 +473,7 @@ def reorder_subset_columns_in_df(df: pd.DataFrame, column_order: list[str], posi
|
|
423
473
|
# https://lamin.ai/laminlabs/lamindata/transform/BblTiuKxsb2g0003
|
424
474
|
# https://claude.ai/chat/6ea2498c-944d-4e7a-af08-29e5ddf637d2
|
425
475
|
def reshape_annotate_result(
|
476
|
+
registry: Registry,
|
426
477
|
df: pd.DataFrame,
|
427
478
|
field_names: list[str],
|
428
479
|
cols_from_include: dict[str, str] | None,
|
@@ -438,29 +489,38 @@ def reshape_annotate_result(
|
|
438
489
|
e.g., {'ulabels__name': 'many', 'created_by__name': 'one'}
|
439
490
|
feature_names: Feature names.
|
440
491
|
"""
|
492
|
+
from lamindb.models import Artifact
|
493
|
+
|
441
494
|
cols_from_include = cols_from_include or {}
|
442
495
|
|
496
|
+
json_values_attribute = "_feature_values" if registry is Artifact else "values_json"
|
497
|
+
|
443
498
|
# initialize result with basic fields, need a copy as we're modifying it
|
444
499
|
# will give us warnings otherwise
|
445
500
|
result = df[field_names].copy()
|
446
501
|
# process features if requested
|
447
502
|
if feature_names:
|
448
|
-
# handle
|
449
|
-
feature_cols = [
|
503
|
+
# handle json values
|
504
|
+
feature_cols = [
|
505
|
+
f"{json_values_attribute}__feature__name",
|
506
|
+
f"{json_values_attribute}__value",
|
507
|
+
]
|
450
508
|
if all(col in df.columns for col in feature_cols):
|
451
509
|
# Create two separate dataframes - one for dict values and one for non-dict values
|
452
|
-
is_dict = df["
|
510
|
+
is_dict = df[f"{json_values_attribute}__value"].apply(
|
511
|
+
lambda x: isinstance(x, dict)
|
512
|
+
)
|
453
513
|
dict_df, non_dict_df = df[is_dict], df[~is_dict]
|
454
514
|
|
455
515
|
# Process non-dict values using set aggregation
|
456
516
|
non_dict_features = non_dict_df.groupby(
|
457
|
-
["id", "
|
458
|
-
)["
|
517
|
+
["id", f"{json_values_attribute}__feature__name"]
|
518
|
+
)[f"{json_values_attribute}__value"].agg(set)
|
459
519
|
|
460
520
|
# Process dict values using first aggregation
|
461
|
-
dict_features = dict_df.groupby(
|
462
|
-
"
|
463
|
-
].agg("first")
|
521
|
+
dict_features = dict_df.groupby(
|
522
|
+
["id", f"{json_values_attribute}__feature__name"]
|
523
|
+
)[f"{json_values_attribute}__value"].agg("first")
|
464
524
|
|
465
525
|
# Combine the results
|
466
526
|
combined_features = pd.concat([non_dict_features, dict_features])
|
@@ -474,10 +534,11 @@ def reshape_annotate_result(
|
|
474
534
|
)
|
475
535
|
|
476
536
|
# handle categorical features
|
537
|
+
links_prefix = "links_" if registry is Artifact else ("links_", "values_")
|
477
538
|
links_features = [
|
478
539
|
col
|
479
540
|
for col in df.columns
|
480
|
-
if "feature__name" in col and col.startswith(
|
541
|
+
if "feature__name" in col and col.startswith(links_prefix)
|
481
542
|
]
|
482
543
|
|
483
544
|
if links_features:
|
@@ -501,6 +562,20 @@ def reshape_annotate_result(
|
|
501
562
|
result[feature.name] = result[feature.name].apply(
|
502
563
|
extract_single_element
|
503
564
|
)
|
565
|
+
if feature.dtype.startswith("cat"):
|
566
|
+
try:
|
567
|
+
# Try to convert to category - this will fail if complex objects remain
|
568
|
+
result[feature.name] = result[feature.name].astype("category")
|
569
|
+
except (TypeError, ValueError):
|
570
|
+
# If conversion fails, the column still contains complex objects
|
571
|
+
pass
|
572
|
+
if feature.dtype.startswith("datetime"):
|
573
|
+
try:
|
574
|
+
# Try to convert to category - this will fail if complex objects remain
|
575
|
+
result[feature.name] = pd.to_datetime(result[feature.name])
|
576
|
+
except (TypeError, ValueError):
|
577
|
+
# If conversion fails, the column still contains complex objects
|
578
|
+
pass
|
504
579
|
|
505
580
|
# sort columns
|
506
581
|
result = reorder_subset_columns_in_df(result, feature_names)
|
@@ -520,12 +595,14 @@ def process_links_features(
|
|
520
595
|
"""Process links_XXX feature columns."""
|
521
596
|
# this loops over different entities that might be linked under a feature
|
522
597
|
for feature_col in feature_cols:
|
523
|
-
|
598
|
+
links_attribute = "links_" if feature_col.startswith("links_") else "values_"
|
599
|
+
regex = f"{links_attribute}(.+?)__feature__name"
|
600
|
+
prefix = re.match(regex, feature_col).group(1)
|
524
601
|
|
525
602
|
value_cols = [
|
526
603
|
col
|
527
604
|
for col in df.columns
|
528
|
-
if col.startswith(f"
|
605
|
+
if col.startswith(f"{links_attribute}{prefix}__")
|
529
606
|
and col.endswith("__name")
|
530
607
|
and "feature__name" not in col
|
531
608
|
]
|
@@ -598,7 +675,7 @@ class BasicQuerySet(models.QuerySet):
|
|
598
675
|
def df(
|
599
676
|
self,
|
600
677
|
include: str | list[str] | None = None,
|
601
|
-
features: bool | list[str] | None = None,
|
678
|
+
features: bool | list[str] | str | None = None,
|
602
679
|
) -> pd.DataFrame:
|
603
680
|
"""{}""" # noqa: D415
|
604
681
|
time = datetime.now(timezone.utc)
|
@@ -617,7 +694,7 @@ class BasicQuerySet(models.QuerySet):
|
|
617
694
|
feature_qs = None
|
618
695
|
if features:
|
619
696
|
feature_annotate_kwargs, feature_names, feature_qs = (
|
620
|
-
get_feature_annotate_kwargs(features)
|
697
|
+
get_feature_annotate_kwargs(self.model, features, self)
|
621
698
|
)
|
622
699
|
time = logger.debug("finished feature_annotate_kwargs", time=time)
|
623
700
|
annotate_kwargs.update(feature_annotate_kwargs)
|
@@ -652,7 +729,7 @@ class BasicQuerySet(models.QuerySet):
|
|
652
729
|
cols_from_include = analyze_lookup_cardinality(self.model, include_input) # type: ignore
|
653
730
|
time = logger.debug("finished analyze_lookup_cardinality", time=time)
|
654
731
|
df_reshaped = reshape_annotate_result(
|
655
|
-
df, field_names, cols_from_include, feature_names, feature_qs
|
732
|
+
self.model, df, field_names, cols_from_include, feature_names, feature_qs
|
656
733
|
)
|
657
734
|
time = logger.debug("finished reshape_annotate_result", time=time)
|
658
735
|
pk_name = self.model._meta.pk.name
|
@@ -710,7 +787,7 @@ class BasicQuerySet(models.QuerySet):
|
|
710
787
|
>>> ULabel.filter(name="benchmark").one_or_none()
|
711
788
|
>>> ULabel.filter(name="non existing label").one_or_none()
|
712
789
|
"""
|
713
|
-
if
|
790
|
+
if not self.exists():
|
714
791
|
return None
|
715
792
|
elif len(self) == 1:
|
716
793
|
return self[0]
|