lamindb 1.6.2__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. lamindb/__init__.py +1 -3
  2. lamindb/_finish.py +32 -16
  3. lamindb/base/types.py +6 -4
  4. lamindb/core/_context.py +127 -57
  5. lamindb/core/_mapped_collection.py +1 -1
  6. lamindb/core/_settings.py +44 -4
  7. lamindb/core/_track_environment.py +5 -2
  8. lamindb/core/loaders.py +1 -1
  9. lamindb/core/storage/_anndata_accessor.py +1 -1
  10. lamindb/core/storage/_tiledbsoma.py +14 -8
  11. lamindb/core/storage/_valid_suffixes.py +0 -1
  12. lamindb/core/storage/_zarr.py +1 -1
  13. lamindb/core/storage/objects.py +13 -8
  14. lamindb/core/storage/paths.py +9 -6
  15. lamindb/core/types.py +1 -1
  16. lamindb/curators/_legacy.py +2 -1
  17. lamindb/curators/core.py +106 -105
  18. lamindb/errors.py +9 -0
  19. lamindb/examples/fixtures/__init__.py +0 -0
  20. lamindb/examples/fixtures/sheets.py +224 -0
  21. lamindb/migrations/0103_remove_writelog_migration_state_and_more.py +1 -1
  22. lamindb/migrations/0105_record_unique_name.py +20 -0
  23. lamindb/migrations/0106_transfer_data_migration.py +25 -0
  24. lamindb/migrations/0107_add_schema_to_record.py +68 -0
  25. lamindb/migrations/0108_remove_record_sheet_remove_sheetproject_sheet_and_more.py +30 -0
  26. lamindb/migrations/0109_record_input_of_runs_alter_record_run_and_more.py +123 -0
  27. lamindb/migrations/0110_rename_values_artifacts_record_linked_artifacts.py +17 -0
  28. lamindb/migrations/0111_remove_record__sort_order.py +148 -0
  29. lamindb/migrations/0112_alter_recordartifact_feature_and_more.py +105 -0
  30. lamindb/migrations/0113_lower_case_branch_and_space_names.py +62 -0
  31. lamindb/migrations/0114_alter_run__status_code.py +24 -0
  32. lamindb/migrations/0115_alter_space_uid.py +52 -0
  33. lamindb/migrations/{0104_squashed.py → 0115_squashed.py} +261 -257
  34. lamindb/models/__init__.py +4 -3
  35. lamindb/models/_describe.py +88 -31
  36. lamindb/models/_feature_manager.py +627 -658
  37. lamindb/models/_label_manager.py +1 -3
  38. lamindb/models/artifact.py +214 -99
  39. lamindb/models/collection.py +7 -1
  40. lamindb/models/feature.py +288 -60
  41. lamindb/models/has_parents.py +3 -3
  42. lamindb/models/project.py +32 -15
  43. lamindb/models/query_manager.py +7 -1
  44. lamindb/models/query_set.py +118 -41
  45. lamindb/models/record.py +140 -94
  46. lamindb/models/run.py +42 -42
  47. lamindb/models/save.py +102 -16
  48. lamindb/models/schema.py +41 -8
  49. lamindb/models/sqlrecord.py +105 -40
  50. lamindb/models/storage.py +278 -0
  51. lamindb/models/transform.py +10 -2
  52. lamindb/models/ulabel.py +9 -1
  53. lamindb/py.typed +0 -0
  54. lamindb/setup/__init__.py +2 -1
  55. lamindb/setup/_switch.py +16 -0
  56. lamindb/setup/errors/__init__.py +4 -0
  57. lamindb/setup/types/__init__.py +4 -0
  58. {lamindb-1.6.2.dist-info → lamindb-1.7.0.dist-info}/METADATA +5 -5
  59. {lamindb-1.6.2.dist-info → lamindb-1.7.0.dist-info}/RECORD +61 -44
  60. lamindb/models/core.py +0 -135
  61. {lamindb-1.6.2.dist-info → lamindb-1.7.0.dist-info}/LICENSE +0 -0
  62. {lamindb-1.6.2.dist-info → lamindb-1.7.0.dist-info}/WHEEL +0 -0
@@ -19,7 +19,7 @@ from ..errors import DoesNotExist
19
19
  from ._is_versioned import IsVersioned
20
20
  from .can_curate import CanCurate, _inspect, _standardize, _validate
21
21
  from .query_manager import _lookup, _search
22
- from .sqlrecord import SQLRecord
22
+ from .sqlrecord import Registry, SQLRecord
23
23
 
24
24
  if TYPE_CHECKING:
25
25
  from lamindb.base.types import ListLike, StrField
@@ -62,8 +62,12 @@ def get_keys_from_df(data: list, registry: SQLRecord) -> list[str]:
62
62
  return keys
63
63
 
64
64
 
65
- def one_helper(self, does_not_exist_msg: str | None = None):
66
- if len(self) == 0:
65
+ def one_helper(self: QuerySet | SQLRecordList, does_not_exist_msg: str | None = None):
66
+ if isinstance(self, SQLRecord):
67
+ not_exists = len(self) == 0
68
+ else:
69
+ not_exists = not self.exists() # type: ignore
70
+ if not_exists:
67
71
  raise DoesNotExist(does_not_exist_msg)
68
72
  elif len(self) > 1:
69
73
  raise MultipleResultsFound(self)
@@ -142,20 +146,19 @@ def process_expressions(queryset: QuerySet, expressions: dict) -> dict:
142
146
  )
143
147
 
144
148
  if issubclass(queryset.model, SQLRecord):
145
- # branch_id is set to 0 unless expressions contains id or uid
149
+ # branch_id is set to 1 unless expressions contains id or uid
146
150
  if not (
147
151
  "id" in expressions
148
152
  or "uid" in expressions
149
153
  or "uid__startswith" in expressions
150
154
  ):
151
- branch_id = "branch_id"
152
- if not any(e.startswith(branch_id) for e in expressions):
153
- expressions[branch_id] = 1 # default branch_id
155
+ if not any(e.startswith("branch_id") for e in expressions):
156
+ expressions["branch_id"] = 1 # default branch_id
154
157
  # if branch_id is None, do not apply a filter
155
158
  # otherwise, it would mean filtering for NULL values, which doesn't make
156
159
  # sense for a non-NULLABLE column
157
- elif branch_id in expressions and expressions[branch_id] is None:
158
- expressions.pop(branch_id)
160
+ elif "branch_id" in expressions and expressions["branch_id"] is None:
161
+ expressions.pop("branch_id")
159
162
  if queryset._db is not None:
160
163
  # only check for database mismatch if there is a defined database on the
161
164
  # queryset
@@ -257,7 +260,7 @@ class SQLRecordList(UserList, Generic[T]):
257
260
  def get_basic_field_names(
258
261
  qs: QuerySet,
259
262
  include: list[str],
260
- features_input: bool | list[str],
263
+ features_input: bool | list[str] | str,
261
264
  ) -> list[str]:
262
265
  exclude_field_names = ["updated_at"]
263
266
  field_names = [
@@ -300,24 +303,54 @@ def get_basic_field_names(
300
303
 
301
304
 
302
305
  def get_feature_annotate_kwargs(
303
- features: bool | list[str] | None,
306
+ registry: Registry,
307
+ features: bool | list[str] | str | None,
308
+ qs: QuerySet | None = None,
304
309
  ) -> tuple[dict[str, Any], list[str], QuerySet]:
305
310
  from lamindb.models import (
306
311
  Artifact,
307
312
  Feature,
313
+ Record,
314
+ RecordJson,
308
315
  )
309
316
 
317
+ if registry not in {Artifact, Record}:
318
+ raise ValueError(
319
+ f"features=True is only applicable for Artifact and Record, not {registry.__name__}"
320
+ )
321
+
322
+ if features == "queryset":
323
+ ids_list = qs.values_list("id", flat=True)
324
+ feature_names = []
325
+ for obj in registry._meta.related_objects:
326
+ if not hasattr(getattr(registry, obj.related_name), "through"):
327
+ continue
328
+ links = getattr(registry, obj.related_name).through.filter(
329
+ **{registry.__name__.lower() + "_id__in": ids_list}
330
+ )
331
+ feature_names_for_link_model = links.values_list("feature__name", flat=True)
332
+ feature_names += feature_names_for_link_model
333
+ if registry is Record:
334
+ # this request is not strictly necessary, but it makes the resulting reshaped
335
+ # dataframe consistent
336
+ feature_names += RecordJson.filter(record_id__in=ids_list).values_list(
337
+ "feature__name", flat=True
338
+ )
339
+ features = list(set(feature_names)) # remove duplicates
340
+
310
341
  feature_qs = Feature.filter()
311
342
  if isinstance(features, list):
312
343
  feature_qs = feature_qs.filter(name__in=features)
313
344
  feature_names = features
314
345
  else: # features is True -- only consider categorical features from ULabel and non-categorical features
315
346
  feature_qs = feature_qs.filter(
316
- Q(~Q(dtype__startswith="cat[")) | Q(dtype__startswith="cat[ULabel")
347
+ Q(~Q(dtype__startswith="cat["))
348
+ | Q(dtype__startswith="cat[ULabel")
349
+ | Q(dtype__startswith="cat[Record")
317
350
  )
318
351
  feature_names = feature_qs.list("name")
319
352
  logger.important(
320
- f"queried for all categorical features with dtype 'cat[ULabel...'] and non-categorical features: ({len(feature_names)}) {feature_names}"
353
+ f"queried for all categorical features with dtype ULabel or Record and non-categorical features: ({len(feature_names)}) {feature_names}"
321
354
  )
322
355
  # Get the categorical features
323
356
  cat_feature_types = {
@@ -328,18 +361,28 @@ def get_feature_annotate_kwargs(
328
361
  # Get relationships of labels and features
329
362
  link_models_on_models = {
330
363
  getattr(
331
- Artifact, obj.related_name
364
+ registry, obj.related_name
332
365
  ).through.__get_name_with_module__(): obj.related_model.__get_name_with_module__()
333
- for obj in Artifact._meta.related_objects
366
+ for obj in registry._meta.related_objects
334
367
  if obj.related_model.__get_name_with_module__() in cat_feature_types
335
368
  }
336
- link_models_on_models["ArtifactULabel"] = "ULabel"
369
+ if registry is Artifact:
370
+ link_models_on_models["ArtifactULabel"] = "ULabel"
371
+ else:
372
+ link_models_on_models["RecordRecord"] = "Record"
337
373
  link_attributes_on_models = {
338
374
  obj.related_name: link_models_on_models[
339
375
  obj.related_model.__get_name_with_module__()
340
376
  ]
341
- for obj in Artifact._meta.related_objects
342
- if obj.related_model.__get_name_with_module__() in link_models_on_models
377
+ for obj in registry._meta.related_objects
378
+ if (
379
+ obj.related_model.__get_name_with_module__() in link_models_on_models
380
+ and (
381
+ not obj.related_name.startswith("links_record")
382
+ if registry is Record
383
+ else True
384
+ )
385
+ )
343
386
  }
344
387
  # Prepare Django's annotate for features
345
388
  annotate_kwargs = {}
@@ -347,17 +390,22 @@ def get_feature_annotate_kwargs(
347
390
  annotate_kwargs[f"{link_attr}__feature__name"] = F(
348
391
  f"{link_attr}__feature__name"
349
392
  )
350
- field_name = (
351
- feature_type.split(".")[1] if "." in feature_type else feature_type
352
- ).lower()
393
+ if registry is Artifact:
394
+ field_name = (
395
+ feature_type.split(".")[1] if "." in feature_type else feature_type
396
+ ).lower()
397
+ else:
398
+ field_name = "value"
353
399
  annotate_kwargs[f"{link_attr}__{field_name}__name"] = F(
354
400
  f"{link_attr}__{field_name}__name"
355
401
  )
356
-
357
- annotate_kwargs["_feature_values__feature__name"] = F(
358
- "_feature_values__feature__name"
402
+ json_values_attribute = "_feature_values" if registry is Artifact else "values_json"
403
+ annotate_kwargs[f"{json_values_attribute}__feature__name"] = F(
404
+ f"{json_values_attribute}__feature__name"
405
+ )
406
+ annotate_kwargs[f"{json_values_attribute}__value"] = F(
407
+ f"{json_values_attribute}__value"
359
408
  )
360
- annotate_kwargs["_feature_values__value"] = F("_feature_values__value")
361
409
  return annotate_kwargs, feature_names, feature_qs
362
410
 
363
411
 
@@ -412,7 +460,9 @@ def analyze_lookup_cardinality(
412
460
  return result
413
461
 
414
462
 
415
- def reorder_subset_columns_in_df(df: pd.DataFrame, column_order: list[str], position=3):
463
+ def reorder_subset_columns_in_df(
464
+ df: pd.DataFrame, column_order: list[str], position=3
465
+ ) -> pd.DataFrame:
416
466
  valid_columns = [col for col in column_order if col in df.columns]
417
467
  all_cols = df.columns.tolist()
418
468
  remaining_cols = [col for col in all_cols if col not in valid_columns]
@@ -423,6 +473,7 @@ def reorder_subset_columns_in_df(df: pd.DataFrame, column_order: list[str], posi
423
473
  # https://lamin.ai/laminlabs/lamindata/transform/BblTiuKxsb2g0003
424
474
  # https://claude.ai/chat/6ea2498c-944d-4e7a-af08-29e5ddf637d2
425
475
  def reshape_annotate_result(
476
+ registry: Registry,
426
477
  df: pd.DataFrame,
427
478
  field_names: list[str],
428
479
  cols_from_include: dict[str, str] | None,
@@ -438,29 +489,38 @@ def reshape_annotate_result(
438
489
  e.g., {'ulabels__name': 'many', 'created_by__name': 'one'}
439
490
  feature_names: Feature names.
440
491
  """
492
+ from lamindb.models import Artifact
493
+
441
494
  cols_from_include = cols_from_include or {}
442
495
 
496
+ json_values_attribute = "_feature_values" if registry is Artifact else "values_json"
497
+
443
498
  # initialize result with basic fields, need a copy as we're modifying it
444
499
  # will give us warnings otherwise
445
500
  result = df[field_names].copy()
446
501
  # process features if requested
447
502
  if feature_names:
448
- # handle feature_values
449
- feature_cols = ["_feature_values__feature__name", "_feature_values__value"]
503
+ # handle json values
504
+ feature_cols = [
505
+ f"{json_values_attribute}__feature__name",
506
+ f"{json_values_attribute}__value",
507
+ ]
450
508
  if all(col in df.columns for col in feature_cols):
451
509
  # Create two separate dataframes - one for dict values and one for non-dict values
452
- is_dict = df["_feature_values__value"].apply(lambda x: isinstance(x, dict))
510
+ is_dict = df[f"{json_values_attribute}__value"].apply(
511
+ lambda x: isinstance(x, dict)
512
+ )
453
513
  dict_df, non_dict_df = df[is_dict], df[~is_dict]
454
514
 
455
515
  # Process non-dict values using set aggregation
456
516
  non_dict_features = non_dict_df.groupby(
457
- ["id", "_feature_values__feature__name"]
458
- )["_feature_values__value"].agg(set)
517
+ ["id", f"{json_values_attribute}__feature__name"]
518
+ )[f"{json_values_attribute}__value"].agg(set)
459
519
 
460
520
  # Process dict values using first aggregation
461
- dict_features = dict_df.groupby(["id", "_feature_values__feature__name"])[
462
- "_feature_values__value"
463
- ].agg("first")
521
+ dict_features = dict_df.groupby(
522
+ ["id", f"{json_values_attribute}__feature__name"]
523
+ )[f"{json_values_attribute}__value"].agg("first")
464
524
 
465
525
  # Combine the results
466
526
  combined_features = pd.concat([non_dict_features, dict_features])
@@ -474,10 +534,11 @@ def reshape_annotate_result(
474
534
  )
475
535
 
476
536
  # handle categorical features
537
+ links_prefix = "links_" if registry is Artifact else ("links_", "values_")
477
538
  links_features = [
478
539
  col
479
540
  for col in df.columns
480
- if "feature__name" in col and col.startswith("links_")
541
+ if "feature__name" in col and col.startswith(links_prefix)
481
542
  ]
482
543
 
483
544
  if links_features:
@@ -501,6 +562,20 @@ def reshape_annotate_result(
501
562
  result[feature.name] = result[feature.name].apply(
502
563
  extract_single_element
503
564
  )
565
+ if feature.dtype.startswith("cat"):
566
+ try:
567
+ # Try to convert to category - this will fail if complex objects remain
568
+ result[feature.name] = result[feature.name].astype("category")
569
+ except (TypeError, ValueError):
570
+ # If conversion fails, the column still contains complex objects
571
+ pass
572
+ if feature.dtype.startswith("datetime"):
573
+ try:
574
+ # Try to convert to category - this will fail if complex objects remain
575
+ result[feature.name] = pd.to_datetime(result[feature.name])
576
+ except (TypeError, ValueError):
577
+ # If conversion fails, the column still contains complex objects
578
+ pass
504
579
 
505
580
  # sort columns
506
581
  result = reorder_subset_columns_in_df(result, feature_names)
@@ -520,12 +595,14 @@ def process_links_features(
520
595
  """Process links_XXX feature columns."""
521
596
  # this loops over different entities that might be linked under a feature
522
597
  for feature_col in feature_cols:
523
- prefix = re.match(r"links_(.+?)__feature__name", feature_col).group(1)
598
+ links_attribute = "links_" if feature_col.startswith("links_") else "values_"
599
+ regex = f"{links_attribute}(.+?)__feature__name"
600
+ prefix = re.match(regex, feature_col).group(1)
524
601
 
525
602
  value_cols = [
526
603
  col
527
604
  for col in df.columns
528
- if col.startswith(f"links_{prefix}__")
605
+ if col.startswith(f"{links_attribute}{prefix}__")
529
606
  and col.endswith("__name")
530
607
  and "feature__name" not in col
531
608
  ]
@@ -598,7 +675,7 @@ class BasicQuerySet(models.QuerySet):
598
675
  def df(
599
676
  self,
600
677
  include: str | list[str] | None = None,
601
- features: bool | list[str] | None = None,
678
+ features: bool | list[str] | str | None = None,
602
679
  ) -> pd.DataFrame:
603
680
  """{}""" # noqa: D415
604
681
  time = datetime.now(timezone.utc)
@@ -617,7 +694,7 @@ class BasicQuerySet(models.QuerySet):
617
694
  feature_qs = None
618
695
  if features:
619
696
  feature_annotate_kwargs, feature_names, feature_qs = (
620
- get_feature_annotate_kwargs(features)
697
+ get_feature_annotate_kwargs(self.model, features, self)
621
698
  )
622
699
  time = logger.debug("finished feature_annotate_kwargs", time=time)
623
700
  annotate_kwargs.update(feature_annotate_kwargs)
@@ -652,7 +729,7 @@ class BasicQuerySet(models.QuerySet):
652
729
  cols_from_include = analyze_lookup_cardinality(self.model, include_input) # type: ignore
653
730
  time = logger.debug("finished analyze_lookup_cardinality", time=time)
654
731
  df_reshaped = reshape_annotate_result(
655
- df, field_names, cols_from_include, feature_names, feature_qs
732
+ self.model, df, field_names, cols_from_include, feature_names, feature_qs
656
733
  )
657
734
  time = logger.debug("finished reshape_annotate_result", time=time)
658
735
  pk_name = self.model._meta.pk.name
@@ -710,7 +787,7 @@ class BasicQuerySet(models.QuerySet):
710
787
  >>> ULabel.filter(name="benchmark").one_or_none()
711
788
  >>> ULabel.filter(name="non existing label").one_or_none()
712
789
  """
713
- if len(self) == 0:
790
+ if not self.exists():
714
791
  return None
715
792
  elif len(self) == 1:
716
793
  return self[0]