lamindb 1.10.2__py3-none-any.whl → 1.11a1__py3-none-any.whl

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (47):
  1. lamindb/__init__.py +89 -49
  2. lamindb/_finish.py +14 -12
  3. lamindb/_tracked.py +2 -4
  4. lamindb/_view.py +1 -1
  5. lamindb/base/__init__.py +2 -1
  6. lamindb/base/dtypes.py +76 -0
  7. lamindb/core/_settings.py +2 -2
  8. lamindb/core/storage/_anndata_accessor.py +29 -9
  9. lamindb/curators/_legacy.py +16 -3
  10. lamindb/curators/core.py +432 -186
  11. lamindb/examples/cellxgene/__init__.py +8 -3
  12. lamindb/examples/cellxgene/_cellxgene.py +127 -13
  13. lamindb/examples/cellxgene/{cxg_schema_versions.csv → cellxgene_schema_versions.csv} +11 -0
  14. lamindb/examples/croissant/__init__.py +12 -2
  15. lamindb/examples/datasets/__init__.py +2 -2
  16. lamindb/examples/datasets/_core.py +1 -1
  17. lamindb/examples/datasets/_small.py +66 -22
  18. lamindb/examples/datasets/mini_immuno.py +1 -0
  19. lamindb/migrations/0119_squashed.py +5 -2
  20. lamindb/migrations/0120_add_record_fk_constraint.py +64 -0
  21. lamindb/migrations/0121_recorduser.py +53 -0
  22. lamindb/models/__init__.py +3 -1
  23. lamindb/models/_describe.py +2 -2
  24. lamindb/models/_feature_manager.py +53 -53
  25. lamindb/models/_from_values.py +2 -2
  26. lamindb/models/_is_versioned.py +4 -4
  27. lamindb/models/_label_manager.py +4 -4
  28. lamindb/models/artifact.py +305 -116
  29. lamindb/models/artifact_set.py +36 -1
  30. lamindb/models/can_curate.py +1 -2
  31. lamindb/models/collection.py +3 -34
  32. lamindb/models/feature.py +111 -7
  33. lamindb/models/has_parents.py +11 -11
  34. lamindb/models/project.py +18 -0
  35. lamindb/models/query_manager.py +16 -7
  36. lamindb/models/query_set.py +59 -34
  37. lamindb/models/record.py +25 -4
  38. lamindb/models/run.py +8 -6
  39. lamindb/models/schema.py +54 -26
  40. lamindb/models/sqlrecord.py +123 -25
  41. lamindb/models/storage.py +59 -14
  42. lamindb/models/transform.py +17 -17
  43. lamindb/models/ulabel.py +6 -1
  44. {lamindb-1.10.2.dist-info → lamindb-1.11a1.dist-info}/METADATA +4 -5
  45. {lamindb-1.10.2.dist-info → lamindb-1.11a1.dist-info}/RECORD +47 -44
  46. {lamindb-1.10.2.dist-info → lamindb-1.11a1.dist-info}/WHEEL +1 -1
  47. {lamindb-1.10.2.dist-info/licenses → lamindb-1.11a1.dist-info}/LICENSE +0 -0
@@ -496,21 +496,11 @@ def describe_features(
496
496
  return tree
497
497
 
498
498
 
499
- def is_valid_datetime_str(date_string: str) -> bool | str:
500
- try:
501
- dt = datetime.fromisoformat(date_string)
502
- return dt.isoformat()
503
- except ValueError:
504
- return False
505
-
506
-
507
- def is_iterable_of_sqlrecord(value: Any):
508
- return isinstance(value, Iterable) and isinstance(next(iter(value)), SQLRecord)
509
-
510
-
511
499
  def infer_feature_type_convert_json(
512
- key: str, value: Any, mute: bool = False, str_as_ulabel: bool = True
500
+ key: str, value: Any, mute: bool = False
513
501
  ) -> tuple[str, Any, str]:
502
+ from lamindb.base.dtypes import is_valid_datetime_str
503
+
514
504
  message = ""
515
505
  if isinstance(value, bool):
516
506
  return "bool", value, message
@@ -719,15 +709,15 @@ def parse_staged_feature_sets_from_anndata(
719
709
  data_parse = backed_access(filepath, using_key=using_key)
720
710
  else:
721
711
  data_parse = ad.read_h5ad(filepath, backed="r")
722
- type = "float"
712
+ dtype = "float"
723
713
  else:
724
- type = "float" if adata.X is None else serialize_pandas_dtype(adata.X.dtype)
714
+ dtype = "float" if adata.X is None else serialize_pandas_dtype(adata.X.dtype)
725
715
  feature_sets = {}
726
716
  if var_field is not None:
727
717
  schema_var = Schema.from_values(
728
718
  data_parse.var.index,
729
719
  var_field,
730
- type=type,
720
+ dtype=dtype,
731
721
  mute=mute,
732
722
  organism=organism,
733
723
  raise_validation_error=False,
@@ -735,7 +725,7 @@ def parse_staged_feature_sets_from_anndata(
735
725
  if schema_var is not None:
736
726
  feature_sets["var"] = schema_var
737
727
  if obs_field is not None and len(data_parse.obs.columns) > 0:
738
- schema_obs = Schema.from_df(
728
+ schema_obs = Schema.from_dataframe(
739
729
  df=data_parse.obs,
740
730
  field=obs_field,
741
731
  mute=mute,
@@ -851,16 +841,17 @@ class FeatureManager:
851
841
  self,
852
842
  values: dict[str, str | int | float | bool],
853
843
  feature_field: FieldAttr = Feature.name,
854
- str_as_ulabel: bool = True,
844
+ schema: Schema = None,
855
845
  ) -> None:
856
846
  """Curate artifact with features & values.
857
847
 
858
848
  Args:
859
849
  values: A dictionary of keys (features) & values (labels, numbers, booleans).
860
- feature_field: The field of a reference registry to map keys of the
861
- dictionary.
862
- str_as_ulabel: Whether to interpret string values as ulabels.
850
+ feature_field: The field of a reference registry to map keys of the dictionary.
851
+ schema: Schema to validate against.
863
852
  """
853
+ from lamindb.base.dtypes import is_iterable_of_sqlrecord
854
+
864
855
  from .._tracked import get_current_tracked_run
865
856
 
866
857
  # rename to distinguish from the values inside the dict
@@ -870,39 +861,48 @@ class FeatureManager:
870
861
  keys = list(keys) # type: ignore
871
862
  # deal with other cases later
872
863
  assert all(isinstance(key, str) for key in keys) # noqa: S101
864
+
873
865
  registry = feature_field.field.model
874
866
  value_model = FeatureValue
875
867
  model_name = "Feature"
876
- records = registry.from_values(keys, field=feature_field, mute=True)
877
- if len(records) != len(keys):
878
- not_validated_keys = [
879
- key for key in keys if key not in records.list("name")
880
- ]
881
- not_validated_keys_dtype_message = [
882
- (key, infer_feature_type_convert_json(key, dictionary[key]))
883
- for key in not_validated_keys
884
- ]
885
- run = get_current_tracked_run()
886
- if run is not None:
887
- name = f"{run.transform.type}[{run.transform.key}]"
888
- type_hint = f""" {model_name.lower()}_type = ln.{model_name}(name='{name}', is_type=True).save()"""
889
- elements = [type_hint]
890
- type_kwarg = f", type={model_name.lower()}_type"
891
- else:
892
- elements = []
893
- type_kwarg = ""
894
- elements += [
895
- f" ln.{model_name}(name='{key}', dtype='{dtype}'{type_kwarg}).save(){message}"
896
- for key, (dtype, _, message) in not_validated_keys_dtype_message
897
- ]
898
- hint = "\n".join(elements)
899
- msg = (
900
- f"These keys could not be validated: {not_validated_keys}\n"
901
- f"Here is how to create a {model_name.lower()}:\n\n{hint}"
902
- )
903
- raise ValidationError(msg)
904
868
 
905
- # figure out which of the values go where
869
+ if schema is not None:
870
+ from lamindb.curators import DataFrameCurator
871
+
872
+ temp_df = pd.DataFrame([values])
873
+ curator = DataFrameCurator(temp_df, schema)
874
+ curator.validate()
875
+ records = schema.members.filter(name__in=keys)
876
+ else:
877
+ records = registry.from_values(keys, field=feature_field, mute=True)
878
+ if len(records) != len(keys):
879
+ not_validated_keys = [
880
+ key for key in keys if key not in records.to_list("name")
881
+ ]
882
+ not_validated_keys_dtype_message = [
883
+ (key, infer_feature_type_convert_json(key, dictionary[key]))
884
+ for key in not_validated_keys
885
+ ]
886
+ run = get_current_tracked_run()
887
+ if run is not None:
888
+ name = f"{run.transform.type}[{run.transform.key}]"
889
+ type_hint = f""" {model_name.lower()}_type = ln.{model_name}(name='{name}', is_type=True).save()"""
890
+ elements = [type_hint]
891
+ type_kwarg = f", type={model_name.lower()}_type"
892
+ else:
893
+ elements = []
894
+ type_kwarg = ""
895
+ elements += [
896
+ f" ln.{model_name}(name='{key}', dtype='{dtype}'{type_kwarg}).save(){message}"
897
+ for key, (dtype, _, message) in not_validated_keys_dtype_message
898
+ ]
899
+ hint = "\n".join(elements)
900
+ msg = (
901
+ f"These keys could not be validated: {not_validated_keys}\n"
902
+ f"Here is how to create a {model_name.lower()}:\n\n{hint}"
903
+ )
904
+ raise ValidationError(msg)
905
+
906
906
  features_labels = defaultdict(list)
907
907
  _feature_values = []
908
908
  not_validated_values: dict[str, list[str]] = defaultdict(list)
@@ -912,7 +912,6 @@ class FeatureManager:
912
912
  feature.name,
913
913
  value,
914
914
  mute=True,
915
- str_as_ulabel=str_as_ulabel,
916
915
  )
917
916
  if feature.dtype == "num":
918
917
  if inferred_type not in {"int", "float"}:
@@ -994,6 +993,7 @@ class FeatureManager:
994
993
  f"Here is how to create records for them:\n\n{hint}"
995
994
  )
996
995
  raise ValidationError(msg)
996
+
997
997
  if features_labels:
998
998
  self._add_label_feature_links(features_labels)
999
999
  if _feature_values:
@@ -1039,7 +1039,7 @@ class FeatureManager:
1039
1039
  feature: str | Feature,
1040
1040
  *,
1041
1041
  value: Any | None = None,
1042
- ):
1042
+ ) -> None:
1043
1043
  """Remove value annotations for a given feature.
1044
1044
 
1045
1045
  Args:
@@ -1262,7 +1262,7 @@ class FeatureManager:
1262
1262
  """Add feature set corresponding to column names of DataFrame."""
1263
1263
  assert self._host.otype == "DataFrame" # noqa: S101
1264
1264
  df = self._host.load(is_run_input=False)
1265
- schema = Schema.from_df(
1265
+ schema = Schema.from_dataframe(
1266
1266
  df=df,
1267
1267
  field=field,
1268
1268
  mute=mute,
@@ -121,7 +121,7 @@ def get_existing_records(
121
121
  # ]
122
122
  # )
123
123
  # order by causes a factor 10 in runtime
124
- # records = query_set.order_by(preserved).list()
124
+ # records = query_set.order_by(preserved).to_list()
125
125
 
126
126
  # log validated terms
127
127
  is_validated = model.validate(
@@ -165,7 +165,7 @@ def get_existing_records(
165
165
  query = {f"{field.field.name}__in": iterable_idx.values} # type: ignore
166
166
  if organism is not None:
167
167
  query["organism"] = organism
168
- records = model.filter(**query).list()
168
+ records = model.filter(**query).to_list()
169
169
 
170
170
  if len(validated) == len(iterable_idx):
171
171
  return records, pd.Index([]), msg
@@ -108,12 +108,12 @@ def bump_version(
108
108
  ) -> str:
109
109
  """Bumps the version number by major or minor depending on the bump_type flag.
110
110
 
111
- Parameters:
112
- version (str): The current version in "MAJOR" or "MAJOR.MINOR" format.
113
- bump_type (str): The type of version bump, either 'major' or 'minor'.
111
+ Args:
112
+ version: The current version in "MAJOR" or "MAJOR.MINOR" format.
113
+ bump_type: The type of version bump, either 'major' or 'minor'.
114
114
 
115
115
  Returns:
116
- str: The new version string.
116
+ The new version string.
117
117
  """
118
118
  try:
119
119
  # Split the version into major and minor parts if possible
@@ -268,7 +268,7 @@ class LabelManager:
268
268
  for link in links:
269
269
  if link.feature is not None:
270
270
  features.add(link.feature)
271
- key = link.feature.name
271
+ key = link.feature.uid
272
272
  else:
273
273
  key = None
274
274
  keys.append(key)
@@ -299,9 +299,9 @@ class LabelManager:
299
299
  )
300
300
  save(new_features) # type: ignore
301
301
  if hasattr(self._host, related_name):
302
- for feature_name, feature_labels in labels_by_features.items():
303
- if feature_name is not None:
304
- feature_id = Feature.get(name=feature_name).id
302
+ for feature_uid, feature_labels in labels_by_features.items():
303
+ if feature_uid is not None:
304
+ feature_id = Feature.get(feature_uid).id
305
305
  else:
306
306
  feature_id = None
307
307
  getattr(self._host, related_name).add(