lamindb 1.0.5__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. lamindb/__init__.py +14 -5
  2. lamindb/_artifact.py +150 -53
  3. lamindb/_can_curate.py +27 -8
  4. lamindb/_collection.py +85 -51
  5. lamindb/_feature.py +177 -41
  6. lamindb/_finish.py +12 -6
  7. lamindb/_from_values.py +83 -98
  8. lamindb/_parents.py +4 -4
  9. lamindb/_query_set.py +59 -17
  10. lamindb/_record.py +171 -53
  11. lamindb/_run.py +4 -4
  12. lamindb/_save.py +33 -10
  13. lamindb/_schema.py +135 -38
  14. lamindb/_storage.py +1 -1
  15. lamindb/_tracked.py +106 -0
  16. lamindb/_transform.py +21 -8
  17. lamindb/_ulabel.py +5 -14
  18. lamindb/base/validation.py +2 -6
  19. lamindb/core/__init__.py +13 -14
  20. lamindb/core/_context.py +7 -7
  21. lamindb/core/_data.py +29 -25
  22. lamindb/core/_describe.py +1 -1
  23. lamindb/core/_django.py +1 -1
  24. lamindb/core/_feature_manager.py +53 -43
  25. lamindb/core/_label_manager.py +4 -4
  26. lamindb/core/_mapped_collection.py +20 -7
  27. lamindb/core/datasets/__init__.py +6 -1
  28. lamindb/core/datasets/_core.py +12 -11
  29. lamindb/core/datasets/_small.py +66 -20
  30. lamindb/core/exceptions.py +1 -90
  31. lamindb/core/loaders.py +6 -12
  32. lamindb/core/relations.py +6 -4
  33. lamindb/core/storage/_anndata_accessor.py +41 -0
  34. lamindb/core/storage/_backed_access.py +2 -2
  35. lamindb/core/storage/_pyarrow_dataset.py +25 -15
  36. lamindb/core/storage/_tiledbsoma.py +56 -12
  37. lamindb/core/storage/paths.py +27 -21
  38. lamindb/core/subsettings/_creation_settings.py +4 -16
  39. lamindb/curators/__init__.py +2168 -833
  40. lamindb/curators/_cellxgene_schemas/__init__.py +26 -0
  41. lamindb/curators/_cellxgene_schemas/schema_versions.yml +104 -0
  42. lamindb/errors.py +96 -0
  43. lamindb/integrations/_vitessce.py +3 -3
  44. lamindb/migrations/0069_squashed.py +76 -75
  45. lamindb/migrations/0075_lamindbv1_part5.py +4 -5
  46. lamindb/migrations/0082_alter_feature_dtype.py +21 -0
  47. lamindb/migrations/0083_alter_feature_is_type_alter_flextable_is_type_and_more.py +94 -0
  48. lamindb/migrations/0084_alter_schemafeature_feature_and_more.py +35 -0
  49. lamindb/migrations/0085_alter_feature_is_type_alter_flextable_is_type_and_more.py +63 -0
  50. lamindb/migrations/0086_various.py +95 -0
  51. lamindb/migrations/0087_rename__schemas_m2m_artifact_feature_sets_and_more.py +41 -0
  52. lamindb/migrations/0088_schema_components.py +273 -0
  53. lamindb/migrations/0088_squashed.py +4372 -0
  54. lamindb/models.py +420 -153
  55. {lamindb-1.0.5.dist-info → lamindb-1.1.0.dist-info}/METADATA +9 -7
  56. lamindb-1.1.0.dist-info/RECORD +95 -0
  57. lamindb/curators/_spatial.py +0 -528
  58. lamindb/migrations/0052_squashed.py +0 -1261
  59. lamindb/migrations/0053_alter_featureset_hash_alter_paramvalue_created_by_and_more.py +0 -57
  60. lamindb/migrations/0054_alter_feature_previous_runs_and_more.py +0 -35
  61. lamindb/migrations/0055_artifact_type_artifactparamvalue_and_more.py +0 -61
  62. lamindb/migrations/0056_rename_ulabel_ref_is_name_artifactulabel_label_ref_is_name_and_more.py +0 -22
  63. lamindb/migrations/0057_link_models_latest_report_and_others.py +0 -356
  64. lamindb/migrations/0058_artifact__actions_collection__actions.py +0 -22
  65. lamindb/migrations/0059_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -31
  66. lamindb/migrations/0060_alter_artifact__actions.py +0 -22
  67. lamindb/migrations/0061_alter_collection_meta_artifact_alter_run_environment_and_more.py +0 -45
  68. lamindb/migrations/0062_add_is_latest_field.py +0 -32
  69. lamindb/migrations/0063_populate_latest_field.py +0 -45
  70. lamindb/migrations/0064_alter_artifact_version_alter_collection_version_and_more.py +0 -33
  71. lamindb/migrations/0065_remove_collection_feature_sets_and_more.py +0 -22
  72. lamindb/migrations/0066_alter_artifact__feature_values_and_more.py +0 -352
  73. lamindb/migrations/0067_alter_featurevalue_unique_together_and_more.py +0 -20
  74. lamindb/migrations/0068_alter_artifactulabel_unique_together_and_more.py +0 -20
  75. lamindb/migrations/0069_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -1294
  76. lamindb-1.0.5.dist-info/RECORD +0 -102
  77. {lamindb-1.0.5.dist-info → lamindb-1.1.0.dist-info}/LICENSE +0 -0
  78. {lamindb-1.0.5.dist-info → lamindb-1.1.0.dist-info}/WHEEL +0 -0
lamindb/_collection.py CHANGED
@@ -15,33 +15,35 @@ from lamin_utils import logger
 from lamindb_setup.core._docs import doc_args
 from lamindb_setup.core.hashing import hash_set
 
-from lamindb.models import (
-    Collection,
-    CollectionArtifact,
-    Schema,
-)
-
 from ._parents import view_lineage
-from ._record import init_self_from_db, update_attributes
+from ._record import _get_record_kwargs, init_self_from_db, update_attributes
 from ._utils import attach_func_to_class_method
 from .core._data import (
     _track_run_input,
     describe,
     get_run,
     save_schema_links,
-    save_staged__schemas_m2m,
+    save_staged_feature_sets,
 )
 from .core._mapped_collection import MappedCollection
-from .core._settings import settings
+from .core.storage._pyarrow_dataset import _is_pyarrow_dataset, _open_pyarrow_dataset
 from .core.versioning import process_revises
-from .models import Artifact, Run
+from .errors import FieldValidationError
+from .models import (
+    Artifact,
+    Collection,
+    CollectionArtifact,
+    Run,
+    Schema,
+)
 
 if TYPE_CHECKING:
     from collections.abc import Iterable
 
-    from lamindb.core.storage import UPath
+    from pyarrow.dataset import Dataset as PyArrowDataset
 
     from ._query_set import QuerySet
+    from .core.storage import UPath
 
 
 class CollectionFeatureManager:
@@ -50,15 +52,15 @@ class CollectionFeatureManager:
     def __init__(self, collection: Collection):
         self._collection = collection
 
-    def _get_staged__schemas_m2m_union(self) -> dict[str, Schema]:
-        links_schema_artifact = Artifact._schemas_m2m.through.objects.filter(
+    def _get_staged_feature_sets_union(self) -> dict[str, Schema]:
+        links_schema_artifact = Artifact.feature_sets.through.objects.filter(
             artifact_id__in=self._collection.artifacts.values_list("id", flat=True)
         )
-        _schemas_m2m_by_slots = defaultdict(list)
+        feature_sets_by_slots = defaultdict(list)
         for link in links_schema_artifact:
-            _schemas_m2m_by_slots[link.slot].append(link.schema_id)
-        _schemas_m2m_union = {}
-        for slot, schema_ids_slot in _schemas_m2m_by_slots.items():
+            feature_sets_by_slots[link.slot].append(link.schema_id)
+        feature_sets_union = {}
+        for slot, schema_ids_slot in feature_sets_by_slots.items():
             schema_1 = Schema.get(id=schema_ids_slot[0])
             related_name = schema_1._get_related_name()
             features_registry = getattr(Schema, related_name).field.model
@@ -73,8 +75,8 @@ class CollectionFeatureManager:
                 .distinct()
             )
             features = features_registry.filter(id__in=feature_ids)
-            _schemas_m2m_union[slot] = Schema(features, dtype=schema_1.dtype)
-        return _schemas_m2m_union
+            feature_sets_union[slot] = Schema(features, dtype=schema_1.dtype)
+        return feature_sets_union
 
 
 def __init__(
@@ -92,23 +94,16 @@ def __init__(
     artifacts: Artifact | Iterable[Artifact] = (
         kwargs.pop("artifacts") if len(args) == 0 else args[0]
     )
-    meta_artifact: Artifact | None = (
-        kwargs.pop("meta_artifact") if "meta_artifact" in kwargs else None
-    )
-    key: str | None = kwargs.pop("key") if "key" in kwargs else None
-    description: str | None = (
-        kwargs.pop("description") if "description" in kwargs else None
-    )
-    reference: str | None = kwargs.pop("reference") if "reference" in kwargs else None
-    reference_type: str | None = (
-        kwargs.pop("reference_type") if "reference_type" in kwargs else None
-    )
-    run: Run | None = kwargs.pop("run") if "run" in kwargs else None
-    revises: Collection | None = kwargs.pop("revises") if "revises" in kwargs else None
-    version: str | None = kwargs.pop("version") if "version" in kwargs else None
-    _branch_code: int | None = (
-        kwargs.pop("_branch_code") if "_branch_code" in kwargs else 1
-    )
+    meta_artifact: Artifact | None = kwargs.pop("meta_artifact", None)
+    tmp_key: str | None = kwargs.pop("key", None)
+    description: str | None = kwargs.pop("description", None)
+    reference: str | None = kwargs.pop("reference", None)
+    reference_type: str | None = kwargs.pop("reference_type", None)
+    run: Run | None = kwargs.pop("run", None)
+    revises: Collection | None = kwargs.pop("revises", None)
+    version: str | None = kwargs.pop("version", None)
+    _branch_code: int | None = kwargs.pop("_branch_code", 1)
+    key: str
     if "name" in kwargs:
         key = kwargs.pop("name")
         warnings.warn(
@@ -116,9 +111,16 @@ def __init__(
             FutureWarning,
             stacklevel=2,
         )
+    else:
+        key = tmp_key
     if not len(kwargs) == 0:
-        raise ValueError(
-            f"Only artifacts, key, run, description, reference, reference_type can be passed, you passed: {kwargs}"
+        valid_keywords = ", ".join([val[0] for val in _get_record_kwargs(Collection)])
+        raise FieldValidationError(
+            f"Only {valid_keywords} can be passed, you passed: {kwargs}"
+        )
+    if revises is None:
+        revises = (
+            Collection.filter(key=key, is_latest=True).order_by("-created_at").first()
         )
     provisional_uid, version, key, description, revises = process_revises(
         revises, version, key, description, Collection
@@ -162,11 +164,8 @@ def __init__(
         init_self_from_db(collection, existing_collection)
         update_attributes(collection, {"description": description, "key": key})
     else:
-        kwargs = {}
-        search_names_setting = settings.creation.search_names
-        if revises is not None and key == revises.key:
-            settings.creation.search_names = False
-        super(Collection, collection).__init__(
+        _skip_validation = revises is not None and key == revises.key
+        super(Collection, collection).__init__(  # type: ignore
             uid=provisional_uid,
             key=key,
             description=description,
@@ -178,9 +177,8 @@ def __init__(
             version=version,
             _branch_code=_branch_code,
             revises=revises,
-            **kwargs,
+            _skip_validation=_skip_validation,
         )
-        settings.creation.search_names = search_names_setting
     collection._artifacts = artifacts
     # register provenance
     if revises is not None:
@@ -190,8 +188,9 @@ def __init__(
 
 # docstring handled through attach_func_to_class_method
 def append(self, artifact: Artifact, run: Run | None = None) -> Collection:
-    return Collection(
+    return Collection(  # type: ignore
         self.artifacts.all().list() + [artifact],
+        # key is automatically taken from revises.key
         description=self.description,
         revises=self,
         run=run,
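Since `append` re-creates the collection with `revises=self`, each call yields a new, unsaved version whose key is inherited from the revised collection. A minimal usage sketch (both keys are hypothetical):

    import lamindb as ln

    collection = ln.Collection.get(key="scrna-batches")  # hypothetical key
    new_artifact = ln.Artifact.get(key="batch4.h5ad")  # hypothetical key
    # returns a new, unsaved Collection that revises the previous one
    collection_v2 = collection.append(new_artifact).save()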
@@ -218,13 +217,46 @@ def from_artifacts(artifacts: Iterable[Artifact]) -> tuple[str, dict[str, str]]:
     return hash
 
 
+# docstring handled through attach_func_to_class_method
+def open(self, is_run_input: bool | None = None) -> PyArrowDataset:
+    if self._state.adding:
+        artifacts = self._artifacts
+        logger.warning("the collection isn't saved, consider calling `.save()`")
+    else:
+        artifacts = self.ordered_artifacts.all()
+    paths = [artifact.path for artifact in artifacts]
+    # this checks that the filesystem is the same for all paths
+    # this is a requirement of pyarrow.dataset.dataset
+    fs = paths[0].fs
+    for path in paths[1:]:
+        # this assumes that the filesystems are cached by fsspec
+        if path.fs is not fs:
+            raise ValueError(
+                "The collection has artifacts with different filesystems, this is not supported."
+            )
+    if not _is_pyarrow_dataset(paths):
+        suffixes = {path.suffix for path in paths}
+        suffixes_str = ", ".join(suffixes)
+        err_msg = "This collection is not compatible with pyarrow.dataset.dataset(), "
+        err_msg += (
+            f"the artifacts have incompatible file types: {suffixes_str}"
+            if len(suffixes) > 1
+            else f"the file type {suffixes_str} is not supported by pyarrow."
+        )
+        raise ValueError(err_msg)
+    dataset = _open_pyarrow_dataset(paths)
+    # track only if successful
+    _track_run_input(self, is_run_input)
+    return dataset
+
+
 # docstring handled through attach_func_to_class_method
 def mapped(
     self,
     layers_keys: str | list[str] | None = None,
     obs_keys: str | list[str] | None = None,
     obsm_keys: str | list[str] | None = None,
-    obs_filter: dict[str, str | tuple[str, ...]] | None = None,
+    obs_filter: dict[str, str | list[str]] | None = None,
     join: Literal["inner", "outer"] | None = "inner",
     encode_labels: bool | list[str] = True,
     unknown_label: str | dict[str, str] | None = None,
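`open()` is new in 1.1.0: it exposes a collection as a single `pyarrow.dataset.Dataset` spanning all artifact paths, provided they share one filesystem and have pyarrow-compatible file types. A minimal usage sketch, assuming a saved collection of parquet artifacts (the key is hypothetical):

    import lamindb as ln

    collection = ln.Collection.get(key="daily-events")  # hypothetical key
    dataset = collection.open()  # run input is tracked only if opening succeeds
    df = dataset.to_table().to_pandas()  # materialize via pyarrow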
@@ -237,12 +269,12 @@ def mapped(
     path_list = []
     if self._state.adding:
         artifacts = self._artifacts
-        logger.warning("The collection isn't saved, consider calling `.save()`")
+        logger.warning("the collection isn't saved, consider calling `.save()`")
     else:
         artifacts = self.ordered_artifacts.all()
     for artifact in artifacts:
         if artifact.suffix not in {".h5ad", ".zarr"}:
-            logger.warning(f"Ignoring artifact with suffix {artifact.suffix}")
+            logger.warning(f"ignoring artifact with suffix {artifact.suffix}")
             continue
         elif not stream:
             path_list.append(artifact.cache())
@@ -335,14 +367,14 @@ def save(self, using: str | None = None) -> Collection:
     if self.meta_artifact is not None:
         self.meta_artifact.save()
     # we don't need to save feature sets again
-    save_staged__schemas_m2m(self)
+    save_staged_feature_sets(self)
     super(Collection, self).save()
     # we don't allow updating the collection of artifacts
     # if users want to update the set of artifacts, they
     # have to create a new collection
     if hasattr(self, "_artifacts"):
         links = [
-            CollectionArtifact(collection_id=self.id, artifact_id=artifact.id)
+            CollectionArtifact(collection_id=self.id, artifact_id=artifact.id)  # type: ignore
             for artifact in self._artifacts
         ]
         # the below seems to preserve the order of the list in the
@@ -380,6 +412,7 @@ def data_artifact(self) -> Artifact | None:
 METHOD_NAMES = [
     "__init__",
     "append",
+    "open",
     "mapped",
     "cache",
     "load",
@@ -400,6 +433,7 @@ if ln_setup._TESTING:
 for name in METHOD_NAMES:
     attach_func_to_class_method(name, Collection, globals())
 
+# mypy: ignore-errors
 Collection.ordered_artifacts = ordered_artifacts
 Collection.data_artifact = data_artifact
 Collection.describe = describe
lamindb/_feature.py CHANGED
@@ -1,16 +1,20 @@
 from __future__ import annotations
 
+import importlib
 from typing import TYPE_CHECKING, Any, get_args
 
 import lamindb_setup as ln_setup
 import pandas as pd
+from django.db.models.query_utils import DeferredAttribute
 from lamin_utils import logger
+from lamindb_setup._init_instance import get_schema_module_name
 from lamindb_setup.core._docs import doc_args
 from pandas.api.types import CategoricalDtype, is_string_dtype
 
+from lamindb._record import _get_record_kwargs
 from lamindb.base.types import FeatureDtype
-from lamindb.core.exceptions import ValidationError
-from lamindb.models import Artifact, Feature, Record
+from lamindb.errors import FieldValidationError, ValidationError
+from lamindb.models import Artifact, Feature, Record, Registry
 
 from ._query_set import RecordList
 from ._utils import attach_func_to_class_method
@@ -27,21 +31,133 @@ if TYPE_CHECKING:
 FEATURE_DTYPES = set(get_args(FeatureDtype))
 
 
-def get_dtype_str_from_dtype(dtype: Any) -> str:
-    if not isinstance(dtype, list) and dtype.__name__ in FEATURE_DTYPES:
+def parse_dtype_single_cat(
+    dtype_str: str,
+    related_registries: dict[str, Record] | None = None,
+    is_itype: bool = False,
+) -> dict:
+    assert isinstance(dtype_str, str)  # noqa: S101
+    if related_registries is None:
+        related_registries = dict_module_name_to_model_name(Artifact)
+    split_result = dtype_str.split("[")
+    # has sub type
+    sub_type_str = ""
+    if len(split_result) == 2:
+        registry_str = split_result[0]
+        assert "]" in split_result[1]  # noqa: S101
+        sub_type_field_split = split_result[1].split("].")
+        if len(sub_type_field_split) == 1:
+            sub_type_str = sub_type_field_split[0].strip("]")
+            field_str = ""
+        else:
+            sub_type_str = sub_type_field_split[0]
+            field_str = sub_type_field_split[1]
+    elif len(split_result) == 1:
+        registry_field_split = split_result[0].split(".")
+        if (
+            len(registry_field_split) == 2 and registry_field_split[1][0].isupper()
+        ) or len(registry_field_split) == 3:
+            # bionty.CellType or bionty.CellType.name
+            registry_str = f"{registry_field_split[0]}.{registry_field_split[1]}"
+            field_str = (
+                "" if len(registry_field_split) == 2 else registry_field_split[2]
+            )
+        else:
+            # ULabel or ULabel.name
+            registry_str = registry_field_split[0]
+            field_str = (
+                "" if len(registry_field_split) == 1 else registry_field_split[1]
+            )
+    if not is_itype:
+        if registry_str not in related_registries:
+            raise ValidationError(
+                f"'{registry_str}' is an invalid dtype, has to be registry, e.g. ULabel or bionty.CellType"
+            )
+        registry = related_registries[registry_str]
+    else:
+        if "." in registry_str:
+            registry_str_split = registry_str.split(".")
+            assert len(registry_str_split) == 2, registry_str  # noqa: S101
+            module_name, class_name = registry_str_split
+            module_name = get_schema_module_name(module_name)
+        else:
+            module_name, class_name = "lamindb", registry_str
+        module = importlib.import_module(module_name)
+        registry = getattr(module, class_name)
+    if sub_type_str != "":
+        pass
+        # validate that the subtype is a record in the registry with is_type = True
+    if field_str != "":
+        pass
+        # validate that field_str is an actual field of the module
+    else:
+        field_str = registry._name_field if hasattr(registry, "_name_field") else "name"
+    return {
+        "registry": registry,  # should be typed as CanCurate
+        "registry_str": registry_str,
+        "subtype_str": sub_type_str,
+        "field_str": field_str,
+        "field": getattr(registry, field_str),
+    }
+
+
+def parse_dtype(dtype_str: str, is_param: bool = False) -> list[dict[str, str]]:
+    allowed_dtypes = FEATURE_DTYPES
+    if is_param:
+        allowed_dtypes.add("dict")
+    is_composed_cat = dtype_str.startswith("cat[") and dtype_str.endswith("]")
+    result = []
+    if is_composed_cat:
+        related_registries = dict_module_name_to_model_name(Artifact)
+        registries_str = dtype_str.replace("cat[", "")[:-1]  # strip last ]
+        if registries_str != "":
+            registry_str_list = registries_str.split("|")
+            for cat_single_dtype_str in registry_str_list:
+                single_result = parse_dtype_single_cat(
+                    cat_single_dtype_str, related_registries
+                )
+                result.append(single_result)
+    elif dtype_str not in allowed_dtypes:
+        raise ValueError(
+            f"dtype is '{dtype_str}' but has to be one of {FEATURE_DTYPES}!"
+        )
+    return result
+
+
+def get_dtype_str_from_dtype(dtype: Any, is_itype: bool = False) -> str:
+    if (
+        not isinstance(dtype, list)
+        and hasattr(dtype, "__name__")
+        and dtype.__name__ in FEATURE_DTYPES
+    ):
         dtype_str = dtype.__name__
     else:
-        error_message = "dtype has to be of type Record or list[Record]"
-        if isinstance(dtype, Record):
+        error_message = (
+            "dtype has to be a record, a record field, or a list of records, not {}"
+        )
+        if isinstance(dtype, Registry):
+            dtype = [dtype]
+        elif isinstance(dtype, DeferredAttribute):
             dtype = [dtype]
         elif not isinstance(dtype, list):
-            raise ValueError(error_message)
-        registries_str = ""
-        for registry in dtype:
-            if not hasattr(registry, "__get_name_with_module__"):
-                raise ValueError(error_message)
-            registries_str += registry.__get_name_with_module__() + "|"
-        dtype_str = f'cat[{registries_str.rstrip("|")}]'
+            raise ValueError(error_message.format(dtype))
+        dtype_str = ""
+        for single_dtype in dtype:
+            if not isinstance(single_dtype, Registry) and not isinstance(
+                single_dtype, DeferredAttribute
+            ):
+                raise ValueError(error_message.format(single_dtype))
+            if isinstance(single_dtype, Registry):
+                dtype_str += single_dtype.__get_name_with_module__() + "|"
+            else:
+                dtype_str += (
+                    single_dtype.field.model.__get_name_with_module__()
+                    + f".{single_dtype.field.name}"
+                    + "|"
+                )
+        dtype_str = dtype_str.rstrip("|")
+        if not is_itype:
+            dtype_str = f"cat[{dtype_str}]"
     return dtype_str
 
 
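`get_dtype_str_from_dtype` and `parse_dtype` are inverses over the `cat[...]` serialization: a registry contributes its module-qualified name, a registry field additionally appends the field name, and components are joined with `|`. A minimal round-trip sketch of the functions above (assumes the bionty plugin is installed):

    import bionty as bt
    import lamindb as ln
    from lamindb._feature import get_dtype_str_from_dtype, parse_dtype

    dtype_str = get_dtype_str_from_dtype([ln.ULabel, bt.CellType.name])
    # -> "cat[ULabel|bionty.CellType.name]"

    # one dict per categorical component, with keys
    # registry, registry_str, subtype_str, field_str, field
    components = parse_dtype(dtype_str)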
@@ -63,44 +179,64 @@ def convert_pandas_dtype_to_lamin_dtype(pandas_dtype: ExtensionDtype) -> str:
     return dtype
 
 
-def __init__(self, *args, **kwargs):
-    if len(args) == len(self._meta.concrete_fields):
-        super(Feature, self).__init__(*args, **kwargs)
-        return None
+def process_init_feature_param(args, kwargs, is_param: bool = False):
     # now we proceed with the user-facing constructor
     if len(args) != 0:
         raise ValueError("Only keyword args allowed")
-    dtype: type | str = kwargs.pop("dtype") if "dtype" in kwargs else None
-    # cast type
-    if dtype is None:
-        raise ValueError(f"Please pass dtype, one of {FEATURE_DTYPES}")
-    elif dtype is not None:
+    name: str = kwargs.pop("name", None)
+    dtype: type | str | None = kwargs.pop("dtype", None)
+    is_type: bool = kwargs.pop("is_type", None)
+    type_: Feature | str | None = kwargs.pop("type", None)
+    description: str | None = kwargs.pop("description", None)
+    if kwargs:
+        valid_keywords = ", ".join([val[0] for val in _get_record_kwargs(Feature)])
+        raise FieldValidationError(f"Only {valid_keywords} are valid keyword arguments")
+    kwargs["name"] = name
+    kwargs["type"] = type_
+    kwargs["is_type"] = is_type
+    if not is_param:
+        kwargs["description"] = description
+    # cast dtype
+    if dtype is None and not is_type:
+        raise ValidationError(
+            f"Please pass dtype, one of {FEATURE_DTYPES} or a composed categorical dtype"
+        )
+    dtype_str = None
+    if dtype is not None:
         if not isinstance(dtype, str):
             dtype_str = get_dtype_str_from_dtype(dtype)
         else:
             dtype_str = dtype
-        # add validation that a registry actually exists
-        if dtype_str not in FEATURE_DTYPES and not dtype_str.startswith("cat"):
-            raise ValueError(
-                f"dtype is {dtype_str} but has to be one of {FEATURE_DTYPES}!"
-            )
-        if dtype_str != "cat" and dtype_str.startswith("cat"):
-            registries_str = dtype_str.replace("cat[", "").rstrip("]")
-            if registries_str != "":
-                registry_str_list = registries_str.split("|")
-                for registry_str in registry_str_list:
-                    if registry_str not in dict_module_name_to_model_name(Artifact):
-                        raise ValueError(
-                            f"'{registry_str}' is an invalid dtype, pass, e.g. `[ln.ULabel, bt.CellType]` or similar"
-                        )
-        kwargs["dtype"] = dtype_str
+        parse_dtype(dtype_str, is_param=is_param)
+        kwargs["dtype"] = dtype_str
+    return kwargs
+
+
+def __init__(self, *args, **kwargs):
+    if len(args) == len(self._meta.concrete_fields):
+        super(Feature, self).__init__(*args, **kwargs)
+        return None
+    dtype = kwargs.get("dtype", None)
+    default_value = kwargs.pop("default_value", None)
+    nullable = kwargs.pop("nullable", None)
+    cat_filters = kwargs.pop("cat_filters", None)
+    kwargs = process_init_feature_param(args, kwargs)
     super(Feature, self).__init__(*args, **kwargs)
+    self.default_value = default_value
+    self.nullable = nullable
+    dtype_str = kwargs.pop("dtype", None)
+    if cat_filters:
+        assert "|" not in dtype_str  # noqa: S101
+        assert "]]" not in dtype_str  # noqa: S101
+        fill_in = ", ".join(f"{key}='{value}'" for (key, value) in cat_filters.items())
+        dtype_str = dtype_str.replace("]", f"[{fill_in}]]")
+    self.dtype = dtype_str
     if not self._state.adding:
         if not (
-            self.dtype.startswith("cat") if dtype == "cat" else self.dtype == dtype
+            self.dtype.startswith("cat") if dtype == "cat" else self.dtype == dtype_str
         ):
             raise ValidationError(
-                f"Feature {self.name} already exists with dtype {self.dtype}, you passed {dtype}"
+                f"Feature {self.name} already exists with dtype {self.dtype}, you passed {dtype_str}"
             )
 
 
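Per the constructor above, `default_value` and `nullable` are set as plain attributes, while `cat_filters` is folded into the dtype string, turning e.g. `cat[ULabel]` into `cat[ULabel[type='Medium']]`. A minimal sketch (feature name and filter value are hypothetical):

    import lamindb as ln

    medium = ln.Feature(
        name="cell_medium",  # hypothetical name
        dtype=ln.ULabel,
        cat_filters={"type": "Medium"},  # hypothetical filter, serialized into dtype
        nullable=False,
    ).save()
    # per the replace() call above:
    assert medium.dtype == "cat[ULabel[type='Medium']]"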
@@ -138,7 +274,7 @@ def categoricals_from_df(df: pd.DataFrame) -> dict:
 def from_df(cls, df: pd.DataFrame, field: FieldAttr | None = None) -> RecordList:
     """{}"""  # noqa: D415
     field = Feature.name if field is None else field
-    registry = field.field.model
+    registry = field.field.model  # type: ignore
     if registry != Feature:
         raise ValueError("field must be a Feature FieldAttr!")
     categoricals = categoricals_from_df(df)
@@ -149,7 +285,7 @@ def from_df(cls, df: pd.DataFrame, field: FieldAttr | None = None) -> RecordList
         else:
             dtypes[name] = convert_pandas_dtype_to_lamin_dtype(col.dtype)
     with logger.mute():  # silence the warning "loaded record with exact same name "
-        features = [Feature(name=name, dtype=dtype) for name, dtype in dtypes.items()]
+        features = [Feature(name=name, dtype=dtype) for name, dtype in dtypes.items()]  # type: ignore
     assert len(features) == len(df.columns)  # noqa: S101
     return RecordList(features)
 
lamindb/_finish.py CHANGED
@@ -96,7 +96,7 @@ def save_run_logs(run: Run, save_run: bool = False) -> None:
     if logs_path.exists():
         if run.report is not None:
             logger.important("overwriting run.report")
-        artifact = Artifact(
+        artifact = Artifact(  # type: ignore
             logs_path,
             description=f"log streams of run {run.uid}",
             _branch_code=0,
@@ -159,7 +159,7 @@ def notebook_to_report(notebook_path: Path, output_path: Path) -> None:
     output_path.write_text(html, encoding="utf-8")
 
 
-def notebook_to_script(
+def notebook_to_script(  # type: ignore
     transform: Transform, notebook_path: Path, script_path: Path | None = None
 ) -> None | str:
     import jupytext
@@ -207,8 +207,13 @@ def clean_r_notebook_html(file_path: Path) -> tuple[str | None, Path]:
 
 
 def check_filepath_recently_saved(filepath: Path, is_finish_retry: bool) -> bool:
-    recently_saved_time = 3 if not is_finish_retry else 20
+    # the recently_saved_time needs to be very low for the first check
+    # because an accidental save (e.g. via auto-save) might otherwise lead
+    # to upload of an outdated notebook
+    # also see implementation for R notebooks below
+    offset_saved_time = 0.3 if not is_finish_retry else 20
     for retry in range(30):
+        recently_saved_time = offset_saved_time + retry  # sleep time is 1 sec
         if get_seconds_since_modified(filepath) > recently_saved_time:
             if retry == 0:
                 prefix = f"{LEVEL_TO_COLORS[20]}{LEVEL_TO_ICONS[20]}{RESET_COLOR}"
@@ -316,7 +321,8 @@ def save_context_core(
             f"no html report found; to attach one, create an .html export for your {filepath.suffix} file and then run: lamin save {filepath}"
         )
     if report_path is not None and is_r_notebook and not from_cli:  # R notebooks
-        recently_saved_time = 3 if not is_retry else 20
+        # see comment above in check_filepath_recently_saved
+        recently_saved_time = 0.3 if not is_retry else 20
         if get_seconds_since_modified(report_path) > recently_saved_time:
             # the automated retry solution of Jupyter notebooks does not work in RStudio because the execution of the notebook cell
             # seems to block the event loop of the frontend
@@ -365,7 +371,7 @@ def save_context_core(
     artifact = ln.Artifact.filter(hash=hash, _branch_code=0).one_or_none()
     new_env_artifact = artifact is None
     if new_env_artifact:
-        artifact = ln.Artifact(
+        artifact = ln.Artifact(  # type: ignore
             env_path,
             description="requirements.txt",
             _branch_code=0,
@@ -411,7 +417,7 @@ def save_context_core(
         else:
             logger.important("report is already saved")
     else:
-        report_file = ln.Artifact(
+        report_file = ln.Artifact(  # type: ignore
             report_path,
             description=f"Report of run {run.uid}",
             _branch_code=0,  # hidden file