lamindb 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. lamindb/__init__.py +33 -26
  2. lamindb/_finish.py +9 -1
  3. lamindb/_tracked.py +26 -3
  4. lamindb/_view.py +2 -3
  5. lamindb/base/__init__.py +1 -1
  6. lamindb/base/ids.py +1 -10
  7. lamindb/base/users.py +1 -4
  8. lamindb/core/__init__.py +7 -65
  9. lamindb/core/_compat.py +60 -0
  10. lamindb/core/_context.py +50 -22
  11. lamindb/core/_mapped_collection.py +4 -2
  12. lamindb/core/_settings.py +6 -6
  13. lamindb/core/_sync_git.py +1 -1
  14. lamindb/core/_track_environment.py +2 -1
  15. lamindb/core/datasets/_small.py +3 -3
  16. lamindb/core/loaders.py +43 -20
  17. lamindb/core/storage/_anndata_accessor.py +8 -3
  18. lamindb/core/storage/_backed_access.py +14 -7
  19. lamindb/core/storage/_pyarrow_dataset.py +24 -9
  20. lamindb/core/storage/_tiledbsoma.py +8 -6
  21. lamindb/core/storage/_zarr.py +104 -25
  22. lamindb/core/storage/objects.py +63 -28
  23. lamindb/core/storage/paths.py +16 -13
  24. lamindb/core/types.py +10 -0
  25. lamindb/curators/__init__.py +176 -149
  26. lamindb/errors.py +1 -1
  27. lamindb/integrations/_vitessce.py +4 -4
  28. lamindb/migrations/0089_subsequent_runs.py +159 -0
  29. lamindb/migrations/0090_runproject_project_runs.py +73 -0
  30. lamindb/migrations/{0088_squashed.py → 0090_squashed.py} +245 -177
  31. lamindb/models/__init__.py +79 -0
  32. lamindb/{core → models}/_describe.py +3 -3
  33. lamindb/{core → models}/_django.py +8 -5
  34. lamindb/{core → models}/_feature_manager.py +103 -87
  35. lamindb/{_from_values.py → models/_from_values.py} +5 -2
  36. lamindb/{core/versioning.py → models/_is_versioned.py} +94 -6
  37. lamindb/{core → models}/_label_manager.py +10 -17
  38. lamindb/{core/relations.py → models/_relations.py} +8 -1
  39. lamindb/models/artifact.py +2602 -0
  40. lamindb/{_can_curate.py → models/can_curate.py} +349 -180
  41. lamindb/models/collection.py +683 -0
  42. lamindb/models/core.py +135 -0
  43. lamindb/models/feature.py +643 -0
  44. lamindb/models/flextable.py +163 -0
  45. lamindb/{_parents.py → models/has_parents.py} +55 -49
  46. lamindb/models/project.py +384 -0
  47. lamindb/{_query_manager.py → models/query_manager.py} +10 -8
  48. lamindb/{_query_set.py → models/query_set.py} +64 -32
  49. lamindb/models/record.py +1762 -0
  50. lamindb/models/run.py +563 -0
  51. lamindb/{_save.py → models/save.py} +18 -8
  52. lamindb/models/schema.py +732 -0
  53. lamindb/models/transform.py +360 -0
  54. lamindb/models/ulabel.py +249 -0
  55. {lamindb-1.1.0.dist-info → lamindb-1.2.0.dist-info}/METADATA +6 -6
  56. lamindb-1.2.0.dist-info/RECORD +95 -0
  57. lamindb/_artifact.py +0 -1361
  58. lamindb/_collection.py +0 -440
  59. lamindb/_feature.py +0 -316
  60. lamindb/_is_versioned.py +0 -40
  61. lamindb/_record.py +0 -1065
  62. lamindb/_run.py +0 -60
  63. lamindb/_schema.py +0 -347
  64. lamindb/_storage.py +0 -15
  65. lamindb/_transform.py +0 -170
  66. lamindb/_ulabel.py +0 -56
  67. lamindb/_utils.py +0 -9
  68. lamindb/base/validation.py +0 -63
  69. lamindb/core/_data.py +0 -491
  70. lamindb/core/fields.py +0 -12
  71. lamindb/models.py +0 -4435
  72. lamindb-1.1.0.dist-info/RECORD +0 -95
  73. {lamindb-1.1.0.dist-info → lamindb-1.2.0.dist-info}/LICENSE +0 -0
  74. {lamindb-1.1.0.dist-info → lamindb-1.2.0.dist-info}/WHEEL +0 -0
lamindb/_feature.py DELETED
@@ -1,316 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import importlib
4
- from typing import TYPE_CHECKING, Any, get_args
5
-
6
- import lamindb_setup as ln_setup
7
- import pandas as pd
8
- from django.db.models.query_utils import DeferredAttribute
9
- from lamin_utils import logger
10
- from lamindb_setup._init_instance import get_schema_module_name
11
- from lamindb_setup.core._docs import doc_args
12
- from pandas.api.types import CategoricalDtype, is_string_dtype
13
-
14
- from lamindb._record import _get_record_kwargs
15
- from lamindb.base.types import FeatureDtype
16
- from lamindb.errors import FieldValidationError, ValidationError
17
- from lamindb.models import Artifact, Feature, Record, Registry
18
-
19
- from ._query_set import RecordList
20
- from ._utils import attach_func_to_class_method
21
- from .core.relations import dict_module_name_to_model_name
22
-
23
- if TYPE_CHECKING:
24
- from collections.abc import Iterable
25
-
26
- from pandas.core.dtypes.base import ExtensionDtype
27
-
28
- from lamindb.base.types import FieldAttr
29
-
30
-
31
# Dtype names accepted for features, collected from the type arguments of
# FeatureDtype via typing.get_args.
# NOTE(review): this is a mutable module-level set; any function that binds it
# to a local name and mutates it changes it for the whole module.
FEATURE_DTYPES = set(get_args(FeatureDtype))
32
-
33
-
34
def parse_dtype_single_cat(
    dtype_str: str,
    related_registries: dict[str, Record] | None = None,
    is_itype: bool = False,
) -> dict:
    """Parse one categorical dtype component into its registry and field.

    Recognized spellings are ``Registry``, ``Registry.field``,
    ``module.Registry``, ``module.Registry.field`` and the sub-typed forms
    ``Registry[SubType]`` and ``Registry[SubType].field``.

    Args:
        dtype_str: A single component of a composed categorical dtype.
        related_registries: Mapping from registry string to registry class.
            Defaults to ``dict_module_name_to_model_name(Artifact)``.
        is_itype: If ``True``, the registry class is imported from its module
            instead of being looked up in ``related_registries``.

    Returns:
        A dict with the keys ``registry``, ``registry_str``, ``subtype_str``,
        ``field_str`` and ``field``.

    Raises:
        ValidationError: If ``dtype_str`` contains more than one ``[`` or, when
            ``is_itype`` is false, names a registry that is not in
            ``related_registries``.
    """
    assert isinstance(dtype_str, str)  # noqa: S101
    if related_registries is None:
        related_registries = dict_module_name_to_model_name(Artifact)
    split_result = dtype_str.split("[")
    sub_type_str = ""
    if len(split_result) == 2:
        # has sub type: Registry[SubType] or Registry[SubType].field
        registry_str = split_result[0]
        assert "]" in split_result[1]  # noqa: S101
        sub_type_field_split = split_result[1].split("].")
        if len(sub_type_field_split) == 1:
            sub_type_str = sub_type_field_split[0].strip("]")
            field_str = ""
        else:
            sub_type_str = sub_type_field_split[0]
            field_str = sub_type_field_split[1]
    elif len(split_result) == 1:
        registry_field_split = split_result[0].split(".")
        if (
            len(registry_field_split) == 2 and registry_field_split[1][0].isupper()
        ) or len(registry_field_split) == 3:
            # bionty.CellType or bionty.CellType.name
            registry_str = f"{registry_field_split[0]}.{registry_field_split[1]}"
            field_str = (
                "" if len(registry_field_split) == 2 else registry_field_split[2]
            )
        else:
            # ULabel or ULabel.name
            registry_str = registry_field_split[0]
            field_str = (
                "" if len(registry_field_split) == 1 else registry_field_split[1]
            )
    else:
        # more than one "[" matched neither branch above and used to surface as
        # an UnboundLocalError on registry_str further down
        raise ValidationError(
            f"'{dtype_str}' is an invalid dtype, it may contain at most one '[...]' sub type"
        )
    if not is_itype:
        if registry_str not in related_registries:
            raise ValidationError(
                f"'{registry_str}' is an invalid dtype, has to be registry, e.g. ULabel or bionty.CellType"
            )
        registry = related_registries[registry_str]
    else:
        if "." in registry_str:
            registry_str_split = registry_str.split(".")
            assert len(registry_str_split) == 2, registry_str  # noqa: S101
            module_name, class_name = registry_str_split
            module_name = get_schema_module_name(module_name)
        else:
            module_name, class_name = "lamindb", registry_str
        module = importlib.import_module(module_name)
        registry = getattr(module, class_name)
    # TODO: validate that the subtype is a record in the registry with is_type = True
    # TODO: validate that field_str is an actual field of the module
    if field_str == "":
        # fall back to the registry's declared name field, or "name"
        field_str = registry._name_field if hasattr(registry, "_name_field") else "name"
    return {
        "registry": registry,  # should be typed as CanCurate
        "registry_str": registry_str,
        "subtype_str": sub_type_str,
        "field_str": field_str,
        "field": getattr(registry, field_str),
    }
102
-
103
-
104
def parse_dtype(dtype_str: str, is_param: bool = False) -> list[dict[str, str]]:
    """Validate a dtype string and parse composed categorical dtypes.

    Args:
        dtype_str: Either a plain dtype name or a composed categorical of the
            form ``cat[RegistryA|RegistryB.field]``.
        is_param: If ``True``, ``"dict"`` is accepted in addition to the names
            in ``FEATURE_DTYPES``.

    Returns:
        One parsed dict per ``|``-separated component of a composed
        categorical; an empty list for plain dtypes and for ``cat[]``.

    Raises:
        ValueError: If ``dtype_str`` is neither a composed categorical nor an
            allowed plain dtype.
    """
    # Build a local set: adding "dict" to FEATURE_DTYPES itself would leak the
    # param-only dtype into every later call, including is_param=False ones.
    allowed_dtypes = (FEATURE_DTYPES | {"dict"}) if is_param else FEATURE_DTYPES
    is_composed_cat = dtype_str.startswith("cat[") and dtype_str.endswith("]")
    result = []
    if is_composed_cat:
        related_registries = dict_module_name_to_model_name(Artifact)
        # drop only the leading "cat[" and the trailing "]"
        registries_str = dtype_str[len("cat[") : -1]
        if registries_str != "":
            result = [
                parse_dtype_single_cat(cat_single_dtype_str, related_registries)
                for cat_single_dtype_str in registries_str.split("|")
            ]
    elif dtype_str not in allowed_dtypes:
        raise ValueError(
            f"dtype is '{dtype_str}' but has to be one of {allowed_dtypes}!"
        )
    return result
125
-
126
-
127
def get_dtype_str_from_dtype(dtype: Any, is_itype: bool = False) -> str:
    """Turn a dtype object into its string representation.

    A non-list object whose ``__name__`` is in ``FEATURE_DTYPES`` maps to that
    name. Otherwise ``dtype`` must be a registry, a registry field, or a list
    of those; the pieces are joined with ``|`` and, unless ``is_itype`` is
    set, wrapped as ``cat[...]``.

    Raises:
        ValueError: If ``dtype`` or one of its list elements is neither a
            ``Registry`` nor a ``DeferredAttribute``.
    """
    is_plain = (
        not isinstance(dtype, list)
        and hasattr(dtype, "__name__")
        and dtype.__name__ in FEATURE_DTYPES
    )
    if is_plain:
        return dtype.__name__

    error_message = (
        "dtype has to be a record, a record field, or a list of records, not {}"
    )
    if isinstance(dtype, (Registry, DeferredAttribute)):
        dtype = [dtype]
    elif not isinstance(dtype, list):
        raise ValueError(error_message.format(dtype))

    pieces = []
    for single_dtype in dtype:
        if isinstance(single_dtype, Registry):
            pieces.append(single_dtype.__get_name_with_module__() + "|")
        elif isinstance(single_dtype, DeferredAttribute):
            model = single_dtype.field.model
            pieces.append(
                model.__get_name_with_module__()
                + f".{single_dtype.field.name}"
                + "|"
            )
        else:
            raise ValueError(error_message.format(single_dtype))
    # every piece carries a trailing separator; trim the dangling one(s)
    joined = "".join(pieces).rstrip("|")
    return joined if is_itype else f"cat[{joined}]"
162
-
163
-
164
def convert_pandas_dtype_to_lamin_dtype(pandas_dtype: ExtensionDtype) -> str:
    """Map a pandas dtype to the corresponding dtype name.

    Categorical dtypes (string-like or not) map to ``"cat"``, other string
    dtypes to ``"str"``. Any other dtype uses its pandas name with digits
    removed; for ``datetime`` names, everything from the first ``[`` on is
    dropped. The result is asserted to be in ``FEATURE_DTYPES``.
    """
    # there are string-like categoricals and "pure" categoricals (pd.Categorical);
    # both end up as "cat"
    if isinstance(pandas_dtype, CategoricalDtype):
        lamin_dtype = "cat"
    elif is_string_dtype(pandas_dtype):
        lamin_dtype = "str"
    else:
        # strip precision qualifiers
        lamin_dtype = "".join(
            char for char in pandas_dtype.name if not char.isdigit()
        )
        if lamin_dtype.startswith("datetime"):
            lamin_dtype = lamin_dtype.partition("[")[0]
    assert lamin_dtype in FEATURE_DTYPES  # noqa: S101
    return lamin_dtype
180
-
181
-
182
def process_init_feature_param(args, kwargs, is_param: bool = False):
    """Normalize the keyword arguments of the user-facing constructor.

    Pops ``name``, ``dtype``, ``is_type``, ``type`` and ``description`` from
    ``kwargs``, rejects anything left over, and writes the normalized values
    back into the same dict, which is also returned. ``description`` is only
    written back when ``is_param`` is false. A non-string ``dtype`` is
    converted with ``get_dtype_str_from_dtype``; a string ``dtype`` is checked
    with ``parse_dtype``.

    Raises:
        ValueError: If positional arguments are passed.
        FieldValidationError: If unknown keyword arguments are passed.
        ValidationError: If ``dtype`` is missing and ``is_type`` is falsy.
    """
    # now we proceed with the user-facing constructor
    if len(args) > 0:
        raise ValueError("Only keyword args allowed")
    name = kwargs.pop("name", None)
    dtype = kwargs.pop("dtype", None)
    is_type = kwargs.pop("is_type", None)
    type_ = kwargs.pop("type", None)
    description = kwargs.pop("description", None)
    if kwargs:
        valid_keywords = ", ".join([val[0] for val in _get_record_kwargs(Feature)])
        raise FieldValidationError(f"Only {valid_keywords} are valid keyword arguments")
    kwargs.update(name=name, type=type_, is_type=is_type)
    if not is_param:
        kwargs["description"] = description
    # cast dtype
    if dtype is None:
        if not is_type:
            raise ValidationError(
                f"Please pass dtype, one of {FEATURE_DTYPES} or a composed categorical dtype"
            )
        dtype_str = None
    elif isinstance(dtype, str):
        dtype_str = dtype
        parse_dtype(dtype_str, is_param=is_param)
    else:
        dtype_str = get_dtype_str_from_dtype(dtype)
    kwargs["dtype"] = dtype_str
    return kwargs
213
-
214
-
215
def __init__(self, *args, **kwargs):
    # Replacement constructor for Feature (attached to the class at the bottom
    # of this module); no docstring on purpose.
    #
    # When the positional args match the number of concrete fields, defer to
    # the parent constructor unchanged.
    if len(args) == len(self._meta.concrete_fields):
        super(Feature, self).__init__(*args, **kwargs)
        return None
    # keep the dtype exactly as the caller passed it, for the check at the end
    dtype = kwargs.get("dtype", None)
    # these are not handled by process_init_feature_param, so take them out first
    default_value = kwargs.pop("default_value", None)
    nullable = kwargs.pop("nullable", None)
    cat_filters = kwargs.pop("cat_filters", None)
    kwargs = process_init_feature_param(args, kwargs)
    super(Feature, self).__init__(*args, **kwargs)
    self.default_value = default_value
    self.nullable = nullable
    dtype_str = kwargs.pop("dtype", None)
    if cat_filters:
        # filters can only be embedded into a single, not yet filtered registry
        assert "|" not in dtype_str  # noqa: S101
        assert "]]" not in dtype_str  # noqa: S101
        fill_in = ", ".join(f"{key}='{value}'" for (key, value) in cat_filters.items())
        # turns e.g. cat[Registry] into cat[Registry[key='value']]
        dtype_str = dtype_str.replace("]", f"[{fill_in}]]")
        # NOTE(review): the flattened source does not show whether this
        # assignment sits inside the cat_filters branch; it is placed here
        # because the comparison below would otherwise always succeed — confirm
        self.dtype = dtype_str
    if not self._state.adding:
        # a bare "cat" matches any categorical dtype; anything else must match exactly
        if not (
            self.dtype.startswith("cat") if dtype == "cat" else self.dtype == dtype_str
        ):
            raise ValidationError(
                f"Feature {self.name} already exists with dtype {self.dtype}, you passed {dtype_str}"
            )
241
-
242
-
243
def suggest_categorical_for_str_iterable(
    iterable: Iterable[str], key: str | None = None
) -> str:
    """Return a hint to use dtype ``'cat'`` when values repeat.

    Args:
        iterable: The values to inspect.
        key: Optional feature name to mention in the message. ``None`` and
            ``""`` both mean "no name".

    Returns:
        The suggestion message if there are fewer distinct categories than
        values, otherwise an empty string.
    """
    c = pd.Categorical(iterable)
    if len(c.categories) >= len(c):
        return ""
    # The default key is None, which the former `key != ""` test let through,
    # producing "for feature None". Compare explicitly so that falsy but valid
    # keys such as the integer column name 0 are still reported.
    key_note = f" for feature {key}" if key is not None and key != "" else ""
    return f"You have few permissible values{key_note}, consider dtype 'cat' instead of 'str'"
255
-
256
-
257
- def categoricals_from_df(df: pd.DataFrame) -> dict:
258
- """Returns categorical columns."""
259
- string_cols = [col for col in df.columns if is_string_dtype(df[col])]
260
- categoricals = {
261
- col: df[col]
262
- for col in df.columns
263
- if isinstance(df[col].dtype, CategoricalDtype)
264
- }
265
- for key in string_cols:
266
- message = suggest_categorical_for_str_iterable(df[key], key)
267
- if message:
268
- logger.warning(message)
269
- return categoricals
270
-
271
-
272
@classmethod  # type:ignore
@doc_args(Feature.from_df.__doc__)
def from_df(cls, df: pd.DataFrame, field: FieldAttr | None = None) -> RecordList:
    """{}"""  # noqa: D415
    if field is None:
        field = Feature.name
    # only fields that belong to the Feature registry are accepted
    if field.field.model != Feature:  # type: ignore
        raise ValueError("field must be a Feature FieldAttr!")
    categoricals = categoricals_from_df(df)
    # categorical columns become "cat"; every other column is mapped from its pandas dtype
    dtypes = {
        name: (
            "cat"
            if name in categoricals
            else convert_pandas_dtype_to_lamin_dtype(col.dtype)
        )
        for name, col in df.items()
    }
    with logger.mute():  # silence the warning "loaded record with exact same name "
        features = [Feature(name=name, dtype=dtype) for name, dtype in dtypes.items()]  # type: ignore
    assert len(features) == len(df.columns)  # noqa: S101
    return RecordList(features)
291
-
292
-
293
# The docstring below is a "{}" template that doc_args is given
# Feature.save.__doc__ to work with, so it must stay as is.
@doc_args(Feature.save.__doc__)
def save(self, *args, **kwargs) -> Feature:
    """{}"""  # noqa: D415
    # delegate to the parent implementation and return self instead of its result
    super(Feature, self).save(*args, **kwargs)
    return self
298
-
299
-
300
# Names of the module-level functions above that are handed to
# attach_func_to_class_method for Feature.
METHOD_NAMES = [
    "__init__",
    "from_df",
    "save",
]

if ln_setup._TESTING:
    from inspect import signature

    # Capture the signatures found on Feature (all but __init__) before the
    # loop below runs.
    SIGS = {
        name: signature(getattr(Feature, name))
        for name in METHOD_NAMES
        if name != "__init__"
    }

# presumably binds globals()[name] onto Feature under the same name — see
# attach_func_to_class_method in ._utils to confirm
for name in METHOD_NAMES:
    attach_func_to_class_method(name, Feature, globals())
lamindb/_is_versioned.py DELETED
@@ -1,40 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import lamindb_setup as ln_setup
4
- from lamin_utils import logger
5
- from lamindb_setup.core.upath import UPath
6
-
7
- from lamindb.models import IsVersioned
8
-
9
- from ._utils import attach_func_to_class_method
10
- from .core.versioning import create_uid, get_new_path_from_uid
11
-
12
-
13
- # docstring handled through attach_func_to_class_method
14
def _add_to_version_family(self, revises: IsVersioned, version: str | None = None):
    # Gives this record a new uid obtained from create_uid(revises, version),
    # stores the passed version on it and saves it. For an Artifact whose key
    # is virtual, the stored object is first renamed to the path derived from
    # the new uid.
    previous_uid = self.uid
    new_uid, revises = create_uid(revises=revises, version=version)
    is_virtual_artifact = (
        self.__class__.__name__ == "Artifact" and self._key_is_virtual
    )
    if is_virtual_artifact:
        source_path = self.path
        target_path = get_new_path_from_uid(
            old_path=source_path, old_uid=previous_uid, new_uid=new_uid
        )
        renamed_path = UPath(source_path).rename(target_path)
        logger.success(f"updated path from {source_path} to {renamed_path}!")
    self.uid = new_uid
    self.version = version
    self.save()
    logger.success(f"updated uid from {previous_uid} to {new_uid}!")
28
-
29
-
30
# Names of the module-level functions above that are handed to
# attach_func_to_class_method for IsVersioned.
METHOD_NAMES = [
    "_add_to_version_family",
]

if ln_setup._TESTING:  # type: ignore
    from inspect import signature

    # Capture the signatures found on IsVersioned before the loop below runs.
    SIGS = {name: signature(getattr(IsVersioned, name)) for name in METHOD_NAMES}

# presumably binds globals()[name] onto IsVersioned under the same name — see
# attach_func_to_class_method in ._utils to confirm
for name in METHOD_NAMES:
    attach_func_to_class_method(name, IsVersioned, globals())