lamindb 1.0.4__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +14 -5
- lamindb/_artifact.py +174 -57
- lamindb/_can_curate.py +27 -8
- lamindb/_collection.py +85 -51
- lamindb/_feature.py +177 -41
- lamindb/_finish.py +222 -81
- lamindb/_from_values.py +83 -98
- lamindb/_parents.py +4 -4
- lamindb/_query_set.py +59 -17
- lamindb/_record.py +171 -53
- lamindb/_run.py +4 -4
- lamindb/_save.py +33 -10
- lamindb/_schema.py +135 -38
- lamindb/_storage.py +1 -1
- lamindb/_tracked.py +106 -0
- lamindb/_transform.py +21 -8
- lamindb/_ulabel.py +5 -14
- lamindb/base/validation.py +2 -6
- lamindb/core/__init__.py +13 -14
- lamindb/core/_context.py +39 -36
- lamindb/core/_data.py +29 -25
- lamindb/core/_describe.py +1 -1
- lamindb/core/_django.py +1 -1
- lamindb/core/_feature_manager.py +54 -44
- lamindb/core/_label_manager.py +4 -4
- lamindb/core/_mapped_collection.py +20 -7
- lamindb/core/datasets/__init__.py +6 -1
- lamindb/core/datasets/_core.py +12 -11
- lamindb/core/datasets/_small.py +66 -20
- lamindb/core/exceptions.py +1 -90
- lamindb/core/loaders.py +7 -13
- lamindb/core/relations.py +6 -4
- lamindb/core/storage/_anndata_accessor.py +41 -0
- lamindb/core/storage/_backed_access.py +2 -2
- lamindb/core/storage/_pyarrow_dataset.py +25 -15
- lamindb/core/storage/_tiledbsoma.py +56 -12
- lamindb/core/storage/paths.py +41 -22
- lamindb/core/subsettings/_creation_settings.py +4 -16
- lamindb/curators/__init__.py +2168 -833
- lamindb/curators/_cellxgene_schemas/__init__.py +26 -0
- lamindb/curators/_cellxgene_schemas/schema_versions.yml +104 -0
- lamindb/errors.py +96 -0
- lamindb/integrations/_vitessce.py +3 -3
- lamindb/migrations/0069_squashed.py +76 -75
- lamindb/migrations/0075_lamindbv1_part5.py +4 -5
- lamindb/migrations/0082_alter_feature_dtype.py +21 -0
- lamindb/migrations/0083_alter_feature_is_type_alter_flextable_is_type_and_more.py +94 -0
- lamindb/migrations/0084_alter_schemafeature_feature_and_more.py +35 -0
- lamindb/migrations/0085_alter_feature_is_type_alter_flextable_is_type_and_more.py +63 -0
- lamindb/migrations/0086_various.py +95 -0
- lamindb/migrations/0087_rename__schemas_m2m_artifact_feature_sets_and_more.py +41 -0
- lamindb/migrations/0088_schema_components.py +273 -0
- lamindb/migrations/0088_squashed.py +4372 -0
- lamindb/models.py +423 -156
- {lamindb-1.0.4.dist-info → lamindb-1.1.0.dist-info}/METADATA +10 -7
- lamindb-1.1.0.dist-info/RECORD +95 -0
- lamindb/curators/_spatial.py +0 -528
- lamindb/migrations/0052_squashed.py +0 -1261
- lamindb/migrations/0053_alter_featureset_hash_alter_paramvalue_created_by_and_more.py +0 -57
- lamindb/migrations/0054_alter_feature_previous_runs_and_more.py +0 -35
- lamindb/migrations/0055_artifact_type_artifactparamvalue_and_more.py +0 -61
- lamindb/migrations/0056_rename_ulabel_ref_is_name_artifactulabel_label_ref_is_name_and_more.py +0 -22
- lamindb/migrations/0057_link_models_latest_report_and_others.py +0 -356
- lamindb/migrations/0058_artifact__actions_collection__actions.py +0 -22
- lamindb/migrations/0059_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -31
- lamindb/migrations/0060_alter_artifact__actions.py +0 -22
- lamindb/migrations/0061_alter_collection_meta_artifact_alter_run_environment_and_more.py +0 -45
- lamindb/migrations/0062_add_is_latest_field.py +0 -32
- lamindb/migrations/0063_populate_latest_field.py +0 -45
- lamindb/migrations/0064_alter_artifact_version_alter_collection_version_and_more.py +0 -33
- lamindb/migrations/0065_remove_collection_feature_sets_and_more.py +0 -22
- lamindb/migrations/0066_alter_artifact__feature_values_and_more.py +0 -352
- lamindb/migrations/0067_alter_featurevalue_unique_together_and_more.py +0 -20
- lamindb/migrations/0068_alter_artifactulabel_unique_together_and_more.py +0 -20
- lamindb/migrations/0069_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -1294
- lamindb-1.0.4.dist-info/RECORD +0 -102
- {lamindb-1.0.4.dist-info → lamindb-1.1.0.dist-info}/LICENSE +0 -0
- {lamindb-1.0.4.dist-info → lamindb-1.1.0.dist-info}/WHEEL +0 -0
lamindb/_collection.py
CHANGED
@@ -15,33 +15,35 @@ from lamin_utils import logger
 from lamindb_setup.core._docs import doc_args
 from lamindb_setup.core.hashing import hash_set
 
-from lamindb.models import (
-    Collection,
-    CollectionArtifact,
-    Schema,
-)
-
 from ._parents import view_lineage
-from ._record import init_self_from_db, update_attributes
+from ._record import _get_record_kwargs, init_self_from_db, update_attributes
 from ._utils import attach_func_to_class_method
 from .core._data import (
     _track_run_input,
     describe,
     get_run,
     save_schema_links,
-
+    save_staged_feature_sets,
 )
 from .core._mapped_collection import MappedCollection
-from .core.
+from .core.storage._pyarrow_dataset import _is_pyarrow_dataset, _open_pyarrow_dataset
 from .core.versioning import process_revises
-from .
+from .errors import FieldValidationError
+from .models import (
+    Artifact,
+    Collection,
+    CollectionArtifact,
+    Run,
+    Schema,
+)
 
 if TYPE_CHECKING:
     from collections.abc import Iterable
 
-    from
+    from pyarrow.dataset import Dataset as PyArrowDataset
 
     from ._query_set import QuerySet
+    from .core.storage import UPath
 
 
 class CollectionFeatureManager:
@@ -50,15 +52,15 @@ class CollectionFeatureManager:
     def __init__(self, collection: Collection):
         self._collection = collection
 
-    def
-        links_schema_artifact = Artifact.
+    def _get_staged_feature_sets_union(self) -> dict[str, Schema]:
+        links_schema_artifact = Artifact.feature_sets.through.objects.filter(
             artifact_id__in=self._collection.artifacts.values_list("id", flat=True)
         )
-
+        feature_sets_by_slots = defaultdict(list)
         for link in links_schema_artifact:
-
-
-            for slot, schema_ids_slot in
+            feature_sets_by_slots[link.slot].append(link.schema_id)
+        feature_sets_union = {}
+        for slot, schema_ids_slot in feature_sets_by_slots.items():
             schema_1 = Schema.get(id=schema_ids_slot[0])
             related_name = schema_1._get_related_name()
             features_registry = getattr(Schema, related_name).field.model
@@ -73,8 +75,8 @@
                .distinct()
            )
            features = features_registry.filter(id__in=feature_ids)
-
-        return
+            feature_sets_union[slot] = Schema(features, dtype=schema_1.dtype)
+        return feature_sets_union
 
 
 def __init__(
@@ -92,23 +94,16 @@ def __init__(
     artifacts: Artifact | Iterable[Artifact] = (
         kwargs.pop("artifacts") if len(args) == 0 else args[0]
     )
-    meta_artifact: Artifact | None = (
-
-    )
-
-
-
-    )
-
-
-
-    )
-    run: Run | None = kwargs.pop("run") if "run" in kwargs else None
-    revises: Collection | None = kwargs.pop("revises") if "revises" in kwargs else None
-    version: str | None = kwargs.pop("version") if "version" in kwargs else None
-    _branch_code: int | None = (
-        kwargs.pop("_branch_code") if "_branch_code" in kwargs else 1
-    )
+    meta_artifact: Artifact | None = kwargs.pop("meta_artifact", None)
+    tmp_key: str | None = kwargs.pop("key", None)
+    description: str | None = kwargs.pop("description", None)
+    reference: str | None = kwargs.pop("reference", None)
+    reference_type: str | None = kwargs.pop("reference_type", None)
+    run: Run | None = kwargs.pop("run", None)
+    revises: Collection | None = kwargs.pop("revises", None)
+    version: str | None = kwargs.pop("version", None)
+    _branch_code: int | None = kwargs.pop("_branch_code", 1)
+    key: str
     if "name" in kwargs:
         key = kwargs.pop("name")
         warnings.warn(
@@ -116,9 +111,16 @@ def __init__(
             FutureWarning,
             stacklevel=2,
         )
+    else:
+        key = tmp_key
     if not len(kwargs) == 0:
-
-
+        valid_keywords = ", ".join([val[0] for val in _get_record_kwargs(Collection)])
+        raise FieldValidationError(
+            f"Only {valid_keywords} can be passed, you passed: {kwargs}"
+        )
+    if revises is None:
+        revises = (
+            Collection.filter(key=key, is_latest=True).order_by("-created_at").first()
        )
     provisional_uid, version, key, description, revises = process_revises(
         revises, version, key, description, Collection
@@ -162,11 +164,8 @@ def __init__(
         init_self_from_db(collection, existing_collection)
         update_attributes(collection, {"description": description, "key": key})
     else:
-
-
-        if revises is not None and key == revises.key:
-            settings.creation.search_names = False
-        super(Collection, collection).__init__(
+        _skip_validation = revises is not None and key == revises.key
+        super(Collection, collection).__init__(  # type: ignore
             uid=provisional_uid,
             key=key,
             description=description,
@@ -178,9 +177,8 @@ def __init__(
             version=version,
             _branch_code=_branch_code,
             revises=revises,
-
+            _skip_validation=_skip_validation,
         )
-        settings.creation.search_names = search_names_setting
         collection._artifacts = artifacts
     # register provenance
     if revises is not None:
@@ -190,8 +188,9 @@ def __init__(
 
 # docstring handled through attach_func_to_class_method
 def append(self, artifact: Artifact, run: Run | None = None) -> Collection:
-    return Collection(
+    return Collection(  # type: ignore
         self.artifacts.all().list() + [artifact],
+        # key is automatically taken from revises.key
         description=self.description,
         revises=self,
         run=run,
@@ -218,13 +217,46 @@ def from_artifacts(artifacts: Iterable[Artifact]) -> tuple[str, dict[str, str]]:
     return hash
 
 
+# docstring handled through attach_func_to_class_method
+def open(self, is_run_input: bool | None = None) -> PyArrowDataset:
+    if self._state.adding:
+        artifacts = self._artifacts
+        logger.warning("the collection isn't saved, consider calling `.save()`")
+    else:
+        artifacts = self.ordered_artifacts.all()
+    paths = [artifact.path for artifact in artifacts]
+    # this checks that the filesystem is the same for all paths
+    # this is a requirement of pyarrow.dataset.dataset
+    fs = paths[0].fs
+    for path in paths[1:]:
+        # this assumes that the filesystems are cached by fsspec
+        if path.fs is not fs:
+            raise ValueError(
+                "The collection has artifacts with different filesystems, this is not supported."
+            )
+    if not _is_pyarrow_dataset(paths):
+        suffixes = {path.suffix for path in paths}
+        suffixes_str = ", ".join(suffixes)
+        err_msg = "This collection is not compatible with pyarrow.dataset.dataset(), "
+        err_msg += (
+            f"the artifacts have incompatible file types: {suffixes_str}"
+            if len(suffixes) > 1
+            else f"the file type {suffixes_str} is not supported by pyarrow."
+        )
+        raise ValueError(err_msg)
+    dataset = _open_pyarrow_dataset(paths)
+    # track only if successful
+    _track_run_input(self, is_run_input)
+    return dataset
+
+
 # docstring handled through attach_func_to_class_method
 def mapped(
     self,
     layers_keys: str | list[str] | None = None,
     obs_keys: str | list[str] | None = None,
     obsm_keys: str | list[str] | None = None,
-    obs_filter: dict[str, str |
+    obs_filter: dict[str, str | list[str]] | None = None,
     join: Literal["inner", "outer"] | None = "inner",
     encode_labels: bool | list[str] = True,
     unknown_label: str | dict[str, str] | None = None,
@@ -237,12 +269,12 @@ def mapped(
     path_list = []
     if self._state.adding:
         artifacts = self._artifacts
-        logger.warning("
+        logger.warning("the collection isn't saved, consider calling `.save()`")
     else:
         artifacts = self.ordered_artifacts.all()
     for artifact in artifacts:
         if artifact.suffix not in {".h5ad", ".zarr"}:
-            logger.warning(f"
+            logger.warning(f"ignoring artifact with suffix {artifact.suffix}")
             continue
         elif not stream:
             path_list.append(artifact.cache())
@@ -335,14 +367,14 @@ def save(self, using: str | None = None) -> Collection:
     if self.meta_artifact is not None:
         self.meta_artifact.save()
     # we don't need to save feature sets again
-
+    save_staged_feature_sets(self)
     super(Collection, self).save()
     # we don't allow updating the collection of artifacts
     # if users want to update the set of artifacts, they
     # have to create a new collection
     if hasattr(self, "_artifacts"):
         links = [
-            CollectionArtifact(collection_id=self.id, artifact_id=artifact.id)
+            CollectionArtifact(collection_id=self.id, artifact_id=artifact.id)  # type: ignore
             for artifact in self._artifacts
         ]
         # the below seems to preserve the order of the list in the
@@ -380,6 +412,7 @@ def data_artifact(self) -> Artifact | None:
 METHOD_NAMES = [
     "__init__",
     "append",
+    "open",
     "mapped",
     "cache",
     "load",
@@ -400,6 +433,7 @@ if ln_setup._TESTING:
 for name in METHOD_NAMES:
     attach_func_to_class_method(name, Collection, globals())
 
+# mypy: ignore-errors
 Collection.ordered_artifacts = ordered_artifacts
 Collection.data_artifact = data_artifact
 Collection.describe = describe
lamindb/_feature.py
CHANGED
@@ -1,16 +1,20 @@
 from __future__ import annotations
 
+import importlib
 from typing import TYPE_CHECKING, Any, get_args
 
 import lamindb_setup as ln_setup
 import pandas as pd
+from django.db.models.query_utils import DeferredAttribute
 from lamin_utils import logger
+from lamindb_setup._init_instance import get_schema_module_name
 from lamindb_setup.core._docs import doc_args
 from pandas.api.types import CategoricalDtype, is_string_dtype
 
+from lamindb._record import _get_record_kwargs
 from lamindb.base.types import FeatureDtype
-from lamindb.
-from lamindb.models import Artifact, Feature, Record
+from lamindb.errors import FieldValidationError, ValidationError
+from lamindb.models import Artifact, Feature, Record, Registry
 
 from ._query_set import RecordList
 from ._utils import attach_func_to_class_method
@@ -27,21 +31,133 @@ if TYPE_CHECKING:
 FEATURE_DTYPES = set(get_args(FeatureDtype))
 
 
-def
-
+def parse_dtype_single_cat(
+    dtype_str: str,
+    related_registries: dict[str, Record] | None = None,
+    is_itype: bool = False,
+) -> dict:
+    assert isinstance(dtype_str, str)  # noqa: S101
+    if related_registries is None:
+        related_registries = dict_module_name_to_model_name(Artifact)
+    split_result = dtype_str.split("[")
+    # has sub type
+    sub_type_str = ""
+    if len(split_result) == 2:
+        registry_str = split_result[0]
+        assert "]" in split_result[1]  # noqa: S101
+        sub_type_field_split = split_result[1].split("].")
+        if len(sub_type_field_split) == 1:
+            sub_type_str = sub_type_field_split[0].strip("]")
+            field_str = ""
+        else:
+            sub_type_str = sub_type_field_split[0]
+            field_str = sub_type_field_split[1]
+    elif len(split_result) == 1:
+        registry_field_split = split_result[0].split(".")
+        if (
+            len(registry_field_split) == 2 and registry_field_split[1][0].isupper()
+        ) or len(registry_field_split) == 3:
+            # bionty.CellType or bionty.CellType.name
+            registry_str = f"{registry_field_split[0]}.{registry_field_split[1]}"
+            field_str = (
+                "" if len(registry_field_split) == 2 else registry_field_split[2]
+            )
+        else:
+            # ULabel or ULabel.name
+            registry_str = registry_field_split[0]
+            field_str = (
+                "" if len(registry_field_split) == 1 else registry_field_split[1]
+            )
+    if not is_itype:
+        if registry_str not in related_registries:
+            raise ValidationError(
+                f"'{registry_str}' is an invalid dtype, has to be registry, e.g. ULabel or bionty.CellType"
+            )
+        registry = related_registries[registry_str]
+    else:
+        if "." in registry_str:
+            registry_str_split = registry_str.split(".")
+            assert len(registry_str_split) == 2, registry_str  # noqa: S101
+            module_name, class_name = registry_str_split
+            module_name = get_schema_module_name(module_name)
+        else:
+            module_name, class_name = "lamindb", registry_str
+        module = importlib.import_module(module_name)
+        registry = getattr(module, class_name)
+    if sub_type_str != "":
+        pass
+        # validate that the subtype is a record in the registry with is_type = True
+    if field_str != "":
+        pass
+        # validate that field_str is an actual field of the module
+    else:
+        field_str = registry._name_field if hasattr(registry, "_name_field") else "name"
+    return {
+        "registry": registry,  # should be typed as CanCurate
+        "registry_str": registry_str,
+        "subtype_str": sub_type_str,
+        "field_str": field_str,
+        "field": getattr(registry, field_str),
+    }
+
+
+def parse_dtype(dtype_str: str, is_param: bool = False) -> list[dict[str, str]]:
+    allowed_dtypes = FEATURE_DTYPES
+    if is_param:
+        allowed_dtypes.add("dict")
+    is_composed_cat = dtype_str.startswith("cat[") and dtype_str.endswith("]")
+    result = []
+    if is_composed_cat:
+        related_registries = dict_module_name_to_model_name(Artifact)
+        registries_str = dtype_str.replace("cat[", "")[:-1]  # strip last ]
+        if registries_str != "":
+            registry_str_list = registries_str.split("|")
+            for cat_single_dtype_str in registry_str_list:
+                single_result = parse_dtype_single_cat(
+                    cat_single_dtype_str, related_registries
+                )
+                result.append(single_result)
+    elif dtype_str not in allowed_dtypes:
+        raise ValueError(
+            f"dtype is '{dtype_str}' but has to be one of {FEATURE_DTYPES}!"
+        )
+    return result
+
+
+def get_dtype_str_from_dtype(dtype: Any, is_itype: bool = False) -> str:
+    if (
+        not isinstance(dtype, list)
+        and hasattr(dtype, "__name__")
+        and dtype.__name__ in FEATURE_DTYPES
+    ):
         dtype_str = dtype.__name__
     else:
-        error_message =
-
+        error_message = (
+            "dtype has to be a record, a record field, or a list of records, not {}"
+        )
+        if isinstance(dtype, Registry):
+            dtype = [dtype]
+        elif isinstance(dtype, DeferredAttribute):
             dtype = [dtype]
         elif not isinstance(dtype, list):
-            raise ValueError(error_message)
-
-        for
-            if not
-
-
-
+            raise ValueError(error_message.format(dtype))
+        dtype_str = ""
+        for single_dtype in dtype:
+            if not isinstance(single_dtype, Registry) and not isinstance(
+                single_dtype, DeferredAttribute
+            ):
+                raise ValueError(error_message.format(single_dtype))
+            if isinstance(single_dtype, Registry):
+                dtype_str += single_dtype.__get_name_with_module__() + "|"
+            else:
+                dtype_str += (
+                    single_dtype.field.model.__get_name_with_module__()
+                    + f".{single_dtype.field.name}"
+                    + "|"
                )
+        dtype_str = dtype_str.rstrip("|")
+        if not is_itype:
+            dtype_str = f"cat[{dtype_str}]"
     return dtype_str
 
 
@@ -63,44 +179,64 @@ def convert_pandas_dtype_to_lamin_dtype(pandas_dtype: ExtensionDtype) -> str:
     return dtype
 
 
-def
-    if len(args) == len(self._meta.concrete_fields):
-        super(Feature, self).__init__(*args, **kwargs)
-        return None
+def process_init_feature_param(args, kwargs, is_param: bool = False):
     # now we proceed with the user-facing constructor
     if len(args) != 0:
         raise ValueError("Only keyword args allowed")
-
-
-
-
-
+    name: str = kwargs.pop("name", None)
+    dtype: type | str | None = kwargs.pop("dtype", None)
+    is_type: bool = kwargs.pop("is_type", None)
+    type_: Feature | str | None = kwargs.pop("type", None)
+    description: str | None = kwargs.pop("description", None)
+    if kwargs:
+        valid_keywords = ", ".join([val[0] for val in _get_record_kwargs(Feature)])
+        raise FieldValidationError(f"Only {valid_keywords} are valid keyword arguments")
+    kwargs["name"] = name
+    kwargs["type"] = type_
+    kwargs["is_type"] = is_type
+    if not is_param:
+        kwargs["description"] = description
+    # cast dtype
+    if dtype is None and not is_type:
+        raise ValidationError(
+            f"Please pass dtype, one of {FEATURE_DTYPES} or a composed categorical dtype"
+        )
+    dtype_str = None
+    if dtype is not None:
         if not isinstance(dtype, str):
             dtype_str = get_dtype_str_from_dtype(dtype)
         else:
             dtype_str = dtype
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        kwargs["dtype"] = dtype_str
+        parse_dtype(dtype_str, is_param=is_param)
+        kwargs["dtype"] = dtype_str
+    return kwargs
+
+
+def __init__(self, *args, **kwargs):
+    if len(args) == len(self._meta.concrete_fields):
+        super(Feature, self).__init__(*args, **kwargs)
+        return None
+    dtype = kwargs.get("dtype", None)
+    default_value = kwargs.pop("default_value", None)
+    nullable = kwargs.pop("nullable", None)
+    cat_filters = kwargs.pop("cat_filters", None)
+    kwargs = process_init_feature_param(args, kwargs)
     super(Feature, self).__init__(*args, **kwargs)
+    self.default_value = default_value
+    self.nullable = nullable
+    dtype_str = kwargs.pop("dtype", None)
+    if cat_filters:
+        assert "|" not in dtype_str  # noqa: S101
+        assert "]]" not in dtype_str  # noqa: S101
+        fill_in = ", ".join(f"{key}='{value}'" for (key, value) in cat_filters.items())
+        dtype_str = dtype_str.replace("]", f"[{fill_in}]]")
+    self.dtype = dtype_str
     if not self._state.adding:
         if not (
-            self.dtype.startswith("cat") if dtype == "cat" else self.dtype ==
+            self.dtype.startswith("cat") if dtype == "cat" else self.dtype == dtype_str
         ):
             raise ValidationError(
-                f"Feature {self.name} already exists with dtype {self.dtype}, you passed {
+                f"Feature {self.name} already exists with dtype {self.dtype}, you passed {dtype_str}"
             )
 
 
@@ -138,7 +274,7 @@ def categoricals_from_df(df: pd.DataFrame) -> dict:
 def from_df(cls, df: pd.DataFrame, field: FieldAttr | None = None) -> RecordList:
     """{}"""  # noqa: D415
     field = Feature.name if field is None else field
-    registry = field.field.model
+    registry = field.field.model  # type: ignore
     if registry != Feature:
         raise ValueError("field must be a Feature FieldAttr!")
     categoricals = categoricals_from_df(df)
@@ -149,7 +285,7 @@ def from_df(cls, df: pd.DataFrame, field: FieldAttr | None = None) -> RecordList
     else:
         dtypes[name] = convert_pandas_dtype_to_lamin_dtype(col.dtype)
     with logger.mute():  # silence the warning "loaded record with exact same name "
-        features = [Feature(name=name, dtype=dtype) for name, dtype in dtypes.items()]
+        features = [Feature(name=name, dtype=dtype) for name, dtype in dtypes.items()]  # type: ignore
     assert len(features) == len(df.columns)  # noqa: S101
     return RecordList(features)
 
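Taken together, the _feature.py changes centralize dtype-string parsing in parse_dtype() and parse_dtype_single_cat() and extend the Feature constructor with default_value, nullable, and cat_filters. A minimal sketch of the new constructor surface, inferred from the diff above; the registry and filter values are illustrative, not taken from the source:

import lamindb as ln

# a categorical feature whose dtype string is validated by the new parse_dtype()
perturbation = ln.Feature(name="perturbation", dtype="cat[ULabel]")

# cat_filters rewrites the dtype into a filtered form, per the replace() logic above:
# "cat[ULabel]" -> "cat[ULabel[type='treatment']]"
treatment = ln.Feature(
    name="treatment",
    dtype="cat[ULabel]",
    cat_filters={"type": "treatment"},  # illustrative filter field and value
    nullable=True,
)

Per the constructor code, cat_filters only supports a single-registry dtype (the asserts reject "|" unions and already-nested "]]" forms).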