lamindb 0.71.2__py3-none-any.whl → 0.72.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +2 -2
- lamindb/_annotate.py +6 -10
- lamindb/_artifact.py +24 -10
- lamindb/_can_validate.py +9 -3
- lamindb/_collection.py +7 -7
- lamindb/_feature.py +53 -45
- lamindb/_feature_set.py +37 -74
- lamindb/_from_values.py +27 -8
- lamindb/_query_manager.py +6 -1
- lamindb/_registry.py +60 -100
- lamindb/_run.py +0 -2
- lamindb/_save.py +28 -11
- lamindb/core/__init__.py +4 -0
- lamindb/core/_data.py +56 -30
- lamindb/core/_feature_manager.py +159 -64
- lamindb/core/_label_manager.py +53 -38
- lamindb/core/_run_context.py +24 -1
- lamindb/core/datasets/_core.py +10 -18
- lamindb/core/schema.py +53 -0
- {lamindb-0.71.2.dist-info → lamindb-0.72.0.dist-info}/METADATA +7 -6
- {lamindb-0.71.2.dist-info → lamindb-0.72.0.dist-info}/RECORD +23 -22
- {lamindb-0.71.2.dist-info → lamindb-0.72.0.dist-info}/LICENSE +0 -0
- {lamindb-0.71.2.dist-info → lamindb-0.72.0.dist-info}/WHEEL +0 -0
lamindb/__init__.py
CHANGED
@@ -41,7 +41,7 @@ Modules & settings:
|
|
41
41
|
"""
|
42
42
|
|
43
43
|
# denote a release candidate for 0.1.0 with 0.1rc1, 0.1a1, 0.1b1, etc.
|
44
|
-
__version__ = "0.71.2"
|
44
|
+
__version__ = "0.72.0"
|
45
45
|
|
46
46
|
import os as _os
|
47
47
|
|
@@ -72,6 +72,7 @@ if _check_instance_setup(from_lamindb=True):
|
|
72
72
|
User,
|
73
73
|
)
|
74
74
|
|
75
|
+
from . import core # isort: split
|
75
76
|
from . import (
|
76
77
|
_annotate,
|
77
78
|
_artifact,
|
@@ -86,7 +87,6 @@ if _check_instance_setup(from_lamindb=True):
|
|
86
87
|
_storage,
|
87
88
|
_transform,
|
88
89
|
_ulabel,
|
89
|
-
core,
|
90
90
|
)
|
91
91
|
|
92
92
|
dev = core # backward compat
|
lamindb/_annotate.py
CHANGED
@@ -9,18 +9,14 @@ from lamin_utils import colors, logger
|
|
9
9
|
from lamindb_setup.core._docs import doc_args
|
10
10
|
from lnschema_core import Artifact, Collection, Feature, Registry, Run, ULabel
|
11
11
|
|
12
|
+
from .core.exceptions import ValidationError
|
13
|
+
|
12
14
|
if TYPE_CHECKING:
|
13
15
|
from lamindb_setup.core.types import UPathStr
|
14
16
|
from lnschema_core.types import FieldAttr
|
15
17
|
from mudata import MuData
|
16
18
|
|
17
19
|
|
18
|
-
class ValidationError(ValueError):
|
19
|
-
"""Validation error."""
|
20
|
-
|
21
|
-
pass
|
22
|
-
|
23
|
-
|
24
20
|
class AnnotateLookup:
|
25
21
|
"""Lookup categories from the reference instance."""
|
26
22
|
|
@@ -566,7 +562,7 @@ class MuDataAnnotator:
|
|
566
562
|
save_function="add_new_from_var_index",
|
567
563
|
using=self._using,
|
568
564
|
validated_only=validated_only,
|
569
|
-
|
565
|
+
dtype="number",
|
570
566
|
**kwargs,
|
571
567
|
)
|
572
568
|
|
@@ -1034,7 +1030,7 @@ def update_registry(
|
|
1034
1030
|
validated_only: bool = True,
|
1035
1031
|
df: pd.DataFrame | None = None,
|
1036
1032
|
organism: str | None = None,
|
1037
|
-
|
1033
|
+
dtype: str | None = None,
|
1038
1034
|
**kwargs,
|
1039
1035
|
) -> None:
|
1040
1036
|
"""Save features or labels records in the default instance from the using instance.
|
@@ -1048,7 +1044,7 @@ def update_registry(
|
|
1048
1044
|
validated_only: If True, only save validated labels.
|
1049
1045
|
df: A DataFrame to save labels from.
|
1050
1046
|
organism: The organism name.
|
1051
|
-
|
1047
|
+
dtype: The type of the feature.
|
1052
1048
|
kwargs: Additional keyword arguments to pass to the registry model to create new records.
|
1053
1049
|
"""
|
1054
1050
|
from lamindb._save import save as ln_save
|
@@ -1102,7 +1098,7 @@ def update_registry(
|
|
1102
1098
|
for value in labels_saved["without reference"]:
|
1103
1099
|
filter_kwargs[field.field.name] = value
|
1104
1100
|
if registry == Feature:
|
1105
|
-
filter_kwargs["
|
1101
|
+
filter_kwargs["dtype"] = "cat" if dtype is None else dtype
|
1106
1102
|
non_validated_records.append(registry(**filter_kwargs, **kwargs))
|
1107
1103
|
ln_save(non_validated_records)
|
1108
1104
|
|
lamindb/_artifact.py
CHANGED
@@ -1,12 +1,15 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
import os
|
3
4
|
import shutil
|
5
|
+
from concurrent.futures import ThreadPoolExecutor
|
4
6
|
from pathlib import Path, PurePath, PurePosixPath
|
5
7
|
from typing import TYPE_CHECKING, Any, Mapping
|
6
8
|
|
7
9
|
import fsspec
|
8
10
|
import lamindb_setup as ln_setup
|
9
11
|
import pandas as pd
|
12
|
+
import psutil
|
10
13
|
from anndata import AnnData
|
11
14
|
from lamin_utils import colors, logger
|
12
15
|
from lamindb_setup import settings as setup_settings
|
@@ -204,15 +207,26 @@ def get_stat_or_artifact(
|
|
204
207
|
return size, hash, hash_type, n_objects
|
205
208
|
else:
|
206
209
|
if path.is_dir():
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
210
|
+
files = (subpath for subpath in path.rglob("*") if subpath.is_file())
|
211
|
+
|
212
|
+
def hash_size(file):
|
213
|
+
file_size = file.stat().st_size
|
214
|
+
return hash_file(file, file_size)[0], file_size
|
215
|
+
|
216
|
+
try:
|
217
|
+
n_workers = len(psutil.Process().cpu_affinity())
|
218
|
+
except AttributeError:
|
219
|
+
n_workers = psutil.cpu_count()
|
220
|
+
if n_workers > 1:
|
221
|
+
with ThreadPoolExecutor(n_workers) as pool:
|
222
|
+
hashes_sizes = pool.map(hash_size, files)
|
223
|
+
else:
|
224
|
+
hashes_sizes = map(hash_size, files)
|
225
|
+
hashes, sizes = zip(*hashes_sizes)
|
226
|
+
|
227
|
+
hash, hash_type = hash_md5s_from_dir(hashes)
|
228
|
+
n_objects = len(hashes)
|
229
|
+
size = sum(sizes)
|
216
230
|
else:
|
217
231
|
hash, hash_type = hash_file(path)
|
218
232
|
size = stat.st_size
|
@@ -335,7 +349,7 @@ def get_artifact_kwargs_from_data(
|
|
335
349
|
# save the information that this artifact was previously
|
336
350
|
# produced by another run
|
337
351
|
if artifact.run is not None:
|
338
|
-
artifact.run.
|
352
|
+
artifact.run.output_artifacts_with_later_updates.add(artifact)
|
339
353
|
# update the run of the artifact with the latest run
|
340
354
|
stat_or_artifact.run = run
|
341
355
|
stat_or_artifact.transform = run.transform
|
lamindb/_can_validate.py
CHANGED
@@ -80,7 +80,9 @@ def _inspect(
|
|
80
80
|
|
81
81
|
# inspect in the DB
|
82
82
|
result_db = inspect(
|
83
|
-
df=_filter_query_based_on_organism(
|
83
|
+
df=_filter_query_based_on_organism(
|
84
|
+
queryset=queryset, field=field, organism=organism
|
85
|
+
),
|
84
86
|
identifiers=values,
|
85
87
|
field=field,
|
86
88
|
mute=mute,
|
@@ -161,6 +163,7 @@ def _validate(
|
|
161
163
|
field_values = pd.Series(
|
162
164
|
_filter_query_based_on_organism(
|
163
165
|
queryset=queryset,
|
166
|
+
field=field,
|
164
167
|
organism=organism,
|
165
168
|
values_list_field=field,
|
166
169
|
),
|
@@ -284,7 +287,9 @@ def _standardize(
|
|
284
287
|
|
285
288
|
try:
|
286
289
|
orm._meta.get_field(synonyms_field)
|
287
|
-
df = _filter_query_based_on_organism(
|
290
|
+
df = _filter_query_based_on_organism(
|
291
|
+
queryset=queryset, field=field, organism=organism
|
292
|
+
)
|
288
293
|
except FieldDoesNotExist:
|
289
294
|
df = pd.DataFrame()
|
290
295
|
|
@@ -439,6 +444,7 @@ def _check_synonyms_field_exist(record: Registry):
|
|
439
444
|
|
440
445
|
def _filter_query_based_on_organism(
|
441
446
|
queryset: QuerySet,
|
447
|
+
field: str,
|
442
448
|
organism: str | Registry | None = None,
|
443
449
|
values_list_field: str | None = None,
|
444
450
|
):
|
@@ -447,7 +453,7 @@ def _filter_query_based_on_organism(
|
|
447
453
|
|
448
454
|
orm = queryset.model
|
449
455
|
|
450
|
-
if _has_organism_field(orm):
|
456
|
+
if _has_organism_field(orm) and not field.endswith("id"):
|
451
457
|
# here, we can safely import lnschema_bionty
|
452
458
|
from lnschema_bionty._bionty import create_or_get_organism_record
|
453
459
|
|
lamindb/_collection.py
CHANGED
@@ -103,9 +103,9 @@ def __init__(
|
|
103
103
|
if meta._state.adding:
|
104
104
|
raise ValueError("Save meta artifact before creating collection!")
|
105
105
|
if not feature_sets:
|
106
|
-
feature_sets = meta.features.
|
106
|
+
feature_sets = meta.features.feature_set_by_slot
|
107
107
|
else:
|
108
|
-
if len(meta.features.
|
108
|
+
if len(meta.features.feature_set_by_slot) > 0:
|
109
109
|
logger.info("overwriting feature sets linked to artifact")
|
110
110
|
# we ignore collections in trash containing the same hash
|
111
111
|
if hash is not None:
|
@@ -121,7 +121,7 @@ def __init__(
|
|
121
121
|
# save the information that this artifact was previously
|
122
122
|
# produced by another run
|
123
123
|
if existing_collection.run is not None:
|
124
|
-
existing_collection.run.
|
124
|
+
existing_collection.run.output_collections_with_later_updates.add(
|
125
125
|
existing_collection
|
126
126
|
)
|
127
127
|
# update the run of the artifact with the latest run
|
@@ -129,7 +129,7 @@ def __init__(
|
|
129
129
|
existing_collection.transform = run.transform
|
130
130
|
init_self_from_db(collection, existing_collection)
|
131
131
|
update_attributes(collection, {"description": description, "name": name})
|
132
|
-
for slot, feature_set in collection.features.
|
132
|
+
for slot, feature_set in collection.features.feature_set_by_slot.items():
|
133
133
|
if slot in feature_sets:
|
134
134
|
if not feature_sets[slot] == feature_set:
|
135
135
|
collection.feature_sets.remove(feature_set)
|
@@ -177,7 +177,7 @@ def from_artifacts(artifacts: Iterable[Artifact]) -> tuple[str, dict[str, str]]:
|
|
177
177
|
feature_sets_by_slots = defaultdict(list)
|
178
178
|
logger.debug("slots")
|
179
179
|
for link in feature_set_artifact_links:
|
180
|
-
feature_sets_by_slots[link.slot].append(link.
|
180
|
+
feature_sets_by_slots[link.slot].append(link.featureset_id)
|
181
181
|
feature_sets_union = {}
|
182
182
|
logger.debug("union")
|
183
183
|
for slot, feature_set_ids_slot in feature_sets_by_slots.items():
|
@@ -197,7 +197,7 @@ def from_artifacts(artifacts: Iterable[Artifact]) -> tuple[str, dict[str, str]]:
|
|
197
197
|
)
|
198
198
|
start_time = logger.debug("done, start evaluate", time=start_time)
|
199
199
|
features = features_registry.filter(id__in=feature_ids)
|
200
|
-
feature_sets_union[slot] = FeatureSet(features,
|
200
|
+
feature_sets_union[slot] = FeatureSet(features, dtype=feature_set_1.dtype)
|
201
201
|
start_time = logger.debug("done", time=start_time)
|
202
202
|
# validate consistency of hashes
|
203
203
|
# we do not allow duplicate hashes
|
@@ -361,7 +361,7 @@ def restore(self) -> None:
|
|
361
361
|
@doc_args(Collection.artifacts.__doc__)
|
362
362
|
def artifacts(self) -> QuerySet:
|
363
363
|
"""{}."""
|
364
|
-
return self.unordered_artifacts.order_by("
|
364
|
+
return self.unordered_artifacts.order_by("collection_links__id")
|
365
365
|
|
366
366
|
|
367
367
|
METHOD_NAMES = [
|
lamindb/_feature.py
CHANGED
@@ -1,26 +1,29 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
from typing import TYPE_CHECKING
|
3
|
+
from typing import TYPE_CHECKING
|
4
4
|
|
5
5
|
import lamindb_setup as ln_setup
|
6
6
|
import pandas as pd
|
7
7
|
from lamindb_setup.core._docs import doc_args
|
8
|
-
from lnschema_core.models import
|
8
|
+
from lnschema_core.models import Artifact, Feature
|
9
9
|
from pandas.api.types import CategoricalDtype, is_string_dtype
|
10
10
|
|
11
11
|
from lamindb._utils import attach_func_to_class_method
|
12
12
|
from lamindb.core._settings import settings
|
13
13
|
|
14
14
|
from ._query_set import RecordsList
|
15
|
+
from .core.schema import dict_schema_name_to_model_name
|
15
16
|
|
16
17
|
if TYPE_CHECKING:
|
17
18
|
from lnschema_core.types import FieldAttr
|
18
19
|
|
19
20
|
FEATURE_TYPES = {
|
20
|
-
"
|
21
|
-
"
|
22
|
-
"
|
23
|
-
"
|
21
|
+
"number": "number",
|
22
|
+
"int": "int",
|
23
|
+
"float": "float",
|
24
|
+
"bool": "bool",
|
25
|
+
"str": "cat",
|
26
|
+
"object": "cat",
|
24
27
|
}
|
25
28
|
|
26
29
|
|
@@ -28,10 +31,8 @@ def convert_numpy_dtype_to_lamin_feature_type(dtype) -> str:
|
|
28
31
|
orig_type = dtype.name
|
29
32
|
# strip precision qualifiers
|
30
33
|
type = "".join(i for i in orig_type if not i.isdigit())
|
31
|
-
if type == "
|
32
|
-
type = "
|
33
|
-
elif type == "object" or type == "str":
|
34
|
-
type = "category"
|
34
|
+
if type == "object" or type == "str":
|
35
|
+
type = "cat"
|
35
36
|
return type
|
36
37
|
|
37
38
|
|
@@ -42,38 +43,44 @@ def __init__(self, *args, **kwargs):
|
|
42
43
|
# now we proceed with the user-facing constructor
|
43
44
|
if len(args) != 0:
|
44
45
|
raise ValueError("Only non-keyword args allowed")
|
45
|
-
|
46
|
-
kwargs.pop("type") if "type" in kwargs else None
|
47
|
-
)
|
48
|
-
registries: list[Registry] | None = (
|
49
|
-
kwargs.pop("registries") if "registries" in kwargs else None
|
50
|
-
)
|
46
|
+
dtype: type | str = kwargs.pop("dtype") if "dtype" in kwargs else None
|
51
47
|
# cast type
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
48
|
+
if dtype is None:
|
49
|
+
raise ValueError("Please pass a type!")
|
50
|
+
elif dtype is not None:
|
51
|
+
if not isinstance(dtype, str):
|
52
|
+
if not isinstance(dtype, list) and dtype.__name__ in FEATURE_TYPES:
|
53
|
+
dtype_str = FEATURE_TYPES[dtype.__name__]
|
54
|
+
else:
|
55
|
+
if not isinstance(dtype, list):
|
56
|
+
raise ValueError("dtype has to be a list of Registry types")
|
57
|
+
registries_str = ""
|
58
|
+
for cls in dtype:
|
59
|
+
if not hasattr(cls, "__get_name_with_schema__"):
|
60
|
+
raise ValueError(
|
61
|
+
"each element of the list has to be a Registry"
|
62
|
+
)
|
63
|
+
registries_str += cls.__get_name_with_schema__() + "|"
|
64
|
+
dtype_str = f'cat[{registries_str.rstrip("|")}]'
|
67
65
|
else:
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
66
|
+
dtype_str = dtype
|
67
|
+
# add validation that a registry actually exists
|
68
|
+
if dtype_str not in FEATURE_TYPES.values() and not dtype_str.startswith(
|
69
|
+
"cat"
|
70
|
+
):
|
71
|
+
raise ValueError(
|
72
|
+
f"dtype is {dtype_str} but has to be one of 'number', 'int', 'float', 'cat', 'bool', 'cat[...]'!"
|
73
|
+
)
|
74
|
+
if dtype_str != "cat" and dtype_str.startswith("cat"):
|
75
|
+
registries_str = dtype_str.replace("cat[", "").rstrip("]")
|
76
|
+
if registries_str != "":
|
77
|
+
registry_str_list = registries_str.split("|")
|
78
|
+
for registry_str in registry_str_list:
|
79
|
+
if registry_str not in dict_schema_name_to_model_name(Artifact):
|
80
|
+
raise ValueError(
|
81
|
+
f"'{registry_str}' is an invalid dtype, pass, e.g. `[ln.ULabel, bt.CellType]` or similar"
|
82
|
+
)
|
83
|
+
kwargs["dtype"] = dtype_str
|
77
84
|
super(Feature, self).__init__(*args, **kwargs)
|
78
85
|
|
79
86
|
|
@@ -99,11 +106,11 @@ def from_df(cls, df: pd.DataFrame, field: FieldAttr | None = None) -> RecordsLis
|
|
99
106
|
field = Feature.name if field is None else field
|
100
107
|
categoricals = categoricals_from_df(df)
|
101
108
|
|
102
|
-
|
109
|
+
dtypes = {}
|
103
110
|
# categoricals_with_unmapped_categories = {} # type: ignore
|
104
111
|
for name, col in df.items():
|
105
112
|
if name in categoricals:
|
106
|
-
|
113
|
+
dtypes[name] = "cat"
|
107
114
|
# below is a harder feature to write, now, because it requires to
|
108
115
|
# query the link tables between the label Registry and file or collection
|
109
116
|
# the original implementation fell short
|
@@ -117,7 +124,7 @@ def from_df(cls, df: pd.DataFrame, field: FieldAttr | None = None) -> RecordsLis
|
|
117
124
|
# feature=name
|
118
125
|
# ).inspect(categories, "name", logging=False)["not_mapped"]
|
119
126
|
else:
|
120
|
-
|
127
|
+
dtypes[name] = convert_numpy_dtype_to_lamin_feature_type(col.dtype)
|
121
128
|
|
122
129
|
# silence the warning "loaded record with exact same name "
|
123
130
|
verbosity = settings.verbosity
|
@@ -128,7 +135,7 @@ def from_df(cls, df: pd.DataFrame, field: FieldAttr | None = None) -> RecordsLis
|
|
128
135
|
if registry != Feature:
|
129
136
|
raise ValueError("field must be a Feature FieldAttr!")
|
130
137
|
# create records for all features including non-validated
|
131
|
-
features = [Feature(name=name,
|
138
|
+
features = [Feature(name=name, dtype=dtype) for name, dtype in dtypes.items()]
|
132
139
|
finally:
|
133
140
|
settings.verbosity = verbosity
|
134
141
|
|
@@ -174,9 +181,10 @@ def from_df(cls, df: pd.DataFrame, field: FieldAttr | None = None) -> RecordsLis
|
|
174
181
|
|
175
182
|
|
176
183
|
@doc_args(Feature.save.__doc__)
|
177
|
-
def save(self, *args, **kwargs) ->
|
184
|
+
def save(self, *args, **kwargs) -> Feature:
|
178
185
|
"""{}."""
|
179
186
|
super(Feature, self).save(*args, **kwargs)
|
187
|
+
return self
|
180
188
|
|
181
189
|
|
182
190
|
METHOD_NAMES = [
|
lamindb/_feature_set.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
from typing import TYPE_CHECKING, Iterable
|
3
|
+
from typing import TYPE_CHECKING, Iterable, Type
|
4
4
|
|
5
5
|
import lamindb_setup as ln_setup
|
6
6
|
import numpy as np
|
@@ -14,6 +14,11 @@ from lamindb._utils import attach_func_to_class_method
|
|
14
14
|
|
15
15
|
from ._feature import convert_numpy_dtype_to_lamin_feature_type
|
16
16
|
from ._registry import init_self_from_db
|
17
|
+
from .core.exceptions import ValidationError
|
18
|
+
from .core.schema import (
|
19
|
+
dict_related_model_to_related_name,
|
20
|
+
get_related_name,
|
21
|
+
)
|
17
22
|
|
18
23
|
if TYPE_CHECKING:
|
19
24
|
import pandas as pd
|
@@ -21,57 +26,7 @@ if TYPE_CHECKING:
|
|
21
26
|
from ._query_set import QuerySet
|
22
27
|
|
23
28
|
NUMBER_TYPE = "number"
|
24
|
-
|
25
|
-
|
26
|
-
def dict_related_model_to_related_name(orm):
|
27
|
-
d: dict = {
|
28
|
-
i.related_model.__get_name_with_schema__(): i.related_name
|
29
|
-
for i in orm._meta.related_objects
|
30
|
-
if i.related_name is not None
|
31
|
-
}
|
32
|
-
d.update(
|
33
|
-
{
|
34
|
-
i.related_model.__get_name_with_schema__(): i.name
|
35
|
-
for i in orm._meta.many_to_many
|
36
|
-
if i.name is not None
|
37
|
-
}
|
38
|
-
)
|
39
|
-
|
40
|
-
return d
|
41
|
-
|
42
|
-
|
43
|
-
def dict_schema_name_to_model_name(orm):
|
44
|
-
d: dict = {
|
45
|
-
i.related_model.__get_name_with_schema__(): i.related_model
|
46
|
-
for i in orm._meta.related_objects
|
47
|
-
if i.related_name is not None
|
48
|
-
}
|
49
|
-
d.update(
|
50
|
-
{
|
51
|
-
i.related_model.__get_name_with_schema__(): i.related_model
|
52
|
-
for i in orm._meta.many_to_many
|
53
|
-
if i.name is not None
|
54
|
-
}
|
55
|
-
)
|
56
|
-
|
57
|
-
return d
|
58
|
-
|
59
|
-
|
60
|
-
def get_related_name(features_type: Registry):
|
61
|
-
candidates = [
|
62
|
-
field.related_name
|
63
|
-
for field in FeatureSet._meta.related_objects
|
64
|
-
if field.related_model == features_type
|
65
|
-
]
|
66
|
-
if not candidates:
|
67
|
-
raise ValueError(
|
68
|
-
f"Can't create feature sets from {features_type.__name__} because it's not"
|
69
|
-
" related to it!\nYou need to create a link model between FeatureSet and"
|
70
|
-
" your Registry in your custom schema.\nTo do so, add a"
|
71
|
-
" line:\nfeature_sets = models.ManyToMany(FeatureSet,"
|
72
|
-
" related_name='mythings')\n"
|
73
|
-
)
|
74
|
-
return candidates[0]
|
29
|
+
DICT_KEYS_TYPE = type({}.keys()) # type: ignore
|
75
30
|
|
76
31
|
|
77
32
|
def validate_features(features: list[Registry]) -> Registry:
|
@@ -106,14 +61,14 @@ def __init__(self, *args, **kwargs):
|
|
106
61
|
if len(args) > 1:
|
107
62
|
raise ValueError("Only one non-keyword arg allowed: features")
|
108
63
|
features: Iterable[Registry] = kwargs.pop("features") if len(args) == 0 else args[0]
|
109
|
-
|
64
|
+
dtype: str | None = kwargs.pop("dtype") if "dtype" in kwargs else None
|
110
65
|
name: str | None = kwargs.pop("name") if "name" in kwargs else None
|
111
66
|
if len(kwargs) > 0:
|
112
67
|
raise ValueError("Only features, type, name are valid keyword arguments")
|
113
68
|
# now code
|
114
69
|
features_registry = validate_features(features)
|
115
|
-
if
|
116
|
-
|
70
|
+
if dtype is None:
|
71
|
+
dtype = None if features_registry == Feature else NUMBER_TYPE
|
117
72
|
n_features = len(features)
|
118
73
|
features_hash = hash_set({feature.uid for feature in features})
|
119
74
|
feature_set = FeatureSet.filter(hash=features_hash).one_or_none()
|
@@ -128,7 +83,7 @@ def __init__(self, *args, **kwargs):
|
|
128
83
|
super(FeatureSet, self).__init__(
|
129
84
|
uid=ids.base62_20(),
|
130
85
|
name=name,
|
131
|
-
|
86
|
+
dtype=get_type_str(dtype),
|
132
87
|
n=n_features,
|
133
88
|
registry=features_registry.__get_name_with_schema__(),
|
134
89
|
hash=hash,
|
@@ -144,13 +99,11 @@ def save(self, *args, **kwargs) -> None:
|
|
144
99
|
getattr(self, related_name).set(records)
|
145
100
|
|
146
101
|
|
147
|
-
def get_type_str(
|
148
|
-
if
|
149
|
-
type_str =
|
102
|
+
def get_type_str(dtype: str | None) -> str | None:
|
103
|
+
if dtype is not None:
|
104
|
+
type_str = dtype.__name__ if not isinstance(dtype, str) else dtype # type: ignore
|
150
105
|
else:
|
151
106
|
type_str = None
|
152
|
-
if type == "int" or type == "float":
|
153
|
-
type_str = NUMBER_TYPE
|
154
107
|
return type_str
|
155
108
|
|
156
109
|
|
@@ -165,7 +118,8 @@ def from_values(
|
|
165
118
|
mute: bool = False,
|
166
119
|
organism: Registry | str | None = None,
|
167
120
|
public_source: Registry | None = None,
|
168
|
-
|
121
|
+
raise_validation_error: bool = True,
|
122
|
+
) -> FeatureSet:
|
169
123
|
"""{}."""
|
170
124
|
if not isinstance(field, FieldAttr):
|
171
125
|
raise TypeError(
|
@@ -173,16 +127,25 @@ def from_values(
|
|
173
127
|
)
|
174
128
|
if len(values) == 0:
|
175
129
|
raise ValueError("Provide a list of at least one value")
|
130
|
+
if isinstance(values, DICT_KEYS_TYPE):
|
131
|
+
values = list(values)
|
176
132
|
registry = field.field.model
|
177
133
|
if registry != Feature and type is None:
|
178
134
|
type = NUMBER_TYPE
|
179
135
|
logger.debug("setting feature set to 'number'")
|
180
136
|
validated = registry.validate(values, field=field, mute=mute, organism=organism)
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
137
|
+
values_array = np.array(values)
|
138
|
+
validated_values = values_array[validated]
|
139
|
+
if validated.sum() != len(values):
|
140
|
+
not_validated_values = values_array[~validated]
|
141
|
+
msg = (
|
142
|
+
f"These values could not be validated: {not_validated_values.tolist()}\n"
|
143
|
+
f"If there are no typos, add them to their registry: {registry}"
|
144
|
+
)
|
145
|
+
if raise_validation_error:
|
146
|
+
raise ValidationError(msg)
|
147
|
+
elif len(validated_values) == 0:
|
148
|
+
return None # temporarily return None here
|
186
149
|
validated_features = registry.from_values(
|
187
150
|
validated_values,
|
188
151
|
field=field,
|
@@ -192,7 +155,7 @@ def from_values(
|
|
192
155
|
feature_set = FeatureSet(
|
193
156
|
features=validated_features,
|
194
157
|
name=name,
|
195
|
-
|
158
|
+
dtype=get_type_str(type),
|
196
159
|
)
|
197
160
|
return feature_set
|
198
161
|
|
@@ -217,12 +180,12 @@ def from_df(
|
|
217
180
|
return None
|
218
181
|
if registry == Feature:
|
219
182
|
validated_features = Feature.from_df(df.loc[:, validated])
|
220
|
-
feature_set = FeatureSet(validated_features, name=name,
|
183
|
+
feature_set = FeatureSet(validated_features, name=name, dtype=None)
|
221
184
|
else:
|
222
185
|
dtypes = [col.dtype for (_, col) in df.loc[:, validated].items()]
|
223
186
|
if len(set(dtypes)) != 1:
|
224
187
|
raise ValueError(f"data types are heterogeneous: {set(dtypes)}")
|
225
|
-
|
188
|
+
dtype = convert_numpy_dtype_to_lamin_feature_type(dtypes[0])
|
226
189
|
validated_features = registry.from_values(
|
227
190
|
df.columns[validated],
|
228
191
|
field=field,
|
@@ -232,7 +195,7 @@ def from_df(
|
|
232
195
|
feature_set = FeatureSet(
|
233
196
|
features=validated_features,
|
234
197
|
name=name,
|
235
|
-
|
198
|
+
dtype=get_type_str(dtype),
|
236
199
|
)
|
237
200
|
return feature_set
|
238
201
|
|
@@ -246,14 +209,14 @@ def members(self) -> QuerySet:
|
|
246
209
|
# need to fix this
|
247
210
|
return self._features[1]
|
248
211
|
related_name = self._get_related_name()
|
212
|
+
if related_name is None:
|
213
|
+
related_name = "features"
|
249
214
|
return self.__getattribute__(related_name).all()
|
250
215
|
|
251
216
|
|
252
217
|
def _get_related_name(self: FeatureSet) -> str:
|
253
|
-
key_split = self.registry.split(".")
|
254
|
-
orm_name_with_schema = f"{key_split[0]}.{key_split[1]}"
|
255
218
|
feature_sets_related_models = dict_related_model_to_related_name(self)
|
256
|
-
related_name = feature_sets_related_models.get(
|
219
|
+
related_name = feature_sets_related_models.get(self.registry)
|
257
220
|
return related_name
|
258
221
|
|
259
222
|
|