lamindb 0.49.3__py3-none-any.whl → 0.50.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- lamindb/__init__.py +55 -15
- lamindb/_context.py +25 -25
- lamindb/_delete.py +8 -8
- lamindb/_feature.py +15 -11
- lamindb/_feature_set.py +70 -39
- lamindb/_file.py +80 -56
- lamindb/_filter.py +5 -5
- lamindb/_from_values.py +55 -92
- lamindb/{_manager.py → _query_manager.py} +8 -5
- lamindb/{_queryset.py → _query_set.py} +31 -28
- lamindb/{_orm.py → _registry.py} +53 -294
- lamindb/_save.py +14 -13
- lamindb/_synonym.py +203 -0
- lamindb/_validate.py +134 -0
- lamindb/_view.py +15 -9
- lamindb/dev/__init__.py +13 -6
- lamindb/dev/_data.py +195 -0
- lamindb/dev/_feature_manager.py +102 -0
- lamindb/dev/_settings.py +10 -9
- lamindb/dev/_view_parents.py +36 -17
- lamindb/dev/datasets/__init__.py +5 -3
- lamindb/dev/datasets/_core.py +35 -17
- lamindb/dev/exc.py +4 -0
- lamindb/dev/storage/_backed_access.py +53 -17
- lamindb/dev/storage/file.py +44 -15
- {lamindb-0.49.3.dist-info → lamindb-0.50.1.dist-info}/METADATA +34 -36
- lamindb-0.50.1.dist-info/RECORD +47 -0
- lamindb/_feature_manager.py +0 -237
- lamindb-0.49.3.dist-info/RECORD +0 -43
- {lamindb-0.49.3.dist-info → lamindb-0.50.1.dist-info}/LICENSE +0 -0
- {lamindb-0.49.3.dist-info → lamindb-0.50.1.dist-info}/WHEEL +0 -0
- {lamindb-0.49.3.dist-info → lamindb-0.50.1.dist-info}/entry_points.txt +0 -0
lamindb/__init__.py
CHANGED
@@ -1,29 +1,67 @@
-"""Open-source data
+"""Open-source data platform for biology.
 
-
+LaminDB helps you manage data using registries.
+The two most central are:
 
-
+.. autosummary::
+   :toctree: .
+
+   File
+   Dataset
+
+
+.. dropdown:: With more detail, what are files & datasets?
+
+   Both files & datasets
+
+   - track numerical & categorical data batches of arbitrary format & size
+
+   - can validate & link features (the measured dimensions in a data batch)
+
+   Roughly,
+
+   - a file stores a single immutable batch of data
+
+   - a dataset stores a mutable collection of data batches
+
+   Examples:
+
+   - Blob-like immutable files (pdf, txt, csv, jpg, ...) or arrays (h5, h5ad,
+     ...) → :class:`~lamindb.File`
+
+   - Mutable streamable backends (DuckDB, zarr, TileDB, ...) → :class:`~lamindb.Dataset` wrapping :class:`~lamindb.File`
 
-
+   - Collections of files → :class:`~lamindb.Dataset` wrapping :class:`~lamindb.File`
 
-
+   - Datasets in BigQuery, Snowflake, Postgres, ... → :class:`~lamindb.Dataset` (not yet implemented)
 
-
-to streamable storage backends (HDF5, DuckDB, zarr, TileDB, etc.).
+   Hence, while
 
-
+   - files *always* have a one-to-one correspondence with a storage accessor
+
+   - datasets *can* reference a single file, multiple files or a dataset
+     in a warehouse like BigQuery or Snowflake
+
+
+There are four registries to track provenance of data batches:
 
 .. autosummary::
    :toctree: .
 
-
-
+   User
+   Storage
    Transform
+   Run
+
+And four registries to validate & contextualize measurements in data batches:
+
+.. autosummary::
+   :toctree: .
+
    Label
    Feature
    FeatureSet
    Modality
-   User
-   Storage
-   Run
 
 Functional tools:
 
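Reading the new docstring, a minimal usage sketch of the File/Dataset split could look as follows. This is not taken from the diff: the path and names are hypothetical, and it assumes a configured instance and that `ln.Dataset` can be constructed from a file plus a name, which the docstring implies but this diff does not show:

```python
import lamindb as ln

# a File tracks one immutable data batch in a storage location
file = ln.File("data/batch1.h5ad", description="one immutable batch")  # hypothetical path
file.save()

# a Dataset wraps one or several files into a mutable collection
dataset = ln.Dataset(file, name="my-collection")  # assumed constructor
dataset.save()
```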
@@ -47,7 +85,7 @@ Static classes & modules:
 
 """
 
-__version__ = "0.
+__version__ = "0.50.1"  # denote a release candidate for 0.1.0 with 0.1rc1
 
 import os as _os
 
@@ -107,11 +145,13 @@ if _INSTANCE_SETUP:
     from . import _feature_set  # noqa
     from . import _file  # noqa
     from . import _label  # noqa
-    from . import
+    from . import _registry  # noqa
     from . import _storage  # noqa
+    from . import _synonym  # noqa
     from . import _transform  # noqa
+    from . import _validate  # noqa
     from ._delete import delete  # noqa
-    from .
+    from ._registry import select_backward as select  # noqa
     from ._save import save  # noqa
     from ._view import view  # noqa
     from .dev._settings import settings
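Note the compatibility shim in the import block: `select_backward` is re-exported as `select`, which appears to keep the pre-0.50 query entry point working after the `_orm` → `_registry` move. A sketch of what that implies (the query itself is hypothetical):

```python
import lamindb as ln

# old-style query via the backward-compatibility alias
users = ln.select(ln.User).all()  # roughly ln.User.filter().all() in the new API
```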
lamindb/_context.py
CHANGED
@@ -15,16 +15,16 @@ from lnschema_core.types import TransformType
 is_run_from_ipython = getattr(builtins, "__IPYTHON__", False)
 
 msg_path_failed = (
-    "
+    "failed to infer notebook path.\nfix: either track manually via"
     " `ln.track(ln.Transform(name='My notebook'))` or pass"
-    " `notebook_path` to ln.track()
+    " `notebook_path` to ln.track()"
 )
 
 msg_manual_init = (
-    "\n(1)
-    "\n(2)
+    "\n(1) save your notebook!"
+    "\n(2) attach metadata to the notebook by running the CLI:\n"
     "lamin track {notebook_path}"
-    "\n(3)
+    "\n(3) reload or re-open your notebook"
 )
 
 
@@ -167,8 +167,8 @@ class run_context:
         install[jupyter]`, you can simply call:
 
         >>> ln.track()
-        ✅
-        ✅
+        ✅ saved: Transform(id=1LCd8kco9lZUBg, name=Track data lineage / provenance, short_name=02-data-lineage, stem_id=1LCd8kco9lZU, version=0, type=notebook, updated_at=2023-07-10 18:37:19, created_by_id=DzTjkKse)  # noqa
+        ✅ saved: Run(id=pHgVICV9DxBaV6BAuKJl, run_at=2023-07-10 18:37:19, transform_id=1LCd8kco9lZUBg, created_by_id=DzTjkKse)  # noqa
         >>> ln.context.transform
         Transform(id=1LCd8kco9lZUBg, name=Track data lineage / provenance, short_name=02-data-lineage, stem_id=1LCd8kco9lZU, version=0, type=notebook, updated_at=2023-07-10 18:37:19, created_by_id=DzTjkKse)  # noqa
         >>> ln.context.run
@@ -180,8 +180,8 @@ class run_context:
         >>> ln.Transform(name="Cell Ranger", version="7.2.0", type="pipeline").save()
         >>> transform = ln.Transform.filter(name="Cell Ranger", version="7.2.0").one()
         >>> ln.track(transform)
-        💬
-        ✅
+        💬 loaded: Transform(id=ceHkZMaiHFdoB6, name=Cell Ranger, stem_id=ceHkZMaiHFdo, version=7.2.0, type=pipeline, updated_at=2023-07-10 18:37:19, created_by_id=DzTjkKse)  # noqa
+        ✅ saved: Run(id=RcpWIKC8cF74Pn3RUJ1W, run_at=2023-07-10 18:37:19, transform_id=ceHkZMaiHFdoB6, created_by_id=DzTjkKse)  # noqa
         >>> ln.context.transform
         Transform(id=ceHkZMaiHFdoB6, name=Cell Ranger, stem_id=ceHkZMaiHFdo, version=7.2.0, type=pipeline, updated_at=2023-07-10 18:37:19, created_by_id=DzTjkKse)  # noqa
         >>> ln.context.run
@@ -204,20 +204,20 @@ class run_context:
             except Exception as e:
                 if isinstance(e, ImportError):
                     logger.info(
-                        "
-                        "notebook!\
+                        "it looks like you are running ln.track() from a "
+                        "notebook!\nplease install nbproject: pip install nbproject"
                     )
                 elif isinstance(e, UpdateNbWithNonInteractiveEditorError):
                     raise e
                 elif isinstance(e, (NotebookNotSavedError, NoTitleError)):
                     raise e
                 else:
-                    logger.warning(f"
+                    logger.warning(f"automatic tracking of notebook failed: {e}")
                     is_tracked_notebook = False
 
             if not is_tracked_notebook:
                 logger.warning(
-                    "
+                    "no automatic metadata detection, consider passing transform"
                 )
                 return None
         else:
@@ -227,10 +227,10 @@ class run_context:
             transform_exists = Transform.filter(id=transform.id).first()
             if transform_exists is None:
                 transform.save()
-                logger.
+                logger.save(f"saved: {transform}")
                 transform_exists = transform
             else:
-                logger.
+                logger.success(f"loaded: {transform_exists}")
             cls.transform = transform_exists
 
         if new_run is None:  # for notebooks, default to loading latest runs
@@ -248,22 +248,22 @@ class run_context:
         if run is not None:  # loaded latest run
             run.run_at = datetime.now(timezone.utc)  # update run time
             run.save()
-            logger.
+            logger.success(f"loaded: {run}")
 
         if run is None:  # create new run
             run = ln.Run(transform=cls.transform)
             run.save()
-            logger.
+            logger.save(f"saved: {run}")
         cls.run = run
 
         # at this point, we have a transform can display its parents if there are any
         parents = cls.transform.parents.all() if cls.transform is not None else []
         if len(parents) > 0:
             if len(parents) == 1:
-                logger.info(f"
+                logger.info(f" parent transform: {parents[0]}")
             else:
                 parents_formatted = "\n - ".join([f"{parent}" for parent in parents])
-                logger.info(f"
+                logger.info(f" parent transforms:\n - {parents_formatted}")
 
         # only for newly intialized notebooks
         if hasattr(cls, "_notebook_meta"):
@@ -347,11 +347,11 @@ class run_context:
 
             dm = DisplayMeta(metadata)
             logger.info(
-                "
+                "notebook imports:"
                 f" {' '.join(dm.pypackage(infer_pypackages(nb, pin_versions=True)))}"  # noqa
             )
         except Exception:
-            logger.debug("
+            logger.debug("inferring imported packages failed")
             pass
 
         if needs_init:
@@ -405,9 +405,9 @@ class run_context:
                 type=TransformType.notebook,
             )
             transform.save()
-            logger.
+            logger.save(f"saved: {transform}")
         else:
-            logger.
+            logger.success(f"loaded: {transform}")
             if transform.name != title or transform.short_name != filestem:
                 response = input(
                     "Updated notebook name and/or title: Do you want to assign a"
@@ -427,9 +427,9 @@ class run_context:
                 transform.short_name = filestem
                 transform.save()
                 if response == "y":
-                    logger.
+                    logger.save(f"saved: {transform}")
                 else:
-                    logger.success(f"
+                    logger.success(f"updated: {transform}")
 
         cls.transform = transform
 
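A pattern runs through all of these hunks: log messages now lead with the action, and two logger levels separate the cases — `logger.save` for records that were just written, `logger.success` for records loaded from the database (both from `lamin_utils`, as the diff itself shows). A condensed sketch of the convention, with an illustrative Transform:

```python
import lamindb as ln
from lamin_utils import logger

transform = ln.Transform(name="Cell Ranger", version="7.2.0", type="pipeline")
existing = ln.Transform.filter(name="Cell Ranger", version="7.2.0").first()
if existing is None:
    transform.save()
    logger.save(f"saved: {transform}")     # newly created record
else:
    logger.success(f"loaded: {existing}")  # record already in the DB
```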
lamindb/_delete.py
CHANGED
@@ -1,30 +1,30 @@
 from typing import List, Union, overload  # noqa
 
 from lamin_utils import colors, logger
-from lnschema_core import
+from lnschema_core import Registry
 
 
 @overload
 def delete(
-    record:
+    record: Registry,
 ) -> None:
     ...
 
 
 @overload
 def delete(
-    records: List[
+    records: List[Registry],
 ) -> None:  # type: ignore
     ...
 
 
 def delete(  # type: ignore
-    records: Union[
+    records: Union[Registry, List[Registry]],
 ) -> None:
     """Delete metadata records & files.
 
     Args:
-        records: `Union[
+        records: `Union[Registry, List[Registry]]` One or multiple records.
 
     Returns:
         `None`
@@ -55,11 +55,11 @@ def delete(  # type: ignore
         Label(id=CcFPLmpq, name=Label1, updated_at=2023-07-19 18:28:16, created_by_id=kmvZDIX9)]  # noqa
     >>> queryset.delete()
     """
-    logger.warning("
+    logger.warning("for efficient bulk delete, use `queryset.delete` instead")
     if isinstance(records, list):
         records = records
-    elif isinstance(records,
+    elif isinstance(records, Registry):
         records = [records]
     for record in records:
         record.delete()
-        logger.success(f"
+        logger.success(f"deleted {colors.yellow(f'{record}')}")
lamindb/_feature.py
CHANGED
@@ -5,7 +5,7 @@ import pandas as pd
 from lamin_utils import colors, logger
 from lamindb_setup.dev._docs import doc_args
 from lnschema_core import Feature, Label
-from lnschema_core.models import
+from lnschema_core.models import Registry
 from pandas.api.types import is_categorical_dtype, is_string_dtype
 
 from lamindb.dev.utils import attach_func_to_class_method
@@ -34,7 +34,7 @@ def __init__(self, *args, **kwargs):
     if len(args) != 0:
         raise ValueError("Only non-keyword args allowed")
     type: Optional[Union[type, str]] = kwargs.pop("type") if "type" in kwargs else None
-    registries: Optional[List[
+    registries: Optional[List[Registry]] = (
         kwargs.pop("registries") if "registries" in kwargs else None
     )
     # cast type
@@ -46,14 +46,18 @@ def __init__(self, *args, **kwargs):
     # cast registries
     registries_str: Optional[str] = None
     if registries is not None:
-        if
-
-
-
-        if not
-            raise ValueError("
-        registries_str
-
+        if isinstance(registries, str):
+            # TODO: add more validation
+            registries_str = registries
+        else:
+            if not isinstance(registries, List):
+                raise ValueError("registries has to be a list of Registry types")
+            registries_str = ""
+            for cls in registries:
+                if not hasattr(cls, "__get_name_with_schema__"):
+                    raise ValueError("each element of the list has to be a Registry")
+                registries_str += cls.__get_name_with_schema__() + "|"
+            registries_str = registries_str.rstrip("|")
     kwargs["registries"] = registries_str
     super(Feature, self).__init__(*args, **kwargs)
 
@@ -75,7 +79,7 @@ def from_df(cls, df: "pd.DataFrame") -> List["Feature"]:
         if name in categoricals:
             types[name] = "category"
             # below is a harder feature to write, now, because it requires to
-            # query the link tables between the label
+            # query the link tables between the label Registry and file or dataset
             # the original implementation fell short
             # categorical = categoricals[name]
             # if hasattr(
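The rewritten cast serializes a list of Registry classes into a single "|"-delimited string of schema-qualified names via `__get_name_with_schema__`. A sketch of what a caller passes and what gets stored (assumes the Bionty schema module is available; the feature name is hypothetical):

```python
import lamindb as ln
import lnschema_bionty as lb  # assumption: the bionty schema module is installed

feature = ln.Feature(
    name="cell_type",          # hypothetical feature
    type="category",
    registries=[lb.CellType],  # serialized to a string like "bionty.CellType"
)
```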
lamindb/_feature_set.py
CHANGED
@@ -4,7 +4,7 @@ import pandas as pd
 from django.db.models.query_utils import DeferredAttribute as Field
 from lamin_utils import logger
 from lamindb_setup.dev._docs import doc_args
-from lnschema_core import
+from lnschema_core import Feature, FeatureSet, Modality, Registry, ids
 from lnschema_core.types import ListLike
 
 from lamindb.dev.hashing import hash_set
@@ -12,11 +12,11 @@ from lamindb.dev.utils import attach_func_to_class_method
 
 from . import _TESTING
 from ._from_values import get_or_create_records, index_iterable
-from .
+from ._registry import init_self_from_db
 from ._save import bulk_create
 
 
-def get_related_name(features_type:
+def get_related_name(features_type: Registry):
     candidates = [
         field.related_name
         for field in FeatureSet._meta.related_objects
@@ -26,19 +26,20 @@ def get_related_name(features_type: ORM):
         raise ValueError(
             f"Can't create feature sets from {features_type.__name__} because it's not"
             " related to it!\nYou need to create a link model between FeatureSet and"
-            " your
-            " models.ManyToMany(FeatureSet,
+            " your Registry in your custom schema.\nTo do so, add a"
+            " line:\nfeature_sets = models.ManyToMany(FeatureSet,"
+            " related_name='mythings')\n"
         )
     return candidates[0]
 
 
-def
+def sanity_check_features(features: List[Registry]) -> Registry:
     """Validate and return feature type."""
     if len(features) == 0:
         raise ValueError("provide list of features with at least one element")
     if not hasattr(features, "__getitem__"):
         raise TypeError("features has to be list-like")
-    if not isinstance(features[0],
+    if not isinstance(features[0], Registry):
         raise TypeError(
             "features has to store feature records! use .from_values() otherwise"
         )
@@ -48,6 +49,24 @@ def validate_features(features: List[ORM]) -> ORM:
     return next(iter(feature_types))  # return value in set of cardinality 1
 
 
+def get_validated_features(features: List[Registry], field: Field) -> List[Registry]:
+    validated_features = []
+    non_validated_features = []
+    for feature in features:
+        if feature._state.adding and not (
+            hasattr(feature, "_from_bionty") and feature._from_bionty
+        ):
+            non_validated_features.append(getattr(feature, field.field.name))
+        else:
+            validated_features.append(feature)
+    if non_validated_features:
+        non_validated_features_display = ",".join(non_validated_features)
+        logger.warning(
+            f"ignoring non-validated features: {non_validated_features_display}"
+        )
+    return validated_features
+
+
 def __init__(self, *args, **kwargs):
     if len(args) == len(self._meta.concrete_fields):
         super(FeatureSet, self).__init__(*args, **kwargs)
@@ -55,10 +74,7 @@ def __init__(self, *args, **kwargs):
     # now we proceed with the user-facing constructor
     if len(args) > 1:
         raise ValueError("Only one non-keyword arg allowed: features")
-    features: Iterable[
-    ref_field: Optional[str] = (
-        kwargs.pop("ref_field") if "ref_field" in kwargs else "id"
-    )
+    features: Iterable[Registry] = kwargs.pop("features") if len(args) == 0 else args[0]
     type: Optional[Union[type, str]] = kwargs.pop("type") if "type" in kwargs else None
     modality: Optional[str] = kwargs.pop("modality") if "modality" in kwargs else None
     name: Optional[str] = kwargs.pop("name") if "name" in kwargs else None
@@ -66,11 +82,11 @@ def __init__(self, *args, **kwargs):
     hash: Optional[str] = kwargs.pop("hash") if "hash" in kwargs else None
     if len(kwargs) > 0:
         raise ValueError(
-            "Only features,
+            "Only features, type, modality, name are valid keyword arguments"
         )
 
     # now code
-    features_orm =
+    features_orm = sanity_check_features(features)
     if features_orm == Feature:
         type = None
     else:
@@ -80,7 +96,7 @@ def __init__(self, *args, **kwargs):
     features_hash = hash_set({feature.id for feature in features})
     feature_set = FeatureSet.filter(hash=features_hash).one_or_none()
     if feature_set is not None:
-        logger.
+        logger.success(f"loaded: {feature_set}")
         init_self_from_db(self, feature_set)
         return None
     else:
@@ -108,7 +124,7 @@ def __init__(self, *args, **kwargs):
         type=type_str,
         n=n_features,
         modality=modality_record,
-
+        registry=features_orm.__get_name_with_schema__(),
         hash=hash,
     )
 
@@ -139,39 +155,48 @@ def from_values(
     name: Optional[str] = None,
     modality: Optional[str] = None,
     **kwargs,
-) -> "FeatureSet":
+) -> Optional["FeatureSet"]:
    """{}"""
    if not isinstance(field, Field):
-        raise TypeError(
+        raise TypeError(
+            "Argument `field` must be an Registry field, e.g., `Feature.name`"
+        )
    if len(values) == 0:
        raise ValueError("Provide a list of at least one value")
-
-    if isinstance(
+    registry = field.field.model
+    if isinstance(registry, Feature):
        raise ValueError("Please use from_df() instead of from_values()")
    iterable_idx = index_iterable(values)
    if not isinstance(iterable_idx[0], (str, int)):
        raise TypeError("values should be list-like of str or int")
-
+    from_bionty = registry.__module__.startswith("lnschema_bionty")
+    features = get_or_create_records(
+        iterable=iterable_idx,
+        field=field,
+        from_bionty=from_bionty,
+        **kwargs,
+    )
+    validated_features = get_validated_features(features, field)
+    validated_feature_ids = [feature.id for feature in validated_features]
+    features_hash = hash_set(set(validated_feature_ids))
    feature_set = FeatureSet.filter(hash=features_hash).one_or_none()
    if feature_set is not None:
-        logger.
+        logger.success(f"loaded {feature_set}")
    else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-        ref_field=field.field.name,
-    )
+        if type is not None:
+            type_str = type.__name__ if not isinstance(type, str) else type
+        else:
+            type_str = None
+        if validated_features:
+            feature_set = FeatureSet(
+                features=validated_features,
+                hash=features_hash,
+                name=name,
+                modality=modality,
+                type=type_str,
+            )
+        else:
+            feature_set = None
    return feature_set
 
 
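Because `from_values` now returns `Optional["FeatureSet"]` — `None` whenever no value survives validation — callers should guard the result. A hedged sketch (the gene symbols and the Bionty field are illustrative, assuming lnschema_bionty is mounted):

```python
import lamindb as ln
import lnschema_bionty as lb  # assumption: the bionty schema module is installed

feature_set = ln.FeatureSet.from_values(["GFAP", "AQP4"], field=lb.Gene.symbol)
if feature_set is not None:  # None if nothing validated
    feature_set.save()
```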
@@ -181,10 +206,16 @@ def from_df(
     cls,
     df: "pd.DataFrame",
     name: Optional[str] = None,
-) -> "FeatureSet":
+) -> Optional["FeatureSet"]:
     """{}"""
     features = Feature.from_df(df)
-
+    validated_features = get_validated_features(features, Feature.name)
+    if validated_features:
+        feature_set = FeatureSet(validated_features, name=name)
+    else:
+        logger.warning("no validated features, skip creating feature set")
+        feature_set = None
+        # raise ValidationError("Dataframe columns contain no validated feature names")
     return feature_set
 
 
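`from_df` now behaves the same way: non-validated column names are dropped with a warning, and an entirely unvalidated dataframe yields `None` rather than a FeatureSet (the commented-out `ValidationError` suggests raising may come later). A small sketch with a hypothetical dataframe:

```python
import pandas as pd
import lamindb as ln

df = pd.DataFrame({"cell_type": ["T cell", "B cell"], "donor": ["D1", "D2"]})
feature_set = ln.FeatureSet.from_df(df)  # columns validate against the Feature registry
if feature_set is not None:
    feature_set.save()
```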