lamindb 0.48a2__py3-none-any.whl → 0.48.1__py3-none-any.whl
- lamindb/__init__.py +15 -24
- lamindb/_context.py +5 -2
- lamindb/_dataset.py +6 -3
- lamindb/_delete.py +6 -6
- lamindb/_feature.py +61 -26
- lamindb/_feature_manager.py +176 -0
- lamindb/_feature_set.py +63 -27
- lamindb/_file.py +120 -76
- lamindb/_from_values.py +88 -28
- lamindb/_label.py +85 -0
- lamindb/_logger.py +1 -1
- lamindb/_manager.py +24 -17
- lamindb/_orm.py +157 -33
- lamindb/_queryset.py +37 -35
- lamindb/_save.py +19 -9
- lamindb/_transform.py +12 -3
- lamindb/_view.py +1 -1
- lamindb/dev/__init__.py +4 -0
- lamindb/dev/_settings.py +1 -1
- lamindb/dev/_view_parents.py +70 -34
- lamindb/dev/datasets/__init__.py +12 -0
- lamindb/dev/datasets/_core.py +116 -65
- lamindb/dev/storage/__init__.py +1 -5
- lamindb/dev/storage/_backed_access.py +505 -379
- lamindb/dev/storage/file.py +3 -1
- {lamindb-0.48a2.dist-info → lamindb-0.48.1.dist-info}/METADATA +10 -8
- lamindb-0.48.1.dist-info/RECORD +42 -0
- lamindb/_category.py +0 -42
- lamindb-0.48a2.dist-info/RECORD +0 -41
- {lamindb-0.48a2.dist-info → lamindb-0.48.1.dist-info}/LICENSE +0 -0
- {lamindb-0.48a2.dist-info → lamindb-0.48.1.dist-info}/WHEEL +0 -0
- {lamindb-0.48a2.dist-info → lamindb-0.48.1.dist-info}/entry_points.txt +0 -0
lamindb/__init__.py
CHANGED
@@ -4,11 +4,12 @@ Import the package::
 
    import lamindb as ln
 
-
-in-memory data objects (`DataFrame`, `AnnData`, etc.) and allows to link them
-against entities of core schema & custom schemas.
+.. note::
 
-
+   `File` abstracts over objects in storage from blob-like files (pdf, txt, etc.)
+   to streamable storage backends (HDF5, DuckDB, zarr, TileDB, etc.).
+
+   `Dataset` abstracts over `File` and tables in classical warehouses (BigQuery, Snowflake).
 
 .. autosummary::
    :toctree: .
@@ -16,20 +17,13 @@ The core schema entities are central to lamindb's API:
    File
    Dataset
    Transform
-
+   Label
    Feature
+   FeatureSet
+   Modality
    User
    Storage
-
-   Project
-
-More control over feature management:
-
-.. autosummary::
-   :toctree: .
-
-   FeatureSet
-   Category
+   Run
 
 Functional tools:
 
@@ -39,7 +33,6 @@ Functional tools:
    track
    view
    save
-   delete
 
 Static classes & modules:
 
@@ -47,7 +40,6 @@ Static classes & modules:
    :toctree: .
 
    settings
-   context
    types
    setup
    schema
@@ -55,14 +47,14 @@ Static classes & modules:
 
 """
 
-__version__ = "0.48a2"
+__version__ = "0.48.1"  # denote a release candidate for 0.1.0 with 0.1rc1
 
 import os as _os
 
 import lamindb_setup as _lamindb_setup
 
 # prints warning of python versions
-from
+from lamin_utils import py_version_warning as _py_version_warning
 from lamindb_setup import _check_instance_setup
 from lamindb_setup._check_instance_setup import _INSTANCE_NOT_SETUP_WARNING
 
@@ -90,15 +82,14 @@ if _INSTANCE_SETUP:
     del InstanceNotSetupError
     del __getattr__  # delete so that imports work out
     from lnschema_core import (  # noqa
-        Category,
         Dataset,
         Feature,
         FeatureSet,
         File,
-
+        Label,
+        Modality,
         Run,
         Storage,
-        Tag,
         Transform,
         User,
     )
@@ -109,13 +100,13 @@ if _INSTANCE_SETUP:
     from ._context import context  # noqa
 
     track = context._track  # noqa
-    from
+    from lamin_utils import logger as _logger
 
-    from . import _category  # noqa
     from . import _dataset  # noqa
     from . import _feature  # noqa
     from . import _feature_set  # noqa
     from . import _file  # noqa
+    from . import _label  # noqa
     from . import _orm  # noqa
     from . import _transform  # noqa
     from ._delete import delete  # noqa
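The changes above drop Tag, Category, and Project and expose the new Label registry alongside FeatureSet, Modality, and Run at the top level. A minimal sketch of the resulting API, assuming a set-up instance (the label name is illustrative):

    import lamindb as ln

    # Label takes over where the removed Tag/Category registries left off
    label = ln.Label(name="experiment-1")
    ln.save([label])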
lamindb/_context.py
CHANGED
@@ -6,7 +6,7 @@ from pathlib import Path, PurePath
 from typing import Dict, List, Optional, Tuple, Union
 
 import lnschema_core
-from
+from lamin_utils import logger
 from lamindb_setup import settings
 from lamindb_setup.dev import InstanceSettings
 from lnschema_core import Run, Transform
@@ -116,7 +116,7 @@ def get_notebook_name_colab() -> str:
     return name.rstrip(".ipynb")
 
 
-class context:
+class run_context:
     """Global run context."""
 
     instance: Optional[InstanceSettings] = None
@@ -434,3 +434,6 @@ class context:
         logger.success(f"Updated: {transform}")
 
         cls.transform = transform
+
+
+context = run_context
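The class is renamed from context to run_context, and the module-level alias context = run_context keeps the existing from ._context import context in __init__.py working. A sketch of unchanged usage, assuming a tracked notebook or script:

    import lamindb as ln

    ln.track()  # populates the global run context, setting run_context.transform
    print(ln.context.transform)  # the alias keeps the old name accessible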
lamindb/_dataset.py
CHANGED
@@ -67,7 +67,7 @@ def from_files(dataset: Dataset, *, name: str, files: Iterable[File]) -> Dataset
     feature_set_file_links = File.feature_sets.through.objects.filter(
         file_id__in=file_ids
     )
-    feature_set_ids = [link.
+    feature_set_ids = [link.feature_set_id for link in feature_set_file_links]
     feature_sets = FeatureSet.select(id__in=feature_set_ids)
     # validate consistency of feature_sets
     # we only allow one feature set per type
@@ -128,13 +128,16 @@ def delete(dataset: Dataset, storage: bool = False):
 def save(dataset: Dataset):
     if dataset.file is not None:
         dataset.file.save()
-
+    feature_sets = dataset._feature_sets
+    if isinstance(feature_sets, dict):
+        feature_sets = feature_sets.values()
+    for feature_set in feature_sets:
         feature_set.save()
     super(Dataset, dataset).save()
     if len(dataset._files) > 0:
         dataset.files.set(dataset._files)
     if len(dataset._feature_sets) > 0:
-        dataset.feature_sets.set(
+        dataset.feature_sets.set(feature_sets)
 
 
 Dataset.__init__ = __init__
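The save() change accounts for dataset._feature_sets now possibly being a dict keyed by slot instead of a list; the values are normalized once and reused in the feature_sets.set() call. The pattern in isolation (string stand-ins replace real FeatureSet records):

    feature_sets = {"var": "fs1", "obs": "fs2"}  # stand-ins; really FeatureSet records keyed by slot
    if isinstance(feature_sets, dict):
        feature_sets = feature_sets.values()  # keep only the records
    print(list(feature_sets))  # ['fs1', 'fs2']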
lamindb/_delete.py
CHANGED
@@ -1,6 +1,6 @@
 from typing import List, Union, overload  # noqa
 
-from
+from lamin_utils import colors, logger
 from lnschema_core import ORM
 
 
@@ -47,12 +47,12 @@ def delete(  # type: ignore
 
     Bulk delete via QuerySet:
 
-    >>> ln.save(ln.
-    >>> queryset = ln.
+    >>> ln.save(ln.Label.from_values(["Label1", "Label2", "Label3"], field="name"))
+    >>> queryset = ln.Label.select(name__icontains = "label")
     >>> queryset.list()
-    [
-
-
+    [Label(id=o3FY3c5n, name=Label2, updated_at=2023-07-19 18:28:16, created_by_id=kmvZDIX9), # noqa
+    Label(id=Qi3c4utq, name=Label3, updated_at=2023-07-19 18:28:16, created_by_id=kmvZDIX9), # noqa
+    Label(id=CcFPLmpq, name=Label1, updated_at=2023-07-19 18:28:16, created_by_id=kmvZDIX9)] # noqa
     >>> queryset.delete()
     """
     logger.warning("For efficient bulk delete, use `queryset.delete` instead")
lamindb/_feature.py
CHANGED
@@ -1,8 +1,10 @@
-from
+from itertools import islice
+from typing import List, Optional, Union
 
 import pandas as pd
+from lamin_utils import logger
 from lamindb_setup.dev._docs import doc_args
-from lnschema_core import
+from lnschema_core import Feature, Label
 from pandas.api.types import is_categorical_dtype, is_string_dtype
 
 from lamindb.dev.utils import attach_func_to_class_method
@@ -11,6 +13,18 @@ from . import _TESTING
 from ._save import bulk_create
 
 
+def convert_numpy_dtype_to_lamin_feature_type(dtype) -> str:
+    orig_type = dtype.name
+    # strip precision qualifiers
+    type = "".join(i for i in orig_type if not i.isdigit())
+    return type
+
+
+def take(n, iterable):
+    """Return the first n items of the iterable as a list."""
+    return list(islice(iterable, n))
+
+
 def __init__(self, *args, **kwargs):
     if len(args) == len(self._meta.concrete_fields):
         super(Feature, self).__init__(*args, **kwargs)
@@ -18,43 +32,64 @@ def __init__(self, *args, **kwargs):
     # now we proceed with the user-facing constructor
     if len(args) != 0:
         raise ValueError("Only non-keyword args allowed")
+    type: Optional[Union[type, str]] = kwargs.pop("type") if "type" in kwargs else None
+    if type is not None:
+        type_str = type.__name__ if not isinstance(type, str) else type
+    else:
+        type_str = None
+    kwargs["type"] = type_str
     super(Feature, self).__init__(*args, **kwargs)
 
 
 @classmethod  # type:ignore
 @doc_args(Feature.from_df.__doc__)
-def from_df(cls, df) -> List["Feature"]:
+def from_df(cls, df: "pd.DataFrame") -> List["Feature"]:
     """{}"""
-    records = Feature.from_values(df.columns, field=Feature.name)
-    assert len(records) == len(df.columns)
-
     string_cols = [col for col in df.columns if is_string_dtype(df[col])]
     categoricals = {col: df[col] for col in df.columns if is_categorical_dtype(df[col])}
    for key in string_cols:
         c = pd.Categorical(df[key])
-        # TODO: We should only check if non-null values are unique, but
-        # this would break cases where string columns with nulls could
-        # be written as categorical, but not as string.
-        # Possible solution: https://github.com/scverse/anndata/issues/504
         if len(c.categories) < len(c):
             categoricals[key] = c
 
-
-
-
-
-
-
-
-
-
-
-
+    types = {}
+    categoricals_with_unmapped_categories = {}
+    for name, col in df.items():
+        if name in categoricals:
+            types[name] = "category"
+            categorical = categoricals[name]
+            if hasattr(
+                categorical, "cat"
+            ):  # because .categories > pd2.0, .cat.categories < pd2.0
+                categorical = categorical.cat
+            categories = categorical.categories
+            categoricals_with_unmapped_categories[name] = Label.select(
+                feature=name
+            ).inspect(categories, "name", logging=False)["not_mapped"]
         else:
-
-
-
-
+            types[name] = convert_numpy_dtype_to_lamin_feature_type(col.dtype)
+
+    features = Feature.from_values(df.columns, field=Feature.name, types=types)
+    assert len(features) == len(df.columns)
+
+    if len(categoricals_with_unmapped_categories) > 0:
+        n_max = 20
+        categoricals_with_unmapped_categories_formatted = "\n ".join(
+            [
+                f"{key}: {', '.join(value)}"
+                for key, value in take(
+                    n_max, categoricals_with_unmapped_categories.items()
+                )
+            ]
+        )
+        if len(categoricals_with_unmapped_categories) > n_max:
+            categoricals_with_unmapped_categories_formatted += "\n ..."
+        categoricals_with_unmapped_categories_formatted
+        logger.info(
+            "There are unmapped categories:\n "
+            f" {categoricals_with_unmapped_categories_formatted}"
+        )
+    return features
 
 
 @doc_args(Feature.save.__doc__)
@@ -65,7 +100,7 @@ def save(self, *args, **kwargs) -> None:
     if hasattr(self, "_categories_records"):
         records = self._categories_records
     if hasattr(self, "_categories_raw"):
-        records =
+        records = Label.from_values(self._categories_raw, feature=self)
     if records is not None:
         bulk_create(records)
 
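Feature.from_df now infers a type per column: categoricals and non-unique string columns become "category" (their categories are inspected against existing Label records), while numeric columns get their numpy dtype name with precision qualifiers stripped. The dtype conversion in isolation, runnable with pandas alone:

    import pandas as pd

    df = pd.DataFrame({"x": [1, 2], "y": [0.5, 1.5]})
    for name, col in df.items():
        # mirrors convert_numpy_dtype_to_lamin_feature_type: "int64" -> "int", "float64" -> "float"
        print(name, "".join(c for c in col.dtype.name if not c.isdigit()))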
lamindb/_feature_manager.py
ADDED
@@ -0,0 +1,176 @@
+from collections import defaultdict
+from typing import Iterable, List, Optional, Union
+
+import pandas as pd
+from lamin_utils import logger
+from lnschema_core.models import ORM, Dataset, Feature, FeatureSet, File
+
+from ._queryset import QuerySet
+from ._save import save
+
+
+def validate_and_cast_feature(feature) -> Feature:
+    if isinstance(feature, str):
+        feature_name = feature
+        feature = Feature.select(name=feature_name).one_or_none()
+        if feature is None:
+            raise ValueError(
+                f"Please create feature: ln.Feature(name='{feature_name}',"
+                " type='category').save()"
+            )
+    return feature
+
+
+def create_features_df(
+    file: File, feature_sets: List[FeatureSet], exclude: bool = True
+):
+    features = []
+    for feature_set in feature_sets:
+        if exclude:
+            features_df = feature_set.features.exclude(labels_orm__isnull=True).df()
+        else:
+            features_df = feature_set.features.df()
+        slots = file.feature_sets.through.objects.filter(
+            file=file, feature_set=feature_set
+        ).list("slot")
+        for slot in slots:
+            features_df["slot"] = slot
+            features.append(features_df)
+    features_df = pd.concat(features)
+    return features_df.sort_values(["labels_schema", "labels_orm"])
+
+
+class FeatureManager:
+    """Feature manager."""
+
+    def __init__(self, host: Union[File, Dataset]):
+        self._host = host
+        slot_feature_sets = (
+            self._feature_set_df_with_slots().reset_index().set_index("slot")["id"]
+        )
+        self._slots = {
+            slot: self._host.feature_sets.get(id=i)
+            for slot, i in slot_feature_sets.items()
+        }
+
+    def __repr__(self) -> str:
+        if len(self._slots) > 0:
+            msg = "slots:\n"
+            for slot, feature_set in self._slots.items():
+                msg += f" {slot}: {feature_set}\n"
+            return msg
+        else:
+            return "No linked features."
+
+    def __getitem__(self, slot) -> QuerySet:
+        id = (
+            self._host.feature_sets.through.objects.filter(
+                file_id=self._host.id, slot=slot
+            )
+            .one()
+            .feature_set_id
+        )
+        accessor_by_orm = {
+            field.related_model.__name__: field.name
+            for field in self._host._meta.related_objects
+        }
+        accessor_by_orm["Feature"] = "features"
+        feature_set = self._host.feature_sets.filter(id=id).one()
+        return getattr(feature_set, accessor_by_orm[feature_set.ref_orm]).all()
+
+    def _feature_set_df_with_slots(self) -> pd.DataFrame:
+        """Return DataFrame."""
+        df = self._host.feature_sets.df()
+        df.insert(
+            0,
+            "slot",
+            self._host.feature_sets.through.objects.filter(file_id=self._host.id)
+            .df()
+            .set_index("feature_set_id")
+            .slot,
+        )
+        return df
+
+    def add_labels(
+        self, records: Union[ORM, List[ORM]], feature: Optional[Union[str, ORM]] = None
+    ):
+        """Add one or several labels and associate them with a feature."""
+        if isinstance(records, str) or not isinstance(records, Iterable):
+            records = [records]
+        if isinstance(records[0], str):  # type: ignore
+            raise ValueError(
+                "Please pass a record (an ORM object), not a string, e.g., via: label"
+                f" = ln.Label(name='{records[0]}')"  # type: ignore
+            )
+        if self._host._state.adding:
+            raise ValueError("Please save the file or dataset before adding a label!")
+        feature = validate_and_cast_feature(feature)
+        records_by_orm = defaultdict(list)
+        records_by_feature_orm = defaultdict(list)
+        for record in records:
+            records_by_orm[record.__class__.__name__].append(record)
+            if feature is None:
+                try:
+                    record_feature = (
+                        record._feature
+                        if hasattr(record, "_feature")
+                        else record.feature
+                    )
+                except ValueError:
+                    raise ValueError("Pass feature argument")
+            else:
+                record_feature = feature
+            records_by_feature_orm[(record_feature, record.__class__.__name__)].append(
+                record
+            )
+        schema_and_accessor_by_orm = {
+            field.related_model.__name__: (
+                field.related_model.__get_schema_name__(),
+                field.name,
+            )
+            for field in self._host._meta.related_objects
+        }
+        schema_and_accessor_by_orm["Label"] = ("core", "labels")
+        for orm_name, records in records_by_orm.items():
+            save(records)
+            getattr(self._host, schema_and_accessor_by_orm[orm_name][1]).set(records)
+        accessor_by_orm = {
+            field.related_model.__name__: field.name
+            for field in self._host._meta.related_objects
+        }
+        accessor_by_orm["Feature"] = "features"
+        feature_sets = self._host.feature_sets.all()
+        feature_sets_by_orm = {
+            feature_set.ref_orm: feature_set for feature_set in feature_sets
+        }
+        for (feature, orm_name), records in records_by_feature_orm.items():
+            feature = validate_and_cast_feature(feature)
+            logger.info(f"Linking feature {feature.name} to {orm_name}")
+            feature.labels_orm = orm_name
+            feature.labels_schema = schema_and_accessor_by_orm[orm_name][0]
+            feature.save()
+            # check whether we have to update the feature set that manages labels
+            # (Feature) to account for a new feature
+            feature_set = feature_sets_by_orm["Feature"]
+            accessor = "features"
+            linked_features = getattr(feature_set, accessor)
+            if feature not in linked_features.all():
+                logger.info(
+                    f"Linking feature {feature.name} to feature set {feature_set}"
+                )
+                linked_features.add(feature)
+                feature_set.n += 1
+                feature_set.save()
+
+    def add_feature_set(self, feature_set: FeatureSet, slot: str):
+        if self._host._state.adding:
+            raise ValueError(
+                "Please save the file or dataset before adding a feature set!"
+            )
+        feature_set.save()
+        self._host.feature_sets.add(feature_set)
+        link_record = self._host.feature_sets.through.objects.filter(
+            file=self._host, feature_set=feature_set
+        ).one()
+        link_record.slot = slot
+        link_record.save()
lamindb/_feature_set.py
CHANGED
@@ -1,10 +1,10 @@
-from typing import List, Optional
+from typing import Iterable, List, Optional, Type, Union
 
 import pandas as pd
 from django.db.models.query_utils import DeferredAttribute as Field
-from
+from lamin_utils import logger
 from lamindb_setup.dev._docs import doc_args
-from lnschema_core import ORM, Feature, FeatureSet
+from lnschema_core import ORM, Feature, FeatureSet, ids
 from lnschema_core.types import ListLike
 
 from lamindb.dev.hashing import hash_set
@@ -55,28 +55,51 @@ def __init__(self, *args, **kwargs):
     # now we proceed with the user-facing constructor
     if len(args) > 1:
         raise ValueError("Only one non-keyword arg allowed: features")
-    features:
-
-
-
-
-    if
+    features: Iterable[ORM] = kwargs.pop("features") if len(args) == 0 else args[0]
+    ref_field: Optional[str] = (
+        kwargs.pop("ref_field") if "ref_field" in kwargs else "id"
+    )
+    type: Optional[Union[type, str]] = kwargs.pop("type") if "type" in kwargs else None
+    modality: Optional[str] = kwargs.pop("modality") if "modality" in kwargs else None
+    name: Optional[str] = kwargs.pop("name") if "name" in kwargs else None
+    # hash is only internally used
+    hash: Optional[str] = kwargs.pop("hash") if "hash" in kwargs else None
+    if len(kwargs) > 0:
+        raise ValueError(
+            "Only features, ref_field, type, modality, name are valid keyword arguments"
+        )
+
+    # now code
+    features_orm = validate_features(features)
+    if features_orm == Feature:
+        type = None
+    else:
+        type = float
+    n_features = len(features)
+    if hash is None:
         features_hash = hash_set({feature.id for feature in features})
-        feature_set = FeatureSet.select(
+        feature_set = FeatureSet.select(hash=features_hash).one_or_none()
         if feature_set is not None:
-            logger.info("
+            logger.info(f"Loaded {feature_set}")
             init_self_from_db(self, feature_set)
             return None
         else:
-
-            self._features = (
-            if
-
+            hash = features_hash
+    self._features = (get_related_name(features_orm), features)
+    if type is not None:
+        type_str = type.__name__ if not isinstance(type, str) else type
+    else:
+        type_str = None
     super(FeatureSet, self).__init__(
-        id=
-
-
-
+        id=ids.base62_20(),
+        name=name,
+        type=type_str,
+        n=n_features,
+        modality=modality,
+        ref_orm=features_orm.__name__,
+        ref_schema=features_orm.__get_schema_name__(),
+        ref_field=ref_field,
+        hash=hash,
     )
 
 
@@ -99,33 +122,45 @@ def save(self, *args, **kwargs) -> None:
 @classmethod  # type:ignore
 @doc_args(FeatureSet.from_values.__doc__)
 def from_values(
-    cls,
+    cls,
+    values: ListLike,
+    field: Field = Feature.name,
+    type: Optional[Union[Type, str]] = None,
+    name: Optional[str] = None,
+    modality: Optional[str] = None,
+    **kwargs,
 ) -> "FeatureSet":
     """{}"""
     if not isinstance(field, Field):
         raise TypeError("Argument `field` must be an ORM field, e.g., `Feature.name`")
     if len(values) == 0:
         raise ValueError("Provide a list of at least one value")
-
+    ORM = field.field.model
+    if isinstance(ORM, Feature):
+        raise ValueError("Please use from_df() instead of from_values()")
     iterable_idx = index_iterable(values)
     if not isinstance(iterable_idx[0], (str, int)):
         raise TypeError("values should be list-like of str or int")
     features_hash = hash_set(set(iterable_idx))
-    feature_set = FeatureSet.select(
+    feature_set = FeatureSet.select(hash=features_hash).one_or_none()
     if feature_set is not None:
-        logger.info("
+        logger.info(f"Loaded {feature_set}")
     else:
-        from_bionty =
+        from_bionty = ORM.__module__.startswith("lnschema_bionty")
         records = get_or_create_records(
             iterable=iterable_idx,
             field=field,
            from_bionty=from_bionty,
             **kwargs,
         )
+        # type_str = type.__name__ if not isinstance(type, str) else type
         feature_set = FeatureSet(
-            id=features_hash,
-            field=field.field.name,
             features=records,
+            hash=features_hash,
+            name=name,
+            modality=modality,
+            type=type,
+            ref_field=field.field.name,
         )
     return feature_set
 
@@ -135,10 +170,11 @@ def from_values(
 def from_df(
     cls,
     df: "pd.DataFrame",
+    name: Optional[str] = None,
 ) -> "FeatureSet":
     """{}"""
     features = Feature.from_df(df)
-    feature_set = FeatureSet(features)
+    feature_set = FeatureSet(features, name=name)
     return feature_set
 
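FeatureSet records now get a random base62 id (ids.base62_20()) while the content hash moves to the dedicated hash field, and construction accepts name, type, modality, and ref_field. A sketch of the from_df path, assuming a set-up instance (column names and values are illustrative):

    import pandas as pd
    import lamindb as ln

    df = pd.DataFrame({"cell_type": pd.Categorical(["T", "B"]), "score": [0.1, 0.2]})
    # one Feature per column; types are inferred via Feature.from_df
    feature_set = ln.FeatureSet.from_df(df, name="obs metadata")
    feature_set.save()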