lamindb 0.77.0__py3-none-any.whl → 0.77.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +1 -1
- lamindb/_artifact.py +6 -3
- lamindb/_can_curate.py +3 -1
- lamindb/_collection.py +1 -1
- lamindb/_curate.py +387 -318
- lamindb/_feature.py +84 -58
- lamindb/_feature_set.py +6 -4
- lamindb/_finish.py +68 -13
- lamindb/_from_values.py +10 -6
- lamindb/_query_set.py +321 -102
- lamindb/_record.py +5 -3
- lamindb/_save.py +1 -0
- lamindb/_view.py +105 -9
- lamindb/core/__init__.py +2 -2
- lamindb/core/_context.py +9 -13
- lamindb/core/_data.py +58 -88
- lamindb/core/_describe.py +139 -0
- lamindb/core/_django.py +5 -6
- lamindb/core/_feature_manager.py +408 -198
- lamindb/core/_label_manager.py +147 -109
- lamindb/core/datasets/__init__.py +31 -2
- lamindb/core/datasets/_core.py +0 -27
- lamindb/core/datasets/_small.py +100 -0
- lamindb/core/exceptions.py +1 -1
- lamindb/core/storage/paths.py +9 -4
- lamindb/core/types.py +12 -2
- {lamindb-0.77.0.dist-info → lamindb-0.77.2.dist-info}/METADATA +7 -8
- {lamindb-0.77.0.dist-info → lamindb-0.77.2.dist-info}/RECORD +30 -28
- {lamindb-0.77.0.dist-info → lamindb-0.77.2.dist-info}/LICENSE +0 -0
- {lamindb-0.77.0.dist-info → lamindb-0.77.2.dist-info}/WHEEL +0 -0
lamindb/core/_feature_manager.py
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
import warnings
|
3
4
|
from collections import defaultdict
|
4
5
|
from collections.abc import Iterable
|
6
|
+
from datetime import date, datetime
|
5
7
|
from itertools import compress
|
6
8
|
from typing import TYPE_CHECKING, Any
|
7
9
|
|
@@ -31,9 +33,16 @@ from lnschema_core.models import (
|
|
31
33
|
Run,
|
32
34
|
ULabel,
|
33
35
|
)
|
36
|
+
from rich.table import Column, Table
|
37
|
+
from rich.text import Text
|
34
38
|
|
35
|
-
from lamindb._feature import
|
39
|
+
from lamindb._feature import (
|
40
|
+
FEATURE_DTYPES,
|
41
|
+
convert_pandas_dtype_to_lamin_dtype,
|
42
|
+
suggest_categorical_for_str_iterable,
|
43
|
+
)
|
36
44
|
from lamindb._feature_set import DICT_KEYS_TYPE, FeatureSet
|
45
|
+
from lamindb._from_values import _print_values
|
37
46
|
from lamindb._record import (
|
38
47
|
REGISTRY_UNIQUE_FIELD,
|
39
48
|
get_name_field,
|
@@ -44,8 +53,15 @@ from lamindb._save import save
|
|
44
53
|
from lamindb.core.exceptions import DoesNotExist, ValidationError
|
45
54
|
from lamindb.core.storage import LocalPathClasses
|
46
55
|
|
56
|
+
from ._describe import (
|
57
|
+
NAME_WIDTH,
|
58
|
+
TYPE_WIDTH,
|
59
|
+
VALUES_WIDTH,
|
60
|
+
describe_header,
|
61
|
+
print_rich_tree,
|
62
|
+
)
|
47
63
|
from ._django import get_artifact_with_related
|
48
|
-
from ._label_manager import
|
64
|
+
from ._label_manager import _get_labels, describe_labels
|
49
65
|
from ._settings import settings
|
50
66
|
from .schema import (
|
51
67
|
dict_related_model_to_related_name,
|
@@ -53,6 +69,7 @@ from .schema import (
|
|
53
69
|
|
54
70
|
if TYPE_CHECKING:
|
55
71
|
from lnschema_core.types import FieldAttr
|
72
|
+
from rich.tree import Tree
|
56
73
|
|
57
74
|
from lamindb._query_set import QuerySet
|
58
75
|
|
@@ -75,7 +92,9 @@ def get_accessor_by_registry_(host: Artifact | Collection) -> dict:
|
|
75
92
|
return dictionary
|
76
93
|
|
77
94
|
|
78
|
-
def get_feature_set_by_slot_(host) -> dict:
|
95
|
+
def get_feature_set_by_slot_(host: Artifact | Collection) -> dict:
|
96
|
+
if isinstance(host, Collection):
|
97
|
+
return {}
|
79
98
|
# if the host is not yet saved
|
80
99
|
if host._state.adding:
|
81
100
|
if hasattr(host, "_feature_sets"):
|
@@ -134,14 +153,14 @@ def custom_aggregate(field, using: str):
|
|
134
153
|
return GroupConcat(field)
|
135
154
|
|
136
155
|
|
137
|
-
def
|
156
|
+
def _get_categoricals_postgres(
|
138
157
|
self: Artifact | Collection,
|
139
158
|
related_data: dict | None = None,
|
140
|
-
print_types: bool = False,
|
141
|
-
to_dict: bool = False,
|
142
159
|
print_params: bool = False,
|
143
|
-
):
|
144
|
-
|
160
|
+
) -> dict[tuple[str, str], set[str]]:
|
161
|
+
"""Get categorical features and their values using PostgreSQL-specific optimizations."""
|
162
|
+
if print_params:
|
163
|
+
return {}
|
145
164
|
|
146
165
|
if not related_data:
|
147
166
|
artifact_meta = get_artifact_with_related(
|
@@ -149,6 +168,7 @@ def _print_categoricals_postgres(
|
|
149
168
|
)
|
150
169
|
related_data = artifact_meta.get("related_data", {})
|
151
170
|
|
171
|
+
# Process m2m data
|
152
172
|
m2m_data = related_data.get("m2m", {}) if related_data else {}
|
153
173
|
m2m_name = {}
|
154
174
|
for related_name, values in m2m_data.items():
|
@@ -157,6 +177,8 @@ def _print_categoricals_postgres(
|
|
157
177
|
self.__class__.__name__, ""
|
158
178
|
).lower()
|
159
179
|
m2m_name[related_model_name] = values
|
180
|
+
|
181
|
+
# Get feature information
|
160
182
|
links_data = related_data.get("link", {}) if related_data else {}
|
161
183
|
feature_dict = {
|
162
184
|
id: (name, dtype)
|
@@ -165,188 +187,295 @@ def _print_categoricals_postgres(
|
|
165
187
|
)
|
166
188
|
}
|
167
189
|
|
168
|
-
|
169
|
-
|
190
|
+
# Build result dictionary
|
191
|
+
result = defaultdict(set)
|
192
|
+
for link_name, link_values in links_data.items():
|
193
|
+
related_name = link_name.removeprefix("links_").replace("_", "")
|
194
|
+
if not link_values:
|
195
|
+
continue
|
170
196
|
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
labels_msgs = []
|
175
|
-
feature_values: dict = {}
|
176
|
-
for link_name, link_values in links_data.items():
|
177
|
-
related_name = link_name.removeprefix("links_").replace("_", "")
|
178
|
-
link_model = getattr(self.__class__, link_name).rel.related_model
|
179
|
-
if not link_values:
|
197
|
+
for link_value in link_values:
|
198
|
+
feature_id = link_value.get("feature")
|
199
|
+
if feature_id is None:
|
180
200
|
continue
|
181
|
-
for link_value in link_values:
|
182
|
-
feature_id = link_value.get("feature")
|
183
|
-
if feature_id is None:
|
184
|
-
continue
|
185
|
-
feature_name = feature_dict.get(feature_id)[0]
|
186
|
-
if feature_name not in feature_values:
|
187
|
-
feature_values[feature_name] = (feature_dict.get(feature_id)[1], [])
|
188
|
-
label_id = link_value.get(related_name)
|
189
|
-
feature_values[feature_name][1].append(
|
190
|
-
m2m_name.get(related_name, {}).get(label_id)
|
191
|
-
)
|
192
|
-
for feature_name, (dtype, labels_list) in feature_values.items():
|
193
|
-
print_values = _print_values(labels_list, n=10)
|
194
|
-
type_str = f": {dtype}" if print_types else ""
|
195
|
-
if to_dict:
|
196
|
-
dictionary[feature_name] = (
|
197
|
-
labels_list if len(labels_list) > 1 else labels_list[0]
|
198
|
-
)
|
199
|
-
labels_msgs.append(f" '{feature_name}'{type_str} = {print_values}")
|
200
|
-
if len(labels_msgs) > 0:
|
201
|
-
labels_msg = "\n".join(sorted(labels_msgs)) + "\n"
|
202
|
-
msg += labels_msg
|
203
|
-
return msg, dictionary
|
204
201
|
|
202
|
+
feature_name, feature_dtype = feature_dict.get(feature_id)
|
203
|
+
label_id = link_value.get(related_name)
|
204
|
+
label_name = m2m_name.get(related_name, {}).get(label_id)
|
205
|
+
if label_name:
|
206
|
+
result[(feature_name, feature_dtype)].add(label_name)
|
207
|
+
|
208
|
+
return dict(result)
|
205
209
|
|
206
|
-
|
210
|
+
|
211
|
+
def _get_categoricals(
|
207
212
|
self: Artifact | Collection,
|
208
|
-
print_types: bool = False,
|
209
|
-
to_dict: bool = False,
|
210
213
|
print_params: bool = False,
|
211
|
-
):
|
212
|
-
|
214
|
+
) -> dict[tuple[str, str], set[str]]:
|
215
|
+
"""Get categorical features and their values using the default approach."""
|
216
|
+
if print_params:
|
217
|
+
return {}
|
213
218
|
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
).items():
|
223
|
-
for link in links:
|
224
|
-
if hasattr(link, "feature_id") and link.feature_id is not None:
|
225
|
-
link_attr = get_link_attr(link, self)
|
226
|
-
labels_by_feature[link.feature_id].append(
|
227
|
-
getattr(link, link_attr).name
|
228
|
-
)
|
229
|
-
labels_msgs = []
|
230
|
-
for feature_id, labels_list in labels_by_feature.items():
|
231
|
-
feature = Feature.objects.using(self._state.db).get(id=feature_id)
|
232
|
-
print_values = _print_values(labels_list, n=10)
|
233
|
-
type_str = f": {feature.dtype}" if print_types else ""
|
234
|
-
if to_dict:
|
235
|
-
dictionary[feature.name] = (
|
236
|
-
labels_list if len(labels_list) > 1 else labels_list[0]
|
237
|
-
)
|
238
|
-
labels_msgs.append(f" '{feature.name}'{type_str} = {print_values}")
|
239
|
-
if len(labels_msgs) > 0:
|
240
|
-
labels_msg = "\n".join(sorted(labels_msgs)) + "\n"
|
241
|
-
msg += labels_msg
|
242
|
-
return msg, dictionary
|
219
|
+
result = defaultdict(set)
|
220
|
+
for _, links in _get_labels(self, links=True, instance=self._state.db).items():
|
221
|
+
for link in links:
|
222
|
+
if hasattr(link, "feature_id") and link.feature_id is not None:
|
223
|
+
feature = Feature.objects.using(self._state.db).get(id=link.feature_id)
|
224
|
+
link_attr = get_link_attr(link, self)
|
225
|
+
label_name = getattr(link, link_attr).name
|
226
|
+
result[(feature.name, feature.dtype)].add(label_name)
|
243
227
|
|
228
|
+
return dict(result)
|
229
|
+
|
230
|
+
|
231
|
+
def _get_non_categoricals(
|
232
|
+
self,
|
233
|
+
print_params: bool = False,
|
234
|
+
) -> dict[tuple[str, str], set[Any]]:
|
235
|
+
"""Get non-categorical features and their values."""
|
236
|
+
non_categoricals = {}
|
244
237
|
|
245
|
-
|
238
|
+
if self.id is not None and isinstance(self, (Artifact, Run)):
|
239
|
+
attr_name = "param" if print_params else "feature"
|
240
|
+
_feature_values = (
|
241
|
+
getattr(self, f"_{attr_name}_values")
|
242
|
+
.values(f"{attr_name}__name", f"{attr_name}__dtype")
|
243
|
+
.annotate(values=custom_aggregate("value", self._state.db))
|
244
|
+
.order_by(f"{attr_name}__name")
|
245
|
+
)
|
246
|
+
|
247
|
+
for fv in _feature_values:
|
248
|
+
feature_name = fv[f"{attr_name}__name"]
|
249
|
+
feature_dtype = fv[f"{attr_name}__dtype"]
|
250
|
+
values = fv["values"]
|
251
|
+
|
252
|
+
# Convert single values to sets
|
253
|
+
if not isinstance(values, (list, dict, set)):
|
254
|
+
values = {values}
|
255
|
+
elif (
|
256
|
+
isinstance(values, list)
|
257
|
+
and feature_dtype != "dict"
|
258
|
+
and not feature_dtype.startswith("list")
|
259
|
+
):
|
260
|
+
values = set(values)
|
261
|
+
|
262
|
+
# Handle special datetime types
|
263
|
+
if feature_dtype == "datetime":
|
264
|
+
values = {datetime.fromisoformat(value) for value in values}
|
265
|
+
if feature_dtype == "date":
|
266
|
+
values = {date.fromisoformat(value) for value in values}
|
267
|
+
|
268
|
+
non_categoricals[(feature_name, feature_dtype)] = values
|
269
|
+
|
270
|
+
return non_categoricals
|
271
|
+
|
272
|
+
|
273
|
+
def _get_featuresets_postgres(
|
246
274
|
self: Artifact | Collection,
|
247
275
|
related_data: dict | None = None,
|
248
|
-
|
249
|
-
):
|
250
|
-
from lamindb._from_values import _print_values
|
251
|
-
|
276
|
+
) -> dict:
|
252
277
|
if not related_data:
|
253
278
|
artifact_meta = get_artifact_with_related(self, include_featureset=True)
|
254
279
|
related_data = artifact_meta.get("related_data", {})
|
255
280
|
|
256
281
|
fs_data = related_data.get("featuresets", {}) if related_data else {}
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
282
|
+
return fs_data
|
283
|
+
|
284
|
+
|
285
|
+
def _create_feature_table(name: str, registry_str: str, data: list) -> Table:
|
286
|
+
"""Create a Rich table for a feature group."""
|
287
|
+
table = Table(
|
288
|
+
Column(name, style="", no_wrap=True, width=NAME_WIDTH),
|
289
|
+
Column(registry_str, style="dim", no_wrap=True, width=TYPE_WIDTH),
|
290
|
+
Column("", width=VALUES_WIDTH, no_wrap=True),
|
291
|
+
show_header=True,
|
292
|
+
box=None,
|
293
|
+
pad_edge=False,
|
294
|
+
)
|
295
|
+
for row in data:
|
296
|
+
table.add_row(*row)
|
297
|
+
return table
|
266
298
|
|
267
299
|
|
268
|
-
def
|
300
|
+
def describe_features(
|
269
301
|
self: Artifact | Collection,
|
270
302
|
related_data: dict | None = None,
|
271
303
|
print_types: bool = False,
|
272
304
|
to_dict: bool = False,
|
273
305
|
print_params: bool = False,
|
274
|
-
|
275
|
-
|
306
|
+
tree: Tree | None = None,
|
307
|
+
with_labels: bool = False,
|
308
|
+
):
|
309
|
+
"""Describe features of an artifact or collection."""
|
310
|
+
if print_types:
|
311
|
+
warnings.warn(
|
312
|
+
"`print_types` parameter is deprecated and will be removed in a future version. Types are now always printed.",
|
313
|
+
DeprecationWarning,
|
314
|
+
stacklevel=2,
|
315
|
+
)
|
316
|
+
|
317
|
+
# initialize tree
|
318
|
+
if tree is None:
|
319
|
+
tree = describe_header(self)
|
320
|
+
|
321
|
+
dictionary: dict[str, Any] = {}
|
322
|
+
|
323
|
+
if self._state.adding:
|
324
|
+
return dictionary if to_dict else tree
|
276
325
|
|
326
|
+
# feature sets
|
327
|
+
feature_set_data: dict[str, tuple[str, list[str]]] = {}
|
328
|
+
feature_data: dict[str, tuple[str, list[str]]] = {}
|
329
|
+
if not print_params and not to_dict:
|
330
|
+
if self.id is not None and connections[self._state.db].vendor == "postgresql":
|
331
|
+
fs_data = _get_featuresets_postgres(self, related_data=related_data)
|
332
|
+
for fs_id, (slot, data) in fs_data.items():
|
333
|
+
for registry_str, feature_names in data.items():
|
334
|
+
feature_set = FeatureSet.get(id=fs_id)
|
335
|
+
feature_set_data[slot] = (feature_set, feature_names)
|
336
|
+
for feature_name in feature_names:
|
337
|
+
feature_data[feature_name] = (slot, registry_str)
|
338
|
+
else:
|
339
|
+
for slot, feature_set in get_feature_set_by_slot_(self).items():
|
340
|
+
features = feature_set.members
|
341
|
+
# features.first() is a lot slower than features[0] here
|
342
|
+
name_field = get_name_field(features[0])
|
343
|
+
feature_names = list(features.values_list(name_field, flat=True)[:20])
|
344
|
+
feature_set_data[slot] = (feature_set, feature_names)
|
345
|
+
for feature_name in feature_names:
|
346
|
+
feature_data[feature_name] = (slot, feature_set.registry)
|
347
|
+
|
348
|
+
internal_feature_names: set[str] = {} # type: ignore
|
349
|
+
if isinstance(self, Artifact):
|
350
|
+
feature_sets = self.feature_sets.filter(registry="Feature").all()
|
351
|
+
internal_feature_names = set() # type: ignore
|
352
|
+
if len(feature_sets) > 0:
|
353
|
+
for feature_set in feature_sets:
|
354
|
+
internal_feature_names = internal_feature_names.union(
|
355
|
+
set(feature_set.members.values_list("name", flat=True))
|
356
|
+
) # type: ignore
|
357
|
+
|
358
|
+
# categorical feature values
|
359
|
+
# Get the categorical data using the appropriate method
|
277
360
|
if not self._state.adding and connections[self._state.db].vendor == "postgresql":
|
278
|
-
|
361
|
+
categoricals = _get_categoricals_postgres(
|
279
362
|
self,
|
280
363
|
related_data=related_data,
|
281
|
-
print_types=print_types,
|
282
|
-
to_dict=to_dict,
|
283
364
|
print_params=print_params,
|
284
365
|
)
|
285
366
|
else:
|
286
|
-
|
367
|
+
categoricals = _get_categoricals(
|
287
368
|
self,
|
288
|
-
print_types=print_types,
|
289
|
-
to_dict=to_dict,
|
290
369
|
print_params=print_params,
|
291
370
|
)
|
292
371
|
|
293
|
-
# non-categorical
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
getattr(self, f"_{attr_name}_values")
|
299
|
-
.values(f"{attr_name}__name", f"{attr_name}__dtype")
|
300
|
-
.annotate(values=custom_aggregate("value", self._state.db))
|
301
|
-
.order_by(f"{attr_name}__name")
|
302
|
-
)
|
303
|
-
if len(_feature_values) > 0:
|
304
|
-
for fv in _feature_values:
|
305
|
-
feature_name = fv[f"{attr_name}__name"]
|
306
|
-
feature_dtype = fv[f"{attr_name}__dtype"]
|
307
|
-
values = fv["values"]
|
308
|
-
# TODO: understand why the below is necessary
|
309
|
-
if not isinstance(values, list):
|
310
|
-
values = [values]
|
311
|
-
if to_dict:
|
312
|
-
dictionary[feature_name] = values if len(values) > 1 else values[0]
|
313
|
-
type_str = f": {feature_dtype}" if print_types else ""
|
314
|
-
printed_values = (
|
315
|
-
_print_values(values, n=10, quotes=False)
|
316
|
-
if not feature_dtype.startswith("list")
|
317
|
-
else values
|
318
|
-
)
|
319
|
-
non_labels_msg += f" '{feature_name}'{type_str} = {printed_values}\n"
|
320
|
-
msg += non_labels_msg
|
372
|
+
# Get non-categorical features
|
373
|
+
non_categoricals = _get_non_categoricals(
|
374
|
+
self,
|
375
|
+
print_params=print_params,
|
376
|
+
)
|
321
377
|
|
322
|
-
|
323
|
-
|
324
|
-
|
378
|
+
# Process all Features containing labels and sort into internal/external
|
379
|
+
internal_feature_labels = {}
|
380
|
+
external_data = []
|
381
|
+
for features, is_list_type in [(categoricals, False), (non_categoricals, True)]:
|
382
|
+
for (feature_name, feature_dtype), values in sorted(features.items()):
|
383
|
+
# Handle dictionary conversion
|
384
|
+
if to_dict:
|
385
|
+
dict_value = values if len(values) > 1 else next(iter(values))
|
386
|
+
dictionary[feature_name] = dict_value
|
387
|
+
continue
|
325
388
|
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
self, related_data=related_data
|
389
|
+
# Format message
|
390
|
+
printed_values = (
|
391
|
+
_print_values(sorted(values), n=10, quotes=False)
|
392
|
+
if not is_list_type or not feature_dtype.startswith("list")
|
393
|
+
else sorted(values)
|
332
394
|
)
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
msg += feature_set_msg
|
395
|
+
|
396
|
+
# Sort into internal/external
|
397
|
+
feature_info = (
|
398
|
+
feature_name,
|
399
|
+
Text(feature_dtype, style="dim"),
|
400
|
+
printed_values,
|
401
|
+
)
|
402
|
+
if feature_name in internal_feature_names:
|
403
|
+
internal_feature_labels[feature_name] = feature_info
|
404
|
+
else:
|
405
|
+
external_data.append(feature_info)
|
406
|
+
|
346
407
|
if to_dict:
|
347
408
|
return dictionary
|
348
|
-
|
349
|
-
|
409
|
+
|
410
|
+
# Dataset section
|
411
|
+
internal_features_slot: dict[
|
412
|
+
str, list
|
413
|
+
] = {} # internal features from the `Feature` registry that contain labels
|
414
|
+
for feature_name, feature_row in internal_feature_labels.items():
|
415
|
+
slot, _ = feature_data.get(feature_name)
|
416
|
+
internal_features_slot.setdefault(slot, []).append(feature_row)
|
417
|
+
dataset_tree_children = []
|
418
|
+
|
419
|
+
for slot, (feature_set, feature_names) in feature_set_data.items():
|
420
|
+
if slot in internal_features_slot:
|
421
|
+
feature_rows = internal_features_slot[slot]
|
422
|
+
else:
|
423
|
+
feature_rows = [
|
424
|
+
(feature_name, Text(str(feature_set.dtype), style="dim"), "")
|
425
|
+
for feature_name in feature_names
|
426
|
+
if feature_name
|
427
|
+
]
|
428
|
+
dataset_tree_children.append(
|
429
|
+
_create_feature_table(
|
430
|
+
Text.assemble(
|
431
|
+
(slot, "violet"),
|
432
|
+
(" • ", "dim"),
|
433
|
+
(str(feature_set.n), "pink1"),
|
434
|
+
),
|
435
|
+
Text.assemble((f"[{feature_set.registry}]", "pink1")),
|
436
|
+
feature_rows,
|
437
|
+
)
|
438
|
+
)
|
439
|
+
## internal features from the non-`Feature` registry
|
440
|
+
if dataset_tree_children:
|
441
|
+
dataset_tree = tree.add(
|
442
|
+
Text.assemble(
|
443
|
+
("Dataset", "bold bright_magenta"),
|
444
|
+
("/", "dim"),
|
445
|
+
(".feature_sets", "dim bold"),
|
446
|
+
)
|
447
|
+
)
|
448
|
+
for child in dataset_tree_children:
|
449
|
+
dataset_tree.add(child)
|
450
|
+
|
451
|
+
# Annotations section
|
452
|
+
## external features
|
453
|
+
features_tree_children = []
|
454
|
+
if external_data:
|
455
|
+
features_tree_children.append(
|
456
|
+
_create_feature_table(
|
457
|
+
Text.assemble(
|
458
|
+
("Params" if print_params else "Features", "green_yellow")
|
459
|
+
),
|
460
|
+
"",
|
461
|
+
external_data,
|
462
|
+
)
|
463
|
+
)
|
464
|
+
annotations_tree = None
|
465
|
+
if features_tree_children:
|
466
|
+
annotations_tree = tree.add(Text("Annotations", style="bold dark_orange"))
|
467
|
+
for child in features_tree_children:
|
468
|
+
annotations_tree.add(child)
|
469
|
+
if with_labels:
|
470
|
+
labels_tree = describe_labels(self, as_subtree=True)
|
471
|
+
if labels_tree:
|
472
|
+
if annotations_tree is None:
|
473
|
+
annotations_tree = tree.add(
|
474
|
+
Text("Annotations", style="bold dark_orange")
|
475
|
+
)
|
476
|
+
annotations_tree.add(labels_tree)
|
477
|
+
|
478
|
+
return tree
|
350
479
|
|
351
480
|
|
352
481
|
def parse_feature_sets_from_anndata(
|
@@ -371,7 +500,7 @@ def parse_feature_sets_from_anndata(
|
|
371
500
|
type = (
|
372
501
|
"float"
|
373
502
|
if adata.X is None
|
374
|
-
else
|
503
|
+
else convert_pandas_dtype_to_lamin_dtype(adata.X.dtype)
|
375
504
|
)
|
376
505
|
feature_sets = {}
|
377
506
|
if var_field is not None:
|
@@ -409,51 +538,75 @@ def parse_feature_sets_from_anndata(
|
|
409
538
|
return feature_sets
|
410
539
|
|
411
540
|
|
541
|
+
def is_valid_datetime_str(date_string: str) -> bool | str:
|
542
|
+
try:
|
543
|
+
dt = datetime.fromisoformat(date_string)
|
544
|
+
return dt.isoformat()
|
545
|
+
except ValueError:
|
546
|
+
return False
|
547
|
+
|
548
|
+
|
412
549
|
def infer_feature_type_convert_json(
|
413
|
-
value: Any, mute: bool = False, str_as_ulabel: bool = True
|
414
|
-
) -> tuple[str, Any]:
|
550
|
+
key: str, value: Any, mute: bool = False, str_as_ulabel: bool = True
|
551
|
+
) -> tuple[str, Any, str]:
|
552
|
+
message = ""
|
415
553
|
if isinstance(value, bool):
|
416
|
-
return
|
554
|
+
return "bool", value, message
|
417
555
|
elif isinstance(value, int):
|
418
|
-
return
|
556
|
+
return "int", value, message
|
419
557
|
elif isinstance(value, float):
|
420
|
-
return
|
558
|
+
return "float", value, message
|
559
|
+
elif isinstance(value, date):
|
560
|
+
return "date", value.isoformat(), message
|
561
|
+
elif isinstance(value, datetime):
|
562
|
+
return "datetime", value.isoformat(), message
|
421
563
|
elif isinstance(value, str):
|
422
|
-
if
|
423
|
-
|
564
|
+
if datetime_str := is_valid_datetime_str(value):
|
565
|
+
dt_type = (
|
566
|
+
"date" if len(value) == 10 else "datetime"
|
567
|
+
) # YYYY-MM-DD is exactly 10 characters
|
568
|
+
sanitized_value = datetime_str[:10] if dt_type == "date" else datetime_str # type: ignore
|
569
|
+
return dt_type, sanitized_value, message # type: ignore
|
424
570
|
else:
|
425
|
-
return "str", value
|
571
|
+
return "cat ? str", value, message
|
426
572
|
elif isinstance(value, Iterable) and not isinstance(value, (str, bytes)):
|
427
|
-
if isinstance(value, (pd.Series, np.ndarray)):
|
428
|
-
|
429
|
-
|
430
|
-
|
573
|
+
if isinstance(value, (pd.Series, np.ndarray, pd.Categorical)):
|
574
|
+
dtype = convert_pandas_dtype_to_lamin_dtype(value.dtype)
|
575
|
+
if dtype == "str":
|
576
|
+
# ndarray doesn't know categorical, so there was no conscious choice
|
577
|
+
# offer both options
|
578
|
+
if isinstance(value, np.ndarray):
|
579
|
+
dtype = "cat ? str"
|
580
|
+
else:
|
581
|
+
# suggest to create a categorical if there are few unique values
|
582
|
+
message = suggest_categorical_for_str_iterable(value, key)
|
583
|
+
if message:
|
584
|
+
message = f" # {message}"
|
585
|
+
return dtype, list(value), message
|
431
586
|
if isinstance(value, dict):
|
432
|
-
return "dict", value
|
587
|
+
return "dict", value, message
|
433
588
|
if len(value) > 0: # type: ignore
|
434
589
|
first_element_type = type(next(iter(value)))
|
435
590
|
if all(isinstance(elem, first_element_type) for elem in value):
|
436
591
|
if first_element_type is bool:
|
437
|
-
return
|
592
|
+
return "list[bool]", value, message
|
438
593
|
elif first_element_type is int:
|
439
|
-
return
|
594
|
+
return "list[int]", value, message
|
440
595
|
elif first_element_type is float:
|
441
|
-
return
|
596
|
+
return "list[float]", value, message
|
442
597
|
elif first_element_type is str:
|
443
|
-
|
444
|
-
return FEATURE_TYPES["str"] + "[ULabel]", value
|
445
|
-
else:
|
446
|
-
return "list[str]", value
|
598
|
+
return ("list[cat ? str]", value, message)
|
447
599
|
elif first_element_type == Record:
|
448
600
|
return (
|
449
|
-
f"cat[{first_element_type.__get_name_with_schema__()}]",
|
601
|
+
f"list[cat[{first_element_type.__get_name_with_schema__()}]]",
|
450
602
|
value,
|
603
|
+
message,
|
451
604
|
)
|
452
605
|
elif isinstance(value, Record):
|
453
|
-
return (f"cat[{value.__class__.__get_name_with_schema__()}]", value)
|
606
|
+
return (f"cat[{value.__class__.__get_name_with_schema__()}]", value, message)
|
454
607
|
if not mute:
|
455
608
|
logger.warning(f"cannot infer feature type of: {value}, returning '?")
|
456
|
-
return
|
609
|
+
return "?", value, message
|
457
610
|
|
458
611
|
|
459
612
|
def __init__(self, host: Artifact | Collection | Run):
|
@@ -463,12 +616,13 @@ def __init__(self, host: Artifact | Collection | Run):
|
|
463
616
|
|
464
617
|
|
465
618
|
def __repr__(self) -> str:
|
466
|
-
|
619
|
+
tree = describe_features(self._host, print_params=(self.__class__ == ParamManager)) # type: ignore
|
620
|
+
return print_rich_tree(tree, fallback="no linked features")
|
467
621
|
|
468
622
|
|
469
623
|
def get_values(self) -> dict[str, Any]:
|
470
624
|
"""Get feature values as a dictionary."""
|
471
|
-
return
|
625
|
+
return describe_features(
|
472
626
|
self._host, to_dict=True, print_params=(self.__class__ == ParamManager)
|
473
627
|
) # type: ignore
|
474
628
|
|
@@ -669,10 +823,14 @@ def _add_values(
|
|
669
823
|
validated_keys = keys_array[validated]
|
670
824
|
if validated.sum() != len(keys):
|
671
825
|
not_validated_keys = keys_array[~validated]
|
826
|
+
not_validated_keys_dtype_message = [
|
827
|
+
(key, infer_feature_type_convert_json(key, features_values[key]))
|
828
|
+
for key in not_validated_keys
|
829
|
+
]
|
672
830
|
hint = "\n".join(
|
673
831
|
[
|
674
|
-
f" ln.{model_name}(name='{key}', dtype='{
|
675
|
-
for key in
|
832
|
+
f" ln.{model_name}(name='{key}', dtype='{dtype}').save(){message}"
|
833
|
+
for key, (dtype, _, message) in not_validated_keys_dtype_message
|
676
834
|
]
|
677
835
|
)
|
678
836
|
msg = (
|
@@ -690,12 +848,13 @@ def _add_values(
|
|
690
848
|
not_validated_values = []
|
691
849
|
for key, value in features_values.items():
|
692
850
|
feature = model.get(name=key)
|
693
|
-
inferred_type, converted_value = infer_feature_type_convert_json(
|
851
|
+
inferred_type, converted_value, _ = infer_feature_type_convert_json(
|
852
|
+
key,
|
694
853
|
value,
|
695
854
|
mute=True,
|
696
855
|
str_as_ulabel=str_as_ulabel,
|
697
856
|
)
|
698
|
-
if feature.dtype == "
|
857
|
+
if feature.dtype == "num":
|
699
858
|
if inferred_type not in {"int", "float"}:
|
700
859
|
raise TypeError(
|
701
860
|
f"Value for feature '{key}' with type {feature.dtype} must be a number"
|
@@ -706,12 +865,13 @@ def _add_values(
|
|
706
865
|
raise TypeError(
|
707
866
|
f"Value for feature '{key}' with type '{feature.dtype}' must be a string or record."
|
708
867
|
)
|
709
|
-
elif
|
868
|
+
elif (feature.dtype == "str" and feature.dtype not in inferred_type) or (
|
869
|
+
feature.dtype != "str" and feature.dtype != inferred_type
|
870
|
+
):
|
710
871
|
raise ValidationError(
|
711
872
|
f"Expected dtype for '{key}' is '{feature.dtype}', got '{inferred_type}'"
|
712
873
|
)
|
713
874
|
if not feature.dtype.startswith("cat"):
|
714
|
-
# can remove the query once we have the unique constraint
|
715
875
|
filter_kwargs = {model_name.lower(): feature, "value": converted_value}
|
716
876
|
feature_value = value_model.filter(**filter_kwargs).one_or_none()
|
717
877
|
if feature_value is None:
|
@@ -814,6 +974,59 @@ def add_values_params(
|
|
814
974
|
_add_values(self, values, Param.name, str_as_ulabel=False)
|
815
975
|
|
816
976
|
|
977
|
+
def remove_values(
|
978
|
+
self,
|
979
|
+
feature: str | Feature,
|
980
|
+
*,
|
981
|
+
value: Any | None = None,
|
982
|
+
):
|
983
|
+
"""Remove value annotations for a given feature.
|
984
|
+
|
985
|
+
Args:
|
986
|
+
feature: The feature for which to remove values.
|
987
|
+
value: An optional value to restrict removal to a single value.
|
988
|
+
|
989
|
+
"""
|
990
|
+
if isinstance(feature, str):
|
991
|
+
feature = Feature.get(name=feature)
|
992
|
+
filter_kwargs = {"feature": feature}
|
993
|
+
if feature.dtype.startswith("cat["):
|
994
|
+
feature_registry = feature.dtype.replace("cat[", "").replace("]", "")
|
995
|
+
if value is not None:
|
996
|
+
assert isinstance(value, Record) # noqa: S101
|
997
|
+
# the below uses our convention for field names in link models
|
998
|
+
link_name = (
|
999
|
+
feature_registry.split(".")[1]
|
1000
|
+
if "." in feature_registry
|
1001
|
+
else feature_registry
|
1002
|
+
).lower()
|
1003
|
+
filter_kwargs[link_name] = value
|
1004
|
+
if feature_registry == "ULabel":
|
1005
|
+
link_attribute = "links_ulabel"
|
1006
|
+
else:
|
1007
|
+
link_models_on_models = {
|
1008
|
+
getattr(
|
1009
|
+
Artifact, obj.related_name
|
1010
|
+
).through.__get_name_with_schema__(): obj.related_model.__get_name_with_schema__()
|
1011
|
+
for obj in Artifact._meta.related_objects
|
1012
|
+
if obj.related_model.__get_name_with_schema__() == feature_registry
|
1013
|
+
}
|
1014
|
+
link_attribute = {
|
1015
|
+
obj.related_name
|
1016
|
+
for obj in Artifact._meta.related_objects
|
1017
|
+
if obj.related_model.__get_name_with_schema__() in link_models_on_models
|
1018
|
+
}.pop()
|
1019
|
+
getattr(self._host, link_attribute).filter(**filter_kwargs).all().delete()
|
1020
|
+
else:
|
1021
|
+
if value is not None:
|
1022
|
+
filter_kwargs["value"] = value
|
1023
|
+
feature_values = self._host._feature_values.filter(**filter_kwargs)
|
1024
|
+
self._host._feature_values.remove(*feature_values)
|
1025
|
+
# this might leave a dangling feature_value record
|
1026
|
+
# but we don't want to pay the price of making another query just to remove this annotation
|
1027
|
+
# we can clean the FeatureValue registry periodically if we want to
|
1028
|
+
|
1029
|
+
|
817
1030
|
def add_feature_set(self, feature_set: FeatureSet, slot: str) -> None:
|
818
1031
|
"""Curate artifact with a feature set.
|
819
1032
|
|
@@ -847,7 +1060,10 @@ def add_feature_set(self, feature_set: FeatureSet, slot: str) -> None:
|
|
847
1060
|
|
848
1061
|
|
849
1062
|
def _add_set_from_df(
|
850
|
-
self,
|
1063
|
+
self,
|
1064
|
+
field: FieldAttr = Feature.name,
|
1065
|
+
organism: str | None = None,
|
1066
|
+
mute: bool = False,
|
851
1067
|
):
|
852
1068
|
"""Add feature set corresponding to column names of DataFrame."""
|
853
1069
|
if isinstance(self._host, Artifact):
|
@@ -855,21 +1071,14 @@ def _add_set_from_df(
|
|
855
1071
|
else:
|
856
1072
|
# Collection
|
857
1073
|
assert self._host.artifact._accessor == "DataFrame" # noqa: S101
|
858
|
-
|
859
|
-
# parse and register features
|
860
|
-
registry = field.field.model
|
861
1074
|
df = self._host.load()
|
862
|
-
|
863
|
-
|
864
|
-
|
865
|
-
|
866
|
-
|
867
|
-
|
868
|
-
|
869
|
-
# create and link feature sets
|
870
|
-
feature_set = FeatureSet(features=features)
|
871
|
-
feature_sets = {"columns": feature_set}
|
872
|
-
self._host._feature_sets = feature_sets
|
1075
|
+
feature_set = FeatureSet.from_df(
|
1076
|
+
df=df,
|
1077
|
+
field=field,
|
1078
|
+
mute=mute,
|
1079
|
+
organism=organism,
|
1080
|
+
)
|
1081
|
+
self._host._feature_sets = {"columns": feature_set}
|
873
1082
|
self._host.save()
|
874
1083
|
|
875
1084
|
|
@@ -1056,6 +1265,7 @@ FeatureManager._add_from = _add_from
|
|
1056
1265
|
FeatureManager.filter = filter
|
1057
1266
|
FeatureManager.get = get
|
1058
1267
|
FeatureManager.make_external = make_external
|
1268
|
+
FeatureManager.remove_values = remove_values
|
1059
1269
|
ParamManager.add_values = add_values_params
|
1060
1270
|
ParamManager.get_values = get_values
|
1061
1271
|
ParamManager.filter = filter
|