lamindb 0.77.0__py3-none-any.whl → 0.77.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,9 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import warnings
3
4
  from collections import defaultdict
4
5
  from collections.abc import Iterable
6
+ from datetime import date, datetime
5
7
  from itertools import compress
6
8
  from typing import TYPE_CHECKING, Any
7
9
 
@@ -31,9 +33,16 @@ from lnschema_core.models import (
31
33
  Run,
32
34
  ULabel,
33
35
  )
36
+ from rich.table import Column, Table
37
+ from rich.text import Text
34
38
 
35
- from lamindb._feature import FEATURE_TYPES, convert_numpy_dtype_to_lamin_feature_type
39
+ from lamindb._feature import (
40
+ FEATURE_DTYPES,
41
+ convert_pandas_dtype_to_lamin_dtype,
42
+ suggest_categorical_for_str_iterable,
43
+ )
36
44
  from lamindb._feature_set import DICT_KEYS_TYPE, FeatureSet
45
+ from lamindb._from_values import _print_values
37
46
  from lamindb._record import (
38
47
  REGISTRY_UNIQUE_FIELD,
39
48
  get_name_field,
@@ -44,8 +53,15 @@ from lamindb._save import save
44
53
  from lamindb.core.exceptions import DoesNotExist, ValidationError
45
54
  from lamindb.core.storage import LocalPathClasses
46
55
 
56
+ from ._describe import (
57
+ NAME_WIDTH,
58
+ TYPE_WIDTH,
59
+ VALUES_WIDTH,
60
+ describe_header,
61
+ print_rich_tree,
62
+ )
47
63
  from ._django import get_artifact_with_related
48
- from ._label_manager import get_labels_as_dict
64
+ from ._label_manager import _get_labels, describe_labels
49
65
  from ._settings import settings
50
66
  from .schema import (
51
67
  dict_related_model_to_related_name,
@@ -53,6 +69,7 @@ from .schema import (
53
69
 
54
70
  if TYPE_CHECKING:
55
71
  from lnschema_core.types import FieldAttr
72
+ from rich.tree import Tree
56
73
 
57
74
  from lamindb._query_set import QuerySet
58
75
 
@@ -75,7 +92,9 @@ def get_accessor_by_registry_(host: Artifact | Collection) -> dict:
75
92
  return dictionary
76
93
 
77
94
 
78
- def get_feature_set_by_slot_(host) -> dict:
95
+ def get_feature_set_by_slot_(host: Artifact | Collection) -> dict:
96
+ if isinstance(host, Collection):
97
+ return {}
79
98
  # if the host is not yet saved
80
99
  if host._state.adding:
81
100
  if hasattr(host, "_feature_sets"):
@@ -134,14 +153,14 @@ def custom_aggregate(field, using: str):
134
153
  return GroupConcat(field)
135
154
 
136
155
 
137
- def _print_categoricals_postgres(
156
+ def _get_categoricals_postgres(
138
157
  self: Artifact | Collection,
139
158
  related_data: dict | None = None,
140
- print_types: bool = False,
141
- to_dict: bool = False,
142
159
  print_params: bool = False,
143
- ):
144
- from lamindb._from_values import _print_values
160
+ ) -> dict[tuple[str, str], set[str]]:
161
+ """Get categorical features and their values using PostgreSQL-specific optimizations."""
162
+ if print_params:
163
+ return {}
145
164
 
146
165
  if not related_data:
147
166
  artifact_meta = get_artifact_with_related(
@@ -149,6 +168,7 @@ def _print_categoricals_postgres(
149
168
  )
150
169
  related_data = artifact_meta.get("related_data", {})
151
170
 
171
+ # Process m2m data
152
172
  m2m_data = related_data.get("m2m", {}) if related_data else {}
153
173
  m2m_name = {}
154
174
  for related_name, values in m2m_data.items():
@@ -157,6 +177,8 @@ def _print_categoricals_postgres(
157
177
  self.__class__.__name__, ""
158
178
  ).lower()
159
179
  m2m_name[related_model_name] = values
180
+
181
+ # Get feature information
160
182
  links_data = related_data.get("link", {}) if related_data else {}
161
183
  feature_dict = {
162
184
  id: (name, dtype)
@@ -165,188 +187,295 @@ def _print_categoricals_postgres(
165
187
  )
166
188
  }
167
189
 
168
- msg = ""
169
- dictionary = {}
190
+ # Build result dictionary
191
+ result = defaultdict(set)
192
+ for link_name, link_values in links_data.items():
193
+ related_name = link_name.removeprefix("links_").replace("_", "")
194
+ if not link_values:
195
+ continue
170
196
 
171
- # categorical feature values
172
- if not print_params:
173
- labels_msg = ""
174
- labels_msgs = []
175
- feature_values: dict = {}
176
- for link_name, link_values in links_data.items():
177
- related_name = link_name.removeprefix("links_").replace("_", "")
178
- link_model = getattr(self.__class__, link_name).rel.related_model
179
- if not link_values:
197
+ for link_value in link_values:
198
+ feature_id = link_value.get("feature")
199
+ if feature_id is None:
180
200
  continue
181
- for link_value in link_values:
182
- feature_id = link_value.get("feature")
183
- if feature_id is None:
184
- continue
185
- feature_name = feature_dict.get(feature_id)[0]
186
- if feature_name not in feature_values:
187
- feature_values[feature_name] = (feature_dict.get(feature_id)[1], [])
188
- label_id = link_value.get(related_name)
189
- feature_values[feature_name][1].append(
190
- m2m_name.get(related_name, {}).get(label_id)
191
- )
192
- for feature_name, (dtype, labels_list) in feature_values.items():
193
- print_values = _print_values(labels_list, n=10)
194
- type_str = f": {dtype}" if print_types else ""
195
- if to_dict:
196
- dictionary[feature_name] = (
197
- labels_list if len(labels_list) > 1 else labels_list[0]
198
- )
199
- labels_msgs.append(f" '{feature_name}'{type_str} = {print_values}")
200
- if len(labels_msgs) > 0:
201
- labels_msg = "\n".join(sorted(labels_msgs)) + "\n"
202
- msg += labels_msg
203
- return msg, dictionary
204
201
 
202
+ feature_name, feature_dtype = feature_dict.get(feature_id)
203
+ label_id = link_value.get(related_name)
204
+ label_name = m2m_name.get(related_name, {}).get(label_id)
205
+ if label_name:
206
+ result[(feature_name, feature_dtype)].add(label_name)
207
+
208
+ return dict(result)
205
209
 
def _get_categoricals(
    self: Artifact | Collection,
    print_params: bool = False,
) -> dict[tuple[str, str], set[str]]:
    """Get categorical features and their values using the default approach.

    Args:
        self: The record whose label links are inspected.
        print_params: If `True`, an empty dictionary is returned right away.

    Returns:
        A dictionary mapping `(feature name, feature dtype)` to the set of
        label names linked under that feature.
    """
    if print_params:
        return {}

    db = self._state.db
    # several links usually point to the same feature: query each feature once
    # instead of once per link
    features_by_id: dict = {}
    result = defaultdict(set)
    for links in _get_labels(self, links=True, instance=db).values():
        for link in links:
            feature_id = getattr(link, "feature_id", None)
            if feature_id is None:
                continue
            feature = features_by_id.get(feature_id)
            if feature is None:
                feature = Feature.objects.using(db).get(id=feature_id)
                features_by_id[feature_id] = feature
            link_attr = get_link_attr(link, self)
            label_name = getattr(link, link_attr).name
            result[(feature.name, feature.dtype)].add(label_name)

    return dict(result)
229
+
230
+
def _get_non_categoricals(
    self,
    print_params: bool = False,
) -> dict[tuple[str, str], set[Any]]:
    """Collect non-categorical feature (or param) values of a saved record.

    Returns a dictionary keyed by `(name, dtype)`. Values are aggregated per
    feature; only unsaved records and records that are neither `Artifact` nor
    `Run` yield an empty dictionary.
    """
    collected = {}

    # the id check comes first so that unsaved records return immediately
    if self.id is None or not isinstance(self, (Artifact, Run)):
        return collected

    prefix = "param" if print_params else "feature"
    name_key = f"{prefix}__name"
    dtype_key = f"{prefix}__dtype"
    rows = (
        getattr(self, f"_{prefix}_values")
        .values(name_key, dtype_key)
        .annotate(values=custom_aggregate("value", self._state.db))
        .order_by(name_key)
    )

    for row in rows:
        name = row[name_key]
        dtype = row[dtype_key]
        raw = row["values"]

        if not isinstance(raw, (list, dict, set)):
            # a single aggregated value becomes a one-element set
            raw = {raw}
        elif isinstance(raw, list) and not (dtype == "dict" or dtype.startswith("list")):
            raw = set(raw)

        # ISO strings are turned back into date/datetime objects
        if dtype == "datetime":
            raw = {datetime.fromisoformat(item) for item in raw}
        elif dtype == "date":
            raw = {date.fromisoformat(item) for item in raw}

        collected[(name, dtype)] = raw

    return collected
271
+
272
+
def _get_featuresets_postgres(
    self: Artifact | Collection,
    related_data: dict | None = None,
) -> dict:
    """Return the `"featuresets"` entry of the record's related data.

    If `related_data` is empty or `None`, it is fetched through
    `get_artifact_with_related`; a missing entry yields an empty dictionary.
    """
    if not related_data:
        fetched = get_artifact_with_related(self, include_featureset=True)
        related_data = fetched.get("related_data", {})

    if not related_data:
        return {}
    return related_data.get("featuresets", {})
283
+
284
+
def _create_feature_table(name: str, registry_str: str, data: list) -> Table:
    """Build a borderless Rich table with one row per entry of `data`.

    `name` and `registry_str` become the headers of the first two columns; the
    third column has an empty header.
    """
    columns = (
        Column(name, style="", no_wrap=True, width=NAME_WIDTH),
        Column(registry_str, style="dim", no_wrap=True, width=TYPE_WIDTH),
        Column("", width=VALUES_WIDTH, no_wrap=True),
    )
    feature_table = Table(*columns, show_header=True, box=None, pad_edge=False)
    for cells in data:
        feature_table.add_row(*cells)
    return feature_table
266
298
 
267
299
 
def describe_features(
    self: Artifact | Collection,
    related_data: dict | None = None,
    print_types: bool = False,
    to_dict: bool = False,
    print_params: bool = False,
    tree: Tree | None = None,
    with_labels: bool = False,
):
    """Describe features of an artifact or collection.

    Args:
        self: The record whose features (or params) are described.
        related_data: Pre-fetched related data that is forwarded to the
            PostgreSQL-specific helpers.
        print_types: Deprecated; a truthy value only emits a
            `DeprecationWarning`.
        to_dict: If `True`, return a dictionary mapping names to values
            instead of a tree.
        print_params: If `True`, describe params instead of features.
        tree: An existing tree to extend; if `None`, one is created with
            `describe_header`.
        with_labels: If `True`, also add the subtree returned by
            `describe_labels` under "Annotations".

    Returns:
        The dictionary of values if `to_dict` is `True`, otherwise the tree.
    """
    if print_types:
        warnings.warn(
            "`print_types` parameter is deprecated and will be removed in a future version. Types are now always printed.",
            DeprecationWarning,
            stacklevel=2,
        )

    # initialize tree
    if tree is None:
        tree = describe_header(self)

    dictionary: dict[str, Any] = {}

    if self._state.adding:
        return dictionary if to_dict else tree

    # the record is saved from here on, so only the database vendor matters
    is_postgres = connections[self._state.db].vendor == "postgresql"

    # feature sets
    feature_set_data: dict[str, tuple[FeatureSet, list[str]]] = {}
    # maps a feature name to (slot, registry) of the feature set it belongs to
    feature_data: dict[str, tuple[str, str]] = {}
    if not print_params and not to_dict:
        if self.id is not None and is_postgres:
            fs_data = _get_featuresets_postgres(self, related_data=related_data)
            for fs_id, (slot, data) in fs_data.items():
                feature_set = None
                for registry_str, feature_names in data.items():
                    if feature_set is None:
                        # one lookup per feature set rather than one per registry
                        feature_set = FeatureSet.get(id=fs_id)
                    feature_set_data[slot] = (feature_set, feature_names)
                    for feature_name in feature_names:
                        feature_data[feature_name] = (slot, registry_str)
        else:
            for slot, feature_set in get_feature_set_by_slot_(self).items():
                features = feature_set.members
                # features.first() is a lot slower than features[0] here
                name_field = get_name_field(features[0])
                feature_names = list(features.values_list(name_field, flat=True)[:20])
                feature_set_data[slot] = (feature_set, feature_names)
                for feature_name in feature_names:
                    feature_data[feature_name] = (slot, feature_set.registry)

    # names of features that are part of the artifact's `Feature` feature sets;
    # only needed for the tree, hence skipped for `to_dict`
    internal_feature_names: set[str] = set()
    if not to_dict and isinstance(self, Artifact):
        for feature_set in self.feature_sets.filter(registry="Feature").all():
            internal_feature_names.update(
                feature_set.members.values_list("name", flat=True)
            )

    # categorical feature values
    # Get the categorical data using the appropriate method
    if is_postgres:
        categoricals = _get_categoricals_postgres(
            self,
            related_data=related_data,
            print_params=print_params,
        )
    else:
        categoricals = _get_categoricals(
            self,
            print_params=print_params,
        )

    # Get non-categorical features
    non_categoricals = _get_non_categoricals(
        self,
        print_params=print_params,
    )

    # Process all Features containing labels and sort into internal/external
    internal_feature_labels = {}
    external_data = []
    for features, is_list_type in [(categoricals, False), (non_categoricals, True)]:
        for (feature_name, feature_dtype), values in sorted(features.items()):
            # Handle dictionary conversion
            if to_dict:
                dict_value = values if len(values) > 1 else next(iter(values))
                dictionary[feature_name] = dict_value
                continue

            # Format message
            printed_values = (
                _print_values(sorted(values), n=10, quotes=False)
                if not is_list_type or not feature_dtype.startswith("list")
                else sorted(values)
            )

            # Sort into internal/external
            feature_info = (
                feature_name,
                Text(feature_dtype, style="dim"),
                printed_values,
            )
            if feature_name in internal_feature_names:
                internal_feature_labels[feature_name] = feature_info
            else:
                external_data.append(feature_info)

    if to_dict:
        return dictionary

    # Dataset section
    # internal features from the `Feature` registry that contain labels
    internal_features_slot: dict[str, list] = {}
    for feature_name, feature_row in internal_feature_labels.items():
        slot_info = feature_data.get(feature_name)
        if slot_info is None:
            # no slot is known for this name (e.g. `print_params=True`, or the
            # name lies beyond the 20 names kept per feature set): list the row
            # under "Annotations" rather than failing on unpacking `None`
            external_data.append(feature_row)
            continue
        internal_features_slot.setdefault(slot_info[0], []).append(feature_row)
    dataset_tree_children = []

    for slot, (feature_set, feature_names) in feature_set_data.items():
        if slot in internal_features_slot:
            feature_rows = internal_features_slot[slot]
        else:
            feature_rows = [
                (feature_name, Text(str(feature_set.dtype), style="dim"), "")
                for feature_name in feature_names
                if feature_name
            ]
        dataset_tree_children.append(
            _create_feature_table(
                Text.assemble(
                    (slot, "violet"),
                    (" • ", "dim"),
                    (str(feature_set.n), "pink1"),
                ),
                Text.assemble((f"[{feature_set.registry}]", "pink1")),
                feature_rows,
            )
        )
    ## internal features from the non-`Feature` registry
    if dataset_tree_children:
        dataset_tree = tree.add(
            Text.assemble(
                ("Dataset", "bold bright_magenta"),
                ("/", "dim"),
                (".feature_sets", "dim bold"),
            )
        )
        for child in dataset_tree_children:
            dataset_tree.add(child)

    # Annotations section
    ## external features
    features_tree_children = []
    if external_data:
        features_tree_children.append(
            _create_feature_table(
                Text.assemble(
                    ("Params" if print_params else "Features", "green_yellow")
                ),
                "",
                external_data,
            )
        )
    annotations_tree = None
    if features_tree_children:
        annotations_tree = tree.add(Text("Annotations", style="bold dark_orange"))
        for child in features_tree_children:
            annotations_tree.add(child)
    if with_labels:
        labels_tree = describe_labels(self, as_subtree=True)
        if labels_tree:
            if annotations_tree is None:
                annotations_tree = tree.add(
                    Text("Annotations", style="bold dark_orange")
                )
            annotations_tree.add(labels_tree)

    return tree
350
479
 
351
480
 
352
481
  def parse_feature_sets_from_anndata(
@@ -371,7 +500,7 @@ def parse_feature_sets_from_anndata(
371
500
  type = (
372
501
  "float"
373
502
  if adata.X is None
374
- else convert_numpy_dtype_to_lamin_feature_type(adata.X.dtype)
503
+ else convert_pandas_dtype_to_lamin_dtype(adata.X.dtype)
375
504
  )
376
505
  feature_sets = {}
377
506
  if var_field is not None:
@@ -409,51 +538,75 @@ def parse_feature_sets_from_anndata(
409
538
  return feature_sets
410
539
 
411
540
 
def is_valid_datetime_str(date_string: str) -> bool | str:
    """Check whether a string is an ISO 8601 date or datetime.

    Args:
        date_string: The candidate string.

    Returns:
        The normalized `datetime.isoformat()` string (always truthy) if
        `date_string` can be parsed by `datetime.fromisoformat`, otherwise
        `False`. Non-string input also yields `False`.
    """
    try:
        parsed = datetime.fromisoformat(date_string)
    except (TypeError, ValueError):
        # TypeError: not a string; ValueError: not an ISO 8601 string
        return False
    return parsed.isoformat()
547
+
548
+
def infer_feature_type_convert_json(
    key: str, value: Any, mute: bool = False, str_as_ulabel: bool = True
) -> tuple[str, Any, str]:
    """Infer the feature dtype of a value and convert the value for storage.

    Args:
        key: The feature name; only used to build the categorical suggestion.
        value: The value whose dtype is inferred.
        mute: If `True`, do not log a warning when the dtype cannot be inferred.
        str_as_ulabel: Not used by this function; kept so that existing callers
            passing it keep working.

    Returns:
        A tuple `(dtype, converted_value, message)`. `dtype` is `"?"` if no
        dtype could be inferred. `date` and `datetime` values are converted to
        ISO strings, array-like values to lists. `message` is an optional hint
        and is empty in most cases.
    """
    message = ""
    if isinstance(value, bool):
        # bool is tested before int because bool is a subclass of int
        return "bool", value, message
    elif isinstance(value, int):
        return "int", value, message
    elif isinstance(value, float):
        return "float", value, message
    elif isinstance(value, datetime):
        # datetime is tested before date because datetime is a subclass of date;
        # with the reverse order every datetime would be labelled "date"
        return "datetime", value.isoformat(), message
    elif isinstance(value, date):
        return "date", value.isoformat(), message
    elif isinstance(value, str):
        if datetime_str := is_valid_datetime_str(value):
            dt_type = (
                "date" if len(value) == 10 else "datetime"
            )  # YYYY-MM-DD is exactly 10 characters
            sanitized_value = datetime_str[:10] if dt_type == "date" else datetime_str  # type: ignore
            return dt_type, sanitized_value, message  # type: ignore
        else:
            return "cat ? str", value, message
    elif isinstance(value, Iterable) and not isinstance(value, (str, bytes)):
        if isinstance(value, (pd.Series, np.ndarray, pd.Categorical)):
            dtype = convert_pandas_dtype_to_lamin_dtype(value.dtype)
            if dtype == "str":
                # ndarray doesn't know categorical, so there was no conscious choice
                # offer both options
                if isinstance(value, np.ndarray):
                    dtype = "cat ? str"
                else:
                    # suggest to create a categorical if there are few unique values
                    message = suggest_categorical_for_str_iterable(value, key)
                    if message:
                        message = f" # {message}"
            return dtype, list(value), message
        if isinstance(value, dict):
            return "dict", value, message
        if len(value) > 0:  # type: ignore
            first_element_type = type(next(iter(value)))
            if all(isinstance(elem, first_element_type) for elem in value):
                if first_element_type is bool:
                    return "list[bool]", value, message
                elif first_element_type is int:
                    return "list[int]", value, message
                elif first_element_type is float:
                    return "list[float]", value, message
                elif first_element_type is str:
                    return ("list[cat ? str]", value, message)
                # NOTE(review): this only matches elements whose exact type is
                # `Record`; presumably subclasses were meant — confirm
                elif first_element_type == Record:
                    return (
                        f"list[cat[{first_element_type.__get_name_with_schema__()}]]",
                        value,
                        message,
                    )
    elif isinstance(value, Record):
        return (f"cat[{value.__class__.__get_name_with_schema__()}]", value, message)
    if not mute:
        logger.warning(f"cannot infer feature type of: {value}, returning '?")
    return "?", value, message
457
610
 
458
611
 
459
612
  def __init__(self, host: Artifact | Collection | Run):
@@ -463,12 +616,13 @@ def __init__(self, host: Artifact | Collection | Run):
463
616
 
464
617
 
def __repr__(self) -> str:
    """Render the host's features (params for a `ParamManager`) as a tree."""
    describe_params = self.__class__ == ParamManager
    rendered = describe_features(self._host, print_params=describe_params)  # type: ignore
    return print_rich_tree(rendered, fallback="no linked features")
467
621
 
468
622
 
def get_values(self) -> dict[str, Any]:
    """Return the host's feature values (param values for a `ParamManager`) as a dictionary."""
    describe_params = self.__class__ == ParamManager
    values = describe_features(self._host, to_dict=True, print_params=describe_params)
    return values  # type: ignore
474
628
 
@@ -669,10 +823,14 @@ def _add_values(
669
823
  validated_keys = keys_array[validated]
670
824
  if validated.sum() != len(keys):
671
825
  not_validated_keys = keys_array[~validated]
826
+ not_validated_keys_dtype_message = [
827
+ (key, infer_feature_type_convert_json(key, features_values[key]))
828
+ for key in not_validated_keys
829
+ ]
672
830
  hint = "\n".join(
673
831
  [
674
- f" ln.{model_name}(name='{key}', dtype='{infer_feature_type_convert_json(features_values[key], str_as_ulabel=str_as_ulabel)[0]}').save()"
675
- for key in not_validated_keys
832
+ f" ln.{model_name}(name='{key}', dtype='{dtype}').save(){message}"
833
+ for key, (dtype, _, message) in not_validated_keys_dtype_message
676
834
  ]
677
835
  )
678
836
  msg = (
@@ -690,12 +848,13 @@ def _add_values(
690
848
  not_validated_values = []
691
849
  for key, value in features_values.items():
692
850
  feature = model.get(name=key)
693
- inferred_type, converted_value = infer_feature_type_convert_json(
851
+ inferred_type, converted_value, _ = infer_feature_type_convert_json(
852
+ key,
694
853
  value,
695
854
  mute=True,
696
855
  str_as_ulabel=str_as_ulabel,
697
856
  )
698
- if feature.dtype == "number":
857
+ if feature.dtype == "num":
699
858
  if inferred_type not in {"int", "float"}:
700
859
  raise TypeError(
701
860
  f"Value for feature '{key}' with type {feature.dtype} must be a number"
@@ -706,12 +865,13 @@ def _add_values(
706
865
  raise TypeError(
707
866
  f"Value for feature '{key}' with type '{feature.dtype}' must be a string or record."
708
867
  )
709
- elif not inferred_type == feature.dtype:
868
+ elif (feature.dtype == "str" and feature.dtype not in inferred_type) or (
869
+ feature.dtype != "str" and feature.dtype != inferred_type
870
+ ):
710
871
  raise ValidationError(
711
872
  f"Expected dtype for '{key}' is '{feature.dtype}', got '{inferred_type}'"
712
873
  )
713
874
  if not feature.dtype.startswith("cat"):
714
- # can remove the query once we have the unique constraint
715
875
  filter_kwargs = {model_name.lower(): feature, "value": converted_value}
716
876
  feature_value = value_model.filter(**filter_kwargs).one_or_none()
717
877
  if feature_value is None:
@@ -814,6 +974,59 @@ def add_values_params(
814
974
  _add_values(self, values, Param.name, str_as_ulabel=False)
815
975
 
816
976
 
def remove_values(
    self,
    feature: str | Feature,
    *,
    value: Any | None = None,
):
    """Remove value annotations for a given feature.

    For features whose dtype starts with `"cat["`, the matching link records
    on the host are deleted. For all other features, the matching feature
    values are removed from the host's `_feature_values`.

    Args:
        feature: The feature for which to remove values; a string is resolved
            with `Feature.get(name=...)`.
        value: An optional value to restrict removal to a single value. For a
            categorical feature it has to be a `Record`.

    """
    if isinstance(feature, str):
        feature = Feature.get(name=feature)
    filter_kwargs = {"feature": feature}
    if feature.dtype.startswith("cat["):
        # strip the `cat[...]` wrapper to obtain the registry name
        # NOTE(review): presumably the dtype names a single registry here — confirm
        feature_registry = feature.dtype.replace("cat[", "").replace("]", "")
        if value is not None:
            assert isinstance(value, Record)  # noqa: S101
            # the below uses our convention for field names in link models
            link_name = (
                feature_registry.split(".")[1]
                if "." in feature_registry
                else feature_registry
            ).lower()
            filter_kwargs[link_name] = value
        if feature_registry == "ULabel":
            link_attribute = "links_ulabel"
        else:
            # names of the through models of those Artifact relations whose
            # related model is the feature's registry (only the keys are used)
            link_models_on_models = {
                getattr(
                    Artifact, obj.related_name
                ).through.__get_name_with_schema__(): obj.related_model.__get_name_with_schema__()
                for obj in Artifact._meta.related_objects
                if obj.related_model.__get_name_with_schema__() == feature_registry
            }
            # related name on Artifact that points to one of these through
            # models; `.pop()` raises KeyError if none is found
            link_attribute = {
                obj.related_name
                for obj in Artifact._meta.related_objects
                if obj.related_model.__get_name_with_schema__() in link_models_on_models
            }.pop()
        getattr(self._host, link_attribute).filter(**filter_kwargs).all().delete()
    else:
        if value is not None:
            filter_kwargs["value"] = value
        feature_values = self._host._feature_values.filter(**filter_kwargs)
        self._host._feature_values.remove(*feature_values)
        # this might leave a dangling feature_value record
        # but we don't want to pay the price of making another query just to remove this annotation
        # we can clean the FeatureValue registry periodically if we want to
+
1029
+
817
1030
  def add_feature_set(self, feature_set: FeatureSet, slot: str) -> None:
818
1031
  """Curate artifact with a feature set.
819
1032
 
@@ -847,7 +1060,10 @@ def add_feature_set(self, feature_set: FeatureSet, slot: str) -> None:
847
1060
 
848
1061
 
849
1062
  def _add_set_from_df(
850
- self, field: FieldAttr = Feature.name, organism: str | None = None
1063
+ self,
1064
+ field: FieldAttr = Feature.name,
1065
+ organism: str | None = None,
1066
+ mute: bool = False,
851
1067
  ):
852
1068
  """Add feature set corresponding to column names of DataFrame."""
853
1069
  if isinstance(self._host, Artifact):
@@ -855,21 +1071,14 @@ def _add_set_from_df(
855
1071
  else:
856
1072
  # Collection
857
1073
  assert self._host.artifact._accessor == "DataFrame" # noqa: S101
858
-
859
- # parse and register features
860
- registry = field.field.model
861
1074
  df = self._host.load()
862
- features = registry.from_values(df.columns, field=field, organism=organism)
863
- if len(features) == 0:
864
- logger.error(
865
- "no validated features found in DataFrame! please register features first!"
866
- )
867
- return
868
-
869
- # create and link feature sets
870
- feature_set = FeatureSet(features=features)
871
- feature_sets = {"columns": feature_set}
872
- self._host._feature_sets = feature_sets
1075
+ feature_set = FeatureSet.from_df(
1076
+ df=df,
1077
+ field=field,
1078
+ mute=mute,
1079
+ organism=organism,
1080
+ )
1081
+ self._host._feature_sets = {"columns": feature_set}
873
1082
  self._host.save()
874
1083
 
875
1084
 
@@ -1056,6 +1265,7 @@ FeatureManager._add_from = _add_from
1056
1265
  FeatureManager.filter = filter
1057
1266
  FeatureManager.get = get
1058
1267
  FeatureManager.make_external = make_external
1268
+ FeatureManager.remove_values = remove_values
1059
1269
  ParamManager.add_values = add_values_params
1060
1270
  ParamManager.get_values = get_values
1061
1271
  ParamManager.filter = filter