lamindb 0.76.8__py3-none-any.whl → 0.76.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. lamindb/__init__.py +114 -113
  2. lamindb/_artifact.py +1206 -1205
  3. lamindb/_can_validate.py +621 -579
  4. lamindb/_collection.py +390 -387
  5. lamindb/_curate.py +1603 -1601
  6. lamindb/_feature.py +155 -155
  7. lamindb/_feature_set.py +244 -242
  8. lamindb/_filter.py +23 -23
  9. lamindb/_finish.py +250 -256
  10. lamindb/_from_values.py +403 -382
  11. lamindb/_is_versioned.py +40 -40
  12. lamindb/_parents.py +476 -476
  13. lamindb/_query_manager.py +125 -125
  14. lamindb/_query_set.py +364 -362
  15. lamindb/_record.py +668 -649
  16. lamindb/_run.py +60 -57
  17. lamindb/_save.py +310 -308
  18. lamindb/_storage.py +14 -14
  19. lamindb/_transform.py +130 -127
  20. lamindb/_ulabel.py +56 -56
  21. lamindb/_utils.py +9 -9
  22. lamindb/_view.py +72 -72
  23. lamindb/core/__init__.py +94 -94
  24. lamindb/core/_context.py +590 -574
  25. lamindb/core/_data.py +510 -438
  26. lamindb/core/_django.py +209 -0
  27. lamindb/core/_feature_manager.py +994 -867
  28. lamindb/core/_label_manager.py +289 -253
  29. lamindb/core/_mapped_collection.py +631 -597
  30. lamindb/core/_settings.py +188 -187
  31. lamindb/core/_sync_git.py +138 -138
  32. lamindb/core/_track_environment.py +27 -27
  33. lamindb/core/datasets/__init__.py +59 -59
  34. lamindb/core/datasets/_core.py +581 -571
  35. lamindb/core/datasets/_fake.py +36 -36
  36. lamindb/core/exceptions.py +90 -90
  37. lamindb/core/fields.py +12 -12
  38. lamindb/core/loaders.py +164 -164
  39. lamindb/core/schema.py +56 -56
  40. lamindb/core/storage/__init__.py +25 -25
  41. lamindb/core/storage/_anndata_accessor.py +741 -740
  42. lamindb/core/storage/_anndata_sizes.py +41 -41
  43. lamindb/core/storage/_backed_access.py +98 -98
  44. lamindb/core/storage/_tiledbsoma.py +204 -204
  45. lamindb/core/storage/_valid_suffixes.py +21 -21
  46. lamindb/core/storage/_zarr.py +110 -110
  47. lamindb/core/storage/objects.py +62 -62
  48. lamindb/core/storage/paths.py +172 -172
  49. lamindb/core/subsettings/__init__.py +12 -12
  50. lamindb/core/subsettings/_creation_settings.py +38 -38
  51. lamindb/core/subsettings/_transform_settings.py +21 -21
  52. lamindb/core/types.py +19 -19
  53. lamindb/core/versioning.py +146 -158
  54. lamindb/integrations/__init__.py +12 -12
  55. lamindb/integrations/_vitessce.py +107 -107
  56. lamindb/setup/__init__.py +14 -14
  57. lamindb/setup/core/__init__.py +4 -4
  58. {lamindb-0.76.8.dist-info → lamindb-0.76.10.dist-info}/LICENSE +201 -201
  59. {lamindb-0.76.8.dist-info → lamindb-0.76.10.dist-info}/METADATA +8 -8
  60. lamindb-0.76.10.dist-info/RECORD +61 -0
  61. {lamindb-0.76.8.dist-info → lamindb-0.76.10.dist-info}/WHEEL +1 -1
  62. lamindb-0.76.8.dist-info/RECORD +0 -60
lamindb/_can_validate.py CHANGED
@@ -1,579 +1,621 @@
1
- from __future__ import annotations
2
-
3
- from typing import TYPE_CHECKING, Literal
4
-
5
- import lamindb_setup as ln_setup
6
- import numpy as np
7
- import pandas as pd
8
- from django.core.exceptions import FieldDoesNotExist
9
- from lamin_utils import colors, logger
10
- from lamindb_setup.core._docs import doc_args
11
- from lnschema_core import CanValidate, Record
12
-
13
- from lamindb._utils import attach_func_to_class_method
14
-
15
- from ._from_values import _has_organism_field, _print_values
16
- from ._record import _queryset, get_name_field
17
-
18
- if TYPE_CHECKING:
19
- from django.db.models import QuerySet
20
- from lamin_utils._inspect import InspectResult
21
- from lnschema_core.types import ListLike, StrField
22
-
23
-
24
- @classmethod # type: ignore
25
- @doc_args(CanValidate.inspect.__doc__)
26
- def inspect(
27
- cls,
28
- values: ListLike,
29
- field: str | StrField | None = None,
30
- *,
31
- mute: bool = False,
32
- organism: str | Record | None = None,
33
- source: Record | None = None,
34
- ) -> InspectResult:
35
- """{}""" # noqa: D415
36
- return _inspect(
37
- cls=cls,
38
- values=values,
39
- field=field,
40
- mute=mute,
41
- organism=organism,
42
- source=source,
43
- )
44
-
45
-
46
- @classmethod # type: ignore
47
- @doc_args(CanValidate.validate.__doc__)
48
- def validate(
49
- cls,
50
- values: ListLike,
51
- field: str | StrField | None = None,
52
- *,
53
- mute: bool = False,
54
- organism: str | Record | None = None,
55
- source: Record | None = None,
56
- ) -> np.ndarray:
57
- """{}""" # noqa: D415
58
- return _validate(
59
- cls=cls, values=values, field=field, mute=mute, organism=organism, source=source
60
- )
61
-
62
-
63
- def _check_source_db(source: Record, using_key: str | None):
64
- """Check if the source is from the DB."""
65
- if using_key is not None and using_key != "default":
66
- if source._state.db != using_key:
67
- raise ValueError(
68
- f"source must be a bionty.Source record from instance '{using_key}'!"
69
- )
70
-
71
-
72
- def _check_organism_db(organism: Record, using_key: str | None):
73
- """Check if the organism is from the DB."""
74
- if isinstance(organism, Record):
75
- if using_key is not None and using_key != "default":
76
- if organism._state.db != using_key:
77
- raise ValueError(
78
- f"organism must be a bionty.Organism record from instance '{using_key}'!"
79
- )
80
-
81
-
82
- def _concat_lists(values: ListLike) -> list[str]:
83
- """Concatenate a list of lists of strings into a single list."""
84
- if len(values) > 0 and isinstance(values, (list, pd.Series)):
85
- try:
86
- if isinstance(values[0], list):
87
- if isinstance(values, pd.Series):
88
- values = values.tolist()
89
- values = sum([v for v in values if isinstance(v, list)], [])
90
- except KeyError:
91
- pass
92
- return values
93
-
94
-
95
- def _inspect(
96
- cls,
97
- values: ListLike,
98
- field: str | StrField | None = None,
99
- *,
100
- mute: bool = False,
101
- using_key: str | None = None,
102
- organism: str | Record | None = None,
103
- source: Record | None = None,
104
- ) -> pd.DataFrame | dict[str, list[str]]:
105
- """{}""" # noqa: D415
106
- from lamin_utils._inspect import inspect
107
-
108
- if isinstance(values, str):
109
- values = [values]
110
- values = _concat_lists(values)
111
-
112
- field = get_name_field(cls, field=field)
113
- queryset = _queryset(cls, using_key)
114
- using_key = queryset.db
115
- if isinstance(source, Record):
116
- _check_source_db(source, using_key)
117
- queryset = queryset.filter(source=source).all()
118
- _check_organism_db(organism, using_key)
119
- registry = queryset.model
120
- model_name = registry._meta.model.__name__
121
-
122
- # inspect in the DB
123
- result_db = inspect(
124
- df=_filter_query_based_on_organism(
125
- queryset=queryset, field=field, organism=organism
126
- ),
127
- identifiers=values,
128
- field=field,
129
- mute=mute,
130
- )
131
- nonval = set(result_db.non_validated).difference(result_db.synonyms_mapper.keys())
132
-
133
- if len(nonval) > 0 and registry.__get_schema_name__() == "bionty":
134
- try:
135
- bionty_result = registry.public(organism=organism, source=source).inspect(
136
- values=nonval, field=field, mute=True
137
- )
138
- bionty_validated = bionty_result.validated
139
- bionty_mapper = bionty_result.synonyms_mapper
140
- hint = False
141
- if len(bionty_validated) > 0 and not mute:
142
- print_values = _print_values(bionty_validated)
143
- s = "" if len(bionty_validated) == 1 else "s"
144
- labels = colors.yellow(f"{len(bionty_validated)} {model_name} term{s}")
145
- logger.print(
146
- f" detected {labels} in Bionty for"
147
- f" {colors.italic(field)}: {colors.yellow(print_values)}"
148
- )
149
- hint = True
150
-
151
- if len(bionty_mapper) > 0 and not mute:
152
- print_values = _print_values(list(bionty_mapper.keys()))
153
- s = "" if len(bionty_mapper) == 1 else "s"
154
- labels = colors.yellow(f"{len(bionty_mapper)} {model_name} term{s}")
155
- logger.print(
156
- f" detected {labels} in Bionty as {colors.italic(f'synonym{s}')}:"
157
- f" {colors.yellow(print_values)}"
158
- )
159
- hint = True
160
-
161
- if hint:
162
- logger.print(
163
- f"→ add records from Bionty to your {model_name} registry via"
164
- f" {colors.italic('.from_values()')}"
165
- )
166
-
167
- nonval = bionty_result.non_validated
168
- # no bionty source is found
169
- except ValueError:
170
- logger.warning("no Bionty source found, skipping Bionty validation")
171
-
172
- if len(nonval) > 0 and not mute:
173
- print_values = _print_values(list(nonval))
174
- s = "" if len(nonval) == 1 else "s"
175
- labels = colors.red(f"{len(nonval)} term{s}")
176
- logger.print(f" couldn't validate {labels}: {colors.red(print_values)}")
177
- logger.print(
178
- f"→ if you are sure, create new record{s} via"
179
- f" {colors.italic(f'{registry.__name__}()')} and save to your registry"
180
- )
181
-
182
- return result_db
183
-
184
-
185
- def _validate(
186
- cls,
187
- values: ListLike,
188
- field: str | StrField | None = None,
189
- *,
190
- mute: bool = False,
191
- using_key: str | None = None,
192
- organism: str | Record | None = None,
193
- source: Record | None = None,
194
- ) -> np.ndarray:
195
- """{}""" # noqa: D415
196
- from lamin_utils._inspect import validate
197
-
198
- return_str = True if isinstance(values, str) else False
199
- if isinstance(values, str):
200
- values = [values]
201
- values = _concat_lists(values)
202
-
203
- field = get_name_field(cls, field=field)
204
-
205
- queryset = _queryset(cls, using_key)
206
- using_key = queryset.db
207
- if isinstance(source, Record):
208
- _check_source_db(source, using_key)
209
- queryset = queryset.filter(source=source).all()
210
- _check_organism_db(organism, using_key)
211
- field_values = pd.Series(
212
- _filter_query_based_on_organism(
213
- queryset=queryset,
214
- field=field,
215
- organism=organism,
216
- values_list_field=field,
217
- ),
218
- dtype="object",
219
- )
220
- if field_values.empty:
221
- if not mute:
222
- msg = (
223
- f"Your {cls.__name__} registry is empty, consider populating it first!"
224
- )
225
- if hasattr(cls, "source_id"):
226
- msg += "\n → use `.import_from_source()` to import records from a source, e.g. a public ontology"
227
- logger.warning(msg)
228
- return np.array([False] * len(values))
229
-
230
- result = validate(
231
- identifiers=values,
232
- field_values=field_values,
233
- case_sensitive=True,
234
- mute=mute,
235
- field=field,
236
- )
237
- if return_str and len(result) == 1:
238
- return result[0]
239
- else:
240
- return result
241
-
242
-
243
- @classmethod # type: ignore
244
- @doc_args(CanValidate.standardize.__doc__)
245
- def standardize(
246
- cls,
247
- values: ListLike,
248
- field: str | StrField | None = None,
249
- *,
250
- return_field: str = None,
251
- return_mapper: bool = False,
252
- case_sensitive: bool = False,
253
- mute: bool = False,
254
- public_aware: bool = True,
255
- keep: Literal["first", "last", False] = "first",
256
- synonyms_field: str = "synonyms",
257
- organism: str | Record | None = None,
258
- source: Record | None = None,
259
- ) -> list[str] | dict[str, str]:
260
- """{}""" # noqa: D415
261
- return _standardize(
262
- cls=cls,
263
- values=values,
264
- field=field,
265
- return_field=return_field,
266
- return_mapper=return_mapper,
267
- case_sensitive=case_sensitive,
268
- mute=mute,
269
- public_aware=public_aware,
270
- keep=keep,
271
- synonyms_field=synonyms_field,
272
- organism=organism,
273
- source=source,
274
- )
275
-
276
-
277
- def set_abbr(self, value: str):
278
- self.abbr = value
279
-
280
- if hasattr(self, "name") and value == self.name:
281
- pass
282
- else:
283
- try:
284
- self.add_synonym(value, save=False)
285
- except Exception as e: # pragma: no cover
286
- logger.debug(
287
- f"Encountered an Exception while attempting to add synonyms.\n{e}"
288
- )
289
-
290
- if not self._state.adding:
291
- self.save()
292
-
293
-
294
- def add_synonym(
295
- self,
296
- synonym: str | ListLike,
297
- force: bool = False,
298
- save: bool | None = None,
299
- ):
300
- _check_synonyms_field_exist(self)
301
- _add_or_remove_synonyms(
302
- synonym=synonym, record=self, force=force, action="add", save=save
303
- )
304
-
305
-
306
- def remove_synonym(self, synonym: str | ListLike):
307
- _check_synonyms_field_exist(self)
308
- _add_or_remove_synonyms(synonym=synonym, record=self, action="remove")
309
-
310
-
311
- def _standardize(
312
- cls,
313
- values: ListLike,
314
- field: str | StrField | None = None,
315
- *,
316
- return_field: str = None,
317
- return_mapper: bool = False,
318
- case_sensitive: bool = False,
319
- mute: bool = False,
320
- public_aware: bool = True,
321
- keep: Literal["first", "last", False] = "first",
322
- synonyms_field: str = "synonyms",
323
- using_key: str | None = None,
324
- organism: str | Record | None = None,
325
- source: Record | None = None,
326
- ) -> list[str] | dict[str, str]:
327
- """{}""" # noqa: D415
328
- from lamin_utils._standardize import standardize as map_synonyms
329
-
330
- return_str = True if isinstance(values, str) else False
331
- if isinstance(values, str):
332
- values = [values]
333
- values = _concat_lists(values)
334
-
335
- field = get_name_field(cls, field=field)
336
- return_field = get_name_field(
337
- cls, field=field if return_field is None else return_field
338
- )
339
- queryset = _queryset(cls, using_key)
340
- using_key = queryset.db
341
- if isinstance(source, Record):
342
- _check_source_db(source, using_key)
343
- queryset = queryset.filter(source=source).all()
344
- _check_organism_db(organism, using_key)
345
- registry = queryset.model
346
-
347
- if _has_organism_field(registry):
348
- # here, we can safely import bionty
349
- from bionty._bionty import create_or_get_organism_record
350
-
351
- organism_record = create_or_get_organism_record(
352
- organism=organism, registry=registry
353
- )
354
- organism = (
355
- organism_record.name if organism_record is not None else organism_record
356
- )
357
-
358
- try:
359
- registry._meta.get_field(synonyms_field)
360
- df = _filter_query_based_on_organism(
361
- queryset=queryset, field=field, organism=organism
362
- )
363
- except FieldDoesNotExist:
364
- df = pd.DataFrame()
365
-
366
- _kwargs = {
367
- "field": field,
368
- "return_field": return_field,
369
- "case_sensitive": case_sensitive,
370
- "keep": keep,
371
- "synonyms_field": synonyms_field,
372
- }
373
- # standardized names from the DB
374
- std_names_db = map_synonyms(
375
- df=df,
376
- identifiers=values,
377
- return_mapper=return_mapper,
378
- mute=mute,
379
- **_kwargs,
380
- )
381
-
382
- def _return(result: list, mapper: dict):
383
- if return_mapper:
384
- return mapper
385
- else:
386
- if return_str and len(result) == 1:
387
- return result[0]
388
- return result
389
-
390
- # map synonyms in Bionty
391
- if registry.__get_schema_name__() == "bionty" and public_aware:
392
- mapper = {}
393
- if return_mapper:
394
- mapper = std_names_db
395
- std_names_db = map_synonyms(
396
- df=df, identifiers=values, return_mapper=False, mute=True, **_kwargs
397
- )
398
-
399
- val_res = registry.validate(
400
- std_names_db, field=field, mute=True, organism=organism
401
- )
402
- if all(val_res):
403
- return _return(result=std_names_db, mapper=mapper)
404
-
405
- nonval = np.array(std_names_db)[~val_res]
406
- std_names_bt_mapper = registry.public(organism=organism).standardize(
407
- nonval, return_mapper=True, mute=True, **_kwargs
408
- )
409
-
410
- if len(std_names_bt_mapper) > 0 and not mute:
411
- s = "" if len(std_names_bt_mapper) == 1 else "s"
412
- field_print = "synonym" if field == return_field else field
413
- warn_msg = (
414
- f"found {len(std_names_bt_mapper)} {field_print}{s} in Bionty:"
415
- f" {list(std_names_bt_mapper.keys())}"
416
- )
417
- warn_msg += (
418
- f"\n please add corresponding {registry._meta.model.__name__} records via"
419
- f" `.from_values({list(set(std_names_bt_mapper.values()))})`"
420
- )
421
- logger.warning(warn_msg)
422
-
423
- mapper.update(std_names_bt_mapper)
424
- if pd.api.types.is_categorical_dtype(std_names_db):
425
- result = std_names_db.cat.rename_categories(std_names_bt_mapper).tolist()
426
- else:
427
- result = pd.Series(std_names_db).replace(std_names_bt_mapper).tolist()
428
- return _return(result=result, mapper=mapper)
429
-
430
- else:
431
- return _return(result=std_names_db, mapper=std_names_db)
432
-
433
-
434
- def _add_or_remove_synonyms(
435
- synonym: str | ListLike,
436
- record: Record,
437
- action: Literal["add", "remove"],
438
- force: bool = False,
439
- save: bool | None = None,
440
- ):
441
- """Add or remove synonyms."""
442
-
443
- def check_synonyms_in_all_records(synonyms: set[str], record: Record):
444
- """Errors if input synonym is associated with other records in the DB."""
445
- import pandas as pd
446
- from IPython.display import display
447
-
448
- syns_all = (
449
- record.__class__.objects.exclude(synonyms="").exclude(synonyms=None).all()
450
- )
451
- if len(syns_all) == 0:
452
- return
453
- df = pd.DataFrame(syns_all.values())
454
- df["synonyms"] = df["synonyms"].str.split("|")
455
- df = df.explode("synonyms")
456
- matches_df = df[(df["synonyms"].isin(synonyms)) & (df["id"] != record.id)]
457
- if matches_df.shape[0] > 0:
458
- records_df = pd.DataFrame(syns_all.filter(id__in=matches_df["id"]).values())
459
- logger.error(
460
- f"input synonyms {matches_df['synonyms'].unique()} already associated"
461
- " with the following records:\n"
462
- )
463
- display(records_df)
464
- raise ValueError(
465
- "cannot assigned a synonym that is already associated with a record to a different record.\n"
466
- "Consider removing the synonym from existing records or using a different synonym."
467
- )
468
-
469
- # passed synonyms
470
- # nothing happens when passing an empty string or list
471
- if isinstance(synonym, str):
472
- if len(synonym) == 0:
473
- return
474
- syn_new_set = {synonym}
475
- else:
476
- if synonym == [""]:
477
- return
478
- syn_new_set = set(synonym)
479
- # nothing happens when passing an empty string or list
480
- if len(syn_new_set) == 0:
481
- return
482
- # because we use | as the separator
483
- if any("|" in i for i in syn_new_set):
484
- raise ValueError("a synonym can't contain '|'!")
485
-
486
- # existing synonyms
487
- syns_exist = record.synonyms
488
- if syns_exist is None or len(syns_exist) == 0:
489
- syns_exist_set = set()
490
- else:
491
- syns_exist_set = set(syns_exist.split("|"))
492
-
493
- if action == "add":
494
- if not force:
495
- check_synonyms_in_all_records(syn_new_set, record)
496
- syns_exist_set.update(syn_new_set)
497
- elif action == "remove":
498
- syns_exist_set = syns_exist_set.difference(syn_new_set)
499
-
500
- if len(syns_exist_set) == 0:
501
- syns_str = None
502
- else:
503
- syns_str = "|".join(syns_exist_set)
504
-
505
- record.synonyms = syns_str
506
-
507
- if save is None:
508
- # if record is already in DB, save the changes to DB
509
- save = not record._state.adding
510
- if save:
511
- record.save()
512
-
513
-
514
- def _check_synonyms_field_exist(record: Record):
515
- try:
516
- record.__getattribute__("synonyms")
517
- except AttributeError:
518
- raise NotImplementedError(
519
- f"No synonyms field found in table {record.__class__.__name__}!"
520
- ) from None
521
-
522
-
523
- def _filter_query_based_on_organism(
524
- queryset: QuerySet,
525
- field: str,
526
- organism: str | Record | None = None,
527
- values_list_field: str | None = None,
528
- ):
529
- """Filter a queryset based on organism."""
530
- import pandas as pd
531
-
532
- registry = queryset.model
533
-
534
- if _has_organism_field(registry) and not _field_is_id(field, registry):
535
- # here, we can safely import bionty
536
- from bionty._bionty import create_or_get_organism_record
537
-
538
- organism_record = create_or_get_organism_record(
539
- organism=organism, registry=registry
540
- )
541
- if organism_record is not None:
542
- queryset = queryset.filter(organism__name=organism_record.name)
543
-
544
- if values_list_field is None:
545
- return pd.DataFrame.from_records(queryset.values())
546
- else:
547
- return queryset.values_list(values_list_field, flat=True)
548
-
549
-
550
- def _field_is_id(field: str, registry: type[Record]) -> bool:
551
- """Check if the field is an ontology ID."""
552
- if hasattr(registry, "_ontology_id_field"):
553
- if field == registry._ontology_id_field:
554
- return True
555
- if field.endswith("id"):
556
- return True
557
- return False
558
-
559
-
560
- METHOD_NAMES = [
561
- "validate",
562
- "inspect",
563
- "standardize",
564
- "add_synonym",
565
- "remove_synonym",
566
- "set_abbr",
567
- ]
568
-
569
- if ln_setup._TESTING: # type: ignore
570
- from inspect import signature
571
-
572
- SIGS = {
573
- name: signature(getattr(CanValidate, name))
574
- for name in METHOD_NAMES
575
- if not name.startswith("__")
576
- }
577
-
578
- for name in METHOD_NAMES:
579
- attach_func_to_class_method(name, CanValidate, globals())
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Literal
4
+
5
+ import lamindb_setup as ln_setup
6
+ import numpy as np
7
+ import pandas as pd
8
+ from django.core.exceptions import FieldDoesNotExist
9
+ from lamin_utils import colors, logger
10
+ from lamindb_setup.core._docs import doc_args
11
+ from lnschema_core import CanValidate, Record
12
+
13
+ from lamindb._utils import attach_func_to_class_method
14
+
15
+ from ._from_values import _has_organism_field, _print_values, get_or_create_records
16
+ from ._record import _queryset, get_name_field
17
+
18
+ if TYPE_CHECKING:
19
+ from django.db.models import QuerySet
20
+ from lamin_utils._inspect import InspectResult
21
+ from lnschema_core.types import ListLike, StrField
22
+
23
+
24
+ # from_values doesn't apply for QuerySet or Manager
25
+ @classmethod # type:ignore
26
+ @doc_args(CanValidate.from_values.__doc__)
27
+ def from_values(
28
+ cls,
29
+ values: ListLike,
30
+ field: StrField | None = None,
31
+ create: bool = False,
32
+ organism: Record | str | None = None,
33
+ source: Record | None = None,
34
+ mute: bool = False,
35
+ ) -> list[Record]:
36
+ """{}""" # noqa: D415
37
+ from_source = True if cls.__module__.startswith("bionty.") else False
38
+
39
+ field_str = get_name_field(cls, field=field)
40
+ return get_or_create_records(
41
+ iterable=values,
42
+ field=getattr(cls, field_str),
43
+ create=create,
44
+ from_source=from_source,
45
+ organism=organism,
46
+ source=source,
47
+ mute=mute,
48
+ )
49
+
50
+
51
+ @classmethod # type: ignore
52
+ @doc_args(CanValidate.inspect.__doc__)
53
+ def inspect(
54
+ cls,
55
+ values: ListLike,
56
+ field: str | StrField | None = None,
57
+ *,
58
+ mute: bool = False,
59
+ organism: str | Record | None = None,
60
+ source: Record | None = None,
61
+ ) -> InspectResult:
62
+ """{}""" # noqa: D415
63
+ return _inspect(
64
+ cls=cls,
65
+ values=values,
66
+ field=field,
67
+ mute=mute,
68
+ organism=organism,
69
+ source=source,
70
+ )
71
+
72
+
73
+ @classmethod # type: ignore
74
+ @doc_args(CanValidate.validate.__doc__)
75
+ def validate(
76
+ cls,
77
+ values: ListLike,
78
+ field: str | StrField | None = None,
79
+ *,
80
+ mute: bool = False,
81
+ organism: str | Record | None = None,
82
+ source: Record | None = None,
83
+ ) -> np.ndarray:
84
+ """{}""" # noqa: D415
85
+ return _validate(
86
+ cls=cls, values=values, field=field, mute=mute, organism=organism, source=source
87
+ )
88
+
89
+
90
+ def _check_source_db(source: Record, using_key: str | None):
91
+ """Check if the source is from the DB."""
92
+ if using_key is not None and using_key != "default":
93
+ if source._state.db != using_key:
94
+ raise ValueError(
95
+ f"source must be a bionty.Source record from instance '{using_key}'!"
96
+ )
97
+
98
+
99
+ def _check_organism_db(organism: Record, using_key: str | None):
100
+ """Check if the organism is from the DB."""
101
+ if isinstance(organism, Record):
102
+ if using_key is not None and using_key != "default":
103
+ if organism._state.db != using_key:
104
+ raise ValueError(
105
+ f"organism must be a bionty.Organism record from instance '{using_key}'!"
106
+ )
107
+
108
+
109
+ def _concat_lists(values: ListLike) -> list[str]:
110
+ """Concatenate a list of lists of strings into a single list."""
111
+ if len(values) > 0 and isinstance(values, (list, pd.Series)):
112
+ try:
113
+ if isinstance(values[0], list):
114
+ if isinstance(values, pd.Series):
115
+ values = values.tolist()
116
+ values = sum([v for v in values if isinstance(v, list)], [])
117
+ except KeyError:
118
+ pass
119
+ return values
120
+
121
+
122
+ def _inspect(
123
+ cls,
124
+ values: ListLike,
125
+ field: str | StrField | None = None,
126
+ *,
127
+ mute: bool = False,
128
+ using_key: str | None = None,
129
+ organism: str | Record | None = None,
130
+ source: Record | None = None,
131
+ ) -> pd.DataFrame | dict[str, list[str]]:
132
+ """{}""" # noqa: D415
133
+ from lamin_utils._inspect import inspect
134
+
135
+ if isinstance(values, str):
136
+ values = [values]
137
+ values = _concat_lists(values)
138
+
139
+ field = get_name_field(cls, field=field)
140
+ queryset = _queryset(cls, using_key)
141
+ using_key = queryset.db
142
+ if isinstance(source, Record):
143
+ _check_source_db(source, using_key)
144
+ queryset = queryset.filter(source=source).all()
145
+ _check_organism_db(organism, using_key)
146
+ registry = queryset.model
147
+ model_name = registry._meta.model.__name__
148
+
149
+ # inspect in the DB
150
+ result_db = inspect(
151
+ df=_filter_query_based_on_organism(
152
+ queryset=queryset, field=field, organism=organism
153
+ ),
154
+ identifiers=values,
155
+ field=field,
156
+ mute=mute,
157
+ )
158
+ nonval = set(result_db.non_validated).difference(result_db.synonyms_mapper.keys())
159
+
160
+ if len(nonval) > 0 and registry.__get_schema_name__() == "bionty":
161
+ try:
162
+ bionty_result = registry.public(organism=organism, source=source).inspect(
163
+ values=nonval, field=field, mute=True
164
+ )
165
+ bionty_validated = bionty_result.validated
166
+ bionty_mapper = bionty_result.synonyms_mapper
167
+ hint = False
168
+ if len(bionty_validated) > 0 and not mute:
169
+ print_values = _print_values(bionty_validated)
170
+ s = "" if len(bionty_validated) == 1 else "s"
171
+ labels = colors.yellow(f"{len(bionty_validated)} {model_name} term{s}")
172
+ logger.print(
173
+ f" detected {labels} in Bionty for"
174
+ f" {colors.italic(field)}: {colors.yellow(print_values)}"
175
+ )
176
+ hint = True
177
+
178
+ if len(bionty_mapper) > 0 and not mute:
179
+ print_values = _print_values(list(bionty_mapper.keys()))
180
+ s = "" if len(bionty_mapper) == 1 else "s"
181
+ labels = colors.yellow(f"{len(bionty_mapper)} {model_name} term{s}")
182
+ logger.print(
183
+ f" detected {labels} in Bionty as {colors.italic(f'synonym{s}')}:"
184
+ f" {colors.yellow(print_values)}"
185
+ )
186
+ hint = True
187
+
188
+ if hint:
189
+ logger.print(
190
+ f"→ add records from Bionty to your {model_name} registry via"
191
+ f" {colors.italic('.from_values()')}"
192
+ )
193
+
194
+ nonval = bionty_result.non_validated
195
+ # no bionty source is found
196
+ except ValueError:
197
+ logger.warning("no Bionty source found, skipping Bionty validation")
198
+
199
+ if len(nonval) > 0 and not mute:
200
+ print_values = _print_values(list(nonval))
201
+ s = "" if len(nonval) == 1 else "s"
202
+ labels = colors.red(f"{len(nonval)} term{s}")
203
+ logger.print(f" couldn't validate {labels}: {colors.red(print_values)}")
204
+ logger.print(
205
+ f"→ if you are sure, create new record{s} via"
206
+ f" {colors.italic(f'{registry.__name__}()')} and save to your registry"
207
+ )
208
+
209
+ return result_db
210
+
211
+
212
+ def _validate(
213
+ cls,
214
+ values: ListLike,
215
+ field: str | StrField | None = None,
216
+ *,
217
+ mute: bool = False,
218
+ using_key: str | None = None,
219
+ organism: str | Record | None = None,
220
+ source: Record | None = None,
221
+ ) -> np.ndarray:
222
+ """{}""" # noqa: D415
223
+ from lamin_utils._inspect import validate
224
+
225
+ return_str = True if isinstance(values, str) else False
226
+ if isinstance(values, str):
227
+ values = [values]
228
+ values = _concat_lists(values)
229
+
230
+ field = get_name_field(cls, field=field)
231
+
232
+ queryset = _queryset(cls, using_key)
233
+ using_key = queryset.db
234
+ if isinstance(source, Record):
235
+ _check_source_db(source, using_key)
236
+ queryset = queryset.filter(source=source).all()
237
+ _check_organism_db(organism, using_key)
238
+ field_values = pd.Series(
239
+ _filter_query_based_on_organism(
240
+ queryset=queryset,
241
+ field=field,
242
+ organism=organism,
243
+ values_list_field=field,
244
+ ),
245
+ dtype="object",
246
+ )
247
+ if field_values.empty:
248
+ if not mute:
249
+ msg = (
250
+ f"Your {cls.__name__} registry is empty, consider populating it first!"
251
+ )
252
+ if hasattr(cls, "source_id"):
253
+ msg += "\n → use `.import_from_source()` to import records from a source, e.g. a public ontology"
254
+ logger.warning(msg)
255
+ return np.array([False] * len(values))
256
+
257
+ result = validate(
258
+ identifiers=values,
259
+ field_values=field_values,
260
+ case_sensitive=True,
261
+ mute=mute,
262
+ field=field,
263
+ )
264
+ if return_str and len(result) == 1:
265
+ return result[0]
266
+ else:
267
+ return result
268
+
269
+
270
+ @classmethod # type: ignore
271
+ @doc_args(CanValidate.standardize.__doc__)
272
+ def standardize(
273
+ cls,
274
+ values: ListLike,
275
+ field: str | StrField | None = None,
276
+ *,
277
+ return_field: str = None,
278
+ return_mapper: bool = False,
279
+ case_sensitive: bool = False,
280
+ mute: bool = False,
281
+ public_aware: bool = True,
282
+ keep: Literal["first", "last", False] = "first",
283
+ synonyms_field: str = "synonyms",
284
+ organism: str | Record | None = None,
285
+ source: Record | None = None,
286
+ ) -> list[str] | dict[str, str]:
287
+ """{}""" # noqa: D415
288
+ return _standardize(
289
+ cls=cls,
290
+ values=values,
291
+ field=field,
292
+ return_field=return_field,
293
+ return_mapper=return_mapper,
294
+ case_sensitive=case_sensitive,
295
+ mute=mute,
296
+ public_aware=public_aware,
297
+ keep=keep,
298
+ synonyms_field=synonyms_field,
299
+ organism=organism,
300
+ source=source,
301
+ )
302
+
303
+
304
+ def set_abbr(self, value: str):
305
+ self.abbr = value
306
+
307
+ if hasattr(self, "name") and value == self.name:
308
+ pass
309
+ else:
310
+ try:
311
+ self.add_synonym(value, save=False)
312
+ except Exception as e: # pragma: no cover
313
+ logger.debug(
314
+ f"Encountered an Exception while attempting to add synonyms.\n{e}"
315
+ )
316
+
317
+ if not self._state.adding:
318
+ self.save()
319
+
320
+
321
+ def add_synonym(
322
+ self,
323
+ synonym: str | ListLike,
324
+ force: bool = False,
325
+ save: bool | None = None,
326
+ ):
327
+ _check_synonyms_field_exist(self)
328
+ _add_or_remove_synonyms(
329
+ synonym=synonym, record=self, force=force, action="add", save=save
330
+ )
331
+
332
+
333
+ def remove_synonym(self, synonym: str | ListLike):
334
+ _check_synonyms_field_exist(self)
335
+ _add_or_remove_synonyms(synonym=synonym, record=self, action="remove")
336
+
337
+
338
def _standardize(
    cls,
    values: ListLike,
    field: str | StrField | None = None,
    *,
    return_field: str | None = None,
    return_mapper: bool = False,
    case_sensitive: bool = False,
    mute: bool = False,
    public_aware: bool = True,
    keep: Literal["first", "last", False] = "first",
    synonyms_field: str = "synonyms",
    using_key: str | None = None,
    organism: str | Record | None = None,
    source: Record | None = None,
) -> list[str] | dict[str, str]:
    """Map input values to standardized values via synonyms.

    Values are first mapped against the records of the registry. For
    registries of the "bionty" schema, and if `public_aware` is set, values
    that still fail validation are additionally mapped against the public
    source of the registry.

    Args:
        cls: Registry or queryset to standardize against.
        values: A single string or a list-like of values.
        field: Field that the values are matched on.
        return_field: Field to return; falls back to `field` if None.
        return_mapper: If True, return the mapping instead of the values.
        case_sensitive: Forwarded to the synonym mapper.
        mute: Suppress log output.
        public_aware: Also consult the public source for bionty registries.
        keep: Forwarded to the synonym mapper.
        synonyms_field: Name of the field that stores synonyms.
        using_key: Database to query.
        organism: Organism name or record.
        source: If a record, restrict the query to this source.

    Returns:
        The mapping if `return_mapper` is True; otherwise the standardized
        values, or a single value if a single string was passed and exactly
        one result was produced.
    """
    from lamin_utils._standardize import standardize as map_synonyms

    # remember whether a bare string was passed so that we can return one
    return_str = isinstance(values, str)
    if return_str:
        values = [values]
    values = _concat_lists(values)

    field = get_name_field(cls, field=field)
    return_field = get_name_field(
        cls, field=field if return_field is None else return_field
    )
    queryset = _queryset(cls, using_key)
    using_key = queryset.db
    if isinstance(source, Record):
        _check_source_db(source, using_key)
        queryset = queryset.filter(source=source).all()
    _check_organism_db(organism, using_key)
    registry = queryset.model

    if _has_organism_field(registry):
        # here, we can safely import bionty
        from bionty._bionty import create_or_get_organism_record

        organism_record = create_or_get_organism_record(
            organism=organism, registry=registry, field=field
        )
        organism = (
            organism_record.name if organism_record is not None else organism_record
        )

    # only perform synonym mapping if field is the name field
    if hasattr(registry, "_name_field") and field != registry._name_field:
        synonyms_field = None

    try:
        # raises FieldDoesNotExist if the registry has no such field
        # NOTE(review): when synonyms_field was reset to None above, this lookup
        # presumably also raises and leaves df empty - confirm this is intended
        registry._meta.get_field(synonyms_field)
        fields = {i for i in [field, return_field, synonyms_field] if i is not None}
        df = _filter_query_based_on_organism(
            queryset=queryset,
            field=field,
            organism=organism,
            fields=list(fields),
        )
    except FieldDoesNotExist:
        df = pd.DataFrame()

    _kwargs = {
        "field": field,
        "return_field": return_field,
        "case_sensitive": case_sensitive,
        "keep": keep,
        "synonyms_field": synonyms_field,
    }
    # standardized names from the DB
    std_names_db = map_synonyms(
        df=df,
        identifiers=values,
        return_mapper=return_mapper,
        mute=mute,
        **_kwargs,
    )

    def _return(result: list, mapper: dict):
        if return_mapper:
            return mapper
        if return_str and len(result) == 1:
            return result[0]
        return result

    # without the public source there is nothing more to map
    if not (registry.__get_schema_name__() == "bionty" and public_aware):
        return _return(result=std_names_db, mapper=std_names_db)

    # map synonyms in Bionty
    mapper = {}
    if return_mapper:
        # the first call returned the mapper; we additionally need the values
        mapper = std_names_db
        std_names_db = map_synonyms(
            df=df, identifiers=values, return_mapper=False, mute=True, **_kwargs
        )

    val_res = registry.validate(
        std_names_db, field=field, mute=True, organism=organism
    )
    if all(val_res):
        return _return(result=std_names_db, mapper=mapper)

    nonval = np.array(std_names_db)[~val_res]
    std_names_bt_mapper = registry.public(organism=organism).standardize(
        nonval, return_mapper=True, mute=True, **_kwargs
    )

    if len(std_names_bt_mapper) > 0 and not mute:
        s = "" if len(std_names_bt_mapper) == 1 else "s"
        field_print = "synonym" if field == return_field else field
        warn_msg = (
            f"found {len(std_names_bt_mapper)} {field_print}{s} in Bionty:"
            f" {list(std_names_bt_mapper.keys())}"
        )
        warn_msg += (
            f"\n   please add corresponding {registry._meta.model.__name__} records via"
            f" `.from_values({list(set(std_names_bt_mapper.values()))})`"
        )
        logger.warning(warn_msg)

    mapper.update(std_names_bt_mapper)
    # `pd.api.types.is_categorical_dtype` is deprecated in recent pandas;
    # check the dtype directly (plain lists have no dtype and take the else branch)
    if isinstance(getattr(std_names_db, "dtype", None), pd.CategoricalDtype):
        result = std_names_db.cat.rename_categories(std_names_bt_mapper).tolist()
    else:
        result = pd.Series(std_names_db).replace(std_names_bt_mapper).tolist()
    return _return(result=result, mapper=mapper)
467
+
468
+
469
def _add_or_remove_synonyms(
    synonym: str | ListLike,
    record: Record,
    action: Literal["add", "remove"],
    force: bool = False,
    save: bool | None = None,
):
    """Add or remove synonyms on a record.

    Synonyms are stored on ``record.synonyms`` as a single ``|``-separated
    string (or None if there are none). Existing synonyms keep their order;
    newly added ones are appended in input order.

    Args:
        synonym: A single synonym or a list-like of synonyms. Empty strings
            are ignored; if nothing remains, the call is a no-op.
        record: The record to modify.
        action: Either "add" or "remove".
        force: If True, skip the check that a synonym to be added is not
            already associated with a different record.
        save: Whether to save the record. If None, the record is saved only
            if it is not in the "adding" state.

    Raises:
        ValueError: If a synonym contains ``|``, or if (without `force`) a
            synonym to add is already associated with another record.
    """

    def check_synonyms_in_all_records(synonyms: set[str], record: Record):
        """Errors if input synonym is associated with other records in the DB."""
        import pandas as pd
        from IPython.display import display

        syns_all = (
            record.__class__.objects.exclude(synonyms="").exclude(synonyms=None).all()
        )
        if len(syns_all) == 0:
            return
        df = pd.DataFrame(syns_all.values())
        df["synonyms"] = df["synonyms"].str.split("|")
        df = df.explode("synonyms")
        matches_df = df[(df["synonyms"].isin(synonyms)) & (df["id"] != record.id)]
        if matches_df.shape[0] > 0:
            records_df = pd.DataFrame(syns_all.filter(id__in=matches_df["id"]).values())
            logger.error(
                f"input synonyms {matches_df['synonyms'].unique()} already associated"
                " with the following records:\n"
            )
            display(records_df)
            raise ValueError(
                "cannot assign a synonym that is already associated with a record to a different record.\n"
                "Consider removing the synonym from existing records or using a different synonym."
            )

    # passed synonyms: normalize to an ordered list of unique, non-empty values;
    # iterating (instead of comparing with `== [""]`) also works for numpy
    # arrays and pandas Series, where `==` is element-wise
    candidates = [synonym] if isinstance(synonym, str) else list(synonym)
    new_synonyms = [s for s in dict.fromkeys(candidates) if s != ""]
    # nothing happens when passing an empty string or list
    if not new_synonyms:
        return
    # because we use | as the separator
    if any("|" in s for s in new_synonyms):
        raise ValueError("a synonym can't contain '|'!")

    # existing synonyms, de-duplicated while keeping their stored order
    stored = record.synonyms
    current = list(dict.fromkeys(stored.split("|"))) if stored else []

    if action == "add":
        if not force:
            check_synonyms_in_all_records(set(new_synonyms), record)
        already_present = set(current)
        current.extend(s for s in new_synonyms if s not in already_present)
    elif action == "remove":
        to_remove = set(new_synonyms)
        current = [s for s in current if s not in to_remove]

    record.synonyms = "|".join(current) if current else None

    if save is None:
        # if record is already in DB, save the changes to DB
        save = not record._state.adding
    if save:
        record.save()
547
+
548
+
549
def _check_synonyms_field_exist(record: Record):
    """Ensure that the record has a ``synonyms`` attribute.

    Raises:
        NotImplementedError: If looking up ``synonyms`` on the record raises
            an AttributeError.
    """
    try:
        record.__getattribute__("synonyms")
    except AttributeError:
        table_name = record.__class__.__name__
        # hide the AttributeError: the missing field is the actual problem
        raise NotImplementedError(
            f"No synonyms field found in table {table_name}!"
        ) from None
556
+
557
+
558
def _filter_query_based_on_organism(
    queryset: QuerySet,
    field: str,
    organism: str | Record | None = None,
    values_list_field: str | None = None,
    fields: list[str] | None = None,
):
    """Filter a queryset based on organism.

    Args:
        queryset: The queryset to filter.
        field: The field that is being queried; no organism filter is applied
            if it is an ID field.
        organism: Organism name or record used to resolve the organism record.
        values_list_field: If given, return a flat values list of this field
            instead of a DataFrame.
        fields: If given (and `values_list_field` is None), restrict the
            DataFrame to these columns.

    Returns:
        A flat values list if `values_list_field` is given, otherwise a
        DataFrame of the (possibly filtered) queryset.
    """
    import pandas as pd

    model = queryset.model

    if _has_organism_field(model) and not _field_is_id(field, model):
        # here, we can safely import bionty
        from bionty._bionty import create_or_get_organism_record

        resolved = create_or_get_organism_record(
            organism=organism, registry=model, field=field
        )
        if resolved is not None:
            queryset = queryset.filter(organism__name=resolved.name)

    if values_list_field is not None:
        return queryset.values_list(values_list_field, flat=True)
    if fields:
        rows = queryset.values_list(*fields)
        return pd.DataFrame.from_records(rows, columns=fields)
    return pd.DataFrame.from_records(queryset.values())
589
+
590
+
591
def _field_is_id(field: str, registry: type[Record]) -> bool:
    """Check if the field is an ontology ID.

    Returns True if `field` equals the registry's ``_ontology_id_field``
    (when the registry defines one) or if the field name ends with "id".
    """
    matches_ontology_id = (
        hasattr(registry, "_ontology_id_field")
        and field == registry._ontology_id_field
    )
    if matches_ontology_id:
        return True
    return field.endswith("id")
599
+
600
+
601
# names of the CanValidate methods that are attached in the loop at the bottom
METHOD_NAMES = [
    "validate",
    "inspect",
    "standardize",
    "add_synonym",
    "remove_synonym",
    "set_abbr",
    "from_values",
]

if ln_setup._TESTING:  # type: ignore
    from inspect import signature

    # signatures as declared on CanValidate, captured before the attach loop runs
    SIGS = {
        method_name: signature(getattr(CanValidate, method_name))
        for method_name in METHOD_NAMES
        if not method_name.startswith("__")
    }

# functions are looked up by name in this module's globals
for name in METHOD_NAMES:
    attach_func_to_class_method(name, CanValidate, globals())