lamindb 0.76.6__py3-none-any.whl → 0.76.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. lamindb/__init__.py +113 -113
  2. lamindb/_artifact.py +1205 -1174
  3. lamindb/_can_validate.py +579 -579
  4. lamindb/_collection.py +387 -382
  5. lamindb/_curate.py +1601 -1601
  6. lamindb/_feature.py +155 -155
  7. lamindb/_feature_set.py +242 -242
  8. lamindb/_filter.py +23 -23
  9. lamindb/_finish.py +256 -256
  10. lamindb/_from_values.py +382 -382
  11. lamindb/_is_versioned.py +40 -40
  12. lamindb/_parents.py +476 -476
  13. lamindb/_query_manager.py +125 -125
  14. lamindb/_query_set.py +362 -362
  15. lamindb/_record.py +649 -649
  16. lamindb/_run.py +57 -57
  17. lamindb/_save.py +308 -295
  18. lamindb/_storage.py +14 -14
  19. lamindb/_transform.py +127 -127
  20. lamindb/_ulabel.py +56 -56
  21. lamindb/_utils.py +9 -9
  22. lamindb/_view.py +72 -72
  23. lamindb/core/__init__.py +94 -93
  24. lamindb/core/_context.py +574 -558
  25. lamindb/core/_data.py +438 -438
  26. lamindb/core/_feature_manager.py +867 -866
  27. lamindb/core/_label_manager.py +253 -252
  28. lamindb/core/_mapped_collection.py +597 -597
  29. lamindb/core/_settings.py +187 -187
  30. lamindb/core/_sync_git.py +138 -138
  31. lamindb/core/_track_environment.py +27 -27
  32. lamindb/core/datasets/__init__.py +59 -59
  33. lamindb/core/datasets/_core.py +571 -571
  34. lamindb/core/datasets/_fake.py +36 -36
  35. lamindb/core/exceptions.py +90 -77
  36. lamindb/core/fields.py +12 -12
  37. lamindb/core/loaders.py +164 -0
  38. lamindb/core/schema.py +56 -56
  39. lamindb/core/storage/__init__.py +25 -25
  40. lamindb/core/storage/_anndata_accessor.py +740 -740
  41. lamindb/core/storage/_anndata_sizes.py +41 -41
  42. lamindb/core/storage/_backed_access.py +98 -98
  43. lamindb/core/storage/_tiledbsoma.py +204 -196
  44. lamindb/core/storage/_valid_suffixes.py +21 -21
  45. lamindb/core/storage/_zarr.py +110 -110
  46. lamindb/core/storage/objects.py +62 -62
  47. lamindb/core/storage/paths.py +172 -245
  48. lamindb/core/subsettings/__init__.py +12 -12
  49. lamindb/core/subsettings/_creation_settings.py +38 -38
  50. lamindb/core/subsettings/_transform_settings.py +21 -21
  51. lamindb/core/types.py +19 -19
  52. lamindb/core/versioning.py +158 -158
  53. lamindb/integrations/__init__.py +12 -12
  54. lamindb/integrations/_vitessce.py +107 -107
  55. lamindb/setup/__init__.py +14 -14
  56. lamindb/setup/core/__init__.py +4 -4
  57. {lamindb-0.76.6.dist-info → lamindb-0.76.8.dist-info}/LICENSE +201 -201
  58. {lamindb-0.76.6.dist-info → lamindb-0.76.8.dist-info}/METADATA +5 -5
  59. lamindb-0.76.8.dist-info/RECORD +60 -0
  60. {lamindb-0.76.6.dist-info → lamindb-0.76.8.dist-info}/WHEEL +1 -1
  61. lamindb-0.76.6.dist-info/RECORD +0 -59
lamindb/_can_validate.py CHANGED
@@ -1,579 +1,579 @@
1
- from __future__ import annotations
2
-
3
- from typing import TYPE_CHECKING, Literal
4
-
5
- import lamindb_setup as ln_setup
6
- import numpy as np
7
- import pandas as pd
8
- from django.core.exceptions import FieldDoesNotExist
9
- from lamin_utils import colors, logger
10
- from lamindb_setup.core._docs import doc_args
11
- from lnschema_core import CanValidate, Record
12
-
13
- from lamindb._utils import attach_func_to_class_method
14
-
15
- from ._from_values import _has_organism_field, _print_values
16
- from ._record import _queryset, get_name_field
17
-
18
- if TYPE_CHECKING:
19
- from django.db.models import QuerySet
20
- from lamin_utils._inspect import InspectResult
21
- from lnschema_core.types import ListLike, StrField
22
-
23
-
24
- @classmethod # type: ignore
25
- @doc_args(CanValidate.inspect.__doc__)
26
- def inspect(
27
- cls,
28
- values: ListLike,
29
- field: str | StrField | None = None,
30
- *,
31
- mute: bool = False,
32
- organism: str | Record | None = None,
33
- source: Record | None = None,
34
- ) -> InspectResult:
35
- """{}""" # noqa: D415
36
- return _inspect(
37
- cls=cls,
38
- values=values,
39
- field=field,
40
- mute=mute,
41
- organism=organism,
42
- source=source,
43
- )
44
-
45
-
46
- @classmethod # type: ignore
47
- @doc_args(CanValidate.validate.__doc__)
48
- def validate(
49
- cls,
50
- values: ListLike,
51
- field: str | StrField | None = None,
52
- *,
53
- mute: bool = False,
54
- organism: str | Record | None = None,
55
- source: Record | None = None,
56
- ) -> np.ndarray:
57
- """{}""" # noqa: D415
58
- return _validate(
59
- cls=cls, values=values, field=field, mute=mute, organism=organism, source=source
60
- )
61
-
62
-
63
- def _check_source_db(source: Record, using_key: str | None):
64
- """Check if the source is from the DB."""
65
- if using_key is not None and using_key != "default":
66
- if source._state.db != using_key:
67
- raise ValueError(
68
- f"source must be a bionty.Source record from instance '{using_key}'!"
69
- )
70
-
71
-
72
- def _check_organism_db(organism: Record, using_key: str | None):
73
- """Check if the organism is from the DB."""
74
- if isinstance(organism, Record):
75
- if using_key is not None and using_key != "default":
76
- if organism._state.db != using_key:
77
- raise ValueError(
78
- f"organism must be a bionty.Organism record from instance '{using_key}'!"
79
- )
80
-
81
-
82
- def _concat_lists(values: ListLike) -> list[str]:
83
- """Concatenate a list of lists of strings into a single list."""
84
- if len(values) > 0 and isinstance(values, (list, pd.Series)):
85
- try:
86
- if isinstance(values[0], list):
87
- if isinstance(values, pd.Series):
88
- values = values.tolist()
89
- values = sum([v for v in values if isinstance(v, list)], [])
90
- except KeyError:
91
- pass
92
- return values
93
-
94
-
95
- def _inspect(
96
- cls,
97
- values: ListLike,
98
- field: str | StrField | None = None,
99
- *,
100
- mute: bool = False,
101
- using_key: str | None = None,
102
- organism: str | Record | None = None,
103
- source: Record | None = None,
104
- ) -> pd.DataFrame | dict[str, list[str]]:
105
- """{}""" # noqa: D415
106
- from lamin_utils._inspect import inspect
107
-
108
- if isinstance(values, str):
109
- values = [values]
110
- values = _concat_lists(values)
111
-
112
- field = get_name_field(cls, field=field)
113
- queryset = _queryset(cls, using_key)
114
- using_key = queryset.db
115
- if isinstance(source, Record):
116
- _check_source_db(source, using_key)
117
- queryset = queryset.filter(source=source).all()
118
- _check_organism_db(organism, using_key)
119
- registry = queryset.model
120
- model_name = registry._meta.model.__name__
121
-
122
- # inspect in the DB
123
- result_db = inspect(
124
- df=_filter_query_based_on_organism(
125
- queryset=queryset, field=field, organism=organism
126
- ),
127
- identifiers=values,
128
- field=field,
129
- mute=mute,
130
- )
131
- nonval = set(result_db.non_validated).difference(result_db.synonyms_mapper.keys())
132
-
133
- if len(nonval) > 0 and registry.__get_schema_name__() == "bionty":
134
- try:
135
- bionty_result = registry.public(organism=organism, source=source).inspect(
136
- values=nonval, field=field, mute=True
137
- )
138
- bionty_validated = bionty_result.validated
139
- bionty_mapper = bionty_result.synonyms_mapper
140
- hint = False
141
- if len(bionty_validated) > 0 and not mute:
142
- print_values = _print_values(bionty_validated)
143
- s = "" if len(bionty_validated) == 1 else "s"
144
- labels = colors.yellow(f"{len(bionty_validated)} {model_name} term{s}")
145
- logger.print(
146
- f" detected {labels} in Bionty for"
147
- f" {colors.italic(field)}: {colors.yellow(print_values)}"
148
- )
149
- hint = True
150
-
151
- if len(bionty_mapper) > 0 and not mute:
152
- print_values = _print_values(list(bionty_mapper.keys()))
153
- s = "" if len(bionty_mapper) == 1 else "s"
154
- labels = colors.yellow(f"{len(bionty_mapper)} {model_name} term{s}")
155
- logger.print(
156
- f" detected {labels} in Bionty as {colors.italic(f'synonym{s}')}:"
157
- f" {colors.yellow(print_values)}"
158
- )
159
- hint = True
160
-
161
- if hint:
162
- logger.print(
163
- f"→ add records from Bionty to your {model_name} registry via"
164
- f" {colors.italic('.from_values()')}"
165
- )
166
-
167
- nonval = bionty_result.non_validated
168
- # no bionty source is found
169
- except ValueError:
170
- logger.warning("no Bionty source found, skipping Bionty validation")
171
-
172
- if len(nonval) > 0 and not mute:
173
- print_values = _print_values(list(nonval))
174
- s = "" if len(nonval) == 1 else "s"
175
- labels = colors.red(f"{len(nonval)} term{s}")
176
- logger.print(f" couldn't validate {labels}: {colors.red(print_values)}")
177
- logger.print(
178
- f"→ if you are sure, create new record{s} via"
179
- f" {colors.italic(f'{registry.__name__}()')} and save to your registry"
180
- )
181
-
182
- return result_db
183
-
184
-
185
- def _validate(
186
- cls,
187
- values: ListLike,
188
- field: str | StrField | None = None,
189
- *,
190
- mute: bool = False,
191
- using_key: str | None = None,
192
- organism: str | Record | None = None,
193
- source: Record | None = None,
194
- ) -> np.ndarray:
195
- """{}""" # noqa: D415
196
- from lamin_utils._inspect import validate
197
-
198
- return_str = True if isinstance(values, str) else False
199
- if isinstance(values, str):
200
- values = [values]
201
- values = _concat_lists(values)
202
-
203
- field = get_name_field(cls, field=field)
204
-
205
- queryset = _queryset(cls, using_key)
206
- using_key = queryset.db
207
- if isinstance(source, Record):
208
- _check_source_db(source, using_key)
209
- queryset = queryset.filter(source=source).all()
210
- _check_organism_db(organism, using_key)
211
- field_values = pd.Series(
212
- _filter_query_based_on_organism(
213
- queryset=queryset,
214
- field=field,
215
- organism=organism,
216
- values_list_field=field,
217
- ),
218
- dtype="object",
219
- )
220
- if field_values.empty:
221
- if not mute:
222
- msg = (
223
- f"Your {cls.__name__} registry is empty, consider populating it first!"
224
- )
225
- if hasattr(cls, "source_id"):
226
- msg += "\n → use `.import_from_source()` to import records from a source, e.g. a public ontology"
227
- logger.warning(msg)
228
- return np.array([False] * len(values))
229
-
230
- result = validate(
231
- identifiers=values,
232
- field_values=field_values,
233
- case_sensitive=True,
234
- mute=mute,
235
- field=field,
236
- )
237
- if return_str and len(result) == 1:
238
- return result[0]
239
- else:
240
- return result
241
-
242
-
243
- @classmethod # type: ignore
244
- @doc_args(CanValidate.standardize.__doc__)
245
- def standardize(
246
- cls,
247
- values: ListLike,
248
- field: str | StrField | None = None,
249
- *,
250
- return_field: str = None,
251
- return_mapper: bool = False,
252
- case_sensitive: bool = False,
253
- mute: bool = False,
254
- public_aware: bool = True,
255
- keep: Literal["first", "last", False] = "first",
256
- synonyms_field: str = "synonyms",
257
- organism: str | Record | None = None,
258
- source: Record | None = None,
259
- ) -> list[str] | dict[str, str]:
260
- """{}""" # noqa: D415
261
- return _standardize(
262
- cls=cls,
263
- values=values,
264
- field=field,
265
- return_field=return_field,
266
- return_mapper=return_mapper,
267
- case_sensitive=case_sensitive,
268
- mute=mute,
269
- public_aware=public_aware,
270
- keep=keep,
271
- synonyms_field=synonyms_field,
272
- organism=organism,
273
- source=source,
274
- )
275
-
276
-
277
- def set_abbr(self, value: str):
278
- self.abbr = value
279
-
280
- if hasattr(self, "name") and value == self.name:
281
- pass
282
- else:
283
- try:
284
- self.add_synonym(value, save=False)
285
- except Exception as e: # pragma: no cover
286
- logger.debug(
287
- f"Encountered an Exception while attempting to add synonyms.\n{e}"
288
- )
289
-
290
- if not self._state.adding:
291
- self.save()
292
-
293
-
294
- def add_synonym(
295
- self,
296
- synonym: str | ListLike,
297
- force: bool = False,
298
- save: bool | None = None,
299
- ):
300
- _check_synonyms_field_exist(self)
301
- _add_or_remove_synonyms(
302
- synonym=synonym, record=self, force=force, action="add", save=save
303
- )
304
-
305
-
306
- def remove_synonym(self, synonym: str | ListLike):
307
- _check_synonyms_field_exist(self)
308
- _add_or_remove_synonyms(synonym=synonym, record=self, action="remove")
309
-
310
-
311
- def _standardize(
312
- cls,
313
- values: ListLike,
314
- field: str | StrField | None = None,
315
- *,
316
- return_field: str = None,
317
- return_mapper: bool = False,
318
- case_sensitive: bool = False,
319
- mute: bool = False,
320
- public_aware: bool = True,
321
- keep: Literal["first", "last", False] = "first",
322
- synonyms_field: str = "synonyms",
323
- using_key: str | None = None,
324
- organism: str | Record | None = None,
325
- source: Record | None = None,
326
- ) -> list[str] | dict[str, str]:
327
- """{}""" # noqa: D415
328
- from lamin_utils._standardize import standardize as map_synonyms
329
-
330
- return_str = True if isinstance(values, str) else False
331
- if isinstance(values, str):
332
- values = [values]
333
- values = _concat_lists(values)
334
-
335
- field = get_name_field(cls, field=field)
336
- return_field = get_name_field(
337
- cls, field=field if return_field is None else return_field
338
- )
339
- queryset = _queryset(cls, using_key)
340
- using_key = queryset.db
341
- if isinstance(source, Record):
342
- _check_source_db(source, using_key)
343
- queryset = queryset.filter(source=source).all()
344
- _check_organism_db(organism, using_key)
345
- registry = queryset.model
346
-
347
- if _has_organism_field(registry):
348
- # here, we can safely import bionty
349
- from bionty._bionty import create_or_get_organism_record
350
-
351
- organism_record = create_or_get_organism_record(
352
- organism=organism, registry=registry
353
- )
354
- organism = (
355
- organism_record.name if organism_record is not None else organism_record
356
- )
357
-
358
- try:
359
- registry._meta.get_field(synonyms_field)
360
- df = _filter_query_based_on_organism(
361
- queryset=queryset, field=field, organism=organism
362
- )
363
- except FieldDoesNotExist:
364
- df = pd.DataFrame()
365
-
366
- _kwargs = {
367
- "field": field,
368
- "return_field": return_field,
369
- "case_sensitive": case_sensitive,
370
- "keep": keep,
371
- "synonyms_field": synonyms_field,
372
- }
373
- # standardized names from the DB
374
- std_names_db = map_synonyms(
375
- df=df,
376
- identifiers=values,
377
- return_mapper=return_mapper,
378
- mute=mute,
379
- **_kwargs,
380
- )
381
-
382
- def _return(result: list, mapper: dict):
383
- if return_mapper:
384
- return mapper
385
- else:
386
- if return_str and len(result) == 1:
387
- return result[0]
388
- return result
389
-
390
- # map synonyms in Bionty
391
- if registry.__get_schema_name__() == "bionty" and public_aware:
392
- mapper = {}
393
- if return_mapper:
394
- mapper = std_names_db
395
- std_names_db = map_synonyms(
396
- df=df, identifiers=values, return_mapper=False, mute=True, **_kwargs
397
- )
398
-
399
- val_res = registry.validate(
400
- std_names_db, field=field, mute=True, organism=organism
401
- )
402
- if all(val_res):
403
- return _return(result=std_names_db, mapper=mapper)
404
-
405
- nonval = np.array(std_names_db)[~val_res]
406
- std_names_bt_mapper = registry.public(organism=organism).standardize(
407
- nonval, return_mapper=True, mute=True, **_kwargs
408
- )
409
-
410
- if len(std_names_bt_mapper) > 0 and not mute:
411
- s = "" if len(std_names_bt_mapper) == 1 else "s"
412
- field_print = "synonym" if field == return_field else field
413
- warn_msg = (
414
- f"found {len(std_names_bt_mapper)} {field_print}{s} in Bionty:"
415
- f" {list(std_names_bt_mapper.keys())}"
416
- )
417
- warn_msg += (
418
- f"\n please add corresponding {registry._meta.model.__name__} records via"
419
- f" `.from_values({list(set(std_names_bt_mapper.values()))})`"
420
- )
421
- logger.warning(warn_msg)
422
-
423
- mapper.update(std_names_bt_mapper)
424
- if pd.api.types.is_categorical_dtype(std_names_db):
425
- result = std_names_db.cat.rename_categories(std_names_bt_mapper).tolist()
426
- else:
427
- result = pd.Series(std_names_db).replace(std_names_bt_mapper).tolist()
428
- return _return(result=result, mapper=mapper)
429
-
430
- else:
431
- return _return(result=std_names_db, mapper=std_names_db)
432
-
433
-
434
- def _add_or_remove_synonyms(
435
- synonym: str | ListLike,
436
- record: Record,
437
- action: Literal["add", "remove"],
438
- force: bool = False,
439
- save: bool | None = None,
440
- ):
441
- """Add or remove synonyms."""
442
-
443
- def check_synonyms_in_all_records(synonyms: set[str], record: Record):
444
- """Errors if input synonym is associated with other records in the DB."""
445
- import pandas as pd
446
- from IPython.display import display
447
-
448
- syns_all = (
449
- record.__class__.objects.exclude(synonyms="").exclude(synonyms=None).all()
450
- )
451
- if len(syns_all) == 0:
452
- return
453
- df = pd.DataFrame(syns_all.values())
454
- df["synonyms"] = df["synonyms"].str.split("|")
455
- df = df.explode("synonyms")
456
- matches_df = df[(df["synonyms"].isin(synonyms)) & (df["id"] != record.id)]
457
- if matches_df.shape[0] > 0:
458
- records_df = pd.DataFrame(syns_all.filter(id__in=matches_df["id"]).values())
459
- logger.error(
460
- f"input synonyms {matches_df['synonyms'].unique()} already associated"
461
- " with the following records:\n"
462
- )
463
- display(records_df)
464
- raise ValueError(
465
- "cannot assigned a synonym that is already associated with a record to a different record.\n"
466
- "Consider removing the synonym from existing records or using a different synonym."
467
- )
468
-
469
- # passed synonyms
470
- # nothing happens when passing an empty string or list
471
- if isinstance(synonym, str):
472
- if len(synonym) == 0:
473
- return
474
- syn_new_set = {synonym}
475
- else:
476
- if synonym == [""]:
477
- return
478
- syn_new_set = set(synonym)
479
- # nothing happens when passing an empty string or list
480
- if len(syn_new_set) == 0:
481
- return
482
- # because we use | as the separator
483
- if any("|" in i for i in syn_new_set):
484
- raise ValueError("a synonym can't contain '|'!")
485
-
486
- # existing synonyms
487
- syns_exist = record.synonyms
488
- if syns_exist is None or len(syns_exist) == 0:
489
- syns_exist_set = set()
490
- else:
491
- syns_exist_set = set(syns_exist.split("|"))
492
-
493
- if action == "add":
494
- if not force:
495
- check_synonyms_in_all_records(syn_new_set, record)
496
- syns_exist_set.update(syn_new_set)
497
- elif action == "remove":
498
- syns_exist_set = syns_exist_set.difference(syn_new_set)
499
-
500
- if len(syns_exist_set) == 0:
501
- syns_str = None
502
- else:
503
- syns_str = "|".join(syns_exist_set)
504
-
505
- record.synonyms = syns_str
506
-
507
- if save is None:
508
- # if record is already in DB, save the changes to DB
509
- save = not record._state.adding
510
- if save:
511
- record.save()
512
-
513
-
514
- def _check_synonyms_field_exist(record: Record):
515
- try:
516
- record.__getattribute__("synonyms")
517
- except AttributeError:
518
- raise NotImplementedError(
519
- f"No synonyms field found in table {record.__class__.__name__}!"
520
- ) from None
521
-
522
-
523
- def _filter_query_based_on_organism(
524
- queryset: QuerySet,
525
- field: str,
526
- organism: str | Record | None = None,
527
- values_list_field: str | None = None,
528
- ):
529
- """Filter a queryset based on organism."""
530
- import pandas as pd
531
-
532
- registry = queryset.model
533
-
534
- if _has_organism_field(registry) and not _field_is_id(field, registry):
535
- # here, we can safely import bionty
536
- from bionty._bionty import create_or_get_organism_record
537
-
538
- organism_record = create_or_get_organism_record(
539
- organism=organism, registry=registry
540
- )
541
- if organism_record is not None:
542
- queryset = queryset.filter(organism__name=organism_record.name)
543
-
544
- if values_list_field is None:
545
- return pd.DataFrame.from_records(queryset.values())
546
- else:
547
- return queryset.values_list(values_list_field, flat=True)
548
-
549
-
550
- def _field_is_id(field: str, registry: type[Record]) -> bool:
551
- """Check if the field is an ontology ID."""
552
- if hasattr(registry, "_ontology_id_field"):
553
- if field == registry._ontology_id_field:
554
- return True
555
- if field.endswith("id"):
556
- return True
557
- return False
558
-
559
-
560
- METHOD_NAMES = [
561
- "validate",
562
- "inspect",
563
- "standardize",
564
- "add_synonym",
565
- "remove_synonym",
566
- "set_abbr",
567
- ]
568
-
569
- if ln_setup._TESTING: # type: ignore
570
- from inspect import signature
571
-
572
- SIGS = {
573
- name: signature(getattr(CanValidate, name))
574
- for name in METHOD_NAMES
575
- if not name.startswith("__")
576
- }
577
-
578
- for name in METHOD_NAMES:
579
- attach_func_to_class_method(name, CanValidate, globals())
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Literal
4
+
5
+ import lamindb_setup as ln_setup
6
+ import numpy as np
7
+ import pandas as pd
8
+ from django.core.exceptions import FieldDoesNotExist
9
+ from lamin_utils import colors, logger
10
+ from lamindb_setup.core._docs import doc_args
11
+ from lnschema_core import CanValidate, Record
12
+
13
+ from lamindb._utils import attach_func_to_class_method
14
+
15
+ from ._from_values import _has_organism_field, _print_values
16
+ from ._record import _queryset, get_name_field
17
+
18
+ if TYPE_CHECKING:
19
+ from django.db.models import QuerySet
20
+ from lamin_utils._inspect import InspectResult
21
+ from lnschema_core.types import ListLike, StrField
22
+
23
+
24
+ @classmethod # type: ignore
25
+ @doc_args(CanValidate.inspect.__doc__)
26
+ def inspect(
27
+ cls,
28
+ values: ListLike,
29
+ field: str | StrField | None = None,
30
+ *,
31
+ mute: bool = False,
32
+ organism: str | Record | None = None,
33
+ source: Record | None = None,
34
+ ) -> InspectResult:
35
+ """{}""" # noqa: D415
36
+ return _inspect(
37
+ cls=cls,
38
+ values=values,
39
+ field=field,
40
+ mute=mute,
41
+ organism=organism,
42
+ source=source,
43
+ )
44
+
45
+
46
+ @classmethod # type: ignore
47
+ @doc_args(CanValidate.validate.__doc__)
48
+ def validate(
49
+ cls,
50
+ values: ListLike,
51
+ field: str | StrField | None = None,
52
+ *,
53
+ mute: bool = False,
54
+ organism: str | Record | None = None,
55
+ source: Record | None = None,
56
+ ) -> np.ndarray:
57
+ """{}""" # noqa: D415
58
+ return _validate(
59
+ cls=cls, values=values, field=field, mute=mute, organism=organism, source=source
60
+ )
61
+
62
+
63
+ def _check_source_db(source: Record, using_key: str | None):
64
+ """Check if the source is from the DB."""
65
+ if using_key is not None and using_key != "default":
66
+ if source._state.db != using_key:
67
+ raise ValueError(
68
+ f"source must be a bionty.Source record from instance '{using_key}'!"
69
+ )
70
+
71
+
72
+ def _check_organism_db(organism: Record, using_key: str | None):
73
+ """Check if the organism is from the DB."""
74
+ if isinstance(organism, Record):
75
+ if using_key is not None and using_key != "default":
76
+ if organism._state.db != using_key:
77
+ raise ValueError(
78
+ f"organism must be a bionty.Organism record from instance '{using_key}'!"
79
+ )
80
+
81
+
82
+ def _concat_lists(values: ListLike) -> list[str]:
83
+ """Concatenate a list of lists of strings into a single list."""
84
+ if len(values) > 0 and isinstance(values, (list, pd.Series)):
85
+ try:
86
+ if isinstance(values[0], list):
87
+ if isinstance(values, pd.Series):
88
+ values = values.tolist()
89
+ values = sum([v for v in values if isinstance(v, list)], [])
90
+ except KeyError:
91
+ pass
92
+ return values
93
+
94
+
95
+ def _inspect(
96
+ cls,
97
+ values: ListLike,
98
+ field: str | StrField | None = None,
99
+ *,
100
+ mute: bool = False,
101
+ using_key: str | None = None,
102
+ organism: str | Record | None = None,
103
+ source: Record | None = None,
104
+ ) -> pd.DataFrame | dict[str, list[str]]:
105
+ """{}""" # noqa: D415
106
+ from lamin_utils._inspect import inspect
107
+
108
+ if isinstance(values, str):
109
+ values = [values]
110
+ values = _concat_lists(values)
111
+
112
+ field = get_name_field(cls, field=field)
113
+ queryset = _queryset(cls, using_key)
114
+ using_key = queryset.db
115
+ if isinstance(source, Record):
116
+ _check_source_db(source, using_key)
117
+ queryset = queryset.filter(source=source).all()
118
+ _check_organism_db(organism, using_key)
119
+ registry = queryset.model
120
+ model_name = registry._meta.model.__name__
121
+
122
+ # inspect in the DB
123
+ result_db = inspect(
124
+ df=_filter_query_based_on_organism(
125
+ queryset=queryset, field=field, organism=organism
126
+ ),
127
+ identifiers=values,
128
+ field=field,
129
+ mute=mute,
130
+ )
131
+ nonval = set(result_db.non_validated).difference(result_db.synonyms_mapper.keys())
132
+
133
+ if len(nonval) > 0 and registry.__get_schema_name__() == "bionty":
134
+ try:
135
+ bionty_result = registry.public(organism=organism, source=source).inspect(
136
+ values=nonval, field=field, mute=True
137
+ )
138
+ bionty_validated = bionty_result.validated
139
+ bionty_mapper = bionty_result.synonyms_mapper
140
+ hint = False
141
+ if len(bionty_validated) > 0 and not mute:
142
+ print_values = _print_values(bionty_validated)
143
+ s = "" if len(bionty_validated) == 1 else "s"
144
+ labels = colors.yellow(f"{len(bionty_validated)} {model_name} term{s}")
145
+ logger.print(
146
+ f" detected {labels} in Bionty for"
147
+ f" {colors.italic(field)}: {colors.yellow(print_values)}"
148
+ )
149
+ hint = True
150
+
151
+ if len(bionty_mapper) > 0 and not mute:
152
+ print_values = _print_values(list(bionty_mapper.keys()))
153
+ s = "" if len(bionty_mapper) == 1 else "s"
154
+ labels = colors.yellow(f"{len(bionty_mapper)} {model_name} term{s}")
155
+ logger.print(
156
+ f" detected {labels} in Bionty as {colors.italic(f'synonym{s}')}:"
157
+ f" {colors.yellow(print_values)}"
158
+ )
159
+ hint = True
160
+
161
+ if hint:
162
+ logger.print(
163
+ f"→ add records from Bionty to your {model_name} registry via"
164
+ f" {colors.italic('.from_values()')}"
165
+ )
166
+
167
+ nonval = bionty_result.non_validated
168
+ # no bionty source is found
169
+ except ValueError:
170
+ logger.warning("no Bionty source found, skipping Bionty validation")
171
+
172
+ if len(nonval) > 0 and not mute:
173
+ print_values = _print_values(list(nonval))
174
+ s = "" if len(nonval) == 1 else "s"
175
+ labels = colors.red(f"{len(nonval)} term{s}")
176
+ logger.print(f" couldn't validate {labels}: {colors.red(print_values)}")
177
+ logger.print(
178
+ f"→ if you are sure, create new record{s} via"
179
+ f" {colors.italic(f'{registry.__name__}()')} and save to your registry"
180
+ )
181
+
182
+ return result_db
183
+
184
+
185
+ def _validate(
186
+ cls,
187
+ values: ListLike,
188
+ field: str | StrField | None = None,
189
+ *,
190
+ mute: bool = False,
191
+ using_key: str | None = None,
192
+ organism: str | Record | None = None,
193
+ source: Record | None = None,
194
+ ) -> np.ndarray:
195
+ """{}""" # noqa: D415
196
+ from lamin_utils._inspect import validate
197
+
198
+ return_str = True if isinstance(values, str) else False
199
+ if isinstance(values, str):
200
+ values = [values]
201
+ values = _concat_lists(values)
202
+
203
+ field = get_name_field(cls, field=field)
204
+
205
+ queryset = _queryset(cls, using_key)
206
+ using_key = queryset.db
207
+ if isinstance(source, Record):
208
+ _check_source_db(source, using_key)
209
+ queryset = queryset.filter(source=source).all()
210
+ _check_organism_db(organism, using_key)
211
+ field_values = pd.Series(
212
+ _filter_query_based_on_organism(
213
+ queryset=queryset,
214
+ field=field,
215
+ organism=organism,
216
+ values_list_field=field,
217
+ ),
218
+ dtype="object",
219
+ )
220
+ if field_values.empty:
221
+ if not mute:
222
+ msg = (
223
+ f"Your {cls.__name__} registry is empty, consider populating it first!"
224
+ )
225
+ if hasattr(cls, "source_id"):
226
+ msg += "\n → use `.import_from_source()` to import records from a source, e.g. a public ontology"
227
+ logger.warning(msg)
228
+ return np.array([False] * len(values))
229
+
230
+ result = validate(
231
+ identifiers=values,
232
+ field_values=field_values,
233
+ case_sensitive=True,
234
+ mute=mute,
235
+ field=field,
236
+ )
237
+ if return_str and len(result) == 1:
238
+ return result[0]
239
+ else:
240
+ return result
241
+
242
+
243
+ @classmethod # type: ignore
244
+ @doc_args(CanValidate.standardize.__doc__)
245
+ def standardize(
246
+ cls,
247
+ values: ListLike,
248
+ field: str | StrField | None = None,
249
+ *,
250
+ return_field: str = None,
251
+ return_mapper: bool = False,
252
+ case_sensitive: bool = False,
253
+ mute: bool = False,
254
+ public_aware: bool = True,
255
+ keep: Literal["first", "last", False] = "first",
256
+ synonyms_field: str = "synonyms",
257
+ organism: str | Record | None = None,
258
+ source: Record | None = None,
259
+ ) -> list[str] | dict[str, str]:
260
+ """{}""" # noqa: D415
261
+ return _standardize(
262
+ cls=cls,
263
+ values=values,
264
+ field=field,
265
+ return_field=return_field,
266
+ return_mapper=return_mapper,
267
+ case_sensitive=case_sensitive,
268
+ mute=mute,
269
+ public_aware=public_aware,
270
+ keep=keep,
271
+ synonyms_field=synonyms_field,
272
+ organism=organism,
273
+ source=source,
274
+ )
275
+
276
+
277
+ def set_abbr(self, value: str):
278
+ self.abbr = value
279
+
280
+ if hasattr(self, "name") and value == self.name:
281
+ pass
282
+ else:
283
+ try:
284
+ self.add_synonym(value, save=False)
285
+ except Exception as e: # pragma: no cover
286
+ logger.debug(
287
+ f"Encountered an Exception while attempting to add synonyms.\n{e}"
288
+ )
289
+
290
+ if not self._state.adding:
291
+ self.save()
292
+
293
+
294
+ def add_synonym(
295
+ self,
296
+ synonym: str | ListLike,
297
+ force: bool = False,
298
+ save: bool | None = None,
299
+ ):
300
+ _check_synonyms_field_exist(self)
301
+ _add_or_remove_synonyms(
302
+ synonym=synonym, record=self, force=force, action="add", save=save
303
+ )
304
+
305
+
306
+ def remove_synonym(self, synonym: str | ListLike):
307
+ _check_synonyms_field_exist(self)
308
+ _add_or_remove_synonyms(synonym=synonym, record=self, action="remove")
309
+
310
+
311
def _standardize(
    cls,
    values: ListLike,
    field: str | StrField | None = None,
    *,
    return_field: str | None = None,
    return_mapper: bool = False,
    case_sensitive: bool = False,
    mute: bool = False,
    public_aware: bool = True,
    keep: Literal["first", "last", False] = "first",
    synonyms_field: str = "synonyms",
    using_key: str | None = None,
    organism: str | Record | None = None,
    source: Record | None = None,
) -> list[str] | dict[str, str]:
    """{}"""  # noqa: D415
    # NOTE: the docstring above is a `{}` placeholder and is kept verbatim;
    # the documentation for this implementation lives in these comments.
    #
    # Purpose: map `values` onto standardized `return_field` values, first
    # using the records of the registry behind `cls`, and, for registries of
    # the "bionty" schema when `public_aware` is true, additionally using the
    # public source for values that do not validate against the registry.
    #
    # Returns:
    #   * the mapper when `return_mapper` is true,
    #   * a single value when `values` was a single string and the result
    #     has exactly one element,
    #   * otherwise the list of standardized values.
    from lamin_utils._standardize import standardize as map_synonyms

    # remember whether the caller passed a bare string so that we can hand
    # back a bare string as well
    return_str = isinstance(values, str)
    if return_str:
        values = [values]
    values = _concat_lists(values)

    field = get_name_field(cls, field=field)
    return_field = get_name_field(
        cls, field=field if return_field is None else return_field
    )
    queryset = _queryset(cls, using_key)
    using_key = queryset.db
    if isinstance(source, Record):
        _check_source_db(source, using_key)
        queryset = queryset.filter(source=source).all()
    _check_organism_db(organism, using_key)
    registry = queryset.model

    if _has_organism_field(registry):
        # here, we can safely import bionty
        from bionty._bionty import create_or_get_organism_record

        organism_record = create_or_get_organism_record(
            organism=organism, registry=registry
        )
        # from here on `organism` is the organism name (or None)
        organism = (
            organism_record.name if organism_record is not None else organism_record
        )

    try:
        # raises FieldDoesNotExist when the registry has no synonyms field, in
        # which case there is nothing in the DB to map against
        registry._meta.get_field(synonyms_field)
        df = _filter_query_based_on_organism(
            queryset=queryset, field=field, organism=organism
        )
    except FieldDoesNotExist:
        df = pd.DataFrame()

    _kwargs = {
        "field": field,
        "return_field": return_field,
        "case_sensitive": case_sensitive,
        "keep": keep,
        "synonyms_field": synonyms_field,
    }
    # standardized names from the DB
    std_names_db = map_synonyms(
        df=df,
        identifiers=values,
        return_mapper=return_mapper,
        mute=mute,
        **_kwargs,
    )

    def _return(result: list, mapper: dict):
        # single exit point that applies the return-shape rules described above
        if return_mapper:
            return mapper
        else:
            if return_str and len(result) == 1:
                return result[0]
            return result

    # map synonyms in Bionty
    if registry.__get_schema_name__() == "bionty" and public_aware:
        mapper = {}
        if return_mapper:
            # the first call returned the mapper; run again to obtain the
            # standardized values themselves, which are needed for validation
            mapper = std_names_db
            std_names_db = map_synonyms(
                df=df, identifiers=values, return_mapper=False, mute=True, **_kwargs
            )

        val_res = registry.validate(
            std_names_db, field=field, mute=True, organism=organism
        )
        if all(val_res):
            return _return(result=std_names_db, mapper=mapper)

        # values that are still not validated are looked up in the public source
        nonval = np.array(std_names_db)[~val_res]
        std_names_bt_mapper = registry.public(organism=organism).standardize(
            nonval, return_mapper=True, mute=True, **_kwargs
        )

        if len(std_names_bt_mapper) > 0 and not mute:
            s = "" if len(std_names_bt_mapper) == 1 else "s"
            field_print = "synonym" if field == return_field else field
            warn_msg = (
                f"found {len(std_names_bt_mapper)} {field_print}{s} in Bionty:"
                f" {list(std_names_bt_mapper.keys())}"
            )
            warn_msg += (
                f"\n please add corresponding {registry._meta.model.__name__} records via"
                f" `.from_values({list(set(std_names_bt_mapper.values()))})`"
            )
            logger.warning(warn_msg)

        mapper.update(std_names_bt_mapper)
        # `pd.api.types.is_categorical_dtype` is deprecated since pandas 2.1;
        # checking the dtype instance is the documented replacement
        if isinstance(getattr(std_names_db, "dtype", None), pd.CategoricalDtype):
            result = std_names_db.cat.rename_categories(std_names_bt_mapper).tolist()
        else:
            result = pd.Series(std_names_db).replace(std_names_bt_mapper).tolist()
        return _return(result=result, mapper=mapper)

    else:
        return _return(result=std_names_db, mapper=std_names_db)
432
+
433
+
434
def _add_or_remove_synonyms(
    synonym: str | ListLike,
    record: Record,
    action: Literal["add", "remove"],
    force: bool = False,
    save: bool | None = None,
):
    """Add or remove synonyms on `record`.

    Synonyms are stored on `record.synonyms` as a single `|`-separated string
    (or `None` when there are none). The stored order is deterministic:
    existing synonyms keep their order and new ones are appended in the order
    they were passed.

    Args:
        synonym: A single synonym or a list-like of synonyms. Empty strings
            are ignored; if nothing remains, the call is a no-op.
        record: The record whose `synonyms` attribute is updated.
        action: `"add"` to merge the synonyms in, `"remove"` to drop them.
        force: Only used for `"add"`; when true, skip the check that the
            synonyms are not already associated with another record.
        save: Whether to call `record.save()` afterwards. `None` saves only
            if `record._state.adding` is false.

    Raises:
        ValueError: If a synonym contains `|`, or if (without `force`) a
            synonym is already associated with a different record.
    """

    def check_synonyms_in_all_records(synonyms: set[str], record: Record):
        """Errors if input synonym is associated with other records in the DB."""
        import pandas as pd

        syns_all = (
            record.__class__.objects.exclude(synonyms="").exclude(synonyms=None).all()
        )
        if len(syns_all) == 0:
            return
        df = pd.DataFrame(syns_all.values())
        # one row per (record, synonym) pair
        df["synonyms"] = df["synonyms"].str.split("|")
        df = df.explode("synonyms")
        matches_df = df[(df["synonyms"].isin(synonyms)) & (df["id"] != record.id)]
        if matches_df.shape[0] > 0:
            # imported lazily: only needed to render the conflicting records
            from IPython.display import display

            records_df = pd.DataFrame(syns_all.filter(id__in=matches_df["id"]).values())
            logger.error(
                f"input synonyms {matches_df['synonyms'].unique()} already associated"
                " with the following records:\n"
            )
            display(records_df)
            raise ValueError(
                "cannot assigned a synonym that is already associated with a record to a different record.\n"
                "Consider removing the synonym from existing records or using a different synonym."
            )

    # passed synonyms: normalize to an ordered, de-duplicated list
    # (going through `list()` avoids element-wise `==` on arrays/Series)
    candidates = [synonym] if isinstance(synonym, str) else list(synonym)
    new_syns = [syn for syn in dict.fromkeys(candidates) if syn != ""]
    # nothing happens when passing an empty string or list
    if not new_syns:
        return
    # because we use | as the separator
    if any("|" in syn for syn in new_syns):
        raise ValueError("a synonym can't contain '|'!")

    # existing synonyms, de-duplicated but in their stored order
    stored = record.synonyms
    updated = list(dict.fromkeys(stored.split("|"))) if stored else []

    if action == "add":
        if not force:
            check_synonyms_in_all_records(set(new_syns), record)
        updated = list(dict.fromkeys([*updated, *new_syns]))
    elif action == "remove":
        to_remove = set(new_syns)
        updated = [syn for syn in updated if syn not in to_remove]

    record.synonyms = "|".join(updated) if updated else None

    if save is None:
        # if record is already in DB, save the changes to DB
        save = not record._state.adding
    if save:
        record.save()
512
+
513
+
514
def _check_synonyms_field_exist(record: Record):
    """Ensure that `record` exposes a `synonyms` attribute.

    Args:
        record: The record to check.

    Raises:
        NotImplementedError: If `record` has no `synonyms` attribute; the
            message names the record's class.
    """
    # `hasattr` goes through the regular attribute protocol instead of calling
    # the `__getattribute__` dunder directly
    if not hasattr(record, "synonyms"):
        raise NotImplementedError(
            f"No synonyms field found in table {record.__class__.__name__}!"
        )
521
+
522
+
523
def _filter_query_based_on_organism(
    queryset: QuerySet,
    field: str,
    organism: str | Record | None = None,
    values_list_field: str | None = None,
):
    """Restrict `queryset` to one organism and return its rows.

    The organism filter is applied only when the registry has an organism
    field and `field` is not an ID field (see `_field_is_id`), and only if an
    organism record could be resolved.

    Args:
        queryset: The queryset to filter.
        field: The field being queried; decides whether filtering applies.
        organism: Organism name or record handed to
            `create_or_get_organism_record`.
        values_list_field: If given, return a flat `values_list` of this
            field instead of a DataFrame.

    Returns:
        A `pandas.DataFrame` built from `queryset.values()` when
        `values_list_field` is `None`, otherwise the flat values list.
    """
    import pandas as pd

    model = queryset.model
    organism_applies = _has_organism_field(model) and not _field_is_id(field, model)

    if organism_applies:
        # here, we can safely import bionty
        from bionty._bionty import create_or_get_organism_record

        resolved = create_or_get_organism_record(organism=organism, registry=model)
        if resolved is not None:
            queryset = queryset.filter(organism__name=resolved.name)

    if values_list_field is not None:
        return queryset.values_list(values_list_field, flat=True)
    return pd.DataFrame.from_records(queryset.values())
548
+
549
+
550
def _field_is_id(field: str, registry: type[Record]) -> bool:
    """Check if the field is an ontology ID.

    A field counts as an ID when it equals the registry's
    `_ontology_id_field` (if the registry defines one) or when its name ends
    with `"id"`.
    """
    is_ontology_id = (
        hasattr(registry, "_ontology_id_field")
        and field == registry._ontology_id_field
    )
    return bool(is_ontology_id) or field.endswith("id")
558
+
559
+
560
# Names of the functions that are attached to `CanValidate` at import time
# by the loop at the bottom of this module.
METHOD_NAMES = [
    "validate",
    "inspect",
    "standardize",
    "add_synonym",
    "remove_synonym",
    "set_abbr",
]

if ln_setup._TESTING:  # type: ignore
    from inspect import signature

    # Record the signatures found on `CanValidate` *before* the attach loop
    # below runs. NOTE(review): presumably used by the test suite to compare
    # against the implementations in this module; confirm against the tests.
    SIGS = {
        name: signature(getattr(CanValidate, name))
        for name in METHOD_NAMES
        if not name.startswith("__")
    }

# NOTE(review): `attach_func_to_class_method` is defined elsewhere; it is
# handed this module's globals, presumably to look up the function called
# `name` and bind it onto `CanValidate`; verify against its definition.
for name in METHOD_NAMES:
    attach_func_to_class_method(name, CanValidate, globals())