lamindb 0.76.7__py3-none-any.whl → 0.76.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. lamindb/__init__.py +113 -113
  2. lamindb/_artifact.py +1205 -1178
  3. lamindb/_can_validate.py +579 -579
  4. lamindb/_collection.py +387 -387
  5. lamindb/_curate.py +1601 -1601
  6. lamindb/_feature.py +155 -155
  7. lamindb/_feature_set.py +242 -242
  8. lamindb/_filter.py +23 -23
  9. lamindb/_finish.py +256 -256
  10. lamindb/_from_values.py +382 -382
  11. lamindb/_is_versioned.py +40 -40
  12. lamindb/_parents.py +476 -476
  13. lamindb/_query_manager.py +125 -125
  14. lamindb/_query_set.py +362 -362
  15. lamindb/_record.py +649 -649
  16. lamindb/_run.py +57 -57
  17. lamindb/_save.py +308 -295
  18. lamindb/_storage.py +14 -14
  19. lamindb/_transform.py +127 -127
  20. lamindb/_ulabel.py +56 -56
  21. lamindb/_utils.py +9 -9
  22. lamindb/_view.py +72 -72
  23. lamindb/core/__init__.py +94 -94
  24. lamindb/core/_context.py +574 -574
  25. lamindb/core/_data.py +438 -438
  26. lamindb/core/_feature_manager.py +867 -867
  27. lamindb/core/_label_manager.py +253 -253
  28. lamindb/core/_mapped_collection.py +597 -597
  29. lamindb/core/_settings.py +187 -187
  30. lamindb/core/_sync_git.py +138 -138
  31. lamindb/core/_track_environment.py +27 -27
  32. lamindb/core/datasets/__init__.py +59 -59
  33. lamindb/core/datasets/_core.py +571 -571
  34. lamindb/core/datasets/_fake.py +36 -36
  35. lamindb/core/exceptions.py +90 -77
  36. lamindb/core/fields.py +12 -12
  37. lamindb/core/loaders.py +164 -164
  38. lamindb/core/schema.py +56 -56
  39. lamindb/core/storage/__init__.py +25 -25
  40. lamindb/core/storage/_anndata_accessor.py +740 -740
  41. lamindb/core/storage/_anndata_sizes.py +41 -41
  42. lamindb/core/storage/_backed_access.py +98 -98
  43. lamindb/core/storage/_tiledbsoma.py +204 -204
  44. lamindb/core/storage/_valid_suffixes.py +21 -21
  45. lamindb/core/storage/_zarr.py +110 -110
  46. lamindb/core/storage/objects.py +62 -62
  47. lamindb/core/storage/paths.py +172 -141
  48. lamindb/core/subsettings/__init__.py +12 -12
  49. lamindb/core/subsettings/_creation_settings.py +38 -38
  50. lamindb/core/subsettings/_transform_settings.py +21 -21
  51. lamindb/core/types.py +19 -19
  52. lamindb/core/versioning.py +158 -158
  53. lamindb/integrations/__init__.py +12 -12
  54. lamindb/integrations/_vitessce.py +107 -107
  55. lamindb/setup/__init__.py +14 -14
  56. lamindb/setup/core/__init__.py +4 -4
  57. {lamindb-0.76.7.dist-info → lamindb-0.76.8.dist-info}/LICENSE +201 -201
  58. {lamindb-0.76.7.dist-info → lamindb-0.76.8.dist-info}/METADATA +3 -3
  59. lamindb-0.76.8.dist-info/RECORD +60 -0
  60. {lamindb-0.76.7.dist-info → lamindb-0.76.8.dist-info}/WHEEL +1 -1
  61. lamindb-0.76.7.dist-info/RECORD +0 -60
lamindb/_from_values.py CHANGED
@@ -1,382 +1,382 @@
1
- from __future__ import annotations
2
-
3
- from typing import TYPE_CHECKING, Iterable
4
-
5
- import pandas as pd
6
- from django.core.exceptions import FieldDoesNotExist
7
- from lamin_utils import colors, logger
8
- from lnschema_core.models import Feature, Record, ULabel
9
-
10
- from .core._settings import settings
11
-
12
- if TYPE_CHECKING:
13
- from lnschema_core.types import ListLike, StrField
14
-
15
-
16
- # The base function for `from_values`
17
def get_or_create_records(
    iterable: ListLike,
    field: StrField,
    *,
    create: bool = False,
    from_source: bool = False,
    organism: Record | str | None = None,
    source: Record | None = None,
    mute: bool = False,
) -> list[Record]:
    """Get or create records from iterables.

    Args:
        iterable: Values to look up in, or create for, the registry of `field`.
        field: Registry field the values are matched against.
        create: If `True`, skip every lookup and instantiate one record per
            value.
        from_source: Whether missing values may be created from a source;
            re-evaluated below once a source record is known.
        organism: Organism record or name, resolved via `_get_organism_record`.
        source: Source record, forwarded to the lookups when not `None`.
        mute: If `True`, suppress success and warning logs.

    Returns:
        Existing records followed by the records created from the source.
    """
    registry = field.field.model
    if create:
        return [registry(**{field.field.name: value}) for value in iterable]
    creation_search_names = settings.creation.search_names
    organism = _get_organism_record(field, organism)
    kwargs: dict = {}
    if organism is not None:
        kwargs["organism"] = organism
    if source is not None:
        kwargs["source"] = source
    # `search_names` is disabled for the work below and restored in `finally`
    settings.creation.search_names = False
    try:
        iterable_idx = index_iterable(iterable)

        # returns existing records & non-existing values
        records, nonexist_values, msg = get_existing_records(
            iterable_idx=iterable_idx, field=field, mute=mute, **kwargs
        )

        # new records to be created based on new values
        if len(nonexist_values) > 0:
            source_record = None
            if from_source:
                if isinstance(source, Record):
                    source_record = source
                elif (
                    len(records) > 0
                    and hasattr(records[0], "source_id")
                    and records[0].source_id
                ):
                    # reuse the source of the records that already exist
                    source_record = records[0].source
            if not source_record and hasattr(registry, "public"):
                from bionty._bionty import get_source_record

                source_record = get_source_record(
                    registry.public(organism=organism), registry
                )
            if source_record:
                from bionty.core._add_ontology import check_source_in_db

                check_source_in_db(
                    registry=registry,
                    source=source_record,
                    update=True,
                )

                from_source = not source_record.in_db
            elif hasattr(registry, "source_id"):
                from_source = True
            else:
                from_source = False

            if from_source:
                records_bionty, unmapped_values = create_records_from_source(
                    iterable_idx=nonexist_values,
                    field=field,
                    msg=msg,
                    mute=mute,
                    **kwargs,
                )
                if len(records_bionty) > 0:
                    # `msg` is handed to `create_records_from_source` above;
                    # clear it so it is not logged again below
                    msg = ""
                    for record in records_bionty:
                        record._from_source = True
                records += records_bionty
            else:
                unmapped_values = nonexist_values
            # unmapped new_ids will NOT create records
            if len(unmapped_values) > 0:
                if len(msg) > 0 and not mute:
                    logger.success(msg)
                s = "" if len(unmapped_values) == 1 else "s"
                print_values = colors.yellow(_print_values(unmapped_values))
                name = registry.__name__
                n_nonval = colors.yellow(f"{len(unmapped_values)} non-validated")
                if not mute:
                    logger.warning(
                        f"{colors.red('did not create')} {name} record{s} for "
                        f"{n_nonval} {colors.italic(f'{field.field.name}{s}')}: {print_values}"
                    )
        if registry.__get_schema_name__() == "bionty" or registry == ULabel:
            # a named pandas Series provides the default feature of its records
            feature_name = iterable.name if isinstance(iterable, pd.Series) else None
            if isinstance(feature_name, str):
                for record in records:
                    record._feature = feature_name
                logger.debug(f"added default feature '{feature_name}'")
        return records
    finally:
        settings.creation.search_names = creation_search_names
123
-
124
-
125
def get_existing_records(
    iterable_idx: pd.Index,
    field: StrField,
    mute: bool = False,
    **kwargs,
):
    """Query the registry of `field` for records matching `iterable_idx`.

    Values are first inspected with the registry's `inspect`; values listed
    in its `synonyms_mapper` are renamed before the database is filtered.

    Args:
        iterable_idx: Values to look up.
        field: Registry field the values are matched against.
        mute: If `True`, emit no success logs.
        **kwargs: Extra filter conditions (`organism`, `source`); `source` is
            passed to `inspect` but left out of the database filter.

    Returns:
        A tuple `(records, nonexist_values, msg)`: the matching records, the
        (synonym-mapped) values without a matching record, and the success
        message for validated values if it has not been logged here (empty
        otherwise).
    """
    registry = field.field.model
    field_name = field.field.name
    # existing records matching is agnostic to the bionty source
    filters = {key: value for key, value in kwargs.items() if key != "source"}

    # standardize based on the DB reference
    inspection = registry.inspect(
        iterable_idx,
        field=field,
        organism=kwargs.get("organism"),
        source=kwargs.get("source"),
        mute=True,
    )
    synonyms = inspection.synonyms_mapper

    synonym_msg = ""
    if len(synonyms) > 0:
        plural = "" if len(synonyms) == 1 else "s"
        shown = colors.green(_print_values(list(synonyms.keys())))
        synonym_msg = (
            "loaded"
            f" {colors.green(f'{len(synonyms)} {registry.__name__} record{plural}')}"
            f" matching {colors.italic('synonyms')}: {shown}"
        )
        iterable_idx = iterable_idx.to_frame().rename(index=synonyms).index

    # kwargs carries the organism condition; add the value condition to it
    filters[f"{field_name}__in"] = iterable_idx.values
    matches = registry.filter(**filters)
    # The records are not re-ordered to follow the input: an
    # `order_by(Case(When(...)))` variant caused a factor 10 in runtime.
    records = matches.list()

    # message about validated terms
    validated = inspection.validated
    msg = ""
    if len(validated) > 0:
        plural = "" if len(validated) == 1 else "s"
        shown = colors.green(_print_values(validated))
        msg = (
            "loaded"
            f" {colors.green(f'{len(validated)} {registry.__name__} record{plural}')}"
            f" matching {colors.italic(f'{field_name}')}: {shown}"
        )

    # no logging if all values are validated
    # logs if there are synonyms
    if synonym_msg:
        if not mute:
            if msg:
                logger.success(msg)
            logger.success(synonym_msg)
        msg = ""

    found = iterable_idx.intersection(matches.values_list(field_name, flat=True))
    return records, iterable_idx.difference(found), msg
206
-
207
-
208
def create_records_from_source(
    iterable_idx: pd.Index,
    field: StrField,
    msg: str = "",
    mute: bool = False,
    **kwargs,
):
    """Create records for values that are found in the public ontology.

    Args:
        iterable_idx: Values for which no record exists yet.
        field: Registry field the values are matched against.
        msg: Pending success message about existing records; logged here
            right before the "created" message.
        mute: If `True`, emit no logs.
        **kwargs: `organism` and/or `source` passed on to the public ontology
            and to the record constructor.

    Returns:
        A tuple `(records, unmapped_values)`: the new records and the values
        not found in the reference. If the public ontology cannot be set up,
        no records are created and all values are returned as unmapped.
    """
    registry = field.field.model
    created: list = []
    # populate additional fields from bionty
    from bionty._bionty import get_source_record
    from bionty.core._bionty import filter_bionty_df_columns

    # create the corresponding bionty object from model
    try:
        field_name = field.field.name
        # TODO: more generic
        organism = kwargs.get("organism")
        if field_name == "ensembl_gene_id":
            # the prefix of the first id decides the organism
            if iterable_idx[0].startswith("ENSG"):
                organism = "human"
            elif iterable_idx[0].startswith("ENSMUSG"):
                organism = "mouse"
        public_ontology = registry.public(
            organism=organism, source=kwargs.get("source")
        )
    except Exception:
        # for custom records that are not created from public sources
        return created, iterable_idx

    # add source record to the kwargs
    kwargs["source"] = get_source_record(public_ontology, registry)

    # filter the columns in bionty df based on fields
    bionty_df = filter_bionty_df_columns(
        model=registry, public_ontology=public_ontology
    )

    # standardize in the bionty reference
    inspection = public_ontology.inspect(iterable_idx, field=field_name, mute=True)
    synonyms = inspection.synonyms_mapper

    msg_syn: str = ""
    if len(synonyms) > 0:
        plural = "" if len(synonyms) == 1 else "s"
        shown = colors.purple(_print_values(list(synonyms.keys())))
        msg_syn = (
            "created"
            f" {colors.purple(f'{len(synonyms)} {registry.__name__} record{plural} from Bionty')}"
            f" matching {colors.italic('synonyms')}: {shown}"
        )
        iterable_idx = iterable_idx.to_frame().rename(index=synonyms).index

    # create records for values that are found in the bionty reference
    # matching either field or synonyms
    mapped_values = iterable_idx.intersection(bionty_df[field_name])

    multi_msg = ""
    if len(mapped_values) > 0:
        bionty_kwargs, multi_msg = _bulk_create_dicts_from_df(
            keys=mapped_values, column_name=field_name, df=bionty_df
        )
        organism_kwargs = {}
        if "organism" not in kwargs:
            organism_record = _get_organism_record(
                field, public_ontology.organism, force=True
            )
            if organism_record is not None:
                organism_kwargs["organism"] = organism_record
        created += [
            registry(**row, **kwargs, **organism_kwargs) for row in bionty_kwargs
        ]

    # number of records that matches field (not synonyms)
    validated = inspection.validated
    if len(validated) > 0:
        plural = "" if len(validated) == 1 else "s"
        shown = colors.purple(_print_values(validated))
        if not mute:
            # this is the success msg for existing records in the DB
            if len(msg) > 0:
                logger.success(msg)
            logger.success(
                "created"
                f" {colors.purple(f'{len(validated)} {registry.__name__} record{plural} from Bionty')}"
                f" matching {colors.italic(f'{field_name}')}: {shown}"
            )

    if not mute:
        # make sure that synonyms logging appears after the field logging
        if msg_syn:
            logger.success(msg_syn)
        # warning about multi matches
        if len(multi_msg) > 0:
            logger.warning(multi_msg)

    # return the values that are not found in the bionty reference
    return created, iterable_idx.difference(mapped_values)
302
-
303
-
304
def index_iterable(iterable: Iterable) -> pd.Index:
    """Return the unique, non-empty, non-null values of `iterable` as an index.

    No entries are kept for NAs, '' or None; the remaining values stay in
    the order in which they first appear.
    """
    unique_values = pd.Index(iterable).unique()
    keep = (unique_values != "") & unique_values.notna()
    return unique_values[keep]
309
-
310
-
311
def _print_values(names: Iterable, n: int = 20, quotes: bool = True) -> str:
    """Format up to `n` unique values as a comma-separated string for logging.

    Args:
        names: Values to format. For a dict, each item is rendered as
            `key: value`.
        n: Maximum number of unique items to print; if there are more, the
            string ends with `, ...`.
        quotes: Whether to wrap each item in single quotes.

    Returns:
        The formatted string; empty if nothing is left to print.
    """
    # Filter on the string form so that `None` objects are skipped exactly
    # like the string "None".
    if isinstance(names, dict):
        candidates = (
            f"{key}: {value}"
            for key, value in names.items()
            if str(key) != "None" and str(value) != "None"
        )
    else:
        candidates = (str(name) for name in names if str(name) != "None")

    # dict.fromkeys keeps unique values and preserves their order
    unique_items = list(dict.fromkeys(candidates))

    if quotes:
        unique_items = [f"'{item}'" for item in unique_items]

    print_values = ", ".join(unique_items[:n])

    if len(unique_items) > n:
        print_values += ", ..."

    return print_values
333
-
334
-
335
- def _bulk_create_dicts_from_df(
336
- keys: set | list, column_name: str, df: pd.DataFrame
337
- ) -> tuple[dict, str]:
338
- """Get fields from a DataFrame for many rows."""
339
- multi_msg = ""
340
- if df.index.name != column_name:
341
- df = df.set_index(column_name).loc[list(keys)]
342
- if not df.index.is_unique:
343
- # return all records for multi-matches with a warning
344
- dup = df.index[df.index.duplicated()].unique().tolist()
345
- if len(dup) > 0:
346
- s = "" if len(dup) == 1 else "s"
347
- print_values = _print_values(dup)
348
- multi_msg = (
349
- f"ambiguous validation in Bionty for {len(dup)} record{s}:"
350
- f" {print_values}"
351
- )
352
-
353
- return df.reset_index().to_dict(orient="records"), multi_msg
354
-
355
-
356
def _has_organism_field(registry: type[Record]) -> bool:
    """Return whether `registry` declares a field named `organism`."""
    try:
        registry._meta.get_field("organism")
    except FieldDoesNotExist:
        return False
    else:
        return True
362
-
363
-
364
def _get_organism_record(
    field: StrField, organism: str | Record | None, force: bool = False
) -> Record | None:
    """Resolve `organism` to an organism record for the registry of `field`.

    Args:
        field: Field whose registry decides whether an organism applies.
        organism: Organism name or record, forwarded to
            `create_or_get_organism_record`.
        force: If `True`, skip the ontology-id exemption below.

    Returns:
        `None` if the registry has no `organism` field or the lookup is
        skipped; otherwise the result of `create_or_get_organism_record`,
        which may be `None`.
    """
    registry = field.field.model
    check = True
    if not force and hasattr(registry, "_ontology_id_field"):
        ontology_id_field = registry._ontology_id_field
        # Skip the lookup only when matching on a real ontology id field;
        # e.g. bionty.CellMarker has "name" as _ontology_id_field
        check = field.field.name != ontology_id_field or not ontology_id_field.endswith(
            "id"
        )

    if not (_has_organism_field(registry) and check):
        return None

    from bionty._bionty import create_or_get_organism_record

    return create_or_get_organism_record(organism=organism, registry=registry)
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Iterable
4
+
5
+ import pandas as pd
6
+ from django.core.exceptions import FieldDoesNotExist
7
+ from lamin_utils import colors, logger
8
+ from lnschema_core.models import Feature, Record, ULabel
9
+
10
+ from .core._settings import settings
11
+
12
+ if TYPE_CHECKING:
13
+ from lnschema_core.types import ListLike, StrField
14
+
15
+
16
+ # The base function for `from_values`
17
def get_or_create_records(
    iterable: ListLike,
    field: StrField,
    *,
    create: bool = False,
    from_source: bool = False,
    organism: Record | str | None = None,
    source: Record | None = None,
    mute: bool = False,
) -> list[Record]:
    """Get or create records from iterables.

    Args:
        iterable: Values to look up in, or create for, the registry of `field`.
        field: Registry field the values are matched against.
        create: If `True`, skip every lookup and instantiate one record per
            value.
        from_source: Whether missing values may be created from a source;
            re-evaluated below once a source record is known.
        organism: Organism record or name, resolved via `_get_organism_record`.
        source: Source record, forwarded to the lookups when not `None`.
        mute: If `True`, suppress success and warning logs.

    Returns:
        Existing records followed by the records created from the source.
    """
    registry = field.field.model
    if create:
        return [registry(**{field.field.name: value}) for value in iterable]
    creation_search_names = settings.creation.search_names
    organism = _get_organism_record(field, organism)
    kwargs: dict = {}
    if organism is not None:
        kwargs["organism"] = organism
    if source is not None:
        kwargs["source"] = source
    # `search_names` is disabled for the work below and restored in `finally`
    settings.creation.search_names = False
    try:
        iterable_idx = index_iterable(iterable)

        # returns existing records & non-existing values
        records, nonexist_values, msg = get_existing_records(
            iterable_idx=iterable_idx, field=field, mute=mute, **kwargs
        )

        # new records to be created based on new values
        if len(nonexist_values) > 0:
            source_record = None
            if from_source:
                if isinstance(source, Record):
                    source_record = source
                elif (
                    len(records) > 0
                    and hasattr(records[0], "source_id")
                    and records[0].source_id
                ):
                    # reuse the source of the records that already exist
                    source_record = records[0].source
            if not source_record and hasattr(registry, "public"):
                from bionty._bionty import get_source_record

                source_record = get_source_record(
                    registry.public(organism=organism), registry
                )
            if source_record:
                from bionty.core._add_ontology import check_source_in_db

                check_source_in_db(
                    registry=registry,
                    source=source_record,
                    update=True,
                )

                from_source = not source_record.in_db
            elif hasattr(registry, "source_id"):
                from_source = True
            else:
                from_source = False

            if from_source:
                records_bionty, unmapped_values = create_records_from_source(
                    iterable_idx=nonexist_values,
                    field=field,
                    msg=msg,
                    mute=mute,
                    **kwargs,
                )
                if len(records_bionty) > 0:
                    # `msg` is handed to `create_records_from_source` above;
                    # clear it so it is not logged again below
                    msg = ""
                    for record in records_bionty:
                        record._from_source = True
                records += records_bionty
            else:
                unmapped_values = nonexist_values
            # unmapped new_ids will NOT create records
            if len(unmapped_values) > 0:
                if len(msg) > 0 and not mute:
                    logger.success(msg)
                s = "" if len(unmapped_values) == 1 else "s"
                print_values = colors.yellow(_print_values(unmapped_values))
                name = registry.__name__
                n_nonval = colors.yellow(f"{len(unmapped_values)} non-validated")
                if not mute:
                    logger.warning(
                        f"{colors.red('did not create')} {name} record{s} for "
                        f"{n_nonval} {colors.italic(f'{field.field.name}{s}')}: {print_values}"
                    )
        if registry.__get_schema_name__() == "bionty" or registry == ULabel:
            # a named pandas Series provides the default feature of its records
            feature_name = iterable.name if isinstance(iterable, pd.Series) else None
            if isinstance(feature_name, str):
                for record in records:
                    record._feature = feature_name
                logger.debug(f"added default feature '{feature_name}'")
        return records
    finally:
        settings.creation.search_names = creation_search_names
123
+
124
+
125
def get_existing_records(
    iterable_idx: pd.Index,
    field: StrField,
    mute: bool = False,
    **kwargs,
):
    """Query the registry of `field` for records matching `iterable_idx`.

    Values are first inspected with the registry's `inspect`; values listed
    in its `synonyms_mapper` are renamed before the database is filtered.

    Args:
        iterable_idx: Values to look up.
        field: Registry field the values are matched against.
        mute: If `True`, emit no success logs.
        **kwargs: Extra filter conditions (`organism`, `source`); `source` is
            passed to `inspect` but left out of the database filter.

    Returns:
        A tuple `(records, nonexist_values, msg)`: the matching records, the
        (synonym-mapped) values without a matching record, and the success
        message for validated values if it has not been logged here (empty
        otherwise).
    """
    registry = field.field.model
    field_name = field.field.name
    # existing records matching is agnostic to the bionty source
    filters = {key: value for key, value in kwargs.items() if key != "source"}

    # standardize based on the DB reference
    inspection = registry.inspect(
        iterable_idx,
        field=field,
        organism=kwargs.get("organism"),
        source=kwargs.get("source"),
        mute=True,
    )
    synonyms = inspection.synonyms_mapper

    synonym_msg = ""
    if len(synonyms) > 0:
        plural = "" if len(synonyms) == 1 else "s"
        shown = colors.green(_print_values(list(synonyms.keys())))
        synonym_msg = (
            "loaded"
            f" {colors.green(f'{len(synonyms)} {registry.__name__} record{plural}')}"
            f" matching {colors.italic('synonyms')}: {shown}"
        )
        iterable_idx = iterable_idx.to_frame().rename(index=synonyms).index

    # kwargs carries the organism condition; add the value condition to it
    filters[f"{field_name}__in"] = iterable_idx.values
    matches = registry.filter(**filters)
    # The records are not re-ordered to follow the input: an
    # `order_by(Case(When(...)))` variant caused a factor 10 in runtime.
    records = matches.list()

    # message about validated terms
    validated = inspection.validated
    msg = ""
    if len(validated) > 0:
        plural = "" if len(validated) == 1 else "s"
        shown = colors.green(_print_values(validated))
        msg = (
            "loaded"
            f" {colors.green(f'{len(validated)} {registry.__name__} record{plural}')}"
            f" matching {colors.italic(f'{field_name}')}: {shown}"
        )

    # no logging if all values are validated
    # logs if there are synonyms
    if synonym_msg:
        if not mute:
            if msg:
                logger.success(msg)
            logger.success(synonym_msg)
        msg = ""

    found = iterable_idx.intersection(matches.values_list(field_name, flat=True))
    return records, iterable_idx.difference(found), msg
206
+
207
+
208
def create_records_from_source(
    iterable_idx: pd.Index,
    field: StrField,
    msg: str = "",
    mute: bool = False,
    **kwargs,
):
    """Create records for values that are found in the public ontology.

    Args:
        iterable_idx: Values for which no record exists yet.
        field: Registry field the values are matched against.
        msg: Pending success message about existing records; logged here
            right before the "created" message.
        mute: If `True`, emit no logs.
        **kwargs: `organism` and/or `source` passed on to the public ontology
            and to the record constructor.

    Returns:
        A tuple `(records, unmapped_values)`: the new records and the values
        not found in the reference. If the public ontology cannot be set up,
        no records are created and all values are returned as unmapped.
    """
    registry = field.field.model
    created: list = []
    # populate additional fields from bionty
    from bionty._bionty import get_source_record
    from bionty.core._bionty import filter_bionty_df_columns

    # create the corresponding bionty object from model
    try:
        field_name = field.field.name
        # TODO: more generic
        organism = kwargs.get("organism")
        if field_name == "ensembl_gene_id":
            # the prefix of the first id decides the organism
            if iterable_idx[0].startswith("ENSG"):
                organism = "human"
            elif iterable_idx[0].startswith("ENSMUSG"):
                organism = "mouse"
        public_ontology = registry.public(
            organism=organism, source=kwargs.get("source")
        )
    except Exception:
        # for custom records that are not created from public sources
        return created, iterable_idx

    # add source record to the kwargs
    kwargs["source"] = get_source_record(public_ontology, registry)

    # filter the columns in bionty df based on fields
    bionty_df = filter_bionty_df_columns(
        model=registry, public_ontology=public_ontology
    )

    # standardize in the bionty reference
    inspection = public_ontology.inspect(iterable_idx, field=field_name, mute=True)
    synonyms = inspection.synonyms_mapper

    msg_syn: str = ""
    if len(synonyms) > 0:
        plural = "" if len(synonyms) == 1 else "s"
        shown = colors.purple(_print_values(list(synonyms.keys())))
        msg_syn = (
            "created"
            f" {colors.purple(f'{len(synonyms)} {registry.__name__} record{plural} from Bionty')}"
            f" matching {colors.italic('synonyms')}: {shown}"
        )
        iterable_idx = iterable_idx.to_frame().rename(index=synonyms).index

    # create records for values that are found in the bionty reference
    # matching either field or synonyms
    mapped_values = iterable_idx.intersection(bionty_df[field_name])

    multi_msg = ""
    if len(mapped_values) > 0:
        bionty_kwargs, multi_msg = _bulk_create_dicts_from_df(
            keys=mapped_values, column_name=field_name, df=bionty_df
        )
        organism_kwargs = {}
        if "organism" not in kwargs:
            organism_record = _get_organism_record(
                field, public_ontology.organism, force=True
            )
            if organism_record is not None:
                organism_kwargs["organism"] = organism_record
        created += [
            registry(**row, **kwargs, **organism_kwargs) for row in bionty_kwargs
        ]

    # number of records that matches field (not synonyms)
    validated = inspection.validated
    if len(validated) > 0:
        plural = "" if len(validated) == 1 else "s"
        shown = colors.purple(_print_values(validated))
        if not mute:
            # this is the success msg for existing records in the DB
            if len(msg) > 0:
                logger.success(msg)
            logger.success(
                "created"
                f" {colors.purple(f'{len(validated)} {registry.__name__} record{plural} from Bionty')}"
                f" matching {colors.italic(f'{field_name}')}: {shown}"
            )

    if not mute:
        # make sure that synonyms logging appears after the field logging
        if msg_syn:
            logger.success(msg_syn)
        # warning about multi matches
        if len(multi_msg) > 0:
            logger.warning(multi_msg)

    # return the values that are not found in the bionty reference
    return created, iterable_idx.difference(mapped_values)
302
+
303
+
304
def index_iterable(iterable: Iterable) -> pd.Index:
    """Return the unique, non-empty, non-null values of `iterable` as an index.

    No entries are kept for NAs, '' or None; the remaining values stay in
    the order in which they first appear.
    """
    unique_values = pd.Index(iterable).unique()
    keep = (unique_values != "") & unique_values.notna()
    return unique_values[keep]
309
+
310
+
311
def _print_values(names: Iterable, n: int = 20, quotes: bool = True) -> str:
    """Format up to `n` unique values as a comma-separated string for logging.

    Args:
        names: Values to format. For a dict, each item is rendered as
            `key: value`.
        n: Maximum number of unique items to print; if there are more, the
            string ends with `, ...`.
        quotes: Whether to wrap each item in single quotes.

    Returns:
        The formatted string; empty if nothing is left to print.
    """
    # Filter on the string form so that `None` objects are skipped exactly
    # like the string "None".
    if isinstance(names, dict):
        candidates = (
            f"{key}: {value}"
            for key, value in names.items()
            if str(key) != "None" and str(value) != "None"
        )
    else:
        candidates = (str(name) for name in names if str(name) != "None")

    # dict.fromkeys keeps unique values and preserves their order
    unique_items = list(dict.fromkeys(candidates))

    if quotes:
        unique_items = [f"'{item}'" for item in unique_items]

    print_values = ", ".join(unique_items[:n])

    if len(unique_items) > n:
        print_values += ", ..."

    return print_values
333
+
334
+
335
+ def _bulk_create_dicts_from_df(
336
+ keys: set | list, column_name: str, df: pd.DataFrame
337
+ ) -> tuple[dict, str]:
338
+ """Get fields from a DataFrame for many rows."""
339
+ multi_msg = ""
340
+ if df.index.name != column_name:
341
+ df = df.set_index(column_name).loc[list(keys)]
342
+ if not df.index.is_unique:
343
+ # return all records for multi-matches with a warning
344
+ dup = df.index[df.index.duplicated()].unique().tolist()
345
+ if len(dup) > 0:
346
+ s = "" if len(dup) == 1 else "s"
347
+ print_values = _print_values(dup)
348
+ multi_msg = (
349
+ f"ambiguous validation in Bionty for {len(dup)} record{s}:"
350
+ f" {print_values}"
351
+ )
352
+
353
+ return df.reset_index().to_dict(orient="records"), multi_msg
354
+
355
+
356
def _has_organism_field(registry: type[Record]) -> bool:
    """Return whether `registry` declares a field named `organism`."""
    try:
        registry._meta.get_field("organism")
    except FieldDoesNotExist:
        return False
    else:
        return True
362
+
363
+
364
def _get_organism_record(
    field: StrField, organism: str | Record | None, force: bool = False
) -> Record | None:
    """Resolve `organism` to an organism record for the registry of `field`.

    Args:
        field: Field whose registry decides whether an organism applies.
        organism: Organism name or record, forwarded to
            `create_or_get_organism_record`.
        force: If `True`, skip the ontology-id exemption below.

    Returns:
        `None` if the registry has no `organism` field or the lookup is
        skipped; otherwise the result of `create_or_get_organism_record`,
        which may be `None`.
    """
    registry = field.field.model
    check = True
    if not force and hasattr(registry, "_ontology_id_field"):
        ontology_id_field = registry._ontology_id_field
        # Skip the lookup only when matching on a real ontology id field;
        # e.g. bionty.CellMarker has "name" as _ontology_id_field
        check = field.field.name != ontology_id_field or not ontology_id_field.endswith(
            "id"
        )

    if not (_has_organism_field(registry) and check):
        return None

    from bionty._bionty import create_or_get_organism_record

    return create_or_get_organism_record(organism=organism, registry=registry)