lamindb 0.76.8__py3-none-any.whl → 0.76.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. lamindb/__init__.py +114 -113
  2. lamindb/_artifact.py +1206 -1205
  3. lamindb/_can_validate.py +621 -579
  4. lamindb/_collection.py +390 -387
  5. lamindb/_curate.py +1603 -1601
  6. lamindb/_feature.py +155 -155
  7. lamindb/_feature_set.py +244 -242
  8. lamindb/_filter.py +23 -23
  9. lamindb/_finish.py +250 -256
  10. lamindb/_from_values.py +403 -382
  11. lamindb/_is_versioned.py +40 -40
  12. lamindb/_parents.py +476 -476
  13. lamindb/_query_manager.py +125 -125
  14. lamindb/_query_set.py +364 -362
  15. lamindb/_record.py +668 -649
  16. lamindb/_run.py +60 -57
  17. lamindb/_save.py +310 -308
  18. lamindb/_storage.py +14 -14
  19. lamindb/_transform.py +130 -127
  20. lamindb/_ulabel.py +56 -56
  21. lamindb/_utils.py +9 -9
  22. lamindb/_view.py +72 -72
  23. lamindb/core/__init__.py +94 -94
  24. lamindb/core/_context.py +590 -574
  25. lamindb/core/_data.py +510 -438
  26. lamindb/core/_django.py +209 -0
  27. lamindb/core/_feature_manager.py +994 -867
  28. lamindb/core/_label_manager.py +289 -253
  29. lamindb/core/_mapped_collection.py +631 -597
  30. lamindb/core/_settings.py +188 -187
  31. lamindb/core/_sync_git.py +138 -138
  32. lamindb/core/_track_environment.py +27 -27
  33. lamindb/core/datasets/__init__.py +59 -59
  34. lamindb/core/datasets/_core.py +581 -571
  35. lamindb/core/datasets/_fake.py +36 -36
  36. lamindb/core/exceptions.py +90 -90
  37. lamindb/core/fields.py +12 -12
  38. lamindb/core/loaders.py +164 -164
  39. lamindb/core/schema.py +56 -56
  40. lamindb/core/storage/__init__.py +25 -25
  41. lamindb/core/storage/_anndata_accessor.py +741 -740
  42. lamindb/core/storage/_anndata_sizes.py +41 -41
  43. lamindb/core/storage/_backed_access.py +98 -98
  44. lamindb/core/storage/_tiledbsoma.py +204 -204
  45. lamindb/core/storage/_valid_suffixes.py +21 -21
  46. lamindb/core/storage/_zarr.py +110 -110
  47. lamindb/core/storage/objects.py +62 -62
  48. lamindb/core/storage/paths.py +172 -172
  49. lamindb/core/subsettings/__init__.py +12 -12
  50. lamindb/core/subsettings/_creation_settings.py +38 -38
  51. lamindb/core/subsettings/_transform_settings.py +21 -21
  52. lamindb/core/types.py +19 -19
  53. lamindb/core/versioning.py +146 -158
  54. lamindb/integrations/__init__.py +12 -12
  55. lamindb/integrations/_vitessce.py +107 -107
  56. lamindb/setup/__init__.py +14 -14
  57. lamindb/setup/core/__init__.py +4 -4
  58. {lamindb-0.76.8.dist-info → lamindb-0.76.10.dist-info}/LICENSE +201 -201
  59. {lamindb-0.76.8.dist-info → lamindb-0.76.10.dist-info}/METADATA +8 -8
  60. lamindb-0.76.10.dist-info/RECORD +61 -0
  61. {lamindb-0.76.8.dist-info → lamindb-0.76.10.dist-info}/WHEEL +1 -1
  62. lamindb-0.76.8.dist-info/RECORD +0 -60
lamindb/_from_values.py CHANGED
@@ -1,382 +1,403 @@
1
- from __future__ import annotations
2
-
3
- from typing import TYPE_CHECKING, Iterable
4
-
5
- import pandas as pd
6
- from django.core.exceptions import FieldDoesNotExist
7
- from lamin_utils import colors, logger
8
- from lnschema_core.models import Feature, Record, ULabel
9
-
10
- from .core._settings import settings
11
-
12
- if TYPE_CHECKING:
13
- from lnschema_core.types import ListLike, StrField
14
-
15
-
16
- # The base function for `from_values`
17
- def get_or_create_records(
18
- iterable: ListLike,
19
- field: StrField,
20
- *,
21
- create: bool = False,
22
- from_source: bool = False,
23
- organism: Record | str | None = None,
24
- source: Record | None = None,
25
- mute: bool = False,
26
- ) -> list[Record]:
27
- """Get or create records from iterables."""
28
- registry = field.field.model
29
- if create:
30
- return [registry(**{field.field.name: value}) for value in iterable]
31
- creation_search_names = settings.creation.search_names
32
- feature: Feature = None
33
- organism = _get_organism_record(field, organism)
34
- kwargs: dict = {}
35
- if organism is not None:
36
- kwargs["organism"] = organism
37
- if source is not None:
38
- kwargs["source"] = source
39
- settings.creation.search_names = False
40
- try:
41
- iterable_idx = index_iterable(iterable)
42
-
43
- # returns existing records & non-existing values
44
- records, nonexist_values, msg = get_existing_records(
45
- iterable_idx=iterable_idx, field=field, mute=mute, **kwargs
46
- )
47
-
48
- # new records to be created based on new values
49
- if len(nonexist_values) > 0:
50
- source_record = None
51
- if from_source:
52
- if isinstance(source, Record):
53
- source_record = source
54
- elif (
55
- len(records) > 0
56
- and hasattr(records[0], "source_id")
57
- and records[0].source_id
58
- ):
59
- source_record = records[0].source
60
- if not source_record and hasattr(registry, "public"):
61
- from bionty._bionty import get_source_record
62
-
63
- source_record = get_source_record(
64
- registry.public(organism=organism), registry
65
- )
66
- if source_record:
67
- from bionty.core._add_ontology import check_source_in_db
68
-
69
- check_source_in_db(
70
- registry=registry,
71
- source=source_record,
72
- update=True,
73
- )
74
-
75
- from_source = not source_record.in_db
76
- elif hasattr(registry, "source_id"):
77
- from_source = True
78
- else:
79
- from_source = False
80
-
81
- if from_source:
82
- records_bionty, unmapped_values = create_records_from_source(
83
- iterable_idx=nonexist_values,
84
- field=field,
85
- msg=msg,
86
- mute=mute,
87
- **kwargs,
88
- )
89
- if len(records_bionty) > 0:
90
- msg = ""
91
- for record in records_bionty:
92
- record._from_source = True
93
- records += records_bionty
94
- else:
95
- unmapped_values = nonexist_values
96
- # unmapped new_ids will NOT create records
97
- if len(unmapped_values) > 0:
98
- if len(msg) > 0 and not mute:
99
- logger.success(msg)
100
- s = "" if len(unmapped_values) == 1 else "s"
101
- print_values = colors.yellow(_print_values(unmapped_values))
102
- name = registry.__name__
103
- n_nonval = colors.yellow(f"{len(unmapped_values)} non-validated")
104
- if not mute:
105
- logger.warning(
106
- f"{colors.red('did not create')} {name} record{s} for "
107
- f"{n_nonval} {colors.italic(f'{field.field.name}{s}')}: {print_values}"
108
- )
109
- if registry.__get_schema_name__() == "bionty" or registry == ULabel:
110
- if isinstance(iterable, pd.Series):
111
- feature = iterable.name
112
- feature_name = None
113
- if isinstance(feature, str):
114
- feature_name = feature
115
- if feature_name is not None:
116
- if feature_name is not None:
117
- for record in records:
118
- record._feature = feature_name
119
- logger.debug(f"added default feature '{feature_name}'")
120
- return records
121
- finally:
122
- settings.creation.search_names = creation_search_names
123
-
124
-
125
- def get_existing_records(
126
- iterable_idx: pd.Index,
127
- field: StrField,
128
- mute: bool = False,
129
- **kwargs,
130
- ):
131
- model = field.field.model
132
- condition: dict = {} if len(kwargs) == 0 else kwargs.copy()
133
- # existing records matching is agnostic to the bionty source
134
- if "source" in condition:
135
- condition.pop("source")
136
-
137
- # standardize based on the DB reference
138
- # log synonyms mapped terms
139
- result = model.inspect(
140
- iterable_idx,
141
- field=field,
142
- organism=kwargs.get("organism"),
143
- source=kwargs.get("source"),
144
- mute=True,
145
- )
146
- syn_mapper = result.synonyms_mapper
147
-
148
- syn_msg = ""
149
- if len(syn_mapper) > 0:
150
- s = "" if len(syn_mapper) == 1 else "s"
151
- names = list(syn_mapper.keys())
152
- print_values = colors.green(_print_values(names))
153
- syn_msg = (
154
- "loaded"
155
- f" {colors.green(f'{len(syn_mapper)} {model.__name__} record{s}')}"
156
- f" matching {colors.italic('synonyms')}: {print_values}"
157
- )
158
- iterable_idx = iterable_idx.to_frame().rename(index=syn_mapper).index
159
-
160
- # get all existing records in the db
161
- # if necessary, create records for the values in kwargs
162
- # k:v -> k:v_record
163
- # kwargs is used to deal with organism
164
- condition.update({f"{field.field.name}__in": iterable_idx.values})
165
-
166
- query_set = model.filter(**condition)
167
- records = query_set.list()
168
-
169
- # now we have to sort the list of queried records
170
- # preserved = Case(
171
- # *[
172
- # When(**{field.field.name: value}, then=pos)
173
- # for pos, value in enumerate(iterable_idx)
174
- # ]
175
- # )
176
- # order by causes a factor 10 in runtime
177
- # records = query_set.order_by(preserved).list()
178
-
179
- # log validated terms
180
- validated = result.validated
181
- msg = ""
182
- if len(validated) > 0:
183
- s = "" if len(validated) == 1 else "s"
184
- print_values = colors.green(_print_values(validated))
185
- msg = (
186
- "loaded"
187
- f" {colors.green(f'{len(validated)} {model.__name__} record{s}')}"
188
- f" matching {colors.italic(f'{field.field.name}')}: {print_values}"
189
- )
190
-
191
- # no logging if all values are validated
192
- # logs if there are synonyms
193
- if len(syn_msg) > 0:
194
- if len(msg) > 0 and not mute:
195
- logger.success(msg)
196
- if not mute:
197
- logger.success(syn_msg)
198
- msg = ""
199
-
200
- existing_values = iterable_idx.intersection(
201
- query_set.values_list(field.field.name, flat=True)
202
- )
203
- nonexist_values = iterable_idx.difference(existing_values)
204
-
205
- return records, nonexist_values, msg
206
-
207
-
208
- def create_records_from_source(
209
- iterable_idx: pd.Index,
210
- field: StrField,
211
- msg: str = "",
212
- mute: bool = False,
213
- **kwargs,
214
- ):
215
- model = field.field.model
216
- records: list = []
217
- # populate additional fields from bionty
218
- from bionty._bionty import get_source_record
219
- from bionty.core._bionty import filter_bionty_df_columns
220
-
221
- # create the corresponding bionty object from model
222
- try:
223
- # TODO: more generic
224
- organism = kwargs.get("organism")
225
- if field.field.name == "ensembl_gene_id":
226
- if iterable_idx[0].startswith("ENSG"):
227
- organism = "human"
228
- elif iterable_idx[0].startswith("ENSMUSG"):
229
- organism = "mouse"
230
- public_ontology = model.public(organism=organism, source=kwargs.get("source"))
231
- except Exception:
232
- # for custom records that are not created from public sources
233
- return records, iterable_idx
234
- # add source record to the kwargs
235
- source_record = get_source_record(public_ontology, model)
236
- kwargs.update({"source": source_record})
237
-
238
- # filter the columns in bionty df based on fields
239
- bionty_df = filter_bionty_df_columns(model=model, public_ontology=public_ontology)
240
-
241
- # standardize in the bionty reference
242
- result = public_ontology.inspect(iterable_idx, field=field.field.name, mute=True)
243
- syn_mapper = result.synonyms_mapper
244
-
245
- msg_syn: str = ""
246
- if len(syn_mapper) > 0:
247
- s = "" if len(syn_mapper) == 1 else "s"
248
- names = list(syn_mapper.keys())
249
- print_values = colors.purple(_print_values(names))
250
- msg_syn = (
251
- "created"
252
- f" {colors.purple(f'{len(syn_mapper)} {model.__name__} record{s} from Bionty')}"
253
- f" matching {colors.italic('synonyms')}: {print_values}"
254
- )
255
-
256
- iterable_idx = iterable_idx.to_frame().rename(index=syn_mapper).index
257
-
258
- # create records for values that are found in the bionty reference
259
- # matching either field or synonyms
260
- mapped_values = iterable_idx.intersection(bionty_df[field.field.name])
261
-
262
- multi_msg = ""
263
- if len(mapped_values) > 0:
264
- bionty_kwargs, multi_msg = _bulk_create_dicts_from_df(
265
- keys=mapped_values, column_name=field.field.name, df=bionty_df
266
- )
267
- organism_kwargs = {}
268
- if "organism" not in kwargs:
269
- organism_record = _get_organism_record(
270
- field, public_ontology.organism, force=True
271
- )
272
- if organism_record is not None:
273
- organism_kwargs["organism"] = organism_record
274
- for bk in bionty_kwargs:
275
- records.append(model(**bk, **kwargs, **organism_kwargs))
276
-
277
- # number of records that matches field (not synonyms)
278
- validated = result.validated
279
- if len(validated) > 0:
280
- s = "" if len(validated) == 1 else "s"
281
- print_values = colors.purple(_print_values(validated))
282
- # this is the success msg for existing records in the DB
283
- if len(msg) > 0 and not mute:
284
- logger.success(msg)
285
- if not mute:
286
- logger.success(
287
- "created"
288
- f" {colors.purple(f'{len(validated)} {model.__name__} record{s} from Bionty')}"
289
- f" matching {colors.italic(f'{field.field.name}')}: {print_values}"
290
- )
291
-
292
- # make sure that synonyms logging appears after the field logging
293
- if len(msg_syn) > 0 and not mute:
294
- logger.success(msg_syn)
295
- # warning about multi matches
296
- if len(multi_msg) > 0 and not mute:
297
- logger.warning(multi_msg)
298
-
299
- # return the values that are not found in the bionty reference
300
- unmapped_values = iterable_idx.difference(mapped_values)
301
- return records, unmapped_values
302
-
303
-
304
- def index_iterable(iterable: Iterable) -> pd.Index:
305
- idx = pd.Index(iterable).unique()
306
- # No entries are made for NAs, '', None
307
- # returns an ordered unique not null list
308
- return idx[(idx != "") & (~idx.isnull())]
309
-
310
-
311
- def _print_values(names: Iterable, n: int = 20, quotes: bool = True) -> str:
312
- if isinstance(names, dict):
313
- items = {
314
- f"{key}: {value}": None
315
- for key, value in names.items()
316
- if key != "None" and value != "None"
317
- }
318
- else:
319
- # Use a dictionary instead of a list to have unique values and preserve order
320
- items = {str(name): None for name in names if name != "None"}
321
-
322
- unique_items = list(items.keys())
323
-
324
- if quotes:
325
- unique_items = [f"'{item}'" for item in unique_items]
326
-
327
- print_values = ", ".join(unique_items[:n])
328
-
329
- if len(unique_items) > n:
330
- print_values += ", ..."
331
-
332
- return print_values
333
-
334
-
335
- def _bulk_create_dicts_from_df(
336
- keys: set | list, column_name: str, df: pd.DataFrame
337
- ) -> tuple[dict, str]:
338
- """Get fields from a DataFrame for many rows."""
339
- multi_msg = ""
340
- if df.index.name != column_name:
341
- df = df.set_index(column_name).loc[list(keys)]
342
- if not df.index.is_unique:
343
- # return all records for multi-matches with a warning
344
- dup = df.index[df.index.duplicated()].unique().tolist()
345
- if len(dup) > 0:
346
- s = "" if len(dup) == 1 else "s"
347
- print_values = _print_values(dup)
348
- multi_msg = (
349
- f"ambiguous validation in Bionty for {len(dup)} record{s}:"
350
- f" {print_values}"
351
- )
352
-
353
- return df.reset_index().to_dict(orient="records"), multi_msg
354
-
355
-
356
- def _has_organism_field(registry: type[Record]) -> bool:
357
- try:
358
- registry._meta.get_field("organism")
359
- return True
360
- except FieldDoesNotExist:
361
- return False
362
-
363
-
364
- def _get_organism_record(
365
- field: StrField, organism: str | Record, force: bool = False
366
- ) -> Record:
367
- registry = field.field.model
368
- check = True
369
- if not force and hasattr(registry, "_ontology_id_field"):
370
- check = field.field.name != registry._ontology_id_field
371
- # e.g. bionty.CellMarker has "name" as _ontology_id_field
372
- if not registry._ontology_id_field.endswith("id"):
373
- check = True
374
-
375
- if _has_organism_field(registry) and check:
376
- from bionty._bionty import create_or_get_organism_record
377
-
378
- organism_record = create_or_get_organism_record(
379
- organism=organism, registry=registry
380
- )
381
- if organism_record is not None:
382
- return organism_record
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ import pandas as pd
6
+ from django.core.exceptions import FieldDoesNotExist
7
+ from lamin_utils import colors, logger
8
+ from lnschema_core.models import Feature, Field, Record, ULabel
9
+
10
+ from .core._settings import settings
11
+
12
+ if TYPE_CHECKING:
13
+ from collections.abc import Iterable
14
+
15
+ from lnschema_core.types import ListLike, StrField
16
+
17
+
18
+ # The base function for `from_values`
19
+ def get_or_create_records(
20
+ iterable: ListLike,
21
+ field: StrField,
22
+ *,
23
+ create: bool = False,
24
+ from_source: bool = False,
25
+ organism: Record | str | None = None,
26
+ source: Record | None = None,
27
+ mute: bool = False,
28
+ ) -> list[Record]:
29
+ """Get or create records from iterables."""
30
+ registry = field.field.model
31
+ if create:
32
+ return [registry(**{field.field.name: value}) for value in iterable]
33
+ creation_search_names = settings.creation.search_names
34
+ organism = _get_organism_record(field, organism)
35
+ settings.creation.search_names = False
36
+ try:
37
+ iterable_idx = index_iterable(iterable)
38
+
39
+ # returns existing records & non-existing values
40
+ records, nonexist_values, msg = get_existing_records(
41
+ iterable_idx=iterable_idx,
42
+ field=field,
43
+ organism=organism,
44
+ mute=mute,
45
+ )
46
+
47
+ # new records to be created based on new values
48
+ if len(nonexist_values) > 0:
49
+ source_record = None
50
+ if from_source:
51
+ if isinstance(source, Record):
52
+ source_record = source
53
+ elif (
54
+ len(records) > 0
55
+ and hasattr(records[0], "source_id")
56
+ and records[0].source_id
57
+ ):
58
+ source_record = records[0].source
59
+ if not source_record and hasattr(registry, "public"):
60
+ if organism is None:
61
+ organism = _ensembl_prefix(nonexist_values[0], field, organism)
62
+ organism = _get_organism_record(field, organism, force=True)
63
+
64
+ if source_record:
65
+ from bionty.core._add_ontology import check_source_in_db
66
+
67
+ check_source_in_db(
68
+ registry=registry,
69
+ source=source_record,
70
+ update=True,
71
+ )
72
+
73
+ from_source = not source_record.in_db
74
+ elif hasattr(registry, "source_id"):
75
+ from_source = True
76
+ else:
77
+ from_source = False
78
+
79
+ if from_source:
80
+ records_bionty, unmapped_values = create_records_from_source(
81
+ iterable_idx=nonexist_values,
82
+ field=field,
83
+ organism=organism,
84
+ source=source_record,
85
+ msg=msg,
86
+ mute=mute,
87
+ )
88
+ if len(records_bionty) > 0:
89
+ msg = ""
90
+ for record in records_bionty:
91
+ record._from_source = True
92
+ records += records_bionty
93
+ else:
94
+ unmapped_values = nonexist_values
95
+ # unmapped new_ids will NOT create records
96
+ if len(unmapped_values) > 0:
97
+ if len(msg) > 0 and not mute:
98
+ logger.success(msg)
99
+ s = "" if len(unmapped_values) == 1 else "s"
100
+ print_values = colors.yellow(_print_values(unmapped_values))
101
+ name = registry.__name__
102
+ n_nonval = colors.yellow(f"{len(unmapped_values)} non-validated")
103
+ if not mute:
104
+ logger.warning(
105
+ f"{colors.red('did not create')} {name} record{s} for "
106
+ f"{n_nonval} {colors.italic(f'{field.field.name}{s}')}: {print_values}"
107
+ )
108
+ # if registry.__get_schema_name__() == "bionty" or registry == ULabel:
109
+ # if isinstance(iterable, pd.Series):
110
+ # feature = iterable.name
111
+ # feature_name = None
112
+ # if isinstance(feature, str):
113
+ # feature_name = feature
114
+ # if feature_name is not None:
115
+ # if feature_name is not None:
116
+ # for record in records:
117
+ # record._feature = feature_name
118
+ # logger.debug(f"added default feature '{feature_name}'")
119
+ return records
120
+ finally:
121
+ settings.creation.search_names = creation_search_names
122
+
123
+
124
+ def get_existing_records(
125
+ iterable_idx: pd.Index,
126
+ field: StrField,
127
+ organism: Record | None = None,
128
+ mute: bool = False,
129
+ ):
130
+ # NOTE: existing records matching is agnostic to the source
131
+ model = field.field.model
132
+ if organism is None and field.field.name == "ensembl_gene_id":
133
+ if len(iterable_idx) > 0:
134
+ organism = _ensembl_prefix(iterable_idx[0], field, organism)
135
+ organism = _get_organism_record(field, organism, force=True)
136
+
137
+ # standardize based on the DB reference
138
+ # log synonyms mapped terms
139
+ syn_mapper = model.standardize(
140
+ iterable_idx,
141
+ field=field,
142
+ organism=organism,
143
+ mute=True,
144
+ public_aware=False,
145
+ return_mapper=True,
146
+ )
147
+ iterable_idx = iterable_idx.to_frame().rename(index=syn_mapper).index
148
+
149
+ # now we have to sort the list of queried records
150
+ # preserved = Case(
151
+ # *[
152
+ # When(**{field.field.name: value}, then=pos)
153
+ # for pos, value in enumerate(iterable_idx)
154
+ # ]
155
+ # )
156
+ # order by causes a factor 10 in runtime
157
+ # records = query_set.order_by(preserved).list()
158
+
159
+ # log validated terms
160
+ is_validated = model.validate(
161
+ iterable_idx, field=field, organism=organism, mute=True
162
+ )
163
+ if len(is_validated) > 0:
164
+ validated = iterable_idx[is_validated]
165
+ else:
166
+ validated = []
167
+ msg = ""
168
+ syn_msg = ""
169
+ if not mute:
170
+ if len(validated) > 0:
171
+ s = "" if len(validated) == 1 else "s"
172
+ print_values = colors.green(_print_values(validated))
173
+ msg = (
174
+ "loaded"
175
+ f" {colors.green(f'{len(validated)} {model.__name__} record{s}')}"
176
+ f" matching {colors.italic(f'{field.field.name}')}: {print_values}"
177
+ )
178
+ if len(syn_mapper) > 0:
179
+ s = "" if len(syn_mapper) == 1 else "s"
180
+ names = list(syn_mapper.keys())
181
+ print_values = colors.green(_print_values(names))
182
+ syn_msg = (
183
+ "loaded"
184
+ f" {colors.green(f'{len(syn_mapper)} {model.__name__} record{s}')}"
185
+ f" matching {colors.italic('synonyms')}: {print_values}"
186
+ )
187
+
188
+ # no logging if all values are validated
189
+ # logs if there are synonyms
190
+ if len(syn_msg) > 0:
191
+ if len(msg) > 0 and not mute:
192
+ logger.success(msg)
193
+ if not mute:
194
+ logger.success(syn_msg)
195
+ msg = ""
196
+
197
+ # get all existing records in the db
198
+ # if necessary, create records for the values in kwargs
199
+ # k:v -> k:v_record
200
+ query = {f"{field.field.name}__in": iterable_idx.values}
201
+ if organism is not None:
202
+ query["organism"] = organism
203
+ records = model.filter(**query).list()
204
+
205
+ if len(validated) == len(iterable_idx):
206
+ return records, [], msg
207
+ else:
208
+ nonval_values = iterable_idx.difference(validated)
209
+ return records, nonval_values, msg
210
+
211
+
212
+ def create_records_from_source(
213
+ iterable_idx: pd.Index,
214
+ field: StrField,
215
+ organism: Record | None = None,
216
+ source: Record | None = None,
217
+ msg: str = "",
218
+ mute: bool = False,
219
+ ):
220
+ model = field.field.model
221
+ records: list = []
222
+ # populate additional fields from bionty
223
+ from bionty._bionty import get_source_record
224
+ from bionty.core._bionty import filter_bionty_df_columns
225
+
226
+ # create the corresponding bionty object from model
227
+ try:
228
+ # TODO: more generic
229
+ public_ontology = model.public(organism=organism, source=source)
230
+ except Exception:
231
+ # for custom records that are not created from public sources
232
+ return records, iterable_idx
233
+ # get the default source
234
+ if source is None:
235
+ source = get_source_record(public_ontology, model)
236
+
237
+ # filter the columns in bionty df based on fields
238
+ bionty_df = filter_bionty_df_columns(model=model, public_ontology=public_ontology)
239
+
240
+ # standardize in the bionty reference
241
+ result = public_ontology.inspect(iterable_idx, field=field.field.name, mute=True)
242
+ syn_mapper = result.synonyms_mapper
243
+
244
+ msg_syn: str = ""
245
+ if len(syn_mapper) > 0:
246
+ s = "" if len(syn_mapper) == 1 else "s"
247
+ names = list(syn_mapper.keys())
248
+ print_values = colors.purple(_print_values(names))
249
+ msg_syn = (
250
+ "created"
251
+ f" {colors.purple(f'{len(syn_mapper)} {model.__name__} record{s} from Bionty')}"
252
+ f" matching {colors.italic('synonyms')}: {print_values}"
253
+ )
254
+
255
+ iterable_idx = iterable_idx.to_frame().rename(index=syn_mapper).index
256
+
257
+ # create records for values that are found in the bionty reference
258
+ # matching either field or synonyms
259
+ mapped_values = iterable_idx.intersection(bionty_df[field.field.name])
260
+
261
+ multi_msg = ""
262
+ if len(mapped_values) > 0:
263
+ bionty_kwargs, multi_msg = _bulk_create_dicts_from_df(
264
+ keys=mapped_values, column_name=field.field.name, df=bionty_df
265
+ )
266
+
267
+ if hasattr(model, "organism_id") and organism is None:
268
+ organism = _get_organism_record(field, source.organism, force=True)
269
+
270
+ create_kwargs = (
271
+ {"organism": organism, "source": source}
272
+ if organism is not None
273
+ else {"source": source}
274
+ )
275
+ for bk in bionty_kwargs:
276
+ records.append(model(**bk, **create_kwargs))
277
+
278
+ # number of records that matches field (not synonyms)
279
+ validated = result.validated
280
+ if len(validated) > 0:
281
+ s = "" if len(validated) == 1 else "s"
282
+ print_values = colors.purple(_print_values(validated))
283
+ # this is the success msg for existing records in the DB
284
+ if len(msg) > 0 and not mute:
285
+ logger.success(msg)
286
+ if not mute:
287
+ logger.success(
288
+ "created"
289
+ f" {colors.purple(f'{len(validated)} {model.__name__} record{s} from Bionty')}"
290
+ f" matching {colors.italic(f'{field.field.name}')}: {print_values}"
291
+ )
292
+
293
+ # make sure that synonyms logging appears after the field logging
294
+ if len(msg_syn) > 0 and not mute:
295
+ logger.success(msg_syn)
296
+ # warning about multi matches
297
+ if len(multi_msg) > 0 and not mute:
298
+ logger.warning(multi_msg)
299
+
300
+ # return the values that are not found in the bionty reference
301
+ unmapped_values = iterable_idx.difference(mapped_values)
302
+ return records, unmapped_values
303
+
304
+
305
+ def index_iterable(iterable: Iterable) -> pd.Index:
306
+ idx = pd.Index(iterable).unique()
307
+ # No entries are made for NAs, '', None
308
+ # returns an ordered unique not null list
309
+ return idx[(idx != "") & (~idx.isnull())]
310
+
311
+
312
+ def _print_values(names: Iterable, n: int = 20, quotes: bool = True) -> str:
313
+ if isinstance(names, dict):
314
+ items = {
315
+ f"{key}: {value}": None
316
+ for key, value in names.items()
317
+ if key != "None" and value != "None"
318
+ }
319
+ else:
320
+ # Use a dictionary instead of a list to have unique values and preserve order
321
+ items = {str(name): None for name in names if name != "None"}
322
+
323
+ unique_items = list(items.keys())
324
+
325
+ if quotes:
326
+ unique_items = [f"'{item}'" for item in unique_items]
327
+
328
+ print_values = ", ".join(unique_items[:n])
329
+
330
+ if len(unique_items) > n:
331
+ print_values += ", ..."
332
+
333
+ return print_values
334
+
335
+
336
+ def _bulk_create_dicts_from_df(
337
+ keys: set | list, column_name: str, df: pd.DataFrame
338
+ ) -> tuple[dict, str]:
339
+ """Get fields from a DataFrame for many rows."""
340
+ multi_msg = ""
341
+ if df.index.name != column_name:
342
+ df = df.set_index(column_name).loc[list(keys)]
343
+ if not df.index.is_unique:
344
+ # return all records for multi-matches with a warning
345
+ dup = df.index[df.index.duplicated()].unique().tolist()
346
+ if len(dup) > 0:
347
+ s = "" if len(dup) == 1 else "s"
348
+ print_values = _print_values(dup)
349
+ multi_msg = (
350
+ f"ambiguous validation in Bionty for {len(dup)} record{s}:"
351
+ f" {print_values}"
352
+ )
353
+
354
+ return df.reset_index().to_dict(orient="records"), multi_msg
355
+
356
+
357
+ def _has_organism_field(registry: type[Record]) -> bool:
358
+ try:
359
+ registry._meta.get_field("organism")
360
+ return True
361
+ except FieldDoesNotExist:
362
+ return False
363
+
364
+
365
+ def _get_organism_record(
366
+ field: StrField, organism: str | Record, force: bool = False
367
+ ) -> Record:
368
+ """Get organism record.
369
+
370
+ Args:
371
+ field: the field to get the organism record for
372
+ organism: the organism to get the record for
373
+ force: whether to force fetching the organism record
374
+ """
375
+ registry = field.field.model
376
+ check = True
377
+ if not force and hasattr(registry, "_ontology_id_field"):
378
+ check = field.field.name != registry._ontology_id_field
379
+ # e.g. bionty.CellMarker has "name" as _ontology_id_field
380
+ if not registry._ontology_id_field.endswith("id"):
381
+ check = True
382
+
383
+ if _has_organism_field(registry) and check:
384
+ from bionty._bionty import create_or_get_organism_record
385
+
386
+ if field and not isinstance(field, str):
387
+ field = field.field.name
388
+
389
+ organism_record = create_or_get_organism_record(
390
+ organism=organism, registry=registry, field=field
391
+ )
392
+ if organism_record is not None:
393
+ return organism_record
394
+
395
+
396
+ def _ensembl_prefix(id: str, field: StrField, organism: Record | None) -> str | None:
397
+ if field.field.name == "ensembl_gene_id" and organism is None:
398
+ if id.startswith("ENSG"):
399
+ organism = "human"
400
+ elif id.startswith("ENSMUSG"):
401
+ organism = "mouse"
402
+
403
+ return organism