lamindb 0.76.8__py3-none-any.whl → 0.76.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +114 -113
- lamindb/_artifact.py +1206 -1205
- lamindb/_can_validate.py +621 -579
- lamindb/_collection.py +390 -387
- lamindb/_curate.py +1603 -1601
- lamindb/_feature.py +155 -155
- lamindb/_feature_set.py +244 -242
- lamindb/_filter.py +23 -23
- lamindb/_finish.py +250 -256
- lamindb/_from_values.py +403 -382
- lamindb/_is_versioned.py +40 -40
- lamindb/_parents.py +476 -476
- lamindb/_query_manager.py +125 -125
- lamindb/_query_set.py +364 -362
- lamindb/_record.py +668 -649
- lamindb/_run.py +60 -57
- lamindb/_save.py +310 -308
- lamindb/_storage.py +14 -14
- lamindb/_transform.py +130 -127
- lamindb/_ulabel.py +56 -56
- lamindb/_utils.py +9 -9
- lamindb/_view.py +72 -72
- lamindb/core/__init__.py +94 -94
- lamindb/core/_context.py +590 -574
- lamindb/core/_data.py +510 -438
- lamindb/core/_django.py +209 -0
- lamindb/core/_feature_manager.py +994 -867
- lamindb/core/_label_manager.py +289 -253
- lamindb/core/_mapped_collection.py +631 -597
- lamindb/core/_settings.py +188 -187
- lamindb/core/_sync_git.py +138 -138
- lamindb/core/_track_environment.py +27 -27
- lamindb/core/datasets/__init__.py +59 -59
- lamindb/core/datasets/_core.py +581 -571
- lamindb/core/datasets/_fake.py +36 -36
- lamindb/core/exceptions.py +90 -90
- lamindb/core/fields.py +12 -12
- lamindb/core/loaders.py +164 -164
- lamindb/core/schema.py +56 -56
- lamindb/core/storage/__init__.py +25 -25
- lamindb/core/storage/_anndata_accessor.py +741 -740
- lamindb/core/storage/_anndata_sizes.py +41 -41
- lamindb/core/storage/_backed_access.py +98 -98
- lamindb/core/storage/_tiledbsoma.py +204 -204
- lamindb/core/storage/_valid_suffixes.py +21 -21
- lamindb/core/storage/_zarr.py +110 -110
- lamindb/core/storage/objects.py +62 -62
- lamindb/core/storage/paths.py +172 -172
- lamindb/core/subsettings/__init__.py +12 -12
- lamindb/core/subsettings/_creation_settings.py +38 -38
- lamindb/core/subsettings/_transform_settings.py +21 -21
- lamindb/core/types.py +19 -19
- lamindb/core/versioning.py +146 -158
- lamindb/integrations/__init__.py +12 -12
- lamindb/integrations/_vitessce.py +107 -107
- lamindb/setup/__init__.py +14 -14
- lamindb/setup/core/__init__.py +4 -4
- {lamindb-0.76.8.dist-info → lamindb-0.76.10.dist-info}/LICENSE +201 -201
- {lamindb-0.76.8.dist-info → lamindb-0.76.10.dist-info}/METADATA +8 -8
- lamindb-0.76.10.dist-info/RECORD +61 -0
- {lamindb-0.76.8.dist-info → lamindb-0.76.10.dist-info}/WHEEL +1 -1
- lamindb-0.76.8.dist-info/RECORD +0 -60
lamindb/_from_values.py
CHANGED
@@ -1,382 +1,403 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
|
3
|
-
from typing import TYPE_CHECKING
|
4
|
-
|
5
|
-
import pandas as pd
|
6
|
-
from django.core.exceptions import FieldDoesNotExist
|
7
|
-
from lamin_utils import colors, logger
|
8
|
-
from lnschema_core.models import Feature, Record, ULabel
|
9
|
-
|
10
|
-
from .core._settings import settings
|
11
|
-
|
12
|
-
if TYPE_CHECKING:
|
13
|
-
from
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
and
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
from_source =
|
76
|
-
|
77
|
-
from_source =
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
msg=msg,
|
86
|
-
mute=mute,
|
87
|
-
|
88
|
-
)
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
f"{colors.
|
107
|
-
|
108
|
-
|
109
|
-
if
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
mute: bool = False,
|
129
|
-
|
130
|
-
|
131
|
-
model = field.field.model
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
# standardize based on the DB reference
|
138
|
-
# log synonyms mapped terms
|
139
|
-
|
140
|
-
iterable_idx,
|
141
|
-
field=field,
|
142
|
-
organism=
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
"
|
252
|
-
f" {colors.
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
#
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
if
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
f"
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
#
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
f" {
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
)
|
381
|
-
|
382
|
-
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from typing import TYPE_CHECKING
|
4
|
+
|
5
|
+
import pandas as pd
|
6
|
+
from django.core.exceptions import FieldDoesNotExist
|
7
|
+
from lamin_utils import colors, logger
|
8
|
+
from lnschema_core.models import Feature, Field, Record, ULabel
|
9
|
+
|
10
|
+
from .core._settings import settings
|
11
|
+
|
12
|
+
if TYPE_CHECKING:
|
13
|
+
from collections.abc import Iterable
|
14
|
+
|
15
|
+
from lnschema_core.types import ListLike, StrField
|
16
|
+
|
17
|
+
|
18
|
+
# The base function for `from_values`
|
19
|
+
def get_or_create_records(
    iterable: ListLike,
    field: StrField,
    *,
    create: bool = False,
    from_source: bool = False,
    organism: Record | str | None = None,
    source: Record | None = None,
    mute: bool = False,
) -> list[Record]:
    """Get or create records from iterables.

    Args:
        iterable: Values to look up in ``field``.
        field: Registry field to match against; ``field.field.model`` is the
            registry and ``field.field.name`` the attribute name.
        create: If ``True``, skip every lookup and instantiate one record per
            value of ``iterable``.
        from_source: If ``True``, pick the source record from ``source`` or,
            failing that, from the first existing record that has a source.
            Whether records are then actually created from a source is
            recomputed below from the registry and the chosen source record.
        organism: Organism record or name, resolved via
            ``_get_organism_record``.
        source: Source record to use when ``from_source`` is ``True``.
        mute: If ``True``, suppress all log messages.

    Returns:
        The records that already exist for the values, followed by records
        newly created from the source. Values that match neither only trigger
        a warning; no record is created for them.
    """
    registry = field.field.model
    if create:
        return [registry(**{field.field.name: value}) for value in iterable]
    creation_search_names = settings.creation.search_names
    organism = _get_organism_record(field, organism)
    # name search is switched off for the duration of the lookup and restored
    # in the `finally` clause, whatever happens in between
    settings.creation.search_names = False
    try:
        iterable_idx = index_iterable(iterable)

        # returns existing records & non-existing values
        records, nonexist_values, msg = get_existing_records(
            iterable_idx=iterable_idx,
            field=field,
            organism=organism,
            mute=mute,
        )

        # new records to be created based on new values
        if len(nonexist_values) > 0:
            source_record = None
            if from_source:
                if isinstance(source, Record):
                    source_record = source
                elif (
                    len(records) > 0
                    and hasattr(records[0], "source_id")
                    and records[0].source_id
                ):
                    # fall back to the source of the first existing record
                    source_record = records[0].source
            if not source_record and hasattr(registry, "public"):
                if organism is None:
                    organism = _ensembl_prefix(nonexist_values[0], field, organism)
                    organism = _get_organism_record(field, organism, force=True)

            if source_record:
                from bionty.core._add_ontology import check_source_in_db

                check_source_in_db(
                    registry=registry,
                    source=source_record,
                    update=True,
                )

                from_source = not source_record.in_db
            elif hasattr(registry, "source_id"):
                from_source = True
            else:
                from_source = False

            if from_source:
                records_bionty, unmapped_values = create_records_from_source(
                    iterable_idx=nonexist_values,
                    field=field,
                    organism=organism,
                    source=source_record,
                    msg=msg,
                    mute=mute,
                )
                if len(records_bionty) > 0:
                    # the pending message is handed to
                    # create_records_from_source above, so drop it here
                    msg = ""
                for record in records_bionty:
                    record._from_source = True
                records += records_bionty
            else:
                unmapped_values = nonexist_values
            # unmapped new_ids will NOT create records
            if len(unmapped_values) > 0:
                if len(msg) > 0 and not mute:
                    logger.success(msg)
                s = "" if len(unmapped_values) == 1 else "s"
                print_values = colors.yellow(_print_values(unmapped_values))
                name = registry.__name__
                n_nonval = colors.yellow(f"{len(unmapped_values)} non-validated")
                if not mute:
                    logger.warning(
                        f"{colors.red('did not create')} {name} record{s} for "
                        f"{n_nonval} {colors.italic(f'{field.field.name}{s}')}: {print_values}"
                    )
        return records
    finally:
        settings.creation.search_names = creation_search_names
|
122
|
+
|
123
|
+
|
124
|
+
def get_existing_records(
    iterable_idx: pd.Index,
    field: StrField,
    organism: Record | None = None,
    mute: bool = False,
) -> tuple[list, pd.Index | list, str]:
    """Query the records that already exist for the given values.

    Args:
        iterable_idx: Index of values to look up.
        field: Registry field to match against.
        organism: Organism record used to restrict the query; for the
            ``ensembl_gene_id`` field it is derived from the prefix of the
            first value when not given.
        mute: If ``True``, build and emit no log messages.

    Returns:
        A tuple ``(records, nonval_values, msg)``: the records matching the
        standardized values, the values that did not validate (an empty list
        if all validated), and a not-yet-logged success message (empty when
        muted or when it was already logged together with a synonyms message).
    """
    # NOTE: existing records matching is agnostic to the source
    model = field.field.model
    if organism is None and field.field.name == "ensembl_gene_id":
        if len(iterable_idx) > 0:
            organism = _ensembl_prefix(iterable_idx[0], field, organism)
            organism = _get_organism_record(field, organism, force=True)

    # standardize based on the DB reference
    # log synonyms mapped terms
    syn_mapper = model.standardize(
        iterable_idx,
        field=field,
        organism=organism,
        mute=True,
        public_aware=False,
        return_mapper=True,
    )
    # replace synonyms by their standardized value
    iterable_idx = iterable_idx.to_frame().rename(index=syn_mapper).index

    # NOTE: the queried records are not re-ordered to follow the input order;
    # an `order_by` on a Case/When expression was found to cause a factor 10
    # in runtime

    # log validated terms
    is_validated = model.validate(
        iterable_idx, field=field, organism=organism, mute=True
    )
    if len(is_validated) > 0:
        validated = iterable_idx[is_validated]
    else:
        validated = []
    msg = ""
    syn_msg = ""
    if not mute:
        if len(validated) > 0:
            s = "" if len(validated) == 1 else "s"
            print_values = colors.green(_print_values(validated))
            msg = (
                "loaded"
                f" {colors.green(f'{len(validated)} {model.__name__} record{s}')}"
                f" matching {colors.italic(f'{field.field.name}')}: {print_values}"
            )
        if len(syn_mapper) > 0:
            s = "" if len(syn_mapper) == 1 else "s"
            names = list(syn_mapper.keys())
            print_values = colors.green(_print_values(names))
            syn_msg = (
                "loaded"
                f" {colors.green(f'{len(syn_mapper)} {model.__name__} record{s}')}"
                f" matching {colors.italic('synonyms')}: {print_values}"
            )

    # no logging if all values are validated
    # logs if there are synonyms
    if len(syn_msg) > 0:
        if len(msg) > 0 and not mute:
            logger.success(msg)
        if not mute:
            logger.success(syn_msg)
        # already logged, so the caller must not log it again
        msg = ""

    # get all existing records in the db
    query = {f"{field.field.name}__in": iterable_idx.values}
    if organism is not None:
        query["organism"] = organism
    records = model.filter(**query).list()

    if len(validated) == len(iterable_idx):
        return records, [], msg
    nonval_values = iterable_idx.difference(validated)
    return records, nonval_values, msg
|
210
|
+
|
211
|
+
|
212
|
+
def create_records_from_source(
    iterable_idx: pd.Index,
    field: StrField,
    organism: Record | None = None,
    source: Record | None = None,
    msg: str = "",
    mute: bool = False,
) -> tuple[list, pd.Index]:
    """Create records for the values that are found in the public reference.

    Args:
        iterable_idx: Values for which no record exists yet.
        field: Registry field the values belong to.
        organism: Organism passed to ``model.public``; if the model has an
            ``organism_id`` attribute and no organism is given, it is taken
            from the source.
        source: Source record; defaults to the one returned by
            ``get_source_record`` for the public ontology.
        msg: Pending success message about existing records, logged right
            before the "created" message.
        mute: If ``True``, suppress all log messages.

    Returns:
        A tuple ``(records, unmapped_values)``: the newly instantiated records
        and the values that are not found in the reference. If
        ``model.public`` raises, no records are created and all of
        ``iterable_idx`` is returned as unmapped.
    """
    model = field.field.model
    records: list = []
    # populate additional fields from bionty
    from bionty._bionty import get_source_record
    from bionty.core._bionty import filter_bionty_df_columns

    # create the corresponding bionty object from model
    try:
        # TODO: more generic
        public_ontology = model.public(organism=organism, source=source)
    except Exception:
        # for custom records that are not created from public sources
        return records, iterable_idx
    # get the default source
    if source is None:
        source = get_source_record(public_ontology, model)

    # filter the columns in bionty df based on fields
    bionty_df = filter_bionty_df_columns(model=model, public_ontology=public_ontology)

    # standardize in the bionty reference
    result = public_ontology.inspect(iterable_idx, field=field.field.name, mute=True)
    syn_mapper = result.synonyms_mapper

    msg_syn: str = ""
    if len(syn_mapper) > 0:
        s = "" if len(syn_mapper) == 1 else "s"
        names = list(syn_mapper.keys())
        print_values = colors.purple(_print_values(names))
        msg_syn = (
            "created"
            f" {colors.purple(f'{len(syn_mapper)} {model.__name__} record{s} from Bionty')}"
            f" matching {colors.italic('synonyms')}: {print_values}"
        )

        # replace synonyms by their standardized value
        iterable_idx = iterable_idx.to_frame().rename(index=syn_mapper).index

    # create records for values that are found in the bionty reference
    # matching either field or synonyms
    mapped_values = iterable_idx.intersection(bionty_df[field.field.name])

    multi_msg = ""
    if len(mapped_values) > 0:
        bionty_kwargs, multi_msg = _bulk_create_dicts_from_df(
            keys=mapped_values, column_name=field.field.name, df=bionty_df
        )

        if hasattr(model, "organism_id") and organism is None:
            organism = _get_organism_record(field, source.organism, force=True)

        create_kwargs = (
            {"organism": organism, "source": source}
            if organism is not None
            else {"source": source}
        )
        records.extend(model(**bk, **create_kwargs) for bk in bionty_kwargs)

        # number of records that matches field (not synonyms)
        validated = result.validated
        if len(validated) > 0:
            s = "" if len(validated) == 1 else "s"
            print_values = colors.purple(_print_values(validated))
            # this is the success msg for existing records in the DB
            if len(msg) > 0 and not mute:
                logger.success(msg)
            if not mute:
                logger.success(
                    "created"
                    f" {colors.purple(f'{len(validated)} {model.__name__} record{s} from Bionty')}"
                    f" matching {colors.italic(f'{field.field.name}')}: {print_values}"
                )

    # make sure that synonyms logging appears after the field logging
    if len(msg_syn) > 0 and not mute:
        logger.success(msg_syn)
    # warning about multi matches
    if len(multi_msg) > 0 and not mute:
        logger.warning(multi_msg)

    # return the values that are not found in the bionty reference
    unmapped_values = iterable_idx.difference(mapped_values)
    return records, unmapped_values
|
303
|
+
|
304
|
+
|
305
|
+
def index_iterable(iterable: Iterable) -> pd.Index:
    """Return the unique values of `iterable` as an index, in order of first occurrence.

    No entries are made for empty strings and null values (``None``, NaN).
    """
    idx = pd.Index(iterable).unique()
    # keep only entries that are neither '' nor null
    keep = (idx != "") & (~idx.isnull())
    return idx[keep]
|
310
|
+
|
311
|
+
|
312
|
+
def _print_values(names: Iterable, n: int = 20, quotes: bool = True) -> str:
    """Format unique values as a comma-separated string for log messages.

    Args:
        names: Values to print; for a dict, each entry is printed as
            ``"key: value"``.
        n: Maximum number of unique values to print; if there are more,
            ``", ..."`` is appended.
        quotes: Whether to wrap each value in single quotes.

    Returns:
        The formatted string. Entries equal to the string ``"None"`` are
        skipped (for a dict: entries whose key or value equals ``"None"``).
    """
    if isinstance(names, dict):
        items = {
            f"{key}: {value}": None
            for key, value in names.items()
            if key != "None" and value != "None"
        }
    else:
        # Use a dictionary instead of a list to have unique values and preserve order
        items = {str(name): None for name in names if name != "None"}

    unique_items = list(items.keys())

    if quotes:
        unique_items = [f"'{item}'" for item in unique_items]

    print_values = ", ".join(unique_items[:n])

    if len(unique_items) > n:
        print_values += ", ..."

    return print_values
|
334
|
+
|
335
|
+
|
336
|
+
def _bulk_create_dicts_from_df(
    keys: set | list, column_name: str, df: pd.DataFrame
) -> tuple[list[dict], str]:
    """Get fields from a DataFrame for many rows.

    Args:
        keys: Values of ``column_name`` whose rows are selected.
        column_name: Column (or name of the index) that holds the keys.
        df: DataFrame to select the rows from.

    Returns:
        A tuple ``(rows, multi_msg)``: one dict per selected row, with
        ``column_name`` included as a key, and a warning message that is
        non-empty if any key matches more than one row.
    """
    multi_msg = ""
    if df.index.name != column_name:
        df = df.set_index(column_name)
    # select the requested rows also when the frame is already indexed by
    # `column_name`; otherwise every row of the frame would be returned
    df = df.loc[list(keys)]
    if not df.index.is_unique:
        # return all records for multi-matches with a warning
        dup = df.index[df.index.duplicated()].unique().tolist()
        if len(dup) > 0:
            s = "" if len(dup) == 1 else "s"
            print_values = _print_values(dup)
            multi_msg = (
                f"ambiguous validation in Bionty for {len(dup)} record{s}:"
                f" {print_values}"
            )

    return df.reset_index().to_dict(orient="records"), multi_msg
|
355
|
+
|
356
|
+
|
357
|
+
def _has_organism_field(registry: type[Record]) -> bool:
    """Return whether `registry` defines a field named ``organism``."""
    try:
        registry._meta.get_field("organism")
    except FieldDoesNotExist:
        return False
    return True
|
363
|
+
|
364
|
+
|
365
|
+
def _get_organism_record(
    field: StrField, organism: str | Record | None, force: bool = False
) -> Record | None:
    """Get organism record.

    Args:
        field: the field to get the organism record for
        organism: the organism to get the record for
        force: whether to force fetching the organism record

    Returns:
        The organism record, or ``None`` if the registry has no ``organism``
        field, if the lookup is skipped for an ontology id field, or if no
        record is found.
    """
    registry = field.field.model
    check = True
    if not force and hasattr(registry, "_ontology_id_field"):
        # skip the lookup when matching on the ontology id field itself
        check = field.field.name != registry._ontology_id_field
        # e.g. bionty.CellMarker has "name" as _ontology_id_field
        if not registry._ontology_id_field.endswith("id"):
            check = True

    if _has_organism_field(registry) and check:
        from bionty._bionty import create_or_get_organism_record

        if field and not isinstance(field, str):
            field = field.field.name

        organism_record = create_or_get_organism_record(
            organism=organism, registry=registry, field=field
        )
        if organism_record is not None:
            return organism_record
    return None
|
394
|
+
|
395
|
+
|
396
|
+
def _ensembl_prefix(
    id: str, field: StrField, organism: Record | None
) -> Record | str | None:
    """Infer the organism name from the prefix of an Ensembl gene id.

    Args:
        id: The identifier to inspect.
        field: The field the identifier belongs to; inference only happens
            for the ``ensembl_gene_id`` field.
        organism: An already known organism; returned unchanged if not
            ``None``.

    Returns:
        ``"human"`` for ids starting with ``ENSG``, ``"mouse"`` for ids
        starting with ``ENSMUSG``, otherwise ``organism`` as passed in.
    """
    if (
        field.field.name == "ensembl_gene_id"
        and organism is None
        # non-string ids have no prefix to inspect
        and isinstance(id, str)
    ):
        if id.startswith("ENSG"):
            organism = "human"
        elif id.startswith("ENSMUSG"):
            organism = "mouse"

    return organism
|