lamindb 0.76.7__py3-none-any.whl → 0.76.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +113 -113
- lamindb/_artifact.py +1205 -1178
- lamindb/_can_validate.py +579 -579
- lamindb/_collection.py +387 -387
- lamindb/_curate.py +1601 -1601
- lamindb/_feature.py +155 -155
- lamindb/_feature_set.py +242 -242
- lamindb/_filter.py +23 -23
- lamindb/_finish.py +256 -256
- lamindb/_from_values.py +382 -382
- lamindb/_is_versioned.py +40 -40
- lamindb/_parents.py +476 -476
- lamindb/_query_manager.py +125 -125
- lamindb/_query_set.py +362 -362
- lamindb/_record.py +649 -649
- lamindb/_run.py +57 -57
- lamindb/_save.py +308 -295
- lamindb/_storage.py +14 -14
- lamindb/_transform.py +127 -127
- lamindb/_ulabel.py +56 -56
- lamindb/_utils.py +9 -9
- lamindb/_view.py +72 -72
- lamindb/core/__init__.py +94 -94
- lamindb/core/_context.py +574 -574
- lamindb/core/_data.py +438 -438
- lamindb/core/_feature_manager.py +867 -867
- lamindb/core/_label_manager.py +253 -253
- lamindb/core/_mapped_collection.py +597 -597
- lamindb/core/_settings.py +187 -187
- lamindb/core/_sync_git.py +138 -138
- lamindb/core/_track_environment.py +27 -27
- lamindb/core/datasets/__init__.py +59 -59
- lamindb/core/datasets/_core.py +571 -571
- lamindb/core/datasets/_fake.py +36 -36
- lamindb/core/exceptions.py +90 -77
- lamindb/core/fields.py +12 -12
- lamindb/core/loaders.py +164 -164
- lamindb/core/schema.py +56 -56
- lamindb/core/storage/__init__.py +25 -25
- lamindb/core/storage/_anndata_accessor.py +740 -740
- lamindb/core/storage/_anndata_sizes.py +41 -41
- lamindb/core/storage/_backed_access.py +98 -98
- lamindb/core/storage/_tiledbsoma.py +204 -204
- lamindb/core/storage/_valid_suffixes.py +21 -21
- lamindb/core/storage/_zarr.py +110 -110
- lamindb/core/storage/objects.py +62 -62
- lamindb/core/storage/paths.py +172 -141
- lamindb/core/subsettings/__init__.py +12 -12
- lamindb/core/subsettings/_creation_settings.py +38 -38
- lamindb/core/subsettings/_transform_settings.py +21 -21
- lamindb/core/types.py +19 -19
- lamindb/core/versioning.py +158 -158
- lamindb/integrations/__init__.py +12 -12
- lamindb/integrations/_vitessce.py +107 -107
- lamindb/setup/__init__.py +14 -14
- lamindb/setup/core/__init__.py +4 -4
- {lamindb-0.76.7.dist-info → lamindb-0.76.8.dist-info}/LICENSE +201 -201
- {lamindb-0.76.7.dist-info → lamindb-0.76.8.dist-info}/METADATA +3 -3
- lamindb-0.76.8.dist-info/RECORD +60 -0
- {lamindb-0.76.7.dist-info → lamindb-0.76.8.dist-info}/WHEEL +1 -1
- lamindb-0.76.7.dist-info/RECORD +0 -60
lamindb/_from_values.py
CHANGED
@@ -1,382 +1,382 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
|
3
|
-
from typing import TYPE_CHECKING, Iterable
|
4
|
-
|
5
|
-
import pandas as pd
|
6
|
-
from django.core.exceptions import FieldDoesNotExist
|
7
|
-
from lamin_utils import colors, logger
|
8
|
-
from lnschema_core.models import Feature, Record, ULabel
|
9
|
-
|
10
|
-
from .core._settings import settings
|
11
|
-
|
12
|
-
if TYPE_CHECKING:
|
13
|
-
from lnschema_core.types import ListLike, StrField
|
14
|
-
|
15
|
-
|
16
|
-
# The base function for `from_values`
|
17
|
-
def get_or_create_records(
|
18
|
-
iterable: ListLike,
|
19
|
-
field: StrField,
|
20
|
-
*,
|
21
|
-
create: bool = False,
|
22
|
-
from_source: bool = False,
|
23
|
-
organism: Record | str | None = None,
|
24
|
-
source: Record | None = None,
|
25
|
-
mute: bool = False,
|
26
|
-
) -> list[Record]:
|
27
|
-
"""Get or create records from iterables."""
|
28
|
-
registry = field.field.model
|
29
|
-
if create:
|
30
|
-
return [registry(**{field.field.name: value}) for value in iterable]
|
31
|
-
creation_search_names = settings.creation.search_names
|
32
|
-
feature: Feature = None
|
33
|
-
organism = _get_organism_record(field, organism)
|
34
|
-
kwargs: dict = {}
|
35
|
-
if organism is not None:
|
36
|
-
kwargs["organism"] = organism
|
37
|
-
if source is not None:
|
38
|
-
kwargs["source"] = source
|
39
|
-
settings.creation.search_names = False
|
40
|
-
try:
|
41
|
-
iterable_idx = index_iterable(iterable)
|
42
|
-
|
43
|
-
# returns existing records & non-existing values
|
44
|
-
records, nonexist_values, msg = get_existing_records(
|
45
|
-
iterable_idx=iterable_idx, field=field, mute=mute, **kwargs
|
46
|
-
)
|
47
|
-
|
48
|
-
# new records to be created based on new values
|
49
|
-
if len(nonexist_values) > 0:
|
50
|
-
source_record = None
|
51
|
-
if from_source:
|
52
|
-
if isinstance(source, Record):
|
53
|
-
source_record = source
|
54
|
-
elif (
|
55
|
-
len(records) > 0
|
56
|
-
and hasattr(records[0], "source_id")
|
57
|
-
and records[0].source_id
|
58
|
-
):
|
59
|
-
source_record = records[0].source
|
60
|
-
if not source_record and hasattr(registry, "public"):
|
61
|
-
from bionty._bionty import get_source_record
|
62
|
-
|
63
|
-
source_record = get_source_record(
|
64
|
-
registry.public(organism=organism), registry
|
65
|
-
)
|
66
|
-
if source_record:
|
67
|
-
from bionty.core._add_ontology import check_source_in_db
|
68
|
-
|
69
|
-
check_source_in_db(
|
70
|
-
registry=registry,
|
71
|
-
source=source_record,
|
72
|
-
update=True,
|
73
|
-
)
|
74
|
-
|
75
|
-
from_source = not source_record.in_db
|
76
|
-
elif hasattr(registry, "source_id"):
|
77
|
-
from_source = True
|
78
|
-
else:
|
79
|
-
from_source = False
|
80
|
-
|
81
|
-
if from_source:
|
82
|
-
records_bionty, unmapped_values = create_records_from_source(
|
83
|
-
iterable_idx=nonexist_values,
|
84
|
-
field=field,
|
85
|
-
msg=msg,
|
86
|
-
mute=mute,
|
87
|
-
**kwargs,
|
88
|
-
)
|
89
|
-
if len(records_bionty) > 0:
|
90
|
-
msg = ""
|
91
|
-
for record in records_bionty:
|
92
|
-
record._from_source = True
|
93
|
-
records += records_bionty
|
94
|
-
else:
|
95
|
-
unmapped_values = nonexist_values
|
96
|
-
# unmapped new_ids will NOT create records
|
97
|
-
if len(unmapped_values) > 0:
|
98
|
-
if len(msg) > 0 and not mute:
|
99
|
-
logger.success(msg)
|
100
|
-
s = "" if len(unmapped_values) == 1 else "s"
|
101
|
-
print_values = colors.yellow(_print_values(unmapped_values))
|
102
|
-
name = registry.__name__
|
103
|
-
n_nonval = colors.yellow(f"{len(unmapped_values)} non-validated")
|
104
|
-
if not mute:
|
105
|
-
logger.warning(
|
106
|
-
f"{colors.red('did not create')} {name} record{s} for "
|
107
|
-
f"{n_nonval} {colors.italic(f'{field.field.name}{s}')}: {print_values}"
|
108
|
-
)
|
109
|
-
if registry.__get_schema_name__() == "bionty" or registry == ULabel:
|
110
|
-
if isinstance(iterable, pd.Series):
|
111
|
-
feature = iterable.name
|
112
|
-
feature_name = None
|
113
|
-
if isinstance(feature, str):
|
114
|
-
feature_name = feature
|
115
|
-
if feature_name is not None:
|
116
|
-
if feature_name is not None:
|
117
|
-
for record in records:
|
118
|
-
record._feature = feature_name
|
119
|
-
logger.debug(f"added default feature '{feature_name}'")
|
120
|
-
return records
|
121
|
-
finally:
|
122
|
-
settings.creation.search_names = creation_search_names
|
123
|
-
|
124
|
-
|
125
|
-
def get_existing_records(
|
126
|
-
iterable_idx: pd.Index,
|
127
|
-
field: StrField,
|
128
|
-
mute: bool = False,
|
129
|
-
**kwargs,
|
130
|
-
):
|
131
|
-
model = field.field.model
|
132
|
-
condition: dict = {} if len(kwargs) == 0 else kwargs.copy()
|
133
|
-
# existing records matching is agnostic to the bionty source
|
134
|
-
if "source" in condition:
|
135
|
-
condition.pop("source")
|
136
|
-
|
137
|
-
# standardize based on the DB reference
|
138
|
-
# log synonyms mapped terms
|
139
|
-
result = model.inspect(
|
140
|
-
iterable_idx,
|
141
|
-
field=field,
|
142
|
-
organism=kwargs.get("organism"),
|
143
|
-
source=kwargs.get("source"),
|
144
|
-
mute=True,
|
145
|
-
)
|
146
|
-
syn_mapper = result.synonyms_mapper
|
147
|
-
|
148
|
-
syn_msg = ""
|
149
|
-
if len(syn_mapper) > 0:
|
150
|
-
s = "" if len(syn_mapper) == 1 else "s"
|
151
|
-
names = list(syn_mapper.keys())
|
152
|
-
print_values = colors.green(_print_values(names))
|
153
|
-
syn_msg = (
|
154
|
-
"loaded"
|
155
|
-
f" {colors.green(f'{len(syn_mapper)} {model.__name__} record{s}')}"
|
156
|
-
f" matching {colors.italic('synonyms')}: {print_values}"
|
157
|
-
)
|
158
|
-
iterable_idx = iterable_idx.to_frame().rename(index=syn_mapper).index
|
159
|
-
|
160
|
-
# get all existing records in the db
|
161
|
-
# if necessary, create records for the values in kwargs
|
162
|
-
# k:v -> k:v_record
|
163
|
-
# kwargs is used to deal with organism
|
164
|
-
condition.update({f"{field.field.name}__in": iterable_idx.values})
|
165
|
-
|
166
|
-
query_set = model.filter(**condition)
|
167
|
-
records = query_set.list()
|
168
|
-
|
169
|
-
# now we have to sort the list of queried records
|
170
|
-
# preserved = Case(
|
171
|
-
# *[
|
172
|
-
# When(**{field.field.name: value}, then=pos)
|
173
|
-
# for pos, value in enumerate(iterable_idx)
|
174
|
-
# ]
|
175
|
-
# )
|
176
|
-
# order by causes a factor 10 in runtime
|
177
|
-
# records = query_set.order_by(preserved).list()
|
178
|
-
|
179
|
-
# log validated terms
|
180
|
-
validated = result.validated
|
181
|
-
msg = ""
|
182
|
-
if len(validated) > 0:
|
183
|
-
s = "" if len(validated) == 1 else "s"
|
184
|
-
print_values = colors.green(_print_values(validated))
|
185
|
-
msg = (
|
186
|
-
"loaded"
|
187
|
-
f" {colors.green(f'{len(validated)} {model.__name__} record{s}')}"
|
188
|
-
f" matching {colors.italic(f'{field.field.name}')}: {print_values}"
|
189
|
-
)
|
190
|
-
|
191
|
-
# no logging if all values are validated
|
192
|
-
# logs if there are synonyms
|
193
|
-
if len(syn_msg) > 0:
|
194
|
-
if len(msg) > 0 and not mute:
|
195
|
-
logger.success(msg)
|
196
|
-
if not mute:
|
197
|
-
logger.success(syn_msg)
|
198
|
-
msg = ""
|
199
|
-
|
200
|
-
existing_values = iterable_idx.intersection(
|
201
|
-
query_set.values_list(field.field.name, flat=True)
|
202
|
-
)
|
203
|
-
nonexist_values = iterable_idx.difference(existing_values)
|
204
|
-
|
205
|
-
return records, nonexist_values, msg
|
206
|
-
|
207
|
-
|
208
|
-
def create_records_from_source(
|
209
|
-
iterable_idx: pd.Index,
|
210
|
-
field: StrField,
|
211
|
-
msg: str = "",
|
212
|
-
mute: bool = False,
|
213
|
-
**kwargs,
|
214
|
-
):
|
215
|
-
model = field.field.model
|
216
|
-
records: list = []
|
217
|
-
# populate additional fields from bionty
|
218
|
-
from bionty._bionty import get_source_record
|
219
|
-
from bionty.core._bionty import filter_bionty_df_columns
|
220
|
-
|
221
|
-
# create the corresponding bionty object from model
|
222
|
-
try:
|
223
|
-
# TODO: more generic
|
224
|
-
organism = kwargs.get("organism")
|
225
|
-
if field.field.name == "ensembl_gene_id":
|
226
|
-
if iterable_idx[0].startswith("ENSG"):
|
227
|
-
organism = "human"
|
228
|
-
elif iterable_idx[0].startswith("ENSMUSG"):
|
229
|
-
organism = "mouse"
|
230
|
-
public_ontology = model.public(organism=organism, source=kwargs.get("source"))
|
231
|
-
except Exception:
|
232
|
-
# for custom records that are not created from public sources
|
233
|
-
return records, iterable_idx
|
234
|
-
# add source record to the kwargs
|
235
|
-
source_record = get_source_record(public_ontology, model)
|
236
|
-
kwargs.update({"source": source_record})
|
237
|
-
|
238
|
-
# filter the columns in bionty df based on fields
|
239
|
-
bionty_df = filter_bionty_df_columns(model=model, public_ontology=public_ontology)
|
240
|
-
|
241
|
-
# standardize in the bionty reference
|
242
|
-
result = public_ontology.inspect(iterable_idx, field=field.field.name, mute=True)
|
243
|
-
syn_mapper = result.synonyms_mapper
|
244
|
-
|
245
|
-
msg_syn: str = ""
|
246
|
-
if len(syn_mapper) > 0:
|
247
|
-
s = "" if len(syn_mapper) == 1 else "s"
|
248
|
-
names = list(syn_mapper.keys())
|
249
|
-
print_values = colors.purple(_print_values(names))
|
250
|
-
msg_syn = (
|
251
|
-
"created"
|
252
|
-
f" {colors.purple(f'{len(syn_mapper)} {model.__name__} record{s} from Bionty')}"
|
253
|
-
f" matching {colors.italic('synonyms')}: {print_values}"
|
254
|
-
)
|
255
|
-
|
256
|
-
iterable_idx = iterable_idx.to_frame().rename(index=syn_mapper).index
|
257
|
-
|
258
|
-
# create records for values that are found in the bionty reference
|
259
|
-
# matching either field or synonyms
|
260
|
-
mapped_values = iterable_idx.intersection(bionty_df[field.field.name])
|
261
|
-
|
262
|
-
multi_msg = ""
|
263
|
-
if len(mapped_values) > 0:
|
264
|
-
bionty_kwargs, multi_msg = _bulk_create_dicts_from_df(
|
265
|
-
keys=mapped_values, column_name=field.field.name, df=bionty_df
|
266
|
-
)
|
267
|
-
organism_kwargs = {}
|
268
|
-
if "organism" not in kwargs:
|
269
|
-
organism_record = _get_organism_record(
|
270
|
-
field, public_ontology.organism, force=True
|
271
|
-
)
|
272
|
-
if organism_record is not None:
|
273
|
-
organism_kwargs["organism"] = organism_record
|
274
|
-
for bk in bionty_kwargs:
|
275
|
-
records.append(model(**bk, **kwargs, **organism_kwargs))
|
276
|
-
|
277
|
-
# number of records that matches field (not synonyms)
|
278
|
-
validated = result.validated
|
279
|
-
if len(validated) > 0:
|
280
|
-
s = "" if len(validated) == 1 else "s"
|
281
|
-
print_values = colors.purple(_print_values(validated))
|
282
|
-
# this is the success msg for existing records in the DB
|
283
|
-
if len(msg) > 0 and not mute:
|
284
|
-
logger.success(msg)
|
285
|
-
if not mute:
|
286
|
-
logger.success(
|
287
|
-
"created"
|
288
|
-
f" {colors.purple(f'{len(validated)} {model.__name__} record{s} from Bionty')}"
|
289
|
-
f" matching {colors.italic(f'{field.field.name}')}: {print_values}"
|
290
|
-
)
|
291
|
-
|
292
|
-
# make sure that synonyms logging appears after the field logging
|
293
|
-
if len(msg_syn) > 0 and not mute:
|
294
|
-
logger.success(msg_syn)
|
295
|
-
# warning about multi matches
|
296
|
-
if len(multi_msg) > 0 and not mute:
|
297
|
-
logger.warning(multi_msg)
|
298
|
-
|
299
|
-
# return the values that are not found in the bionty reference
|
300
|
-
unmapped_values = iterable_idx.difference(mapped_values)
|
301
|
-
return records, unmapped_values
|
302
|
-
|
303
|
-
|
304
|
-
def index_iterable(iterable: Iterable) -> pd.Index:
|
305
|
-
idx = pd.Index(iterable).unique()
|
306
|
-
# No entries are made for NAs, '', None
|
307
|
-
# returns an ordered unique not null list
|
308
|
-
return idx[(idx != "") & (~idx.isnull())]
|
309
|
-
|
310
|
-
|
311
|
-
def _print_values(names: Iterable, n: int = 20, quotes: bool = True) -> str:
|
312
|
-
if isinstance(names, dict):
|
313
|
-
items = {
|
314
|
-
f"{key}: {value}": None
|
315
|
-
for key, value in names.items()
|
316
|
-
if key != "None" and value != "None"
|
317
|
-
}
|
318
|
-
else:
|
319
|
-
# Use a dictionary instead of a list to have unique values and preserve order
|
320
|
-
items = {str(name): None for name in names if name != "None"}
|
321
|
-
|
322
|
-
unique_items = list(items.keys())
|
323
|
-
|
324
|
-
if quotes:
|
325
|
-
unique_items = [f"'{item}'" for item in unique_items]
|
326
|
-
|
327
|
-
print_values = ", ".join(unique_items[:n])
|
328
|
-
|
329
|
-
if len(unique_items) > n:
|
330
|
-
print_values += ", ..."
|
331
|
-
|
332
|
-
return print_values
|
333
|
-
|
334
|
-
|
335
|
-
def _bulk_create_dicts_from_df(
|
336
|
-
keys: set | list, column_name: str, df: pd.DataFrame
|
337
|
-
) -> tuple[dict, str]:
|
338
|
-
"""Get fields from a DataFrame for many rows."""
|
339
|
-
multi_msg = ""
|
340
|
-
if df.index.name != column_name:
|
341
|
-
df = df.set_index(column_name).loc[list(keys)]
|
342
|
-
if not df.index.is_unique:
|
343
|
-
# return all records for multi-matches with a warning
|
344
|
-
dup = df.index[df.index.duplicated()].unique().tolist()
|
345
|
-
if len(dup) > 0:
|
346
|
-
s = "" if len(dup) == 1 else "s"
|
347
|
-
print_values = _print_values(dup)
|
348
|
-
multi_msg = (
|
349
|
-
f"ambiguous validation in Bionty for {len(dup)} record{s}:"
|
350
|
-
f" {print_values}"
|
351
|
-
)
|
352
|
-
|
353
|
-
return df.reset_index().to_dict(orient="records"), multi_msg
|
354
|
-
|
355
|
-
|
356
|
-
def _has_organism_field(registry: type[Record]) -> bool:
|
357
|
-
try:
|
358
|
-
registry._meta.get_field("organism")
|
359
|
-
return True
|
360
|
-
except FieldDoesNotExist:
|
361
|
-
return False
|
362
|
-
|
363
|
-
|
364
|
-
def _get_organism_record(
|
365
|
-
field: StrField, organism: str | Record, force: bool = False
|
366
|
-
) -> Record:
|
367
|
-
registry = field.field.model
|
368
|
-
check = True
|
369
|
-
if not force and hasattr(registry, "_ontology_id_field"):
|
370
|
-
check = field.field.name != registry._ontology_id_field
|
371
|
-
# e.g. bionty.CellMarker has "name" as _ontology_id_field
|
372
|
-
if not registry._ontology_id_field.endswith("id"):
|
373
|
-
check = True
|
374
|
-
|
375
|
-
if _has_organism_field(registry) and check:
|
376
|
-
from bionty._bionty import create_or_get_organism_record
|
377
|
-
|
378
|
-
organism_record = create_or_get_organism_record(
|
379
|
-
organism=organism, registry=registry
|
380
|
-
)
|
381
|
-
if organism_record is not None:
|
382
|
-
return organism_record
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from typing import TYPE_CHECKING, Iterable
|
4
|
+
|
5
|
+
import pandas as pd
|
6
|
+
from django.core.exceptions import FieldDoesNotExist
|
7
|
+
from lamin_utils import colors, logger
|
8
|
+
from lnschema_core.models import Feature, Record, ULabel
|
9
|
+
|
10
|
+
from .core._settings import settings
|
11
|
+
|
12
|
+
if TYPE_CHECKING:
|
13
|
+
from lnschema_core.types import ListLike, StrField
|
14
|
+
|
15
|
+
|
16
|
+
# The base function for `from_values`
|
17
|
+
def get_or_create_records(
    iterable: ListLike,
    field: StrField,
    *,
    create: bool = False,
    from_source: bool = False,
    organism: Record | str | None = None,
    source: Record | None = None,
    mute: bool = False,
) -> list[Record]:
    """Get or create records from iterables.

    Args:
        iterable: Values to look up in, or create records for, ``field``.
        field: Registry field the values belong to; the registry is taken
            from ``field.field.model``.
        create: If ``True``, construct one record per value and return
            immediately, without any lookup.
        from_source: If ``True``, ``source`` (or the source of the first
            existing record) is used as the source record. The flag is then
            recomputed from the resolved source record before deciding
            whether to create records from the public source.
        organism: Passed to ``_get_organism_record``; a non-``None`` result
            is forwarded to the lookup and creation helpers.
        source: Forwarded to the lookup and creation helpers when not ``None``.
        mute: If ``True``, no success or warning messages are logged.

    Returns:
        The existing records, followed by any records created from the
        public source. Values that match neither produce no record and are
        reported in a warning.
    """
    registry = field.field.model
    if create:
        return [registry(**{field.field.name: value}) for value in iterable]
    creation_search_names = settings.creation.search_names
    organism = _get_organism_record(field, organism)
    kwargs: dict = {}
    if organism is not None:
        kwargs["organism"] = organism
    if source is not None:
        kwargs["source"] = source
    # `search_names` is disabled for the duration of this call; the previous
    # value is restored in the `finally` clause below
    settings.creation.search_names = False
    try:
        iterable_idx = index_iterable(iterable)

        # returns existing records & non-existing values
        records, nonexist_values, msg = get_existing_records(
            iterable_idx=iterable_idx, field=field, mute=mute, **kwargs
        )

        # new records to be created based on new values
        if len(nonexist_values) > 0:
            source_record = None
            if from_source:
                if isinstance(source, Record):
                    source_record = source
                elif (
                    len(records) > 0
                    and hasattr(records[0], "source_id")
                    and records[0].source_id
                ):
                    source_record = records[0].source
            if not source_record and hasattr(registry, "public"):
                from bionty._bionty import get_source_record

                source_record = get_source_record(
                    registry.public(organism=organism), registry
                )
            if source_record:
                from bionty.core._add_ontology import check_source_in_db

                check_source_in_db(
                    registry=registry,
                    source=source_record,
                    update=True,
                )

                from_source = not source_record.in_db
            elif hasattr(registry, "source_id"):
                from_source = True
            else:
                from_source = False

            if from_source:
                records_bionty, unmapped_values = create_records_from_source(
                    iterable_idx=nonexist_values,
                    field=field,
                    msg=msg,
                    mute=mute,
                    **kwargs,
                )
                if len(records_bionty) > 0:
                    # the "loaded" message is handed to the helper above, so
                    # clear it here to avoid logging it a second time
                    msg = ""
                    for record in records_bionty:
                        record._from_source = True
                records += records_bionty
            else:
                unmapped_values = nonexist_values
            # unmapped new_ids will NOT create records
            if len(unmapped_values) > 0:
                if len(msg) > 0 and not mute:
                    logger.success(msg)
                s = "" if len(unmapped_values) == 1 else "s"
                print_values = colors.yellow(_print_values(unmapped_values))
                name = registry.__name__
                n_nonval = colors.yellow(f"{len(unmapped_values)} non-validated")
                if not mute:
                    logger.warning(
                        f"{colors.red('did not create')} {name} record{s} for "
                        f"{n_nonval} {colors.italic(f'{field.field.name}{s}')}: {print_values}"
                    )
        if registry.__get_schema_name__() == "bionty" or registry == ULabel:
            # a string-named pandas Series provides the default feature name
            feature_name = iterable.name if isinstance(iterable, pd.Series) else None
            if isinstance(feature_name, str):
                for record in records:
                    record._feature = feature_name
                logger.debug(f"added default feature '{feature_name}'")
        return records
    finally:
        settings.creation.search_names = creation_search_names
|
123
|
+
|
124
|
+
|
125
|
+
def get_existing_records(
    iterable_idx: pd.Index,
    field: StrField,
    mute: bool = False,
    **kwargs,
) -> tuple[list, pd.Index, str]:
    """Fetch the records that already exist in the registry for the given values.

    The values are inspected with ``model.inspect``; values reported in its
    ``synonyms_mapper`` are renamed before the registry is filtered on
    ``<field>__in``.

    Args:
        iterable_idx: Values to look up.
        field: Registry field to match against; the registry is
            ``field.field.model``.
        mute: If ``True``, no success messages are logged.
        **kwargs: Extra filter conditions. ``organism`` and ``source`` are
            also passed to ``model.inspect``; ``source`` is excluded from
            the filter itself.

    Returns:
        A tuple ``(records, nonexist_values, msg)``: the queried records, the
        values without a matching record, and the "loaded ..." message for
        validated values. ``msg`` is ``""`` when nothing validated or when
        it was already logged here together with the synonyms message.
    """
    model = field.field.model
    field_name = field.field.name
    # existing records matching is agnostic to the bionty source
    condition: dict = {
        key: value for key, value in kwargs.items() if key != "source"
    }

    # standardize based on the DB reference
    # log synonyms mapped terms
    result = model.inspect(
        iterable_idx,
        field=field,
        organism=kwargs.get("organism"),
        source=kwargs.get("source"),
        mute=True,
    )
    syn_mapper = result.synonyms_mapper

    syn_msg = ""
    if len(syn_mapper) > 0:
        s = "" if len(syn_mapper) == 1 else "s"
        names = list(syn_mapper.keys())
        print_values = colors.green(_print_values(names))
        syn_msg = (
            "loaded"
            f" {colors.green(f'{len(syn_mapper)} {model.__name__} record{s}')}"
            f" matching {colors.italic('synonyms')}: {print_values}"
        )
        iterable_idx = iterable_idx.to_frame().rename(index=syn_mapper).index

    # get all existing records in the db; kwargs is used to deal with organism
    condition[f"{field_name}__in"] = iterable_idx.values
    # the records come back in database order; enforcing the input order via
    # ORDER BY was measured to cost a factor 10 in runtime
    records = model.filter(**condition).list()

    # log validated terms
    validated = result.validated
    msg = ""
    if len(validated) > 0:
        s = "" if len(validated) == 1 else "s"
        print_values = colors.green(_print_values(validated))
        msg = (
            "loaded"
            f" {colors.green(f'{len(validated)} {model.__name__} record{s}')}"
            f" matching {colors.italic(f'{field.field.name}')}: {print_values}"
        )

    # no logging if all values are validated
    # logs if there are synonyms
    if len(syn_msg) > 0:
        if len(msg) > 0 and not mute:
            logger.success(msg)
        if not mute:
            logger.success(syn_msg)
        msg = ""

    # read the matched values off the fetched records instead of evaluating
    # the query a second time
    existing_values = iterable_idx.intersection(
        [getattr(record, field_name) for record in records]
    )
    nonexist_values = iterable_idx.difference(existing_values)

    return records, nonexist_values, msg
|
206
|
+
|
207
|
+
|
208
|
+
def create_records_from_source(
    iterable_idx: pd.Index,
    field: StrField,
    msg: str = "",
    mute: bool = False,
    **kwargs,
):
    """Build registry records for values found in the public (Bionty) reference.

    Args:
        iterable_idx: Values for which no record exists yet.
        field: Registry field the values belong to; the registry is
            ``field.field.model``.
        msg: Pending success message about existing records; it is logged
            here right before the "created ..." message.
        mute: If ``True``, nothing is logged.
        **kwargs: ``organism`` and ``source`` select the public ontology; all
            entries are also passed to the model constructor.

    Returns:
        A tuple ``(records, unmapped_values)``: the newly constructed records
        and the values not found in the reference. If the public ontology
        cannot be obtained, returns ``([], iterable_idx)``.
    """
    # populate additional fields from bionty
    from bionty._bionty import get_source_record
    from bionty.core._bionty import filter_bionty_df_columns

    model = field.field.model
    field_name = field.field.name
    records: list = []

    # create the corresponding bionty object from model
    try:
        # TODO: more generic
        organism = kwargs.get("organism")
        if field_name == "ensembl_gene_id":
            # the organism is guessed from the prefix of the first id only
            first_value = iterable_idx[0]
            if first_value.startswith("ENSG"):
                organism = "human"
            elif first_value.startswith("ENSMUSG"):
                organism = "mouse"
        public_ontology = model.public(organism=organism, source=kwargs.get("source"))
    except Exception:
        # for custom records that are not created from public sources
        return records, iterable_idx

    # add source record to the kwargs
    kwargs["source"] = get_source_record(public_ontology, model)

    # filter the columns in bionty df based on fields
    bionty_df = filter_bionty_df_columns(model=model, public_ontology=public_ontology)

    # standardize in the bionty reference
    result = public_ontology.inspect(iterable_idx, field=field_name, mute=True)
    syn_mapper = result.synonyms_mapper

    msg_syn: str = ""
    if len(syn_mapper) > 0:
        plural = "" if len(syn_mapper) == 1 else "s"
        synonym_values = colors.purple(_print_values(list(syn_mapper.keys())))
        created_label = colors.purple(
            f"{len(syn_mapper)} {model.__name__} record{plural} from Bionty"
        )
        msg_syn = (
            f"created {created_label}"
            f" matching {colors.italic('synonyms')}: {synonym_values}"
        )

        iterable_idx = iterable_idx.to_frame().rename(index=syn_mapper).index

    # create records for values that are found in the bionty reference
    # matching either field or synonyms
    mapped_values = iterable_idx.intersection(bionty_df[field_name])

    multi_msg = ""
    if len(mapped_values) > 0:
        bionty_kwargs, multi_msg = _bulk_create_dicts_from_df(
            keys=mapped_values, column_name=field_name, df=bionty_df
        )
        organism_kwargs = {}
        if "organism" not in kwargs:
            organism_record = _get_organism_record(
                field, public_ontology.organism, force=True
            )
            if organism_record is not None:
                organism_kwargs["organism"] = organism_record
        records.extend(
            model(**row, **kwargs, **organism_kwargs) for row in bionty_kwargs
        )

        # number of records that matches field (not synonyms)
        validated = result.validated
        if len(validated) > 0:
            plural = "" if len(validated) == 1 else "s"
            validated_values = colors.purple(_print_values(validated))
            if not mute:
                # this is the success msg for existing records in the DB
                if len(msg) > 0:
                    logger.success(msg)
                created_label = colors.purple(
                    f"{len(validated)} {model.__name__} record{plural} from Bionty"
                )
                logger.success(
                    f"created {created_label}"
                    f" matching {colors.italic(f'{field.field.name}')}: {validated_values}"
                )

    if not mute:
        # make sure that synonyms logging appears after the field logging
        if len(msg_syn) > 0:
            logger.success(msg_syn)
        # warning about multi matches
        if len(multi_msg) > 0:
            logger.warning(multi_msg)

    # return the values that are not found in the bionty reference
    return records, iterable_idx.difference(mapped_values)
|
302
|
+
|
303
|
+
|
304
|
+
def index_iterable(iterable: Iterable) -> pd.Index:
    """Return the unique values of ``iterable`` as a ``pd.Index``.

    Empty strings and null values (``None``, ``NaN``) are dropped; the
    remaining values keep their order of first appearance.
    """
    unique_idx = pd.Index(iterable).unique()
    # no entries are made for '', None or NA
    keep = (unique_idx != "") & unique_idx.notna()
    return unique_idx[keep]
|
309
|
+
|
310
|
+
|
311
|
+
def _print_values(names: Iterable, n: int = 20, quotes: bool = True) -> str:
|
312
|
+
if isinstance(names, dict):
|
313
|
+
items = {
|
314
|
+
f"{key}: {value}": None
|
315
|
+
for key, value in names.items()
|
316
|
+
if key != "None" and value != "None"
|
317
|
+
}
|
318
|
+
else:
|
319
|
+
# Use a dictionary instead of a list to have unique values and preserve order
|
320
|
+
items = {str(name): None for name in names if name != "None"}
|
321
|
+
|
322
|
+
unique_items = list(items.keys())
|
323
|
+
|
324
|
+
if quotes:
|
325
|
+
unique_items = [f"'{item}'" for item in unique_items]
|
326
|
+
|
327
|
+
print_values = ", ".join(unique_items[:n])
|
328
|
+
|
329
|
+
if len(unique_items) > n:
|
330
|
+
print_values += ", ..."
|
331
|
+
|
332
|
+
return print_values
|
333
|
+
|
334
|
+
|
335
|
+
def _bulk_create_dicts_from_df(
|
336
|
+
keys: set | list, column_name: str, df: pd.DataFrame
|
337
|
+
) -> tuple[dict, str]:
|
338
|
+
"""Get fields from a DataFrame for many rows."""
|
339
|
+
multi_msg = ""
|
340
|
+
if df.index.name != column_name:
|
341
|
+
df = df.set_index(column_name).loc[list(keys)]
|
342
|
+
if not df.index.is_unique:
|
343
|
+
# return all records for multi-matches with a warning
|
344
|
+
dup = df.index[df.index.duplicated()].unique().tolist()
|
345
|
+
if len(dup) > 0:
|
346
|
+
s = "" if len(dup) == 1 else "s"
|
347
|
+
print_values = _print_values(dup)
|
348
|
+
multi_msg = (
|
349
|
+
f"ambiguous validation in Bionty for {len(dup)} record{s}:"
|
350
|
+
f" {print_values}"
|
351
|
+
)
|
352
|
+
|
353
|
+
return df.reset_index().to_dict(orient="records"), multi_msg
|
354
|
+
|
355
|
+
|
356
|
+
def _has_organism_field(registry: type[Record]) -> bool:
|
357
|
+
try:
|
358
|
+
registry._meta.get_field("organism")
|
359
|
+
return True
|
360
|
+
except FieldDoesNotExist:
|
361
|
+
return False
|
362
|
+
|
363
|
+
|
364
|
+
def _get_organism_record(
    field: StrField, organism: str | Record | None, force: bool = False
) -> Record | None:
    """Resolve the organism record to use with ``field``'s registry.

    Args:
        field: Registry field; the registry is ``field.field.model``.
        organism: Organism name or record, passed on to
            ``create_or_get_organism_record``.
        force: If ``True``, the ontology-id-field exemption described below
            is not applied.

    Returns:
        The organism record, or ``None`` when the registry has no
        ``organism`` field, when the lookup is exempt, or when
        ``create_or_get_organism_record`` yields ``None``.
    """
    registry = field.field.model
    check = True
    if not force and hasattr(registry, "_ontology_id_field"):
        ontology_id_field = registry._ontology_id_field
        # no organism is needed when matching on the ontology id field itself,
        # unless that field is not an id,
        # e.g. bionty.CellMarker has "name" as _ontology_id_field
        check = (
            field.field.name != ontology_id_field
            or not ontology_id_field.endswith("id")
        )

    if not (_has_organism_field(registry) and check):
        return None

    from bionty._bionty import create_or_get_organism_record

    return create_or_get_organism_record(organism=organism, registry=registry)
|