lamindb 1.2a2__py3-none-any.whl → 1.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +3 -1
- lamindb/_view.py +2 -2
- lamindb/base/types.py +50 -11
- lamindb/core/_compat.py +60 -0
- lamindb/core/_context.py +15 -12
- lamindb/core/datasets/__init__.py +1 -0
- lamindb/core/datasets/_core.py +23 -0
- lamindb/core/datasets/_small.py +16 -2
- lamindb/core/loaders.py +22 -12
- lamindb/core/storage/_tiledbsoma.py +2 -2
- lamindb/core/storage/_zarr.py +84 -26
- lamindb/core/storage/objects.py +45 -44
- lamindb/core/types.py +11 -1
- lamindb/curators/__init__.py +1430 -1665
- lamindb/curators/_cellxgene_schemas/__init__.py +190 -18
- lamindb/curators/_cellxgene_schemas/schema_versions.csv +43 -0
- lamindb/models/_feature_manager.py +86 -42
- lamindb/models/_from_values.py +110 -119
- lamindb/models/_label_manager.py +17 -10
- lamindb/models/artifact.py +170 -102
- lamindb/models/can_curate.py +200 -231
- lamindb/models/feature.py +76 -47
- lamindb/models/project.py +69 -7
- lamindb/models/query_set.py +12 -2
- lamindb/models/record.py +77 -50
- lamindb/models/run.py +20 -7
- lamindb/models/schema.py +7 -15
- {lamindb-1.2a2.dist-info → lamindb-1.3.1.dist-info}/METADATA +8 -7
- {lamindb-1.2a2.dist-info → lamindb-1.3.1.dist-info}/RECORD +31 -30
- lamindb/curators/_cellxgene_schemas/schema_versions.yml +0 -104
- {lamindb-1.2a2.dist-info → lamindb-1.3.1.dist-info}/LICENSE +0 -0
- {lamindb-1.2a2.dist-info → lamindb-1.3.1.dist-info}/WHEEL +0 -0
lamindb/models/can_curate.py
CHANGED
@@ -5,14 +5,18 @@ from typing import TYPE_CHECKING, Iterable, Literal, Union
|
|
5
5
|
import numpy as np
|
6
6
|
import pandas as pd
|
7
7
|
from django.core.exceptions import FieldDoesNotExist
|
8
|
+
from django.db.models import Manager, QuerySet
|
8
9
|
from lamin_utils import colors, logger
|
9
10
|
|
10
11
|
from ..errors import ValidationError
|
11
|
-
from ._from_values import
|
12
|
-
|
12
|
+
from ._from_values import (
|
13
|
+
_format_values,
|
14
|
+
_from_values,
|
15
|
+
get_organism_record_from_field,
|
16
|
+
)
|
17
|
+
from .record import Record, get_name_field
|
13
18
|
|
14
19
|
if TYPE_CHECKING:
|
15
|
-
from django.db.models import QuerySet
|
16
20
|
from lamin_utils._inspect import InspectResult
|
17
21
|
|
18
22
|
from lamindb.base.types import ListLike, StrField
|
@@ -20,27 +24,20 @@ if TYPE_CHECKING:
|
|
20
24
|
from .query_set import RecordList
|
21
25
|
|
22
26
|
|
23
|
-
def
|
24
|
-
"""Check if the
|
25
|
-
if
|
26
|
-
if source._state.db != using_key:
|
27
|
-
raise ValueError(
|
28
|
-
f"source must be a bionty.Source record from instance '{using_key}'!"
|
29
|
-
)
|
30
|
-
|
31
|
-
|
32
|
-
def _check_organism_db(organism: str | Record | None, using_key: str | None):
|
33
|
-
"""Check if the organism is from the DB."""
|
34
|
-
if isinstance(organism, Record):
|
27
|
+
def _check_if_record_in_db(record: str | Record | None, using_key: str | None):
|
28
|
+
"""Check if the record is from the using_key DB."""
|
29
|
+
if isinstance(record, Record):
|
35
30
|
if using_key is not None and using_key != "default":
|
36
|
-
if
|
31
|
+
if record._state.db != using_key:
|
37
32
|
raise ValueError(
|
38
|
-
f"
|
33
|
+
f"record must be a {record.__class__.__get_name_with_module__()} record from instance '{using_key}'!"
|
39
34
|
)
|
40
35
|
|
41
36
|
|
42
|
-
def _concat_lists(values: ListLike) -> list[str]:
|
37
|
+
def _concat_lists(values: ListLike | str) -> list[str]:
|
43
38
|
"""Concatenate a list of lists of strings into a single list."""
|
39
|
+
if isinstance(values, str):
|
40
|
+
values = [values]
|
44
41
|
if isinstance(values, (list, pd.Series)) and len(values) > 0:
|
45
42
|
first_item = values[0] if isinstance(values, list) else values.iloc[0]
|
46
43
|
if isinstance(first_item, list):
|
@@ -55,10 +52,9 @@ def _concat_lists(values: ListLike) -> list[str]:
|
|
55
52
|
def _inspect(
|
56
53
|
cls,
|
57
54
|
values: ListLike,
|
58
|
-
field:
|
55
|
+
field: StrField | None = None,
|
59
56
|
*,
|
60
57
|
mute: bool = False,
|
61
|
-
using_key: str | None = None,
|
62
58
|
organism: str | Record | None = None,
|
63
59
|
source: Record | None = None,
|
64
60
|
strict_source: bool = False,
|
@@ -66,78 +62,81 @@ def _inspect(
|
|
66
62
|
"""{}""" # noqa: D415
|
67
63
|
from lamin_utils._inspect import inspect
|
68
64
|
|
69
|
-
if isinstance(values, str):
|
70
|
-
values = [values]
|
71
65
|
values = _concat_lists(values)
|
72
66
|
|
73
|
-
|
74
|
-
queryset =
|
75
|
-
|
67
|
+
field_str = get_name_field(cls, field=field)
|
68
|
+
queryset = cls.all() if isinstance(cls, (QuerySet, Manager)) else cls.objects.all()
|
69
|
+
registry = queryset.model
|
70
|
+
model_name = registry._meta.model.__name__
|
76
71
|
if isinstance(source, Record):
|
77
|
-
|
72
|
+
_check_if_record_in_db(source, queryset.db)
|
78
73
|
# if strict_source mode, restrict the query to the passed ontology source
|
79
74
|
# otherwise, inspect across records present in the DB from all ontology sources and no-source
|
80
75
|
if strict_source:
|
81
76
|
queryset = queryset.filter(source=source)
|
82
|
-
|
83
|
-
|
84
|
-
|
77
|
+
organism_record = get_organism_record_from_field(
|
78
|
+
getattr(registry, field_str), organism, values, queryset.db
|
79
|
+
)
|
80
|
+
_check_if_record_in_db(organism_record, queryset.db)
|
85
81
|
|
86
82
|
# do not inspect synonyms if the field is not name field
|
87
83
|
inspect_synonyms = True
|
88
|
-
if hasattr(registry, "_name_field") and
|
84
|
+
if hasattr(registry, "_name_field") and field_str != registry._name_field:
|
89
85
|
inspect_synonyms = False
|
90
86
|
|
91
87
|
# inspect in the DB
|
92
88
|
result_db = inspect(
|
93
|
-
df=
|
94
|
-
queryset=queryset, field=field, organism=organism
|
95
|
-
),
|
89
|
+
df=_filter_queryset_with_organism(queryset=queryset, organism=organism_record),
|
96
90
|
identifiers=values,
|
97
|
-
field=
|
91
|
+
field=field_str,
|
98
92
|
mute=mute,
|
99
93
|
inspect_synonyms=inspect_synonyms,
|
100
94
|
)
|
101
95
|
nonval = set(result_db.non_validated).difference(result_db.synonyms_mapper.keys())
|
102
96
|
|
103
|
-
if len(nonval) > 0 and registry
|
97
|
+
if len(nonval) > 0 and hasattr(registry, "source_id"):
|
104
98
|
try:
|
105
|
-
|
106
|
-
|
99
|
+
public_result = registry.public(
|
100
|
+
organism=organism_record, source=source
|
101
|
+
).inspect(
|
102
|
+
values=nonval,
|
103
|
+
field=field_str,
|
104
|
+
mute=True,
|
105
|
+
inspect_synonyms=inspect_synonyms,
|
107
106
|
)
|
108
|
-
|
109
|
-
|
107
|
+
public_validated = public_result.validated
|
108
|
+
public_mapper = public_result.synonyms_mapper
|
110
109
|
hint = False
|
111
|
-
if len(
|
112
|
-
print_values = _format_values(
|
113
|
-
s = "" if len(
|
114
|
-
labels = colors.yellow(f"{len(
|
110
|
+
if len(public_validated) > 0 and not mute:
|
111
|
+
print_values = _format_values(public_validated)
|
112
|
+
s = "" if len(public_validated) == 1 else "s"
|
113
|
+
labels = colors.yellow(f"{len(public_validated)} {model_name} term{s}")
|
115
114
|
logger.print(
|
116
|
-
f" detected {labels} in
|
117
|
-
f" {colors.italic(
|
115
|
+
f" detected {labels} in public source for"
|
116
|
+
f" {colors.italic(field_str)}: {colors.yellow(print_values)}"
|
118
117
|
)
|
119
118
|
hint = True
|
120
119
|
|
121
|
-
if len(
|
122
|
-
print_values = _format_values(list(
|
123
|
-
s = "" if len(
|
124
|
-
labels = colors.yellow(f"{len(
|
120
|
+
if len(public_mapper) > 0 and not mute:
|
121
|
+
print_values = _format_values(list(public_mapper.keys()))
|
122
|
+
s = "" if len(public_mapper) == 1 else "s"
|
123
|
+
labels = colors.yellow(f"{len(public_mapper)} {model_name} term{s}")
|
125
124
|
logger.print(
|
126
|
-
f" detected {labels} in
|
125
|
+
f" detected {labels} in public source as {colors.italic(f'synonym{s}')}:"
|
127
126
|
f" {colors.yellow(print_values)}"
|
128
127
|
)
|
129
128
|
hint = True
|
130
129
|
|
131
130
|
if hint:
|
132
131
|
logger.print(
|
133
|
-
f"→ add records from
|
132
|
+
f"→ add records from public source to your {model_name} registry via"
|
134
133
|
f" {colors.italic('.from_values()')}"
|
135
134
|
)
|
136
135
|
|
137
|
-
nonval = [i for i in
|
138
|
-
# no
|
136
|
+
nonval = [i for i in public_result.non_validated if i not in public_mapper] # type: ignore
|
137
|
+
# no public source is found
|
139
138
|
except ValueError:
|
140
|
-
logger.warning("no
|
139
|
+
logger.warning("no public source found, skipping source validation")
|
141
140
|
|
142
141
|
if len(nonval) > 0 and not mute:
|
143
142
|
print_values = _format_values(list(nonval))
|
@@ -155,10 +154,9 @@ def _inspect(
|
|
155
154
|
def _validate(
|
156
155
|
cls,
|
157
156
|
values: ListLike,
|
158
|
-
field:
|
157
|
+
field: StrField | None = None,
|
159
158
|
*,
|
160
159
|
mute: bool = False,
|
161
|
-
using_key: str | None = None,
|
162
160
|
organism: str | Record | None = None,
|
163
161
|
source: Record | None = None,
|
164
162
|
strict_source: bool = False,
|
@@ -167,25 +165,26 @@ def _validate(
|
|
167
165
|
from lamin_utils._inspect import validate
|
168
166
|
|
169
167
|
return_str = True if isinstance(values, str) else False
|
170
|
-
if isinstance(values, str):
|
171
|
-
values = [values]
|
172
168
|
values = _concat_lists(values)
|
173
169
|
|
174
|
-
|
170
|
+
field_str = get_name_field(cls, field=field)
|
175
171
|
|
176
|
-
queryset =
|
177
|
-
|
172
|
+
queryset = cls.all() if isinstance(cls, (QuerySet, Manager)) else cls.objects.all()
|
173
|
+
registry = queryset.model
|
178
174
|
if isinstance(source, Record):
|
179
|
-
|
175
|
+
_check_if_record_in_db(source, queryset.db)
|
180
176
|
if strict_source:
|
181
177
|
queryset = queryset.filter(source=source)
|
182
|
-
|
178
|
+
|
179
|
+
organism_record = get_organism_record_from_field(
|
180
|
+
getattr(registry, field_str), organism, values, queryset.db
|
181
|
+
)
|
182
|
+
_check_if_record_in_db(organism_record, queryset.db)
|
183
183
|
field_values = pd.Series(
|
184
|
-
|
184
|
+
_filter_queryset_with_organism(
|
185
185
|
queryset=queryset,
|
186
|
-
|
187
|
-
|
188
|
-
values_list_field=field,
|
186
|
+
organism=organism_record,
|
187
|
+
values_list_field=field_str,
|
189
188
|
),
|
190
189
|
dtype="object",
|
191
190
|
)
|
@@ -204,7 +203,7 @@ def _validate(
|
|
204
203
|
field_values=field_values,
|
205
204
|
case_sensitive=True,
|
206
205
|
mute=mute,
|
207
|
-
field=
|
206
|
+
field=field_str,
|
208
207
|
)
|
209
208
|
if return_str and len(result) == 1:
|
210
209
|
return result[0]
|
@@ -215,16 +214,15 @@ def _validate(
|
|
215
214
|
def _standardize(
|
216
215
|
cls,
|
217
216
|
values: ListLike,
|
218
|
-
field:
|
217
|
+
field: StrField | None = None,
|
219
218
|
*,
|
220
219
|
return_field: str = None,
|
221
220
|
return_mapper: bool = False,
|
222
221
|
case_sensitive: bool = False,
|
223
222
|
mute: bool = False,
|
224
|
-
|
223
|
+
source_aware: bool = True,
|
225
224
|
keep: Literal["first", "last", False] = "first",
|
226
225
|
synonyms_field: str = "synonyms",
|
227
|
-
using_key: str | None = None,
|
228
226
|
organism: str | Record | None = None,
|
229
227
|
source: Record | None = None,
|
230
228
|
strict_source: bool = False,
|
@@ -233,59 +231,45 @@ def _standardize(
|
|
233
231
|
from lamin_utils._standardize import standardize as map_synonyms
|
234
232
|
|
235
233
|
return_str = True if isinstance(values, str) else False
|
236
|
-
if isinstance(values, str):
|
237
|
-
values = [values]
|
238
234
|
values = _concat_lists(values)
|
239
235
|
|
240
|
-
|
241
|
-
|
236
|
+
field_str = get_name_field(cls, field=field)
|
237
|
+
return_field_str = get_name_field(
|
242
238
|
cls, field=field if return_field is None else return_field
|
243
239
|
)
|
244
|
-
queryset =
|
245
|
-
|
240
|
+
queryset = cls.all() if isinstance(cls, (QuerySet, Manager)) else cls.objects.all()
|
241
|
+
registry = queryset.model
|
246
242
|
if isinstance(source, Record):
|
247
|
-
|
243
|
+
_check_if_record_in_db(source, queryset.db)
|
248
244
|
if strict_source:
|
249
245
|
queryset = queryset.filter(source=source)
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
# here, we can safely import bionty
|
255
|
-
from bionty._bionty import create_or_get_organism_record
|
256
|
-
|
257
|
-
organism_record = create_or_get_organism_record(
|
258
|
-
organism=organism, registry=registry, field=field
|
259
|
-
)
|
260
|
-
organism = (
|
261
|
-
organism_record.name if organism_record is not None else organism_record
|
262
|
-
)
|
263
|
-
else:
|
264
|
-
organism = None
|
246
|
+
organism_record = get_organism_record_from_field(
|
247
|
+
getattr(registry, field_str), organism, values, queryset.db
|
248
|
+
)
|
249
|
+
_check_if_record_in_db(organism_record, queryset.db)
|
265
250
|
|
266
251
|
# only perform synonym mapping if field is the name field
|
267
|
-
if hasattr(registry, "_name_field") and
|
252
|
+
if hasattr(registry, "_name_field") and field_str != registry._name_field:
|
268
253
|
synonyms_field = None
|
269
254
|
|
270
255
|
try:
|
271
256
|
registry._meta.get_field(synonyms_field)
|
272
257
|
fields = {
|
273
258
|
field_name
|
274
|
-
for field_name in [
|
259
|
+
for field_name in [field_str, return_field_str, synonyms_field]
|
275
260
|
if field_name is not None
|
276
261
|
}
|
277
|
-
df =
|
262
|
+
df = _filter_queryset_with_organism(
|
278
263
|
queryset=queryset,
|
279
|
-
|
280
|
-
|
281
|
-
fields=list(fields),
|
264
|
+
organism=organism_record,
|
265
|
+
values_list_fields=list(fields),
|
282
266
|
)
|
283
267
|
except FieldDoesNotExist:
|
284
268
|
df = pd.DataFrame()
|
285
269
|
|
286
270
|
_kwargs = {
|
287
|
-
"field":
|
288
|
-
"return_field":
|
271
|
+
"field": field_str,
|
272
|
+
"return_field": return_field_str,
|
289
273
|
"case_sensitive": case_sensitive,
|
290
274
|
"keep": keep,
|
291
275
|
"synonyms_field": synonyms_field,
|
@@ -307,8 +291,8 @@ def _standardize(
|
|
307
291
|
return result[0]
|
308
292
|
return result
|
309
293
|
|
310
|
-
# map synonyms in
|
311
|
-
if registry
|
294
|
+
# map synonyms in public source
|
295
|
+
if hasattr(registry, "source_id") and source_aware:
|
312
296
|
mapper = {}
|
313
297
|
if return_mapper:
|
314
298
|
mapper = std_names_db
|
@@ -317,19 +301,19 @@ def _standardize(
|
|
317
301
|
)
|
318
302
|
|
319
303
|
val_res = registry.validate(
|
320
|
-
std_names_db, field=field, mute=True, organism=
|
304
|
+
std_names_db, field=field, mute=True, organism=organism_record
|
321
305
|
)
|
322
306
|
if all(val_res):
|
323
307
|
return _return(result=std_names_db, mapper=mapper)
|
324
308
|
|
325
309
|
nonval = np.array(std_names_db)[~val_res]
|
326
|
-
std_names_bt_mapper = registry.public(organism=
|
310
|
+
std_names_bt_mapper = registry.public(organism=organism_record).standardize(
|
327
311
|
nonval, return_mapper=True, mute=True, **_kwargs
|
328
312
|
)
|
329
313
|
|
330
314
|
if len(std_names_bt_mapper) > 0 and not mute:
|
331
315
|
s = "" if len(std_names_bt_mapper) == 1 else "s"
|
332
|
-
field_print = "synonym" if
|
316
|
+
field_print = "synonym" if field_str == return_field_str else field_str
|
333
317
|
|
334
318
|
reduced_mapped_keys_str = f"{list(std_names_bt_mapper.keys())[:10] + ['...'] if len(std_names_bt_mapper) > 10 else list(std_names_bt_mapper.keys())}"
|
335
319
|
truncated_note = (
|
@@ -337,7 +321,7 @@ def _standardize(
|
|
337
321
|
)
|
338
322
|
|
339
323
|
warn_msg = (
|
340
|
-
f"found {len(std_names_bt_mapper)} {field_print}{s} in
|
324
|
+
f"found {len(std_names_bt_mapper)} {field_print}{s} in public source{truncated_note}:"
|
341
325
|
f" {reduced_mapped_keys_str}\n"
|
342
326
|
f" please add corresponding {registry._meta.model.__name__} records via{truncated_note}:"
|
343
327
|
f" `.from_values({reduced_mapped_keys_str})`"
|
@@ -437,57 +421,36 @@ def _add_or_remove_synonyms(
|
|
437
421
|
|
438
422
|
|
439
423
|
def _check_synonyms_field_exist(record: CanCurate):
|
440
|
-
|
441
|
-
|
442
|
-
except AttributeError:
|
424
|
+
"""Check if synonyms field exists."""
|
425
|
+
if not hasattr(record, "synonyms"):
|
443
426
|
raise NotImplementedError(
|
444
427
|
f"No synonyms field found in table {record.__class__.__name__}!"
|
445
428
|
) from None
|
446
429
|
|
447
430
|
|
448
|
-
def
|
431
|
+
def _filter_queryset_with_organism(
|
449
432
|
queryset: QuerySet,
|
450
|
-
|
451
|
-
organism: str | Record | None = None,
|
433
|
+
organism: Record | None = None,
|
452
434
|
values_list_field: str | None = None,
|
453
|
-
|
435
|
+
values_list_fields: list[str] | None = None,
|
454
436
|
):
|
455
437
|
"""Filter a queryset based on organism."""
|
456
438
|
import pandas as pd
|
457
439
|
|
458
|
-
|
459
|
-
|
460
|
-
if _has_organism_field(registry) and not _field_is_id(field, registry):
|
461
|
-
# here, we can safely import bionty
|
462
|
-
from bionty._bionty import create_or_get_organism_record
|
463
|
-
|
464
|
-
organism_record = create_or_get_organism_record(
|
465
|
-
organism=organism, registry=registry, field=field
|
466
|
-
)
|
467
|
-
if organism_record is not None:
|
468
|
-
queryset = queryset.filter(organism__name=organism_record.name)
|
440
|
+
if organism is not None:
|
441
|
+
queryset = queryset.filter(organism=organism)
|
469
442
|
|
443
|
+
# values_list_field/s for better performance
|
470
444
|
if values_list_field is None:
|
471
|
-
if
|
445
|
+
if values_list_fields:
|
472
446
|
return pd.DataFrame.from_records(
|
473
|
-
queryset.values_list(*
|
447
|
+
queryset.values_list(*values_list_fields), columns=values_list_fields
|
474
448
|
)
|
475
449
|
return pd.DataFrame.from_records(queryset.values())
|
476
|
-
|
477
450
|
else:
|
478
451
|
return queryset.values_list(values_list_field, flat=True)
|
479
452
|
|
480
453
|
|
481
|
-
def _field_is_id(field: str, registry: type[Record]) -> bool:
|
482
|
-
"""Check if the field is an ontology ID."""
|
483
|
-
if hasattr(registry, "_ontology_id_field"):
|
484
|
-
if field == registry._ontology_id_field:
|
485
|
-
return True
|
486
|
-
if field.endswith("id"):
|
487
|
-
return True
|
488
|
-
return False
|
489
|
-
|
490
|
-
|
491
454
|
class CanCurate:
|
492
455
|
"""Base class providing :class:`~lamindb.models.Record`-based validation."""
|
493
456
|
|
@@ -495,7 +458,7 @@ class CanCurate:
|
|
495
458
|
def inspect(
|
496
459
|
cls,
|
497
460
|
values: ListLike,
|
498
|
-
field:
|
461
|
+
field: StrField | None = None,
|
499
462
|
*,
|
500
463
|
mute: bool = False,
|
501
464
|
organism: Union[str, Record, None] = None,
|
@@ -517,21 +480,23 @@ class CanCurate:
|
|
517
480
|
strict_source: Determines the validation behavior against records in the registry.
|
518
481
|
- If `False`, validation will include all records in the registry, ignoring the specified source.
|
519
482
|
- If `True`, validation will only include records in the registry that are linked to the specified source.
|
520
|
-
Note: this parameter won't affect validation against
|
483
|
+
Note: this parameter won't affect validation against public sources.
|
521
484
|
|
522
485
|
See Also:
|
523
486
|
:meth:`~lamindb.models.CanCurate.validate`
|
524
487
|
|
525
|
-
|
526
|
-
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
488
|
+
Example::
|
489
|
+
|
490
|
+
import bionty as bt
|
491
|
+
|
492
|
+
# save some gene records
|
493
|
+
bt.Gene.from_values(["A1CF", "A1BG", "BRCA2"], field="symbol", organism="human").save()
|
494
|
+
|
495
|
+
# inspect gene symbols
|
496
|
+
gene_symbols = ["A1CF", "A1BG", "FANCD1", "FANCD20"]
|
497
|
+
result = bt.Gene.inspect(gene_symbols, field=bt.Gene.symbol, organism="human")
|
498
|
+
assert result.validated == ["A1CF", "A1BG"]
|
499
|
+
assert result.non_validated == ["FANCD1", "FANCD20"]
|
535
500
|
"""
|
536
501
|
return _inspect(
|
537
502
|
cls=cls,
|
@@ -547,7 +512,7 @@ class CanCurate:
|
|
547
512
|
def validate(
|
548
513
|
cls,
|
549
514
|
values: ListLike,
|
550
|
-
field:
|
515
|
+
field: StrField | None = None,
|
551
516
|
*,
|
552
517
|
mute: bool = False,
|
553
518
|
organism: Union[str, Record, None] = None,
|
@@ -569,7 +534,7 @@ class CanCurate:
|
|
569
534
|
strict_source: Determines the validation behavior against records in the registry.
|
570
535
|
- If `False`, validation will include all records in the registry, ignoring the specified source.
|
571
536
|
- If `True`, validation will only include records in the registry that are linked to the specified source.
|
572
|
-
Note: this parameter won't affect validation against
|
537
|
+
Note: this parameter won't affect validation against public sources.
|
573
538
|
|
574
539
|
Returns:
|
575
540
|
A vector of booleans indicating if an element is validated.
|
@@ -577,13 +542,15 @@ class CanCurate:
|
|
577
542
|
See Also:
|
578
543
|
:meth:`~lamindb.models.CanCurate.inspect`
|
579
544
|
|
580
|
-
|
581
|
-
|
582
|
-
|
583
|
-
|
584
|
-
|
585
|
-
|
586
|
-
|
545
|
+
Example::
|
546
|
+
|
547
|
+
import bionty as bt
|
548
|
+
|
549
|
+
bt.Gene.from_values(["A1CF", "A1BG", "BRCA2"], field="symbol", organism="human").save()
|
550
|
+
|
551
|
+
gene_symbols = ["A1CF", "A1BG", "FANCD1", "FANCD20"]
|
552
|
+
bt.Gene.validate(gene_symbols, field=bt.Gene.symbol, organism="human")
|
553
|
+
#> array([ True, True, False, False])
|
587
554
|
"""
|
588
555
|
return _validate(
|
589
556
|
cls=cls,
|
@@ -622,33 +589,25 @@ class CanCurate:
|
|
622
589
|
Notes:
|
623
590
|
For more info, see tutorial: :doc:`docs:bio-registries`.
|
624
591
|
|
625
|
-
|
592
|
+
Example::
|
626
593
|
|
627
|
-
|
594
|
+
import bionty as bt
|
628
595
|
|
629
|
-
|
630
|
-
|
596
|
+
# Bulk create from non-validated values will log warnings & returns empty list
|
597
|
+
ulabels = ln.ULabel.from_values(["benchmark", "prediction", "test"])
|
598
|
+
assert len(ulabels) == 0
|
631
599
|
|
632
|
-
Bulk create records from validated values returns the corresponding existing records
|
600
|
+
# Bulk create records from validated values returns the corresponding existing records
|
601
|
+
ulabels = ln.ULabel.from_values(["benchmark", "prediction", "test"], create=True).save()
|
602
|
+
assert len(ulabels) == 3
|
633
603
|
|
634
|
-
|
635
|
-
|
636
|
-
>>> assert len(ulabels) == 3
|
637
|
-
|
638
|
-
Bulk create records from public reference:
|
639
|
-
|
640
|
-
>>> import bionty as bt
|
641
|
-
>>> records = bt.CellType.from_values(["T cell", "B cell"], field="name")
|
642
|
-
>>> records
|
604
|
+
# Bulk create records from public reference
|
605
|
+
bt.CellType.from_values(["T cell", "B cell"]).save()
|
643
606
|
"""
|
644
|
-
|
645
|
-
|
646
|
-
field_str = get_name_field(cls, field=field)
|
647
|
-
return get_or_create_records(
|
607
|
+
return _from_values(
|
648
608
|
iterable=values,
|
649
|
-
field=getattr(cls,
|
609
|
+
field=getattr(cls, get_name_field(cls, field=field)),
|
650
610
|
create=create,
|
651
|
-
from_source=from_source,
|
652
611
|
organism=organism,
|
653
612
|
source=source,
|
654
613
|
mute=mute,
|
@@ -658,13 +617,13 @@ class CanCurate:
|
|
658
617
|
def standardize(
|
659
618
|
cls,
|
660
619
|
values: Iterable,
|
661
|
-
field:
|
620
|
+
field: StrField | None = None,
|
662
621
|
*,
|
663
|
-
return_field:
|
622
|
+
return_field: StrField | None = None,
|
664
623
|
return_mapper: bool = False,
|
665
624
|
case_sensitive: bool = False,
|
666
625
|
mute: bool = False,
|
667
|
-
|
626
|
+
source_aware: bool = True,
|
668
627
|
keep: Literal["first", "last", False] = "first",
|
669
628
|
synonyms_field: str = "synonyms",
|
670
629
|
organism: Union[str, Record, None] = None,
|
@@ -680,22 +639,22 @@ class CanCurate:
|
|
680
639
|
return_mapper: If `True`, returns `{input_value: standardized_name}`.
|
681
640
|
case_sensitive: Whether the mapping is case sensitive.
|
682
641
|
mute: Whether to mute logging.
|
683
|
-
|
642
|
+
source_aware: Whether to standardize from public source. Defaults to `True` for BioRecord registries.
|
684
643
|
keep: When a synonym maps to multiple names, determines which duplicates to mark as `pd.DataFrame.duplicated`:
|
685
|
-
|
686
|
-
|
687
|
-
|
644
|
+
- `"first"`: returns the first mapped standardized name
|
645
|
+
- `"last"`: returns the last mapped standardized name
|
646
|
+
- `False`: returns all mapped standardized name.
|
688
647
|
|
689
|
-
|
648
|
+
When `keep` is `False`, the returned list of standardized names will contain nested lists in case of duplicates.
|
690
649
|
|
691
|
-
|
650
|
+
When a field is converted into return_field, keep marks which matches to keep when multiple return_field values map to the same field value.
|
692
651
|
synonyms_field: A field containing the concatenated synonyms.
|
693
652
|
organism: An Organism name or record.
|
694
653
|
source: A `bionty.Source` record that specifies the version to validate against.
|
695
654
|
strict_source: Determines the validation behavior against records in the registry.
|
696
655
|
- If `False`, validation will include all records in the registry, ignoring the specified source.
|
697
656
|
- If `True`, validation will only include records in the registry that are linked to the specified source.
|
698
|
-
Note: this parameter won't affect validation against
|
657
|
+
Note: this parameter won't affect validation against public sources.
|
699
658
|
|
700
659
|
Returns:
|
701
660
|
If `return_mapper` is `False`: a list of standardized names. Otherwise,
|
@@ -708,14 +667,17 @@ class CanCurate:
|
|
708
667
|
:meth:`~lamindb.models.CanCurate.remove_synonym`
|
709
668
|
Remove synonyms.
|
710
669
|
|
711
|
-
|
712
|
-
|
713
|
-
|
714
|
-
|
715
|
-
|
716
|
-
|
717
|
-
|
718
|
-
|
670
|
+
Example::
|
671
|
+
|
672
|
+
import bionty as bt
|
673
|
+
|
674
|
+
# save some gene records
|
675
|
+
bt.Gene.from_values(["A1CF", "A1BG", "BRCA2"], field="symbol", organism="human").save()
|
676
|
+
|
677
|
+
# standardize gene synonyms
|
678
|
+
gene_synonyms = ["A1CF", "A1BG", "FANCD1", "FANCD20"]
|
679
|
+
bt.Gene.standardize(gene_synonyms)
|
680
|
+
#> ['A1CF', 'A1BG', 'BRCA2', 'FANCD20']
|
719
681
|
"""
|
720
682
|
return _standardize(
|
721
683
|
cls=cls,
|
@@ -726,7 +688,7 @@ class CanCurate:
|
|
726
688
|
case_sensitive=case_sensitive,
|
727
689
|
mute=mute,
|
728
690
|
strict_source=strict_source,
|
729
|
-
|
691
|
+
source_aware=source_aware,
|
730
692
|
keep=keep,
|
731
693
|
synonyms_field=synonyms_field,
|
732
694
|
organism=organism,
|
@@ -750,16 +712,19 @@ class CanCurate:
|
|
750
712
|
:meth:`~lamindb.models.CanCurate.remove_synonym`
|
751
713
|
Remove synonyms.
|
752
714
|
|
753
|
-
|
754
|
-
|
755
|
-
|
756
|
-
|
757
|
-
|
758
|
-
|
759
|
-
|
760
|
-
|
761
|
-
|
762
|
-
|
715
|
+
Example::
|
716
|
+
|
717
|
+
import bionty as bt
|
718
|
+
|
719
|
+
# save "T cell" record
|
720
|
+
record = bt.CellType.from_source(name="T cell").save()
|
721
|
+
record.synonyms
|
722
|
+
#> "T-cell|T lymphocyte|T-lymphocyte"
|
723
|
+
|
724
|
+
# add a synonym
|
725
|
+
record.add_synonym("T cells")
|
726
|
+
record.synonyms
|
727
|
+
#> "T cells|T-cell|T-lymphocyte|T lymphocyte"
|
763
728
|
"""
|
764
729
|
_check_synonyms_field_exist(self)
|
765
730
|
_add_or_remove_synonyms(
|
@@ -776,15 +741,19 @@ class CanCurate:
|
|
776
741
|
:meth:`~lamindb.models.CanCurate.add_synonym`
|
777
742
|
Add synonyms
|
778
743
|
|
779
|
-
|
780
|
-
|
781
|
-
|
782
|
-
|
783
|
-
|
784
|
-
|
785
|
-
|
786
|
-
|
787
|
-
|
744
|
+
Example::
|
745
|
+
|
746
|
+
import bionty as bt
|
747
|
+
|
748
|
+
# save "T cell" record
|
749
|
+
record = bt.CellType.from_source(name="T cell").save()
|
750
|
+
record.synonyms
|
751
|
+
#> "T-cell|T lymphocyte|T-lymphocyte"
|
752
|
+
|
753
|
+
# remove a synonym
|
754
|
+
record.remove_synonym("T-cell")
|
755
|
+
record.synonyms
|
756
|
+
#> "T lymphocyte|T-lymphocyte"
|
788
757
|
"""
|
789
758
|
_check_synonyms_field_exist(self)
|
790
759
|
_add_or_remove_synonyms(synonym=synonym, record=self, action="remove")
|
@@ -798,20 +767,20 @@ class CanCurate:
|
|
798
767
|
See Also:
|
799
768
|
:meth:`~lamindb.models.CanCurate.add_synonym`
|
800
769
|
|
801
|
-
|
802
|
-
|
803
|
-
|
804
|
-
|
805
|
-
|
806
|
-
|
807
|
-
|
808
|
-
|
809
|
-
|
810
|
-
|
811
|
-
|
812
|
-
|
813
|
-
|
814
|
-
|
770
|
+
Example::
|
771
|
+
|
772
|
+
import bionty as bt
|
773
|
+
|
774
|
+
# save an experimental factor record
|
775
|
+
scrna = bt.ExperimentalFactor.from_source(name="single-cell RNA sequencing").save()
|
776
|
+
assert scrna.abbr is None
|
777
|
+
assert scrna.synonyms == "single-cell RNA-seq|single-cell transcriptome sequencing|scRNA-seq|single cell RNA sequencing"
|
778
|
+
|
779
|
+
# set abbreviation
|
780
|
+
scrna.set_abbr("scRNA")
|
781
|
+
assert scrna.abbr == "scRNA"
|
782
|
+
# synonyms are updated
|
783
|
+
assert scrna.synonyms == "scRNA|single-cell RNA-seq|single cell RNA sequencing|single-cell transcriptome sequencing|scRNA-seq"
|
815
784
|
"""
|
816
785
|
self.abbr = value
|
817
786
|
|