lamindb 1.2a2__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +3 -1
- lamindb/core/_compat.py +60 -0
- lamindb/core/_context.py +15 -12
- lamindb/core/datasets/__init__.py +1 -0
- lamindb/core/datasets/_core.py +23 -0
- lamindb/core/datasets/_small.py +16 -2
- lamindb/core/loaders.py +22 -12
- lamindb/core/storage/_tiledbsoma.py +2 -2
- lamindb/core/storage/_zarr.py +84 -26
- lamindb/core/storage/objects.py +45 -44
- lamindb/core/types.py +10 -0
- lamindb/curators/__init__.py +1272 -1517
- lamindb/curators/_cellxgene_schemas/__init__.py +190 -18
- lamindb/curators/_cellxgene_schemas/schema_versions.csv +43 -0
- lamindb/models/_feature_manager.py +65 -14
- lamindb/models/_from_values.py +113 -78
- lamindb/models/artifact.py +142 -98
- lamindb/models/can_curate.py +185 -216
- lamindb/models/feature.py +32 -2
- lamindb/models/project.py +69 -7
- lamindb/models/query_set.py +12 -2
- lamindb/models/record.py +48 -25
- lamindb/models/run.py +18 -1
- lamindb/models/schema.py +0 -8
- {lamindb-1.2a2.dist-info → lamindb-1.3.0.dist-info}/METADATA +7 -6
- {lamindb-1.2a2.dist-info → lamindb-1.3.0.dist-info}/RECORD +28 -27
- lamindb/curators/_cellxgene_schemas/schema_versions.yml +0 -104
- {lamindb-1.2a2.dist-info → lamindb-1.3.0.dist-info}/LICENSE +0 -0
- {lamindb-1.2a2.dist-info → lamindb-1.3.0.dist-info}/WHEEL +0 -0
lamindb/models/_from_values.py
CHANGED
@@ -1,28 +1,27 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
import re
|
3
4
|
from typing import TYPE_CHECKING
|
4
5
|
|
5
6
|
import pandas as pd
|
6
7
|
from django.core.exceptions import FieldDoesNotExist
|
7
8
|
from lamin_utils import colors, logger
|
8
9
|
|
9
|
-
from .record import Record
|
10
|
-
|
11
10
|
if TYPE_CHECKING:
|
12
11
|
from collections.abc import Iterable
|
13
12
|
|
14
|
-
from lamindb.base.types import
|
13
|
+
from lamindb.base.types import FieldAttr, ListLike
|
15
14
|
|
16
15
|
from .query_set import RecordList
|
16
|
+
from .record import Record
|
17
17
|
|
18
18
|
|
19
19
|
# The base function for `from_values`
|
20
|
-
def
|
20
|
+
def _from_values(
|
21
21
|
iterable: ListLike,
|
22
|
-
field:
|
22
|
+
field: FieldAttr,
|
23
23
|
*,
|
24
24
|
create: bool = False,
|
25
|
-
from_source: bool = False,
|
26
25
|
organism: Record | str | None = None,
|
27
26
|
source: Record | None = None,
|
28
27
|
mute: bool = False,
|
@@ -31,47 +30,44 @@ def get_or_create_records(
|
|
31
30
|
from .query_set import RecordList
|
32
31
|
|
33
32
|
registry = field.field.model # type: ignore
|
33
|
+
organism_record = _get_organism_record(field, organism, values=iterable)
|
34
|
+
# TODO: the create is problematic if field is not a name field
|
34
35
|
if create:
|
35
|
-
|
36
|
-
|
36
|
+
create_kwargs = {}
|
37
|
+
if organism_record:
|
38
|
+
create_kwargs["organism"] = organism_record
|
39
|
+
return RecordList(
|
40
|
+
[
|
41
|
+
registry(**{field.field.name: value}, **create_kwargs)
|
42
|
+
for value in iterable
|
43
|
+
]
|
44
|
+
) # type: ignore
|
45
|
+
|
37
46
|
iterable_idx = index_iterable(iterable)
|
38
47
|
|
39
48
|
# returns existing records & non-existing values
|
40
49
|
records, nonexist_values, msg = get_existing_records(
|
41
50
|
iterable_idx=iterable_idx,
|
42
51
|
field=field,
|
43
|
-
organism=
|
52
|
+
organism=organism_record,
|
44
53
|
mute=mute,
|
45
54
|
)
|
46
55
|
|
47
56
|
# new records to be created based on new values
|
48
57
|
if len(nonexist_values) > 0:
|
49
|
-
|
50
|
-
|
51
|
-
if
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
if source_record:
|
59
|
-
from bionty.core._add_ontology import check_source_in_db
|
60
|
-
|
61
|
-
check_source_in_db(registry=registry, source=source_record)
|
62
|
-
|
63
|
-
from_source = not source_record.in_db
|
64
|
-
elif hasattr(registry, "source_id"):
|
65
|
-
from_source = True
|
66
|
-
else:
|
67
|
-
from_source = False
|
68
|
-
|
69
|
-
if from_source:
|
58
|
+
if hasattr(registry, "source_id"):
|
59
|
+
# if can and needed, get organism record from the existing records
|
60
|
+
if (
|
61
|
+
organism_record is None
|
62
|
+
and len(records) > 0
|
63
|
+
and _is_organism_required(registry)
|
64
|
+
):
|
65
|
+
organism_record = records[0].organism
|
70
66
|
records_bionty, unmapped_values = create_records_from_source(
|
71
67
|
iterable_idx=nonexist_values,
|
72
68
|
field=field,
|
73
|
-
organism=
|
74
|
-
source=
|
69
|
+
organism=organism_record,
|
70
|
+
source=source,
|
75
71
|
msg=msg,
|
76
72
|
mute=mute,
|
77
73
|
)
|
@@ -84,15 +80,15 @@ def get_or_create_records(
|
|
84
80
|
unmapped_values = nonexist_values
|
85
81
|
# unmapped new_ids will NOT create records
|
86
82
|
if len(unmapped_values) > 0:
|
83
|
+
# first log the success message
|
87
84
|
if len(msg) > 0 and not mute:
|
88
85
|
logger.success(msg)
|
89
86
|
s = "" if len(unmapped_values) == 1 else "s"
|
90
87
|
print_values = colors.yellow(_format_values(unmapped_values))
|
91
|
-
name = registry.__name__
|
92
88
|
n_nonval = colors.yellow(f"{len(unmapped_values)} non-validated")
|
93
89
|
if not mute:
|
94
90
|
logger.warning(
|
95
|
-
f"{colors.red('did not create')} {
|
91
|
+
f"{colors.red('did not create')} {registry.__name__} record{s} for "
|
96
92
|
f"{n_nonval} {colors.italic(f'{field.field.name}{s}')}: {print_values}" # type: ignore
|
97
93
|
)
|
98
94
|
return RecordList(records)
|
@@ -100,25 +96,21 @@ def get_or_create_records(
|
|
100
96
|
|
101
97
|
def get_existing_records(
|
102
98
|
iterable_idx: pd.Index,
|
103
|
-
field:
|
99
|
+
field: FieldAttr,
|
104
100
|
organism: Record | None = None,
|
105
101
|
mute: bool = False,
|
106
|
-
):
|
102
|
+
) -> tuple[list, pd.Index, str]:
|
103
|
+
"""Get existing records from the database."""
|
107
104
|
# NOTE: existing records matching is agnostic to the source
|
108
105
|
model = field.field.model # type: ignore
|
109
|
-
if organism is None and field.field.name == "ensembl_gene_id": # type: ignore
|
110
|
-
if len(iterable_idx) > 0:
|
111
|
-
organism = _ensembl_prefix(iterable_idx[0], field, organism) # type: ignore
|
112
|
-
organism = _get_organism_record(field, organism, force=True)
|
113
106
|
|
114
|
-
# standardize based on the DB reference
|
115
107
|
# log synonyms mapped terms
|
116
108
|
syn_mapper = model.standardize(
|
117
109
|
iterable_idx,
|
118
110
|
field=field,
|
119
111
|
organism=organism,
|
120
112
|
mute=True,
|
121
|
-
|
113
|
+
source_aware=False, # standardize only based on the DB reference
|
122
114
|
return_mapper=True,
|
123
115
|
)
|
124
116
|
iterable_idx = iterable_idx.to_frame().rename(index=syn_mapper).index
|
@@ -137,7 +129,6 @@ def get_existing_records(
|
|
137
129
|
is_validated = model.validate(
|
138
130
|
iterable_idx, field=field, organism=organism, mute=True
|
139
131
|
)
|
140
|
-
|
141
132
|
if len(is_validated) > 0:
|
142
133
|
validated = iterable_idx[is_validated]
|
143
134
|
else:
|
@@ -151,7 +142,7 @@ def get_existing_records(
|
|
151
142
|
msg = (
|
152
143
|
"loaded"
|
153
144
|
f" {colors.green(f'{len(validated)} {model.__name__} record{s}')}"
|
154
|
-
f" matching {colors.italic(f'{field.field.name}')}: {print_values}"
|
145
|
+
f" matching {colors.italic(f'{field.field.name}')}: {print_values}"
|
155
146
|
)
|
156
147
|
if len(syn_mapper) > 0:
|
157
148
|
s = "" if len(syn_mapper) == 1 else "s"
|
@@ -173,15 +164,13 @@ def get_existing_records(
|
|
173
164
|
msg = ""
|
174
165
|
|
175
166
|
# get all existing records in the db
|
176
|
-
# if necessary, create records for the values in kwargs
|
177
|
-
# k:v -> k:v_record
|
178
167
|
query = {f"{field.field.name}__in": iterable_idx.values} # type: ignore
|
179
168
|
if organism is not None:
|
180
169
|
query["organism"] = organism
|
181
170
|
records = model.filter(**query).list()
|
182
171
|
|
183
172
|
if len(validated) == len(iterable_idx):
|
184
|
-
return records, [], msg
|
173
|
+
return records, pd.Index([]), msg
|
185
174
|
else:
|
186
175
|
nonval_values = iterable_idx.difference(validated)
|
187
176
|
return records, nonval_values, msg
|
@@ -189,12 +178,13 @@ def get_existing_records(
|
|
189
178
|
|
190
179
|
def create_records_from_source(
|
191
180
|
iterable_idx: pd.Index,
|
192
|
-
field:
|
181
|
+
field: FieldAttr,
|
193
182
|
organism: Record | None = None,
|
194
183
|
source: Record | None = None,
|
195
184
|
msg: str = "",
|
196
185
|
mute: bool = False,
|
197
|
-
):
|
186
|
+
) -> tuple[list, pd.Index]:
|
187
|
+
"""Create records from source."""
|
198
188
|
model = field.field.model # type: ignore
|
199
189
|
records: list = []
|
200
190
|
# populate additional fields from bionty
|
@@ -253,8 +243,11 @@ def create_records_from_source(
|
|
253
243
|
df=bionty_df,
|
254
244
|
)
|
255
245
|
|
256
|
-
|
257
|
-
|
246
|
+
# this here is needed when the organism is required to create new records
|
247
|
+
if organism is None:
|
248
|
+
organism = _get_organism_record(
|
249
|
+
field, source.organism, values=mapped_values
|
250
|
+
)
|
258
251
|
|
259
252
|
create_kwargs = (
|
260
253
|
{"organism": organism, "source": source}
|
@@ -292,6 +285,7 @@ def create_records_from_source(
|
|
292
285
|
|
293
286
|
|
294
287
|
def index_iterable(iterable: Iterable) -> pd.Index:
|
288
|
+
"""Get unique values from an iterable."""
|
295
289
|
idx = pd.Index(iterable).unique()
|
296
290
|
# No entries are made for NAs, '', None
|
297
291
|
# returns an ordered unique not null list
|
@@ -301,6 +295,7 @@ def index_iterable(iterable: Iterable) -> pd.Index:
|
|
301
295
|
def _format_values(
|
302
296
|
names: Iterable, n: int = 20, quotes: bool = True, sep: str = "'"
|
303
297
|
) -> str:
|
298
|
+
"""Format values for printing."""
|
304
299
|
if isinstance(names, dict):
|
305
300
|
items = {
|
306
301
|
f"{key}: {value}": None
|
@@ -345,50 +340,90 @@ def _bulk_create_dicts_from_df(
|
|
345
340
|
return df.reset_index().to_dict(orient="records"), multi_msg
|
346
341
|
|
347
342
|
|
348
|
-
def
|
343
|
+
def _is_organism_required(registry: type[Record]) -> bool:
    """Check whether the registry has a mandatory `organism` relation.

    Args:
        registry: The registry class whose `_meta.get_field` is queried for a
            field named `"organism"`.

    Returns:
        True if the registry has an `organism` field that is a relation and is
        not nullable. False if the field does not exist, is nullable, or is not
        a relation.
    """
    # keep the try body limited to the only call expected to raise
    try:
        organism_field = registry._meta.get_field("organism")
    except FieldDoesNotExist:
        # no organism field at all, hence nothing can be required
        return False
    # "required" means: the field may not be null and points to another record
    return bool(not organism_field.null and organism_field.is_relation)
|
354
358
|
|
355
359
|
|
360
|
+
def _is_simple_field_unique(field: FieldAttr) -> bool:
    """Check whether the field is unique and not a relation.

    Args:
        field: A field attribute whose `.field` exposes the `unique` and
            `is_relation` flags.

    Returns:
        True if the underlying field is unique and is not a relation (what the
        calling code treats as an id-like field), False otherwise.
    """
    # use a separate name instead of rebinding the parameter to another type
    model_field = field.field
    return bool(model_field.unique and not model_field.is_relation)
|
367
|
+
|
368
|
+
|
356
369
|
def _get_organism_record(  # type: ignore
    field: FieldAttr,
    organism: str | Record | None = None,
    values: Iterable = (),
    using_key: str | None = None,
) -> Record | None:
    """Get organism record.

    Args:
        field: the field to get the organism record for
        organism: the organism to get the record for
        values: the values to get the organism record for; only the first
            value is inspected, and only for the `ensembl_gene_id` field
        using_key: the db to get the organism record for; only forwarded to
            the Ensembl-id based lookup

    Returns:
        - The result of `_organism_from_ensembl_id` for the first value if the
          field is `ensembl_gene_id`, `values` is non-empty and no organism
          was passed.
        - Otherwise the saved organism record if the organism FK is required
          for the registry, and the field is not unique or the organism is
          not None, and an organism record could be resolved.
        - None in all other cases.
    """
    registry = field.field.model
    field_str = field.field.name

    # NOTE: keep the explicit len() check; callers pass array-likes such as
    # pd.Index, for which a plain truthiness test raises instead of answering
    if field_str == "ensembl_gene_id" and organism is None and len(values) > 0:  # type: ignore
        return _organism_from_ensembl_id(values[0], using_key)  # type: ignore

    check = not _is_simple_field_unique(field=field) or organism is not None
    if _is_organism_required(registry) and check:
        # imported lazily so that bionty is only needed on this path
        from bionty._bionty import create_or_get_organism_record

        organism_record = create_or_get_organism_record(
            organism=organism, registry=registry, field=field_str
        )
        if organism_record is not None:
            return organism_record.save()

    return None
|
385
403
|
|
386
404
|
|
387
|
-
def
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
405
|
+
def _organism_from_ensembl_id(id: str, using_key: str | None) -> Record | None:  # type: ignore
    """Get organism record from ensembl id.

    The prefix of the id (the id with all digits removed) is looked up in a
    table of Ensembl gene prefixes that is fetched via `s3_bionty_assets`.

    Args:
        id: the Ensembl gene id to derive the organism from
        using_key: the db to query and save the organism record in;
            `"default"` is treated like `None`

    Returns:
        The organism record matching the prefix: the one already in the
        database if present, otherwise the one created from the public source
        (saved before returning). None if the prefix is unknown or no organism
        could be created from the source.
    """
    import bionty as bt
    from bionty.base.dev._io import s3_bionty_assets

    localpath = s3_bionty_assets(
        ".lamindb/0QeqXlKq9aqW8aqe0000.parquet", bt.base.settings.versionsdir
    )
    ensembl_prefixes = pd.read_parquet(localpath).set_index("gene_prefix")

    # stripping every digit leaves the prefix used as key of the table
    prefix = re.sub(r"\d+", "", id)
    if prefix not in ensembl_prefixes.index:
        # unknown prefix: make the "no organism" outcome explicit
        return None

    organism_name = ensembl_prefixes.loc[prefix, "name"].lower()
    using_key = None if using_key == "default" else using_key

    organism_record = (
        bt.Organism.using(using_key).filter(name=organism_name).one_or_none()
    )
    if organism_record is None:
        # not in the database yet: fall back to the public source and persist it
        organism_record = bt.Organism.from_source(name=organism_name)
        if organism_record is not None:
            organism_record.save(using=using_key)

    return organism_record
|