lamindb 1.2a2__py3-none-any.whl → 1.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +3 -1
- lamindb/_view.py +2 -2
- lamindb/base/types.py +50 -11
- lamindb/core/_compat.py +60 -0
- lamindb/core/_context.py +15 -12
- lamindb/core/datasets/__init__.py +1 -0
- lamindb/core/datasets/_core.py +23 -0
- lamindb/core/datasets/_small.py +16 -2
- lamindb/core/loaders.py +22 -12
- lamindb/core/storage/_tiledbsoma.py +2 -2
- lamindb/core/storage/_zarr.py +84 -26
- lamindb/core/storage/objects.py +45 -44
- lamindb/core/types.py +11 -1
- lamindb/curators/__init__.py +1430 -1665
- lamindb/curators/_cellxgene_schemas/__init__.py +190 -18
- lamindb/curators/_cellxgene_schemas/schema_versions.csv +43 -0
- lamindb/models/_feature_manager.py +86 -42
- lamindb/models/_from_values.py +110 -119
- lamindb/models/_label_manager.py +17 -10
- lamindb/models/artifact.py +170 -102
- lamindb/models/can_curate.py +200 -231
- lamindb/models/feature.py +76 -47
- lamindb/models/project.py +69 -7
- lamindb/models/query_set.py +12 -2
- lamindb/models/record.py +77 -50
- lamindb/models/run.py +20 -7
- lamindb/models/schema.py +7 -15
- {lamindb-1.2a2.dist-info → lamindb-1.3.1.dist-info}/METADATA +8 -7
- {lamindb-1.2a2.dist-info → lamindb-1.3.1.dist-info}/RECORD +31 -30
- lamindb/curators/_cellxgene_schemas/schema_versions.yml +0 -104
- {lamindb-1.2a2.dist-info → lamindb-1.3.1.dist-info}/LICENSE +0 -0
- {lamindb-1.2a2.dist-info → lamindb-1.3.1.dist-info}/WHEEL +0 -0
lamindb/models/_from_values.py
CHANGED
@@ -3,26 +3,21 @@ from __future__ import annotations
|
|
3
3
|
from typing import TYPE_CHECKING
|
4
4
|
|
5
5
|
import pandas as pd
|
6
|
-
from django.core.exceptions import FieldDoesNotExist
|
7
6
|
from lamin_utils import colors, logger
|
8
7
|
|
9
|
-
from .record import Record
|
10
|
-
|
11
8
|
if TYPE_CHECKING:
|
12
|
-
from
|
13
|
-
|
14
|
-
from lamindb.base.types import ListLike, StrField
|
9
|
+
from lamindb.base.types import FieldAttr, ListLike
|
15
10
|
|
16
11
|
from .query_set import RecordList
|
12
|
+
from .record import Record
|
17
13
|
|
18
14
|
|
19
15
|
# The base function for `from_values`
|
20
|
-
def
|
16
|
+
def _from_values(
|
21
17
|
iterable: ListLike,
|
22
|
-
field:
|
18
|
+
field: FieldAttr,
|
23
19
|
*,
|
24
20
|
create: bool = False,
|
25
|
-
from_source: bool = False,
|
26
21
|
organism: Record | str | None = None,
|
27
22
|
source: Record | None = None,
|
28
23
|
mute: bool = False,
|
@@ -31,68 +26,67 @@ def get_or_create_records(
|
|
31
26
|
from .query_set import RecordList
|
32
27
|
|
33
28
|
registry = field.field.model # type: ignore
|
29
|
+
organism_record = get_organism_record_from_field(field, organism, values=iterable)
|
30
|
+
# TODO: `create=True` is problematic if field is not a name field
|
34
31
|
if create:
|
35
|
-
|
36
|
-
|
32
|
+
create_kwargs = {}
|
33
|
+
if organism_record:
|
34
|
+
create_kwargs["organism"] = organism_record
|
35
|
+
return RecordList(
|
36
|
+
[
|
37
|
+
registry(**{field.field.name: value}, **create_kwargs)
|
38
|
+
for value in iterable
|
39
|
+
]
|
40
|
+
) # type: ignore
|
41
|
+
|
37
42
|
iterable_idx = index_iterable(iterable)
|
38
43
|
|
39
44
|
# returns existing records & non-existing values
|
40
45
|
records, nonexist_values, msg = get_existing_records(
|
41
46
|
iterable_idx=iterable_idx,
|
42
47
|
field=field,
|
43
|
-
organism=
|
48
|
+
organism=organism_record,
|
44
49
|
mute=mute,
|
45
50
|
)
|
46
51
|
|
47
52
|
# new records to be created based on new values
|
48
53
|
if len(nonexist_values) > 0:
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
check_source_in_db(registry=registry, source=source_record)
|
62
|
-
|
63
|
-
from_source = not source_record.in_db
|
64
|
-
elif hasattr(registry, "source_id"):
|
65
|
-
from_source = True
|
66
|
-
else:
|
67
|
-
from_source = False
|
68
|
-
|
69
|
-
if from_source:
|
70
|
-
records_bionty, unmapped_values = create_records_from_source(
|
54
|
+
if registry.__base__.__name__ == "BioRecord":
|
55
|
+
from bionty._organism import is_organism_required
|
56
|
+
|
57
|
+
# if possible and needed, get the organism record from the existing records
|
58
|
+
if (
|
59
|
+
organism_record is None
|
60
|
+
and len(records) > 0
|
61
|
+
and is_organism_required(registry)
|
62
|
+
):
|
63
|
+
organism_record = records[0].organism
|
64
|
+
records_public, unmapped_values = create_records_from_source(
|
71
65
|
iterable_idx=nonexist_values,
|
72
66
|
field=field,
|
73
|
-
organism=
|
74
|
-
source=
|
67
|
+
organism=organism_record,
|
68
|
+
source=source,
|
75
69
|
msg=msg,
|
76
70
|
mute=mute,
|
77
71
|
)
|
78
|
-
if len(
|
72
|
+
if len(records_public) > 0:
|
79
73
|
msg = ""
|
80
|
-
for record in
|
74
|
+
for record in records_public:
|
81
75
|
record._from_source = True
|
82
|
-
records +=
|
76
|
+
records += records_public
|
83
77
|
else:
|
84
78
|
unmapped_values = nonexist_values
|
85
79
|
# unmapped new_ids will NOT create records
|
86
80
|
if len(unmapped_values) > 0:
|
81
|
+
# first log the success message
|
87
82
|
if len(msg) > 0 and not mute:
|
88
83
|
logger.success(msg)
|
89
84
|
s = "" if len(unmapped_values) == 1 else "s"
|
90
85
|
print_values = colors.yellow(_format_values(unmapped_values))
|
91
|
-
name = registry.__name__
|
92
86
|
n_nonval = colors.yellow(f"{len(unmapped_values)} non-validated")
|
93
87
|
if not mute:
|
94
88
|
logger.warning(
|
95
|
-
f"{colors.red('did not create')} {
|
89
|
+
f"{colors.red('did not create')} {registry.__name__} record{s} for "
|
96
90
|
f"{n_nonval} {colors.italic(f'{field.field.name}{s}')}: {print_values}" # type: ignore
|
97
91
|
)
|
98
92
|
return RecordList(records)
|
@@ -100,25 +94,21 @@ def get_or_create_records(
|
|
100
94
|
|
101
95
|
def get_existing_records(
|
102
96
|
iterable_idx: pd.Index,
|
103
|
-
field:
|
97
|
+
field: FieldAttr,
|
104
98
|
organism: Record | None = None,
|
105
99
|
mute: bool = False,
|
106
|
-
):
|
100
|
+
) -> tuple[list, pd.Index, str]:
|
101
|
+
"""Get existing records from the database."""
|
107
102
|
# NOTE: existing records matching is agnostic to the source
|
108
103
|
model = field.field.model # type: ignore
|
109
|
-
if organism is None and field.field.name == "ensembl_gene_id": # type: ignore
|
110
|
-
if len(iterable_idx) > 0:
|
111
|
-
organism = _ensembl_prefix(iterable_idx[0], field, organism) # type: ignore
|
112
|
-
organism = _get_organism_record(field, organism, force=True)
|
113
104
|
|
114
|
-
# standardize based on the DB reference
|
115
105
|
# log synonyms mapped terms
|
116
106
|
syn_mapper = model.standardize(
|
117
107
|
iterable_idx,
|
118
108
|
field=field,
|
119
109
|
organism=organism,
|
120
110
|
mute=True,
|
121
|
-
|
111
|
+
source_aware=False, # standardize only based on the DB reference
|
122
112
|
return_mapper=True,
|
123
113
|
)
|
124
114
|
iterable_idx = iterable_idx.to_frame().rename(index=syn_mapper).index
|
@@ -137,7 +127,6 @@ def get_existing_records(
|
|
137
127
|
is_validated = model.validate(
|
138
128
|
iterable_idx, field=field, organism=organism, mute=True
|
139
129
|
)
|
140
|
-
|
141
130
|
if len(is_validated) > 0:
|
142
131
|
validated = iterable_idx[is_validated]
|
143
132
|
else:
|
@@ -151,7 +140,7 @@ def get_existing_records(
|
|
151
140
|
msg = (
|
152
141
|
"loaded"
|
153
142
|
f" {colors.green(f'{len(validated)} {model.__name__} record{s}')}"
|
154
|
-
f" matching {colors.italic(f'{field.field.name}')}: {print_values}"
|
143
|
+
f" matching {colors.italic(f'{field.field.name}')}: {print_values}"
|
155
144
|
)
|
156
145
|
if len(syn_mapper) > 0:
|
157
146
|
s = "" if len(syn_mapper) == 1 else "s"
|
@@ -173,15 +162,13 @@ def get_existing_records(
|
|
173
162
|
msg = ""
|
174
163
|
|
175
164
|
# get all existing records in the db
|
176
|
-
# if necessary, create records for the values in kwargs
|
177
|
-
# k:v -> k:v_record
|
178
165
|
query = {f"{field.field.name}__in": iterable_idx.values} # type: ignore
|
179
166
|
if organism is not None:
|
180
167
|
query["organism"] = organism
|
181
168
|
records = model.filter(**query).list()
|
182
169
|
|
183
170
|
if len(validated) == len(iterable_idx):
|
184
|
-
return records, [], msg
|
171
|
+
return records, pd.Index([]), msg
|
185
172
|
else:
|
186
173
|
nonval_values = iterable_idx.difference(validated)
|
187
174
|
return records, nonval_values, msg
|
@@ -189,33 +176,35 @@ def get_existing_records(
|
|
189
176
|
|
190
177
|
def create_records_from_source(
|
191
178
|
iterable_idx: pd.Index,
|
192
|
-
field:
|
179
|
+
field: FieldAttr,
|
193
180
|
organism: Record | None = None,
|
194
181
|
source: Record | None = None,
|
195
182
|
msg: str = "",
|
196
183
|
mute: bool = False,
|
197
|
-
):
|
184
|
+
) -> tuple[list, pd.Index]:
|
185
|
+
"""Create records from source."""
|
198
186
|
model = field.field.model # type: ignore
|
199
187
|
records: list = []
|
200
|
-
# populate additional fields from
|
201
|
-
from bionty.
|
202
|
-
|
188
|
+
# populate additional fields from public_df
|
189
|
+
from bionty._source import filter_public_df_columns, get_source_record
|
190
|
+
|
191
|
+
# get the default source
|
192
|
+
source_record = get_source_record(model, organism, source)
|
203
193
|
|
204
|
-
# create the corresponding
|
194
|
+
# create the corresponding PublicOntology object from model
|
205
195
|
try:
|
206
|
-
|
207
|
-
public_ontology = model.public(organism=organism, source=source)
|
196
|
+
public_ontology = model.public(source=source_record)
|
208
197
|
except Exception:
|
209
|
-
#
|
198
|
+
# no public source
|
210
199
|
return records, iterable_idx
|
211
|
-
# get the default source
|
212
|
-
if source is None:
|
213
|
-
source = get_source_record(public_ontology, model)
|
214
200
|
|
215
|
-
# filter the columns in
|
216
|
-
|
201
|
+
# filter the columns in public df based on fields
|
202
|
+
public_df = filter_public_df_columns(model=model, public_ontology=public_ontology)
|
203
|
+
|
204
|
+
if public_df.empty:
|
205
|
+
return records, iterable_idx
|
217
206
|
|
218
|
-
# standardize in the
|
207
|
+
# standardize in the public reference
|
219
208
|
# do not inspect synonyms if the field is not name field
|
220
209
|
inspect_synonyms = True
|
221
210
|
if hasattr(model, "_name_field") and field.field.name != model._name_field: # type: ignore
|
@@ -241,27 +230,30 @@ def create_records_from_source(
|
|
241
230
|
|
242
231
|
iterable_idx = iterable_idx.to_frame().rename(index=syn_mapper).index
|
243
232
|
|
244
|
-
# create records for values that are found in the
|
233
|
+
# create records for values that are found in the public reference
|
245
234
|
# matching either field or synonyms
|
246
|
-
mapped_values = iterable_idx.intersection(
|
235
|
+
mapped_values = iterable_idx.intersection(public_df[field.field.name]) # type: ignore
|
247
236
|
|
248
237
|
multi_msg = ""
|
249
238
|
if len(mapped_values) > 0:
|
250
|
-
|
239
|
+
public_kwargs, multi_msg = _bulk_create_dicts_from_df(
|
251
240
|
keys=mapped_values,
|
252
241
|
column_name=field.field.name, # type: ignore
|
253
|
-
df=
|
242
|
+
df=public_df,
|
254
243
|
)
|
255
244
|
|
256
|
-
|
257
|
-
|
245
|
+
# this is needed when the organism is required to create new records
|
246
|
+
if organism is None:
|
247
|
+
organism = get_organism_record_from_field(
|
248
|
+
field, source_record.organism, values=mapped_values
|
249
|
+
)
|
258
250
|
|
259
251
|
create_kwargs = (
|
260
|
-
{"organism": organism, "source":
|
252
|
+
{"organism": organism, "source": source_record}
|
261
253
|
if organism is not None
|
262
|
-
else {"source":
|
254
|
+
else {"source": source_record}
|
263
255
|
)
|
264
|
-
for bk in
|
256
|
+
for bk in public_kwargs:
|
265
257
|
records.append(model(**bk, **create_kwargs, _skip_validation=True))
|
266
258
|
|
267
259
|
# number of records that match the field (not synonyms)
|
@@ -286,12 +278,13 @@ def create_records_from_source(
|
|
286
278
|
if len(multi_msg) > 0 and not mute:
|
287
279
|
logger.warning(multi_msg)
|
288
280
|
|
289
|
-
# return the values that are not found in the
|
281
|
+
# return the values that are not found in the public reference
|
290
282
|
unmapped_values = iterable_idx.difference(mapped_values)
|
291
283
|
return records, unmapped_values
|
292
284
|
|
293
285
|
|
294
|
-
def index_iterable(iterable:
|
286
|
+
def index_iterable(iterable: ListLike) -> pd.Index:
|
287
|
+
"""Get unique values from an iterable."""
|
295
288
|
idx = pd.Index(iterable).unique()
|
296
289
|
# No entries are made for NAs, '', None
|
297
290
|
# returns an ordered, unique, non-null index
|
@@ -299,8 +292,9 @@ def index_iterable(iterable: Iterable) -> pd.Index:
|
|
299
292
|
|
300
293
|
|
301
294
|
def _format_values(
|
302
|
-
names:
|
295
|
+
names: ListLike, n: int = 20, quotes: bool = True, sep: str = "'"
|
303
296
|
) -> str:
|
297
|
+
"""Format values for printing."""
|
304
298
|
if isinstance(names, dict):
|
305
299
|
items = {
|
306
300
|
f"{key}: {value}": None
|
@@ -345,50 +339,47 @@ def _bulk_create_dicts_from_df(
|
|
345
339
|
return df.reset_index().to_dict(orient="records"), multi_msg
|
346
340
|
|
347
341
|
|
348
|
-
def
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
def _get_organism_record( # type: ignore
|
357
|
-
field: StrField, organism: str | Record, force: bool = False
|
358
|
-
) -> Record:
|
342
|
+
def get_organism_record_from_field( # type: ignore
|
343
|
+
field: FieldAttr,
|
344
|
+
organism: str | Record | None = None,
|
345
|
+
values: ListLike = None,
|
346
|
+
using_key: str | None = None,
|
347
|
+
) -> Record | None:
|
359
348
|
"""Get organism record.
|
360
349
|
|
361
350
|
Args:
|
362
351
|
field: the field to get the organism record for
|
363
352
|
organism: the organism to get the record for
|
364
|
-
|
365
|
-
|
366
|
-
registry = field.field.model # type: ignore
|
367
|
-
check = True
|
368
|
-
if not force and hasattr(registry, "_ontology_id_field"):
|
369
|
-
check = field.field.name != registry._ontology_id_field # type: ignore
|
370
|
-
# e.g. bionty.CellMarker has "name" as _ontology_id_field
|
371
|
-
if not registry._ontology_id_field.endswith("id"):
|
372
|
-
check = True
|
353
|
+
values: the values to get the organism record for
|
354
|
+
using_key: the db to get the organism record from
|
373
355
|
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
356
|
+
Returns:
|
357
|
+
The organism record if:
|
358
|
+
The organism FK is required for the registry
|
359
|
+
The field is not unique or the organism is not None
|
360
|
+
"""
|
361
|
+
if values is None:
|
362
|
+
values = []
|
363
|
+
registry = field.field.model
|
364
|
+
field_str = field.field.name
|
365
|
+
# id field is a unique field that's not a relation
|
366
|
+
is_simple_field_unique = field.field.unique and not field.field.is_relation
|
367
|
+
check = not is_simple_field_unique or organism is not None
|
368
|
+
|
369
|
+
if (
|
370
|
+
registry.__get_name_with_module__() == "bionty.Gene"
|
371
|
+
and field.field.name == "ensembl_gene_id"
|
372
|
+
and len(values) > 0
|
373
|
+
and organism is None
|
374
|
+
): # type: ignore
|
375
|
+
from bionty._organism import organism_from_ensembl_id
|
376
|
+
|
377
|
+
return organism_from_ensembl_id(values[0], using_key) # type: ignore
|
378
|
+
|
379
|
+
if registry.__base__.__name__ == "BioRecord" and check:
|
380
|
+
from bionty._organism import create_or_get_organism_record
|
379
381
|
|
380
382
|
organism_record = create_or_get_organism_record(
|
381
|
-
organism=organism, registry=registry, field=
|
383
|
+
organism=organism, registry=registry, field=field_str
|
382
384
|
)
|
383
|
-
|
384
|
-
return organism_record
|
385
|
-
|
386
|
-
|
387
|
-
def _ensembl_prefix(id: str, field: StrField, organism: Record | None) -> str | None:
|
388
|
-
if field.field.name == "ensembl_gene_id" and organism is None: # type: ignore
|
389
|
-
if id.startswith("ENSG"):
|
390
|
-
organism = "human" # type: ignore
|
391
|
-
elif id.startswith("ENSMUSG"):
|
392
|
-
organism = "mouse" # type: ignore
|
393
|
-
|
394
|
-
return organism
|
385
|
+
return organism_record
|
lamindb/models/_label_manager.py
CHANGED
@@ -142,7 +142,7 @@ def _save_validated_records(
|
|
142
142
|
# save labels from ontology_ids
|
143
143
|
if hasattr(registry, "_ontology_id_field") and label_uids:
|
144
144
|
try:
|
145
|
-
records = registry.from_values(label_uids, field=field)
|
145
|
+
records = registry.from_values(label_uids, field=field, mute=True)
|
146
146
|
save([r for r in records if r._state.adding])
|
147
147
|
except Exception: # noqa: S110
|
148
148
|
pass
|
@@ -240,7 +240,7 @@ class LabelManager:
|
|
240
240
|
continue
|
241
241
|
# look for features
|
242
242
|
data_name_lower = data.__class__.__name__.lower()
|
243
|
-
labels_by_features = defaultdict(list)
|
243
|
+
labels_by_features: dict = defaultdict(list)
|
244
244
|
features = set()
|
245
245
|
new_labels = save_validated_records(labels)
|
246
246
|
if len(new_labels) > 0:
|
@@ -248,18 +248,24 @@ class LabelManager:
|
|
248
248
|
new_labels, using_key, transfer_logs=transfer_logs
|
249
249
|
)
|
250
250
|
for label in labels:
|
251
|
+
keys: list = []
|
251
252
|
# if the link table doesn't follow this convention, we'll ignore it
|
252
253
|
if not hasattr(label, f"links_{data_name_lower}"):
|
253
254
|
key = None
|
255
|
+
keys.append(key)
|
254
256
|
else:
|
255
|
-
|
256
|
-
|
257
|
+
links = (
|
258
|
+
getattr(label, f"links_{data_name_lower}")
|
259
|
+
.filter(**{f"{data_name_lower}_id": data.id})
|
260
|
+
.all()
|
257
261
|
)
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
262
|
+
for link in links:
|
263
|
+
if link.feature is not None:
|
264
|
+
features.add(link.feature)
|
265
|
+
key = link.feature.name
|
266
|
+
else:
|
267
|
+
key = None
|
268
|
+
keys.append(key)
|
263
269
|
label_returned = transfer_to_default_db(
|
264
270
|
label,
|
265
271
|
using_key,
|
@@ -270,7 +276,8 @@ class LabelManager:
|
|
270
276
|
# TODO: refactor return value of transfer to default db
|
271
277
|
if label_returned is not None:
|
272
278
|
label = label_returned
|
273
|
-
|
279
|
+
for key in keys:
|
280
|
+
labels_by_features[key].append(label)
|
274
281
|
# treat features
|
275
282
|
new_features = save_validated_records(list(features))
|
276
283
|
if len(new_features) > 0:
|