lamindb 1.1.0__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +4 -2
- lamindb/_artifact.py +54 -36
- lamindb/_collection.py +1 -1
- lamindb/_feature.py +1 -1
- lamindb/_finish.py +9 -1
- lamindb/_query_set.py +24 -6
- lamindb/_record.py +4 -5
- lamindb/_save.py +9 -1
- lamindb/_tracked.py +25 -2
- lamindb/base/users.py +1 -4
- lamindb/core/_context.py +7 -2
- lamindb/core/_mapped_collection.py +4 -2
- lamindb/core/_track_environment.py +2 -1
- lamindb/core/datasets/_small.py +3 -3
- lamindb/core/loaders.py +15 -3
- lamindb/core/storage/_anndata_accessor.py +8 -3
- lamindb/core/storage/_backed_access.py +10 -5
- lamindb/core/storage/_pyarrow_dataset.py +24 -9
- lamindb/core/storage/paths.py +12 -12
- lamindb/curators/__init__.py +77 -65
- lamindb/models.py +58 -18
- {lamindb-1.1.0.dist-info → lamindb-1.1.1.dist-info}/METADATA +2 -2
- {lamindb-1.1.0.dist-info → lamindb-1.1.1.dist-info}/RECORD +25 -25
- {lamindb-1.1.0.dist-info → lamindb-1.1.1.dist-info}/LICENSE +0 -0
- {lamindb-1.1.0.dist-info → lamindb-1.1.1.dist-info}/WHEEL +0 -0
lamindb/core/storage/_pyarrow_dataset.py
CHANGED
@@ -18,15 +18,30 @@ def _is_pyarrow_dataset(paths: UPath | list[UPath]) -> bool:
     # we don't check here that the filesystem is the same
     # but this is a requirement for pyarrow.dataset.dataset
     if isinstance(paths, list):
-
-    elif paths.
-
+        path_list = paths
+    elif paths.is_dir():
+        path_list = [path for path in paths.rglob("*") if path.suffix != ""]
     else:
-
-
-
-
-
+        path_list = [paths]
+    suffix = None
+    for path in path_list:
+        path_suffixes = path.suffixes
+        # this doesn't work for externally gzipped files, REMOVE LATER
+        path_suffix = (
+            path_suffixes[-2]
+            if len(path_suffixes) > 1 and ".gz" in path_suffixes
+            else path.suffix
+        )
+        if path_suffix not in PYARROW_SUFFIXES:
+            return False
+        elif suffix is None:
+            suffix = path_suffix
+        elif path_suffix != suffix:
+            return False
+    return True
+
+
+def _open_pyarrow_dataset(paths: UPath | list[UPath], **kwargs) -> PyArrowDataset:
     if isinstance(paths, list):
         path0 = paths[0]
         if isinstance(path0, LocalPathClasses):
@@ -38,4 +53,4 @@ def _open_pyarrow_dataset(paths: UPath | list[UPath]) -> PyArrowDataset:
     else:
         paths_str, filesystem = paths.path, paths.fs

-    return pyarrow.dataset.dataset(paths_str, filesystem=filesystem)
+    return pyarrow.dataset.dataset(paths_str, filesystem=filesystem, **kwargs)
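The net effect of the second hunk is that extra keyword arguments now flow through `_open_pyarrow_dataset` into `pyarrow.dataset.dataset`. A minimal standalone sketch of that underlying pyarrow call (the file name is made up; this bypasses lamindb entirely):

```python
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq

# write a tiny parquet file so the example is self-contained
pq.write_table(pa.table({"x": [1, 2, 3]}), "example.parquet")

# keyword arguments such as `format` are simply forwarded to pyarrow
dataset = ds.dataset("example.parquet", format="parquet")
print(dataset.to_table().to_pydict())  # {'x': [1, 2, 3]}
```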
lamindb/core/storage/paths.py
CHANGED
@@ -41,24 +41,21 @@ def auto_storage_key_from_artifact_uid(uid: str, suffix: str, is_dir: bool) -> s
     return storage_key


-def _safely_resolve(upath: UPath) -> UPath:
-    if upath.protocol in {"http", "https"}:
-        resolve_kwargs = {"follow_redirects": False}
-    else:
-        resolve_kwargs = {}
-    return upath.resolve(**resolve_kwargs)
-
-
 def check_path_is_child_of_root(path: UPathStr, root: UPathStr) -> bool:
     if fsspec.utils.get_protocol(str(path)) != fsspec.utils.get_protocol(str(root)):
         return False
-    path_upath =
-    root_upath =
+    path_upath = UPath(path)
+    root_upath = UPath(root)
     if path_upath.protocol == "s3":
         endpoint_path = path_upath.storage_options.get("endpoint_url", "")
         endpoint_root = root_upath.storage_options.get("endpoint_url", "")
         if endpoint_path != endpoint_root:
             return False
+    # we don't resolve http links because they can resolve into a different domain
+    # for example into a temporary url
+    if path_upath.protocol not in {"http", "https"}:
+        path_upath = path_upath.resolve()
+        root_upath = root_upath.resolve()
     # str is needed to eliminate UPath storage_options
     # which affect equality checks
     return UPath(str(root_upath)) in UPath(str(path_upath)).parents
@@ -134,7 +131,7 @@ def filepath_cache_key_from_artifact(


 def store_file_or_folder(
-    local_path: UPathStr, storage_path: UPath, print_progress: bool = True
+    local_path: UPathStr, storage_path: UPath, print_progress: bool = True, **kwargs
 ) -> None:
     """Store file or folder (localpath) at storagepath."""
     local_path = UPath(local_path)
@@ -155,7 +152,10 @@ def store_file_or_folder(
         else:
             create_folder = None
         storage_path.upload_from(
-            local_path,
+            local_path,
+            create_folder=create_folder,
+            print_progress=print_progress,
+            **kwargs,
         )
     else:  # storage path is local
         if local_path.resolve().as_posix() == storage_path.resolve().as_posix():
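The final containment check in `check_path_is_child_of_root` relies on `UPath.parents`; a small sketch of just that comparison, assuming `universal_pathlib` is installed and using made-up local paths (for remote paths, the `str()` round-trip is what strips `storage_options`):

```python
from upath import UPath

root = UPath("/data/project")
path = UPath("/data/project/sub/file.parquet")
# equality is checked on plain paths, ignoring storage_options
print(UPath(str(root)) in UPath(str(path)).parents)  # True
```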
lamindb/curators/__init__.py
CHANGED
@@ -1,5 +1,7 @@
 """Curators.

+.. versionadded:: 1.1.0
+
 .. autosummary::
    :toctree: .

@@ -146,6 +148,12 @@ class CurateLookup:
 CAT_MANAGER_DOCSTRING = """Manage categoricals by updating registries."""


+SLOTS_DOCSTRING = """Curator objects by slot.
+
+.. versionadded:: 1.1.1
+"""
+
+
 VALIDATE_DOCSTRING = """Validate dataset.

 Raises:
@@ -170,6 +178,8 @@ class Curator:

     A `Curator` object makes it easy to validate, standardize & annotate datasets.

+    .. versionadded:: 1.1.0
+
     See:
         - :class:`~lamindb.curators.DataFrameCurator`
         - :class:`~lamindb.curators.AnnDataCurator`
@@ -212,6 +222,8 @@ class DataFrameCurator(Curator):

     See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.

+    .. versionadded:: 1.1.0
+
     Args:
         dataset: The DataFrame-like object to validate & annotate.
         schema: A `Schema` object that defines the validation constraints.
@@ -222,9 +234,9 @@ class DataFrameCurator(Curator):
         import bionty as bt

         # define valid labels
-
-        ln.ULabel(name="DMSO", type=
-        ln.ULabel(name="IFNG", type=
+        perturbation = ln.ULabel(name="Perturbation", is_type=True).save()
+        ln.ULabel(name="DMSO", type=perturbation).save()
+        ln.ULabel(name="IFNG", type=perturbation).save()
         bt.CellType.from_source(name="B cell").save()
         bt.CellType.from_source(name="T cell").save()

@@ -232,7 +244,7 @@ class DataFrameCurator(Curator):
         schema = ln.Schema(
             name="small_dataset1_obs_level_metadata",
             features=[
-                ln.Feature(name="
+                ln.Feature(name="perturbation", dtype="cat[ULabel[Perturbation]]").save(),
                 ln.Feature(name="sample_note", dtype=str).save(),
                 ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save(),
                 ln.Feature(name="cell_type_by_model", dtype=bt.CellType).save(),
@@ -252,10 +264,10 @@ class DataFrameCurator(Curator):
         schema: Schema,
     ) -> None:
         super().__init__(dataset=dataset, schema=schema)
+        categoricals = {}
         if schema.n > 0:
             # populate features
             pandera_columns = {}
-            categoricals = {}
             for feature in schema.features.all():
                 pandera_dtype = (
                     feature.dtype if not feature.dtype.startswith("cat") else "category"
@@ -268,13 +280,13 @@ class DataFrameCurator(Curator):
             self._pandera_schema = pandera.DataFrameSchema(
                 pandera_columns, coerce=schema.coerce_dtype
             )
-            # now deal with detailed validation of categoricals
-            self._cat_manager = DataFrameCatManager(
-                self._dataset,
-                categoricals=categoricals,
-            )
         else:
             assert schema.itype is not None  # noqa: S101
+        self._cat_manager = DataFrameCatManager(
+            self._dataset,
+            columns=parse_dtype_single_cat(schema.itype, is_itype=True)["field"],
+            categoricals=categoricals,
+        )

     @property
     @doc_args(CAT_MANAGER_DOCSTRING)
@@ -285,16 +297,29 @@ class DataFrameCurator(Curator):
     def standardize(self) -> None:
         """Standardize the dataset.

-        - Adds missing columns
-        - Fills missing values
+        - Adds missing columns for features
+        - Fills missing values for features with default values
         """
         for feature in self._schema.members:
             if feature.name not in self._dataset.columns:
-                if feature.default_value is not None:
-
+                if feature.default_value is not None or feature.nullable:
+                    fill_value = (
+                        feature.default_value
+                        if feature.default_value is not None
+                        else pd.NA
+                    )
+                    if feature.dtype.startswith("cat"):
+                        self._dataset[feature.name] = pd.Categorical(
+                            [fill_value] * len(self._dataset)
+                        )
+                    else:
+                        self._dataset[feature.name] = fill_value
+                    logger.important(
+                        f"added column {feature.name} with fill value {fill_value}"
+                    )
                 else:
                     raise ValidationError(
-                        f"Missing column {feature.name} cannot be added because
+                        f"Missing column {feature.name} cannot be added because is not nullable and has no default value"
                     )
             else:
                 if feature.default_value is not None:
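For a missing nullable feature without a `default_value`, the new branch fills the column with `pd.NA`; a pandas-only sketch of that behavior (column names are made up):

```python
import pandas as pd

df = pd.DataFrame({"sample_note": ["a", "b"]})
fill_value = pd.NA  # no default_value, so the fill value falls back to pd.NA
# categorical features get a Categorical column of the right length
df["perturbation"] = pd.Categorical([fill_value] * len(df))
print(df["perturbation"].isna().all())  # True
```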
@@ -312,46 +337,29 @@ class DataFrameCurator(Curator):
                         feature.default_value
                     )

+    def _cat_manager_validate(self) -> None:
+        self._cat_manager.validate()
+        if self._cat_manager._is_validated:
+            self._is_validated = True
+        else:
+            self._is_validated = False
+            raise ValidationError(self._cat_manager._validate_category_error_messages)
+
     @doc_args(VALIDATE_DOCSTRING)
     def validate(self) -> None:
         """{}"""  # noqa: D415
         if self._schema.n > 0:
-            self._cat_manager.validate()
             try:
+                # first validate through pandera
                 self._pandera_schema.validate(self._dataset)
-
-
-                else:
-                    self._is_validated = False
-                    raise ValidationError(
-                        self._cat_manager._validate_category_error_messages
-                    )
+                # then validate lamindb categoricals
+                self._cat_manager_validate()
             except pandera.errors.SchemaError as err:
                 self._is_validated = False
                 # .exconly() doesn't exist on SchemaError
                 raise ValidationError(str(err)) from err
         else:
-
-            registry: CanCurate = result["registry"]
-            inspector = registry.inspect(
-                self._dataset.columns,
-                result["field"],
-                mute=True,
-            )
-            if len(inspector.non_validated) > 0:
-                # also check public ontology
-                if hasattr(registry, "public"):
-                    registry.from_values(
-                        inspector.non_validated, result["field"], mute=True
-                    ).save()
-                    inspector = registry.inspect(
-                        inspector.non_validated, result["field"], mute=True
-                    )
-                if len(inspector.non_validated) > 0:
-                    self._is_validated = False
-                    raise ValidationError(
-                        f"Invalid identifiers for {self._schema.itype}: {inspector.non_validated}"
-                    )
+            self._cat_manager_validate()

     @doc_args(SAVE_ARTIFACT_DOCSTRING)
     def save_artifact(
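With this refactor, pandera schema errors and categorical errors surface through one exception type. A hypothetical usage sketch, assuming `curator` is a `DataFrameCurator` built as in the docstring above and that the error classes live in `lamindb.errors` in this release:

```python
import lamindb as ln

try:
    curator.validate()  # pandera check first, then categorical registries
except ln.errors.ValidationError as err:
    # one error type regardless of which validation layer failed
    print(err)
```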
@@ -385,6 +393,8 @@ class AnnDataCurator(Curator):

     See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.

+    .. versionadded:: 1.1.0
+
     Args:
         dataset: The AnnData-like object to validate & annotate.
         schema: A `Schema` object that defines the validation constraints.
@@ -395,9 +405,9 @@ class AnnDataCurator(Curator):
         import bionty as bt

         # define valid labels
-
-        ln.ULabel(name="DMSO", type=
-        ln.ULabel(name="IFNG", type=
+        perturbation = ln.ULabel(name="Perturbation", is_type=True).save()
+        ln.ULabel(name="DMSO", type=perturbation).save()
+        ln.ULabel(name="IFNG", type=perturbation).save()
         bt.CellType.from_source(name="B cell").save()
         bt.CellType.from_source(name="T cell").save()

@@ -405,9 +415,9 @@ class AnnDataCurator(Curator):
         obs_schema = ln.Schema(
             name="small_dataset1_obs_level_metadata",
             features=[
-                ln.Feature(name="
+                ln.Feature(name="perturbation", dtype="cat[ULabel[Perturbation]]").save(),
                 ln.Feature(name="sample_note", dtype=str).save(),
-                ln.Feature(name="cell_type_by_expert", dtype=bt.CellType
+                ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save(),
                 ln.Feature(name="cell_type_by_model", dtype=bt.CellType).save(),
             ],
         ).save()
@@ -416,7 +426,7 @@ class AnnDataCurator(Curator):
         var_schema = ln.Schema(
             name="scRNA_seq_var_schema",
             itype=bt.Gene.ensembl_gene_id,
-            dtype=
+            dtype=int,
         ).save()

         # define composite schema
@@ -450,15 +460,27 @@ class AnnDataCurator(Curator):
             self._dataset.var.T, schema._get_component("var")
         )

+    @property
+    @doc_args(SLOTS_DOCSTRING)
+    def slots(self) -> dict[str, DataFrameCurator]:
+        """{}"""  # noqa: D415
+        return {"obs": self._obs_curator, "var": self._var_curator}
+
     @doc_args(VALIDATE_DOCSTRING)
     def validate(self) -> None:
         """{}"""  # noqa: D415
         self._obs_curator.validate()
         self._var_curator.validate()
-        self._is_validated = True

     @doc_args(SAVE_ARTIFACT_DOCSTRING)
-    def save_artifact(
+    def save_artifact(
+        self,
+        *,
+        key: str | None = None,
+        description: str | None = None,
+        revises: Artifact | None = None,
+        run: Run | None = None,
+    ):
         """{}"""  # noqa: D415
         if not self._is_validated:
             self.validate()  # raises ValidationError if doesn't validate
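A hypothetical usage sketch of the new `slots` property, assuming `curator` is an `AnnDataCurator` constructed as in the docstring above:

```python
# each slot is a DataFrameCurator scoped to one part of the AnnData
obs_curator = curator.slots["obs"]  # curates adata.obs
var_curator = curator.slots["var"]  # curates adata.var (transposed)
obs_curator.standardize()           # e.g. fill missing nullable obs columns
```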
@@ -1031,11 +1053,6 @@ class AnnDataCatManager(CatManager):
 class MuDataCatManager(CatManager):
     """Curation flow for a ``MuData`` object.

-    See also :class:`~lamindb.Curator`.
-
-    Note that if genes or other measurements are removed from the MuData object,
-    the object should be recreated using :meth:`~lamindb.Curator.from_mudata`.
-
     Args:
         mdata: The MuData object to curate.
         var_index: The registry field for mapping the ``.var`` index for each modality.
@@ -1289,8 +1306,6 @@ def _maybe_curation_keys_not_present(nonval_keys: list[str], name: str):
 class TiledbsomaCatManager(CatManager):
     """Curation flow for `tiledbsoma.Experiment`.

-    See also :class:`~lamindb.Curator`.
-
     Args:
         experiment_uri: A local or cloud path to a `tiledbsoma.Experiment`.
         var_index: The registry fields for mapping the `.var` indices for measurements.
@@ -3168,10 +3183,7 @@ def check_registry_organism(registry: Record, organism: str | None = None) -> di
     import bionty as bt

     if organism is None and bt.settings.organism is None:
-
-            f"{registry.__name__} registry requires an organism!\n"
-            " → please pass an organism name via organism="
-        )
+        return {}
     return {"organism": organism or bt.settings.organism.name}
 return {}

@@ -3263,7 +3275,7 @@ def validate_categories(
     warning_message += f" {colors.yellow(f'{len(syn_mapper)} synonym{s}')} found: {colors.yellow(syn_mapper_print)}\n → curate synonyms via {colors.cyan(hint_msg)}"
     if n_non_validated > len(syn_mapper):
         if syn_mapper:
-            warning_message += " for remaining terms:\n"
+            warning_message += "\n for remaining terms:\n"
         warning_message += f" → fix typos, remove non-existent values, or save terms via {colors.cyan(non_validated_hint_print)}"

 if logger.indent == "":
@@ -3422,7 +3434,7 @@ def save_artifact(
     filter_kwargs_current = get_current_filter_kwargs(registry, filter_kwargs)
     df = data if isinstance(data, pd.DataFrame) else data.obs
     # multi-value columns are separated by "|"
-    if df[key].str.contains("|").any():
+    if not df[key].isna().all() and df[key].str.contains("|").any():
         values = df[key].str.split("|").explode().unique()
     else:
         values = df[key].unique()
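The added `isna().all()` guard matters because the `.str` accessor fails on a column that is entirely NaN (and therefore not of string dtype); a pandas-only sketch with a made-up column name:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"donor": [np.nan, np.nan]})  # float column, no strings
# df["donor"].str.contains("|") would raise:
# AttributeError: Can only use .str accessor with string values!
if not df["donor"].isna().all() and df["donor"].str.contains("|").any():
    values = df["donor"].str.split("|").explode().unique()
else:
    values = df["donor"].unique()
print(values)  # [nan]
```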
lamindb/models.py
CHANGED
@@ -699,7 +699,7 @@ class Registry(ModelBase):
         A record.

     Raises:
-        :exc:`docs:lamindb.
+        :exc:`docs:lamindb.errors.DoesNotExist`: In case no matching record is found.

     See Also:
         - Guide: :doc:`docs:registries`
@@ -2015,6 +2015,8 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
         """A default value that overwrites missing values (default `None`).

         This takes effect when you call `Curator.standardize()`.
+
+        If `default_value = None`, missing values like `pd.NA` or `np.nan` are kept.
         """
         if self._aux is not None and "af" in self._aux and "0" in self._aux["af"]:
             return self._aux["af"]["0"]
@@ -2050,12 +2052,14 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):

         """
         if self._aux is not None and "af" in self._aux and "1" in self._aux["af"]:
-
+            value = self._aux["af"]["1"]
+            return True if value is None else value
         else:
             return True

     @nullable.setter
     def nullable(self, value: bool) -> None:
+        assert isinstance(value, bool), value  # noqa: S101
         if self._aux is None:
             self._aux = {}
         if "af" not in self._aux:
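A hypothetical sketch of the stricter `nullable` setter in combination with `Curator.standardize()` (feature names are made up; this assumes a configured lamindb instance):

```python
import lamindb as ln

# a feature that standardize() may auto-fill with pd.NA when the column is missing
note = ln.Feature(name="sample_note", dtype=str).save()
print(note.nullable)  # True by default

# a feature that must be present: standardize() raises ValidationError instead
donor = ln.Feature(name="donor_id", dtype=str).save()
donor.nullable = False  # the setter now asserts this is a bool
donor.save()
```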
@@ -2134,20 +2138,9 @@ class FeatureValue(Record, TracksRun):
 class Schema(Record, CanCurate, TracksRun):
     """Schemas / feature sets.

-
-    that correspond to :class:`~lamindb.Feature`, :class:`~bionty.Gene`, :class:`~bionty.Protein` or other
-    entities.
+    A simple schema is just a set of columns in a `DataFrame`, a "feature set".

-
-
-    1. Performance: Imagine you measure the same panel of 20k transcripts in
-       1M samples. By modeling the panel as a feature set, you can link all
-       your artifacts against one feature set and only need to store 1M
-       instead of 1M x 20k = 20B links.
-    2. Interpretation: Model protein panels, gene panels, etc.
-    3. Data integration: Feature sets provide the information that determines whether two datasets can be meaningfully concatenated.
-
-    These reasons do not hold for label sets. Hence, LaminDB does not model label sets.
+    A composite schema has multiple components, e.g. for an `AnnData`, each a feature set for `obs` and `var`.

     Args:
         features: `Iterable[Record] | None = None` An iterable of :class:`~lamindb.Feature`
@@ -2162,7 +2155,7 @@ class Schema(Record, CanCurate, TracksRun):
         dtype: `str | None = None` The simple type. Defaults to
             `None` for sets of :class:`~lamindb.Feature` records.
             Otherwise defaults to `"num"` (e.g., for sets of :class:`~bionty.Gene`).
-        itype: `str | None = None` The
+        itype: `str | None = None` The feature identifier type (e.g. :class:`~lamindb.Feature`, :class:`~bionty.Gene`, ...).
         type: `Schema | None = None` A type.
         is_type: `bool = False` Distinguish types from instances of the type.
         otype: `str | None = None` An object type to define the structure of a composite schema.
@@ -2174,6 +2167,17 @@ class Schema(Record, CanCurate, TracksRun):
         coerce_dtype: `bool = False` When True, attempts to coerce values to the specified dtype
             during validation, see :attr:`~lamindb.Schema.coerce_dtype`.

+    .. dropdown:: Why does LaminDB model schemas, not just features?
+
+        1. Performance: Imagine you measure the same panel of 20k transcripts in
+           1M samples. By modeling the panel as a feature set, you can link all
+           your artifacts against one feature set and only need to store 1M
+           instead of 1M x 20k = 20B links.
+        2. Interpretation: Model protein panels, gene panels, etc.
+        3. Data integration: Feature sets provide the information that determines whether two datasets can be meaningfully concatenated.
+
+        These reasons do not hold for label sets. Hence, LaminDB does not model label sets.
+
     Note:

         A feature set can be identified by the `hash` of its feature uids.
@@ -2211,7 +2215,10 @@ class Schema(Record, CanCurate, TracksRun):
         abstract = False

     _name_field: str = "name"
-    _aux_fields: dict[str, tuple[str, type]] = {
+    _aux_fields: dict[str, tuple[str, type]] = {
+        "0": ("coerce_dtype", bool),
+        "1": ("_index_feature_uid", str),
+    }

     id: int = models.AutoField(primary_key=True)
     """Internal id, valid only in one DB instance."""
@@ -2427,6 +2434,39 @@ class Schema(Record, CanCurate, TracksRun):
             self._aux["af"] = {}
         self._aux["af"]["0"] = value

+    @coerce_dtype.setter
+    def coerce_dtype(self, value: bool) -> None:
+        if self._aux is None:
+            self._aux = {}
+        if "af" not in self._aux:
+            self._aux["af"] = {}
+        self._aux["af"]["0"] = value
+
+    # @property
+    # def index_feature(self) -> None | Feature:
+    #     # index_feature: `Record | None = None` A :class:`~lamindb.Feature` to validate the index of a `DataFrame`.
+    #     """The uid of the index feature, if `index_feature` was set."""
+    #     if self._index_feature_uid is None:
+    #         return None
+    #     else:
+    #         return self.features.get(uid=self._index_feature_uid)
+
+    # @property
+    # def _index_feature_uid(self) -> None | str:
+    #     """The uid of the index feature, if `index_feature` was set."""
+    #     if self._aux is not None and "af" in self._aux and "1" in self._aux["af"]:
+    #         return self._aux["af"]["1"]
+    #     else:
+    #         return None
+
+    # @_index_feature_uid.setter
+    # def _index_feature_uid(self, value: str) -> None:
+    #     if self._aux is None:
+    #         self._aux = {}
+    #     if "af" not in self._aux:
+    #         self._aux["af"] = {}
+    #     self._aux["af"]["1"] = value
+
     @property
     @deprecated("itype")
     def registry(self) -> str:
@@ -3057,7 +3097,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         pass

     def open(
-        self, mode: str = "r", is_run_input: bool | None = None
+        self, mode: str = "r", is_run_input: bool | None = None, **kwargs
     ) -> (
         AnnDataAccessor
         | BackedAccessor
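Together with the `_open_pyarrow_dataset` change above, this lets extra keyword arguments flow from the public `Artifact.open()` down to `pyarrow.dataset.dataset`. A hypothetical sketch (the artifact key is made up; this assumes a configured lamindb instance and a pyarrow-compatible artifact):

```python
import lamindb as ln

artifact = ln.Artifact.get(key="my_dataset/")  # e.g. a folder of parquet files
dataset = artifact.open()  # a pyarrow dataset for pyarrow-compatible suffixes
# extra keyword arguments are now forwarded to pyarrow.dataset.dataset,
# e.g. artifact.open(format="parquet") if the files lack a recognizable suffix
```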
{lamindb-1.1.0.dist-info → lamindb-1.1.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: lamindb
-Version: 1.1.0
+Version: 1.1.1
 Summary: A data framework for biology.
 Author-email: Lamin Labs <open-source@lamin.ai>
 Requires-Python: >=3.10,<3.13
@@ -10,7 +10,7 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Requires-Dist: lamin_utils==0.13.10
 Requires-Dist: lamin_cli==1.1.0
-Requires-Dist: lamindb_setup[aws]==1.
+Requires-Dist: lamindb_setup[aws]==1.2.0
 Requires-Dist: pyyaml
 Requires-Dist: pyarrow
 Requires-Dist: pandera