lamindb 1.9.1__py3-none-any.whl → 1.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +1 -1
- lamindb/core/__init__.py +2 -2
- lamindb/core/storage/__init__.py +2 -1
- lamindb/core/storage/_anndata_accessor.py +10 -1
- lamindb/core/storage/_backed_access.py +4 -0
- lamindb/core/storage/_spatialdata_accessor.py +52 -0
- lamindb/examples/__init__.py +3 -18
- lamindb/examples/cellxgene/_cellxgene.py +11 -3
- lamindb/examples/croissant/__init__.py +44 -0
- lamindb/examples/croissant/mini_immuno.anndata.zarr_metadata.json +73 -0
- lamindb/{core → examples}/datasets/__init__.py +1 -1
- lamindb/{core → examples}/datasets/mini_immuno.py +19 -8
- lamindb/examples/schemas/_anndata.py +25 -15
- lamindb/examples/schemas/_simple.py +23 -9
- lamindb/integrations/__init__.py +2 -0
- lamindb/integrations/_croissant.py +122 -0
- lamindb/integrations/_vitessce.py +14 -12
- lamindb/migrations/0116_remove_artifact_unique_artifact_storage_key_hash_and_more.py +51 -0
- lamindb/migrations/0117_fix_artifact_storage_hash_unique_constraints.py +32 -0
- lamindb/migrations/{0115_squashed.py → 0117_squashed.py} +29 -6
- lamindb/models/_describe.py +107 -1
- lamindb/models/_django.py +63 -6
- lamindb/models/_feature_manager.py +0 -1
- lamindb/models/artifact.py +41 -11
- lamindb/models/collection.py +4 -9
- lamindb/models/project.py +2 -2
- lamindb/models/record.py +1 -1
- lamindb/models/run.py +1 -1
- lamindb/models/sqlrecord.py +3 -0
- {lamindb-1.9.1.dist-info → lamindb-1.10.0.dist-info}/METADATA +3 -3
- {lamindb-1.9.1.dist-info → lamindb-1.10.0.dist-info}/RECORD +36 -30
- /lamindb/{core → examples}/datasets/_core.py +0 -0
- /lamindb/{core → examples}/datasets/_fake.py +0 -0
- /lamindb/{core → examples}/datasets/_small.py +0 -0
- {lamindb-1.9.1.dist-info → lamindb-1.10.0.dist-info}/LICENSE +0 -0
- {lamindb-1.9.1.dist-info → lamindb-1.10.0.dist-info}/WHEEL +0 -0
lamindb/__init__.py
CHANGED
lamindb/core/__init__.py
CHANGED
@@ -28,7 +28,6 @@ Modules:
|
|
28
28
|
.. autosummary::
|
29
29
|
:toctree: .
|
30
30
|
|
31
|
-
datasets
|
32
31
|
storage
|
33
32
|
logger
|
34
33
|
|
@@ -38,7 +37,8 @@ from lamin_utils import logger
|
|
38
37
|
from lamin_utils._inspect import InspectResult
|
39
38
|
|
40
39
|
from .. import errors as exceptions
|
41
|
-
from
|
40
|
+
from ..examples import datasets # backward compat
|
41
|
+
from . import loaders, subsettings, types
|
42
42
|
from ._context import Context
|
43
43
|
from ._mapped_collection import MappedCollection
|
44
44
|
from ._settings import Settings
|
lamindb/core/storage/__init__.py
CHANGED
@@ -13,12 +13,13 @@ Array accessors.
|
|
13
13
|
:toctree: .
|
14
14
|
|
15
15
|
AnnDataAccessor
|
16
|
+
SpatialDataAccessor
|
16
17
|
BackedAccessor
|
17
18
|
"""
|
18
19
|
|
19
20
|
from lamindb_setup.core.upath import LocalPathClasses, UPath, infer_filesystem
|
20
21
|
|
21
|
-
from ._backed_access import AnnDataAccessor, BackedAccessor
|
22
|
+
from ._backed_access import AnnDataAccessor, BackedAccessor, SpatialDataAccessor
|
22
23
|
from ._tiledbsoma import save_tiledbsoma_experiment
|
23
24
|
from ._valid_suffixes import VALID_SUFFIXES
|
24
25
|
from .objects import infer_suffix, write_to_disk
|
@@ -353,7 +353,16 @@ if ZARR_INSTALLED:
|
|
353
353
|
attrs_keys: dict[str, list] = {}
|
354
354
|
obs_var_arrays = []
|
355
355
|
|
356
|
-
|
356
|
+
prefix = storage.path
|
357
|
+
if prefix == "":
|
358
|
+
paths_iter = (path for path in paths)
|
359
|
+
else:
|
360
|
+
prefix += "/"
|
361
|
+
paths_iter = (
|
362
|
+
path.removeprefix(prefix) for path in paths if path.startswith(prefix)
|
363
|
+
)
|
364
|
+
|
365
|
+
for path in paths_iter:
|
357
366
|
if path in (".zattrs", ".zgroup"):
|
358
367
|
continue
|
359
368
|
parts = path.split("/")
|
@@ -9,6 +9,7 @@ from anndata._io.specs.registry import get_spec
|
|
9
9
|
from ._anndata_accessor import AnnDataAccessor, StorageType, registry
|
10
10
|
from ._polars_lazy_df import POLARS_SUFFIXES, _open_polars_lazy_df
|
11
11
|
from ._pyarrow_dataset import PYARROW_SUFFIXES, _open_pyarrow_dataset
|
12
|
+
from ._spatialdata_accessor import SpatialDataAccessor
|
12
13
|
from ._tiledbsoma import _open_tiledbsoma
|
13
14
|
from .paths import filepath_from_artifact
|
14
15
|
|
@@ -80,6 +81,7 @@ def backed_access(
|
|
80
81
|
**kwargs,
|
81
82
|
) -> (
|
82
83
|
AnnDataAccessor
|
84
|
+
| SpatialDataAccessor
|
83
85
|
| BackedAccessor
|
84
86
|
| SOMACollection
|
85
87
|
| SOMAExperiment
|
@@ -110,6 +112,8 @@ def backed_access(
|
|
110
112
|
conn, storage = registry.open("h5py", objectpath, mode=mode, **kwargs)
|
111
113
|
elif suffix == ".zarr":
|
112
114
|
conn, storage = registry.open("zarr", objectpath, mode=mode, **kwargs)
|
115
|
+
if "spatialdata_attrs" in storage.attrs:
|
116
|
+
return SpatialDataAccessor(storage, name)
|
113
117
|
elif len(df_suffixes := _flat_suffixes(objectpath)) == 1 and (
|
114
118
|
df_suffix := df_suffixes.pop()
|
115
119
|
) in set(PYARROW_SUFFIXES).union(POLARS_SUFFIXES):
|
@@ -0,0 +1,52 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from functools import cached_property
|
4
|
+
from typing import TYPE_CHECKING
|
5
|
+
|
6
|
+
from ._anndata_accessor import AnnDataAccessor
|
7
|
+
|
8
|
+
if TYPE_CHECKING:
|
9
|
+
from zarr import Group
|
10
|
+
|
11
|
+
|
12
|
+
class _TablesAccessor:
|
13
|
+
def __init__(self, tables: Group):
|
14
|
+
self._tables = tables
|
15
|
+
|
16
|
+
def __getitem__(self, key: str) -> AnnDataAccessor:
|
17
|
+
return AnnDataAccessor(connection=None, storage=self._tables[key], filename=key)
|
18
|
+
|
19
|
+
def keys(self) -> list[str]:
|
20
|
+
return list(self._tables.keys())
|
21
|
+
|
22
|
+
def __repr__(self) -> str:
|
23
|
+
"""Description of the _TablesAccessor object."""
|
24
|
+
descr = (
|
25
|
+
f"Accessor for the SpatialData attribute tables\n with keys: {self.keys()}"
|
26
|
+
)
|
27
|
+
return descr
|
28
|
+
|
29
|
+
|
30
|
+
class SpatialDataAccessor:
|
31
|
+
"""Cloud-backed SpatialData.
|
32
|
+
|
33
|
+
For now only allows access to `tables`.
|
34
|
+
"""
|
35
|
+
|
36
|
+
def __init__(self, storage: Group, name: str):
|
37
|
+
self.storage = storage
|
38
|
+
self._name = name
|
39
|
+
|
40
|
+
@cached_property
|
41
|
+
def tables(self) -> _TablesAccessor:
|
42
|
+
"""tables of the underlying SpatialData object."""
|
43
|
+
return _TablesAccessor(self.storage["tables"])
|
44
|
+
|
45
|
+
def __repr__(self):
|
46
|
+
"""Description of the SpatialDataAccessor object."""
|
47
|
+
descr = (
|
48
|
+
"SpatialDataAccessor object"
|
49
|
+
f"\n constructed for the SpatialData object {self._name}"
|
50
|
+
f"\n with tables: {self.tables.keys()}"
|
51
|
+
)
|
52
|
+
return descr
|
lamindb/examples/__init__.py
CHANGED
@@ -3,27 +3,12 @@
|
|
3
3
|
.. autosummary::
|
4
4
|
:toctree: .
|
5
5
|
|
6
|
-
ingest_mini_immuno_datasets
|
7
6
|
schemas
|
7
|
+
datasets
|
8
8
|
cellxgene
|
9
|
+
croissant
|
9
10
|
|
10
11
|
"""
|
11
12
|
|
12
|
-
from . import schemas
|
13
|
+
from . import croissant, datasets, schemas
|
13
14
|
from .cellxgene import _cellxgene
|
14
|
-
|
15
|
-
|
16
|
-
def ingest_mini_immuno_datasets():
|
17
|
-
"""Ingest mini immuno datasets.
|
18
|
-
|
19
|
-
.. literalinclude:: scripts/ingest_mini_immuno_datasets.py
|
20
|
-
:language: python
|
21
|
-
"""
|
22
|
-
import sys
|
23
|
-
from pathlib import Path
|
24
|
-
|
25
|
-
docs_path = Path(__file__).parent.parent.parent / "docs" / "scripts"
|
26
|
-
if str(docs_path) not in sys.path:
|
27
|
-
sys.path.append(str(docs_path))
|
28
|
-
|
29
|
-
import ingest_mini_immuno_datasets # noqa
|
@@ -1,12 +1,16 @@
|
|
1
|
-
from
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from typing import TYPE_CHECKING, Collection, Literal, NamedTuple
|
2
4
|
|
3
5
|
import pandas as pd
|
4
6
|
from lamindb_setup.core.upath import UPath
|
5
7
|
|
6
|
-
from lamindb.base.types import FieldAttr
|
7
|
-
from lamindb.models import Feature, Schema, SQLRecord, ULabel
|
8
8
|
from lamindb.models._from_values import _format_values
|
9
9
|
|
10
|
+
if TYPE_CHECKING:
|
11
|
+
from lamindb.base.types import FieldAttr
|
12
|
+
from lamindb.models import Schema, SQLRecord
|
13
|
+
|
10
14
|
CELLxGENESchemaVersions = Literal["4.0.0", "5.0.0", "5.1.0", "5.2.0", "5.3.0"]
|
11
15
|
FieldType = Literal["ontology_id", "name"]
|
12
16
|
|
@@ -25,6 +29,8 @@ def save_cxg_defaults() -> None:
|
|
25
29
|
"""
|
26
30
|
import bionty as bt
|
27
31
|
|
32
|
+
from lamindb.models import ULabel
|
33
|
+
|
28
34
|
# "normal" in Disease
|
29
35
|
normal = bt.Phenotype.from_source(
|
30
36
|
ontology_id="PATO:0000461",
|
@@ -135,6 +141,8 @@ def get_cxg_schema(
|
|
135
141
|
"""
|
136
142
|
import bionty as bt
|
137
143
|
|
144
|
+
from lamindb.models import Feature, Schema, ULabel
|
145
|
+
|
138
146
|
class CategorySpec(NamedTuple):
|
139
147
|
field: str | FieldAttr
|
140
148
|
default: str | None
|
@@ -0,0 +1,44 @@
|
|
1
|
+
"""Example Croissant files.
|
2
|
+
|
3
|
+
Examples for MLCommons Croissant files, which are used to store metadata about datasets.
|
4
|
+
"""
|
5
|
+
|
6
|
+
import json
|
7
|
+
from pathlib import Path
|
8
|
+
|
9
|
+
|
10
|
+
def mini_immuno(n_files: int = 1) -> list[Path]:
|
11
|
+
"""Return paths to the mini immuno dataset and its metadata as a Croissant file.
|
12
|
+
|
13
|
+
Args:
|
14
|
+
n_files: Number of files inside the croissant file. Default is 1.
|
15
|
+
"""
|
16
|
+
from ..datasets import file_mini_csv
|
17
|
+
from ..datasets.mini_immuno import get_dataset1
|
18
|
+
|
19
|
+
adata = get_dataset1(otype="AnnData")
|
20
|
+
dataset1_path = Path("mini_immuno.anndata.zarr")
|
21
|
+
adata.write_zarr(dataset1_path)
|
22
|
+
orig_croissant_path = (
|
23
|
+
Path(__file__).parent / "mini_immuno.anndata.zarr_metadata.json"
|
24
|
+
)
|
25
|
+
with open(orig_croissant_path, encoding="utf-8") as f:
|
26
|
+
data = json.load(f)
|
27
|
+
if n_files == 2:
|
28
|
+
dataset2_path = file_mini_csv()
|
29
|
+
data["distribution"].append(
|
30
|
+
{
|
31
|
+
"@type": "sc:FileObject",
|
32
|
+
"@id": "mini.csv",
|
33
|
+
"name": "mini.csv",
|
34
|
+
"encodingFormat": "text/csv",
|
35
|
+
}
|
36
|
+
)
|
37
|
+
croissant_path = Path("mini_immuno.anndata.zarr_metadata.json")
|
38
|
+
with open(croissant_path, "w", encoding="utf-8") as f:
|
39
|
+
json.dump(data, f, indent=2)
|
40
|
+
result: list[Path] = [croissant_path, dataset1_path]
|
41
|
+
if n_files == 1:
|
42
|
+
return result
|
43
|
+
result.append(dataset2_path)
|
44
|
+
return result
|
@@ -0,0 +1,73 @@
|
|
1
|
+
{
|
2
|
+
"@context": {
|
3
|
+
"@vocab": "https://schema.org/",
|
4
|
+
"cr": "https://mlcommons.org/croissant/",
|
5
|
+
"ml": "http://ml-schema.org/",
|
6
|
+
"sc": "https://schema.org/",
|
7
|
+
"dct": "http://purl.org/dc/terms/",
|
8
|
+
"data": "https://mlcommons.org/croissant/data/",
|
9
|
+
"rai": "https://mlcommons.org/croissant/rai/",
|
10
|
+
"format": "https://mlcommons.org/croissant/format/",
|
11
|
+
"citeAs": "https://mlcommons.org/croissant/citeAs/",
|
12
|
+
"conformsTo": "https://mlcommons.org/croissant/conformsTo/",
|
13
|
+
"@language": "en",
|
14
|
+
"repeated": "https://mlcommons.org/croissant/repeated/",
|
15
|
+
"field": "https://mlcommons.org/croissant/field/",
|
16
|
+
"examples": "https://mlcommons.org/croissant/examples/",
|
17
|
+
"recordSet": "https://mlcommons.org/croissant/recordSet/",
|
18
|
+
"fileObject": "https://mlcommons.org/croissant/fileObject/",
|
19
|
+
"fileSet": "https://mlcommons.org/croissant/fileSet/",
|
20
|
+
"source": "https://mlcommons.org/croissant/source/",
|
21
|
+
"references": "https://mlcommons.org/croissant/references/",
|
22
|
+
"key": "https://mlcommons.org/croissant/key/",
|
23
|
+
"parentField": "https://mlcommons.org/croissant/parentField/",
|
24
|
+
"isLiveDataset": "https://mlcommons.org/croissant/isLiveDataset/",
|
25
|
+
"separator": "https://mlcommons.org/croissant/separator/",
|
26
|
+
"extract": "https://mlcommons.org/croissant/extract/",
|
27
|
+
"subField": "https://mlcommons.org/croissant/subField/",
|
28
|
+
"regex": "https://mlcommons.org/croissant/regex/",
|
29
|
+
"column": "https://mlcommons.org/croissant/column/",
|
30
|
+
"path": "https://mlcommons.org/croissant/path/",
|
31
|
+
"fileProperty": "https://mlcommons.org/croissant/fileProperty/",
|
32
|
+
"md5": "https://mlcommons.org/croissant/md5/",
|
33
|
+
"jsonPath": "https://mlcommons.org/croissant/jsonPath/",
|
34
|
+
"transform": "https://mlcommons.org/croissant/transform/",
|
35
|
+
"replace": "https://mlcommons.org/croissant/replace/",
|
36
|
+
"dataType": "https://mlcommons.org/croissant/dataType/",
|
37
|
+
"includes": "https://mlcommons.org/croissant/includes/",
|
38
|
+
"excludes": "https://mlcommons.org/croissant/excludes/"
|
39
|
+
},
|
40
|
+
"@type": "Dataset",
|
41
|
+
"name": "Mini immuno dataset",
|
42
|
+
"description": "A few samples from the immunology dataset",
|
43
|
+
"url": "https://lamin.ai/laminlabs/lamindata/artifact/tCUkRcaEjTjhtozp0000",
|
44
|
+
"creator": {
|
45
|
+
"@type": "Person",
|
46
|
+
"name": "falexwolf"
|
47
|
+
},
|
48
|
+
"dateCreated": "2025-07-16",
|
49
|
+
"cr:projectName": "Mini Immuno Project",
|
50
|
+
"datePublished": "2025-07-16",
|
51
|
+
"version": "1.0",
|
52
|
+
"license": "https://creativecommons.org/licenses/by/4.0/",
|
53
|
+
"citation": "Please cite this dataset as: mini immuno (2025)",
|
54
|
+
"encodingFormat": "zarr",
|
55
|
+
"distribution": [
|
56
|
+
{
|
57
|
+
"@type": "cr:FileSet",
|
58
|
+
"@id": "mini_immuno.anndata.zarr",
|
59
|
+
"containedIn": {
|
60
|
+
"@id": "directory"
|
61
|
+
},
|
62
|
+
"encodingFormat": "zarr"
|
63
|
+
}
|
64
|
+
],
|
65
|
+
"cr:recordSet": [
|
66
|
+
{
|
67
|
+
"@type": "cr:RecordSet",
|
68
|
+
"@id": "#samples",
|
69
|
+
"name": "samples",
|
70
|
+
"description": "my sample"
|
71
|
+
}
|
72
|
+
]
|
73
|
+
}
|
@@ -1,16 +1,20 @@
|
|
1
|
-
"""The mini immuno
|
1
|
+
"""The two "mini immuno" datasets.
|
2
2
|
|
3
3
|
.. autosummary::
|
4
4
|
:toctree: .
|
5
5
|
|
6
|
-
define_features_labels
|
7
6
|
get_dataset1
|
8
7
|
get_dataset2
|
8
|
+
define_features_labels
|
9
|
+
define_mini_immuno_schema_flexible
|
10
|
+
save_mini_immuno_datasets
|
9
11
|
|
10
12
|
"""
|
11
13
|
|
12
14
|
from __future__ import annotations
|
13
15
|
|
16
|
+
import sys
|
17
|
+
from pathlib import Path
|
14
18
|
from typing import TYPE_CHECKING, Literal
|
15
19
|
|
16
20
|
import anndata as ad
|
@@ -26,9 +30,6 @@ def define_features_labels() -> None:
|
|
26
30
|
.. literalinclude:: scripts/define_mini_immuno_features_labels.py
|
27
31
|
:language: python
|
28
32
|
"""
|
29
|
-
import sys
|
30
|
-
from pathlib import Path
|
31
|
-
|
32
33
|
docs_path = Path(__file__).parent.parent.parent.parent / "docs" / "scripts"
|
33
34
|
if str(docs_path) not in sys.path:
|
34
35
|
sys.path.append(str(docs_path))
|
@@ -42,9 +43,6 @@ def define_mini_immuno_schema_flexible() -> Schema:
|
|
42
43
|
.. literalinclude:: scripts/define_mini_immuno_schema_flexible.py
|
43
44
|
:language: python
|
44
45
|
"""
|
45
|
-
import sys
|
46
|
-
from pathlib import Path
|
47
|
-
|
48
46
|
from lamindb.models import Schema
|
49
47
|
|
50
48
|
docs_path = Path(__file__).parent.parent.parent.parent / "docs" / "scripts"
|
@@ -57,6 +55,19 @@ def define_mini_immuno_schema_flexible() -> Schema:
|
|
57
55
|
return Schema.get(name="Mini immuno schema")
|
58
56
|
|
59
57
|
|
58
|
+
def save_mini_immuno_datasets():
|
59
|
+
"""Save the two "mini immuno" datasets.
|
60
|
+
|
61
|
+
.. literalinclude:: scripts/save_mini_immuno_datasets.py
|
62
|
+
:language: python
|
63
|
+
"""
|
64
|
+
docs_path = Path(__file__).parent.parent.parent.parent / "docs" / "scripts"
|
65
|
+
if str(docs_path) not in sys.path:
|
66
|
+
sys.path.append(str(docs_path))
|
67
|
+
|
68
|
+
import save_mini_immuno_datasets # noqa
|
69
|
+
|
70
|
+
|
60
71
|
def get_dataset1(
|
61
72
|
otype: Literal["DataFrame", "AnnData"] = "DataFrame",
|
62
73
|
gene_symbols_in_index: bool = False,
|
@@ -1,4 +1,12 @@
|
|
1
|
-
from
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import importlib
|
4
|
+
import sys
|
5
|
+
from pathlib import Path
|
6
|
+
from typing import TYPE_CHECKING
|
7
|
+
|
8
|
+
if TYPE_CHECKING:
|
9
|
+
from ... import Schema
|
2
10
|
|
3
11
|
|
4
12
|
def anndata_ensembl_gene_ids_and_valid_features_in_obs() -> Schema:
|
@@ -7,19 +15,21 @@ def anndata_ensembl_gene_ids_and_valid_features_in_obs() -> Schema:
|
|
7
15
|
.. literalinclude:: scripts/define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs.py
|
8
16
|
:language: python
|
9
17
|
"""
|
10
|
-
import
|
11
|
-
from pathlib import Path
|
18
|
+
from ... import Schema
|
12
19
|
|
13
20
|
docs_path = Path(__file__).parent.parent.parent.parent / "docs" / "scripts"
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
21
|
+
if str(docs_path) not in sys.path:
|
22
|
+
sys.path.append(str(docs_path))
|
23
|
+
|
24
|
+
try:
|
25
|
+
return Schema.get(name="anndata_ensembl_gene_ids_and_valid_features_in_obs")
|
26
|
+
except Schema.DoesNotExist:
|
27
|
+
import define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs # noqa
|
28
|
+
|
29
|
+
try:
|
30
|
+
return Schema.get(name="anndata_ensembl_gene_ids_and_valid_features_in_obs")
|
31
|
+
except Schema.DoesNotExist:
|
32
|
+
importlib.reload(
|
33
|
+
define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs
|
34
|
+
)
|
35
|
+
return Schema.get(name="anndata_ensembl_gene_ids_and_valid_features_in_obs")
|
@@ -1,19 +1,33 @@
|
|
1
|
-
from
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import importlib
|
4
|
+
import sys
|
5
|
+
from pathlib import Path
|
6
|
+
from typing import TYPE_CHECKING
|
7
|
+
|
8
|
+
if TYPE_CHECKING:
|
9
|
+
from ... import Schema
|
2
10
|
|
3
11
|
|
4
12
|
def valid_features() -> Schema:
|
5
13
|
"""Return a schema for an AnnData with Ensembl gene IDs and valid features in obs.
|
6
14
|
|
7
|
-
.. literalinclude:: scripts/
|
15
|
+
.. literalinclude:: scripts/define_valid_features.py
|
8
16
|
:language: python
|
9
17
|
"""
|
10
|
-
import
|
11
|
-
from pathlib import Path
|
18
|
+
from ... import Schema
|
12
19
|
|
13
20
|
docs_path = Path(__file__).parent.parent.parent.parent / "docs" / "scripts"
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
21
|
+
if str(docs_path) not in sys.path:
|
22
|
+
sys.path.append(str(docs_path))
|
23
|
+
|
24
|
+
try:
|
25
|
+
return Schema.get(name="valid_features")
|
26
|
+
except Schema.DoesNotExist:
|
27
|
+
try:
|
28
|
+
import define_valid_features # noqa
|
18
29
|
|
19
|
-
|
30
|
+
return Schema.get(name="valid_features")
|
31
|
+
except Schema.DoesNotExist:
|
32
|
+
importlib.reload(define_valid_features)
|
33
|
+
return Schema.get(name="valid_features")
|
lamindb/integrations/__init__.py
CHANGED
@@ -0,0 +1,122 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import json
|
4
|
+
from pathlib import Path
|
5
|
+
from typing import TYPE_CHECKING, Any
|
6
|
+
|
7
|
+
if TYPE_CHECKING:
|
8
|
+
import lamindb as ln
|
9
|
+
|
10
|
+
|
11
|
+
def curate_from_croissant(
|
12
|
+
croissant_data: str | Path | dict[str, Any],
|
13
|
+
run: ln.Run | None = None,
|
14
|
+
) -> ln.Artifact | ln.Collection:
|
15
|
+
"""Create annotated artifacts from a CroissantML file.
|
16
|
+
|
17
|
+
Returns a collection if multiple files are found in `croissant_data`, otherwise a single artifact.
|
18
|
+
|
19
|
+
Args:
|
20
|
+
croissant_data: Path to CroissantML JSON file or dictionary.
|
21
|
+
|
22
|
+
Example:
|
23
|
+
|
24
|
+
::
|
25
|
+
|
26
|
+
artifact = ln.integrations.curate_from_croissant("dataset_metadata.json")
|
27
|
+
"""
|
28
|
+
import lamindb as ln
|
29
|
+
|
30
|
+
# Load CroissantML data
|
31
|
+
if isinstance(croissant_data, (str, Path)):
|
32
|
+
if not Path(croissant_data).exists():
|
33
|
+
raise FileNotFoundError(f"File not found: {croissant_data}")
|
34
|
+
with open(croissant_data, encoding="utf-8") as f:
|
35
|
+
data = json.load(f)
|
36
|
+
elif isinstance(croissant_data, dict):
|
37
|
+
data = croissant_data
|
38
|
+
else:
|
39
|
+
raise ValueError(
|
40
|
+
"croissant_data must be a file path, JSON string, or dictionary"
|
41
|
+
)
|
42
|
+
|
43
|
+
# Validate basic structure
|
44
|
+
if data.get("@type") != "Dataset":
|
45
|
+
raise ValueError("CroissantML @type must be 'Dataset'")
|
46
|
+
|
47
|
+
if "name" not in data:
|
48
|
+
raise ValueError("CroissantML must have a 'name' field")
|
49
|
+
|
50
|
+
# Extract basic metadata
|
51
|
+
dataset_name = data["name"]
|
52
|
+
description = data.get("description", "")
|
53
|
+
version = data.get("version", "1.0")
|
54
|
+
license_info = data.get("license", "")
|
55
|
+
project_name = data.get("cr:projectName", "")
|
56
|
+
|
57
|
+
# Create license feature and label if license info exists
|
58
|
+
license_label = None
|
59
|
+
if license_info:
|
60
|
+
license_label_type = ln.ULabel.filter(name="License", is_type=True).first()
|
61
|
+
if not license_label_type:
|
62
|
+
license_label_type = ln.ULabel(name="License", is_type=True).save()
|
63
|
+
license_label = ln.ULabel.filter(name=license_info).first()
|
64
|
+
if not license_label:
|
65
|
+
license_label = ln.ULabel(
|
66
|
+
name=license_info,
|
67
|
+
description="Dataset license",
|
68
|
+
type=license_label_type,
|
69
|
+
).save()
|
70
|
+
project_label = None
|
71
|
+
if project_name:
|
72
|
+
project_label = ln.Project.filter(name=project_name).first()
|
73
|
+
if not project_label:
|
74
|
+
project_label = ln.Project(name=project_name).save()
|
75
|
+
|
76
|
+
# Extract file distributions
|
77
|
+
artifacts = []
|
78
|
+
file_distributions = data.get("distribution", [])
|
79
|
+
if not file_distributions:
|
80
|
+
raise ValueError("No file distributions found in croissant data")
|
81
|
+
for dist in file_distributions:
|
82
|
+
file_id = dist.get("@id", "")
|
83
|
+
if Path(file_id).exists():
|
84
|
+
file_path = file_id
|
85
|
+
else:
|
86
|
+
content_url = dist.get("contentUrl", "")
|
87
|
+
file_path = content_url or data.get("url", "")
|
88
|
+
if not file_path:
|
89
|
+
raise ValueError(
|
90
|
+
f"No valid file path found in croissant distribution: {dist}"
|
91
|
+
)
|
92
|
+
if len(file_distributions) == 1:
|
93
|
+
artifact_description = f"{dataset_name}"
|
94
|
+
if file_id != dataset_name:
|
95
|
+
artifact_description += f" ({file_id})"
|
96
|
+
artifact_description += f" - {description}"
|
97
|
+
else:
|
98
|
+
artifact_description = f"{file_id}"
|
99
|
+
artifact = ln.Artifact( # type: ignore
|
100
|
+
file_path,
|
101
|
+
description=artifact_description,
|
102
|
+
version=version,
|
103
|
+
kind="dataset",
|
104
|
+
run=run,
|
105
|
+
).save()
|
106
|
+
if license_label:
|
107
|
+
artifact.ulabels.add(license_label)
|
108
|
+
if project_label:
|
109
|
+
artifact.projects.add(project_label)
|
110
|
+
artifacts.append(artifact)
|
111
|
+
|
112
|
+
if len(artifacts) == 1:
|
113
|
+
return artifacts[0]
|
114
|
+
else:
|
115
|
+
collection = ln.Collection( # type: ignore
|
116
|
+
artifacts, key=dataset_name, description=description, version=version
|
117
|
+
).save()
|
118
|
+
if license_label:
|
119
|
+
collection.ulabels.add(license_label)
|
120
|
+
if project_label:
|
121
|
+
collection.projects.add(project_label)
|
122
|
+
return collection
|
@@ -28,21 +28,17 @@ def save_vitessce_config(
|
|
28
28
|
If the `VitessceConfig` object references multiple artifacts, automatically
|
29
29
|
creates a `Collection` and displays the "Vitessce button" next to it.
|
30
30
|
|
31
|
+
The `VitessceConfig` artifact has `.suffix = ".vitessce.json"` and `.kind = "__lamindb_config__"`,
|
32
|
+
which is by default hidden on the hub UI.
|
33
|
+
|
31
34
|
Guide: :doc:`docs:vitessce`.
|
32
35
|
|
33
36
|
Args:
|
34
37
|
vitessce_config: A `VitessceConfig` object.
|
35
|
-
key: A key for the `VitessceConfig`
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
.. versionchanged:: 0.76.12
|
41
|
-
Now assumes `vitessce-python >= 3.4.0`, which allows passing artifacts within `VitessceConfig`.
|
42
|
-
.. versionchanged:: 0.75.1
|
43
|
-
Now displays the "Vitessce button" on the hub next to the dataset. It additionally keeps displaying it next to the configuration file.
|
44
|
-
.. versionchanged:: 0.70.2
|
45
|
-
No longer saves the dataset. It only saves the `VitessceConfig` object.
|
38
|
+
key: A `key` for the `VitessceConfig` artifact.
|
39
|
+
description: A `description` for the `VitessceConfig` artifact. Is additionally
|
40
|
+
used as `key` for a `Collection` in case the `VitessceConfig` object
|
41
|
+
references multiple artifacts.
|
46
42
|
"""
|
47
43
|
# can only import here because vitessce is not a dependency
|
48
44
|
from vitessce import VitessceConfig
|
@@ -73,6 +69,8 @@ def save_vitessce_config(
|
|
73
69
|
if len(dataset_artifacts) > 1:
|
74
70
|
# if we have more datasets, we should create a collection
|
75
71
|
# and attach an action to the collection
|
72
|
+
# conscious use of description for key, see here
|
73
|
+
# https://github.com/laminlabs/lamindb/pull/2997
|
76
74
|
collection = Collection(dataset_artifacts, key=description).save()
|
77
75
|
|
78
76
|
# create a JSON export
|
@@ -80,7 +78,11 @@ def save_vitessce_config(
|
|
80
78
|
with open(config_file_local_path, "w") as file:
|
81
79
|
json.dump(vc_dict, file)
|
82
80
|
vitessce_config_artifact = Artifact(
|
83
|
-
config_file_local_path,
|
81
|
+
config_file_local_path,
|
82
|
+
key=key,
|
83
|
+
description=description,
|
84
|
+
run=run,
|
85
|
+
kind="__lamindb_config__",
|
84
86
|
).save()
|
85
87
|
slug = ln_setup.settings.instance.slug
|
86
88
|
logger.important(
|