lamindb 1.3.2__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff compares the content of publicly available package versions as they appear in their respective public registries and is provided for informational purposes only.
- lamindb/__init__.py +52 -36
- lamindb/_finish.py +17 -10
- lamindb/_tracked.py +1 -1
- lamindb/base/__init__.py +3 -1
- lamindb/base/fields.py +40 -22
- lamindb/base/ids.py +1 -94
- lamindb/base/types.py +2 -0
- lamindb/base/uids.py +117 -0
- lamindb/core/_context.py +216 -133
- lamindb/core/_settings.py +38 -25
- lamindb/core/datasets/__init__.py +11 -4
- lamindb/core/datasets/_core.py +5 -5
- lamindb/core/datasets/_small.py +0 -93
- lamindb/core/datasets/mini_immuno.py +172 -0
- lamindb/core/loaders.py +1 -1
- lamindb/core/storage/_backed_access.py +100 -6
- lamindb/core/storage/_polars_lazy_df.py +51 -0
- lamindb/core/storage/_pyarrow_dataset.py +15 -30
- lamindb/core/storage/objects.py +6 -0
- lamindb/core/subsettings/__init__.py +2 -0
- lamindb/core/subsettings/_annotation_settings.py +11 -0
- lamindb/curators/__init__.py +7 -3559
- lamindb/curators/_legacy.py +2056 -0
- lamindb/curators/core.py +1546 -0
- lamindb/errors.py +11 -0
- lamindb/examples/__init__.py +27 -0
- lamindb/examples/schemas/__init__.py +12 -0
- lamindb/examples/schemas/_anndata.py +25 -0
- lamindb/examples/schemas/_simple.py +19 -0
- lamindb/integrations/_vitessce.py +8 -5
- lamindb/migrations/0091_alter_featurevalue_options_alter_space_options_and_more.py +24 -0
- lamindb/migrations/0092_alter_artifactfeaturevalue_artifact_and_more.py +75 -0
- lamindb/models/__init__.py +12 -2
- lamindb/models/_describe.py +21 -4
- lamindb/models/_feature_manager.py +384 -301
- lamindb/models/_from_values.py +1 -1
- lamindb/models/_is_versioned.py +5 -15
- lamindb/models/_label_manager.py +8 -2
- lamindb/models/artifact.py +354 -177
- lamindb/models/artifact_set.py +122 -0
- lamindb/models/can_curate.py +4 -1
- lamindb/models/collection.py +79 -56
- lamindb/models/core.py +1 -1
- lamindb/models/feature.py +78 -47
- lamindb/models/has_parents.py +24 -9
- lamindb/models/project.py +3 -3
- lamindb/models/query_manager.py +221 -22
- lamindb/models/query_set.py +251 -206
- lamindb/models/record.py +211 -344
- lamindb/models/run.py +59 -5
- lamindb/models/save.py +9 -5
- lamindb/models/schema.py +673 -196
- lamindb/models/transform.py +5 -14
- lamindb/models/ulabel.py +8 -5
- {lamindb-1.3.2.dist-info → lamindb-1.5.0.dist-info}/METADATA +8 -7
- lamindb-1.5.0.dist-info/RECORD +108 -0
- lamindb-1.3.2.dist-info/RECORD +0 -95
- {lamindb-1.3.2.dist-info → lamindb-1.5.0.dist-info}/LICENSE +0 -0
- {lamindb-1.3.2.dist-info → lamindb-1.5.0.dist-info}/WHEEL +0 -0
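Headline changes visible from the file list alone: the monolithic `curators/__init__.py` (−3559 lines) is split into `curators/core.py` and `curators/_legacy.py`; a new `examples` package and a `mini_immuno` dataset module are added; most of `base/ids.py` moves into a new `base/uids.py`; storage gains a polars backend (`_polars_lazy_df.py`) alongside pyarrow; and two new migrations (0091, 0092) ship with the release. The per-file diffs below cover a subset of these files.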
lamindb/core/_settings.py
CHANGED
@@ -9,6 +9,7 @@ from lamindb_setup._set_managed_storage import set_managed_storage
 from lamindb_setup.core._settings import settings as setup_settings
 from lamindb_setup.core._settings_instance import sanitize_git_repo_url
 
+from .subsettings._annotation_settings import AnnotationSettings, annotation_settings
 from .subsettings._creation_settings import CreationSettings, creation_settings
 
 if TYPE_CHECKING:
@@ -34,13 +35,13 @@ VERBOSITY_TO_STR: dict[int, str] = dict(
 class Settings:
     """Settings.
 
-    Use
+    Use `lamindb.settings` instead of instantiating this class yourself.
     """
 
-    def __init__(self
+    def __init__(self):
         self._verbosity_int: int = 1  # warning-level logging
         logger.set_verbosity(self._verbosity_int)
-        self._sync_git_repo: str | None =
+        self._sync_git_repo: str | None = None
 
     @property
     def creation(self) -> CreationSettings:
@@ -51,6 +52,15 @@ class Settings:
         """
         return creation_settings
 
+    @property
+    def annotation(self) -> AnnotationSettings:
+        """Artifact annotation settings.
+
+        For example, `ln.settings.creation.search_names = False` will disable
+        searching for records with similar names during creation.
+        """
+        return annotation_settings
+
     track_run_inputs: bool = True
     """Track files as input upon `.load()`, `.cache()` and `.open()`.
 
@@ -85,13 +95,18 @@ class Settings:
 
         Provide the full git repo URL.
         """
-
+        if self._sync_git_repo is not None:
+            return self._sync_git_repo
+        elif os.environ.get("LAMINDB_MULTI_INSTANCE") == "true":
+            return None
+        else:
+            return setup_settings.instance.git_repo
 
     @sync_git_repo.setter
     def sync_git_repo(self, value) -> None:
         """Sync transforms with scripts in git repository.
 
-        For example: `ln.sync_git_repo = https://github.com/laminlabs/redun-lamin`
+        For example: `ln.settings.sync_git_repo = https://github.com/laminlabs/redun-lamin`
         """
         self._sync_git_repo = sanitize_git_repo_url(value)
         if not self._sync_git_repo.startswith("https://"):  # pragma: nocover
@@ -99,28 +114,31 @@ class Settings:
 
     @property
     def storage(self) -> StorageSettings:
-        """
+        """Current default storage location for writes.
 
         Examples:
 
-
-
+            Retrieve the storage settings::
+
+                ln.settings.storage
+                #> StorageSettings(root='s3://my-bucket')
 
-
-            UPath('s3://my-bucket')
+            Retrieve the storage root::
 
-
-
+                ln.settings.storage.root
+                #> UPath('s3://my-bucket')
 
-
+            You can write artifacts to other storage locations by switching the current default storage location::
 
-
+                ln.settings.storage = "s3://some-bucket"
 
-
-
-
-
-
+            You can also pass additional fsspec kwargs via::
+
+                kwargs = dict(
+                    profile="some_profile",  # fsspec arg
+                    cache_regions=True  # fsspec arg for s3
+                )
+                ln.settings.storage = "s3://some-bucket", kwargs
         """
         return self._storage_settings
 
@@ -174,9 +192,4 @@ class Settings:
         logger.set_verbosity(verbosity_int)
 
 
-
-    git_repo = None
-else:
-    git_repo = setup_settings.instance.git_repo
-
-settings = Settings(git_repo=git_repo)
+settings = Settings()
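Taken together, these hunks add an `annotation` sub-settings namespace and turn `sync_git_repo` into a lazily resolved property, which is why the module-level `git_repo` plumbing disappears. A minimal usage sketch, assuming a connected lamindb instance (only names that appear in the diff above are used):

import lamindb as ln

# new in 1.5.0: annotation sub-settings, analogous to ln.settings.creation
ln.settings.annotation  # -> AnnotationSettings

# sync_git_repo now resolves lazily: an explicitly set value wins,
# LAMINDB_MULTI_INSTANCE="true" yields None, and otherwise the value
# configured on the instance via lamindb_setup is returned
ln.settings.sync_git_repo = "https://github.com/laminlabs/redun-lamin"
assert ln.settings.sync_git_repo.startswith("https://")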
lamindb/core/datasets/__init__.py
CHANGED
@@ -1,12 +1,17 @@
 """Test datasets.
 
+The mini immuno dataset.
+
+.. autosummary::
+   :toctree: .
+
+   mini_immuno
+
 Small in-memory datasets.
 
 .. autosummary::
    :toctree: .
 
-   small_dataset1
-   small_dataset2
    anndata_with_obs
 
 Files.
@@ -59,6 +64,7 @@ Other.
    fake_bio_notebook_titles
 """
 
+from . import mini_immuno
 from ._core import (
     anndata_file_pbmc68k_test,
     anndata_human_immune_cells,
@@ -88,7 +94,8 @@ from ._core import (
 from ._fake import fake_bio_notebook_titles
 from ._small import (
     anndata_with_obs,
-    small_dataset1,
-    small_dataset2,
     small_dataset3_cellxgene,
 )
+
+small_dataset1 = mini_immuno.get_dataset1  # backward compat
+small_dataset2 = mini_immuno.get_dataset2  # backward compat
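Because `small_dataset1` and `small_dataset2` survive only as aliases of the new `mini_immuno` accessors, old and new call sites return the same data. A sketch, assuming a working lamindb installation:

from lamindb.core import datasets

df_old = datasets.small_dataset1(otype="DataFrame")  # backward-compat alias
df_new = datasets.mini_immuno.get_dataset1(otype="DataFrame")
assert df_old.equals(df_new)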
lamindb/core/datasets/_core.py
CHANGED
@@ -322,8 +322,6 @@ def anndata_human_immune_cells(
 
     import lamindb as ln
 
-    verbosity = ln.settings.verbosity
-    ln.settings.verbosity = "error"
     ln.save(
         bt.Gene.from_values(
             adata.var.index, field="ensembl_gene_id", organism="human"
@@ -339,7 +337,6 @@ def anndata_human_immune_cells(
     ln.Feature(name="donor", dtype=[ln.ULabel]).save()  # type: ignore
     bt.ExperimentalFactor.from_source(ontology_id="EFO:0008913").save()
     ln.save([ln.ULabel(name=name) for name in adata.obs.donor.unique()])
-    ln.settings.verbosity = verbosity
     return adata
 
 
@@ -560,11 +557,14 @@ def spatialdata_blobs() -> SpatialData:
     from spatialdata.datasets import blobs
 
     sdata = blobs()
-    sdata.attrs["
-        "assay": "Visium Spatial Gene Expression",
+    sdata.attrs["bio"] = {
         "disease": "Alzheimer disease",
         "developmental_stage": "adult stage",
     }
+    sdata.attrs["tech"] = {
+        "assay": "Visium Spatial Gene Expression",
+    }
+    sdata.attrs["random_int"] = 20
     sdata.tables["table"].var.index = [
         "ENSG00000139618",  # BRCA2
         "ENSG00000157764",  # BRAF
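The `spatialdata_blobs` change restructures flat metadata into separate biological and technical namespaces plus a scalar. After the hunk above, `sdata.attrs` has this shape (values taken verbatim from the diff):

# resulting sdata.attrs layout
{
    "bio": {
        "disease": "Alzheimer disease",
        "developmental_stage": "adult stage",
    },
    "tech": {
        "assay": "Visium Spatial Gene Expression",
    },
    "random_int": 20,
}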
lamindb/core/datasets/_small.py
CHANGED
@@ -7,99 +7,6 @@ import numpy as np
 import pandas as pd
 
 
-def small_dataset1(
-    otype: Literal["DataFrame", "AnnData"] = "DataFrame",
-    gene_symbols_in_index: bool = False,
-    with_typo: bool = False,
-    with_cell_type_synonym: bool = False,
-    with_cell_type_typo: bool = False,
-) -> pd.DataFrame | ad.AnnData:
-    # define the data in the dataset
-    # it's a mix of numerical measurements and observation-level metadata
-    ifng = "IFNJ" if with_typo else "IFNG"
-    if gene_symbols_in_index:
-        var_ids = ["CD8A", "CD4", "CD14"]
-    else:
-        var_ids = ["ENSG00000153563", "ENSG00000010610", "ENSG00000170458"]
-    abt_cell = (
-        "CD8-pos alpha-beta T cell"
-        if with_cell_type_typo
-        else "CD8-positive, alpha-beta T cell"
-    )
-    dataset_dict = {
-        var_ids[0]: [1, 2, 3],
-        var_ids[1]: [3, 4, 5],
-        var_ids[2]: [5, 6, 7],
-        "perturbation": pd.Categorical(["DMSO", ifng, "DMSO"]),
-        "sample_note": ["was ok", "looks naah", "pretty! 🤩"],
-        "cell_type_by_expert": pd.Categorical(
-            ["B-cell" if with_cell_type_synonym else "B cell", abt_cell, abt_cell]
-        ),
-        "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
-        "assay_oid": pd.Categorical(["EFO:0008913", "EFO:0008913", "EFO:0008913"]),
-        "concentration": ["0.1%", "200 nM", "0.1%"],
-        "treatment_time_h": [24, 24, 6],
-        "donor": ["D0001", "D0002", None],
-    }
-    # define the dataset-level metadata
-    metadata = {
-        "temperature": 21.6,
-        "study": "Candidate marker study 1",
-        "date_of_study": "2024-12-01",
-        "study_note": "We had a great time performing this study and the results look compelling.",
-    }
-    # the dataset as DataFrame
-    dataset_df = pd.DataFrame(dataset_dict, index=["sample1", "sample2", "sample3"])
-    if otype == "DataFrame":
-        for key, value in metadata.items():
-            dataset_df.attrs[key] = value
-        return dataset_df
-    else:
-        dataset_ad = ad.AnnData(
-            dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata
-        )
-        return dataset_ad
-
-
-def small_dataset2(
-    otype: Literal["DataFrame", "AnnData"],
-    gene_symbols_in_index: bool = False,
-) -> pd.DataFrame | ad.AnnData:
-    if gene_symbols_in_index:
-        var_ids = ["CD8A", "CD4", "CD38"]
-    else:
-        var_ids = ["ENSG00000153563", "ENSG00000010610", "ENSG00000004468"]
-    dataset_dict = {
-        var_ids[0]: [2, 3, 3],
-        var_ids[1]: [3, 4, 5],
-        var_ids[2]: [4, 2, 3],
-        "perturbation": pd.Categorical(["DMSO", "IFNG", "IFNG"]),
-        "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
-    }
-    metadata = {
-        "temperature": 22.6,
-        "study": "Candidate marker study 2",
-        "date_of_study": "2025-02-13",
-    }
-    dataset_df = pd.DataFrame(
-        dataset_dict,
-        index=["sample4", "sample5", "sample6"],
-    )
-    ad.AnnData(
-        dataset_df[var_ids],
-        obs=dataset_df[["perturbation", "cell_type_by_model"]],
-    )
-    if otype == "DataFrame":
-        for key, value in metadata.items():
-            dataset_df.attrs[key] = value
-        return dataset_df
-    else:
-        dataset_ad = ad.AnnData(
-            dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata
-        )
-        return dataset_ad
-
-
 def small_dataset3_cellxgene(
     otype: Literal["DataFrame", "AnnData"] = "AnnData",
 ) -> tuple[pd.DataFrame, dict[str, Any]] | ad.AnnData:
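The two functions removed here are not dropped from the API: they reappear in the new `mini_immuno` module below as `get_dataset1` and `get_dataset2`, extended with flags that deliberately inject curation errors, and they remain importable under their old names via the aliases added in `datasets/__init__.py` above.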
lamindb/core/datasets/mini_immuno.py
ADDED
@@ -0,0 +1,172 @@
+"""The mini immuno dataset.
+
+.. autosummary::
+   :toctree: .
+
+   define_features_labels
+   get_dataset1
+   get_dataset2
+
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Literal
+
+import anndata as ad
+import pandas as pd
+
+if TYPE_CHECKING:
+    from lamindb.models import Schema
+
+
+def define_features_labels() -> None:
+    """Features & labels to validate the mini immuno datasets.
+
+    .. literalinclude:: scripts/define_mini_immuno_features_labels.py
+       :language: python
+    """
+    import sys
+    from pathlib import Path
+
+    docs_path = Path(__file__).parent.parent.parent.parent / "docs" / "scripts"
+    if str(docs_path) not in sys.path:
+        sys.path.append(str(docs_path))
+
+    import define_mini_immuno_features_labels  # noqa
+
+
+def define_mini_immuno_schema_flexible() -> Schema:
+    """Features & labels to validate the mini immuno datasets.
+
+    .. literalinclude:: scripts/define_mini_immuno_schema_flexible.py
+       :language: python
+    """
+    import sys
+    from pathlib import Path
+
+    from lamindb.models import Schema
+
+    docs_path = Path(__file__).parent.parent.parent.parent / "docs" / "scripts"
+    if str(docs_path) not in sys.path:
+        sys.path.append(str(docs_path))
+
+    define_features_labels()
+    import define_mini_immuno_schema_flexible  # noqa
+
+    return Schema.get(name="Mini immuno schema")
+
+
+def get_dataset1(
+    otype: Literal["DataFrame", "AnnData"] = "DataFrame",
+    gene_symbols_in_index: bool = False,
+    with_typo: bool = False,
+    with_cell_type_synonym: bool = False,
+    with_cell_type_typo: bool = False,
+    with_gene_typo: bool = False,
+    with_outdated_gene: bool = False,
+    with_wrong_subtype: bool = False,
+    with_index_type_mismatch: bool = False,
+) -> pd.DataFrame | ad.AnnData:
+    """A small tabular dataset measuring expression & metadata."""
+    # define the data in the dataset
+    # it's a mix of numerical measurements and observation-level metadata
+    ifng = "IFNJ" if with_typo else "IFNG"
+    thing = "ulabel_but_not_perturbation" if with_wrong_subtype else "DMSO"
+    if gene_symbols_in_index:
+        var_ids = ["CD8A", "CD4", "CD14" if not with_gene_typo else "GeneTypo"]
+    else:
+        var_ids = [
+            "ENSG00000153563",
+            "ENSG00000010610",
+            "ENSG00000170458"
+            if not with_gene_typo
+            else "GeneTypo"
+            if not with_outdated_gene
+            else "ENSG00000278198",
+        ]
+    abt_cell = (
+        "CD8-pos alpha-beta T cell"
+        if with_cell_type_typo
+        else "CD8-positive, alpha-beta T cell"
+    )
+    dataset_dict = {
+        var_ids[0]: [1, 2, 3],
+        var_ids[1]: [3, 4, 5],
+        var_ids[2]: [5, 6, 7],
+        "perturbation": pd.Categorical(["DMSO", ifng, thing]),
+        "sample_note": ["was ok", "looks naah", "pretty! 🤩"],
+        "cell_type_by_expert": pd.Categorical(
+            ["B-cell" if with_cell_type_synonym else "B cell", abt_cell, abt_cell]
+        ),
+        "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
+        "assay_oid": pd.Categorical(["EFO:0008913", "EFO:0008913", "EFO:0008913"]),
+        "concentration": ["0.1%", "200 nM", "0.1%"],
+        "treatment_time_h": [24, 24, 6],
+        "donor": ["D0001", "D0002", None],
+    }
+    # define the dataset-level metadata
+    metadata = {
+        "temperature": 21.6,
+        "experiment": "Experiment 1",
+        "date_of_study": "2024-12-01",
+        "study_note": "We had a great time performing this study and the results look compelling.",
+    }
+    # the dataset as DataFrame
+    dataset_df = pd.DataFrame(
+        dataset_dict,
+        index=["sample1", "sample2", 0]  # type: ignore
+        if with_index_type_mismatch
+        else ["sample1", "sample2", "sample3"],
+    )
+    if otype == "DataFrame":
+        for key, value in metadata.items():
+            dataset_df.attrs[key] = value
+        return dataset_df
+    else:
+        dataset_ad = ad.AnnData(
+            dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata
+        )
+        return dataset_ad
+
+
+def get_dataset2(
+    otype: Literal["DataFrame", "AnnData"],
+    gene_symbols_in_index: bool = False,
+) -> pd.DataFrame | ad.AnnData:
+    if gene_symbols_in_index:
+        var_ids = ["CD8A", "CD4", "CD38"]
+    else:
+        var_ids = ["ENSG00000153563", "ENSG00000010610", "ENSG00000004468"]
+    dataset_dict = {
+        var_ids[0]: [2, 3, 3],
+        var_ids[1]: [3, 4, 5],
+        var_ids[2]: [4, 2, 3],
+        "perturbation": pd.Categorical(["DMSO", "IFNG", "IFNG"]),
+        "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
+        "concentration": ["0.1%", "200 nM", "0.1%"],
+        "treatment_time_h": [24, 24, 6],
+        "donor": ["D0003", "D0003", "D0004"],
+    }
+    metadata = {
+        "temperature": 22.6,
+        "experiment": "Experiment 2",
+        "date_of_study": "2025-02-13",
+    }
+    dataset_df = pd.DataFrame(
+        dataset_dict,
+        index=["sample4", "sample5", "sample6"],
+    )
+    ad.AnnData(
+        dataset_df[var_ids],
+        obs=dataset_df[["perturbation", "cell_type_by_model"]],
+    )
+    if otype == "DataFrame":
+        for key, value in metadata.items():
+            dataset_df.attrs[key] = value
+        return dataset_df
+    else:
+        dataset_ad = ad.AnnData(
+            dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata
+        )
+        return dataset_ad
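The `with_*` flags on `get_dataset1` each inject one specific validation problem, which makes the module convenient for exercising curators. A minimal sketch:

from lamindb.core.datasets import mini_immuno

df = mini_immuno.get_dataset1(otype="DataFrame")     # clean DataFrame
adata = mini_immuno.get_dataset1(otype="AnnData")    # same data as AnnData
df_typo = mini_immuno.get_dataset1(with_typo=True)   # "IFNJ" instead of "IFNG"
assert (df_typo["perturbation"] == "IFNJ").any()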
lamindb/core/loaders.py
CHANGED
@@ -44,7 +44,7 @@ try:
 except ImportError:
 
     def load_zarr(storepath):  # type: ignore
-        raise ImportError("Please install zarr: pip install zarr<=2.18.4")
+        raise ImportError("Please install zarr: pip install 'zarr<=2.18.4'")
 
 
 is_run_from_ipython = getattr(builtins, "__IPYTHON__", False)
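The added quotes are a real fix: in a POSIX shell, the unquoted `<` in `pip install zarr<=2.18.4` is parsed as input redirection from a file named `=2.18.4`, so the version constraint would never reach pip.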
lamindb/core/storage/_backed_access.py
CHANGED
@@ -1,20 +1,26 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
-from
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Callable, Literal
 
 from anndata._io.specs.registry import get_spec
 
 from ._anndata_accessor import AnnDataAccessor, StorageType, registry
-from .
+from ._polars_lazy_df import POLARS_SUFFIXES, _open_polars_lazy_df
+from ._pyarrow_dataset import PYARROW_SUFFIXES, _open_pyarrow_dataset
 from ._tiledbsoma import _open_tiledbsoma
 from .paths import filepath_from_artifact
 
 if TYPE_CHECKING:
+    from collections.abc import Iterator
+
     from fsspec.core import OpenFile
+    from polars import LazyFrame as PolarsLazyFrame
     from pyarrow.dataset import Dataset as PyArrowDataset
     from tiledbsoma import Collection as SOMACollection
     from tiledbsoma import Experiment as SOMAExperiment
+    from tiledbsoma import Measurement as SOMAMeasurement
     from upath import UPath
 
     from lamindb.models.artifact import Artifact
@@ -69,10 +75,17 @@ class BackedAccessor:
 def backed_access(
     artifact_or_filepath: Artifact | UPath,
     mode: str = "r",
+    engine: Literal["pyarrow", "polars"] = "pyarrow",
    using_key: str | None = None,
    **kwargs,
 ) -> (
-    AnnDataAccessor
+    AnnDataAccessor
+    | BackedAccessor
+    | SOMACollection
+    | SOMAExperiment
+    | SOMAMeasurement
+    | PyArrowDataset
+    | Iterator[PolarsLazyFrame]
 ):
     from lamindb.models import Artifact
 
@@ -97,12 +110,15 @@ def backed_access(
         conn, storage = registry.open("h5py", objectpath, mode=mode, **kwargs)
     elif suffix == ".zarr":
         conn, storage = registry.open("zarr", objectpath, mode=mode, **kwargs)
-    elif
-
+    elif len(df_suffixes := _flat_suffixes(objectpath)) == 1 and (
+        df_suffix := df_suffixes.pop()
+    ) in set(PYARROW_SUFFIXES).union(POLARS_SUFFIXES):
+        return _open_dataframe(objectpath, df_suffix, engine, **kwargs)
     else:
         raise ValueError(
             "The object should have .h5, .hdf5, .h5ad, .zarr, .tiledbsoma suffix "
-            f"
+            f"be compatible with pyarrow.dataset.dataset or polars.scan_* functions, "
+            f"instead of being {suffix} object."
         )
 
     is_anndata = suffix == ".h5ad" or get_spec(storage).encoding_type == "anndata"
@@ -112,3 +128,81 @@ def backed_access(
         return AnnDataAccessor(conn, storage, name)
     else:
         return BackedAccessor(conn, storage)
+
+
+def _flat_suffixes(paths: UPath | list[UPath]) -> set[str]:
+    # it is assumed here that the paths exist
+    # we don't check here that the filesystem is the same
+    # but this is a requirement for pyarrow.dataset.dataset
+    path_list = []
+    if isinstance(paths, Path):
+        paths = [paths]
+    for path in paths:
+        # assume http is always a file
+        if getattr(path, "protocol", None) not in {"http", "https"} and path.is_dir():
+            path_list += [p for p in path.rglob("*") if p.suffix != ""]
+        else:
+            path_list.append(path)
+
+    suffixes = set()
+    for path in path_list:
+        path_suffixes = path.suffixes
+        # this doesn't work for externally gzipped files, REMOVE LATER
+        path_suffix = (
+            path_suffixes[-2]
+            if len(path_suffixes) > 1 and ".gz" in path_suffixes
+            else path.suffix
+        )
+        suffixes.add(path_suffix)
+    return suffixes
+
+
+def _open_dataframe(
+    paths: UPath | list[UPath],
+    suffix: str | None = None,
+    engine: Literal["pyarrow", "polars"] = "pyarrow",
+    **kwargs,
+) -> PyArrowDataset | Iterator[PolarsLazyFrame]:
+    df_suffix: str
+    if suffix is None:
+        df_suffixes = _flat_suffixes(paths)
+        if len(df_suffixes) > 1:
+            raise ValueError(
+                f"The artifacts in the collection have different file formats: {', '.join(df_suffixes)}.\n"
+                "It is not possible to open such stores with pyarrow or polars."
+            )
+        df_suffix = df_suffixes.pop()
+    else:
+        df_suffix = suffix
+
+    if engine == "pyarrow":
+        if df_suffix not in PYARROW_SUFFIXES:
+            raise ValueError(
+                f"{df_suffix} files are not supported by pyarrow, "
+                f"they should have one of these formats: {', '.join(PYARROW_SUFFIXES)}."
+            )
+        # this checks that the filesystem is the same for all paths
+        # this is a requirement of pyarrow.dataset.dataset
+        if not isinstance(paths, Path):  # is a list then
+            fs = getattr(paths[0], "fs", None)
+            for path in paths[1:]:
+                # this assumes that the filesystems are cached by fsspec
+                if getattr(path, "fs", None) is not fs:
+                    raise ValueError(
+                        "The collection has artifacts with different filesystems, "
+                        "this is not supported by pyarrow."
+                    )
+        dataframe = _open_pyarrow_dataset(paths, **kwargs)
+    elif engine == "polars":
+        if df_suffix not in POLARS_SUFFIXES:
+            raise ValueError(
+                f"{df_suffix} files are not supported by polars, "
+                f"they should have one of these formats: {', '.join(POLARS_SUFFIXES)}."
+            )
+        dataframe = _open_polars_lazy_df(paths, **kwargs)
+    else:
+        raise ValueError(
+            f"Unknown engine: {engine}. It should be 'pyarrow' or 'polars'."
+        )
+
+    return dataframe
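How the new `engine` parameter surfaces to a caller of `backed_access`, per the signatures above — a sketch with a hypothetical parquet path; the polars branch additionally requires the optional `polars` dependency:

from upath import UPath

from lamindb.core.storage._backed_access import backed_access

path = UPath("s3://my-bucket/measurements.parquet")  # hypothetical path

# the default engine returns a pyarrow.dataset.Dataset directly
dataset = backed_access(path, engine="pyarrow")

# polars instead returns a context manager yielding a polars.LazyFrame,
# since the file handles opened for scanning must stay alive until collection
with backed_access(path, engine="polars") as lazy_df:
    result = lazy_df.collect()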
lamindb/core/storage/_polars_lazy_df.py
ADDED
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+from contextlib import contextmanager
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+
+    from polars import LazyFrame as PolarsLazyFrame
+    from upath import UPath
+
+POLARS_SUFFIXES = (".parquet", ".csv", ".ndjson", ".ipc")
+
+
+@contextmanager
+def _open_polars_lazy_df(
+    paths: UPath | list[UPath], **kwargs
+) -> Iterator[PolarsLazyFrame]:
+    try:
+        import polars as pl
+    except ImportError as ie:
+        raise ImportError("Please install polars: pip install polars") from ie
+
+    scans = {
+        ".parquet": pl.scan_parquet,
+        ".csv": pl.scan_csv,
+        ".ndjson": pl.scan_ndjson,
+        ".ipc": pl.scan_ipc,
+    }
+
+    path_list = []
+    if isinstance(paths, Path):
+        paths = [paths]
+    for path in paths:
+        # assume http is always a file
+        if getattr(path, "protocol", None) not in {"http", "https"} and path.is_dir():
+            path_list += [p for p in path.rglob("*") if p.suffix != ""]
+        else:
+            path_list.append(path)
+
+    open_files = []
+
+    try:
+        for path in path_list:
+            open_files.append(path.open(mode="rb"))
+
+        yield scans[path_list[0].suffix](open_files, **kwargs)
+    finally:
+        for open_file in open_files:
+            open_file.close()
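`_open_polars_lazy_df` is written as a generator-based context manager because `pl.scan_parquet` and friends read lazily from the file objects passed in: the handles opened in the `try` block must outlive the yielded `LazyFrame` until it is collected, and the `finally` block guarantees they are closed afterwards. A usage sketch with a hypothetical local file:

from pathlib import Path

from lamindb.core.storage._polars_lazy_df import _open_polars_lazy_df

with _open_polars_lazy_df(Path("data/table.parquet")) as lf:  # hypothetical file
    df = lf.collect()  # materialize while the underlying handles are still open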