lamindb 1.0.4__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- lamindb/__init__.py +14 -5
- lamindb/_artifact.py +174 -57
- lamindb/_can_curate.py +27 -8
- lamindb/_collection.py +85 -51
- lamindb/_feature.py +177 -41
- lamindb/_finish.py +222 -81
- lamindb/_from_values.py +83 -98
- lamindb/_parents.py +4 -4
- lamindb/_query_set.py +59 -17
- lamindb/_record.py +171 -53
- lamindb/_run.py +4 -4
- lamindb/_save.py +33 -10
- lamindb/_schema.py +135 -38
- lamindb/_storage.py +1 -1
- lamindb/_tracked.py +106 -0
- lamindb/_transform.py +21 -8
- lamindb/_ulabel.py +5 -14
- lamindb/base/validation.py +2 -6
- lamindb/core/__init__.py +13 -14
- lamindb/core/_context.py +39 -36
- lamindb/core/_data.py +29 -25
- lamindb/core/_describe.py +1 -1
- lamindb/core/_django.py +1 -1
- lamindb/core/_feature_manager.py +54 -44
- lamindb/core/_label_manager.py +4 -4
- lamindb/core/_mapped_collection.py +20 -7
- lamindb/core/datasets/__init__.py +6 -1
- lamindb/core/datasets/_core.py +12 -11
- lamindb/core/datasets/_small.py +66 -20
- lamindb/core/exceptions.py +1 -90
- lamindb/core/loaders.py +7 -13
- lamindb/core/relations.py +6 -4
- lamindb/core/storage/_anndata_accessor.py +41 -0
- lamindb/core/storage/_backed_access.py +2 -2
- lamindb/core/storage/_pyarrow_dataset.py +25 -15
- lamindb/core/storage/_tiledbsoma.py +56 -12
- lamindb/core/storage/paths.py +41 -22
- lamindb/core/subsettings/_creation_settings.py +4 -16
- lamindb/curators/__init__.py +2168 -833
- lamindb/curators/_cellxgene_schemas/__init__.py +26 -0
- lamindb/curators/_cellxgene_schemas/schema_versions.yml +104 -0
- lamindb/errors.py +96 -0
- lamindb/integrations/_vitessce.py +3 -3
- lamindb/migrations/0069_squashed.py +76 -75
- lamindb/migrations/0075_lamindbv1_part5.py +4 -5
- lamindb/migrations/0082_alter_feature_dtype.py +21 -0
- lamindb/migrations/0083_alter_feature_is_type_alter_flextable_is_type_and_more.py +94 -0
- lamindb/migrations/0084_alter_schemafeature_feature_and_more.py +35 -0
- lamindb/migrations/0085_alter_feature_is_type_alter_flextable_is_type_and_more.py +63 -0
- lamindb/migrations/0086_various.py +95 -0
- lamindb/migrations/0087_rename__schemas_m2m_artifact_feature_sets_and_more.py +41 -0
- lamindb/migrations/0088_schema_components.py +273 -0
- lamindb/migrations/0088_squashed.py +4372 -0
- lamindb/models.py +423 -156
- {lamindb-1.0.4.dist-info → lamindb-1.1.0.dist-info}/METADATA +10 -7
- lamindb-1.1.0.dist-info/RECORD +95 -0
- lamindb/curators/_spatial.py +0 -528
- lamindb/migrations/0052_squashed.py +0 -1261
- lamindb/migrations/0053_alter_featureset_hash_alter_paramvalue_created_by_and_more.py +0 -57
- lamindb/migrations/0054_alter_feature_previous_runs_and_more.py +0 -35
- lamindb/migrations/0055_artifact_type_artifactparamvalue_and_more.py +0 -61
- lamindb/migrations/0056_rename_ulabel_ref_is_name_artifactulabel_label_ref_is_name_and_more.py +0 -22
- lamindb/migrations/0057_link_models_latest_report_and_others.py +0 -356
- lamindb/migrations/0058_artifact__actions_collection__actions.py +0 -22
- lamindb/migrations/0059_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -31
- lamindb/migrations/0060_alter_artifact__actions.py +0 -22
- lamindb/migrations/0061_alter_collection_meta_artifact_alter_run_environment_and_more.py +0 -45
- lamindb/migrations/0062_add_is_latest_field.py +0 -32
- lamindb/migrations/0063_populate_latest_field.py +0 -45
- lamindb/migrations/0064_alter_artifact_version_alter_collection_version_and_more.py +0 -33
- lamindb/migrations/0065_remove_collection_feature_sets_and_more.py +0 -22
- lamindb/migrations/0066_alter_artifact__feature_values_and_more.py +0 -352
- lamindb/migrations/0067_alter_featurevalue_unique_together_and_more.py +0 -20
- lamindb/migrations/0068_alter_artifactulabel_unique_together_and_more.py +0 -20
- lamindb/migrations/0069_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -1294
- lamindb-1.0.4.dist-info/RECORD +0 -102
- {lamindb-1.0.4.dist-info → lamindb-1.1.0.dist-info}/LICENSE +0 -0
- {lamindb-1.0.4.dist-info → lamindb-1.1.0.dist-info}/WHEEL +0 -0
lamindb/core/_mapped_collection.py
CHANGED
@@ -87,7 +87,7 @@ class MappedCollection:
         obs_keys: Keys from the ``.obs`` slots.
         obs_filter: Select only observations with these values for the given obs columns.
             Should be a dictionary with obs column names as keys
-            and filtering values (a string or a
+            and filtering values (a string or a list of strings) as values.
         join: `"inner"` or `"outer"` virtual joins. If ``None`` is passed,
             does not join.
         encode_labels: Encode labels into integers.
@@ -106,7 +106,7 @@ class MappedCollection:
         layers_keys: str | list[str] | None = None,
         obs_keys: str | list[str] | None = None,
         obsm_keys: str | list[str] | None = None,
-        obs_filter: dict[str, str |
+        obs_filter: dict[str, str | list[str]] | None = None,
         join: Literal["inner", "outer"] | None = "inner",
         encode_labels: bool | list[str] = True,
         unknown_label: str | dict[str, str] | None = None,
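Together, these two hunks widen `obs_filter` from a single string per column to also accept a list (or tuple) of values. A minimal usage sketch, assuming an existing collection (the collection name is hypothetical):

import lamindb as ln

collection = ln.Collection.get(name="my-scrna-collection")  # hypothetical collection
# select observations whose cell_type matches either value; a bare string still works
with collection.mapped(
    obs_keys=["cell_type"],
    obs_filter={"cell_type": ["T cell", "B cell"]},
) as dataset:
    print(len(dataset))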
@@ -184,9 +184,14 @@ class MappedCollection:
         if self.filtered:
             indices_storage_mask = None
             for obs_filter_key, obs_filter_values in obs_filter.items():
-
-
-                )
+                if isinstance(obs_filter_values, tuple):
+                    obs_filter_values = list(obs_filter_values)
+                elif not isinstance(obs_filter_values, list):
+                    obs_filter_values = [obs_filter_values]
+                obs_labels = self._get_labels(store, obs_filter_key)
+                obs_filter_mask = np.isin(obs_labels, obs_filter_values)
+                if pd.isna(obs_filter_values).any():
+                    obs_filter_mask |= pd.isna(obs_labels)
                 if indices_storage_mask is None:
                     indices_storage_mask = obs_filter_mask
                 else:
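The loop now normalizes scalars and tuples to lists and handles NaN filter values explicitly. The same logic is easy to verify in isolation; a standalone sketch (the function name is ours, not lamindb's):

import numpy as np
import pandas as pd

def normalize_filter_values(values) -> list:
    # mirror the hunk above: accept a scalar, a tuple, or a list uniformly
    if isinstance(values, tuple):
        return list(values)
    if not isinstance(values, list):
        return [values]
    return values

obs_labels = np.array(["T cell", "B cell", np.nan], dtype=object)
values = normalize_filter_values("T cell")
mask = np.isin(obs_labels, values)
if pd.isna(values).any():
    mask |= pd.isna(obs_labels)  # NaN filter values should also match NaN labels
print(mask)  # [ True False False]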
@@ -296,7 +301,7 @@ class MappedCollection:
         self.var_joint = reduce(pd.Index.intersection, self.var_list)
         if len(self.var_joint) == 0:
             raise ValueError(
-                "The provided AnnData objects don't have shared
+                "The provided AnnData objects don't have shared variables.\n"
                 "Use join='outer'."
             )
         self.var_indices = [
@@ -389,7 +394,7 @@ class MappedCollection:
         else:
             cats = None
         label_idx = self._get_obs_idx(store, obs_idx, label, cats)
-        if label in self.encoders:
+        if label in self.encoders and label_idx is not np.nan:
             label_idx = self.encoders[label][label_idx]
         out[label] = label_idx
         return out
@@ -453,6 +458,8 @@ class MappedCollection:
             label = labels[idx]
         else:
             label = labels["codes"][idx]
+            if label == -1:
+                return np.nan
         if categories is not None:
             cats = categories
         else:
@@ -589,7 +596,13 @@ class MappedCollection:
         cats = self._get_categories(storage, label_key)
         if cats is not None:
             cats = _decode(cats) if isinstance(cats[0], bytes) else cats
+            # NaN is coded as -1
+            nans = labels == -1
             labels = cats[labels]
+            # detect and replace nans
+            if nans.any():
+                labels[nans] = np.nan
+
         return labels

     def close(self):
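The last two hunks lean on the pandas/AnnData convention that categorical codes store missing values as -1; without the guard, `cats[labels]` silently wraps around and returns the last category. A quick illustration in pandas:

import numpy as np
import pandas as pd

cats = pd.Categorical(["DMSO", None, "IFNG"])
print(cats.codes)  # [ 0 -1  1]: the missing value is coded as -1

codes = np.asarray(cats.codes)
labels = np.asarray(cats.categories, dtype=object)[codes]  # -1 wraps around to "IFNG"
labels[codes == -1] = np.nan  # replace the bogus lookup with NaN, as the hunk does
print(labels)  # ['DMSO' nan 'IFNG']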
lamindb/core/datasets/__init__.py
CHANGED
@@ -85,4 +85,9 @@ from ._core import (
     schmidt22_perturbseq,
 )
 from ._fake import fake_bio_notebook_titles
-from ._small import
+from ._small import (
+    anndata_with_obs,
+    small_dataset1,
+    small_dataset2,
+    small_dataset3_cellxgene,
+)
lamindb/core/datasets/_core.py
CHANGED
@@ -18,7 +18,8 @@ if TYPE_CHECKING:
 def file_fcs() -> Path:
     """Example FCS artifact."""
     filepath, _ = urlretrieve(
-        "https://lamindb-
+        "https://lamindb-dev-datasets.s3.amazonaws.com/.lamindb/DBNEczSgBui0bbzBXMGH.fcs",
+        "example.fcs",
     )
     return Path(filepath)
@@ -48,8 +49,8 @@ def file_fcs_alpert19(populate_registries: bool = False) -> Path:  # pragma: no
             bt.CellMarker.public().inspect(std, "name").validated, "name"
         )
     )
-    ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
-    ln.Feature(name="organism", dtype=[bt.Organism]).save()
+    ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()  # type: ignore
+    ln.Feature(name="organism", dtype=[bt.Organism]).save()  # type: ignore
     ln.settings.verbosity = verbosity
     return Path(filepath)
@@ -84,8 +85,8 @@ def file_tsv_rnaseq_nfcore_salmon_merged_gene_counts(

     verbosity = ln.settings.verbosity
     ln.settings.verbosity = "error"
-    ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
-    ln.Feature(name="organism", dtype=[bt.Organism]).save()
+    ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()  # type: ignore
+    ln.Feature(name="organism", dtype=[bt.Organism]).save()  # type: ignore
     bt.ExperimentalFactor.from_source(ontology_id="EFO:0008896").save()
     ln.settings.verbosity = verbosity
@@ -207,7 +208,7 @@ def anndata_mouse_sc_lymph_node(
     # cell types
     ln.save(bt.CellType.from_values(["CL:0000115", "CL:0000738"], "ontology_id"))
     # assays
-    ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
+    ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()  # type: ignore
    bt.ExperimentalFactor.from_source(ontology_id="EFO:0008913").save()
     # genes
     validated = bt.Gene.public(organism="mouse").validate(
@@ -330,11 +331,11 @@ def anndata_human_immune_cells(
     ln.save(bt.CellType.from_values(adata.obs.cell_type, field="name"))
     ln.save(bt.ExperimentalFactor.from_values(adata.obs.assay, field="name"))
     ln.save(bt.Tissue.from_values(adata.obs.tissue, field="name"))
-    ln.Feature(name="cell_type", dtype=[bt.CellType]).save()
-    ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
-    ln.Feature(name="tissue", dtype=[bt.Tissue]).save()
-    ln.Feature(name="organism", dtype=[bt.Organism]).save()
-    ln.Feature(name="donor", dtype=[ln.ULabel]).save()
+    ln.Feature(name="cell_type", dtype=[bt.CellType]).save()  # type: ignore
+    ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()  # type: ignore
+    ln.Feature(name="tissue", dtype=[bt.Tissue]).save()  # type: ignore
+    ln.Feature(name="organism", dtype=[bt.Organism]).save()  # type: ignore
+    ln.Feature(name="donor", dtype=[ln.ULabel]).save()  # type: ignore
     bt.ExperimentalFactor.from_source(ontology_id="EFO:0008913").save()
     ln.save([ln.ULabel(name=name) for name in adata.obs.donor.unique()])
     ln.settings.verbosity = verbosity
lamindb/core/datasets/_small.py
CHANGED
@@ -8,20 +8,25 @@ import pandas as pd


 def small_dataset1(
-
+    otype: Literal["DataFrame", "AnnData"],
+    gene_symbols_in_index: bool = False,
     with_typo: bool = False,
-) ->
+) -> pd.DataFrame | ad.AnnData:
     # define the data in the dataset
     # it's a mix of numerical measurements and observation-level metadata
     ifng = "IFNJ" if with_typo else "IFNG"
+    if gene_symbols_in_index:
+        var_ids = ["CD8A", "CD4", "CD14"]
+    else:
+        var_ids = ["ENSG00000153563", "ENSG00000010610", "ENSG00000170458"]
     dataset_dict = {
-
-
-
-        "cell_medium": ["DMSO", ifng, "DMSO"],
+        var_ids[0]: [1, 2, 3],
+        var_ids[1]: [3, 4, 5],
+        var_ids[2]: [5, 6, 7],
+        "cell_medium": pd.Categorical(["DMSO", ifng, "DMSO"]),
         "sample_note": ["was ok", "looks naah", "pretty! 🤩"],
-        "cell_type_by_expert": ["B cell", "T cell", "T cell"],
-        "cell_type_by_model": ["B cell", "T cell", "T cell"],
+        "cell_type_by_expert": pd.Categorical(["B cell", "T cell", "T cell"]),
+        "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
     }
     # define the dataset-level metadata
     metadata = {
|
|
32
37
|
}
|
33
38
|
# the dataset as DataFrame
|
34
39
|
dataset_df = pd.DataFrame(dataset_dict, index=["sample1", "sample2", "sample3"])
|
35
|
-
if
|
36
|
-
|
40
|
+
if otype == "DataFrame":
|
41
|
+
for key, value in metadata.items():
|
42
|
+
dataset_df.attrs[key] = value
|
43
|
+
return dataset_df
|
37
44
|
else:
|
38
45
|
dataset_ad = ad.AnnData(
|
39
46
|
dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata
|
@@ -42,14 +49,19 @@ def small_dataset1(
|
|
42
49
|
|
43
50
|
|
44
51
|
def small_dataset2(
|
45
|
-
|
46
|
-
|
52
|
+
otype: Literal["DataFrame", "AnnData"],
|
53
|
+
gene_symbols_in_index: bool = False,
|
54
|
+
) -> pd.DataFrame | ad.AnnData:
|
55
|
+
if gene_symbols_in_index:
|
56
|
+
var_ids = ["CD8A", "CD4", "CD38"]
|
57
|
+
else:
|
58
|
+
var_ids = ["ENSG00000153563", "ENSG00000010610", "ENSG00000004468"]
|
47
59
|
dataset_dict = {
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
"cell_medium": ["DMSO", "IFNG", "IFNG"],
|
52
|
-
"cell_type_by_model": ["B cell", "T cell", "T cell"],
|
60
|
+
var_ids[0]: [2, 3, 3],
|
61
|
+
var_ids[1]: [3, 4, 5],
|
62
|
+
var_ids[2]: [4, 2, 3],
|
63
|
+
"cell_medium": pd.Categorical(["DMSO", "IFNG", "IFNG"]),
|
64
|
+
"cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
|
53
65
|
}
|
54
66
|
metadata = {
|
55
67
|
"temperature": 22.6,
|
@@ -61,11 +73,13 @@ def small_dataset2(
|
|
61
73
|
index=["sample4", "sample5", "sample6"],
|
62
74
|
)
|
63
75
|
ad.AnnData(
|
64
|
-
dataset_df[
|
76
|
+
dataset_df[var_ids],
|
65
77
|
obs=dataset_df[["cell_medium", "cell_type_by_model"]],
|
66
78
|
)
|
67
|
-
if
|
68
|
-
|
79
|
+
if otype == "DataFrame":
|
80
|
+
for key, value in metadata.items():
|
81
|
+
dataset_df.attrs[key] = value
|
82
|
+
return dataset_df
|
69
83
|
else:
|
70
84
|
dataset_ad = ad.AnnData(
|
71
85
|
dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata
|
@@ -73,6 +87,38 @@ def small_dataset2(
|
|
73
87
|
return dataset_ad
|
74
88
|
|
75
89
|
|
90
|
+
def small_dataset3_cellxgene(
|
91
|
+
otype: Literal["DataFrame", "AnnData"] = "AnnData",
|
92
|
+
) -> tuple[pd.DataFrame, dict[str, Any]] | ad.AnnData:
|
93
|
+
# TODO: consider other ids for other organisms
|
94
|
+
# "ENSMUSG00002076988"
|
95
|
+
var_ids = ["invalid_ensembl_id", "ENSG00000000419", "ENSG00000139618"]
|
96
|
+
dataset_dict = {
|
97
|
+
var_ids[0]: [2, 3, 3],
|
98
|
+
var_ids[1]: [3, 4, 5],
|
99
|
+
var_ids[2]: [4, 2, 3],
|
100
|
+
"disease_ontology_term_id": ["MONDO:0004975", "MONDO:0004980", "MONDO:0004980"],
|
101
|
+
"organism": ["human", "human", "human"],
|
102
|
+
"sex": ["female", "male", "unknown"],
|
103
|
+
"tissue": ["lungg", "lungg", "heart"],
|
104
|
+
"donor": ["-1", "1", "2"],
|
105
|
+
}
|
106
|
+
dataset_df = pd.DataFrame(
|
107
|
+
dataset_dict,
|
108
|
+
index=["barcode1", "barcode2", "barcode3"],
|
109
|
+
)
|
110
|
+
dataset_df["tissue"] = dataset_df["tissue"].astype("category")
|
111
|
+
ad.AnnData(
|
112
|
+
dataset_df[var_ids],
|
113
|
+
obs=dataset_df[[key for key in dataset_dict if key not in var_ids]],
|
114
|
+
)
|
115
|
+
if otype == "DataFrame":
|
116
|
+
return dataset_df
|
117
|
+
else:
|
118
|
+
dataset_ad = ad.AnnData(dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:])
|
119
|
+
return dataset_ad
|
120
|
+
|
121
|
+
|
76
122
|
def anndata_with_obs() -> ad.AnnData:
|
77
123
|
"""Create a mini anndata with cell_type, disease and tissue."""
|
78
124
|
import anndata as ad
|
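A brief sketch of the reworked generators (assumes lamindb 1.1.0 is installed): `otype` selects the return type, and dataset-level metadata now travels in `DataFrame.attrs` instead of being returned separately.

from lamindb.core.datasets import small_dataset1

df = small_dataset1(otype="DataFrame")
print(df.attrs)  # dataset-level metadata (keys elided in this diff)

adata = small_dataset1(otype="AnnData", gene_symbols_in_index=True)
print(adata.var_names)  # ['CD8A', 'CD4', 'CD14']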
lamindb/core/exceptions.py
CHANGED
@@ -1,90 +1 @@
-
-
-.. autosummary::
-   :toctree: .
-
-   InvalidArgument
-   DoesNotExist
-   ValidationError
-   NotebookNotSaved
-   MissingContextUID
-   UpdateContext
-   IntegrityError
-   RecordNameChangeIntegrityError
-
-"""
-
-# inheriting from SystemExit has the sole purpose of suppressing
-# the traceback - this isn't optimal but the current best solution
-# https://laminlabs.slack.com/archives/C04A0RMA0SC/p1726856875597489
-
-
-class InvalidArgument(SystemExit):
-    """Invalid method or function argument."""
-
-    pass
-
-
-class TrackNotCalled(SystemExit):
-    """`ln.track()` wasn't called."""
-
-    pass
-
-
-class NotebookNotSaved(SystemExit):
-    """Notebook wasn't saved."""
-
-    pass
-
-
-class ValidationError(SystemExit):
-    """Validation error: not mapped in registry."""
-
-    pass
-
-
-# inspired by Django's DoesNotExist
-# equivalent to SQLAlchemy's NoResultFound
-class DoesNotExist(SystemExit):
-    """No record found."""
-
-    pass
-
-
-class InconsistentKey(Exception):
-    """Inconsistent transform or artifact `key`."""
-
-    pass
-
-
-class RecordNameChangeIntegrityError(SystemExit):
-    """Custom exception for name change errors."""
-
-    pass
-
-
-# -------------------------------------------------------------------------------------
-# run context
-# -------------------------------------------------------------------------------------
-
-
-class IntegrityError(Exception):
-    """Integrity error.
-
-    For instance, it's not allowed to delete artifacts outside managed storage
-    locations.
-    """
-
-    pass
-
-
-class MissingContextUID(SystemExit):
-    """User didn't define transform settings."""
-
-    pass
-
-
-class UpdateContext(SystemExit):
-    """Transform settings require update."""
-
-    pass
+from ..errors import *  # noqa: F403 backward compat
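The module body is now a thin wildcard re-export, so imports from the old location keep resolving. A minimal check, assuming `ValidationError` moved to the new `lamindb/errors.py` unchanged:

from lamindb.errors import ValidationError as new_location
from lamindb.core.exceptions import ValidationError as old_location  # via the re-export

assert new_location is old_location  # one class, two import paths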
lamindb/core/loaders.py
CHANGED
@@ -40,7 +40,7 @@ try:
 except ImportError:

     def load_anndata_zarr(storepath):  # type: ignore
-        raise ImportError("Please install zarr: pip install zarr")
+        raise ImportError("Please install zarr: pip install zarr<=2.18.4")


 is_run_from_ipython = getattr(builtins, "__IPYTHON__", False)
@@ -109,19 +109,13 @@ def load_json(path: UPathStr) -> dict:
     return data


-def load_yaml(path: UPathStr) -> dict
+def load_yaml(path: UPathStr) -> dict:
     """Load `.yaml` to `dict`."""
-
-
-
-
-
-        return data
-    except ImportError:
-        logger.warning(
-            "Please install PyYAML (`pip install PyYAML`) to load `.yaml` files."
-        )
-        return path
+    import yaml  # type: ignore
+
+    with open(path) as f:
+        data = yaml.safe_load(f)
+    return data


 def load_image(path: UPathStr) -> None | UPathStr:
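The previously broken `load_yaml` is now a thin wrapper around `yaml.safe_load`. A quick check (assumes PyYAML is installed):

from pathlib import Path

from lamindb.core.loaders import load_yaml

Path("config.yml").write_text("run:\n  seed: 0\n")
print(load_yaml("config.yml"))  # {'run': {'seed': 0}}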
lamindb/core/relations.py
CHANGED
@@ -8,7 +8,7 @@ from lamindb_setup._connect_instance import (
 )
 from lamindb_setup.core._settings_store import instance_settings_file

-from lamindb.models import LinkORM, Record, Schema
+from lamindb.models import LinkORM, Record, Registry, Schema


 def get_schema_modules(instance: str | None) -> set[str]:
@@ -35,9 +35,11 @@ def get_schema_modules(instance: str | None) -> set[str]:
     return shared_schema_modules


+# this function here should likely be renamed
+# it maps the __get_name_with_module__() onto the actual model
 def dict_module_name_to_model_name(
-    registry:
-) -> dict[str,
+    registry: Registry, instance: str | None = None
+) -> dict[str, Registry]:
     schema_modules = get_schema_modules(instance)
     d: dict = {
         i.related_model.__get_name_with_module__(): i.related_model
@@ -92,7 +94,7 @@ def get_related_name(features_type: type[Record]) -> str:
         f"Can't create feature sets from {features_type.__name__} because it's not"
         " related to it!\nYou need to create a link model between Schema and"
         " your Record in your custom module.\nTo do so, add a"
-        " line:\
+        " line:\n_feature_sets = models.ManyToMany(Schema,"
         " related_name='mythings')\n"
     )
     return candidates[0]
lamindb/core/storage/_anndata_accessor.py
CHANGED
@@ -19,6 +19,7 @@ from fsspec.implementations.local import LocalFileSystem
 from lamin_utils import logger
 from lamindb_setup.core.upath import create_mapper, infer_filesystem
 from packaging import version
+from upath import UPath

 if TYPE_CHECKING:
     from collections.abc import Mapping
@@ -741,3 +742,43 @@ class AnnDataAccessor(_AnnDataAttrsMixin):
         return AnnDataRawAccessor(
             self.storage["raw"], None, None, self._obs_names, None, self.shape[0]
         )
+
+
+# get the number of observations in an anndata object or file fast and safely
+def _anndata_n_observations(object: UPathStr | AnnData) -> int | None:
+    if isinstance(object, AnnData):
+        return object.n_obs
+
+    try:
+        objectpath = UPath(object)
+        suffix = objectpath.suffix
+        conn_module = {".h5ad": "h5py", ".zarr": "zarr"}.get(suffix, suffix[1:])
+        conn, storage = registry.open(conn_module, objectpath, mode="r")
+    except Exception as e:
+        logger.warning(f"Could not open {object} to read n_observations: {e}")
+        return None
+
+    n_observations: int | None = None
+    try:
+        obs = storage["obs"]
+        if isinstance(obs, GroupTypes):  # type: ignore
+            if "_index" in obs.attrs:
+                elem_key = _read_attr(obs.attrs, "_index")
+            else:
+                elem_key = next(iter(obs))
+            elem = obs[elem_key]
+            if isinstance(elem, ArrayTypes):  # type: ignore
+                n_observations = elem.shape[0]
+            else:
+                # assume standard obs group
+                n_observations = elem["codes"].shape[0]
+        else:
+            n_observations = obs.shape[0]
+    except Exception as e:
+        logger.warning(f"Could not read n_observations from anndata {object}: {e}")
+    finally:
+        if hasattr(storage, "close"):
+            storage.close()
+        if hasattr(conn, "close"):
+            conn.close()
+    return n_observations
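The helper reads only the `obs` index, so it never touches the data matrix. The same trick with plain h5py on a local file (the filename is illustrative):

import h5py

with h5py.File("example.h5ad", "r") as f:  # hypothetical local file
    obs = f["obs"]
    # anndata stores the name of the obs index in the group's "_index" attribute
    index_key = obs.attrs.get("_index", next(iter(obs)))
    if isinstance(index_key, bytes):
        index_key = index_key.decode()
    print(obs[index_key].shape[0])  # n_obs without loading X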
lamindb/core/storage/_backed_access.py
CHANGED
@@ -94,8 +94,8 @@ def backed_access(
         return _open_pyarrow_dataset(objectpath)
     else:
         raise ValueError(
-            "object should have .h5, .hdf5, .h5ad, .zarr, .tiledbsoma suffix
-            f" {suffix}."
+            "The object should have .h5, .hdf5, .h5ad, .zarr, .tiledbsoma suffix "
+            f"or be compatible with pyarrow.dataset.dataset, instead of being {suffix} object."
         )

     is_anndata = suffix == ".h5ad" or get_spec(storage).encoding_type == "anndata"
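This function backs the public `Artifact.open()`; a hedged usage sketch (the artifact key is hypothetical):

import lamindb as ln

artifact = ln.Artifact.get(key="scrna/pbmc.h5ad")  # hypothetical artifact
access = artifact.open()  # an AnnDataAccessor for .h5ad files, no full download
print(access.shape)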
lamindb/core/storage/_pyarrow_dataset.py
CHANGED
@@ -6,26 +6,36 @@ import pyarrow.dataset
 from lamindb_setup.core.upath import LocalPathClasses

 if TYPE_CHECKING:
+    from pyarrow.dataset import Dataset as PyArrowDataset
     from upath import UPath


-PYARROW_SUFFIXES = (".parquet", ".csv", ".json", ".orc", ".arrow", ".feather")
+PYARROW_SUFFIXES = (".parquet", ".csv", ".json", ".orc", ".arrow", ".feather", ".ipc")


-def _is_pyarrow_dataset(
-    # it is assumed here that
-
-
+def _is_pyarrow_dataset(paths: UPath | list[UPath]) -> bool:
+    # it is assumed here that the paths exist
+    # we don't check here that the filesystem is the same
+    # but this is a requirement for pyarrow.dataset.dataset
+    if isinstance(paths, list):
+        suffixes = {path.suffix for path in paths}
+    elif paths.is_file():
+        suffixes = {paths.suffix}
     else:
-
-
-
-
-
-
-
-
+        suffixes = {path.suffix for path in paths.rglob("*") if path.suffix != ""}
+    return len(suffixes) == 1 and suffixes.pop() in PYARROW_SUFFIXES
+
+
+def _open_pyarrow_dataset(paths: UPath | list[UPath]) -> PyArrowDataset:
+    if isinstance(paths, list):
+        path0 = paths[0]
+        if isinstance(path0, LocalPathClasses):
+            paths_str, filesystem = [path.as_posix() for path in paths], None
+        else:
+            paths_str, filesystem = [path.path for path in paths], path0.fs
+    elif isinstance(paths, LocalPathClasses):
+        paths_str, filesystem = paths.as_posix(), None
     else:
-
+        paths_str, filesystem = paths.path, paths.fs

-    return pyarrow.dataset.dataset(
+    return pyarrow.dataset.dataset(paths_str, filesystem=filesystem)
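What `_open_pyarrow_dataset` ultimately delegates to; a standalone sketch with local files (the paths are illustrative):

import pyarrow.dataset as ds

# files sharing one suffix (and one filesystem) open as a single logical dataset
dataset = ds.dataset(["data/part-0.parquet", "data/part-1.parquet"])
print(dataset.schema)
print(dataset.count_rows())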