lamindb 1.10.2__py3-none-any.whl → 1.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +89 -49
- lamindb/_finish.py +17 -15
- lamindb/_tracked.py +2 -4
- lamindb/_view.py +1 -1
- lamindb/base/__init__.py +2 -1
- lamindb/base/dtypes.py +76 -0
- lamindb/core/_settings.py +2 -2
- lamindb/core/storage/_anndata_accessor.py +29 -9
- lamindb/curators/_legacy.py +16 -3
- lamindb/curators/core.py +442 -188
- lamindb/errors.py +6 -0
- lamindb/examples/cellxgene/__init__.py +8 -3
- lamindb/examples/cellxgene/_cellxgene.py +127 -13
- lamindb/examples/cellxgene/{cxg_schema_versions.csv → cellxgene_schema_versions.csv} +11 -0
- lamindb/examples/croissant/__init__.py +32 -6
- lamindb/examples/datasets/__init__.py +2 -2
- lamindb/examples/datasets/_core.py +9 -2
- lamindb/examples/datasets/_small.py +66 -22
- lamindb/examples/fixtures/sheets.py +8 -2
- lamindb/integrations/_croissant.py +34 -11
- lamindb/migrations/0119_squashed.py +5 -2
- lamindb/migrations/0120_add_record_fk_constraint.py +64 -0
- lamindb/migrations/0121_recorduser.py +60 -0
- lamindb/models/__init__.py +4 -1
- lamindb/models/_describe.py +2 -2
- lamindb/models/_feature_manager.py +131 -71
- lamindb/models/_from_values.py +2 -2
- lamindb/models/_is_versioned.py +4 -4
- lamindb/models/_label_manager.py +4 -4
- lamindb/models/artifact.py +326 -172
- lamindb/models/artifact_set.py +45 -1
- lamindb/models/can_curate.py +1 -2
- lamindb/models/collection.py +3 -34
- lamindb/models/feature.py +111 -7
- lamindb/models/has_parents.py +11 -11
- lamindb/models/project.py +18 -0
- lamindb/models/query_manager.py +16 -7
- lamindb/models/query_set.py +191 -78
- lamindb/models/record.py +30 -5
- lamindb/models/run.py +10 -33
- lamindb/models/save.py +6 -8
- lamindb/models/schema.py +54 -26
- lamindb/models/sqlrecord.py +152 -40
- lamindb/models/storage.py +59 -14
- lamindb/models/transform.py +17 -17
- lamindb/models/ulabel.py +6 -1
- {lamindb-1.10.2.dist-info → lamindb-1.11.0.dist-info}/METADATA +12 -18
- {lamindb-1.10.2.dist-info → lamindb-1.11.0.dist-info}/RECORD +50 -47
- {lamindb-1.10.2.dist-info → lamindb-1.11.0.dist-info}/WHEEL +1 -1
- {lamindb-1.10.2.dist-info/licenses → lamindb-1.11.0.dist-info}/LICENSE +0 -0
lamindb/errors.py
CHANGED
@@ -3,9 +3,14 @@
|
|
3
3
|
.. autosummary::
|
4
4
|
:toctree: .
|
5
5
|
|
6
|
-
|
7
|
-
|
6
|
+
save_cellxgene_defaults
|
7
|
+
create_cellxgene_schema
|
8
8
|
|
9
9
|
"""
|
10
10
|
|
11
|
-
from ._cellxgene import
|
11
|
+
from ._cellxgene import (
|
12
|
+
create_cellxgene_schema,
|
13
|
+
get_cxg_schema,
|
14
|
+
save_cellxgene_defaults,
|
15
|
+
save_cxg_defaults,
|
16
|
+
)
|
@@ -3,7 +3,9 @@ from __future__ import annotations
|
|
3
3
|
from typing import TYPE_CHECKING, Collection, Literal, NamedTuple
|
4
4
|
|
5
5
|
import pandas as pd
|
6
|
+
from lamindb_setup.core import deprecated
|
6
7
|
from lamindb_setup.core.upath import UPath
|
8
|
+
from packaging import version
|
7
9
|
|
8
10
|
from lamindb.models._from_values import _format_values
|
9
11
|
|
@@ -11,11 +13,25 @@ if TYPE_CHECKING:
|
|
11
13
|
from lamindb.base.types import FieldAttr
|
12
14
|
from lamindb.models import Schema, SQLRecord
|
13
15
|
|
14
|
-
CELLxGENESchemaVersions = Literal["4.0.0", "5.0.0", "5.1.0", "5.2.0", "5.3.0"]
|
16
|
+
CELLxGENESchemaVersions = Literal["4.0.0", "5.0.0", "5.1.0", "5.2.0", "5.3.0", "6.0.0"]
|
17
|
+
CELLxGENEOrganisms = Literal[
|
18
|
+
"human",
|
19
|
+
"mouse",
|
20
|
+
"zebra danio",
|
21
|
+
"rhesus macaquedomestic pig",
|
22
|
+
"chimpanzee",
|
23
|
+
"white-tufted-ear marmoset",
|
24
|
+
"sars-2",
|
25
|
+
]
|
15
26
|
FieldType = Literal["ontology_id", "name"]
|
16
27
|
|
17
28
|
|
29
|
+
@deprecated(new_name="save_cellxgene_defaults")
|
18
30
|
def save_cxg_defaults() -> None:
|
31
|
+
return save_cellxgene_defaults()
|
32
|
+
|
33
|
+
|
34
|
+
def save_cellxgene_defaults() -> None:
|
19
35
|
"""Save default values of the CELLxGENE schema to the instance.
|
20
36
|
|
21
37
|
Adds CELLxGENE specific (control) values that are not available in the ontologies:
|
@@ -25,7 +41,6 @@ def save_cxg_defaults() -> None:
|
|
25
41
|
- "unknown" entries for Ethnicity, DevelopmentalStage, Phenotype, and CellType
|
26
42
|
- "tissue", "organoid", and "cell culture" ULabels (tissue_type)
|
27
43
|
- "cell", "nucleus", "na" ULabels (suspension_type)
|
28
|
-
|
29
44
|
"""
|
30
45
|
import bionty as bt
|
31
46
|
|
@@ -47,12 +62,13 @@ def save_cxg_defaults() -> None:
|
|
47
62
|
# na, unknown
|
48
63
|
for model, name in zip(
|
49
64
|
[
|
65
|
+
bt.Ethnicity,
|
50
66
|
bt.Ethnicity,
|
51
67
|
bt.DevelopmentalStage,
|
52
68
|
bt.Phenotype,
|
53
69
|
bt.CellType,
|
54
70
|
],
|
55
|
-
["na", "unknown", "unknown", "unknown"],
|
71
|
+
["na", "unknown", "unknown", "unknown", "unknown"],
|
56
72
|
):
|
57
73
|
model(ontology_id=name, name=name, description="From CellxGene schema.").save()
|
58
74
|
|
@@ -76,8 +92,24 @@ def save_cxg_defaults() -> None:
|
|
76
92
|
name=name, type=suspension_type, description="From CellxGene schema."
|
77
93
|
).save()
|
78
94
|
|
95
|
+
# organisms
|
96
|
+
taxonomy_ids = [
|
97
|
+
"NCBITaxon:9606", # Homo sapiens (Human)
|
98
|
+
"NCBITaxon:10090", # Mus musculus (House mouse)
|
99
|
+
"NCBITaxon:9544", # Macaca mulatta (Rhesus monkey)
|
100
|
+
"NCBITaxon:9825", # Sus scrofa domesticus (Domestic pig)
|
101
|
+
"NCBITaxon:9598", # Pan troglodytes (Chimpanzee)
|
102
|
+
"NCBITaxon:9483", # Callithrix jacchus (White-tufted-ear marmoset)
|
103
|
+
"NCBITaxon:7955", # Danio rerio (Zebrafish)
|
104
|
+
]
|
105
|
+
for ontology_id in taxonomy_ids:
|
106
|
+
bt.Organism.from_source(
|
107
|
+
ontology_id=ontology_id,
|
108
|
+
source=bt.Source.get(name="ncbitaxon", currently_used=True),
|
109
|
+
).save()
|
110
|
+
|
79
111
|
|
80
|
-
def
|
112
|
+
def _create_cellxgene_sources(
|
81
113
|
categoricals: dict[str, FieldAttr], schema_version: str, organism: str
|
82
114
|
) -> dict[str, SQLRecord]:
|
83
115
|
"""Create a source dictionary of CELLxGENE categoricals to Source."""
|
@@ -105,7 +137,7 @@ def _create_cxg_sources(
|
|
105
137
|
)
|
106
138
|
return source
|
107
139
|
|
108
|
-
sources_df = pd.read_csv(UPath(__file__).parent / "
|
140
|
+
sources_df = pd.read_csv(UPath(__file__).parent / "cellxgene_schema_versions.csv")
|
109
141
|
sources_df = sources_df[sources_df.schema_version == schema_version]
|
110
142
|
if sources_df.empty:
|
111
143
|
raise ValueError(
|
@@ -126,11 +158,28 @@ def _create_cxg_sources(
|
|
126
158
|
return key_to_source
|
127
159
|
|
128
160
|
|
161
|
+
@deprecated(new_name="create_cellxgene_schema")
|
129
162
|
def get_cxg_schema(
|
130
163
|
schema_version: CELLxGENESchemaVersions,
|
131
164
|
*,
|
132
165
|
field_types: FieldType | Collection[FieldType] = "ontology_id",
|
133
|
-
organism:
|
166
|
+
organism: CELLxGENEOrganisms = "human",
|
167
|
+
spatial_library_id: str | None = None,
|
168
|
+
) -> Schema:
|
169
|
+
return create_cellxgene_schema(
|
170
|
+
schema_version,
|
171
|
+
field_types=field_types,
|
172
|
+
organism=organism,
|
173
|
+
spatial_library_id=spatial_library_id,
|
174
|
+
)
|
175
|
+
|
176
|
+
|
177
|
+
def create_cellxgene_schema(
|
178
|
+
schema_version: CELLxGENESchemaVersions,
|
179
|
+
*,
|
180
|
+
field_types: FieldType | Collection[FieldType] = "ontology_id",
|
181
|
+
organism: CELLxGENEOrganisms = "human",
|
182
|
+
spatial_library_id: str | None = None,
|
134
183
|
) -> Schema:
|
135
184
|
"""Generates a :class:`~lamindb.Schema` for a specific CELLxGENE schema version.
|
136
185
|
|
@@ -138,6 +187,8 @@ def get_cxg_schema(
|
|
138
187
|
schema_version: The CELLxGENE Schema version.
|
139
188
|
field_types: One or several of 'ontology_id', 'name'.
|
140
189
|
organism: The organism of the Schema.
|
190
|
+
spatial_library_id: Identifier for the spatial library.
|
191
|
+
Specifying this value enables curation against spatial requirements.
|
141
192
|
"""
|
142
193
|
import bionty as bt
|
143
194
|
|
@@ -168,7 +219,7 @@ def get_cxg_schema(
|
|
168
219
|
"tissue": CategorySpec(bt.Tissue.name, None),
|
169
220
|
"tissue_ontology_term_id": CategorySpec(bt.Tissue.ontology_id, None),
|
170
221
|
"tissue_type": CategorySpec(ULabel.name, "tissue"),
|
171
|
-
"organism": CategorySpec(bt.Organism.
|
222
|
+
"organism": CategorySpec(bt.Organism.scientific_name, None),
|
172
223
|
"organism_ontology_term_id": CategorySpec(bt.Organism.ontology_id, None),
|
173
224
|
"donor_id": CategorySpec(str, "unknown"),
|
174
225
|
}
|
@@ -195,7 +246,17 @@ def get_cxg_schema(
|
|
195
246
|
f"Invalid field_types: {field_types}. Must contain 'ontology_id', 'name', or both."
|
196
247
|
)
|
197
248
|
|
198
|
-
|
249
|
+
is_version_6_or_later = version.parse(schema_version) >= version.parse("6.0.0")
|
250
|
+
|
251
|
+
organism_fields = {"organism", "organism_ontology_term_id"}
|
252
|
+
if is_version_6_or_later:
|
253
|
+
obs_categoricals = {
|
254
|
+
k: v for k, v in categoricals.items() if k not in organism_fields
|
255
|
+
}
|
256
|
+
else:
|
257
|
+
obs_categoricals = categoricals
|
258
|
+
|
259
|
+
sources = _create_cellxgene_sources(
|
199
260
|
categoricals=categoricals,
|
200
261
|
schema_version=schema_version,
|
201
262
|
organism=organism,
|
@@ -217,30 +278,83 @@ def get_cxg_schema(
|
|
217
278
|
obs_features = [
|
218
279
|
Feature(
|
219
280
|
name=field,
|
220
|
-
dtype=
|
281
|
+
dtype=obs_categoricals[field],
|
221
282
|
cat_filters={"source": source},
|
222
283
|
default_value=categoricals_to_spec[field].default,
|
223
284
|
).save()
|
224
285
|
for field, source in sources.items()
|
225
|
-
if field != "var_index"
|
286
|
+
if field != "var_index" and field in obs_categoricals
|
226
287
|
]
|
227
288
|
for name in ["is_primary_data", "suspension_type", "tissue_type"]:
|
228
289
|
obs_features.append(Feature(name=name, dtype=ULabel.name).save())
|
229
290
|
|
230
291
|
obs_schema = Schema(
|
231
|
-
name=f"obs of CELLxGENE version {schema_version}",
|
292
|
+
name=f"obs of CELLxGENE version {schema_version} for {organism} of {field_types}",
|
232
293
|
features=obs_features,
|
233
294
|
otype="DataFrame",
|
234
295
|
minimal_set=True,
|
235
296
|
coerce_dtype=True,
|
236
297
|
).save()
|
237
298
|
|
299
|
+
slots = {"var": var_schema, "obs": obs_schema}
|
300
|
+
|
301
|
+
if is_version_6_or_later:
|
302
|
+
uns_categoricals = {
|
303
|
+
k: v for k, v in categoricals.items() if k in organism_fields
|
304
|
+
}
|
305
|
+
|
306
|
+
uns_features = [
|
307
|
+
Feature(
|
308
|
+
name=field,
|
309
|
+
dtype=uns_categoricals[field],
|
310
|
+
cat_filters={"source": sources[field]},
|
311
|
+
default_value=categoricals_to_spec[field].default,
|
312
|
+
).save()
|
313
|
+
for field in uns_categoricals
|
314
|
+
]
|
315
|
+
|
316
|
+
uns_schema = Schema(
|
317
|
+
name=f"uns of CELLxGENE version {schema_version}",
|
318
|
+
features=uns_features,
|
319
|
+
otype="DataFrame",
|
320
|
+
minimal_set=True,
|
321
|
+
coerce_dtype=True,
|
322
|
+
).save()
|
323
|
+
|
324
|
+
slots["uns"] = uns_schema
|
325
|
+
|
326
|
+
# Add spatial validation if spatial_library_id is provided
|
327
|
+
if spatial_library_id:
|
328
|
+
scalefactors_schema = Schema(
|
329
|
+
name=f"scalefactors of spatial {spatial_library_id}",
|
330
|
+
features=[
|
331
|
+
Feature(name="spot_diameter_fullres", dtype=float).save(),
|
332
|
+
Feature(name="tissue_hires_scalef", dtype=float).save(),
|
333
|
+
],
|
334
|
+
).save()
|
335
|
+
|
336
|
+
spatial_schema = Schema(
|
337
|
+
name="CELLxGENE spatial metadata",
|
338
|
+
features=[
|
339
|
+
Feature(
|
340
|
+
name="is_single",
|
341
|
+
dtype=bool,
|
342
|
+
description="True if dataset represents single spatial unit (tissue section for Visium, array for Slide-seqV2)",
|
343
|
+
).save()
|
344
|
+
],
|
345
|
+
).save()
|
346
|
+
|
347
|
+
slots["uns:spatial"] = spatial_schema
|
348
|
+
slots[f"uns:spatial:{spatial_library_id}:scalefactors"] = (
|
349
|
+
scalefactors_schema
|
350
|
+
)
|
351
|
+
|
238
352
|
full_cxg_schema = Schema(
|
239
|
-
name=f"AnnData of CELLxGENE version {schema_version}",
|
353
|
+
name=f"AnnData of CELLxGENE version {schema_version} for {organism} of {', '.join(field_types) if isinstance(field_types, list) else field_types}",
|
240
354
|
otype="AnnData",
|
241
355
|
minimal_set=True,
|
242
356
|
coerce_dtype=True,
|
243
|
-
slots=
|
357
|
+
slots=slots,
|
244
358
|
).save()
|
245
359
|
|
246
360
|
return full_cxg_schema
|
@@ -52,3 +52,14 @@ schema_version,entity,organism,source,version
|
|
52
52
|
5.3.0,Tissue,all,uberon,2025-01-15
|
53
53
|
5.3.0,Gene,human,ensembl,release-110
|
54
54
|
5.3.0,Gene,mouse,ensembl,release-110
|
55
|
+
6.0.0,CellType,all,cl,2025-04-10
|
56
|
+
6.0.0,ExperimentalFactor,all,efo,3.78.0
|
57
|
+
6.0.0,Ethnicity,human,hancestro,3.0
|
58
|
+
6.0.0,DevelopmentalStage,human,hsapdv,2025-01-23
|
59
|
+
6.0.0,DevelopmentalStage,mouse,mmusdv,2025-01-23
|
60
|
+
6.0.0,Disease,all,mondo,2025-05-06
|
61
|
+
6.0.0,Organism,all,ncbitaxon,2025-03-13
|
62
|
+
6.0.0,Phenotype,all,pato,2025-05-14
|
63
|
+
6.0.0,Tissue,all,uberon,2025-05-28
|
64
|
+
6.0.0,Gene,human,ensembl,release-110
|
65
|
+
6.0.0,Gene,mouse,ensembl,release-110
|
@@ -1,35 +1,61 @@
|
|
1
|
-
"""
|
1
|
+
"""Examples for MLCommons Croissant files, which are used to store metadata about datasets.
|
2
|
+
|
3
|
+
.. autosummary::
|
4
|
+
:toctree: .
|
5
|
+
|
6
|
+
mini_immuno
|
2
7
|
|
3
|
-
Examples for MLCommons Croissant files, which are used to store metadata about datasets.
|
4
8
|
"""
|
5
9
|
|
6
10
|
import json
|
7
11
|
from pathlib import Path
|
8
12
|
|
9
13
|
|
10
|
-
def mini_immuno(
|
14
|
+
def mini_immuno(
|
15
|
+
n_files: int = 1, filepath_prefix: str = "", strip_version: bool = False
|
16
|
+
) -> list[Path]:
|
11
17
|
"""Return paths to the mini immuno dataset and its metadata as a Croissant file.
|
12
18
|
|
13
19
|
Args:
|
14
20
|
n_files: Number of files inside the croissant file. Default is 1.
|
21
|
+
filepath_prefix: Directory to place the dataset in; references to the dataset in the Croissant file are prefixed accordingly.
|
22
|
+
|
23
|
+
Example
|
24
|
+
|
25
|
+
::
|
26
|
+
|
27
|
+
croissant_path, dataset1_path = ln.examples.croissant.mini_immuno()
|
28
|
+
croissant_path, dataset1_path, dataset2_path = ln.examples.croissant.mini_immuno(n_files=2)
|
15
29
|
"""
|
16
30
|
from ..datasets import file_mini_csv
|
17
31
|
from ..datasets.mini_immuno import get_dataset1
|
18
32
|
|
19
33
|
adata = get_dataset1(otype="AnnData")
|
20
|
-
|
34
|
+
if filepath_prefix:
|
35
|
+
dataset1_path = Path(filepath_prefix) / "mini_immuno.anndata.zarr"
|
36
|
+
else:
|
37
|
+
dataset1_path = Path("mini_immuno.anndata.zarr")
|
21
38
|
adata.write_zarr(dataset1_path)
|
22
39
|
orig_croissant_path = (
|
23
40
|
Path(__file__).parent / "mini_immuno.anndata.zarr_metadata.json"
|
24
41
|
)
|
25
42
|
with open(orig_croissant_path, encoding="utf-8") as f:
|
26
43
|
data = json.load(f)
|
44
|
+
if filepath_prefix:
|
45
|
+
assert data["distribution"][0]["@id"] == "mini_immuno.anndata.zarr" # noqa: S101
|
46
|
+
data["distribution"][0]["@id"] = str(Path(filepath_prefix) / dataset1_path.name)
|
47
|
+
if strip_version:
|
48
|
+
data.pop("version", None)
|
27
49
|
if n_files == 2:
|
28
|
-
|
50
|
+
file_mini_csv()
|
51
|
+
if filepath_prefix:
|
52
|
+
dataset2_path = Path(filepath_prefix) / "mini.csv"
|
53
|
+
else:
|
54
|
+
dataset2_path = Path("mini.csv")
|
29
55
|
data["distribution"].append(
|
30
56
|
{
|
31
57
|
"@type": "sc:FileObject",
|
32
|
-
"@id":
|
58
|
+
"@id": dataset2_path.as_posix(),
|
33
59
|
"name": "mini.csv",
|
34
60
|
"encodingFormat": "text/csv",
|
35
61
|
}
|
@@ -41,7 +41,7 @@ Dictionary, Dataframe, AnnData, MuData, SpatialData.
|
|
41
41
|
.. autosummary::
|
42
42
|
:toctree: .
|
43
43
|
|
44
|
-
|
44
|
+
dict_cellxgene_uns
|
45
45
|
df_iris
|
46
46
|
df_iris_in_meter
|
47
47
|
df_iris_in_meter_study1
|
@@ -78,7 +78,7 @@ from ._core import (
|
|
78
78
|
df_iris_in_meter,
|
79
79
|
df_iris_in_meter_study1,
|
80
80
|
df_iris_in_meter_study2,
|
81
|
-
|
81
|
+
dict_cellxgene_uns,
|
82
82
|
dir_iris_images,
|
83
83
|
dir_scrnaseq_cellranger,
|
84
84
|
file_bam,
|
@@ -353,7 +353,7 @@ def anndata_suo22_Visium10X(): # pragma: no cover
|
|
353
353
|
return ad.read_h5ad(filepath)
|
354
354
|
|
355
355
|
|
356
|
-
def mudata_papalexi21_subset() -> MuData: # pragma: no cover
|
356
|
+
def mudata_papalexi21_subset(with_uns: bool = False) -> MuData: # pragma: no cover
|
357
357
|
"""A subsetted mudata from papalexi21.
|
358
358
|
|
359
359
|
To reproduce the subsetting:
|
@@ -415,10 +415,17 @@ def mudata_papalexi21_subset() -> MuData: # pragma: no cover
|
|
415
415
|
mdata["hto"].obs["technique"] = mdata["hto"].obs["technique"].astype("category")
|
416
416
|
mdata.pull_obs(["technique"], mods="hto")
|
417
417
|
|
418
|
+
if with_uns:
|
419
|
+
mdata.uns["study_metadata"] = {
|
420
|
+
"temperature": 21.6,
|
421
|
+
"experiment": "Experiment 1",
|
422
|
+
}
|
423
|
+
mdata["rna"].uns["site_metadata"] = {"pos": 99.9, "site_id": "SITE001"}
|
424
|
+
|
418
425
|
return mdata
|
419
426
|
|
420
427
|
|
421
|
-
def
|
428
|
+
def dict_cellxgene_uns() -> dict[str, Any]:
|
422
429
|
"""An example CELLxGENE AnnData `.uns` dictionary."""
|
423
430
|
uns = {
|
424
431
|
"organism_ontology_term_id": "NCBITaxon:9606",
|
@@ -9,32 +9,36 @@ import pandas as pd
|
|
9
9
|
|
10
10
|
def small_dataset3_cellxgene(
|
11
11
|
otype: Literal["DataFrame", "AnnData"] = "AnnData",
|
12
|
+
*,
|
12
13
|
with_obs_defaults: bool = False,
|
14
|
+
with_var_typo: bool = False,
|
13
15
|
with_obs_typo: bool = False,
|
16
|
+
with_uns_organism: bool = False,
|
17
|
+
with_uns_spatial: bool = False,
|
14
18
|
) -> tuple[pd.DataFrame, dict[str, Any]] | ad.AnnData:
|
15
|
-
|
16
|
-
|
17
|
-
var_ids = ["invalid_ensembl_id", "ENSG00000000419", "ENSG00000139618"]
|
18
|
-
|
19
|
+
var_id = "invalid_ensembl_id" if with_var_typo else "ENSG00000000457"
|
20
|
+
var_ids = [var_id, "ENSG00000000419", "ENSG00000139618"]
|
19
21
|
lung_id = "UBERON:0002048XXX" if with_obs_typo else "UBERON:0002048"
|
22
|
+
|
23
|
+
obs_data = {
|
24
|
+
"disease_ontology_term_id": [
|
25
|
+
"MONDO:0004975",
|
26
|
+
"MONDO:0004980",
|
27
|
+
"MONDO:0004980",
|
28
|
+
],
|
29
|
+
"development_stage_ontology_term_id": ["unknown", "unknown", "unknown"],
|
30
|
+
"sex_ontology_term_id": ["PATO:0000383", "PATO:0000384", "unknown"],
|
31
|
+
"tissue_ontology_term_id": [lung_id, lung_id, "UBERON:0000948"],
|
32
|
+
"cell_type": ["T cell", "B cell", "B cell"],
|
33
|
+
"self_reported_ethnicity": ["South Asian", "South Asian", "South Asian"],
|
34
|
+
"donor_id": ["-1", "1", "2"],
|
35
|
+
"is_primary_data": [False, False, False],
|
36
|
+
"suspension_type": ["cell", "cell", "cell"],
|
37
|
+
"tissue_type": ["tissue", "tissue", "tissue"],
|
38
|
+
}
|
39
|
+
|
20
40
|
obs_df = pd.DataFrame(
|
21
|
-
|
22
|
-
"disease_ontology_term_id": [
|
23
|
-
"MONDO:0004975",
|
24
|
-
"MONDO:0004980",
|
25
|
-
"MONDO:0004980",
|
26
|
-
],
|
27
|
-
"development_stage_ontology_term_id": ["unknown", "unknown", "unknown"],
|
28
|
-
"organism": ["human", "human", "human"],
|
29
|
-
"sex_ontology_term_id": ["PATO:0000383", "PATO:0000384", "unknown"],
|
30
|
-
"tissue_ontology_term_id": [lung_id, lung_id, "UBERON:0000948"],
|
31
|
-
"cell_type": ["T cell", "B cell", "B cell"],
|
32
|
-
"self_reported_ethnicity": ["South Asian", "South Asian", "South Asian"],
|
33
|
-
"donor_id": ["-1", "1", "2"],
|
34
|
-
"is_primary_data": [False, False, False],
|
35
|
-
"suspension_type": ["cell", "cell", "cell"],
|
36
|
-
"tissue_type": ["tissue", "tissue", "tissue"],
|
37
|
-
},
|
41
|
+
obs_data,
|
38
42
|
index=["barcode1", "barcode2", "barcode3"],
|
39
43
|
)
|
40
44
|
|
@@ -65,8 +69,38 @@ def small_dataset3_cellxgene(
|
|
65
69
|
# CELLxGENE requires the `.raw` slot to be set - https://github.com/chanzuckerberg/single-cell-curation/issues/1304
|
66
70
|
adata.raw = adata.copy()
|
67
71
|
adata.raw.var.drop(columns="feature_is_filtered", inplace=True)
|
72
|
+
|
68
73
|
if with_obs_defaults:
|
74
|
+
adata.obs["cell_type_ontology_term_id"] = [
|
75
|
+
"CL:0000084",
|
76
|
+
"CL:0000236",
|
77
|
+
"CL:0000236",
|
78
|
+
]
|
79
|
+
adata.obs["self_reported_ethnicity_ontology_term_id"] = "na"
|
80
|
+
adata.obs["assay_ontology_term_id"] = "EFO:1001982"
|
69
81
|
adata.obs["assay"] = "single-cell RNA sequencing"
|
82
|
+
if with_uns_organism:
|
83
|
+
adata.uns["organism_ontology_term_id"] = "NCBITaxon:9606"
|
84
|
+
adata.uns["organism"] = "Homo sapiens"
|
85
|
+
else:
|
86
|
+
adata.obs["organism_ontology_term_id"] = "NCBITaxon:9606"
|
87
|
+
obs_data["organism"] = ["Homo sapiens", "Homo sapiens", "Homo sapiens"]
|
88
|
+
if with_uns_spatial:
|
89
|
+
adata.uns["spatial"] = {
|
90
|
+
"is_single": True,
|
91
|
+
"library_123": {
|
92
|
+
"scalefactors": {
|
93
|
+
"spot_diameter_fullres": 165.0,
|
94
|
+
"tissue_hires_scalef": 0.5,
|
95
|
+
},
|
96
|
+
"images": {
|
97
|
+
"hires": np.random.default_rng().integers(
|
98
|
+
0, 255, (2000, 2000, 3), dtype=np.uint8
|
99
|
+
)
|
100
|
+
},
|
101
|
+
},
|
102
|
+
}
|
103
|
+
|
70
104
|
return adata
|
71
105
|
|
72
106
|
|
@@ -92,6 +126,16 @@ def anndata_with_obs() -> ad.AnnData:
|
|
92
126
|
df.index = "obs" + df.index.astype(str)
|
93
127
|
|
94
128
|
adata = ad.AnnData(X=np.zeros(shape=(40, 100), dtype=np.float32), obs=df)
|
95
|
-
|
129
|
+
bionty_genes = bionty_base.Gene()
|
130
|
+
# backwards compatible: fall back to `.df()` when `bionty_base.Gene` has no `.to_dataframe()`
|
131
|
+
adata.var.index = (
|
132
|
+
(
|
133
|
+
bionty_genes.to_dataframe()
|
134
|
+
if hasattr(bionty_genes, "to_dataframe")
|
135
|
+
else bionty_genes.df()
|
136
|
+
)
|
137
|
+
.head(100)["ensembl_gene_id"]
|
138
|
+
.values
|
139
|
+
)
|
96
140
|
|
97
141
|
return adata
|
@@ -46,6 +46,8 @@ def populate_sheets_compound_treatment():
|
|
46
46
|
|
47
47
|
# Samples ---------------------------
|
48
48
|
|
49
|
+
project = ln.Feature(name="project", dtype=ln.Project).save()
|
50
|
+
project1 = ln.Project(name="Project 1").save()
|
49
51
|
sample_type = ln.Record(name="BioSample", is_type=True).save()
|
50
52
|
treatment = ln.Feature(name="treatment", dtype=treatment_type).save()
|
51
53
|
cell_line = ln.Feature(name="cell_line", dtype=bt.CellLine).save()
|
@@ -54,7 +56,7 @@ def populate_sheets_compound_treatment():
|
|
54
56
|
cell_line.save()
|
55
57
|
schema1 = ln.Schema(
|
56
58
|
name="My samples schema 2025-06",
|
57
|
-
features=[treatment, cell_line, preparation_date],
|
59
|
+
features=[treatment, cell_line, preparation_date, project],
|
58
60
|
).save()
|
59
61
|
sample_sheet1 = ln.Record(
|
60
62
|
name="My samples 2025-06", schema=schema1, type=sample_type
|
@@ -69,6 +71,7 @@ def populate_sheets_compound_treatment():
|
|
69
71
|
ln.models.RecordJson(
|
70
72
|
record=sample1, feature=preparation_date, value="2025-06-01T05:00:00"
|
71
73
|
).save()
|
74
|
+
ln.models.RecordProject(record=sample1, feature=project, value=project1).save()
|
72
75
|
# populate sample2
|
73
76
|
sample2 = ln.Record(name="sample2", type=sample_sheet1).save()
|
74
77
|
ln.models.RecordRecord(record=sample2, feature=treatment, value=treatment2).save()
|
@@ -76,12 +79,13 @@ def populate_sheets_compound_treatment():
|
|
76
79
|
ln.models.RecordJson(
|
77
80
|
record=sample2, feature=preparation_date, value="2025-06-01T06:00:00"
|
78
81
|
).save()
|
82
|
+
ln.models.RecordProject(record=sample2, feature=project, value=project1).save()
|
79
83
|
|
80
84
|
# another sheet for samples
|
81
85
|
sample_note = ln.Feature(name="sample_note", dtype="str").save()
|
82
86
|
schema2 = ln.Schema(
|
83
87
|
name="My samples schema 2025-07",
|
84
|
-
features=[treatment, cell_line, sample_note],
|
88
|
+
features=[treatment, cell_line, sample_note, project],
|
85
89
|
).save()
|
86
90
|
# the sheet
|
87
91
|
sample_sheet2 = ln.Record(
|
@@ -94,6 +98,7 @@ def populate_sheets_compound_treatment():
|
|
94
98
|
ln.models.RecordJson(
|
95
99
|
record=sample3, feature=preparation_date, value="2025-06-02T05:00:00Z"
|
96
100
|
).save()
|
101
|
+
ln.models.RecordProject(record=sample3, feature=project, value=project1).save()
|
97
102
|
# populate sample4
|
98
103
|
sample4 = ln.Record(type=sample_sheet2).save()
|
99
104
|
ln.models.RecordRecord(record=sample4, feature=treatment, value=treatment2).save()
|
@@ -101,6 +106,7 @@ def populate_sheets_compound_treatment():
|
|
101
106
|
ln.models.RecordJson(
|
102
107
|
record=sample4, feature=preparation_date, value="2025-06-02T06:00:00Z"
|
103
108
|
).save()
|
109
|
+
ln.models.RecordProject(record=sample4, feature=project, value=project1).save()
|
104
110
|
|
105
111
|
yield treatments_sheet, sample_sheet1
|
106
112
|
|
@@ -4,6 +4,10 @@ import json
|
|
4
4
|
from pathlib import Path
|
5
5
|
from typing import TYPE_CHECKING, Any
|
6
6
|
|
7
|
+
import lamindb_setup as ln_setup
|
8
|
+
from lamin_utils import logger
|
9
|
+
from lamindb_setup.core.upath import UPath
|
10
|
+
|
7
11
|
if TYPE_CHECKING:
|
8
12
|
import lamindb as ln
|
9
13
|
|
@@ -27,6 +31,8 @@ def curate_from_croissant(
|
|
27
31
|
"""
|
28
32
|
import lamindb as ln
|
29
33
|
|
34
|
+
from ..models.artifact import check_path_in_existing_storage
|
35
|
+
|
30
36
|
# Load CroissantML data
|
31
37
|
if isinstance(croissant_data, (str, Path)):
|
32
38
|
if not Path(croissant_data).exists():
|
@@ -49,10 +55,10 @@ def curate_from_croissant(
|
|
49
55
|
|
50
56
|
# Extract basic metadata
|
51
57
|
dataset_name = data["name"]
|
52
|
-
description = data.get("description",
|
53
|
-
version = data.get("version",
|
54
|
-
license_info = data.get("license",
|
55
|
-
project_name = data.get("cr:projectName",
|
58
|
+
description = data.get("description", None)
|
59
|
+
version = data.get("version", None)
|
60
|
+
license_info = data.get("license", None)
|
61
|
+
project_name = data.get("cr:projectName", None)
|
56
62
|
|
57
63
|
# Create license feature and label if license info exists
|
58
64
|
license_label = None
|
@@ -86,18 +92,35 @@ def curate_from_croissant(
|
|
86
92
|
content_url = dist.get("contentUrl", "")
|
87
93
|
file_path = content_url or data.get("url", "")
|
88
94
|
if not file_path:
|
89
|
-
raise ValueError(
|
90
|
-
|
95
|
+
raise ValueError(f"No file path found in croissant distribution: {dist}")
|
96
|
+
if not UPath(file_path).exists():
|
97
|
+
raise ValueError(f"Inferred file path does not exist: {file_path}")
|
98
|
+
result = check_path_in_existing_storage(
|
99
|
+
file_path, check_hub_register_storage=ln_setup.settings.instance.is_on_hub
|
100
|
+
)
|
101
|
+
if isinstance(result, ln.Storage):
|
102
|
+
key = None # will automatically use existing storage key
|
103
|
+
else:
|
104
|
+
current_storage_location = (
|
105
|
+
ln.settings.storage
|
106
|
+
if not ln.setup.settings.instance.keep_artifacts_local
|
107
|
+
else ln.settings.local_storage
|
108
|
+
)
|
109
|
+
logger.warning(
|
110
|
+
f"file path {file_path} is not part of a known storage location, will be duplicated to: {current_storage_location}"
|
91
111
|
)
|
112
|
+
key = file_id
|
92
113
|
if len(file_distributions) == 1:
|
93
|
-
|
94
|
-
if
|
95
|
-
|
96
|
-
|
114
|
+
# it doesn't make sense to have the dataset name on the individual
|
115
|
+
# artifact if it's part of a collection
|
116
|
+
artifact_description = dataset_name
|
117
|
+
if description is not None:
|
118
|
+
artifact_description += f" - {description}"
|
97
119
|
else:
|
98
|
-
artifact_description =
|
120
|
+
artifact_description = None
|
99
121
|
artifact = ln.Artifact( # type: ignore
|
100
122
|
file_path,
|
123
|
+
key=key,
|
101
124
|
description=artifact_description,
|
102
125
|
version=version,
|
103
126
|
kind="dataset",
|
@@ -219,9 +219,8 @@ class Migration(migrations.Migration):
|
|
219
219
|
"uid",
|
220
220
|
lamindb.base.fields.CharField(
|
221
221
|
blank=True,
|
222
|
-
db_default="aaaaaaaaaaaa",
|
223
222
|
db_index=True,
|
224
|
-
default=
|
223
|
+
default=lamindb.base.uids.base62_12,
|
225
224
|
editable=False,
|
226
225
|
max_length=12,
|
227
226
|
unique=True,
|
@@ -4582,4 +4581,8 @@ class Migration(migrations.Migration):
|
|
4582
4581
|
name="unique_artifact_storage_hash_null_key",
|
4583
4582
|
),
|
4584
4583
|
),
|
4584
|
+
migrations.AlterModelOptions(
|
4585
|
+
name="user",
|
4586
|
+
options={},
|
4587
|
+
),
|
4585
4588
|
]
|