lamindb 1.10.2__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. lamindb/__init__.py +89 -49
  2. lamindb/_finish.py +17 -15
  3. lamindb/_tracked.py +2 -4
  4. lamindb/_view.py +1 -1
  5. lamindb/base/__init__.py +2 -1
  6. lamindb/base/dtypes.py +76 -0
  7. lamindb/core/_settings.py +2 -2
  8. lamindb/core/storage/_anndata_accessor.py +29 -9
  9. lamindb/curators/_legacy.py +16 -3
  10. lamindb/curators/core.py +442 -188
  11. lamindb/errors.py +6 -0
  12. lamindb/examples/cellxgene/__init__.py +8 -3
  13. lamindb/examples/cellxgene/_cellxgene.py +127 -13
  14. lamindb/examples/cellxgene/{cxg_schema_versions.csv → cellxgene_schema_versions.csv} +11 -0
  15. lamindb/examples/croissant/__init__.py +32 -6
  16. lamindb/examples/datasets/__init__.py +2 -2
  17. lamindb/examples/datasets/_core.py +9 -2
  18. lamindb/examples/datasets/_small.py +66 -22
  19. lamindb/examples/fixtures/sheets.py +8 -2
  20. lamindb/integrations/_croissant.py +34 -11
  21. lamindb/migrations/0119_squashed.py +5 -2
  22. lamindb/migrations/0120_add_record_fk_constraint.py +64 -0
  23. lamindb/migrations/0121_recorduser.py +60 -0
  24. lamindb/models/__init__.py +4 -1
  25. lamindb/models/_describe.py +2 -2
  26. lamindb/models/_feature_manager.py +131 -71
  27. lamindb/models/_from_values.py +2 -2
  28. lamindb/models/_is_versioned.py +4 -4
  29. lamindb/models/_label_manager.py +4 -4
  30. lamindb/models/artifact.py +326 -172
  31. lamindb/models/artifact_set.py +45 -1
  32. lamindb/models/can_curate.py +1 -2
  33. lamindb/models/collection.py +3 -34
  34. lamindb/models/feature.py +111 -7
  35. lamindb/models/has_parents.py +11 -11
  36. lamindb/models/project.py +18 -0
  37. lamindb/models/query_manager.py +16 -7
  38. lamindb/models/query_set.py +191 -78
  39. lamindb/models/record.py +30 -5
  40. lamindb/models/run.py +10 -33
  41. lamindb/models/save.py +6 -8
  42. lamindb/models/schema.py +54 -26
  43. lamindb/models/sqlrecord.py +152 -40
  44. lamindb/models/storage.py +59 -14
  45. lamindb/models/transform.py +17 -17
  46. lamindb/models/ulabel.py +6 -1
  47. {lamindb-1.10.2.dist-info → lamindb-1.11.0.dist-info}/METADATA +12 -18
  48. {lamindb-1.10.2.dist-info → lamindb-1.11.0.dist-info}/RECORD +50 -47
  49. {lamindb-1.10.2.dist-info → lamindb-1.11.0.dist-info}/WHEEL +1 -1
  50. {lamindb-1.10.2.dist-info/licenses → lamindb-1.11.0.dist-info}/LICENSE +0 -0
lamindb/errors.py CHANGED
@@ -60,6 +60,12 @@ class DoesNotExist(Exception):
60
60
  pass
61
61
 
62
62
 
63
+ class MultipleResultsFound(Exception):
64
+ """Multiple records found."""
65
+
66
+ pass
67
+
68
+
63
69
  class InconsistentKey(Exception):
64
70
  """Inconsistent transform or artifact `key`."""
65
71
 
@@ -3,9 +3,14 @@
3
3
  .. autosummary::
4
4
  :toctree: .
5
5
 
6
- save_cxg_defaults
7
- get_cxg_schema
6
+ save_cellxgene_defaults
7
+ create_cellxgene_schema
8
8
 
9
9
  """
10
10
 
11
- from ._cellxgene import get_cxg_schema, save_cxg_defaults
11
+ from ._cellxgene import (
12
+ create_cellxgene_schema,
13
+ get_cxg_schema,
14
+ save_cellxgene_defaults,
15
+ save_cxg_defaults,
16
+ )
@@ -3,7 +3,9 @@ from __future__ import annotations
3
3
  from typing import TYPE_CHECKING, Collection, Literal, NamedTuple
4
4
 
5
5
  import pandas as pd
6
+ from lamindb_setup.core import deprecated
6
7
  from lamindb_setup.core.upath import UPath
8
+ from packaging import version
7
9
 
8
10
  from lamindb.models._from_values import _format_values
9
11
 
@@ -11,11 +13,25 @@ if TYPE_CHECKING:
11
13
  from lamindb.base.types import FieldAttr
12
14
  from lamindb.models import Schema, SQLRecord
13
15
 
14
- CELLxGENESchemaVersions = Literal["4.0.0", "5.0.0", "5.1.0", "5.2.0", "5.3.0"]
16
+ CELLxGENESchemaVersions = Literal["4.0.0", "5.0.0", "5.1.0", "5.2.0", "5.3.0", "6.0.0"]
17
+ CELLxGENEOrganisms = Literal[
18
+ "human",
19
+ "mouse",
20
+ "zebra danio",
21
+ "rhesus macaque",
+ "domestic pig",
22
+ "chimpanzee",
23
+ "white-tufted-ear marmoset",
24
+ "sars-2",
25
+ ]
15
26
  FieldType = Literal["ontology_id", "name"]
16
27
 
17
28
 
29
+ @deprecated(new_name="save_cellxgene_defaults")
18
30
  def save_cxg_defaults() -> None:
31
+ return save_cellxgene_defaults()
32
+
33
+
34
+ def save_cellxgene_defaults() -> None:
19
35
  """Save default values of the CELLxGENE schema to the instance.
20
36
 
21
37
  Adds CELLxGENE specific (control) values that are not available in the ontologies:
@@ -25,7 +41,6 @@ def save_cxg_defaults() -> None:
25
41
  - "unknown" entries for DevelopmentalStage, Phenotype, and CellType
26
42
  - "tissue", "organoid", and "cell culture" ULabels (tissue_type)
27
43
  - "cell", "nucleus", "na" ULabels (suspension_type)
28
-
29
44
  """
30
45
  import bionty as bt
31
46
 
@@ -47,12 +62,13 @@ def save_cxg_defaults() -> None:
47
62
  # na, unknown
48
63
  for model, name in zip(
49
64
  [
65
+ bt.Ethnicity,
50
66
  bt.Ethnicity,
51
67
  bt.DevelopmentalStage,
52
68
  bt.Phenotype,
53
69
  bt.CellType,
54
70
  ],
55
- ["na", "unknown", "unknown", "unknown"],
71
+ ["na", "unknown", "unknown", "unknown", "unknown"],
56
72
  ):
57
73
  model(ontology_id=name, name=name, description="From CellxGene schema.").save()
58
74
 
@@ -76,8 +92,24 @@ def save_cxg_defaults() -> None:
76
92
  name=name, type=suspension_type, description="From CellxGene schema."
77
93
  ).save()
78
94
 
95
+ # organisms
96
+ taxonomy_ids = [
97
+ "NCBITaxon:9606", # Homo sapiens (Human)
98
+ "NCBITaxon:10090", # Mus musculus (House mouse)
99
+ "NCBITaxon:9544", # Macaca mulatta (Rhesus monkey)
100
+ "NCBITaxon:9825", # Sus scrofa domesticus (Domestic pig)
101
+ "NCBITaxon:9598", # Pan troglodytes (Chimpanzee)
102
+ "NCBITaxon:9483", # Callithrix jacchus (White-tufted-ear marmoset)
103
+ "NCBITaxon:7955", # Danio rerio (Zebrafish)
104
+ ]
105
+ for ontology_id in taxonomy_ids:
106
+ bt.Organism.from_source(
107
+ ontology_id=ontology_id,
108
+ source=bt.Source.get(name="ncbitaxon", currently_used=True),
109
+ ).save()
110
+
79
111
 
80
- def _create_cxg_sources(
112
+ def _create_cellxgene_sources(
81
113
  categoricals: dict[str, FieldAttr], schema_version: str, organism: str
82
114
  ) -> dict[str, SQLRecord]:
83
115
  """Create a source dictionary of CELLxGENE categoricals to Source."""
@@ -105,7 +137,7 @@ def _create_cxg_sources(
105
137
  )
106
138
  return source
107
139
 
108
- sources_df = pd.read_csv(UPath(__file__).parent / "cxg_schema_versions.csv")
140
+ sources_df = pd.read_csv(UPath(__file__).parent / "cellxgene_schema_versions.csv")
109
141
  sources_df = sources_df[sources_df.schema_version == schema_version]
110
142
  if sources_df.empty:
111
143
  raise ValueError(
@@ -126,11 +158,28 @@ def _create_cxg_sources(
126
158
  return key_to_source
127
159
 
128
160
 
161
+ @deprecated(new_name="create_cellxgene_schema")
129
162
  def get_cxg_schema(
130
163
  schema_version: CELLxGENESchemaVersions,
131
164
  *,
132
165
  field_types: FieldType | Collection[FieldType] = "ontology_id",
133
- organism: Literal["human", "mouse"] = "human",
166
+ organism: CELLxGENEOrganisms = "human",
167
+ spatial_library_id: str | None = None,
168
+ ) -> Schema:
169
+ return create_cellxgene_schema(
170
+ schema_version,
171
+ field_types=field_types,
172
+ organism=organism,
173
+ spatial_library_id=spatial_library_id,
174
+ )
175
+
176
+
177
+ def create_cellxgene_schema(
178
+ schema_version: CELLxGENESchemaVersions,
179
+ *,
180
+ field_types: FieldType | Collection[FieldType] = "ontology_id",
181
+ organism: CELLxGENEOrganisms = "human",
182
+ spatial_library_id: str | None = None,
134
183
  ) -> Schema:
135
184
  """Generates a :class:`~lamindb.Schema` for a specific CELLxGENE schema version.
136
185
 
@@ -138,6 +187,8 @@ def get_cxg_schema(
138
187
  schema_version: The CELLxGENE Schema version.
139
188
  field_types: One or several of 'ontology_id', 'name'.
140
189
  organism: The organism of the Schema.
190
+ library_id: Identifier for the spatial library.
191
+ Specifying this value enables curation against spatial requirements.
141
192
  """
142
193
  import bionty as bt
143
194
 
@@ -168,7 +219,7 @@ def get_cxg_schema(
168
219
  "tissue": CategorySpec(bt.Tissue.name, None),
169
220
  "tissue_ontology_term_id": CategorySpec(bt.Tissue.ontology_id, None),
170
221
  "tissue_type": CategorySpec(ULabel.name, "tissue"),
171
- "organism": CategorySpec(bt.Organism.name, None),
222
+ "organism": CategorySpec(bt.Organism.scientific_name, None),
172
223
  "organism_ontology_term_id": CategorySpec(bt.Organism.ontology_id, None),
173
224
  "donor_id": CategorySpec(str, "unknown"),
174
225
  }
@@ -195,7 +246,17 @@ def get_cxg_schema(
195
246
  f"Invalid field_types: {field_types}. Must contain 'ontology_id', 'name', or both."
196
247
  )
197
248
 
198
- sources = _create_cxg_sources(
249
+ is_version_6_or_later = version.parse(schema_version) >= version.parse("6.0.0")
250
+
251
+ organism_fields = {"organism", "organism_ontology_term_id"}
252
+ if is_version_6_or_later:
253
+ obs_categoricals = {
254
+ k: v for k, v in categoricals.items() if k not in organism_fields
255
+ }
256
+ else:
257
+ obs_categoricals = categoricals
258
+
259
+ sources = _create_cellxgene_sources(
199
260
  categoricals=categoricals,
200
261
  schema_version=schema_version,
201
262
  organism=organism,
@@ -217,30 +278,83 @@ def get_cxg_schema(
217
278
  obs_features = [
218
279
  Feature(
219
280
  name=field,
220
- dtype=categoricals[field],
281
+ dtype=obs_categoricals[field],
221
282
  cat_filters={"source": source},
222
283
  default_value=categoricals_to_spec[field].default,
223
284
  ).save()
224
285
  for field, source in sources.items()
225
- if field != "var_index"
286
+ if field != "var_index" and field in obs_categoricals
226
287
  ]
227
288
  for name in ["is_primary_data", "suspension_type", "tissue_type"]:
228
289
  obs_features.append(Feature(name=name, dtype=ULabel.name).save())
229
290
 
230
291
  obs_schema = Schema(
231
- name=f"obs of CELLxGENE version {schema_version}",
292
+ name=f"obs of CELLxGENE version {schema_version} for {organism} of {field_types}",
232
293
  features=obs_features,
233
294
  otype="DataFrame",
234
295
  minimal_set=True,
235
296
  coerce_dtype=True,
236
297
  ).save()
237
298
 
299
+ slots = {"var": var_schema, "obs": obs_schema}
300
+
301
+ if is_version_6_or_later:
302
+ uns_categoricals = {
303
+ k: v for k, v in categoricals.items() if k in organism_fields
304
+ }
305
+
306
+ uns_features = [
307
+ Feature(
308
+ name=field,
309
+ dtype=uns_categoricals[field],
310
+ cat_filters={"source": sources[field]},
311
+ default_value=categoricals_to_spec[field].default,
312
+ ).save()
313
+ for field in uns_categoricals
314
+ ]
315
+
316
+ uns_schema = Schema(
317
+ name=f"uns of CELLxGENE version {schema_version}",
318
+ features=uns_features,
319
+ otype="DataFrame",
320
+ minimal_set=True,
321
+ coerce_dtype=True,
322
+ ).save()
323
+
324
+ slots["uns"] = uns_schema
325
+
326
+ # Add spatial validation if library_id is provided
327
+ if spatial_library_id:
328
+ scalefactors_schema = Schema(
329
+ name=f"scalefactors of spatial {spatial_library_id}",
330
+ features=[
331
+ Feature(name="spot_diameter_fullres", dtype=float).save(),
332
+ Feature(name="tissue_hires_scalef", dtype=float).save(),
333
+ ],
334
+ ).save()
335
+
336
+ spatial_schema = Schema(
337
+ name="CELLxGENE spatial metadata",
338
+ features=[
339
+ Feature(
340
+ name="is_single",
341
+ dtype=bool,
342
+ description="True if dataset represents single spatial unit (tissue section for Visium, array for Slide-seqV2)",
343
+ ).save()
344
+ ],
345
+ ).save()
346
+
347
+ slots["uns:spatial"] = spatial_schema
348
+ slots[f"uns:spatial:{spatial_library_id}:scalefactors"] = (
349
+ scalefactors_schema
350
+ )
351
+
238
352
  full_cxg_schema = Schema(
239
- name=f"AnnData of CELLxGENE version {schema_version}",
353
+ name=f"AnnData of CELLxGENE version {schema_version} for {organism} of {', '.join(field_types) if isinstance(field_types, list) else field_types}",
240
354
  otype="AnnData",
241
355
  minimal_set=True,
242
356
  coerce_dtype=True,
243
- slots={"var": var_schema, "obs": obs_schema},
357
+ slots=slots,
244
358
  ).save()
245
359
 
246
360
  return full_cxg_schema
@@ -52,3 +52,14 @@ schema_version,entity,organism,source,version
52
52
  5.3.0,Tissue,all,uberon,2025-01-15
53
53
  5.3.0,Gene,human,ensembl,release-110
54
54
  5.3.0,Gene,mouse,ensembl,release-110
55
+ 6.0.0,CellType,all,cl,2025-04-10
56
+ 6.0.0,ExperimentalFactor,all,efo,3.78.0
57
+ 6.0.0,Ethnicity,human,hancestro,3.0
58
+ 6.0.0,DevelopmentalStage,human,hsapdv,2025-01-23
59
+ 6.0.0,DevelopmentalStage,mouse,mmusdv,2025-01-23
60
+ 6.0.0,Disease,all,mondo,2025-05-06
61
+ 6.0.0,Organism,all,ncbitaxon,2025-03-13
62
+ 6.0.0,Phenotype,all,pato,2025-05-14
63
+ 6.0.0,Tissue,all,uberon,2025-05-28
64
+ 6.0.0,Gene,human,ensembl,release-110
65
+ 6.0.0,Gene,mouse,ensembl,release-110
@@ -1,35 +1,61 @@
1
- """Example Croissant files.
1
+ """Examples for MLCommons Croissant files, which are used to store metadata about datasets.
2
+
3
+ .. autosummary::
4
+ :toctree: .
5
+
6
+ mini_immuno
2
7
 
3
- Examples for MLCommons Croissant files, which are used to store metadata about datasets.
4
8
  """
5
9
 
6
10
  import json
7
11
  from pathlib import Path
8
12
 
9
13
 
10
- def mini_immuno(n_files: int = 1) -> list[Path]:
14
+ def mini_immuno(
15
+ n_files: int = 1, filepath_prefix: str = "", strip_version: bool = False
16
+ ) -> list[Path]:
11
17
  """Return paths to the mini immuno dataset and its metadata as a Croissant file.
12
18
 
13
19
  Args:
14
20
  n_files: Number of files inside the croissant file. Default is 1.
21
+ filepath_prefix: Move the dataset and references to it in a specific directory.
22
+
23
+ Example
24
+
25
+ ::
26
+
27
+ croissant_path, dataset1_path = ln.examples.croissant.mini_immuno()
28
+ croissant_path, dataset1_path, dataset2_path = ln.examples.croissant.mini_immuno(n_files=2)
15
29
  """
16
30
  from ..datasets import file_mini_csv
17
31
  from ..datasets.mini_immuno import get_dataset1
18
32
 
19
33
  adata = get_dataset1(otype="AnnData")
20
- dataset1_path = Path("mini_immuno.anndata.zarr")
34
+ if filepath_prefix:
35
+ dataset1_path = Path(filepath_prefix) / "mini_immuno.anndata.zarr"
36
+ else:
37
+ dataset1_path = Path("mini_immuno.anndata.zarr")
21
38
  adata.write_zarr(dataset1_path)
22
39
  orig_croissant_path = (
23
40
  Path(__file__).parent / "mini_immuno.anndata.zarr_metadata.json"
24
41
  )
25
42
  with open(orig_croissant_path, encoding="utf-8") as f:
26
43
  data = json.load(f)
44
+ if filepath_prefix:
45
+ assert data["distribution"][0]["@id"] == "mini_immuno.anndata.zarr" # noqa: S101
46
+ data["distribution"][0]["@id"] = str(Path(filepath_prefix) / dataset1_path.name)
47
+ if strip_version:
48
+ data.pop("version", None)
27
49
  if n_files == 2:
28
- dataset2_path = file_mini_csv()
50
+ file_mini_csv()
51
+ if filepath_prefix:
52
+ dataset2_path = Path(filepath_prefix) / "mini.csv"
53
+ else:
54
+ dataset2_path = Path("mini.csv")
29
55
  data["distribution"].append(
30
56
  {
31
57
  "@type": "sc:FileObject",
32
- "@id": "mini.csv",
58
+ "@id": dataset2_path.as_posix(),
33
59
  "name": "mini.csv",
34
60
  "encodingFormat": "text/csv",
35
61
  }
@@ -41,7 +41,7 @@ Dictionary, Dataframe, AnnData, MuData, SpatialData.
41
41
  .. autosummary::
42
42
  :toctree: .
43
43
 
44
- dict_cxg_uns
44
+ dict_cellxgene_uns
45
45
  df_iris
46
46
  df_iris_in_meter
47
47
  df_iris_in_meter_study1
@@ -78,7 +78,7 @@ from ._core import (
78
78
  df_iris_in_meter,
79
79
  df_iris_in_meter_study1,
80
80
  df_iris_in_meter_study2,
81
- dict_cxg_uns,
81
+ dict_cellxgene_uns,
82
82
  dir_iris_images,
83
83
  dir_scrnaseq_cellranger,
84
84
  file_bam,
@@ -353,7 +353,7 @@ def anndata_suo22_Visium10X(): # pragma: no cover
353
353
  return ad.read_h5ad(filepath)
354
354
 
355
355
 
356
- def mudata_papalexi21_subset() -> MuData: # pragma: no cover
356
+ def mudata_papalexi21_subset(with_uns: bool = False) -> MuData: # pragma: no cover
357
357
  """A subsetted mudata from papalexi21.
358
358
 
359
359
  To reproduce the subsetting:
@@ -415,10 +415,17 @@ def mudata_papalexi21_subset() -> MuData: # pragma: no cover
415
415
  mdata["hto"].obs["technique"] = mdata["hto"].obs["technique"].astype("category")
416
416
  mdata.pull_obs(["technique"], mods="hto")
417
417
 
418
+ if with_uns:
419
+ mdata.uns["study_metadata"] = {
420
+ "temperature": 21.6,
421
+ "experiment": "Experiment 1",
422
+ }
423
+ mdata["rna"].uns["site_metadata"] = {"pos": 99.9, "site_id": "SITE001"}
424
+
418
425
  return mdata
419
426
 
420
427
 
421
- def dict_cxg_uns() -> dict[str, Any]:
428
+ def dict_cellxgene_uns() -> dict[str, Any]:
422
429
  """An example CELLxGENE AnnData `.uns` dictionary."""
423
430
  uns = {
424
431
  "organism_ontology_term_id": "NCBITaxon:9606",
@@ -9,32 +9,36 @@ import pandas as pd
9
9
 
10
10
  def small_dataset3_cellxgene(
11
11
  otype: Literal["DataFrame", "AnnData"] = "AnnData",
12
+ *,
12
13
  with_obs_defaults: bool = False,
14
+ with_var_typo: bool = False,
13
15
  with_obs_typo: bool = False,
16
+ with_uns_organism: bool = False,
17
+ with_uns_spatial: bool = False,
14
18
  ) -> tuple[pd.DataFrame, dict[str, Any]] | ad.AnnData:
15
- # TODO: consider other ids for other organisms
16
- # "ENSMUSG00002076988"
17
- var_ids = ["invalid_ensembl_id", "ENSG00000000419", "ENSG00000139618"]
18
-
19
+ var_id = "invalid_ensembl_id" if with_var_typo else "ENSG00000000457"
20
+ var_ids = [var_id, "ENSG00000000419", "ENSG00000139618"]
19
21
  lung_id = "UBERON:0002048XXX" if with_obs_typo else "UBERON:0002048"
22
+
23
+ obs_data = {
24
+ "disease_ontology_term_id": [
25
+ "MONDO:0004975",
26
+ "MONDO:0004980",
27
+ "MONDO:0004980",
28
+ ],
29
+ "development_stage_ontology_term_id": ["unknown", "unknown", "unknown"],
30
+ "sex_ontology_term_id": ["PATO:0000383", "PATO:0000384", "unknown"],
31
+ "tissue_ontology_term_id": [lung_id, lung_id, "UBERON:0000948"],
32
+ "cell_type": ["T cell", "B cell", "B cell"],
33
+ "self_reported_ethnicity": ["South Asian", "South Asian", "South Asian"],
34
+ "donor_id": ["-1", "1", "2"],
35
+ "is_primary_data": [False, False, False],
36
+ "suspension_type": ["cell", "cell", "cell"],
37
+ "tissue_type": ["tissue", "tissue", "tissue"],
38
+ }
39
+
20
40
  obs_df = pd.DataFrame(
21
- {
22
- "disease_ontology_term_id": [
23
- "MONDO:0004975",
24
- "MONDO:0004980",
25
- "MONDO:0004980",
26
- ],
27
- "development_stage_ontology_term_id": ["unknown", "unknown", "unknown"],
28
- "organism": ["human", "human", "human"],
29
- "sex_ontology_term_id": ["PATO:0000383", "PATO:0000384", "unknown"],
30
- "tissue_ontology_term_id": [lung_id, lung_id, "UBERON:0000948"],
31
- "cell_type": ["T cell", "B cell", "B cell"],
32
- "self_reported_ethnicity": ["South Asian", "South Asian", "South Asian"],
33
- "donor_id": ["-1", "1", "2"],
34
- "is_primary_data": [False, False, False],
35
- "suspension_type": ["cell", "cell", "cell"],
36
- "tissue_type": ["tissue", "tissue", "tissue"],
37
- },
41
+ obs_data,
38
42
  index=["barcode1", "barcode2", "barcode3"],
39
43
  )
40
44
 
@@ -65,8 +69,38 @@ def small_dataset3_cellxgene(
65
69
  # CELLxGENE requires the `.raw` slot to be set - https://github.com/chanzuckerberg/single-cell-curation/issues/1304
66
70
  adata.raw = adata.copy()
67
71
  adata.raw.var.drop(columns="feature_is_filtered", inplace=True)
72
+
68
73
  if with_obs_defaults:
74
+ adata.obs["cell_type_ontology_term_id"] = [
75
+ "CL:0000084",
76
+ "CL:0000236",
77
+ "CL:0000236",
78
+ ]
79
+ adata.obs["self_reported_ethnicity_ontology_term_id"] = "na"
80
+ adata.obs["assay_ontology_term_id"] = "EFO:1001982"
69
81
  adata.obs["assay"] = "single-cell RNA sequencing"
82
+ if with_uns_organism:
83
+ adata.uns["organism_ontology_term_id"] = "NCBITaxon:9606"
84
+ adata.uns["organism"] = "Homo sapiens"
85
+ else:
86
+ adata.obs["organism_ontology_term_id"] = "NCBITaxon:9606"
87
+ obs_data["organism"] = ["Homo sapiens", "Homo sapiens", "Homo sapiens"]
88
+ if with_uns_spatial:
89
+ adata.uns["spatial"] = {
90
+ "is_single": True,
91
+ "library_123": {
92
+ "scalefactors": {
93
+ "spot_diameter_fullres": 165.0,
94
+ "tissue_hires_scalef": 0.5,
95
+ },
96
+ "images": {
97
+ "hires": np.random.default_rng().integers(
98
+ 0, 255, (2000, 2000, 3), dtype=np.uint8
99
+ )
100
+ },
101
+ },
102
+ }
103
+
70
104
  return adata
71
105
 
72
106
 
@@ -92,6 +126,16 @@ def anndata_with_obs() -> ad.AnnData:
92
126
  df.index = "obs" + df.index.astype(str)
93
127
 
94
128
  adata = ad.AnnData(X=np.zeros(shape=(40, 100), dtype=np.float32), obs=df)
95
- adata.var.index = bionty_base.Gene().df().head(100)["ensembl_gene_id"].values
129
+ bionty_genes = bionty_base.Gene()
130
+ # backwards compatible
131
+ adata.var.index = (
132
+ (
133
+ bionty_genes.to_dataframe()
134
+ if hasattr(bionty_genes, "to_dataframe")
135
+ else bionty_genes.df()
136
+ )
137
+ .head(100)["ensembl_gene_id"]
138
+ .values
139
+ )
96
140
 
97
141
  return adata
@@ -46,6 +46,8 @@ def populate_sheets_compound_treatment():
46
46
 
47
47
  # Samples ---------------------------
48
48
 
49
+ project = ln.Feature(name="project", dtype=ln.Project).save()
50
+ project1 = ln.Project(name="Project 1").save()
49
51
  sample_type = ln.Record(name="BioSample", is_type=True).save()
50
52
  treatment = ln.Feature(name="treatment", dtype=treatment_type).save()
51
53
  cell_line = ln.Feature(name="cell_line", dtype=bt.CellLine).save()
@@ -54,7 +56,7 @@ def populate_sheets_compound_treatment():
54
56
  cell_line.save()
55
57
  schema1 = ln.Schema(
56
58
  name="My samples schema 2025-06",
57
- features=[treatment, cell_line, preparation_date],
59
+ features=[treatment, cell_line, preparation_date, project],
58
60
  ).save()
59
61
  sample_sheet1 = ln.Record(
60
62
  name="My samples 2025-06", schema=schema1, type=sample_type
@@ -69,6 +71,7 @@ def populate_sheets_compound_treatment():
69
71
  ln.models.RecordJson(
70
72
  record=sample1, feature=preparation_date, value="2025-06-01T05:00:00"
71
73
  ).save()
74
+ ln.models.RecordProject(record=sample1, feature=project, value=project1).save()
72
75
  # populate sample2
73
76
  sample2 = ln.Record(name="sample2", type=sample_sheet1).save()
74
77
  ln.models.RecordRecord(record=sample2, feature=treatment, value=treatment2).save()
@@ -76,12 +79,13 @@ def populate_sheets_compound_treatment():
76
79
  ln.models.RecordJson(
77
80
  record=sample2, feature=preparation_date, value="2025-06-01T06:00:00"
78
81
  ).save()
82
+ ln.models.RecordProject(record=sample2, feature=project, value=project1).save()
79
83
 
80
84
  # another sheet for samples
81
85
  sample_note = ln.Feature(name="sample_note", dtype="str").save()
82
86
  schema2 = ln.Schema(
83
87
  name="My samples schema 2025-07",
84
- features=[treatment, cell_line, sample_note],
88
+ features=[treatment, cell_line, sample_note, project],
85
89
  ).save()
86
90
  # the sheet
87
91
  sample_sheet2 = ln.Record(
@@ -94,6 +98,7 @@ def populate_sheets_compound_treatment():
94
98
  ln.models.RecordJson(
95
99
  record=sample3, feature=preparation_date, value="2025-06-02T05:00:00Z"
96
100
  ).save()
101
+ ln.models.RecordProject(record=sample3, feature=project, value=project1).save()
97
102
  # populate sample4
98
103
  sample4 = ln.Record(type=sample_sheet2).save()
99
104
  ln.models.RecordRecord(record=sample4, feature=treatment, value=treatment2).save()
@@ -101,6 +106,7 @@ def populate_sheets_compound_treatment():
101
106
  ln.models.RecordJson(
102
107
  record=sample4, feature=preparation_date, value="2025-06-02T06:00:00Z"
103
108
  ).save()
109
+ ln.models.RecordProject(record=sample4, feature=project, value=project1).save()
104
110
 
105
111
  yield treatments_sheet, sample_sheet1
106
112
 
@@ -4,6 +4,10 @@ import json
4
4
  from pathlib import Path
5
5
  from typing import TYPE_CHECKING, Any
6
6
 
7
+ import lamindb_setup as ln_setup
8
+ from lamin_utils import logger
9
+ from lamindb_setup.core.upath import UPath
10
+
7
11
  if TYPE_CHECKING:
8
12
  import lamindb as ln
9
13
 
@@ -27,6 +31,8 @@ def curate_from_croissant(
27
31
  """
28
32
  import lamindb as ln
29
33
 
34
+ from ..models.artifact import check_path_in_existing_storage
35
+
30
36
  # Load CroissantML data
31
37
  if isinstance(croissant_data, (str, Path)):
32
38
  if not Path(croissant_data).exists():
@@ -49,10 +55,10 @@ def curate_from_croissant(
49
55
 
50
56
  # Extract basic metadata
51
57
  dataset_name = data["name"]
52
- description = data.get("description", "")
53
- version = data.get("version", "1.0")
54
- license_info = data.get("license", "")
55
- project_name = data.get("cr:projectName", "")
58
+ description = data.get("description", None)
59
+ version = data.get("version", None)
60
+ license_info = data.get("license", None)
61
+ project_name = data.get("cr:projectName", None)
56
62
 
57
63
  # Create license feature and label if license info exists
58
64
  license_label = None
@@ -86,18 +92,35 @@ def curate_from_croissant(
86
92
  content_url = dist.get("contentUrl", "")
87
93
  file_path = content_url or data.get("url", "")
88
94
  if not file_path:
89
- raise ValueError(
90
- f"No valid file path found in croissant distribution: {dist}"
95
+ raise ValueError(f"No file path found in croissant distribution: {dist}")
96
+ if not UPath(file_path).exists():
97
+ raise ValueError(f"Inferred file path does not exist: {file_path}")
98
+ result = check_path_in_existing_storage(
99
+ file_path, check_hub_register_storage=ln_setup.settings.instance.is_on_hub
100
+ )
101
+ if isinstance(result, ln.Storage):
102
+ key = None # will automatically use existing storage key
103
+ else:
104
+ current_storage_location = (
105
+ ln.settings.storage
106
+ if not ln.setup.settings.instance.keep_artifacts_local
107
+ else ln.settings.local_storage
108
+ )
109
+ logger.warning(
110
+ f"file path {file_path} is not part of a known storage location, will be duplicated to: {current_storage_location}"
91
111
  )
112
+ key = file_id
92
113
  if len(file_distributions) == 1:
93
- artifact_description = f"{dataset_name}"
94
- if file_id != dataset_name:
95
- artifact_description += f" ({file_id})"
96
- artifact_description += f" - {description}"
114
+ # it doesn't make sense to have the dataset name on the individual
115
+ # artifact if it's part of a collection
116
+ artifact_description = dataset_name
117
+ if description is not None:
118
+ artifact_description += f" - {description}"
97
119
  else:
98
- artifact_description = f"{file_id}"
120
+ artifact_description = None
99
121
  artifact = ln.Artifact( # type: ignore
100
122
  file_path,
123
+ key=key,
101
124
  description=artifact_description,
102
125
  version=version,
103
126
  kind="dataset",
@@ -219,9 +219,8 @@ class Migration(migrations.Migration):
219
219
  "uid",
220
220
  lamindb.base.fields.CharField(
221
221
  blank=True,
222
- db_default="aaaaaaaaaaaa",
223
222
  db_index=True,
224
- default="aaaaaaaaaaaaa",
223
+ default=lamindb.base.uids.base62_12,
225
224
  editable=False,
226
225
  max_length=12,
227
226
  unique=True,
@@ -4582,4 +4581,8 @@ class Migration(migrations.Migration):
4582
4581
  name="unique_artifact_storage_hash_null_key",
4583
4582
  ),
4584
4583
  ),
4584
+ migrations.AlterModelOptions(
4585
+ name="user",
4586
+ options={},
4587
+ ),
4585
4588
  ]