lamindb 1.10.2__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. lamindb/__init__.py +89 -49
  2. lamindb/_finish.py +17 -15
  3. lamindb/_tracked.py +2 -4
  4. lamindb/_view.py +1 -1
  5. lamindb/base/__init__.py +2 -1
  6. lamindb/base/dtypes.py +76 -0
  7. lamindb/core/_settings.py +2 -2
  8. lamindb/core/storage/_anndata_accessor.py +29 -9
  9. lamindb/curators/_legacy.py +16 -3
  10. lamindb/curators/core.py +442 -188
  11. lamindb/errors.py +6 -0
  12. lamindb/examples/cellxgene/__init__.py +8 -3
  13. lamindb/examples/cellxgene/_cellxgene.py +127 -13
  14. lamindb/examples/cellxgene/{cxg_schema_versions.csv → cellxgene_schema_versions.csv} +11 -0
  15. lamindb/examples/croissant/__init__.py +32 -6
  16. lamindb/examples/datasets/__init__.py +2 -2
  17. lamindb/examples/datasets/_core.py +9 -2
  18. lamindb/examples/datasets/_small.py +66 -22
  19. lamindb/examples/fixtures/sheets.py +8 -2
  20. lamindb/integrations/_croissant.py +34 -11
  21. lamindb/migrations/0119_squashed.py +5 -2
  22. lamindb/migrations/0120_add_record_fk_constraint.py +64 -0
  23. lamindb/migrations/0121_recorduser.py +60 -0
  24. lamindb/models/__init__.py +4 -1
  25. lamindb/models/_describe.py +2 -2
  26. lamindb/models/_feature_manager.py +131 -71
  27. lamindb/models/_from_values.py +2 -2
  28. lamindb/models/_is_versioned.py +4 -4
  29. lamindb/models/_label_manager.py +4 -4
  30. lamindb/models/artifact.py +326 -172
  31. lamindb/models/artifact_set.py +45 -1
  32. lamindb/models/can_curate.py +1 -2
  33. lamindb/models/collection.py +3 -34
  34. lamindb/models/feature.py +111 -7
  35. lamindb/models/has_parents.py +11 -11
  36. lamindb/models/project.py +18 -0
  37. lamindb/models/query_manager.py +16 -7
  38. lamindb/models/query_set.py +191 -78
  39. lamindb/models/record.py +30 -5
  40. lamindb/models/run.py +10 -33
  41. lamindb/models/save.py +6 -8
  42. lamindb/models/schema.py +54 -26
  43. lamindb/models/sqlrecord.py +152 -40
  44. lamindb/models/storage.py +59 -14
  45. lamindb/models/transform.py +17 -17
  46. lamindb/models/ulabel.py +6 -1
  47. {lamindb-1.10.2.dist-info → lamindb-1.11.0.dist-info}/METADATA +12 -18
  48. {lamindb-1.10.2.dist-info → lamindb-1.11.0.dist-info}/RECORD +50 -47
  49. {lamindb-1.10.2.dist-info → lamindb-1.11.0.dist-info}/WHEEL +1 -1
  50. {lamindb-1.10.2.dist-info/licenses → lamindb-1.11.0.dist-info}/LICENSE +0 -0
lamindb/errors.py CHANGED
@@ -60,6 +60,12 @@ class DoesNotExist(Exception):
60
60
  pass
61
61
 
62
62
 
63
+ class MultipleResultsFound(Exception):
64
+ """Multiple records found."""
65
+
66
+ pass
67
+
68
+
63
69
  class InconsistentKey(Exception):
64
70
  """Inconsistent transform or artifact `key`."""
65
71
 
@@ -3,9 +3,14 @@
3
3
  .. autosummary::
4
4
  :toctree: .
5
5
 
6
- save_cxg_defaults
7
- get_cxg_schema
6
+ save_cellxgene_defaults
7
+ create_cellxgene_schema
8
8
 
9
9
  """
10
10
 
11
- from ._cellxgene import get_cxg_schema, save_cxg_defaults
11
+ from ._cellxgene import (
12
+ create_cellxgene_schema,
13
+ get_cxg_schema,
14
+ save_cellxgene_defaults,
15
+ save_cxg_defaults,
16
+ )
@@ -3,7 +3,9 @@ from __future__ import annotations
3
3
  from typing import TYPE_CHECKING, Collection, Literal, NamedTuple
4
4
 
5
5
  import pandas as pd
6
+ from lamindb_setup.core import deprecated
6
7
  from lamindb_setup.core.upath import UPath
8
+ from packaging import version
7
9
 
8
10
  from lamindb.models._from_values import _format_values
9
11
 
@@ -11,11 +13,25 @@ if TYPE_CHECKING:
11
13
  from lamindb.base.types import FieldAttr
12
14
  from lamindb.models import Schema, SQLRecord
13
15
 
14
- CELLxGENESchemaVersions = Literal["4.0.0", "5.0.0", "5.1.0", "5.2.0", "5.3.0"]
16
+ CELLxGENESchemaVersions = Literal["4.0.0", "5.0.0", "5.1.0", "5.2.0", "5.3.0", "6.0.0"]
17
+ CELLxGENEOrganisms = Literal[
18
+ "human",
19
+ "mouse",
20
+ "zebra danio",
21
+ "rhesus macaque",
+ "domestic pig",
22
+ "chimpanzee",
23
+ "white-tufted-ear marmoset",
24
+ "sars-2",
25
+ ]
15
26
  FieldType = Literal["ontology_id", "name"]
16
27
 
17
28
 
29
+ @deprecated(new_name="save_cellxgene_defaults")
18
30
  def save_cxg_defaults() -> None:
31
+ return save_cellxgene_defaults()
32
+
33
+
34
+ def save_cellxgene_defaults() -> None:
19
35
  """Save default values of the CELLxGENE schema to the instance.
20
36
 
21
37
  Adds CELLxGENE specific (control) values that are not available in the ontologies:
@@ -25,7 +41,6 @@ def save_cxg_defaults() -> None:
25
41
  - "unknown" entries for DevelopmentalStage, Phenotype, and CellType
26
42
  - "tissue", "organoid", and "cell culture" ULabels (tissue_type)
27
43
  - "cell", "nucleus", "na" ULabels (suspension_type)
28
-
29
44
  """
30
45
  import bionty as bt
31
46
 
@@ -47,12 +62,13 @@ def save_cxg_defaults() -> None:
47
62
  # na, unknown
48
63
  for model, name in zip(
49
64
  [
65
+ bt.Ethnicity,
50
66
  bt.Ethnicity,
51
67
  bt.DevelopmentalStage,
52
68
  bt.Phenotype,
53
69
  bt.CellType,
54
70
  ],
55
- ["na", "unknown", "unknown", "unknown"],
71
+ ["na", "unknown", "unknown", "unknown", "unknown"],
56
72
  ):
57
73
  model(ontology_id=name, name=name, description="From CellxGene schema.").save()
58
74
 
@@ -76,8 +92,24 @@ def save_cxg_defaults() -> None:
76
92
  name=name, type=suspension_type, description="From CellxGene schema."
77
93
  ).save()
78
94
 
95
+ # organisms
96
+ taxonomy_ids = [
97
+ "NCBITaxon:9606", # Homo sapiens (Human)
98
+ "NCBITaxon:10090", # Mus musculus (House mouse)
99
+ "NCBITaxon:9544", # Macaca mulatta (Rhesus monkey)
100
+ "NCBITaxon:9825", # Sus scrofa domesticus (Domestic pig)
101
+ "NCBITaxon:9598", # Pan troglodytes (Chimpanzee)
102
+ "NCBITaxon:9483", # Callithrix jacchus (White-tufted-ear marmoset)
103
+ "NCBITaxon:7955", # Danio rerio (Zebrafish)
104
+ ]
105
+ for ontology_id in taxonomy_ids:
106
+ bt.Organism.from_source(
107
+ ontology_id=ontology_id,
108
+ source=bt.Source.get(name="ncbitaxon", currently_used=True),
109
+ ).save()
110
+
79
111
 
80
- def _create_cxg_sources(
112
+ def _create_cellxgene_sources(
81
113
  categoricals: dict[str, FieldAttr], schema_version: str, organism: str
82
114
  ) -> dict[str, SQLRecord]:
83
115
  """Create a source dictionary of CELLxGENE categoricals to Source."""
@@ -105,7 +137,7 @@ def _create_cxg_sources(
105
137
  )
106
138
  return source
107
139
 
108
- sources_df = pd.read_csv(UPath(__file__).parent / "cxg_schema_versions.csv")
140
+ sources_df = pd.read_csv(UPath(__file__).parent / "cellxgene_schema_versions.csv")
109
141
  sources_df = sources_df[sources_df.schema_version == schema_version]
110
142
  if sources_df.empty:
111
143
  raise ValueError(
@@ -126,11 +158,28 @@ def _create_cxg_sources(
126
158
  return key_to_source
127
159
 
128
160
 
161
+ @deprecated(new_name="create_cellxgene_schema")
129
162
  def get_cxg_schema(
130
163
  schema_version: CELLxGENESchemaVersions,
131
164
  *,
132
165
  field_types: FieldType | Collection[FieldType] = "ontology_id",
133
- organism: Literal["human", "mouse"] = "human",
166
+ organism: CELLxGENEOrganisms = "human",
167
+ spatial_library_id: str | None = None,
168
+ ) -> Schema:
169
+ return create_cellxgene_schema(
170
+ schema_version,
171
+ field_types=field_types,
172
+ organism=organism,
173
+ spatial_library_id=spatial_library_id,
174
+ )
175
+
176
+
177
+ def create_cellxgene_schema(
178
+ schema_version: CELLxGENESchemaVersions,
179
+ *,
180
+ field_types: FieldType | Collection[FieldType] = "ontology_id",
181
+ organism: CELLxGENEOrganisms = "human",
182
+ spatial_library_id: str | None = None,
134
183
  ) -> Schema:
135
184
  """Generates a :class:`~lamindb.Schema` for a specific CELLxGENE schema version.
136
185
 
@@ -138,6 +187,8 @@ def get_cxg_schema(
138
187
  schema_version: The CELLxGENE Schema version.
139
188
  field_types: One or several of 'ontology_id', 'name'.
140
189
  organism: The organism of the Schema.
190
+ library_id: Identifier for the spatial library.
191
+ Specifying this value enables curation against spatial requirements.
141
192
  """
142
193
  import bionty as bt
143
194
 
@@ -168,7 +219,7 @@ def get_cxg_schema(
168
219
  "tissue": CategorySpec(bt.Tissue.name, None),
169
220
  "tissue_ontology_term_id": CategorySpec(bt.Tissue.ontology_id, None),
170
221
  "tissue_type": CategorySpec(ULabel.name, "tissue"),
171
- "organism": CategorySpec(bt.Organism.name, None),
222
+ "organism": CategorySpec(bt.Organism.scientific_name, None),
172
223
  "organism_ontology_term_id": CategorySpec(bt.Organism.ontology_id, None),
173
224
  "donor_id": CategorySpec(str, "unknown"),
174
225
  }
@@ -195,7 +246,17 @@ def get_cxg_schema(
195
246
  f"Invalid field_types: {field_types}. Must contain 'ontology_id', 'name', or both."
196
247
  )
197
248
 
198
- sources = _create_cxg_sources(
249
+ is_version_6_or_later = version.parse(schema_version) >= version.parse("6.0.0")
250
+
251
+ organism_fields = {"organism", "organism_ontology_term_id"}
252
+ if is_version_6_or_later:
253
+ obs_categoricals = {
254
+ k: v for k, v in categoricals.items() if k not in organism_fields
255
+ }
256
+ else:
257
+ obs_categoricals = categoricals
258
+
259
+ sources = _create_cellxgene_sources(
199
260
  categoricals=categoricals,
200
261
  schema_version=schema_version,
201
262
  organism=organism,
@@ -217,30 +278,83 @@ def get_cxg_schema(
217
278
  obs_features = [
218
279
  Feature(
219
280
  name=field,
220
- dtype=categoricals[field],
281
+ dtype=obs_categoricals[field],
221
282
  cat_filters={"source": source},
222
283
  default_value=categoricals_to_spec[field].default,
223
284
  ).save()
224
285
  for field, source in sources.items()
225
- if field != "var_index"
286
+ if field != "var_index" and field in obs_categoricals
226
287
  ]
227
288
  for name in ["is_primary_data", "suspension_type", "tissue_type"]:
228
289
  obs_features.append(Feature(name=name, dtype=ULabel.name).save())
229
290
 
230
291
  obs_schema = Schema(
231
- name=f"obs of CELLxGENE version {schema_version}",
292
+ name=f"obs of CELLxGENE version {schema_version} for {organism} of {field_types}",
232
293
  features=obs_features,
233
294
  otype="DataFrame",
234
295
  minimal_set=True,
235
296
  coerce_dtype=True,
236
297
  ).save()
237
298
 
299
+ slots = {"var": var_schema, "obs": obs_schema}
300
+
301
+ if is_version_6_or_later:
302
+ uns_categoricals = {
303
+ k: v for k, v in categoricals.items() if k in organism_fields
304
+ }
305
+
306
+ uns_features = [
307
+ Feature(
308
+ name=field,
309
+ dtype=uns_categoricals[field],
310
+ cat_filters={"source": sources[field]},
311
+ default_value=categoricals_to_spec[field].default,
312
+ ).save()
313
+ for field in uns_categoricals
314
+ ]
315
+
316
+ uns_schema = Schema(
317
+ name=f"uns of CELLxGENE version {schema_version}",
318
+ features=uns_features,
319
+ otype="DataFrame",
320
+ minimal_set=True,
321
+ coerce_dtype=True,
322
+ ).save()
323
+
324
+ slots["uns"] = uns_schema
325
+
326
+ # Add spatial validation if library_id is provided
327
+ if spatial_library_id:
328
+ scalefactors_schema = Schema(
329
+ name=f"scalefactors of spatial {spatial_library_id}",
330
+ features=[
331
+ Feature(name="spot_diameter_fullres", dtype=float).save(),
332
+ Feature(name="tissue_hires_scalef", dtype=float).save(),
333
+ ],
334
+ ).save()
335
+
336
+ spatial_schema = Schema(
337
+ name="CELLxGENE spatial metadata",
338
+ features=[
339
+ Feature(
340
+ name="is_single",
341
+ dtype=bool,
342
+ description="True if dataset represents single spatial unit (tissue section for Visium, array for Slide-seqV2)",
343
+ ).save()
344
+ ],
345
+ ).save()
346
+
347
+ slots["uns:spatial"] = spatial_schema
348
+ slots[f"uns:spatial:{spatial_library_id}:scalefactors"] = (
349
+ scalefactors_schema
350
+ )
351
+
238
352
  full_cxg_schema = Schema(
239
- name=f"AnnData of CELLxGENE version {schema_version}",
353
+ name=f"AnnData of CELLxGENE version {schema_version} for {organism} of {', '.join(field_types) if isinstance(field_types, list) else field_types}",
240
354
  otype="AnnData",
241
355
  minimal_set=True,
242
356
  coerce_dtype=True,
243
- slots={"var": var_schema, "obs": obs_schema},
357
+ slots=slots,
244
358
  ).save()
245
359
 
246
360
  return full_cxg_schema
@@ -52,3 +52,14 @@ schema_version,entity,organism,source,version
52
52
  5.3.0,Tissue,all,uberon,2025-01-15
53
53
  5.3.0,Gene,human,ensembl,release-110
54
54
  5.3.0,Gene,mouse,ensembl,release-110
55
+ 6.0.0,CellType,all,cl,2025-04-10
56
+ 6.0.0,ExperimentalFactor,all,efo,3.78.0
57
+ 6.0.0,Ethnicity,human,hancestro,3.0
58
+ 6.0.0,DevelopmentalStage,human,hsapdv,2025-01-23
59
+ 6.0.0,DevelopmentalStage,mouse,mmusdv,2025-01-23
60
+ 6.0.0,Disease,all,mondo,2025-05-06
61
+ 6.0.0,Organism,all,ncbitaxon,2025-03-13
62
+ 6.0.0,Phenotype,all,pato,2025-05-14
63
+ 6.0.0,Tissue,all,uberon,2025-05-28
64
+ 6.0.0,Gene,human,ensembl,release-110
65
+ 6.0.0,Gene,mouse,ensembl,release-110
@@ -1,35 +1,61 @@
1
- """Example Croissant files.
1
+ """Examples for MLCommons Croissant files, which are used to store metadata about datasets.
2
+
3
+ .. autosummary::
4
+ :toctree: .
5
+
6
+ mini_immuno
2
7
 
3
- Examples for MLCommons Croissant files, which are used to store metadata about datasets.
4
8
  """
5
9
 
6
10
  import json
7
11
  from pathlib import Path
8
12
 
9
13
 
10
- def mini_immuno(n_files: int = 1) -> list[Path]:
14
+ def mini_immuno(
15
+ n_files: int = 1, filepath_prefix: str = "", strip_version: bool = False
16
+ ) -> list[Path]:
11
17
  """Return paths to the mini immuno dataset and its metadata as a Croissant file.
12
18
 
13
19
  Args:
14
20
  n_files: Number of files inside the croissant file. Default is 1.
21
+ filepath_prefix: Move the dataset and references to it in a specific directory.
22
+
23
+ Example
24
+
25
+ ::
26
+
27
+ croissant_path, dataset1_path = ln.examples.croissant.mini_immuno()
28
+ croissant_path, dataset1_path, dataset2_path = ln.examples.croissant.mini_immuno(n_files=2)
15
29
  """
16
30
  from ..datasets import file_mini_csv
17
31
  from ..datasets.mini_immuno import get_dataset1
18
32
 
19
33
  adata = get_dataset1(otype="AnnData")
20
- dataset1_path = Path("mini_immuno.anndata.zarr")
34
+ if filepath_prefix:
35
+ dataset1_path = Path(filepath_prefix) / "mini_immuno.anndata.zarr"
36
+ else:
37
+ dataset1_path = Path("mini_immuno.anndata.zarr")
21
38
  adata.write_zarr(dataset1_path)
22
39
  orig_croissant_path = (
23
40
  Path(__file__).parent / "mini_immuno.anndata.zarr_metadata.json"
24
41
  )
25
42
  with open(orig_croissant_path, encoding="utf-8") as f:
26
43
  data = json.load(f)
44
+ if filepath_prefix:
45
+ assert data["distribution"][0]["@id"] == "mini_immuno.anndata.zarr" # noqa: S101
46
+ data["distribution"][0]["@id"] = str(Path(filepath_prefix) / dataset1_path.name)
47
+ if strip_version:
48
+ data.pop("version", None)
27
49
  if n_files == 2:
28
- dataset2_path = file_mini_csv()
50
+ file_mini_csv()
51
+ if filepath_prefix:
52
+ dataset2_path = Path(filepath_prefix) / "mini.csv"
53
+ else:
54
+ dataset2_path = Path("mini.csv")
29
55
  data["distribution"].append(
30
56
  {
31
57
  "@type": "sc:FileObject",
32
- "@id": "mini.csv",
58
+ "@id": dataset2_path.as_posix(),
33
59
  "name": "mini.csv",
34
60
  "encodingFormat": "text/csv",
35
61
  }
@@ -41,7 +41,7 @@ Dictionary, Dataframe, AnnData, MuData, SpatialData.
41
41
  .. autosummary::
42
42
  :toctree: .
43
43
 
44
- dict_cxg_uns
44
+ dict_cellxgene_uns
45
45
  df_iris
46
46
  df_iris_in_meter
47
47
  df_iris_in_meter_study1
@@ -78,7 +78,7 @@ from ._core import (
78
78
  df_iris_in_meter,
79
79
  df_iris_in_meter_study1,
80
80
  df_iris_in_meter_study2,
81
- dict_cxg_uns,
81
+ dict_cellxgene_uns,
82
82
  dir_iris_images,
83
83
  dir_scrnaseq_cellranger,
84
84
  file_bam,
@@ -353,7 +353,7 @@ def anndata_suo22_Visium10X(): # pragma: no cover
353
353
  return ad.read_h5ad(filepath)
354
354
 
355
355
 
356
- def mudata_papalexi21_subset() -> MuData: # pragma: no cover
356
+ def mudata_papalexi21_subset(with_uns: bool = False) -> MuData: # pragma: no cover
357
357
  """A subsetted mudata from papalexi21.
358
358
 
359
359
  To reproduce the subsetting:
@@ -415,10 +415,17 @@ def mudata_papalexi21_subset() -> MuData: # pragma: no cover
415
415
  mdata["hto"].obs["technique"] = mdata["hto"].obs["technique"].astype("category")
416
416
  mdata.pull_obs(["technique"], mods="hto")
417
417
 
418
+ if with_uns:
419
+ mdata.uns["study_metadata"] = {
420
+ "temperature": 21.6,
421
+ "experiment": "Experiment 1",
422
+ }
423
+ mdata["rna"].uns["site_metadata"] = {"pos": 99.9, "site_id": "SITE001"}
424
+
418
425
  return mdata
419
426
 
420
427
 
421
- def dict_cxg_uns() -> dict[str, Any]:
428
+ def dict_cellxgene_uns() -> dict[str, Any]:
422
429
  """An example CELLxGENE AnnData `.uns` dictionary."""
423
430
  uns = {
424
431
  "organism_ontology_term_id": "NCBITaxon:9606",
@@ -9,32 +9,36 @@ import pandas as pd
9
9
 
10
10
  def small_dataset3_cellxgene(
11
11
  otype: Literal["DataFrame", "AnnData"] = "AnnData",
12
+ *,
12
13
  with_obs_defaults: bool = False,
14
+ with_var_typo: bool = False,
13
15
  with_obs_typo: bool = False,
16
+ with_uns_organism: bool = False,
17
+ with_uns_spatial: bool = False,
14
18
  ) -> tuple[pd.DataFrame, dict[str, Any]] | ad.AnnData:
15
- # TODO: consider other ids for other organisms
16
- # "ENSMUSG00002076988"
17
- var_ids = ["invalid_ensembl_id", "ENSG00000000419", "ENSG00000139618"]
18
-
19
+ var_id = "invalid_ensembl_id" if with_var_typo else "ENSG00000000457"
20
+ var_ids = [var_id, "ENSG00000000419", "ENSG00000139618"]
19
21
  lung_id = "UBERON:0002048XXX" if with_obs_typo else "UBERON:0002048"
22
+
23
+ obs_data = {
24
+ "disease_ontology_term_id": [
25
+ "MONDO:0004975",
26
+ "MONDO:0004980",
27
+ "MONDO:0004980",
28
+ ],
29
+ "development_stage_ontology_term_id": ["unknown", "unknown", "unknown"],
30
+ "sex_ontology_term_id": ["PATO:0000383", "PATO:0000384", "unknown"],
31
+ "tissue_ontology_term_id": [lung_id, lung_id, "UBERON:0000948"],
32
+ "cell_type": ["T cell", "B cell", "B cell"],
33
+ "self_reported_ethnicity": ["South Asian", "South Asian", "South Asian"],
34
+ "donor_id": ["-1", "1", "2"],
35
+ "is_primary_data": [False, False, False],
36
+ "suspension_type": ["cell", "cell", "cell"],
37
+ "tissue_type": ["tissue", "tissue", "tissue"],
38
+ }
39
+
20
40
  obs_df = pd.DataFrame(
21
- {
22
- "disease_ontology_term_id": [
23
- "MONDO:0004975",
24
- "MONDO:0004980",
25
- "MONDO:0004980",
26
- ],
27
- "development_stage_ontology_term_id": ["unknown", "unknown", "unknown"],
28
- "organism": ["human", "human", "human"],
29
- "sex_ontology_term_id": ["PATO:0000383", "PATO:0000384", "unknown"],
30
- "tissue_ontology_term_id": [lung_id, lung_id, "UBERON:0000948"],
31
- "cell_type": ["T cell", "B cell", "B cell"],
32
- "self_reported_ethnicity": ["South Asian", "South Asian", "South Asian"],
33
- "donor_id": ["-1", "1", "2"],
34
- "is_primary_data": [False, False, False],
35
- "suspension_type": ["cell", "cell", "cell"],
36
- "tissue_type": ["tissue", "tissue", "tissue"],
37
- },
41
+ obs_data,
38
42
  index=["barcode1", "barcode2", "barcode3"],
39
43
  )
40
44
 
@@ -65,8 +69,38 @@ def small_dataset3_cellxgene(
65
69
  # CELLxGENE requires the `.raw` slot to be set - https://github.com/chanzuckerberg/single-cell-curation/issues/1304
66
70
  adata.raw = adata.copy()
67
71
  adata.raw.var.drop(columns="feature_is_filtered", inplace=True)
72
+
68
73
  if with_obs_defaults:
74
+ adata.obs["cell_type_ontology_term_id"] = [
75
+ "CL:0000084",
76
+ "CL:0000236",
77
+ "CL:0000236",
78
+ ]
79
+ adata.obs["self_reported_ethnicity_ontology_term_id"] = "na"
80
+ adata.obs["assay_ontology_term_id"] = "EFO:1001982"
69
81
  adata.obs["assay"] = "single-cell RNA sequencing"
82
+ if with_uns_organism:
83
+ adata.uns["organism_ontology_term_id"] = "NCBITaxon:9606"
84
+ adata.uns["organism"] = "Homo sapiens"
85
+ else:
86
+ adata.obs["organism_ontology_term_id"] = "NCBITaxon:9606"
87
+ obs_data["organism"] = ["Homo sapiens", "Homo sapiens", "Homo sapiens"]
88
+ if with_uns_spatial:
89
+ adata.uns["spatial"] = {
90
+ "is_single": True,
91
+ "library_123": {
92
+ "scalefactors": {
93
+ "spot_diameter_fullres": 165.0,
94
+ "tissue_hires_scalef": 0.5,
95
+ },
96
+ "images": {
97
+ "hires": np.random.default_rng().integers(
98
+ 0, 255, (2000, 2000, 3), dtype=np.uint8
99
+ )
100
+ },
101
+ },
102
+ }
103
+
70
104
  return adata
71
105
 
72
106
 
@@ -92,6 +126,16 @@ def anndata_with_obs() -> ad.AnnData:
92
126
  df.index = "obs" + df.index.astype(str)
93
127
 
94
128
  adata = ad.AnnData(X=np.zeros(shape=(40, 100), dtype=np.float32), obs=df)
95
- adata.var.index = bionty_base.Gene().df().head(100)["ensembl_gene_id"].values
129
+ bionty_genes = bionty_base.Gene()
130
+ # backwards compatible
131
+ adata.var.index = (
132
+ (
133
+ bionty_genes.to_dataframe()
134
+ if hasattr(bionty_genes, "to_dataframe")
135
+ else bionty_genes.df()
136
+ )
137
+ .head(100)["ensembl_gene_id"]
138
+ .values
139
+ )
96
140
 
97
141
  return adata
@@ -46,6 +46,8 @@ def populate_sheets_compound_treatment():
46
46
 
47
47
  # Samples ---------------------------
48
48
 
49
+ project = ln.Feature(name="project", dtype=ln.Project).save()
50
+ project1 = ln.Project(name="Project 1").save()
49
51
  sample_type = ln.Record(name="BioSample", is_type=True).save()
50
52
  treatment = ln.Feature(name="treatment", dtype=treatment_type).save()
51
53
  cell_line = ln.Feature(name="cell_line", dtype=bt.CellLine).save()
@@ -54,7 +56,7 @@ def populate_sheets_compound_treatment():
54
56
  cell_line.save()
55
57
  schema1 = ln.Schema(
56
58
  name="My samples schema 2025-06",
57
- features=[treatment, cell_line, preparation_date],
59
+ features=[treatment, cell_line, preparation_date, project],
58
60
  ).save()
59
61
  sample_sheet1 = ln.Record(
60
62
  name="My samples 2025-06", schema=schema1, type=sample_type
@@ -69,6 +71,7 @@ def populate_sheets_compound_treatment():
69
71
  ln.models.RecordJson(
70
72
  record=sample1, feature=preparation_date, value="2025-06-01T05:00:00"
71
73
  ).save()
74
+ ln.models.RecordProject(record=sample1, feature=project, value=project1).save()
72
75
  # populate sample2
73
76
  sample2 = ln.Record(name="sample2", type=sample_sheet1).save()
74
77
  ln.models.RecordRecord(record=sample2, feature=treatment, value=treatment2).save()
@@ -76,12 +79,13 @@ def populate_sheets_compound_treatment():
76
79
  ln.models.RecordJson(
77
80
  record=sample2, feature=preparation_date, value="2025-06-01T06:00:00"
78
81
  ).save()
82
+ ln.models.RecordProject(record=sample2, feature=project, value=project1).save()
79
83
 
80
84
  # another sheet for samples
81
85
  sample_note = ln.Feature(name="sample_note", dtype="str").save()
82
86
  schema2 = ln.Schema(
83
87
  name="My samples schema 2025-07",
84
- features=[treatment, cell_line, sample_note],
88
+ features=[treatment, cell_line, sample_note, project],
85
89
  ).save()
86
90
  # the sheet
87
91
  sample_sheet2 = ln.Record(
@@ -94,6 +98,7 @@ def populate_sheets_compound_treatment():
94
98
  ln.models.RecordJson(
95
99
  record=sample3, feature=preparation_date, value="2025-06-02T05:00:00Z"
96
100
  ).save()
101
+ ln.models.RecordProject(record=sample3, feature=project, value=project1).save()
97
102
  # populate sample4
98
103
  sample4 = ln.Record(type=sample_sheet2).save()
99
104
  ln.models.RecordRecord(record=sample4, feature=treatment, value=treatment2).save()
@@ -101,6 +106,7 @@ def populate_sheets_compound_treatment():
101
106
  ln.models.RecordJson(
102
107
  record=sample4, feature=preparation_date, value="2025-06-02T06:00:00Z"
103
108
  ).save()
109
+ ln.models.RecordProject(record=sample4, feature=project, value=project1).save()
104
110
 
105
111
  yield treatments_sheet, sample_sheet1
106
112
 
@@ -4,6 +4,10 @@ import json
4
4
  from pathlib import Path
5
5
  from typing import TYPE_CHECKING, Any
6
6
 
7
+ import lamindb_setup as ln_setup
8
+ from lamin_utils import logger
9
+ from lamindb_setup.core.upath import UPath
10
+
7
11
  if TYPE_CHECKING:
8
12
  import lamindb as ln
9
13
 
@@ -27,6 +31,8 @@ def curate_from_croissant(
27
31
  """
28
32
  import lamindb as ln
29
33
 
34
+ from ..models.artifact import check_path_in_existing_storage
35
+
30
36
  # Load CroissantML data
31
37
  if isinstance(croissant_data, (str, Path)):
32
38
  if not Path(croissant_data).exists():
@@ -49,10 +55,10 @@ def curate_from_croissant(
49
55
 
50
56
  # Extract basic metadata
51
57
  dataset_name = data["name"]
52
- description = data.get("description", "")
53
- version = data.get("version", "1.0")
54
- license_info = data.get("license", "")
55
- project_name = data.get("cr:projectName", "")
58
+ description = data.get("description", None)
59
+ version = data.get("version", None)
60
+ license_info = data.get("license", None)
61
+ project_name = data.get("cr:projectName", None)
56
62
 
57
63
  # Create license feature and label if license info exists
58
64
  license_label = None
@@ -86,18 +92,35 @@ def curate_from_croissant(
86
92
  content_url = dist.get("contentUrl", "")
87
93
  file_path = content_url or data.get("url", "")
88
94
  if not file_path:
89
- raise ValueError(
90
- f"No valid file path found in croissant distribution: {dist}"
95
+ raise ValueError(f"No file path found in croissant distribution: {dist}")
96
+ if not UPath(file_path).exists():
97
+ raise ValueError(f"Inferred file path does not exist: {file_path}")
98
+ result = check_path_in_existing_storage(
99
+ file_path, check_hub_register_storage=ln_setup.settings.instance.is_on_hub
100
+ )
101
+ if isinstance(result, ln.Storage):
102
+ key = None # will automatically use existing storage key
103
+ else:
104
+ current_storage_location = (
105
+ ln.settings.storage
106
+ if not ln.setup.settings.instance.keep_artifacts_local
107
+ else ln.settings.local_storage
108
+ )
109
+ logger.warning(
110
+ f"file path {file_path} is not part of a known storage location, will be duplicated to: {current_storage_location}"
91
111
  )
112
+ key = file_id
92
113
  if len(file_distributions) == 1:
93
- artifact_description = f"{dataset_name}"
94
- if file_id != dataset_name:
95
- artifact_description += f" ({file_id})"
96
- artifact_description += f" - {description}"
114
+ # it doesn't make sense to have the dataset name on the individual
115
+ # artifact if it's part of a collection
116
+ artifact_description = dataset_name
117
+ if description is not None:
118
+ artifact_description += f" - {description}"
97
119
  else:
98
- artifact_description = f"{file_id}"
120
+ artifact_description = None
99
121
  artifact = ln.Artifact( # type: ignore
100
122
  file_path,
123
+ key=key,
101
124
  description=artifact_description,
102
125
  version=version,
103
126
  kind="dataset",
@@ -219,9 +219,8 @@ class Migration(migrations.Migration):
219
219
  "uid",
220
220
  lamindb.base.fields.CharField(
221
221
  blank=True,
222
- db_default="aaaaaaaaaaaa",
223
222
  db_index=True,
224
- default="aaaaaaaaaaaaa",
223
+ default=lamindb.base.uids.base62_12,
225
224
  editable=False,
226
225
  max_length=12,
227
226
  unique=True,
@@ -4582,4 +4581,8 @@ class Migration(migrations.Migration):
4582
4581
  name="unique_artifact_storage_hash_null_key",
4583
4582
  ),
4584
4583
  ),
4584
+ migrations.AlterModelOptions(
4585
+ name="user",
4586
+ options={},
4587
+ ),
4585
4588
  ]