PyPI - lamindb - Versions diffs - 0.48a2__py3-none-any.whl → 0.48.1__py3-none-any.whl - Mend

lamindb 0.48a2__py3-none-any.whl → 0.48.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

lamindb/__init__.py +15 -24
lamindb/_context.py +5 -2
lamindb/_dataset.py +6 -3
lamindb/_delete.py +6 -6
lamindb/_feature.py +61 -26
lamindb/_feature_manager.py +176 -0
lamindb/_feature_set.py +63 -27
lamindb/_file.py +120 -76
lamindb/_from_values.py +88 -28
lamindb/_label.py +85 -0
lamindb/_logger.py +1 -1
lamindb/_manager.py +24 -17
lamindb/_orm.py +157 -33
lamindb/_queryset.py +37 -35
lamindb/_save.py +19 -9
lamindb/_transform.py +12 -3
lamindb/_view.py +1 -1
lamindb/dev/__init__.py +4 -0
lamindb/dev/_settings.py +1 -1
lamindb/dev/_view_parents.py +70 -34
lamindb/dev/datasets/__init__.py +12 -0
lamindb/dev/datasets/_core.py +116 -65
lamindb/dev/storage/__init__.py +1 -5
lamindb/dev/storage/_backed_access.py +505 -379
lamindb/dev/storage/file.py +3 -1
{lamindb-0.48a2.dist-info → lamindb-0.48.1.dist-info}/METADATA +10 -8
lamindb-0.48.1.dist-info/RECORD +42 -0
lamindb/_category.py +0 -42
lamindb-0.48a2.dist-info/RECORD +0 -41
{lamindb-0.48a2.dist-info → lamindb-0.48.1.dist-info}/LICENSE +0 -0
{lamindb-0.48a2.dist-info → lamindb-0.48.1.dist-info}/WHEEL +0 -0
{lamindb-0.48a2.dist-info → lamindb-0.48.1.dist-info}/entry_points.txt +0 -0

lamindb/dev/_view_parents.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from typing import List, Set, Union
 from lnschema_core import ORM, File, Run
+from lnschema_core.models import format_datetime
 def view_lineage(file: File, with_children: bool = True):
@@ -32,9 +33,10 @@ def view_lineage(file: File, with_children: bool = True):
     u = graphviz.Digraph(
         file.id,
         node_attr={
-            "fillcolor": "antiquewhite",
-            "color": "orange",
+            "fillcolor": "honeydew",
+            "color": "seagreen",
             "fontname": "Helvetica",
+            "fontsize": "10",
         },
         edge_attr={"arrowsize": "0.5"},
     )
@@ -47,9 +49,9 @@ def view_lineage(file: File, with_children: bool = True):
             shape = "box"
             fillcolor = "gainsboro"
         else:
-            shape = "oval"
-            style = "filled"
-            fillcolor = "antiquewhite"
+            style = "rounded,filled"
+            shape = "box"
+            fillcolor = "honeydew"
         u.node(
             node_id,
             label=node_label,
@@ -64,39 +66,58 @@ def view_lineage(file: File, with_children: bool = True):
             add_node(row["target_record"], row["target"], row["target_label"], u)
         u.edge(row["source"], row["target"], color="dimgrey")
-    # label the searched file orange
-    u.node(file.id, label=file_label, style="filled", fillcolor="orange", shape="oval")
+    # label the searched file mediumseagreen
+    u.node(
+        file.id,
+        label=file_label,
+        style="rounded,filled",
+        fillcolor="mediumseagreen",
+        shape="box",
+    )
     return u
-def view_parents(record: ORM, field: str, distance: int = 100):
+def view_parents(
+    record: ORM, field: str, with_children: bool = False, distance: int = 100
+):
     """Graph of parents."""
     if not hasattr(record, "parents"):
         return NotImplementedError(
             f"Parents view is not supported for {record.__class__.__name__}!"
         )
     import graphviz
+    import pandas as pd
     df_edges = _df_edges_from_parents(record=record, field=field, distance=distance)
+    if with_children:
+        df_edges = pd.concat(
+            [
+                df_edges,
+                _df_edges_from_parents(
+                    record=record, field=field, distance=distance, children=True
+                ),
+            ]
+        ).drop_duplicates()
     record_label = record.__getattribute__(field)
     u = graphviz.Digraph(
         record.id,
         node_attr={
-            "color": "orange",
-            "fillcolor": "antiquewhite",
+            "color": "seagreen",
+            "fillcolor": "honeydew",
             "shape": "box",
             "style": "rounded,filled",
             "fontname": "Helvetica",
+            "fontsize": "10",
         },
         edge_attr={"arrowsize": "0.5"},
     )
     u.node(
         record_label.replace(":", "_"),
         label=record_label,
-        fillcolor="orange",
+        fillcolor="mediumseagreen",
     )
     for _, row in df_edges.iterrows():
         u.node(row["source"], label=row["source_label"])
@@ -105,17 +126,21 @@ def view_parents(record: ORM, field: str, distance: int = 100):
     return u
-def _get_parents(record: ORM, field: str, distance: int):
+def _get_parents(record: ORM, field: str, distance: int, children: bool = False):
     """Recursively get parent records within a distance."""
+    if children:
+        key = "parents"
+    else:
+        key = "children"
     model = record.__class__
-    condition = f"children__{field}"
+    condition = f"{key}__{field}"
     results = model.select(**{condition: record.__getattribute__(field)}).all()
     if distance < 2:
         return results
     d = 2
     while d < distance:
-        condition = "children__" + condition
+        condition = f"{key}__{condition}"
         records = model.select(**{condition: record.__getattribute__(field)}).all()
         if len(records) == 0:
@@ -126,16 +151,21 @@ def _get_parents(record: ORM, field: str, distance: int):
     return results
-def _df_edges_from_parents(record: ORM, field: str, distance: int):
+def _df_edges_from_parents(
+    record: ORM, field: str, distance: int, children: bool = False
+):
     """Construct a DataFrame of edges as the input of graphviz.Digraph."""
-    parents = _get_parents(record=record, field=field, distance=distance)
+    key = "children" if children else "parents"
+    parents = _get_parents(
+        record=record, field=field, distance=distance, children=children
+    )
     records = parents | record.__class__.objects.filter(id=record.id)
-    df = records.distinct().df(include=[f"parents__{field}"])
-    df_edges = df[[f"parents__{field}", field]]
-    df_edges = df_edges.explode(f"parents__{field}")
+    df = records.distinct().df(include=[f"{key}__{field}"])
+    df_edges = df[[f"{key}__{field}", field]]
+    df_edges = df_edges.explode(f"{key}__{field}")
     df_edges.dropna(axis=0, inplace=True)
     df_edges.rename(
-        columns={f"parents__{field}": "source", field: "target"}, inplace=True
+        columns={f"{key}__{field}": "source", field: "target"}, inplace=True
     )
     df_edges = df_edges.drop_duplicates()
@@ -153,10 +183,10 @@ def _get_all_parent_runs(file: File):
     all_runs = {file.run}
     runs = [file.run]
-    while any([r.inputs.exists() for r in runs if r is not None]):
+    while any([r.input_files.exists() for r in runs if r is not None]):
         inputs = []
         for r in runs:
-            inputs += r.inputs.all()
+            inputs += r.input_files.all()
         runs = [f.run for f in inputs]
         all_runs.update(runs)
     return all_runs
@@ -166,29 +196,35 @@ def _get_all_child_runs(file: File):
     """Get all output file runs recursively."""
     all_runs: Set[Run] = set()
-    runs = {f.run for f in file.run.outputs.all()}
+    runs = {f.run for f in file.run.output_files.all()}
     while runs.difference(all_runs):
         all_runs.update(runs)
         child_runs: Set[Run] = set()
         for r in runs:
-            child_runs.update(Run.select(inputs__id__in=r.outputs.list("id")).list())
+            child_runs.update(
+                Run.select(input_files__id__in=r.output_files.list("id")).list()
+            )
         runs = child_runs
     return all_runs
 def _label_file_run(record: Union[File, Run]):
     if isinstance(record, File):
+        if record.description is None:
+            name = record.key
+        else:
+            name = record.description.replace("&", "&amp;")
         return (
-            rf'<{record.key}<BR/><FONT COLOR="GREY" POINT-SIZE="10"'
-            rf' FACE="Monospace">id={record.id}</FONT>>'
-            if record.key is not None
-            else record.id
+            rf'<{name}<BR/><FONT COLOR="GREY" POINT-SIZE="10"'
+            rf' FACE="Monospace">id={record.id}<BR/>suffix={record.suffix}</FONT>>'
         )
     elif isinstance(record, Run):
-        name = f'{record.transform.name.replace("&", "&amp;")}   '
+        name = f'{record.transform.name.replace("&", "&amp;")}'
         return (
             rf'<{name}<BR/><FONT COLOR="GREY" POINT-SIZE="10"'
-            rf' FACE="Monospace">id={record.id}</FONT>>'
+            rf' FACE="Monospace">id={record.id}<BR/>type={record.transform.type},'
+            rf" user={record.created_by.name}<BR/>run_at={format_datetime(record.run_at)}</FONT>>"  # noqa
         )
@@ -199,10 +235,10 @@ def _df_edges_from_runs(all_runs: List[Run]):
     for run in all_runs:
         if run is None:
             continue
-        if run.inputs.exists():
-            df_values.append((run.inputs.list(), run))
-        if run.outputs.exists():
-            df_values.append((run, run.outputs.list()))
+        if run.input_files.exists():
+            df_values.append((run.input_files.list(), run))
+        if run.output_files.exists():
+            df_values.append((run, run.output_files.list()))
     df = pd.DataFrame(df_values, columns=["source_record", "target_record"])
     df = df.explode("source_record")
     df = df.explode("target_record")

lamindb/dev/datasets/__init__.py CHANGED Viewed

@@ -4,7 +4,9 @@
    :toctree: .
    file_fcs
+   file_fcs_alpert19
    file_jpg_paradisi05
+   file_tiff_suo22
    file_fastq
    file_bam
    file_mini_csv
@@ -19,6 +21,10 @@
    anndata_pbmc68k_reduced
    anndata_pbmc3k_processed
    anndata_with_obs
+   anndata_suo22_Visium10X
+   mudata_papalexi21_subset
+   schmidt22_crispra_gws_IFNG
+   schmidt22_perturbseq
    fake_bio_notebook_titles
 """
@@ -27,6 +33,7 @@ from ._core import (
     anndata_mouse_sc_lymph_node,
     anndata_pbmc3k_processed,
     anndata_pbmc68k_reduced,
+    anndata_suo22_Visium10X,
     anndata_with_obs,
     df_iris,
     df_iris_in_meter,
@@ -36,8 +43,13 @@ from ._core import (
     file_bam,
     file_fastq,
     file_fcs,
+    file_fcs_alpert19,
     file_jpg_paradisi05,
     file_mini_csv,
+    file_tiff_suo22,
     generate_cell_ranger_files,
+    mudata_papalexi21_subset,
+    schmidt22_crispra_gws_IFNG,
+    schmidt22_perturbseq,
 )
 from ._fake import fake_bio_notebook_titles

lamindb/dev/datasets/_core.py CHANGED Viewed

@@ -11,13 +11,22 @@ from .._settings import settings
 def file_fcs() -> Path:
-    """Return fcs file example."""
+    """Example FCS file."""
     filepath, _ = urlretrieve(
         "https://lamindb-test.s3.amazonaws.com/example.fcs", "example.fcs"
     )
     return Path(filepath)
+def file_fcs_alpert19() -> Path:
+    """FCS file from Alpert19."""
+    filepath, _ = urlretrieve(
+        "https://lamindb-test.s3.amazonaws.com/Alpert19-070314-Mike-Study+15-2013-plate+1-15-004-1-13_cells_found.fcs",  # noqa
+        "Alpert19.fcs",
+    )
+    return Path(filepath)
 def file_jpg_paradisi05() -> Path:
     """Return jpg file example.
@@ -30,26 +39,45 @@ def file_jpg_paradisi05() -> Path:
     return Path(filepath)
-def file_fastq() -> Path:
+def file_fastq(in_storage_root=False) -> Path:
     """Mini mock fastq file."""
-    with open("./input.fastq.gz", "w") as f:
+    basedir = Path(".") if not in_storage_root else settings.storage
+    filepath = basedir / "input.fastq.gz"
+    with open(filepath, "w") as f:
         f.write("Mock fastq file.")
-    return Path("./input.fastq.gz")
+    return filepath
-def file_bam() -> Path:
+def file_bam(in_storage_root=False) -> Path:
     """Mini mock bam file."""
-    with open("./output.bam", "w") as f:
+    basedir = Path(".") if not in_storage_root else settings.storage
+    filepath = basedir / "output.bam"
+    with open(filepath, "w") as f:
         f.write("Mock bam file.")
-    return Path("./output.bam")
+    return filepath
-def file_mini_csv() -> Path:
+def file_mini_csv(in_storage_root=False) -> Path:
     """Mini csv file."""
-    filename = Path("./mini.csv")
+    basedir = Path(".") if not in_storage_root else settings.storage
+    filepath = basedir / "mini.csv"
     df = pd.DataFrame([1, 2, 3], columns=["test"])
-    df.to_csv(filename, index=False)
-    return filename
+    df.to_csv(filepath, index=False)
+    return filepath
+def file_tiff_suo22():
+    """Image file from Suo22.
+    Pair with anndata_suo22_Visium10X
+    """
+    filepath, _ = urlretrieve(
+        "https://lamindb-test.s3.amazonaws.com/F121_LP1_4LIV.tiff",
+        "F121_LP1_4LIV.tiff",
+    )
+    Path("suo22/").mkdir(exist_ok=True)
+    filepath = Path(filepath).rename("suo22/F121_LP1_4LIV.tiff")
+    return Path(filepath)
 def dir_scrnaseq_cellranger(in_storage_root=False) -> Path:
@@ -59,12 +87,12 @@ def dir_scrnaseq_cellranger(in_storage_root=False) -> Path:
     )
     from zipfile import ZipFile
-    path = Path(".") if not in_storage_root else settings.storage
+    basedir = Path(".") if not in_storage_root else settings.storage
     with ZipFile(filepath, "r") as zipObj:
         # Extract all the contents of zip file in current directory
-        zipObj.extractall(path=path)
+        zipObj.extractall(path=basedir)
-    return path / "cellranger_run_001"
+    return basedir / "cellranger_run_001"
 def anndata_mouse_sc_lymph_node() -> ad.AnnData:
@@ -133,7 +161,10 @@ def anndata_human_immune_cells() -> ad.AnnData:
         adata.write('human_immune.h5ad')
     """
     filepath, _ = urlretrieve("https://lamindb-test.s3.amazonaws.com/human_immune.h5ad")
-    return ad.read(filepath)
+    adata = ad.read(filepath)
+    del adata.raw
+    adata.var.drop(columns=["gene_symbols", "feature_name"], inplace=True)
+    return adata
 def anndata_with_obs() -> ad.AnnData:
@@ -163,6 +194,40 @@ def anndata_with_obs() -> ad.AnnData:
     return adata
+def anndata_suo22_Visium10X():
+    """AnnData from Suo22 generated by 10x Visium."""
+    import anndata as ad
+    filepath, _ = urlretrieve(
+        "https://lamindb-test.s3.amazonaws.com/suo22_Visium10X_data_LI_subset.h5ad",
+        "Visium10X_data_LI_subset.h5ad",
+    )
+    Path("suo22/").mkdir(exist_ok=True)
+    filepath = Path(filepath).rename("suo22/Visium10X_data_LI_subset.h5ad")
+    return ad.read(filepath)
+def mudata_papalexi21_subset():
+    """A subsetted mudata from papalexi21.
+    To reproduce the subsetting:
+    >>> !wget https://figshare.com/ndownloader/files/36509460
+    >>> import mudata as md
+    >>> import scanpy as sc
+    >>> mdata = md.read_h5mu("36509460")
+    >>> mdata = sc.pp.subsample(mdata, n_obs=200, copy=True)[0]
+    >>> mdata[:, -300:].copy().write("papalexi21_subset_200x300_lamindb_demo_2023-07-25.h5mu")  # noqa
+    """
+    import mudata as md
+    filepath, _ = urlretrieve(
+        "https://lamindb-test.s3.amazonaws.com/papalexi21_subset_200x300_lamindb_demo_2023-07-25.h5mu",  # noqa
+        "papalexi21_subset.h5mu",
+    )
+    return md.read_h5mu(filepath)
 def df_iris() -> pd.DataFrame:
     """The iris dataset as in sklearn.
@@ -260,53 +325,39 @@ def generate_cell_ranger_files(
     return sampledir
-# def schmidt22_crispra_gws_IFNG() -> Path:
-#     """CRISPRi screen dataset of Schmidt22.
-#     Originally from: https://zenodo.org/record/5784651
-#     """
-#     filepath, _ = urlretrieve(
-#         "https://lamindb-test.s3.amazonaws.com/schmidt22-crispra-gws-IFNG.csv",
-#         "schmidt22-crispra-gws-IFNG.csv",
-#     )
-#     return Path(filepath)
-# def schmidt22_perturbseq() -> Path:
-#     """Perturb-seq dataset of Schmidt22.
-#     Subsampled and converted to h5ad from R file: https://zenodo.org/record/5784651
-#     To reproduce the subsample:
-#     >>> adata = sc.read('HuTcellsCRISPRaPerturbSeq_Re-stimulated.h5ad')
-#     >>> adata.obs = adata.obs[['cluster_name']]
-#     >>> del adata.obsp
-#     >>> del adata.var['features']
-#     >>> del adata.obsm['X_pca']
-#     >>> del adata.uns
-#     >>> del adata.raw
-#     >>> del adata.varm
-#     >>> adata.obs = adata.obs.reset_index()
-#     >>> del adata.obs['index']
-#     >>> sc.pp.subsample(adata, 0.03)
-#     >>> adata.write('schmidt22_perturbseq.h5ad')
-#     """
-#     filepath, _ = urlretrieve(
-#         "https://lamindb-test.s3.amazonaws.com/schmidt22_perturbseq.h5ad",
-#         "schmidt22_perturbseq.h5ad",
-#     )
-#     return Path(filepath)
-# def dir_scrnaseq_cellranger_schmidt22() -> Path:
-#     """BFXoutput directory containing Schmidt22_perturbseq."""
-#     filepath, _ = urlretrieve(
-#         "https://lamindb-test.s3.amazonaws.com/scrnaseq-cellranger-schmidt22.zip",
-#     )
-#     from zipfile import ZipFile
-#     with ZipFile(filepath, "r") as zipObj:
-#         # Extract all the contents of zip file in current directory
-#         zipObj.extractall(path=".")
-#     return Path("scrnaseq-cellranger-schmidt22")
+def schmidt22_crispra_gws_IFNG(basedir=".") -> Path:
+    """CRISPRi screen dataset of Schmidt22.
+    Originally from: https://zenodo.org/record/5784651
+    """
+    filepath, _ = urlretrieve(
+        "https://lamindb-test.s3.amazonaws.com/schmidt22-crispra-gws-IFNG.csv",
+        "schmidt22-crispra-gws-IFNG.csv",
+    )
+    return Path(filepath).rename(Path(basedir) / filepath)
+def schmidt22_perturbseq(basedir=".") -> Path:
+    """Perturb-seq dataset of Schmidt22.
+    Subsampled and converted to h5ad from R file: https://zenodo.org/record/5784651
+    To reproduce the subsample:
+    >>> adata = sc.read('HuTcellsCRISPRaPerturbSeq_Re-stimulated.h5ad')
+    >>> adata.obs = adata.obs[['cluster_name']]
+    >>> del adata.obsp
+    >>> del adata.var['features']
+    >>> del adata.obsm['X_pca']
+    >>> del adata.uns
+    >>> del adata.raw
+    >>> del adata.varm
+    >>> adata.obs = adata.obs.reset_index()
+    >>> del adata.obs['index']
+    >>> sc.pp.subsample(adata, 0.03)
+    >>> adata.write('schmidt22_perturbseq.h5ad')
+    """
+    filepath, _ = urlretrieve(
+        "https://lamindb-test.s3.amazonaws.com/schmidt22_perturbseq.h5ad",
+        "schmidt22_perturbseq.h5ad",
+    )
+    return Path(filepath).rename(Path(basedir) / filepath)

lamindb/dev/storage/__init__.py CHANGED Viewed

@@ -12,10 +12,6 @@ from lamindb_setup.dev.upath import UPath
 from lamindb_setup.dev.upath import infer_filesystem as _infer_filesystem
 from ._anndata_sizes import size_adata
-try:
-    from ._backed_access import AnnDataAccessor, BackedAccessor
-except ImportError:
-    pass
+from ._backed_access import AnnDataAccessor, BackedAccessor
 from .file import delete_storage, load_to_memory, store_object
 from .object import infer_suffix, write_to_file

lamindb 0.48a2__py3-none-any.whl → 0.48.1__py3-none-any.whl

lamindb 0.48a2__py3-none-any.whl → 0.48.1__py3-none-any.whl