PyPI - scdataloader - Versions diffs - 2.0.11__tar.gz → 2.1.0__tar.gz - Mend

scdataloader 2.0.11__tar.gz → 2.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

{scdataloader-2.0.11 → scdataloader-2.1.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scdataloader
-Version: 2.0.11
+Version: 2.1.0
 Summary: a dataloader for single cell data in lamindb
 Project-URL: repository, https://github.com/jkobject/scDataLoader
 Author-email: jkobject <jkobject@gmail.com>
@@ -14,7 +14,7 @@ Requires-Dist: cellxgene-census>=0.1.0
 Requires-Dist: django>=4.0.0
 Requires-Dist: ipykernel>=6.20.0
 Requires-Dist: jupytext>=1.16.0
-Requires-Dist: lamindb[bionty,jupyter,zarr]==1.6.2
+Requires-Dist: lamindb[gcp]==2.1.1
 Requires-Dist: leidenalg>=0.8.0
 Requires-Dist: lightning>=2.3.0
 Requires-Dist: matplotlib>=3.5.0
@@ -162,7 +162,7 @@ adata = preprocessor(adata)
 art = ln.Artifact(adata, description="test")
 art.save()
-ln.Collection(art, name="test", description="test").save()
+ln.Collection(art, key="test", description="test").save()
 datamodule = DataModule(
     collection_name="test",
@@ -376,7 +376,7 @@ using absolute paths. You need to do 3 things:
 ```python
 import lamin as ln
-ln.Storage.df()
+ln.Storage.to_dataframe(limit=None)
 # view what is your current storage id (in my case it was GZgLW1TQ)
 ln.Storage.filter(uid="GZgLW1TI").update(
     root=Path("your_new_locations").as_posix().rstrip("/")

{scdataloader-2.0.11 → scdataloader-2.1.0}/README.md RENAMED Viewed

@@ -120,7 +120,7 @@ adata = preprocessor(adata)
 art = ln.Artifact(adata, description="test")
 art.save()
-ln.Collection(art, name="test", description="test").save()
+ln.Collection(art, key="test", description="test").save()
 datamodule = DataModule(
     collection_name="test",
@@ -334,7 +334,7 @@ using absolute paths. You need to do 3 things:
 ```python
 import lamin as ln
-ln.Storage.df()
+ln.Storage.to_dataframe(limit=None)
 # view what is your current storage id (in my case it was GZgLW1TQ)
 ln.Storage.filter(uid="GZgLW1TI").update(
     root=Path("your_new_locations").as_posix().rstrip("/")

{scdataloader-2.0.11 → scdataloader-2.1.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "scdataloader"
-version = "2.0.11"
+version = "2.1.0"
 description = "a dataloader for single cell data in lamindb"
 authors = [
     {name = "jkobject", email = "jkobject@gmail.com"}
@@ -11,7 +11,7 @@ requires-python = ">=3.10,<3.13"
 keywords = ["scRNAseq", "dataloader", "pytorch", "lamindb", "scPRINT"]
 dependencies = [
     "numpy<=2.2.0",
-    "lamindb[bionty,jupyter,zarr]==1.6.2",
+    "lamindb[gcp]==2.1.1",
     "cellxgene-census>=0.1.0",
     "torch>=2.2.0",
     "pytorch-lightning>=2.3.0",

{scdataloader-2.0.11 → scdataloader-2.1.0}/scdataloader/data.py RENAMED Viewed

@@ -282,19 +282,25 @@ class Dataset(torchDataset):
             elif clss == "cell_type_ontology_term_id":
                 parentdf = (
                     bt.CellType.filter()
-                    .df(include=["parents__ontology_id", "ontology_id"])
+                    .to_dataframe(
+                        limit=None, include=["parents__ontology_id", "ontology_id"]
+                    )
                     .set_index("ontology_id")
                 )
             elif clss == "tissue_ontology_term_id":
                 parentdf = (
                     bt.Tissue.filter()
-                    .df(include=["parents__ontology_id", "ontology_id"])
+                    .to_dataframe(
+                        limit=None, include=["parents__ontology_id", "ontology_id"]
+                    )
                     .set_index("ontology_id")
                 )
             elif clss == "disease_ontology_term_id":
                 parentdf = (
                     bt.Disease.filter()
-                    .df(include=["parents__ontology_id", "ontology_id"])
+                    .to_dataframe(
+                        limit=None, include=["parents__ontology_id", "ontology_id"]
+                    )
                     .set_index("ontology_id")
                 )
             elif clss in [
@@ -304,19 +310,25 @@ class Dataset(torchDataset):
             ]:
                 parentdf = (
                     bt.DevelopmentalStage.filter()
-                    .df(include=["parents__ontology_id", "ontology_id"])
+                    .to_dataframe(
+                        limit=None, include=["parents__ontology_id", "ontology_id"]
+                    )
                     .set_index("ontology_id")
                 )
             elif clss == "assay_ontology_term_id":
                 parentdf = (
                     bt.ExperimentalFactor.filter()
-                    .df(include=["parents__ontology_id", "ontology_id"])
+                    .to_dataframe(
+                        limit=None, include=["parents__ontology_id", "ontology_id"]
+                    )
                     .set_index("ontology_id")
                 )
             elif clss == "self_reported_ethnicity_ontology_term_id":
                 parentdf = (
                     bt.Ethnicity.filter()
-                    .df(include=["parents__ontology_id", "ontology_id"])
+                    .to_dataframe(
+                        limit=None, include=["parents__ontology_id", "ontology_id"]
+                    )
                     .set_index("ontology_id")
                 )

{scdataloader-2.0.11 → scdataloader-2.1.0}/scdataloader/preprocess.py RENAMED Viewed

@@ -264,8 +264,9 @@ class Preprocessor:
         # For genes that are already ENS IDs, use them directly
         prev_size = adata.shape[1]
         # Handle symbol genes
+        cols_to_use = adata.var.columns.difference(genesdf.columns)
         if self.is_symbol:
-            new_var = adata.var.merge(
+            new_var = adata.var[cols_to_use].merge(
                 genesdf.drop_duplicates("symbol").set_index("symbol", drop=False),
                 left_index=True,
                 right_index=True,
@@ -275,7 +276,7 @@ class Preprocessor:
             adata = adata[:, new_var.index]
             new_var.index = new_var["ensembl_gene_id"]
         else:
-            new_var = adata.var.merge(
+            new_var = adata.var[cols_to_use].merge(
                 genesdf, left_index=True, right_index=True, how="inner"
             )
             adata = adata[:, new_var.index]
@@ -654,7 +655,7 @@ class LaminPreprocessor(Preprocessor):
                 # Reconstruct collection using keys
                 dataset = ln.Collection(
                     [ln.Artifact.filter(key=k).one() for k in files],
-                    name=name,
+                    key=name,
                     description=description,
                 )
                 dataset.save()
@@ -819,9 +820,15 @@ def additional_postprocess(adata):
     # else:
     print("starting post processing")
     sc.pp.neighbors(adata, use_rep="X_pca")
-    sc.tl.leiden(adata, key_added="leiden_2", resolution=2.0)
-    sc.tl.leiden(adata, key_added="leiden_1", resolution=1.0)
-    sc.tl.leiden(adata, key_added="leiden_0.5", resolution=0.5)
+    sc.tl.leiden(
+        adata, key_added="leiden_2", resolution=2.0, flavor="igraph", n_iterations=2
+    )
+    sc.tl.leiden(
+        adata, key_added="leiden_1", resolution=1.0, flavor="igraph", n_iterations=2
+    )
+    sc.tl.leiden(
+        adata, key_added="leiden_0.5", resolution=0.5, flavor="igraph", n_iterations=2
+    )
     sc.tl.umap(adata)
     mid = adata.uns["dataset_id"] if "dataset_id" in adata.uns else "unknown_id"
     sc.pl.umap(

{scdataloader-2.0.11 → scdataloader-2.1.0}/scdataloader/utils.py RENAMED Viewed

@@ -389,7 +389,7 @@ def load_dataset_local(
         except IntegrityError:
             print(f"File {file.key} already exists in storage")
         saved_files.append(file)
-    dataset = ln.Collection(saved_files, name=name, description=description)
+    dataset = ln.Collection(saved_files, key=name, description=description)
     dataset.save()
     return dataset
@@ -412,7 +412,7 @@ def load_genes(
     for organism in organisms:
         genesdf = bt.Gene.filter(
             organism_id=bt.Organism.filter(ontology_id=organism).first().id
-        ).df()
+        ).to_dataframe(limit=None)
         genesdf.loc[genesdf.ensembl_gene_id.isna(), "ensembl_gene_id"] = genesdf.loc[
             genesdf.ensembl_gene_id.isna(), "stable_id"
         ]
@@ -440,6 +440,7 @@ def load_genes(
         "ncbi_gene_ids",
         "synonyms",
         "description",
+        "abbr",
     ]:
         if col in organismdf.columns:
             organismdf.drop(columns=[col], inplace=True)
@@ -453,12 +454,16 @@ def _adding_scbasecamp_genes(
 ):
     if len(species) == 0:
         species = set(
-            bt.Organism.using("laminlabs/arc-virtual-cell-atlas").df().ontology_id
+            bt.Organism.using("laminlabs/arc-virtual-cell-atlas")
+            .to_dataframe(limit=None)
+            .ontology_id
         ) - set(["NCBITaxon:10090", "NCBITaxon:9606"])
     species = list(species)
     for i in set(
-        bt.Organism.using("laminlabs/arc-virtual-cell-atlas").df().ontology_id
-    ) - set(bt.Organism.filter().df().ontology_id):
+        bt.Organism.using("laminlabs/arc-virtual-cell-atlas")
+        .to_dataframe(limit=None)
+        .ontology_id
+    ) - set(bt.Organism.filter().to_dataframe(limit=None).ontology_id):
         print(i)
         rec = (
             bt.Organism.using("laminlabs/arc-virtual-cell-atlas")
@@ -485,7 +490,7 @@ def _adding_scbasecamp_genes(
             bt.Gene.using("laminlabs/arc-virtual-cell-atlas")
             .filter(organism__ontology_id=i)
             .all()
-            .df()
+            .to_dataframe(limit=None)
         )
         genes = []
         org = bt.Organism.filter(ontology_id=i).one()
@@ -545,7 +550,11 @@ def populate_my_ontology(
         if len(celltypes) == 0:
             bt.CellType.import_source()
         else:
-            names = bt.CellType.public().df().index if not celltypes else celltypes
+            names = (
+                bt.CellType.public().to_dataframe().index
+                if not celltypes
+                else celltypes
+            )
             records = bt.CellType.from_values(names, field="ontology_id")
             ln.save(records)
         elem = bt.CellType(name="unknown", ontology_id="unknown")
@@ -555,7 +564,7 @@ def populate_my_ontology(
     if organisms_clade is not None:
         records = []
         for organism_clade in organisms_clade:
-            names = bt.Organism.public(organism=organism_clade).df().index
+            names = bt.Organism.public(organism=organism_clade).to_dataframe().index
             source = bt.Source.filter(name="ensembl", organism=organism_clade).last()
             for name in names:
                 try:
@@ -581,7 +590,7 @@ def populate_my_ontology(
         ln.save([elem], ignore_conflicts=True)
     # Phenotype
     if sex is not None:
-        names = bt.Phenotype.public().df().index if not sex else sex
+        names = bt.Phenotype.public().to_dataframe().index if not sex else sex
         source = bt.Source.filter(name="pato").first()
         records = [
             bt.Phenotype.from_source(ontology_id=i, source=source) for i in names
@@ -594,7 +603,11 @@ def populate_my_ontology(
         if len(ethnicities) == 0:
             bt.Ethnicity.import_source()
         else:
-            names = bt.Ethnicity.public().df().index if not ethnicities else ethnicities
+            names = (
+                bt.Ethnicity.public().to_dataframe().index
+                if not ethnicities
+                else ethnicities
+            )
             records = bt.Ethnicity.from_values(names, field="ontology_id")
             ln.save(records)
         elem = bt.Ethnicity(name="unknown", ontology_id="unknown")
@@ -604,7 +617,11 @@ def populate_my_ontology(
         if len(assays) == 0:
             bt.ExperimentalFactor.import_source()
         else:
-            names = bt.ExperimentalFactor.public().df().index if not assays else assays
+            names = (
+                bt.ExperimentalFactor.public().to_dataframe().index
+                if not assays
+                else assays
+            )
             records = bt.ExperimentalFactor.from_values(names, field="ontology_id")
             ln.save(records)
         elem = bt.ExperimentalFactor(name="unknown", ontology_id="unknown").save()
@@ -616,7 +633,11 @@ def populate_my_ontology(
         if len(tissues) == 0:
             bt.Tissue.import_source()
         else:
-            names = bt.Tissue.public().df().index if not tissues else tissues
+            names = (
+                bt.Tissue.public().to_dataframe().index
+                if not tissues
+                else tissues
+            )
             records = bt.Tissue.from_values(names, field="ontology_id")
             ln.save(records)
         elem = bt.Tissue(name="unknown", ontology_id="unknown").save()
@@ -629,7 +650,7 @@ def populate_my_ontology(
             bt.DevelopmentalStage.import_source(source=source)
         else:
             names = (
-                bt.DevelopmentalStage.public().df().index
+                bt.DevelopmentalStage.public().to_dataframe().index
                 if not dev_stages
                 else dev_stages
             )
@@ -641,7 +662,11 @@ def populate_my_ontology(
         if len(diseases) == 0:
             bt.Disease.import_source()
         else:
-            names = bt.Disease.public().df().index if not diseases else diseases
+            names = (
+                bt.Disease.public().to_dataframe().index
+                if not diseases
+                else diseases
+            )
             records = bt.Disease.from_values(names, field="ontology_id")
             ln.save(records)
         bt.Disease(name="normal", ontology_id="PATO:0000461").save()
@@ -650,7 +675,9 @@ def populate_my_ontology(
     for organism in genes_from:
         # convert onto to name
         organism = bt.Organism.filter(ontology_id=organism).one().name
-        names = bt.Gene.public(organism=organism).df()["ensembl_gene_id"]
+        names = bt.Gene.public(organism=organism).to_dataframe()[
+            "ensembl_gene_id"
+        ]
         # Process names in blocks of 10,000 elements
         block_size = 10000

{scdataloader-2.0.11 → scdataloader-2.1.0}/.gitignore RENAMED Viewed

File without changes

{scdataloader-2.0.11 → scdataloader-2.1.0}/LICENSE RENAMED Viewed

File without changes

{scdataloader-2.0.11 → scdataloader-2.1.0}/scdataloader/__init__.py RENAMED Viewed

File without changes

{scdataloader-2.0.11 → scdataloader-2.1.0}/scdataloader/__main__.py RENAMED Viewed

File without changes

{scdataloader-2.0.11 → scdataloader-2.1.0}/scdataloader/base.py RENAMED Viewed

File without changes

{scdataloader-2.0.11 → scdataloader-2.1.0}/scdataloader/collator.py RENAMED Viewed

File without changes

{scdataloader-2.0.11 → scdataloader-2.1.0}/scdataloader/config.py RENAMED Viewed

File without changes

{scdataloader-2.0.11 → scdataloader-2.1.0}/scdataloader/datamodule.py RENAMED Viewed

File without changes

{scdataloader-2.0.11 → scdataloader-2.1.0}/scdataloader/mapped.py RENAMED Viewed

File without changes

scdataloader 2.0.11__tar.gz → 2.1.0__tar.gz

scdataloader 2.0.11__tar.gz → 2.1.0__tar.gz