scdataloader 0.0.4__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scdataloader/VERSION +1 -1
- scdataloader/__main__.py +3 -0
- scdataloader/collator.py +56 -31
- scdataloader/config.py +6 -0
- scdataloader/data.py +98 -87
- scdataloader/datamodule.py +66 -38
- scdataloader/mapped.py +266 -105
- scdataloader/preprocess.py +3 -207
- scdataloader/utils.py +57 -8
- {scdataloader-0.0.4.dist-info → scdataloader-1.0.1.dist-info}/METADATA +45 -20
- scdataloader-1.0.1.dist-info/RECORD +16 -0
- scdataloader-0.0.4.dist-info/RECORD +0 -16
- {scdataloader-0.0.4.dist-info → scdataloader-1.0.1.dist-info}/LICENSE +0 -0
- {scdataloader-0.0.4.dist-info → scdataloader-1.0.1.dist-info}/WHEEL +0 -0
- {scdataloader-0.0.4.dist-info → scdataloader-1.0.1.dist-info}/entry_points.txt +0 -0
scdataloader/preprocess.py
CHANGED
|
@@ -204,7 +204,9 @@ class Preprocessor:
|
|
|
204
204
|
)
|
|
205
205
|
)
|
|
206
206
|
|
|
207
|
-
if self.is_symbol:
|
|
207
|
+
if self.is_symbol or not adata.var.index.str.contains("ENSG").any():
|
|
208
|
+
if not adata.var.index.str.contains("ENSG").any():
|
|
209
|
+
print("No ENSG genes found, assuming gene symbols...")
|
|
208
210
|
genesdf["ensembl_gene_id"] = genesdf.index
|
|
209
211
|
var = (
|
|
210
212
|
adata.var.merge(
|
|
@@ -692,209 +694,3 @@ def additional_postprocess(adata):
|
|
|
692
694
|
# to query N next time points we just get the N elements below and check they are in the group
|
|
693
695
|
# to query the N nearest neighbors we just get the N elements above and N below and check they are in the group
|
|
694
696
|
return adata
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
"""
|
|
698
|
-
sexr = {
|
|
699
|
-
"Male": "PATO:0000384",
|
|
700
|
-
"Female": "PATO:0000383",
|
|
701
|
-
}
|
|
702
|
-
tissuer = {
|
|
703
|
-
"Kidney": "UBERON:0002113",
|
|
704
|
-
"Lung": "UBERON:0002048",
|
|
705
|
-
"Heart": "UBERON:0000948",
|
|
706
|
-
"Liver": "UBERON:0002107",
|
|
707
|
-
"Brain": "UBERON:0000955",
|
|
708
|
-
"BAT": "UBERON:0001348",
|
|
709
|
-
"Jejunum": "UBERON:0002115",
|
|
710
|
-
"Colon": "UBERON:0001155",
|
|
711
|
-
"Ileum": "UBERON:0002116",
|
|
712
|
-
"Stomach": "UBERON:0000945",
|
|
713
|
-
"gWAT": "UBERON:0001347",
|
|
714
|
-
"Duodenum": "UBERON:0002114",
|
|
715
|
-
"iWAT": "UBERON:0001347",
|
|
716
|
-
"Muscle": "UBERON:0001630",
|
|
717
|
-
}
|
|
718
|
-
ager = {
|
|
719
|
-
"03_months": "MmusDv:0000063",
|
|
720
|
-
"16_months": "MmusDv:0000087",
|
|
721
|
-
"06_months": "MmusDv:0000077",
|
|
722
|
-
"23_months": "MmusDv:0000127",
|
|
723
|
-
"12_months": "MmusDv:0000083",
|
|
724
|
-
"21_months": "MmusDv:0000125",
|
|
725
|
-
}
|
|
726
|
-
|
|
727
|
-
celltyper = {
|
|
728
|
-
"Proximal tubule cells": "epithelial cell of proximal tubule",
|
|
729
|
-
"Vascular endothelial cells": "endothelial cell of vascular tree",
|
|
730
|
-
"Intestinal epithelial cells": "intestinal epithelial cell",
|
|
731
|
-
"Hepatocytes": "hepatocyte",
|
|
732
|
-
"Fibroblasts": "fibroblast",
|
|
733
|
-
"Lymphoid cells_T cells": "T cell",
|
|
734
|
-
"Myeloid cells": "myeloid cell",
|
|
735
|
-
"Brown adipocytes": "brown fat cell",
|
|
736
|
-
"Lymphoid cells_B cells": "B cell",
|
|
737
|
-
"Adipocytes": "fat cell",
|
|
738
|
-
"Type II alveolar epithelial cells": "type II pneumocyte",
|
|
739
|
-
"Colonic epithelial cells": "colon epithelial cell",
|
|
740
|
-
"Mural cells": "mural cell",
|
|
741
|
-
"Cerebellum granule neurons": "cerebellar neuron",
|
|
742
|
-
"Goblet cells": "goblet cell",
|
|
743
|
-
"Vascular endothelial cells_General capillary cells": "endothelial cell of vascular tree",
|
|
744
|
-
"Ventricular cardiomyocytes": "regular ventricular cardiac myocyte",
|
|
745
|
-
"Type II myonuclei": "type II muscle cell",
|
|
746
|
-
"Thick ascending limb of LOH cells": "vasa recta ascending limb cell",
|
|
747
|
-
"Gastric mucous cells": "mucous cell of stomach",
|
|
748
|
-
"Distal convoluted tubule cells": "kidney distal convoluted tubule epithelial cell",
|
|
749
|
-
"Adipoce stem and progenitor cells": "hepatic oval stem cell",
|
|
750
|
-
"Chief cells": "chief cell of parathyroid gland",
|
|
751
|
-
"Paneth cells": "paneth cell",
|
|
752
|
-
"Myeloid cells_Alveolar macrophages": "alveolar macrophage",
|
|
753
|
-
"Lymphoid cells_Plasma cells": "plasma cell",
|
|
754
|
-
"Secretory cells": "secretory cell",
|
|
755
|
-
"Lymphoid cells_Resting B cells": "B cell",
|
|
756
|
-
"Cortical projection neurons 1": "corticothalamic-projecting glutamatergic cortical neuron",
|
|
757
|
-
"Endocardial endothelial cells": "endocardial cell",
|
|
758
|
-
"Type I alveolar epithelial cells": "type I pneumocyte",
|
|
759
|
-
"Interbrain and midbrain neurons 1": "midbrain dopaminergic neuron",
|
|
760
|
-
"Interbrain and midbrain neurons 2": "midbrain dopaminergic neuron",
|
|
761
|
-
"Myeloid cells_Monocytes": "monocyte",
|
|
762
|
-
"Myeloid cells_Dendritic cells": "myeloid dendritic cell",
|
|
763
|
-
"Oligodendrocytes": "oligodendrocyte",
|
|
764
|
-
"Lymphatic endothelial cells": "endothelial cell of lymphatic vessel",
|
|
765
|
-
"Enteroendocrine cells": "enteroendocrine cell",
|
|
766
|
-
"Vascular endothelial cells_Aerocytes": "endothelial cell of vascular tree",
|
|
767
|
-
"Gastric epithelial cells": "epithelial cell of stomach",
|
|
768
|
-
"Fibro–adipogenic progenitors": "fibro/adipogenic progenitor cell",
|
|
769
|
-
"Parietal cells": "parietal cell",
|
|
770
|
-
"Astrocytes": "astrocyte",
|
|
771
|
-
"Connecting tubule cells": "kidney connecting tubule beta-intercalated cell",
|
|
772
|
-
"Hepatic stellate cells": "hepatic stellate cell",
|
|
773
|
-
"Striatal neurons 1": "striatum neuron",
|
|
774
|
-
"Mesothelial cells": "mesothelial cell",
|
|
775
|
-
"Lymphoid cells_Cycling B cells": "germinal center B cell",
|
|
776
|
-
"Type B intercalated cells": "renal beta-intercalated cell",
|
|
777
|
-
"Type A intercalated cells": "renal alpha-intercalated cell",
|
|
778
|
-
"Myeloid cells_Neutrophils": "neutrophil",
|
|
779
|
-
"Principal cells": "renal principal cell",
|
|
780
|
-
"Cortical projection neurons 2": "corticothalamic-projecting glutamatergic cortical neuron",
|
|
781
|
-
"Muc2-producing goblet cells": "intestine goblet cell",
|
|
782
|
-
"OB neurons 1": "olfactory bulb interneuron",
|
|
783
|
-
"Atrial cardiomyocytes": "regular atrial cardiac myocyte",
|
|
784
|
-
"Lymphoid cells": "leukocyte",
|
|
785
|
-
"Skeletal muscle cells": "cell of skeletal muscle",
|
|
786
|
-
"Neural cells": "neural cell",
|
|
787
|
-
"Cerebellum interneurons": "cerebellar neuron",
|
|
788
|
-
"Interneurons 1": "interneuron",
|
|
789
|
-
"Descending thin limb of LOH cells": "vasa recta descending limb cell",
|
|
790
|
-
"Tuft cells": "intestinal tuft cell",
|
|
791
|
-
"Oligodendrocyte progenitor cells": "oligodendrocyte precursor cell",
|
|
792
|
-
"Enteric glia": "enteroglial cell",
|
|
793
|
-
"Endothelial cells": "endothelial cell",
|
|
794
|
-
"Dentate gyrus neurons": "dentate gyrus neuron",
|
|
795
|
-
"Myeloid cells_Interstitial macrophages": "tissue-resident macrophage",
|
|
796
|
-
"Ciliated cells": "ciliated cell",
|
|
797
|
-
"Microglia": "microglial cell",
|
|
798
|
-
"Interneurons 2": "interneuron",
|
|
799
|
-
"Ncam1 positive cells": "parafollicular cell",
|
|
800
|
-
"Rdh16 positive cells": "unknown",
|
|
801
|
-
"Circulating hepatoblasts": "hepatoblast",
|
|
802
|
-
"Enteric neurons": "enteric neuron",
|
|
803
|
-
"Ascending thin limb of LOH cells": "vasa recta ascending limb cell",
|
|
804
|
-
"Mfge8 positive cells": "unknown",
|
|
805
|
-
"Cholangiocytes": "cholangiocyte",
|
|
806
|
-
"Podocytes": "podocyte",
|
|
807
|
-
"Muscle satellite cells": "skeletal muscle satellite cell",
|
|
808
|
-
"Purkinje neurons": "Purkinje cell",
|
|
809
|
-
"Juxtaglomerular cells": "juxtaglomerular complex cell",
|
|
810
|
-
"Ngf positive cells": "neurogliaform cell",
|
|
811
|
-
"Bergmann glia": "Bergmann glial cell",
|
|
812
|
-
"Megf11 positive cells": "unknown",
|
|
813
|
-
"Myotendinous junction myonuclei": "unknown",
|
|
814
|
-
"Vascular leptomeningeal cells": "vascular leptomeningeal cell",
|
|
815
|
-
"Urothelial cells": "urothelial cell",
|
|
816
|
-
"Tenocytes": "tendon cell",
|
|
817
|
-
"Myelinating Schwann cells": "myelinating Schwann cell",
|
|
818
|
-
"Epididymal cells": "epididymis glandular cell",
|
|
819
|
-
"Muc6-producing goblet cells": "lung goblet cell",
|
|
820
|
-
"Type I myonuclei": "type I muscle cell",
|
|
821
|
-
"OB neurons 2": "olfactory bulb interneuron",
|
|
822
|
-
"Sis positive cells": "unknown",
|
|
823
|
-
"Lgr5 positive cells": "unknown",
|
|
824
|
-
"Macula densa cells": "macula densa epithelial cell",
|
|
825
|
-
"Choroid plexus epithelial cells": "choroid plexus epithelial cell",
|
|
826
|
-
"Cortical projection neurons 3": "corticothalamic-projecting glutamatergic cortical neuron",
|
|
827
|
-
"Interstitial cells of Cajal": "interstitial cell of Cajal",
|
|
828
|
-
"Cacna1b positive cells": "unknown",
|
|
829
|
-
"Hindbrain neurons 2": "neuron",
|
|
830
|
-
"Myeloid cells_Basophils": "basophil",
|
|
831
|
-
"Ependymal cells": "ependymal cell",
|
|
832
|
-
"Muc5ac-producing goblet cells": "lung goblet cell",
|
|
833
|
-
"Myeloid cells_Mast cells": "mast cell",
|
|
834
|
-
"Pulmonary neuroendocrine cells": "lung neuroendocrine cell",
|
|
835
|
-
"Basal cells": "basal cell",
|
|
836
|
-
"OB neurons 3": "olfactory bulb interneuron",
|
|
837
|
-
"Non-myelinating Schwann cells": "non-myelinating Schwann cell",
|
|
838
|
-
"Asic2 positive cells": "unknown",
|
|
839
|
-
"Striatal neurons 2": "striatum neuron",
|
|
840
|
-
"Erythroblasts": "erythroblast",
|
|
841
|
-
"Hindbrain neurons 1": "neuron",
|
|
842
|
-
"Neuromuscular junction myonuclei": "unknown",
|
|
843
|
-
"Habenula neurons": "unknown",
|
|
844
|
-
"Pituitary cells": "pituitary gland cell",
|
|
845
|
-
"Unipolar brush cells": "unipolar brush cell",
|
|
846
|
-
"Pde4c positive cells": "unknown",
|
|
847
|
-
"Pancreatic acinar cells": "pancreatic acinar cell",
|
|
848
|
-
"Inferior olivary nucleus neurons": "bushy cell",
|
|
849
|
-
"Colec10 positive cells": "unknown",
|
|
850
|
-
"Fcgbp positive cells": "unknown",
|
|
851
|
-
"Fut9 positive cells": "unknown",
|
|
852
|
-
"Mirg positive cells": "unknown",
|
|
853
|
-
"Alox15 positive cells": "unknown",
|
|
854
|
-
"Osteoblasts": "osteoblast",
|
|
855
|
-
}
|
|
856
|
-
genesdf = utils.load_genes("NCBITaxon:10090")
|
|
857
|
-
{k: v if v =="unknown" else bt.CellType.filter(name=v).one().ontology_id for k, v in celltyper.items()}
|
|
858
|
-
|
|
859
|
-
adata.obs["organism_ontology_term_id"] = "NCBITaxon:10090"
|
|
860
|
-
adata.obs["tissue_ontology_term_id"] = adata.obs["Organ_name"].replace(tissuer)
|
|
861
|
-
adata.obs["cell_type_ontology_term_id"] = adata.obs["Main_cell_type"].replace(
|
|
862
|
-
celltyper
|
|
863
|
-
)
|
|
864
|
-
adata.obs["disease_ontology_term_id"] = "PATO:0000461"
|
|
865
|
-
adata.obs["assay_ontology_term_id"] = "unknown"
|
|
866
|
-
adata.obs["self_reported_ethnicity_ontology_term_id"] = "unknown"
|
|
867
|
-
adata.obs["development_stage_ontology_term_id"] = adata.obs["Age_group"].replace(
|
|
868
|
-
ager
|
|
869
|
-
)
|
|
870
|
-
adata.obs["sex_ontology_term_id"] = adata.obs["Gender"].replace(sexr)
|
|
871
|
-
|
|
872
|
-
for i in range(num_blocks):
|
|
873
|
-
start_index = i * block_size
|
|
874
|
-
end_index = min((i + 1) * block_size, len(adata))
|
|
875
|
-
block = adata[start_index:end_index].to_memory()
|
|
876
|
-
# process block here
|
|
877
|
-
|
|
878
|
-
block = block[(block.obs["Gene_count"] > 400)]
|
|
879
|
-
|
|
880
|
-
intersect_genes = set(block.var.index).intersection(set(genesdf.index))
|
|
881
|
-
print(f"Removed {len(block.var.index) - len(intersect_genes)} genes.")
|
|
882
|
-
block = block[:, list(intersect_genes)]
|
|
883
|
-
# marking unseen genes
|
|
884
|
-
unseen = set(genesdf.index) - set(block.var.index)
|
|
885
|
-
# adding them to adata
|
|
886
|
-
emptyda = ad.AnnData(
|
|
887
|
-
csr_matrix((block.shape[0], len(unseen)), dtype=np.float32),
|
|
888
|
-
var=pd.DataFrame(index=list(unseen)),
|
|
889
|
-
obs=pd.DataFrame(index=block.obs.index),
|
|
890
|
-
)
|
|
891
|
-
block = ad.concat([block, emptyda], axis=1, join="outer", merge="only")
|
|
892
|
-
# do a validation function
|
|
893
|
-
block.uns["unseen_genes"] = list(unseen)
|
|
894
|
-
block = block[:, block.var.sort_index().index]
|
|
895
|
-
block.var[genesdf.columns] = genesdf.loc[block.var.index]
|
|
896
|
-
for name in ["stable_id", "created_at", "updated_at"]:
|
|
897
|
-
if name in block.var.columns:
|
|
898
|
-
block.var = block.var.drop(columns=name)
|
|
899
|
-
block.write_h5ad('zhang2024_adata_'+str(i)+".h5ad")
|
|
900
|
-
"""
|
scdataloader/utils.py
CHANGED
|
@@ -12,12 +12,48 @@ from scipy.sparse import csr_matrix
|
|
|
12
12
|
from scipy.stats import median_abs_deviation
|
|
13
13
|
from functools import lru_cache
|
|
14
14
|
from collections import Counter
|
|
15
|
+
from torch import Tensor
|
|
16
|
+
import torch
|
|
15
17
|
|
|
16
18
|
from typing import Union, List, Optional
|
|
17
19
|
|
|
18
20
|
from anndata import AnnData
|
|
19
21
|
|
|
20
22
|
|
|
23
|
+
def downsample_profile(mat: Tensor, dropout: float):
|
|
24
|
+
"""
|
|
25
|
+
This function downsamples the expression profile of a given single cell RNA matrix.
|
|
26
|
+
|
|
27
|
+
The noise is applied based on the renoise parameter,
|
|
28
|
+
the total counts of the matrix, and the number of genes. The function first calculates the noise
|
|
29
|
+
threshold (scaler) based on the renoise parameter. It then generates an initial matrix count by
|
|
30
|
+
applying a Poisson distribution to a random tensor scaled by the total counts and the number of genes.
|
|
31
|
+
The function then models the sampling zeros by applying a Poisson distribution to a random tensor
|
|
32
|
+
scaled by the noise threshold, the total counts, and the number of genes. The function also models
|
|
33
|
+
the technical zeros by generating a random tensor and comparing it to the noise threshold. The final
|
|
34
|
+
matrix count is calculated by subtracting the sampling zeros from the initial matrix count and
|
|
35
|
+
multiplying by the technical zeros. The function ensures that the final matrix count is not less
|
|
36
|
+
than zero by taking the maximum of the final matrix count and a tensor of zeros. The function
|
|
37
|
+
returns the final matrix count.
|
|
38
|
+
|
|
39
|
+
Args:
|
|
40
|
+
mat (torch.Tensor): The input matrix.
|
|
41
|
+
dropout (float): The renoise parameter.
|
|
42
|
+
|
|
43
|
+
Returns:
|
|
44
|
+
torch.Tensor: The matrix count after applying noise.
|
|
45
|
+
"""
|
|
46
|
+
batch = mat.shape[0]
|
|
47
|
+
ngenes = mat.shape[1]
|
|
48
|
+
dropout = dropout * 1.1
|
|
49
|
+
# we model the sampling zeros (dropping 30% of the reads)
|
|
50
|
+
res = torch.poisson((mat * (dropout / 2))).int()
|
|
51
|
+
# we model the technical zeros (dropping 50% of the genes)
|
|
52
|
+
notdrop = (torch.rand((batch, ngenes), device=mat.device) >= (dropout / 2)).int()
|
|
53
|
+
mat = (mat - res) * notdrop
|
|
54
|
+
return torch.maximum(mat, torch.zeros((1, 1), device=mat.device, dtype=torch.int))
|
|
55
|
+
|
|
56
|
+
|
|
21
57
|
def createFoldersFor(filepath: str):
|
|
22
58
|
"""
|
|
23
59
|
will recursively create folders if needed until having all the folders required to save the file in this filepath
|
|
@@ -38,6 +74,7 @@ def _fetchFromServer(
|
|
|
38
74
|
Args:
|
|
39
75
|
ensemble_server (str): The URL of the ensemble server to fetch data from.
|
|
40
76
|
attributes (list): The list of attributes to fetch from the server.
|
|
77
|
+
database (str): The database to fetch data from.
|
|
41
78
|
|
|
42
79
|
Returns:
|
|
43
80
|
pd.DataFrame: A pandas DataFrame containing the fetched data.
|
|
@@ -68,6 +105,9 @@ def getBiomartTable(
|
|
|
68
105
|
ensemble_server (str, optional): the biomart server. Defaults to "http://jul2023.archive.ensembl.org/biomart".
|
|
69
106
|
useCache (bool, optional): whether to use the cache or not. Defaults to False.
|
|
70
107
|
cache_folder (str, optional): the cache folder. Defaults to "/tmp/biomart/".
|
|
108
|
+
attributes (List[str], optional): the attributes to fetch. Defaults to [].
|
|
109
|
+
bypass_attributes (bool, optional): whether to bypass the attributes or not. Defaults to False.
|
|
110
|
+
database (str, optional): the database to fetch from. Defaults to "hsapiens_gene_ensembl".
|
|
71
111
|
|
|
72
112
|
Raises:
|
|
73
113
|
ValueError: should be a dataframe (when the result from the server is something else)
|
|
@@ -102,11 +142,12 @@ def getBiomartTable(
|
|
|
102
142
|
res.columns = attr + attributes
|
|
103
143
|
if type(res) is not type(pd.DataFrame()):
|
|
104
144
|
raise ValueError("should be a dataframe")
|
|
105
|
-
res = res[~(res["ensembl_gene_id"].isna()
|
|
106
|
-
|
|
107
|
-
res
|
|
108
|
-
|
|
109
|
-
|
|
145
|
+
res = res[~(res["ensembl_gene_id"].isna())]
|
|
146
|
+
if "hgnc_symbol" in res.columns:
|
|
147
|
+
res = res[res["hgnc_symbol"].isna()]
|
|
148
|
+
res.loc[res[res.hgnc_symbol.isna()].index, "hgnc_symbol"] = res[
|
|
149
|
+
res.hgnc_symbol.isna()
|
|
150
|
+
]["ensembl_gene_id"]
|
|
110
151
|
return res
|
|
111
152
|
|
|
112
153
|
|
|
@@ -206,6 +247,16 @@ def get_all_ancestors(val: str, df: pd.DataFrame):
|
|
|
206
247
|
return set.union(set(parents), *[get_all_ancestors(val, df) for val in parents])
|
|
207
248
|
|
|
208
249
|
|
|
250
|
+
# setting a cache of 200 elements
|
|
251
|
+
# @lru_cache(maxsize=200)
|
|
252
|
+
def get_descendants(val, df):
|
|
253
|
+
ontos = set(df[df.parents__ontology_id.str.contains(val)].index.tolist())
|
|
254
|
+
r_onto = set()
|
|
255
|
+
for onto in ontos:
|
|
256
|
+
r_onto |= get_descendants(onto, df)
|
|
257
|
+
return r_onto | ontos
|
|
258
|
+
|
|
259
|
+
|
|
209
260
|
def get_ancestry_mapping(all_elem: list, onto_df: pd.DataFrame):
|
|
210
261
|
"""
|
|
211
262
|
This function generates a mapping of all elements to their ancestors in the ontology dataframe.
|
|
@@ -404,9 +455,7 @@ def populate_my_ontology(
|
|
|
404
455
|
ln.save(records, parents=bool(tissues))
|
|
405
456
|
bt.Tissue(name="unknown", ontology_id="unknown").save()
|
|
406
457
|
# DevelopmentalStage
|
|
407
|
-
names = (
|
|
408
|
-
bt.DevelopmentalStage.public().df().index if not dev_stages else dev_stages
|
|
409
|
-
)
|
|
458
|
+
names = bt.DevelopmentalStage.public().df().index if not dev_stages else dev_stages
|
|
410
459
|
records = bt.DevelopmentalStage.from_values(names, field="ontology_id")
|
|
411
460
|
ln.save(records, parents=bool(dev_stages))
|
|
412
461
|
bt.DevelopmentalStage(name="unknown", ontology_id="unknown").save()
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: scdataloader
|
|
3
|
-
Version:
|
|
3
|
+
Version: 1.0.1
|
|
4
4
|
Summary: a dataloader for single cell data in lamindb
|
|
5
5
|
Home-page: https://github.com/jkobject/scDataLoader
|
|
6
6
|
License: GPL3
|
|
@@ -34,14 +34,16 @@ Description-Content-Type: text/markdown
|
|
|
34
34
|
|
|
35
35
|
[](https://codecov.io/gh/jkobject/scDataLoader)
|
|
36
36
|
[](https://github.com/jkobject/scDataLoader/actions/workflows/main.yml)
|
|
37
|
-
[](https://badge.fury.io/py/scDataLoader)
|
|
38
|
+
[](https://scDataLoader.readthedocs.io/en/latest/?badge=latest)
|
|
39
|
+
[](https://pepy.tech/project/scDataLoader)
|
|
40
|
+
[](https://pepy.tech/project/scDataLoader)
|
|
41
|
+
[](https://pepy.tech/project/scDataLoader)
|
|
42
|
+
[](https://img.shields.io/github/issues/jkobject/scDataLoader)
|
|
43
|
+
[](https://github.com/psf/black)
|
|
44
|
+
[](https://doi.org/10.1101/2024.07.29.605556)
|
|
38
45
|
|
|
39
|
-
|
|
40
|
-
Awesome single cell dataloader created by @jkobject
|
|
41
|
-
|
|
42
|
-
built on top of `lamindb` and the `.mapped()` function by Sergey: https://github.com/Koncopd
|
|
43
|
-
|
|
44
|
-
This data loader is designed to be used with:
|
|
46
|
+
This single-cell PyTorch dataloader / Lightning datamodule is designed to be used with:
|
|
45
47
|
|
|
46
48
|
- [lamindb](https://lamin.ai/)
|
|
47
49
|
|
|
@@ -57,18 +59,13 @@ It allows you to:
|
|
|
57
59
|
3. create a more complex single cell dataset
|
|
58
60
|
4. extend it to your need
|
|
59
61
|
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
the idea is to use it to train models like scGPT / GeneFormer (and soon, scPrint ;)). It is:
|
|
62
|
+
built on top of `lamindb` and the `.mapped()` function by Sergey: https://github.com/Koncopd
|
|
63
63
|
|
|
64
|
-
|
|
65
|
-
2. doing some dataset specific preprocessing if needed
|
|
66
|
-
3. creating a dataset object on top of .mapped() (that is needed for mapping genes, cell labels etc..)
|
|
67
|
-
4. passing it to a dataloader object that can work with it correctly
|
|
64
|
+
## More
|
|
68
65
|
|
|
69
|
-
|
|
66
|
+
I needed to create this data loader for my PhD project. I am using it to load & preprocess thousands of datasets containing millions of cells in a few seconds. I believe that people applying AI to single-cell RNA sequencing and other sequencing datasets will want such a tool, which did not previously exist.
|
|
70
67
|
|
|
71
|
-

|
|
72
69
|
|
|
73
70
|
## Install it from PyPI
|
|
74
71
|
|
|
@@ -87,6 +84,8 @@ then run the notebooks with the poetry installed environment
|
|
|
87
84
|
|
|
88
85
|
## Usage
|
|
89
86
|
|
|
87
|
+
### Direct Usage
|
|
88
|
+
|
|
90
89
|
```python
|
|
91
90
|
# initialize a local lamin database
|
|
92
91
|
# !lamin init --storage ~/scdataloader --schema bionty
|
|
@@ -129,15 +128,41 @@ for i in tqdm.tqdm(datamodule.train_dataloader()):
|
|
|
129
128
|
|
|
130
129
|
```
|
|
131
130
|
|
|
132
|
-
see the notebooks in [docs](https://jkobject.
|
|
131
|
+
see the notebooks in [docs](https://www.jkobject.com/scDataLoader/):
|
|
132
|
+
|
|
133
|
+
1. [load a dataset](https://www.jkobject.com/scDataLoader/notebooks/1_download_and_preprocess/)
|
|
134
|
+
2. [create a dataset](https://www.jkobject.com/scDataLoader/notebooks/2_create_dataloader/)
|
|
135
|
+
|
|
136
|
+
### command line preprocessing
|
|
133
137
|
|
|
134
|
-
|
|
135
|
-
|
|
138
|
+
You can use the command line to preprocess a large database of datasets, as shown here for cellxgene. This makes it easier to use and allows the preprocessing to be parallelized.
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
scdataloader --instance "laminlabs/cellxgene" --name "cellxgene-census" --version "2023-12-15" --description "preprocessed for scprint" --new_name "scprint main" --start_at 10 >> scdataloader.out
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### command line usage
|
|
145
|
+
|
|
146
|
+
The main way to use the datamodule from the command line is through scPRINT and the Lightning CLI:
|
|
147
|
+
|
|
148
|
+
> please refer to the [scPRINT documentation](https://www.jkobject.com/scPRINT/) and [lightning documentation](https://lightning.ai/docs/pytorch/stable/cli/lightning_cli_intermediate.html) for more information on command line usage
|
|
136
149
|
|
|
137
150
|
## Development
|
|
138
151
|
|
|
139
152
|
Read the [CONTRIBUTING.md](CONTRIBUTING.md) file.
|
|
140
153
|
|
|
154
|
+
## License
|
|
155
|
+
|
|
156
|
+
This project is licensed under the GNU General Public License v3.0 (GPL-3.0) - see the [LICENSE](LICENSE) file for details.
|
|
157
|
+
|
|
158
|
+
## Acknowledgments
|
|
159
|
+
|
|
160
|
+
- [lamin.ai](https://lamin.ai/)
|
|
161
|
+
- [scanpy](https://scanpy.readthedocs.io/en/stable/)
|
|
162
|
+
- [anndata](https://anndata.readthedocs.io/en/latest/)
|
|
163
|
+
- [scprint](https://www.jkobject.com/scPRINT/)
|
|
164
|
+
|
|
165
|
+
Awesome single cell dataloader created by @jkobject
|
|
141
166
|
GNU GENERAL PUBLIC LICENSE
|
|
142
167
|
Version 3, 29 June 2007
|
|
143
168
|
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
scdataloader/VERSION,sha256=WYVJhIUxBN9cNT4vaBoV_HkkdC-aLkaMKa8kjc5FzgM,6
|
|
2
|
+
scdataloader/__init__.py,sha256=NIlE4oTUPRZ3uSW_maozoEHp470I7PV1vMOJ4XpSmL4,122
|
|
3
|
+
scdataloader/__main__.py,sha256=db_upDq3tNEtcDH17mPIczToAqGkSKfLy0Qbj6B4YmE,6385
|
|
4
|
+
scdataloader/base.py,sha256=M1gD59OffRdLOgS1vHKygOomUoAMuzjpRtAfM3SBKF8,338
|
|
5
|
+
scdataloader/collator.py,sha256=zkFdxirTDub1dJ1OJXO0p48kvd2r2ncKMdevAKIdTTc,13447
|
|
6
|
+
scdataloader/config.py,sha256=rrW2DZxG4J2_pmpDbXXsaKJkpNC57w5dIlItiFbANYw,2905
|
|
7
|
+
scdataloader/data.py,sha256=VugtHo9T9PqoJSv3lkJJAB89KD-fRwdVw1D76gnCc9c,12584
|
|
8
|
+
scdataloader/datamodule.py,sha256=WLEWcDMcC1G3VD5tORfhfqRRHcTscpI0EzPikg3udbI,16881
|
|
9
|
+
scdataloader/mapped.py,sha256=yF9l3obuRWbQjW8QZGRSKhc50fizXTWf3Pe1m542fW8,19481
|
|
10
|
+
scdataloader/preprocess.py,sha256=noynYWuy9clhFu9UnN-vSvAHJHwakDttkI5aj1e_T98,29055
|
|
11
|
+
scdataloader/utils.py,sha256=xyDsWaqkjhzlVBP8FiYdBUWHsel3twcVWmI53PhKqTM,21888
|
|
12
|
+
scdataloader-1.0.1.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
13
|
+
scdataloader-1.0.1.dist-info/METADATA,sha256=2Xd8M1dq_JmvmFjmrrzn-1U4eOtwU6L51Y_7MCkGxvY,41327
|
|
14
|
+
scdataloader-1.0.1.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
|
|
15
|
+
scdataloader-1.0.1.dist-info/entry_points.txt,sha256=nLqucZaa5wiF7-1FCgMXO916WDQ9Qm0TcxQp0f1DwE4,59
|
|
16
|
+
scdataloader-1.0.1.dist-info/RECORD,,
|
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
scdataloader/VERSION,sha256=ln2a-xATRmZxZvLnboGRC8GQSI19QdUMoAcunZLwDjI,6
|
|
2
|
-
scdataloader/__init__.py,sha256=NIlE4oTUPRZ3uSW_maozoEHp470I7PV1vMOJ4XpSmL4,122
|
|
3
|
-
scdataloader/__main__.py,sha256=UyXtFHgWxE-ecJmM_oEDLlzBDBbH-uEKAVj1A7BkwmM,6297
|
|
4
|
-
scdataloader/base.py,sha256=M1gD59OffRdLOgS1vHKygOomUoAMuzjpRtAfM3SBKF8,338
|
|
5
|
-
scdataloader/collator.py,sha256=Ykjdw24GUvHdbowWUDtp28YTkaF3w65SiWTU2PKBzy4,11714
|
|
6
|
-
scdataloader/config.py,sha256=0_LoIblgdZZ19yM2qvPE-padMGQzdhuaxX20zYrhWq0,2780
|
|
7
|
-
scdataloader/data.py,sha256=faJWN--06N7irWBKcjeU6fcX5NbzyEPXs2_EVGxfBpw,12292
|
|
8
|
-
scdataloader/datamodule.py,sha256=OhHPb3jhGG5HbvahzTGxgzJ_lxbVJ4PfZspVW9h7SZk,14789
|
|
9
|
-
scdataloader/mapped.py,sha256=rhE11Xl3x_wIKu3m_wu8Is6mYsXdblu3nQpT5lNqr60,13301
|
|
10
|
-
scdataloader/preprocess.py,sha256=67ewe6b4HIjz_vTDjlOAJ4lMe4K2oCw2HHHUS-7S77M,38205
|
|
11
|
-
scdataloader/utils.py,sha256=6eKU3_cotEaQcxONMrCWzMx7U8DybabteNhk-vNqfUQ,19365
|
|
12
|
-
scdataloader-0.0.4.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
13
|
-
scdataloader-0.0.4.dist-info/METADATA,sha256=Bf8UjMwRcqSbWW8VbWrLhSb7qKQYdjZtJ7d6Oz4-rn8,39733
|
|
14
|
-
scdataloader-0.0.4.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
|
|
15
|
-
scdataloader-0.0.4.dist-info/entry_points.txt,sha256=nLqucZaa5wiF7-1FCgMXO916WDQ9Qm0TcxQp0f1DwE4,59
|
|
16
|
-
scdataloader-0.0.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|