scdataloader 2.0.6__py3-none-any.whl → 2.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scdataloader/data.py +2 -2
- scdataloader/datamodule.py +10 -10
- scdataloader/preprocess.py +27 -8
- scdataloader/utils.py +34 -25
- {scdataloader-2.0.6.dist-info → scdataloader-2.0.8.dist-info}/METADATA +115 -23
- scdataloader-2.0.8.dist-info/RECORD +15 -0
- scdataloader/data.json +0 -384
- scdataloader-2.0.6.dist-info/RECORD +0 -16
- {scdataloader-2.0.6.dist-info → scdataloader-2.0.8.dist-info}/WHEEL +0 -0
- {scdataloader-2.0.6.dist-info → scdataloader-2.0.8.dist-info}/entry_points.txt +0 -0
- {scdataloader-2.0.6.dist-info → scdataloader-2.0.8.dist-info}/licenses/LICENSE +0 -0
scdataloader/data.py
CHANGED
|
@@ -200,7 +200,7 @@ class Dataset(torchDataset):
|
|
|
200
200
|
def get_label_cats(
|
|
201
201
|
self,
|
|
202
202
|
obs_keys: Union[str, List[str]],
|
|
203
|
-
):
|
|
203
|
+
) -> np.ndarray:
|
|
204
204
|
"""
|
|
205
205
|
Get combined categorical codes for one or more label columns.
|
|
206
206
|
|
|
@@ -226,7 +226,7 @@ class Dataset(torchDataset):
|
|
|
226
226
|
labels = concat_categorical_codes([labels, labels_to_str])
|
|
227
227
|
return np.array(labels.codes)
|
|
228
228
|
|
|
229
|
-
def get_unseen_mapped_dataset_elements(self, idx: int):
|
|
229
|
+
def get_unseen_mapped_dataset_elements(self, idx: int) -> list[str]:
|
|
230
230
|
"""
|
|
231
231
|
Get genes marked as unseen for a specific sample.
|
|
232
232
|
|
scdataloader/datamodule.py
CHANGED
|
@@ -136,7 +136,7 @@ class DataModule(L.LightningDataModule):
|
|
|
136
136
|
increases sampling weight balance over epochs. Defaults to 0.
|
|
137
137
|
start_at (int, optional): Starting index for resuming inference. Requires same
|
|
138
138
|
number of GPUs as previous run. Defaults to 0.
|
|
139
|
-
**kwargs: Additional arguments passed to PyTorch DataLoader (e.g., batch_size,
|
|
139
|
+
**kwargs: dict[str, Any]: Additional arguments passed to PyTorch DataLoader (e.g., batch_size,
|
|
140
140
|
num_workers, pin_memory).
|
|
141
141
|
|
|
142
142
|
Attributes:
|
|
@@ -256,7 +256,7 @@ class DataModule(L.LightningDataModule):
|
|
|
256
256
|
)
|
|
257
257
|
|
|
258
258
|
@property
|
|
259
|
-
def decoders(self):
|
|
259
|
+
def decoders(self) -> dict[str, dict[int, str]]:
|
|
260
260
|
"""
|
|
261
261
|
decoders the decoders for any labels that would have been encoded
|
|
262
262
|
|
|
@@ -269,7 +269,7 @@ class DataModule(L.LightningDataModule):
|
|
|
269
269
|
return decoders
|
|
270
270
|
|
|
271
271
|
@property
|
|
272
|
-
def labels_hierarchy(self):
|
|
272
|
+
def labels_hierarchy(self) -> dict[str, dict[str, str]]:
|
|
273
273
|
"""
|
|
274
274
|
labels_hierarchy the hierarchy of labels for any cls that would have a hierarchy
|
|
275
275
|
|
|
@@ -287,7 +287,7 @@ class DataModule(L.LightningDataModule):
|
|
|
287
287
|
return labels_hierarchy
|
|
288
288
|
|
|
289
289
|
@property
|
|
290
|
-
def genes(self):
|
|
290
|
+
def genes(self) -> list:
|
|
291
291
|
"""
|
|
292
292
|
genes the genes used in this datamodule
|
|
293
293
|
|
|
@@ -343,7 +343,7 @@ class DataModule(L.LightningDataModule):
|
|
|
343
343
|
def num_datasets(self):
|
|
344
344
|
return len(self.dataset.mapped_dataset.storages)
|
|
345
345
|
|
|
346
|
-
def setup(self, stage=None):
|
|
346
|
+
def setup(self, stage: Optional[str] = None) -> list[str]:
|
|
347
347
|
"""
|
|
348
348
|
Prepare data splits for training, validation, and testing.
|
|
349
349
|
|
|
@@ -512,7 +512,7 @@ class DataModule(L.LightningDataModule):
|
|
|
512
512
|
print(f"done setup, took {time.time() - start_time:.2f} seconds")
|
|
513
513
|
return self.test_datasets
|
|
514
514
|
|
|
515
|
-
def train_dataloader(self, **kwargs):
|
|
515
|
+
def train_dataloader(self, **kwargs) -> DataLoader:
|
|
516
516
|
"""
|
|
517
517
|
Create the training DataLoader with weighted random sampling.
|
|
518
518
|
|
|
@@ -521,7 +521,7 @@ class DataModule(L.LightningDataModule):
|
|
|
521
521
|
distributed training without weighting.
|
|
522
522
|
|
|
523
523
|
Args:
|
|
524
|
-
**kwargs: Additional arguments passed to DataLoader, overriding defaults.
|
|
524
|
+
**kwargs: dict[str, Any]: Additional arguments passed to DataLoader, overriding defaults.
|
|
525
525
|
|
|
526
526
|
Returns:
|
|
527
527
|
DataLoader: Training DataLoader instance.
|
|
@@ -560,7 +560,7 @@ class DataModule(L.LightningDataModule):
|
|
|
560
560
|
**current_loader_kwargs,
|
|
561
561
|
)
|
|
562
562
|
|
|
563
|
-
def val_dataloader(self):
|
|
563
|
+
def val_dataloader(self) -> Union[DataLoader, list]:
|
|
564
564
|
"""
|
|
565
565
|
Create the validation DataLoader.
|
|
566
566
|
|
|
@@ -576,7 +576,7 @@ class DataModule(L.LightningDataModule):
|
|
|
576
576
|
else []
|
|
577
577
|
)
|
|
578
578
|
|
|
579
|
-
def test_dataloader(self):
|
|
579
|
+
def test_dataloader(self) -> Union[DataLoader, list]:
|
|
580
580
|
"""
|
|
581
581
|
Create the test DataLoader with sequential sampling.
|
|
582
582
|
|
|
@@ -591,7 +591,7 @@ class DataModule(L.LightningDataModule):
|
|
|
591
591
|
else []
|
|
592
592
|
)
|
|
593
593
|
|
|
594
|
-
def predict_dataloader(self):
|
|
594
|
+
def predict_dataloader(self) -> DataLoader:
|
|
595
595
|
"""
|
|
596
596
|
Create a DataLoader for prediction over all training data.
|
|
597
597
|
|
scdataloader/preprocess.py
CHANGED
|
@@ -87,10 +87,12 @@ class Preprocessor:
|
|
|
87
87
|
If int, filters cells with counts. Defaults to False.
|
|
88
88
|
normalize_sum (float or bool, optional): Determines whether to normalize the total counts of each cell to a specific value.
|
|
89
89
|
Defaults to 1e4.
|
|
90
|
-
log1p (bool, optional): Determines whether to apply log1p transform to the normalized data.
|
|
91
|
-
Defaults to True.
|
|
92
90
|
n_hvg_for_postp (int or bool, optional): Determines whether to subset to highly variable genes for the PCA.
|
|
93
91
|
Defaults to False.
|
|
92
|
+
use_layer (str, optional): The layer to use for preprocessing.
|
|
93
|
+
Defaults to None.
|
|
94
|
+
is_symbol (bool, optional): Whether genes are provided as symbols instead of Ensembl IDs.
|
|
95
|
+
Defaults to False.
|
|
94
96
|
hvg_flavor (str, optional): Specifies the flavor of highly variable genes selection.
|
|
95
97
|
See :func:`scanpy.pp.highly_variable_genes` for more details. Defaults to "seurat_v3".
|
|
96
98
|
binning (int, optional): Determines whether to bin the data into discrete values of number of bins provided.
|
|
@@ -112,10 +114,20 @@ class Preprocessor:
|
|
|
112
114
|
Defaults to 5.
|
|
113
115
|
pct_mt_outlier (int, optional): The maximum percentage of mitochondrial genes outlier.
|
|
114
116
|
Defaults to 8.
|
|
115
|
-
|
|
117
|
+
batch_keys (List[str], optional): The keys of :class:`~anndata.AnnData.obs` to use for batch information.
|
|
116
118
|
This arg is used in the highly variable gene selection step.
|
|
117
119
|
skip_validate (bool, optional): Determines whether to skip the validation step.
|
|
118
120
|
Defaults to False.
|
|
121
|
+
additional_preprocess (Callable, optional): Additional preprocessing function.
|
|
122
|
+
Defaults to None.
|
|
123
|
+
additional_postprocess (Callable, optional): Additional postprocessing function.
|
|
124
|
+
Defaults to None.
|
|
125
|
+
do_postp (bool, optional): Whether to perform postprocessing.
|
|
126
|
+
Defaults to True.
|
|
127
|
+
organisms (List[str], optional): List of organisms to support.
|
|
128
|
+
Defaults to ["NCBITaxon:9606", "NCBITaxon:10090"].
|
|
129
|
+
use_raw (bool, optional): Whether to use raw counts.
|
|
130
|
+
Defaults to True.
|
|
119
131
|
keepdata (bool, optional): Determines whether to keep the data in the AnnData object.
|
|
120
132
|
Defaults to False.
|
|
121
133
|
drop_non_primary (bool, optional): Determines whether to drop non-primary cells.
|
|
@@ -483,13 +495,20 @@ class LaminPreprocessor(Preprocessor):
|
|
|
483
495
|
version: str = "2",
|
|
484
496
|
):
|
|
485
497
|
"""
|
|
486
|
-
format
|
|
487
|
-
|
|
498
|
+
Process data with format controlling different input value wrapping.
|
|
499
|
+
|
|
500
|
+
Includes support for categorical binned style, fixed-sum normalized counts,
|
|
501
|
+
log1p fixed-sum normalized counts, etc.
|
|
488
502
|
|
|
489
503
|
Args:
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
504
|
+
data (Union[ln.Collection, AnnData]): The AnnData object or Collection to preprocess.
|
|
505
|
+
name (str, optional): Name for the preprocessed dataset. Defaults to "preprocessed dataset".
|
|
506
|
+
description (str, optional): Description for the preprocessed dataset.
|
|
507
|
+
Defaults to "preprocessed dataset using scprint".
|
|
508
|
+
start_at (int, optional): Starting index for resuming preprocessing.
|
|
509
|
+
Defaults to 0.
|
|
510
|
+
version (str, optional): Version string for the dataset.
|
|
511
|
+
Defaults to "2".
|
|
493
512
|
"""
|
|
494
513
|
files = []
|
|
495
514
|
all_ready_processed_keys = set()
|
scdataloader/utils.py
CHANGED
|
@@ -28,10 +28,11 @@ def fileToList(filename: str, strconv: callable = lambda x: x) -> list:
|
|
|
28
28
|
loads an input file with a\\n b\\n.. into a list [a,b,..]
|
|
29
29
|
|
|
30
30
|
Args:
|
|
31
|
-
|
|
31
|
+
filename (str): The filename to load from.
|
|
32
|
+
strconv (callable): A function to convert each line. Defaults to identity function.
|
|
32
33
|
|
|
33
34
|
Returns:
|
|
34
|
-
|
|
35
|
+
list: The list of converted elements from the file.
|
|
35
36
|
"""
|
|
36
37
|
with open(filename) as f:
|
|
37
38
|
return [strconv(val[:-1]) for val in f.readlines()]
|
|
@@ -44,7 +45,7 @@ def listToFile(
|
|
|
44
45
|
listToFile loads a list with [a,b,..] into an input file a\\n b\\n..
|
|
45
46
|
|
|
46
47
|
Args:
|
|
47
|
-
|
|
48
|
+
li (list): The list of elements to be written to the file.
|
|
48
49
|
filename (str): The name of the file where the list will be written.
|
|
49
50
|
strconv (callable, optional): A function to convert each element of the list to a string. Defaults to str.
|
|
50
51
|
|
|
@@ -124,7 +125,7 @@ def getBiomartTable(
|
|
|
124
125
|
attributes: List[str] = [],
|
|
125
126
|
bypass_attributes: bool = False,
|
|
126
127
|
database: str = "hsapiens_gene_ensembl",
|
|
127
|
-
):
|
|
128
|
+
) -> pd.DataFrame:
|
|
128
129
|
"""generate a genelist dataframe from ensembl's biomart
|
|
129
130
|
|
|
130
131
|
Args:
|
|
@@ -175,14 +176,14 @@ def getBiomartTable(
|
|
|
175
176
|
return res
|
|
176
177
|
|
|
177
178
|
|
|
178
|
-
def validate(adata: AnnData, organism: str, need_all=False):
|
|
179
|
+
def validate(adata: AnnData, organism: str, need_all: bool = False) -> bool:
|
|
179
180
|
"""
|
|
180
181
|
validate checks if the adata object is valid for lamindb
|
|
181
182
|
|
|
182
183
|
Args:
|
|
183
|
-
adata (
|
|
184
|
-
|
|
185
|
-
|
|
184
|
+
adata (AnnData): the anndata object
|
|
185
|
+
organism (str): the organism ontology ID
|
|
186
|
+
need_all (bool, optional): whether all columns should be present. Defaults to False.
|
|
186
187
|
|
|
187
188
|
Raises:
|
|
188
189
|
ValueError: if the adata object is not valid
|
|
@@ -298,7 +299,7 @@ def get_descendants(val, df):
|
|
|
298
299
|
return r_onto | ontos
|
|
299
300
|
|
|
300
301
|
|
|
301
|
-
def get_ancestry_mapping(all_elem: List[str], onto_df: pd.DataFrame):
|
|
302
|
+
def get_ancestry_mapping(all_elem: List[str], onto_df: pd.DataFrame) -> dict:
|
|
302
303
|
"""
|
|
303
304
|
This function generates a mapping of all elements to their ancestors in the ontology dataframe.
|
|
304
305
|
|
|
@@ -339,13 +340,12 @@ def load_dataset_local(
|
|
|
339
340
|
description: str,
|
|
340
341
|
use_cache: bool = True,
|
|
341
342
|
only: Optional[List[int]] = None,
|
|
342
|
-
):
|
|
343
|
+
) -> ln.Dataset:
|
|
343
344
|
"""
|
|
344
345
|
This function loads a remote lamindb dataset to local.
|
|
345
346
|
|
|
346
347
|
Args:
|
|
347
|
-
|
|
348
|
-
remote_dataset (lamindb.Dataset): The remote Dataset.
|
|
348
|
+
remote_dataset (lamindb.Collection): The remote Collection.
|
|
349
349
|
download_folder (str): The path to the download folder.
|
|
350
350
|
name (str): The name of the dataset.
|
|
351
351
|
description (str): The description of the dataset.
|
|
@@ -396,7 +396,7 @@ def load_dataset_local(
|
|
|
396
396
|
|
|
397
397
|
def load_genes(
|
|
398
398
|
organisms: Union[str, List[str]] = "NCBITaxon:9606",
|
|
399
|
-
): # "NCBITaxon:10090",
|
|
399
|
+
) -> pd.DataFrame: # "NCBITaxon:10090",
|
|
400
400
|
"""
|
|
401
401
|
Loads genes from the given organisms.
|
|
402
402
|
|
|
@@ -454,9 +454,18 @@ def _adding_scbasecamp_genes(
|
|
|
454
454
|
if len(species) == 0:
|
|
455
455
|
species = set(
|
|
456
456
|
bt.Organism.using("laminlabs/arc-virtual-cell-atlas").df().ontology_id
|
|
457
|
-
)
|
|
458
|
-
-set(["NCBITaxon:10090", "NCBITaxon:9606"])
|
|
457
|
+
) - set(["NCBITaxon:10090", "NCBITaxon:9606"])
|
|
459
458
|
species = list(species)
|
|
459
|
+
for i in set(
|
|
460
|
+
bt.Organism.using("laminlabs/arc-virtual-cell-atlas").df().ontology_id
|
|
461
|
+
) - set(bt.Organism.filter().df().ontology_id):
|
|
462
|
+
print(i)
|
|
463
|
+
rec = (
|
|
464
|
+
bt.Organism.using("laminlabs/arc-virtual-cell-atlas")
|
|
465
|
+
.filter(ontology_id=i)
|
|
466
|
+
.first()
|
|
467
|
+
)
|
|
468
|
+
rec.save()
|
|
460
469
|
if len(bt.Organism.filter(ontology_id="NCBITaxon:9593")) == 0:
|
|
461
470
|
bt.Organism(
|
|
462
471
|
name="gorilla gorilla",
|
|
@@ -655,7 +664,7 @@ def populate_my_ontology(
|
|
|
655
664
|
ln.save(records)
|
|
656
665
|
|
|
657
666
|
|
|
658
|
-
def random_str(stringLength=6, stype="all", withdigits=True):
|
|
667
|
+
def random_str(stringLength=6, stype="all", withdigits=True) -> str:
|
|
659
668
|
"""
|
|
660
669
|
Generate a random string of letters and digits
|
|
661
670
|
|
|
@@ -664,7 +673,7 @@ def random_str(stringLength=6, stype="all", withdigits=True):
|
|
|
664
673
|
stype (str, optional): one of lowercase, uppercase, all. Defaults to 'all'.
|
|
665
674
|
withdigits (bool, optional): digits allowed in the string? Defaults to True.
|
|
666
675
|
|
|
667
|
-
|
|
676
|
+
Returns:
|
|
668
677
|
str: random string
|
|
669
678
|
"""
|
|
670
679
|
if stype == "lowercase":
|
|
@@ -678,12 +687,12 @@ def random_str(stringLength=6, stype="all", withdigits=True):
|
|
|
678
687
|
return "".join(random.choice(lettersAndDigits) for i in range(stringLength))
|
|
679
688
|
|
|
680
689
|
|
|
681
|
-
def is_outlier(adata: AnnData, metric: str, nmads: int):
|
|
690
|
+
def is_outlier(adata: AnnData, metric: str, nmads: int) -> pd.Series:
|
|
682
691
|
"""
|
|
683
692
|
is_outlier detects outliers in adata.obs[metric]
|
|
684
693
|
|
|
685
694
|
Args:
|
|
686
|
-
adata (
|
|
695
|
+
adata (AnnData): the anndata object
|
|
687
696
|
metric (str): the metric column to use
|
|
688
697
|
nmads (int): the number of median absolute deviations to use as a threshold
|
|
689
698
|
|
|
@@ -697,16 +706,16 @@ def is_outlier(adata: AnnData, metric: str, nmads: int):
|
|
|
697
706
|
return outlier
|
|
698
707
|
|
|
699
708
|
|
|
700
|
-
def length_normalize(adata: AnnData, gene_lengths: list):
|
|
709
|
+
def length_normalize(adata: AnnData, gene_lengths: list) -> AnnData:
|
|
701
710
|
"""
|
|
702
711
|
length_normalize normalizes the counts by the gene length
|
|
703
712
|
|
|
704
713
|
Args:
|
|
705
|
-
adata (
|
|
714
|
+
adata (AnnData): the anndata object
|
|
706
715
|
gene_lengths (list): the gene lengths
|
|
707
716
|
|
|
708
717
|
Returns:
|
|
709
|
-
|
|
718
|
+
AnnData: the normalized anndata object
|
|
710
719
|
"""
|
|
711
720
|
adata.X = csr_matrix((adata.X.T / gene_lengths).T)
|
|
712
721
|
return adata
|
|
@@ -714,13 +723,13 @@ def length_normalize(adata: AnnData, gene_lengths: list):
|
|
|
714
723
|
|
|
715
724
|
def translate(
|
|
716
725
|
val: Union[str, list, set, Counter, dict], t: str = "cell_type_ontology_term_id"
|
|
717
|
-
):
|
|
726
|
+
) -> dict:
|
|
718
727
|
"""
|
|
719
728
|
translate translates the ontology term id to the name
|
|
720
729
|
|
|
721
730
|
Args:
|
|
722
|
-
val (str, dict, set, list
|
|
723
|
-
t (
|
|
731
|
+
val (Union[str, dict, set, list]): the object to translate
|
|
732
|
+
t (str, optional): the type of ontology terms.
|
|
724
733
|
one of cell_type_ontology_term_id, assay_ontology_term_id, tissue_ontology_term_id.
|
|
725
734
|
Defaults to "cell_type_ontology_term_id".
|
|
726
735
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: scdataloader
|
|
3
|
-
Version: 2.0.
|
|
3
|
+
Version: 2.0.8
|
|
4
4
|
Summary: a dataloader for single cell data in lamindb
|
|
5
5
|
Project-URL: repository, https://github.com/jkobject/scDataLoader
|
|
6
6
|
Author-email: jkobject <jkobject@gmail.com>
|
|
@@ -52,9 +52,10 @@ Description-Content-Type: text/markdown
|
|
|
52
52
|
[](https://github.com/psf/black)
|
|
53
53
|
[](https://doi.org/10.5281/zenodo.10573143)
|
|
54
54
|
|
|
55
|
-
<img src="scdataloader.png" width="600">
|
|
55
|
+
<img src="./docs/scdataloader.png" width="600">
|
|
56
56
|
|
|
57
|
-
This single cell pytorch dataloader / lighting datamodule is designed to be used
|
|
57
|
+
This single cell pytorch dataloader / lightning datamodule is designed to be used
|
|
58
|
+
with:
|
|
58
59
|
|
|
59
60
|
- [lamindb](https://lamin.ai/)
|
|
60
61
|
|
|
@@ -66,11 +67,13 @@ and:
|
|
|
66
67
|
It allows you to:
|
|
67
68
|
|
|
68
69
|
1. load thousands of datasets containing millions of cells in a few seconds.
|
|
69
|
-
2. preprocess the data per dataset and download it locally (normalization,
|
|
70
|
+
2. preprocess the data per dataset and download it locally (normalization,
|
|
71
|
+
filtering, etc.)
|
|
70
72
|
3. create a more complex single cell dataset
|
|
71
73
|
4. extend it to your need
|
|
72
74
|
|
|
73
|
-
built on top of `lamindb` and the `.mapped()` function by Sergei:
|
|
75
|
+
built on top of `lamindb` and the `.mapped()` function by Sergei:
|
|
76
|
+
https://github.com/Koncopd
|
|
74
77
|
|
|
75
78
|
```
|
|
76
79
|
Portions of the mapped.py file are derived from Lamin Labs
|
|
@@ -81,11 +84,17 @@ Please see https://github.com/laminlabs/lamindb/blob/main/lamindb/core/_mapped_c
|
|
|
81
84
|
for the original implementation
|
|
82
85
|
```
|
|
83
86
|
|
|
84
|
-
The package has been designed together with the
|
|
87
|
+
The package has been designed together with the
|
|
88
|
+
[scPRINT paper](https://doi.org/10.1101/2024.07.29.605556) and
|
|
89
|
+
[model](https://github.com/cantinilab/scPRINT).
|
|
85
90
|
|
|
86
91
|
## More
|
|
87
92
|
|
|
88
|
-
I needed to create this Data Loader for my PhD project. I am using it to load &
|
|
93
|
+
I needed to create this Data Loader for my PhD project. I am using it to load &
|
|
94
|
+
preprocess thousands of datasets containing millions of cells in a few seconds.
|
|
95
|
+
I believed that individuals employing AI for single-cell RNA sequencing and
|
|
96
|
+
other sequencing datasets would eagerly utilize and desire such a tool, which
|
|
97
|
+
presently does not exist.
|
|
89
98
|
|
|
90
99
|

|
|
91
100
|
|
|
@@ -99,12 +108,14 @@ pip install scDataLoader[dev] # for dev dependencies
|
|
|
99
108
|
lamin init --storage ./testdb --name test --schema bionty
|
|
100
109
|
```
|
|
101
110
|
|
|
102
|
-
if you start with lamin and had to do a `lamin init`, you will also need to
|
|
111
|
+
if you start with lamin and had to do a `lamin init`, you will also need to
|
|
112
|
+
populate your ontologies. This is because scPRINT is using ontologies to define
|
|
113
|
+
its cell types, diseases, sexes, ethnicities, etc.
|
|
103
114
|
|
|
104
115
|
you can do it manually or with our function:
|
|
105
116
|
|
|
106
117
|
```python
|
|
107
|
-
from scdataloader.utils import populate_my_ontology
|
|
118
|
+
from scdataloader.utils import populate_my_ontology, _adding_scbasecamp_genes
|
|
108
119
|
|
|
109
120
|
populate_my_ontology() #to populate everything (recommended) (can take 2-10mns)
|
|
110
121
|
|
|
@@ -118,11 +129,14 @@ organisms: List[str] = ["NCBITaxon:10090", "NCBITaxon:9606"],
|
|
|
118
129
|
diseases = None,
|
|
119
130
|
dev_stages = None,
|
|
120
131
|
)
|
|
132
|
+
# if you want to load the gene names and species for the arc scbasecount species, also add this:
|
|
133
|
+
_adding_scbasecamp_genes()
|
|
121
134
|
```
|
|
122
135
|
|
|
123
136
|
### Dev install
|
|
124
137
|
|
|
125
|
-
If you want to use the latest version of scDataLoader and work on the code
|
|
138
|
+
If you want to use the latest version of scDataLoader and work on the code
|
|
139
|
+
yourself, use `git clone` and `pip install -e` instead of `pip install`.
|
|
126
140
|
|
|
127
141
|
```bash
|
|
128
142
|
git clone https://github.com/jkobject/scDataLoader.git
|
|
@@ -161,6 +175,12 @@ datamodule = DataModule(
|
|
|
161
175
|
)
|
|
162
176
|
```
|
|
163
177
|
|
|
178
|
+
see the notebooks in [docs](https://www.jkobject.com/scDataLoader/) to learn
|
|
179
|
+
more
|
|
180
|
+
|
|
181
|
+
1. [load a dataset](https://www.jkobject.com/scDataLoader/notebooks/1_download_and_preprocess/)
|
|
182
|
+
2. [create a dataset](https://www.jkobject.com/scDataLoader/notebooks/2_create_dataloader/)
|
|
183
|
+
|
|
164
184
|
### lightning-free usage (Dataset+Collator+DataLoader)
|
|
165
185
|
|
|
166
186
|
```python
|
|
@@ -211,7 +231,17 @@ for batch in tqdm(dataloader):
|
|
|
211
231
|
)
|
|
212
232
|
```
|
|
213
233
|
|
|
214
|
-
|
|
234
|
+
## Gathering a pre-training database
|
|
235
|
+
|
|
236
|
+
Here I will explain how to gather and preprocess all of cellxgene (scPRINT-1
|
|
237
|
+
pretraining database) with scDataLoader, and the scPRINT-2 corpus (scPRINT-2
|
|
238
|
+
pretraining database).
|
|
239
|
+
|
|
240
|
+
### Getting all of cellxgene
|
|
241
|
+
|
|
242
|
+
Here is an example of how to download and preprocess all of cellxgene with
|
|
243
|
+
scDataLoader as a script (a notebook version is also available in
|
|
244
|
+
[./notebooks/update_lamin_or_cellxgene.ipynb](https://github.com/jkobject/scdataloader/blob/main/notebooks/update_lamin_or_cellxgene.ipynb)).
|
|
215
245
|
|
|
216
246
|
```python
|
|
217
247
|
# initialize a local lamin database
|
|
@@ -226,11 +256,25 @@ DESCRIPTION='preprocessed by scDataLoader'
|
|
|
226
256
|
cx_dataset = ln.Collection.using(instance="laminlabs/cellxgene").filter(name="cellxgene-census", version='2023-12-15').one()
|
|
227
257
|
cx_dataset, len(cx_dataset.artifacts.all())
|
|
228
258
|
|
|
259
|
+
# (OPTIONAL) if you want to do your preprocessing on a slurm cluster without an internet connection,
|
|
260
|
+
# you can first do this:
|
|
261
|
+
load_dataset_local(
|
|
262
|
+
cx_dataset,
|
|
263
|
+
download_folder="/my_download_folder",
|
|
264
|
+
name="cached-cellxgene-census",
|
|
265
|
+
description="all of it to preprocess",
|
|
266
|
+
)
|
|
229
267
|
|
|
268
|
+
# preprocessing
|
|
230
269
|
do_preprocess = LaminPreprocessor(additional_postprocess=additional_postprocess, additional_preprocess=additional_preprocess, skip_validate=True, subset_hvg=0)
|
|
231
270
|
|
|
232
271
|
preprocessed_dataset = do_preprocess(cx_dataset, name=DESCRIPTION, description=DESCRIPTION, start_at=6, version="2")
|
|
233
272
|
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
After this you can use the preprocessed dataset with the DataModule below.
|
|
276
|
+
|
|
277
|
+
```python
|
|
234
278
|
# create dataloaders
|
|
235
279
|
from scdataloader import DataModule
|
|
236
280
|
import tqdm
|
|
@@ -252,27 +296,52 @@ for i in tqdm.tqdm(datamodule.train_dataloader()):
|
|
|
252
296
|
|
|
253
297
|
# with lightning:
|
|
254
298
|
# Trainer(model, datamodule)
|
|
299
|
+
```
|
|
300
|
+
|
|
301
|
+
You can use the command line to preprocess a large database of datasets like
|
|
302
|
+
here for cellxgene. This allows parallelizing and easier usage.
|
|
255
303
|
|
|
304
|
+
```bash
|
|
305
|
+
scdataloader --instance "laminlabs/cellxgene" --name "cellxgene-census" --version "2023-12-15" --description "preprocessed for scprint" --new_name "scprint main" --start_at 10 >> scdataloader.out
|
|
256
306
|
```
|
|
257
307
|
|
|
258
|
-
|
|
308
|
+
### Getting the rest of the scPRINT-2 corpus
|
|
259
309
|
|
|
260
|
-
|
|
261
|
-
|
|
310
|
+
by now, using the command / scripts above you should be able to get all of
|
|
311
|
+
cellxgene (and preprocess it). laminlabs now also hosts the rest of the
|
|
312
|
+
scPRINT-2 corpus in `laminlabs/arc-virtual-cell-atlas` and they can be
|
|
313
|
+
downloaded and preprocessed the same way as cellxgene above. Be careful however
|
|
314
|
+
that there is no metadata for these datasets.
|
|
262
315
|
|
|
263
|
-
|
|
316
|
+
You can have a look at my notebooks:
|
|
317
|
+
[./notebooks/adding_tahoe.ipynb](https://github.com/jkobject/scdataloader/blob/main/notebooks/adding_tahoe.ipynb)
|
|
318
|
+
and
|
|
319
|
+
[./notebooks/adding_scbasecount.ipynb](https://github.com/jkobject/scdataloader/blob/main/notebooks/adding_scbasecount.ipynb)
|
|
320
|
+
where I create some remapping to retrieve metadata that can be used by
|
|
321
|
+
scdataloader and lamindb from these datasets.
|
|
264
322
|
|
|
265
|
-
|
|
323
|
+
If you do not have access for some reason to these datasets, please contact
|
|
324
|
+
laminlabs. Another solution is to download them from the original sources
|
|
325
|
+
and add them one by one in your instance and then do the same preprocessing but
|
|
326
|
+
this time use `your_account/your_instance` instead of
|
|
327
|
+
`laminlabs/arc-virtual-cell-atlas`.
|
|
266
328
|
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
329
|
+
This is actually what I did in my own instance to create the full scPRINT-2
|
|
330
|
+
corpus and you can see some of it in the notebooks above.
|
|
331
|
+
|
|
332
|
+
### Getting even more
|
|
333
|
+
|
|
334
|
+
They also host a perturbation atlas in `laminlabs/pertdata` that can be
|
|
335
|
+
downloaded the same way.
|
|
270
336
|
|
|
271
|
-
### command line usage
|
|
337
|
+
### command line usage to train a model
|
|
272
338
|
|
|
273
339
|
The main way to use
|
|
274
340
|
|
|
275
|
-
> please refer to the [scPRINT documentation](https://www.jkobject.com/scPRINT/)
|
|
341
|
+
> please refer to the [scPRINT documentation](https://www.jkobject.com/scPRINT/)
|
|
342
|
+
> and
|
|
343
|
+
> [lightning documentation](https://lightning.ai/docs/pytorch/stable/cli/lightning_cli_intermediate.html)
|
|
344
|
+
> for more information on command line usage
|
|
276
345
|
|
|
277
346
|
## FAQ
|
|
278
347
|
|
|
@@ -295,13 +364,36 @@ from scdataloader import utils
|
|
|
295
364
|
utils.populate_my_ontology() # this might take from 5-20mins
|
|
296
365
|
```
|
|
297
366
|
|
|
367
|
+
### how to move my lamin instance to another folder?
|
|
368
|
+
|
|
369
|
+
you cannot just move your folder from one place to another because lamin is
|
|
370
|
+
using absolute paths. You need to do 3 things:
|
|
371
|
+
|
|
372
|
+
1. move your folder to the new place
|
|
373
|
+
2. update your lamin config file (usually in `~/.lamin/my_env.yml`) to point to
|
|
374
|
+
the new place
|
|
375
|
+
3. update the absolute paths in your lamin database. You can do it like this:
|
|
376
|
+
|
|
377
|
+
```python
|
|
378
|
+
import lamindb as ln
|
|
379
|
+
ln.Storage.df()
|
|
380
|
+
# check what your current storage uid is (in my case it was GZgLW1TI)
|
|
381
|
+
ln.Storage.filter(uid="GZgLW1TI").update(
|
|
382
|
+
root=Path("your_new_locations").as_posix().rstrip("/")
|
|
383
|
+
)
|
|
384
|
+
```
|
|
385
|
+
|
|
298
386
|
## Development
|
|
299
387
|
|
|
300
|
-
Read the
|
|
388
|
+
Read the
|
|
389
|
+
[CONTRIBUTING.md](https://github.com/jkobject/scdataloader/blob/main/CONTRIBUTING.md)
|
|
390
|
+
file.
|
|
301
391
|
|
|
302
392
|
## License
|
|
303
393
|
|
|
304
|
-
This project is licensed under the MIT License - see the
|
|
394
|
+
This project is licensed under the MIT License - see the
|
|
395
|
+
[LICENSE](https://github.com/jkobject/scdataloader/blob/main/LICENSE) file for
|
|
396
|
+
details.
|
|
305
397
|
|
|
306
398
|
## Acknowledgments
|
|
307
399
|
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
scdataloader/__init__.py,sha256=Z5HURehoWw1GrecImmTXIkv4ih8Q5RxNQWPm8zjjXOA,226
|
|
2
|
+
scdataloader/__main__.py,sha256=xPOtrEpQQQZUGTnm8KTvsQcA_jR45oMG_VHqd0Ny7_M,8677
|
|
3
|
+
scdataloader/base.py,sha256=M1gD59OffRdLOgS1vHKygOomUoAMuzjpRtAfM3SBKF8,338
|
|
4
|
+
scdataloader/collator.py,sha256=VcFJcVAIeKvYkG1DPRXzoBaw2wQ6D_0lsv5Mcv-9USI,17419
|
|
5
|
+
scdataloader/config.py,sha256=wGlCR3tWyEVa69ajovJKYc86CTCJR8e1xC7BTlUOJQE,34582
|
|
6
|
+
scdataloader/data.py,sha256=tXvONJNgcdMQIRh2KlAq9KCsf-Sz2L4GUlcGyf1OMhw,25160
|
|
7
|
+
scdataloader/datamodule.py,sha256=pFBGUOHl3ibi8QhiV8x5ukjzVjnJMsZWNw3Ekk3P83Y,43810
|
|
8
|
+
scdataloader/mapped.py,sha256=h9YKQ8SG9tyZL8c6_Wu5Xov5ODGK6FzVuFopz58xwN4,29887
|
|
9
|
+
scdataloader/preprocess.py,sha256=VFmyJluk4drR4fcH5qBAcJLf0cJg26ElA0HDuHOK68s,40730
|
|
10
|
+
scdataloader/utils.py,sha256=B81iwnR6aJs9lzkOCSRF85RszAdwS-dvPZPXA7yoMg4,27734
|
|
11
|
+
scdataloader-2.0.8.dist-info/METADATA,sha256=g7UW_1EeYX1P0LoJsL9MENZ9f0Q7JjLu8opWbV7CnDo,13448
|
|
12
|
+
scdataloader-2.0.8.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
13
|
+
scdataloader-2.0.8.dist-info/entry_points.txt,sha256=VXAN1m_CjbdLJ6SKYR0sBLGDV4wvv31ri7fWWuwbpno,60
|
|
14
|
+
scdataloader-2.0.8.dist-info/licenses/LICENSE,sha256=rGy_eYmnxtbOvKs7qt5V0czSWxJwgX_MlgMyTZwDHbc,1073
|
|
15
|
+
scdataloader-2.0.8.dist-info/RECORD,,
|
scdataloader/data.json
DELETED
|
@@ -1,384 +0,0 @@
|
|
|
1
|
-
'lung': 42877488
|
|
2
|
-
'blood': 34180713
|
|
3
|
-
'brain': 29530595
|
|
4
|
-
'colon': 25830811
|
|
5
|
-
'unknown': 23810521
|
|
6
|
-
'pancreas': 21597602
|
|
7
|
-
'embryo': 16976623
|
|
8
|
-
'skin': 12513695
|
|
9
|
-
'liver': 10683313
|
|
10
|
-
'breast': 10539762
|
|
11
|
-
'marrow': 9543512
|
|
12
|
-
'kidney': 8213043
|
|
13
|
-
'heart': 4800567
|
|
14
|
-
'immune system': 4687291
|
|
15
|
-
'eye': 4677504
|
|
16
|
-
# 'other (310 tissues)': 87200000
|
|
17
|
-
'UBERON:0002106': 4559701
|
|
18
|
-
'UBERON:0000029': 4314740
|
|
19
|
-
'UBERON:0000945': 4244624
|
|
20
|
-
'UBERON:0001295': 3485757
|
|
21
|
-
'UBERON:0002049': 3350893
|
|
22
|
-
'UBERON:0002108': 3226581
|
|
23
|
-
'UBERON:0001255': 2961628
|
|
24
|
-
'UBERON:0001434': 2823653
|
|
25
|
-
'UBERON:0001004': 2588530
|
|
26
|
-
'UBERON:0002037': 2101623
|
|
27
|
-
'UBERON:0001015': 1920181
|
|
28
|
-
'UBERON:0000995': 1864451
|
|
29
|
-
'UBERON:0001013': 1853702
|
|
30
|
-
'UBERON:0001017': 1818861
|
|
31
|
-
'UBERON:0002240': 1704269
|
|
32
|
-
'UBERON:0009834': 1631471
|
|
33
|
-
'UBERON:0000002': 1596082
|
|
34
|
-
'UBERON:0001893': 1565863
|
|
35
|
-
'UBERON:0001851': 1513857
|
|
36
|
-
'UBERON:0002771': 1317262
|
|
37
|
-
'UBERON:0002367': 1301074
|
|
38
|
-
'UBERON:0002369': 1279963
|
|
39
|
-
'UBERON:0000956': 1252422
|
|
40
|
-
'UBERON:0013682': 1195809
|
|
41
|
-
'UBERON:0000160': 1156931
|
|
42
|
-
'UBERON:0001987': 1127465
|
|
43
|
-
'UBERON:0001043': 1015278
|
|
44
|
-
'UBERON:0001032': 1011915
|
|
45
|
-
'UBERON:0000992': 948464
|
|
46
|
-
'UBERON:0000010': 897312
|
|
47
|
-
'UBERON:0000473': 838235
|
|
48
|
-
'UBERON:0002368': 814761
|
|
49
|
-
'UBERON:0002084': 761793
|
|
50
|
-
'UBERON:0001870': 730928
|
|
51
|
-
'UBERON:0000344': 655220
|
|
52
|
-
'UBERON:0001384': 625538
|
|
53
|
-
'UBERON:0000966': 598023
|
|
54
|
-
'UBERON:0002421': 578257
|
|
55
|
-
'UBERON:0001225': 542410
|
|
56
|
-
'UBERON:0000991': 507126
|
|
57
|
-
'UBERON:0002116': 506392
|
|
58
|
-
'UBERON:8440012': 494325
|
|
59
|
-
'UBERON:0001898': 483451
|
|
60
|
-
'UBERON:0000990': 440483
|
|
61
|
-
'UBERON:0002370': 405221
|
|
62
|
-
'UBERON:0002436': 399661
|
|
63
|
-
'UBERON:0001965': 387483
|
|
64
|
-
'UBERON:0000006': 362898
|
|
65
|
-
'UBERON:0001005': 358584
|
|
66
|
-
'UBERON:0010225': 343320
|
|
67
|
-
'UBERON:0002102': 335470
|
|
68
|
-
'UBERON:0008946': 334444
|
|
69
|
-
'UBERON:0000053': 310875
|
|
70
|
-
'UBERON:0008933': 310653
|
|
71
|
-
'UBERON:0001117': 308212
|
|
72
|
-
'UBERON:0001007': 307511
|
|
73
|
-
'UBERON:0000059': 288232
|
|
74
|
-
'UBERON:0002080': 286423
|
|
75
|
-
'UBERON:0002094': 284283
|
|
76
|
-
'UBERON:0000362': 283305
|
|
77
|
-
'UBERON:0002365': 282382
|
|
78
|
-
'UBERON:0002103': 264447
|
|
79
|
-
'UBERON:0001891': 249305
|
|
80
|
-
'UBERON:0001894': 249065
|
|
81
|
-
'UBERON:0000411': 236082
|
|
82
|
-
'UBERON:0002728': 221808
|
|
83
|
-
'UBERON:0000451': 219655
|
|
84
|
-
'UBERON:0001161': 214938
|
|
85
|
-
'UBERON:0000030': 207230
|
|
86
|
-
'UBERON:0009835': 206197
|
|
87
|
-
'UBERON:0000988': 204168
|
|
88
|
-
'UBERON:0001707': 198868
|
|
89
|
-
'UBERON:0016538': 192147
|
|
90
|
-
'UBERON:0002450': 189973
|
|
91
|
-
'UBERON:0016540': 187526
|
|
92
|
-
'UBERON:0000977': 185185
|
|
93
|
-
'UBERON:0001913': 183332
|
|
94
|
-
'UBERON:0001786': 179309
|
|
95
|
-
'UBERON:0034751': 145757
|
|
96
|
-
'UBERON:0001040': 144260
|
|
97
|
-
'UBERON:0016530': 141537
|
|
98
|
-
'UBERON:0001238': 136529
|
|
99
|
-
'UBERON:0003889': 132985
|
|
100
|
-
'UBERON:0000453': 125672
|
|
101
|
-
'UBERON:0001723': 125523
|
|
102
|
-
'UBERON:0002098': 120306
|
|
103
|
-
'UBERON:0016525': 118039
|
|
104
|
-
'UBERON:0000004': 114936
|
|
105
|
-
'UBERON:0002686': 110752
|
|
106
|
-
'UBERON:0022352': 110671
|
|
107
|
-
'UBERON:0002351': 107573
|
|
108
|
-
'UBERON:0001828': 105079
|
|
109
|
-
'UBERON:0001003': 104237
|
|
110
|
-
'UBERON:0002067': 100247
|
|
111
|
-
'UBERON:0002190': 99711
|
|
112
|
-
'UBERON:0005290': 99110
|
|
113
|
-
'UBERON:0002811': 98536
|
|
114
|
-
'UBERON:0001871': 97537
|
|
115
|
-
'UBERON:0003688': 92733
|
|
116
|
-
'UBERON:0010410': 91248
|
|
117
|
-
'UBERON:0000403': 90975
|
|
118
|
-
'UBERON:0000175': 90741
|
|
119
|
-
'UBERON:0001976': 87947
|
|
120
|
-
'UBERON:0002822': 87214
|
|
121
|
-
'UBERON:0001890': 86604
|
|
122
|
-
'UBERON:0014454': 86134
|
|
123
|
-
'UBERON:0002810': 85525
|
|
124
|
-
'UBERON:0002079': 82676
|
|
125
|
-
'UBERON:0010414': 80085
|
|
126
|
-
'UBERON:0003126': 79986
|
|
127
|
-
'UBERON:0003027': 75807
|
|
128
|
-
'UBERON:0008345': 73634
|
|
129
|
-
'UBERON:0002352': 73045
|
|
130
|
-
'UBERON:8410025': 72957
|
|
131
|
-
'UBERON:0001393': 72549
|
|
132
|
-
'UBERON:0010412': 71752
|
|
133
|
-
'UBERON:0001159': 69025
|
|
134
|
-
'UBERON:0014918': 65710
|
|
135
|
-
'UBERON:0002078': 64486
|
|
136
|
-
'UBERON:0007650': 64093
|
|
137
|
-
'UBERON:0002808': 63227
|
|
138
|
-
'UBERON:0007644': 62647
|
|
139
|
-
'UBERON:8410010': 61264
|
|
140
|
-
'UBERON:0001157': 60907
|
|
141
|
-
'UBERON:0002185': 59377
|
|
142
|
-
'UBERON:0002114': 59363
|
|
143
|
-
'UBERON:0000916': 59001
|
|
144
|
-
'UBERON:0014455': 58536
|
|
145
|
-
'UBERON:0002661': 58226
|
|
146
|
-
'UBERON:0001873': 57571
|
|
147
|
-
'UBERON:0001769': 57422
|
|
148
|
-
'UBERON:0008953': 56919
|
|
149
|
-
'UBERON:0002372': 56899
|
|
150
|
-
'UBERON:0002509': 55694
|
|
151
|
-
'UBERON:0000397': 55666
|
|
152
|
-
'UBERON:0001156': 54735
|
|
153
|
-
'UBERON:0023787': 54666
|
|
154
|
-
'UBERON:0002299': 54085
|
|
155
|
-
'UBERON:0034893': 53956
|
|
156
|
-
'UBERON:0001872': 53269
|
|
157
|
-
'UBERON:0007177': 49990
|
|
158
|
-
'UBERON:0004026': 48185
|
|
159
|
-
'UBERON:0012648': 48068
|
|
160
|
-
'UBERON:0001630': 47291
|
|
161
|
-
'UBERON:0002803': 47176
|
|
162
|
-
'UBERON:0016632': 45130
|
|
163
|
-
'UBERON:0008803': 43943
|
|
164
|
-
'UBERON:0001049': 43485
|
|
165
|
-
'UBERON:0016475': 42323
|
|
166
|
-
'UBERON:0002363': 42319
|
|
167
|
-
'UBERON:0001874': 41277
|
|
168
|
-
'UBERON:0000964': 40990
|
|
169
|
-
'UBERON:0011189': 40231
|
|
170
|
-
'UBERON:0036288': 36574
|
|
171
|
-
'UBERON:0008954': 35284
|
|
172
|
-
'UBERON:0018131': 34687
|
|
173
|
-
'UBERON:0004648': 34173
|
|
174
|
-
'UBERON:0034891': 34153
|
|
175
|
-
'UBERON:0001775': 34132
|
|
176
|
-
'UBERON:0018707': 33610
|
|
177
|
-
'UBERON:0003661': 32722
|
|
178
|
-
'UBERON:0003403': 32138
|
|
179
|
-
'UBERON:0035328': 31696
|
|
180
|
-
'UBERON:0001728': 31325
|
|
181
|
-
'UBERON:0001388': 30877
|
|
182
|
-
'UBERON:0008952': 29895
|
|
183
|
-
'UBERON:0000080': 29606
|
|
184
|
-
'UBERON:0004024': 29064
|
|
185
|
-
'UBERON:0035886': 28873
|
|
186
|
-
'UBERON:0004023': 28857
|
|
187
|
-
'UBERON:0013473': 28621
|
|
188
|
-
'UBERON:0018105': 28367
|
|
189
|
-
'UBERON:0005969': 27736
|
|
190
|
-
'UBERON:0012168': 27154
|
|
191
|
-
'UBERON:0001886': 27092
|
|
192
|
-
'UBERON:0000400': 27087
|
|
193
|
-
'UBERON:0001911': 26952
|
|
194
|
-
'UBERON:0000088': 26853
|
|
195
|
-
'UBERON:0001153': 25865
|
|
196
|
-
'UBERON:0001471': 24982
|
|
197
|
-
'UBERON:0001085': 24807
|
|
198
|
-
'UBERON:0000057': 24700
|
|
199
|
-
'UBERON:0006761': 24573
|
|
200
|
-
'UBERON:0002809': 24445
|
|
201
|
-
'UBERON:0001158': 23887
|
|
202
|
-
'UBERON:0008972': 23110
|
|
203
|
-
'UBERON:0002807': 22796
|
|
204
|
-
'UBERON:0010506': 22652
|
|
205
|
-
'UBERON:0001459': 21633
|
|
206
|
-
'UBERON:8410000': 21592
|
|
207
|
-
'UBERON:0001831': 21003
|
|
208
|
-
'UBERON:0003544': 20212
|
|
209
|
-
'UBERON:0002110': 19880
|
|
210
|
-
'UBERON:0014614': 19650
|
|
211
|
-
'UBERON:8300000': 19408
|
|
212
|
-
'UBERON:0035895': 18814
|
|
213
|
-
'UBERON:0035213': 18775
|
|
214
|
-
'UBERON:0001162': 18404
|
|
215
|
-
'UBERON:0000056': 18354
|
|
216
|
-
'UBERON:0001228': 17958
|
|
217
|
-
'UBERON:0008971': 17831
|
|
218
|
-
'UBERON:0013756': 17625
|
|
219
|
-
'UBERON:0001052': 16913
|
|
220
|
-
'UBERON:0012474': 16607
|
|
221
|
-
'UBERON:0039167': 16527
|
|
222
|
-
'UBERON:0002317': 15963
|
|
223
|
-
'UBERON:0002115': 15762
|
|
224
|
-
'UBERON:0014648': 15580
|
|
225
|
-
'UBERON:8480028': 15307
|
|
226
|
-
'UBERON:0000014': 15215
|
|
227
|
-
'UBERON:0002489': 15127
|
|
228
|
-
'UBERON:0001836': 14502
|
|
229
|
-
'UBERON:0005343': 14336
|
|
230
|
-
'UBERON:8410026': 14090
|
|
231
|
-
'UBERON:0002132': 13953
|
|
232
|
-
'UBERON:0000965': 13900
|
|
233
|
-
'UBERON:0010415': 12330
|
|
234
|
-
'UBERON:0000017': 11977
|
|
235
|
-
'UBERON:0018303': 11937
|
|
236
|
-
'UBERON:0002382': 11898
|
|
237
|
-
'UBERON:0002046': 11840
|
|
238
|
-
'UBERON:0001087': 11702
|
|
239
|
-
'UBERON:0009958': 11377
|
|
240
|
-
'UBERON:0005616': 11243
|
|
241
|
-
'UBERON:8480009': 10533
|
|
242
|
-
'UBERON:0013535': 9915
|
|
243
|
-
'UBERON:0007106': 9898
|
|
244
|
-
'UBERON:0001513': 9887
|
|
245
|
-
'UBERON:0015790': 9816
|
|
246
|
-
'UBERON:0001068': 9773
|
|
247
|
-
'UBERON:0035894': 9667
|
|
248
|
-
'UBERON:0015476': 9656
|
|
249
|
-
'UBERON:0001637': 9652
|
|
250
|
-
'UBERON:0002129': 9649
|
|
251
|
-
'UBERON:0010033': 9467
|
|
252
|
-
'UBERON:0000947': 9290
|
|
253
|
-
'UBERON:0001511': 9288
|
|
254
|
-
'UBERON:0004946': 9195
|
|
255
|
-
'UBERON:0016435': 9097
|
|
256
|
-
'UBERON:0002420': 9000
|
|
257
|
-
'UBERON:0001868': 8799
|
|
258
|
-
'UBERON:0002021': 8799
|
|
259
|
-
'UBERON:0001542': 8683
|
|
260
|
-
'UBERON:0002081': 8279
|
|
261
|
-
'UBERON:0004070': 8033
|
|
262
|
-
'UBERON:0008612': 7825
|
|
263
|
-
'UBERON:0001901': 7228
|
|
264
|
-
'UBERON:0004929': 7133
|
|
265
|
-
'UBERON:0003528': 6670
|
|
266
|
-
'UBERON:0002427': 6529
|
|
267
|
-
'UBERON:0004025': 6448
|
|
268
|
-
'UBERON:0009472': 6348
|
|
269
|
-
'UBERON:0002756': 6279
|
|
270
|
-
'UBERON:0001832': 6196
|
|
271
|
-
'UBERON:0002378': 6142
|
|
272
|
-
'UBERON:0004167': 5874
|
|
273
|
-
'UBERON:0002228': 5725
|
|
274
|
-
'UBERON:0003968': 5625
|
|
275
|
-
'UBERON:0001154': 5515
|
|
276
|
-
'UBERON:0001046': 5420
|
|
277
|
-
'UBERON:0010032': 5367
|
|
278
|
-
'UBERON:0004339': 4969
|
|
279
|
-
'UBERON:0002385': 4881
|
|
280
|
-
'UBERON:0001621': 4867
|
|
281
|
-
'UBERON:0001416': 4808
|
|
282
|
-
'UBERON:0001638': 4395
|
|
283
|
-
'UBERON:0002429': 4355
|
|
284
|
-
'UBERON:0001165': 4028
|
|
285
|
-
'UBERON:0008989': 3997
|
|
286
|
-
'UBERON:0001902': 3883
|
|
287
|
-
'UBERON:0003532': 3443
|
|
288
|
-
'UBERON:0003428': 3406
|
|
289
|
-
'UBERON:0002082': 3226
|
|
290
|
-
'UBERON:0001296': 3025
|
|
291
|
-
'UBERON:0015143': 3014
|
|
292
|
-
'UBERON:0000074': 3011
|
|
293
|
-
'UBERON:0002245': 2971
|
|
294
|
-
'UBERON:0001293': 2693
|
|
295
|
-
'UBERON:0007625': 2348
|
|
296
|
-
'UBERON:0003547': 2344
|
|
297
|
-
'UBERON:0022277': 2333
|
|
298
|
-
'UBERON:0001554': 2272
|
|
299
|
-
'UBERON:0001348': 2223
|
|
300
|
-
'UBERON:0005406': 2121
|
|
301
|
-
'UBERON:0001811': 2084
|
|
302
|
-
'UBERON:0013531': 2055
|
|
303
|
-
'UBERON:0008934': 1866
|
|
304
|
-
'UBERON:0001103': 1858
|
|
305
|
-
'UBERON:0005636': 1651
|
|
306
|
-
'UBERON:0007225': 1594
|
|
307
|
-
'UBERON:0007224': 1564
|
|
308
|
-
'UBERON:0000016': 1520
|
|
309
|
-
'UBERON:8440075': 1482
|
|
310
|
-
'UBERON:0004264': 1475
|
|
311
|
-
'UBERON:0001773': 1431
|
|
312
|
-
'UBERON:0013706': 1150
|
|
313
|
-
'UBERON:0023852': 1143
|
|
314
|
-
'UBERON:0001294': 849
|
|
315
|
-
'UBERON:0001134': 835
|
|
316
|
-
'UBERON:0003902': 675
|
|
317
|
-
'UBERON:0001224': 569
|
|
318
|
-
'UBERON:0005564': 399
|
|
319
|
-
'UBERON:0001817': 248
|
|
320
|
-
'UBERON:0002802': 163
|
|
321
|
-
'UBERON:0003072': 146
|
|
322
|
-
'UBERON:0000926': 139
|
|
323
|
-
'UBERON:0000416': 82
|
|
324
|
-
'UBERON:0003517': 48
|
|
325
|
-
'UBERON:0001483': 37
|
|
326
|
-
'UBERON:1000021': 37
|
|
327
|
-
'UBERON:0002023': 1
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
human: 230822337
|
|
332
|
-
mouse: 102753838
|
|
333
|
-
# other 14070000
|
|
334
|
-
macaque: 3161179
|
|
335
|
-
zebrafish: 3117237
|
|
336
|
-
pig: 1641179
|
|
337
|
-
thale cress: 1564711
|
|
338
|
-
drosophila: 1564460
|
|
339
|
-
chicken: 606680
|
|
340
|
-
nake mole rat: 441042
|
|
341
|
-
rabbit: 417091
|
|
342
|
-
cow: 390675
|
|
343
|
-
corn: 336166
|
|
344
|
-
chimpanzee: 293472
|
|
345
|
-
c. elegans: 251759
|
|
346
|
-
sheep: 177871
|
|
347
|
-
marmoset: 115584
|
|
348
|
-
|
|
349
|
-
10x 3': 146909525
|
|
350
|
-
10x 3' v2: 12021763
|
|
351
|
-
10x 3' v3: 121241920
|
|
352
|
-
# 10x 3': 280173208
|
|
353
|
-
10x 5' v1: 4716686
|
|
354
|
-
10x 5' v2: 3104399
|
|
355
|
-
10x 5': 28919386
|
|
356
|
-
# 10x 5': 36740471
|
|
357
|
-
10x multiome: 6942314
|
|
358
|
-
10x (vdj): 1620181
|
|
359
|
-
10x (CITE): 1580477
|
|
360
|
-
# 10x multiome: 10142972
|
|
361
|
-
sciRNA-seq: 14924076
|
|
362
|
-
slide-seq: 1867286
|
|
363
|
-
scalebio: 685024
|
|
364
|
-
microwell: 599459
|
|
365
|
-
sciPlex: 581480
|
|
366
|
-
drop-seq: 517799
|
|
367
|
-
# other (17 assays): 1423506
|
|
368
|
-
EFO:0010961: 286087
|
|
369
|
-
EFO:0008931: 192101
|
|
370
|
-
EFO:0700003: 146278
|
|
371
|
-
EFO:0700010: 133430
|
|
372
|
-
EFO:0700016: 128855
|
|
373
|
-
EFO:0030007: 105584
|
|
374
|
-
EFO:0008919: 87181
|
|
375
|
-
EFO:0009901: 76022
|
|
376
|
-
EFO:0008796: 68645
|
|
377
|
-
EFO:0009919: 68544
|
|
378
|
-
EFO:0700011: 58981
|
|
379
|
-
EFO:0030019: 31775
|
|
380
|
-
EFO:0008780: 25652
|
|
381
|
-
EFO:0010010: 5231
|
|
382
|
-
EFO:0008953: 4693
|
|
383
|
-
EFO:0008720: 2768
|
|
384
|
-
EFO:0008930: 1679
|
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
scdataloader/__init__.py,sha256=Z5HURehoWw1GrecImmTXIkv4ih8Q5RxNQWPm8zjjXOA,226
|
|
2
|
-
scdataloader/__main__.py,sha256=xPOtrEpQQQZUGTnm8KTvsQcA_jR45oMG_VHqd0Ny7_M,8677
|
|
3
|
-
scdataloader/base.py,sha256=M1gD59OffRdLOgS1vHKygOomUoAMuzjpRtAfM3SBKF8,338
|
|
4
|
-
scdataloader/collator.py,sha256=VcFJcVAIeKvYkG1DPRXzoBaw2wQ6D_0lsv5Mcv-9USI,17419
|
|
5
|
-
scdataloader/config.py,sha256=wGlCR3tWyEVa69ajovJKYc86CTCJR8e1xC7BTlUOJQE,34582
|
|
6
|
-
scdataloader/data.json,sha256=Zb8c27yk3rwMgtAU8kkiWWAyUwYBrlCqKUyEtaAx9i8,8785
|
|
7
|
-
scdataloader/data.py,sha256=fMW1OgllPCz87si3DpkzOSoqnufgKlh8aW5rEVmeC_c,25133
|
|
8
|
-
scdataloader/datamodule.py,sha256=ojX0zr2cpGLoKGjWE1S_bHAEdwbFg0Ljl55hqTagW1k,43600
|
|
9
|
-
scdataloader/mapped.py,sha256=h9YKQ8SG9tyZL8c6_Wu5Xov5ODGK6FzVuFopz58xwN4,29887
|
|
10
|
-
scdataloader/preprocess.py,sha256=oAGMilgdIgggyp9B9c9627kdo6SCco2tnFhhIHY4-yc,39642
|
|
11
|
-
scdataloader/utils.py,sha256=2zIgmQHPVKHOFWqLX56Ihqtqci3_rOfCcOs642CPnX4,27183
|
|
12
|
-
scdataloader-2.0.6.dist-info/METADATA,sha256=lnOF9PLih91AxcSI3L2OsD-0FH2kh3Qt2G7p6h3JESk,10314
|
|
13
|
-
scdataloader-2.0.6.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
14
|
-
scdataloader-2.0.6.dist-info/entry_points.txt,sha256=VXAN1m_CjbdLJ6SKYR0sBLGDV4wvv31ri7fWWuwbpno,60
|
|
15
|
-
scdataloader-2.0.6.dist-info/licenses/LICENSE,sha256=rGy_eYmnxtbOvKs7qt5V0czSWxJwgX_MlgMyTZwDHbc,1073
|
|
16
|
-
scdataloader-2.0.6.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|