scdataloader 2.0.7__py3-none-any.whl → 2.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scdataloader/data.py CHANGED
@@ -200,7 +200,7 @@ class Dataset(torchDataset):
200
200
  def get_label_cats(
201
201
  self,
202
202
  obs_keys: Union[str, List[str]],
203
- ):
203
+ ) -> np.ndarray:
204
204
  """
205
205
  Get combined categorical codes for one or more label columns.
206
206
 
@@ -226,7 +226,7 @@ class Dataset(torchDataset):
226
226
  labels = concat_categorical_codes([labels, labels_to_str])
227
227
  return np.array(labels.codes)
228
228
 
229
- def get_unseen_mapped_dataset_elements(self, idx: int):
229
+ def get_unseen_mapped_dataset_elements(self, idx: int) -> list[str]:
230
230
  """
231
231
  Get genes marked as unseen for a specific sample.
232
232
 
@@ -136,7 +136,7 @@ class DataModule(L.LightningDataModule):
136
136
  increases sampling weight balance over epochs. Defaults to 0.
137
137
  start_at (int, optional): Starting index for resuming inference. Requires same
138
138
  number of GPUs as previous run. Defaults to 0.
139
- **kwargs: Additional arguments passed to PyTorch DataLoader (e.g., batch_size,
139
+ **kwargs (dict[str, Any]): Additional arguments passed to PyTorch DataLoader (e.g., batch_size,
140
140
  num_workers, pin_memory).
141
141
 
142
142
  Attributes:
@@ -256,7 +256,7 @@ class DataModule(L.LightningDataModule):
256
256
  )
257
257
 
258
258
  @property
259
- def decoders(self):
259
+ def decoders(self) -> dict[str, dict[int, str]]:
260
260
  """
261
261
  decoders the decoders for any labels that would have been encoded
262
262
 
@@ -269,7 +269,7 @@ class DataModule(L.LightningDataModule):
269
269
  return decoders
270
270
 
271
271
  @property
272
- def labels_hierarchy(self):
272
+ def labels_hierarchy(self) -> dict[str, dict[str, str]]:
273
273
  """
274
274
  labels_hierarchy the hierarchy of labels for any cls that would have a hierarchy
275
275
 
@@ -287,7 +287,7 @@ class DataModule(L.LightningDataModule):
287
287
  return labels_hierarchy
288
288
 
289
289
  @property
290
- def genes(self):
290
+ def genes(self) -> list:
291
291
  """
292
292
  genes the genes used in this datamodule
293
293
 
@@ -343,7 +343,7 @@ class DataModule(L.LightningDataModule):
343
343
  def num_datasets(self):
344
344
  return len(self.dataset.mapped_dataset.storages)
345
345
 
346
- def setup(self, stage=None):
346
+ def setup(self, stage: Optional[str] = None) -> list[str]:
347
347
  """
348
348
  Prepare data splits for training, validation, and testing.
349
349
 
@@ -512,7 +512,7 @@ class DataModule(L.LightningDataModule):
512
512
  print(f"done setup, took {time.time() - start_time:.2f} seconds")
513
513
  return self.test_datasets
514
514
 
515
- def train_dataloader(self, **kwargs):
515
+ def train_dataloader(self, **kwargs) -> DataLoader:
516
516
  """
517
517
  Create the training DataLoader with weighted random sampling.
518
518
 
@@ -521,7 +521,7 @@ class DataModule(L.LightningDataModule):
521
521
  distributed training without weighting.
522
522
 
523
523
  Args:
524
- **kwargs: Additional arguments passed to DataLoader, overriding defaults.
524
+ **kwargs (dict[str, Any]): Additional arguments passed to DataLoader, overriding defaults.
525
525
 
526
526
  Returns:
527
527
  DataLoader: Training DataLoader instance.
@@ -560,7 +560,7 @@ class DataModule(L.LightningDataModule):
560
560
  **current_loader_kwargs,
561
561
  )
562
562
 
563
- def val_dataloader(self):
563
+ def val_dataloader(self) -> Union[DataLoader, list]:
564
564
  """
565
565
  Create the validation DataLoader.
566
566
 
@@ -576,7 +576,7 @@ class DataModule(L.LightningDataModule):
576
576
  else []
577
577
  )
578
578
 
579
- def test_dataloader(self):
579
+ def test_dataloader(self) -> Union[DataLoader, list]:
580
580
  """
581
581
  Create the test DataLoader with sequential sampling.
582
582
 
@@ -591,7 +591,7 @@ class DataModule(L.LightningDataModule):
591
591
  else []
592
592
  )
593
593
 
594
- def predict_dataloader(self):
594
+ def predict_dataloader(self) -> DataLoader:
595
595
  """
596
596
  Create a DataLoader for prediction over all training data.
597
597
 
@@ -87,10 +87,12 @@ class Preprocessor:
87
87
  If int, filters cells with counts. Defaults to False.
88
88
  normalize_sum (float or bool, optional): Determines whether to normalize the total counts of each cell to a specific value.
89
89
  Defaults to 1e4.
90
- log1p (bool, optional): Determines whether to apply log1p transform to the normalized data.
91
- Defaults to True.
92
90
  n_hvg_for_postp (int or bool, optional): Determines whether to subset to highly variable genes for the PCA.
93
91
  Defaults to False.
92
+ use_layer (str, optional): The layer to use for preprocessing.
93
+ Defaults to None.
94
+ is_symbol (bool, optional): Whether genes are provided as symbols instead of Ensembl IDs.
95
+ Defaults to False.
94
96
  hvg_flavor (str, optional): Specifies the flavor of highly variable genes selection.
95
97
  See :func:`scanpy.pp.highly_variable_genes` for more details. Defaults to "seurat_v3".
96
98
  binning (int, optional): Determines whether to bin the data into discrete values of number of bins provided.
@@ -112,10 +114,20 @@ class Preprocessor:
112
114
  Defaults to 5.
113
115
  pct_mt_outlier (int, optional): The maximum percentage of mitochondrial genes outlier.
114
116
  Defaults to 8.
115
- batch_key (str, optional): The key of :class:`~anndata.AnnData.obs` to use for batch information.
117
+ batch_keys (List[str], optional): The keys of :class:`~anndata.AnnData.obs` to use for batch information.
116
118
  This arg is used in the highly variable gene selection step.
117
119
  skip_validate (bool, optional): Determines whether to skip the validation step.
118
120
  Defaults to False.
121
+ additional_preprocess (Callable, optional): Additional preprocessing function.
122
+ Defaults to None.
123
+ additional_postprocess (Callable, optional): Additional postprocessing function.
124
+ Defaults to None.
125
+ do_postp (bool, optional): Whether to perform postprocessing.
126
+ Defaults to True.
127
+ organisms (List[str], optional): List of organisms to support.
128
+ Defaults to ["NCBITaxon:9606", "NCBITaxon:10090"].
129
+ use_raw (bool, optional): Whether to use raw counts.
130
+ Defaults to True.
119
131
  keepdata (bool, optional): Determines whether to keep the data in the AnnData object.
120
132
  Defaults to False.
121
133
  drop_non_primary (bool, optional): Determines whether to drop non-primary cells.
@@ -483,13 +495,20 @@ class LaminPreprocessor(Preprocessor):
483
495
  version: str = "2",
484
496
  ):
485
497
  """
486
- format controls the different input value wrapping, including categorical
487
- binned style, fixed-sum normalized counts, log1p fixed-sum normalized counts, etc.
498
+ Process data with format controlling different input value wrapping.
499
+
500
+ Includes support for categorical binned style, fixed-sum normalized counts,
501
+ log1p fixed-sum normalized counts, etc.
488
502
 
489
503
  Args:
490
- adata (AnnData): The AnnData object to preprocess.
491
- batch_key (str, optional): The key of AnnData.obs to use for batch information. This arg
492
- is used in the highly variable gene selection step.
504
+ data (Union[ln.Collection, AnnData]): The AnnData object or Collection to preprocess.
505
+ name (str, optional): Name for the preprocessed dataset. Defaults to "preprocessed dataset".
506
+ description (str, optional): Description for the preprocessed dataset.
507
+ Defaults to "preprocessed dataset using scprint".
508
+ start_at (int, optional): Starting index for resuming preprocessing.
509
+ Defaults to 0.
510
+ version (str, optional): Version string for the dataset.
511
+ Defaults to "2".
493
512
  """
494
513
  files = []
495
514
  all_ready_processed_keys = set()
scdataloader/utils.py CHANGED
@@ -28,10 +28,11 @@ def fileToList(filename: str, strconv: callable = lambda x: x) -> list:
28
28
  loads an input file with a\\n b\\n.. into a list [a,b,..]
29
29
 
30
30
  Args:
31
- input_str (str): The input string to be completed.
31
+ filename (str): The filename to load from.
32
+ strconv (callable): A function to convert each line. Defaults to identity function.
32
33
 
33
34
  Returns:
34
- str: The completed string with 'complete' appended.
35
+ list: The list of converted elements from the file.
35
36
  """
36
37
  with open(filename) as f:
37
38
  return [strconv(val[:-1]) for val in f.readlines()]
@@ -44,7 +45,7 @@ def listToFile(
44
45
  listToFile loads a list with [a,b,..] into an input file a\\n b\\n..
45
46
 
46
47
  Args:
47
- l (list): The list of elements to be written to the file.
48
+ li (list): The list of elements to be written to the file.
48
49
  filename (str): The name of the file where the list will be written.
49
50
  strconv (callable, optional): A function to convert each element of the list to a string. Defaults to str.
50
51
 
@@ -124,7 +125,7 @@ def getBiomartTable(
124
125
  attributes: List[str] = [],
125
126
  bypass_attributes: bool = False,
126
127
  database: str = "hsapiens_gene_ensembl",
127
- ):
128
+ ) -> pd.DataFrame:
128
129
  """generate a genelist dataframe from ensembl's biomart
129
130
 
130
131
  Args:
@@ -175,14 +176,14 @@ def getBiomartTable(
175
176
  return res
176
177
 
177
178
 
178
- def validate(adata: AnnData, organism: str, need_all=False):
179
+ def validate(adata: AnnData, organism: str, need_all: bool = False) -> bool:
179
180
  """
180
181
  validate checks if the adata object is valid for lamindb
181
182
 
182
183
  Args:
183
- adata (anndata): the anndata object
184
- lb (lamindb): the lamindb instance
185
- organism (str): the organism
184
+ adata (AnnData): the anndata object
185
+ organism (str): the organism ontology ID
186
+ need_all (bool, optional): whether all columns should be present. Defaults to False.
186
187
 
187
188
  Raises:
188
189
  ValueError: if the adata object is not valid
@@ -298,7 +299,7 @@ def get_descendants(val, df):
298
299
  return r_onto | ontos
299
300
 
300
301
 
301
- def get_ancestry_mapping(all_elem: List[str], onto_df: pd.DataFrame):
302
+ def get_ancestry_mapping(all_elem: List[str], onto_df: pd.DataFrame) -> dict:
302
303
  """
303
304
  This function generates a mapping of all elements to their ancestors in the ontology dataframe.
304
305
 
@@ -339,13 +340,12 @@ def load_dataset_local(
339
340
  description: str,
340
341
  use_cache: bool = True,
341
342
  only: Optional[List[int]] = None,
342
- ):
343
+ ) -> ln.Dataset:
343
344
  """
344
345
  This function loads a remote lamindb dataset to local.
345
346
 
346
347
  Args:
347
- lb (lamindb): The lamindb instance.
348
- remote_dataset (lamindb.Dataset): The remote Dataset.
348
+ remote_dataset (lamindb.Collection): The remote Collection.
349
349
  download_folder (str): The path to the download folder.
350
350
  name (str): The name of the dataset.
351
351
  description (str): The description of the dataset.
@@ -396,7 +396,7 @@ def load_dataset_local(
396
396
 
397
397
  def load_genes(
398
398
  organisms: Union[str, List[str]] = "NCBITaxon:9606",
399
- ): # "NCBITaxon:10090",
399
+ ) -> pd.DataFrame: # "NCBITaxon:10090",
400
400
  """
401
401
  Loads genes from the given organisms.
402
402
 
@@ -664,7 +664,7 @@ def populate_my_ontology(
664
664
  ln.save(records)
665
665
 
666
666
 
667
- def random_str(stringLength=6, stype="all", withdigits=True):
667
+ def random_str(stringLength=6, stype="all", withdigits=True) -> str:
668
668
  """
669
669
  Generate a random string of letters and digits
670
670
 
@@ -673,7 +673,7 @@ def random_str(stringLength=6, stype="all", withdigits=True):
673
673
  stype (str, optional): one of lowercase, uppercase, all. Defaults to 'all'.
674
674
  withdigits (bool, optional): digits allowed in the string? Defaults to True.
675
675
 
676
- Returns:
676
+ Returns:
677
677
  str: random string
678
678
  """
679
679
  if stype == "lowercase":
@@ -687,12 +687,12 @@ def random_str(stringLength=6, stype="all", withdigits=True):
687
687
  return "".join(random.choice(lettersAndDigits) for i in range(stringLength))
688
688
 
689
689
 
690
- def is_outlier(adata: AnnData, metric: str, nmads: int):
690
+ def is_outlier(adata: AnnData, metric: str, nmads: int) -> pd.Series:
691
691
  """
692
692
  is_outlier detects outliers in adata.obs[metric]
693
693
 
694
694
  Args:
695
- adata (annData): the anndata object
695
+ adata (AnnData): the anndata object
696
696
  metric (str): the metric column to use
697
697
  nmads (int): the number of median absolute deviations to use as a threshold
698
698
 
@@ -706,16 +706,16 @@ def is_outlier(adata: AnnData, metric: str, nmads: int):
706
706
  return outlier
707
707
 
708
708
 
709
- def length_normalize(adata: AnnData, gene_lengths: list):
709
+ def length_normalize(adata: AnnData, gene_lengths: list) -> AnnData:
710
710
  """
711
711
  length_normalize normalizes the counts by the gene length
712
712
 
713
713
  Args:
714
- adata (anndata): the anndata object
714
+ adata (AnnData): the anndata object
715
715
  gene_lengths (list): the gene lengths
716
716
 
717
717
  Returns:
718
- anndata: the anndata object
718
+ AnnData: the normalized anndata object
719
719
  """
720
720
  adata.X = csr_matrix((adata.X.T / gene_lengths).T)
721
721
  return adata
@@ -723,13 +723,13 @@ def length_normalize(adata: AnnData, gene_lengths: list):
723
723
 
724
724
  def translate(
725
725
  val: Union[str, list, set, Counter, dict], t: str = "cell_type_ontology_term_id"
726
- ):
726
+ ) -> dict:
727
727
  """
728
728
  translate translates the ontology term id to the name
729
729
 
730
730
  Args:
731
- val (str, dict, set, list, dict): the object to translate
732
- t (flat, optional): the type of ontology terms.
731
+ val (Union[str, dict, set, list]): the object to translate
732
+ t (str, optional): the type of ontology terms.
733
733
  one of cell_type_ontology_term_id, assay_ontology_term_id, tissue_ontology_term_id.
734
734
  Defaults to "cell_type_ontology_term_id".
735
735
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: scdataloader
3
- Version: 2.0.7
3
+ Version: 2.0.9
4
4
  Summary: a dataloader for single cell data in lamindb
5
5
  Project-URL: repository, https://github.com/jkobject/scDataLoader
6
6
  Author-email: jkobject <jkobject@gmail.com>
@@ -52,9 +52,10 @@ Description-Content-Type: text/markdown
52
52
  [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
53
53
  [![DOI](https://zenodo.org/badge/731248665.svg)](https://doi.org/10.5281/zenodo.10573143)
54
54
 
55
- <img src="scdataloader.png" width="600">
55
+ <img src="./docs/scdataloader.png" width="600">
56
56
 
57
- This single cell pytorch dataloader / lighting datamodule is designed to be used with:
57
+ This single cell pytorch dataloader / lightning datamodule is designed to be used
58
+ with:
58
59
 
59
60
  - [lamindb](https://lamin.ai/)
60
61
 
@@ -66,11 +67,13 @@ and:
66
67
  It allows you to:
67
68
 
68
69
  1. load thousands of datasets containing millions of cells in a few seconds.
69
- 2. preprocess the data per dataset and download it locally (normalization, filtering, etc.)
70
+ 2. preprocess the data per dataset and download it locally (normalization,
71
+ filtering, etc.)
70
72
  3. create a more complex single cell dataset
71
73
  4. extend it to your need
72
74
 
73
- built on top of `lamindb` and the `.mapped()` function by Sergei: https://github.com/Koncopd
75
+ built on top of `lamindb` and the `.mapped()` function by Sergei:
76
+ https://github.com/Koncopd
74
77
 
75
78
  ```
76
79
  Portions of the mapped.py file are derived from Lamin Labs
@@ -81,11 +84,17 @@ Please see https://github.com/laminlabs/lamindb/blob/main/lamindb/core/_mapped_c
81
84
  for the original implementation
82
85
  ```
83
86
 
84
- The package has been designed together with the [scPRINT paper](https://doi.org/10.1101/2024.07.29.605556) and [model](https://github.com/cantinilab/scPRINT).
87
+ The package has been designed together with the
88
+ [scPRINT paper](https://doi.org/10.1101/2024.07.29.605556) and
89
+ [model](https://github.com/cantinilab/scPRINT).
85
90
 
86
91
  ## More
87
92
 
88
- I needed to create this Data Loader for my PhD project. I am using it to load & preprocess thousands of datasets containing millions of cells in a few seconds. I believed that individuals employing AI for single-cell RNA sequencing and other sequencing datasets would eagerly utilize and desire such a tool, which presently does not exist.
93
+ I needed to create this Data Loader for my PhD project. I am using it to load &
94
+ preprocess thousands of datasets containing millions of cells in a few seconds.
95
+ I believed that individuals employing AI for single-cell RNA sequencing and
96
+ other sequencing datasets would eagerly utilize and desire such a tool, which
97
+ presently does not exist.
89
98
 
90
99
  ![scdataloader.drawio.png](docs/scdataloader.drawio.png)
91
100
 
@@ -99,12 +108,14 @@ pip install scDataLoader[dev] # for dev dependencies
99
108
  lamin init --storage ./testdb --name test --schema bionty
100
109
  ```
101
110
 
102
- if you start with lamin and had to do a `lamin init`, you will also need to populate your ontologies. This is because scPRINT is using ontologies to define its cell types, diseases, sexes, ethnicities, etc.
111
+ if you start with lamin and had to do a `lamin init`, you will also need to
112
+ populate your ontologies. This is because scPRINT is using ontologies to define
113
+ its cell types, diseases, sexes, ethnicities, etc.
103
114
 
104
115
  you can do it manually or with our function:
105
116
 
106
117
  ```python
107
- from scdataloader.utils import populate_my_ontology
118
+ from scdataloader.utils import populate_my_ontology, _adding_scbasecamp_genes
108
119
 
109
120
  populate_my_ontology() #to populate everything (recommended) (can take 2-10mns)
110
121
 
@@ -118,11 +129,14 @@ organisms: List[str] = ["NCBITaxon:10090", "NCBITaxon:9606"],
118
129
  diseases = None,
119
130
  dev_stages = None,
120
131
  )
132
+ # if you want to load the gene names and species for the arc scbasecount species, also add this:
133
+ _adding_scbasecamp_genes()
121
134
  ```
122
135
 
123
136
  ### Dev install
124
137
 
125
- If you want to use the latest version of scDataLoader and work on the code yourself use `git clone` and `pip -e` instead of `pip install`.
138
+ If you want to use the latest version of scDataLoader and work on the code
139
+ yourself use `git clone` and `pip -e` instead of `pip install`.
126
140
 
127
141
  ```bash
128
142
  git clone https://github.com/jkobject/scDataLoader.git
@@ -161,6 +175,12 @@ datamodule = DataModule(
161
175
  )
162
176
  ```
163
177
 
178
+ see the notebooks in [docs](https://www.jkobject.com/scDataLoader/) to learn
179
+ more
180
+
181
+ 1. [load a dataset](https://www.jkobject.com/scDataLoader/notebooks/1_download_and_preprocess/)
182
+ 2. [create a dataset](https://www.jkobject.com/scDataLoader/notebooks/2_create_dataloader/)
183
+
164
184
  ### lightning-free usage (Dataset+Collator+DataLoader)
165
185
 
166
186
  ```python
@@ -211,7 +231,17 @@ for batch in tqdm(dataloader):
211
231
  )
212
232
  ```
213
233
 
214
- ### Usage on all of cellxgene
234
+ ## Gathering a pre-training database
235
+
236
+ Here I will explain how to gather and preprocess all of cellxgene (scPRINT-1
237
+ pretraining database) with scDataLoader, and the scPRINT-2 corpus (scPRINT-2
238
+ pretraining database).
239
+
240
+ ### Getting all of cellxgene
241
+
242
+ Here is an example of how to download and preprocess all of cellxgene with
243
+ scDataLoader as a script (a notebook version is also available in
244
+ [./notebooks/update_lamin_or_cellxgene.ipynb](https://github.com/jkobject/scdataloader/blob/main/notebooks/update_lamin_or_cellxgene.ipynb)).
215
245
 
216
246
  ```python
217
247
  # initialize a local lamin database
@@ -226,11 +256,25 @@ DESCRIPTION='preprocessed by scDataLoader'
226
256
  cx_dataset = ln.Collection.using(instance="laminlabs/cellxgene").filter(name="cellxgene-census", version='2023-12-15').one()
227
257
  cx_dataset, len(cx_dataset.artifacts.all())
228
258
 
259
+ # (OPTIONAL) if you want to do your preprocessing on a slurm cluster without internet connections,
260
+ # you can first do this:
261
+ load_dataset_local(
262
+ cx_dataset,
263
+ download_folder="/my_download_folder",
264
+ name="cached-cellxgene-census",
265
+ description="all of it topreprocess",
266
+ )
229
267
 
268
+ # preprocessing
230
269
  do_preprocess = LaminPreprocessor(additional_postprocess=additional_postprocess, additional_preprocess=additional_preprocess, skip_validate=True, subset_hvg=0)
231
270
 
232
271
  preprocessed_dataset = do_preprocess(cx_dataset, name=DESCRIPTION, description=DESCRIPTION, start_at=6, version="2")
233
272
 
273
+ ```
274
+
275
+ After this you can use the preprocessed dataset with the DataModule below.
276
+
277
+ ```python
234
278
  # create dataloaders
235
279
  from scdataloader import DataModule
236
280
  import tqdm
@@ -252,27 +296,52 @@ for i in tqdm.tqdm(datamodule.train_dataloader()):
252
296
 
253
297
  # with lightning:
254
298
  # Trainer(model, datamodule)
299
+ ```
300
+
301
+ You can use the command line to preprocess a large database of datasets like
302
+ here for cellxgene. This allows parallelizing and easier usage.
255
303
 
304
+ ```bash
305
+ scdataloader --instance "laminlabs/cellxgene" --name "cellxgene-census" --version "2023-12-15" --description "preprocessed for scprint" --new_name "scprint main" --start_at 10 >> scdataloader.out
256
306
  ```
257
307
 
258
- see the notebooks in [docs](https://www.jkobject.com/scDataLoader/):
308
+ ### Getting the rest of the scPRINT-2 corpus
259
309
 
260
- 1. [load a dataset](https://www.jkobject.com/scDataLoader/notebooks/1_download_and_preprocess/)
261
- 2. [create a dataset](https://www.jkobject.com/scDataLoader/notebooks/2_create_dataloader/)
310
+ by now, using the command / scripts above you should be able to get all of
311
+ cellxgene (and preprocess it). laminlabs now also hosts the rest of the
312
+ scPRINT-2 corpus in `laminlabs/arc-virtual-cell-atlas` and they can be
313
+ downloaded and preprocessed the same way as cellxgene above. Be careful however
314
+ that there is no metadata for these datasets.
262
315
 
263
- ### command line preprocessing
316
+ You can have a look at my notebooks:
317
+ [./notebooks/adding_tahoe.ipynb](https://github.com/jkobject/scdataloader/blob/main/notebooks/adding_tahoe.ipynb)
318
+ and
319
+ [./notebooks/adding_scbasecount.ipynb](https://github.com/jkobject/scdataloader/blob/main/notebooks/adding_scbasecount.ipynb)
320
+ where I create some remapping to retrieve metadata that can be used by
321
+ scdataloader and lamindb from these datasets.
264
322
 
265
- You can use the command line to preprocess a large database of datasets like here for cellxgene. this allows parallelizing and easier usage.
323
+ If you do not have access for some reason to these datasets, please contact
324
+ laminlabs. But another solution, is to download them from the original sources
325
+ and add them one by one in your instance and then do the same preprocessing but
326
+ this time use `your_account/your_instance` instead of
327
+ `laminlabs/arc-virtual-cell-atlas`.
266
328
 
267
- ```bash
268
- scdataloader --instance "laminlabs/cellxgene" --name "cellxgene-census" --version "2023-12-15" --description "preprocessed for scprint" --new_name "scprint main" --start_at 10 >> scdataloader.out
269
- ```
329
+ This is actually what I did in my own instance to create the full scPRINT-2
330
+ corpus and you can see some of it in the notebooks above.
331
+
332
+ ### Getting even more
333
+
334
+ They also host a perturbation atlas in `laminlabs/pertdata` that can be
335
+ downloaded the same way.
270
336
 
271
- ### command line usage
337
+ ### command line usage to train a model
272
338
 
273
339
  The main way to use
274
340
 
275
- > please refer to the [scPRINT documentation](https://www.jkobject.com/scPRINT/) and [lightning documentation](https://lightning.ai/docs/pytorch/stable/cli/lightning_cli_intermediate.html) for more information on command line usage
341
+ > please refer to the [scPRINT documentation](https://www.jkobject.com/scPRINT/)
342
+ > and
343
+ > [lightning documentation](https://lightning.ai/docs/pytorch/stable/cli/lightning_cli_intermediate.html)
344
+ > for more information on command line usage
276
345
 
277
346
  ## FAQ
278
347
 
@@ -295,13 +364,36 @@ from scdataloader import utils
295
364
  utils.populate_ontologies() # this might take from 5-20mins
296
365
  ```
297
366
 
367
+ ### how to move my lamin instance to another folder?
368
+
369
+ you cannot just move your folder from one place to another because lamin is
370
+ using absolute paths. You need to do 3 things:
371
+
372
+ 1. move your folder to the new place
373
+ 2. update your lamin config file (usually in `~/.lamin/my_env.yml`) to point to
374
+ the new place
375
+ 3. update the absolute paths in your lamin database. You can do it like this:
376
+
377
+ ```python
378
+ import lamin as ln
379
+ ln.Storage.df()
380
+ # view what is your current storage id (in my case it was GZgLW1TQ)
381
+ ln.Storage.filter(uid="GZgLW1TI").update(
382
+ root=Path("your_new_locations").as_posix().rstrip("/")
383
+ )
384
+ ```
385
+
298
386
  ## Development
299
387
 
300
- Read the [CONTRIBUTING.md](CONTRIBUTING.md) file.
388
+ Read the
389
+ [CONTRIBUTING.md](https://github.com/jkobject/scdataloader/blob/main/CONTRIBUTING.md)
390
+ file.
301
391
 
302
392
  ## License
303
393
 
304
- This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
394
+ This project is licensed under the MIT License - see the
395
+ [LICENSE](https://github.com/jkobject/scdataloader/blob/main/LICENSE) file for
396
+ details.
305
397
 
306
398
  ## Acknowledgments
307
399
 
@@ -0,0 +1,15 @@
1
+ scdataloader/__init__.py,sha256=Z5HURehoWw1GrecImmTXIkv4ih8Q5RxNQWPm8zjjXOA,226
2
+ scdataloader/__main__.py,sha256=xPOtrEpQQQZUGTnm8KTvsQcA_jR45oMG_VHqd0Ny7_M,8677
3
+ scdataloader/base.py,sha256=M1gD59OffRdLOgS1vHKygOomUoAMuzjpRtAfM3SBKF8,338
4
+ scdataloader/collator.py,sha256=VcFJcVAIeKvYkG1DPRXzoBaw2wQ6D_0lsv5Mcv-9USI,17419
5
+ scdataloader/config.py,sha256=wGlCR3tWyEVa69ajovJKYc86CTCJR8e1xC7BTlUOJQE,34582
6
+ scdataloader/data.py,sha256=tXvONJNgcdMQIRh2KlAq9KCsf-Sz2L4GUlcGyf1OMhw,25160
7
+ scdataloader/datamodule.py,sha256=pFBGUOHl3ibi8QhiV8x5ukjzVjnJMsZWNw3Ekk3P83Y,43810
8
+ scdataloader/mapped.py,sha256=h9YKQ8SG9tyZL8c6_Wu5Xov5ODGK6FzVuFopz58xwN4,29887
9
+ scdataloader/preprocess.py,sha256=VFmyJluk4drR4fcH5qBAcJLf0cJg26ElA0HDuHOK68s,40730
10
+ scdataloader/utils.py,sha256=B81iwnR6aJs9lzkOCSRF85RszAdwS-dvPZPXA7yoMg4,27734
11
+ scdataloader-2.0.9.dist-info/METADATA,sha256=RlbjaHlNjZ4aaRfLkxbCTHcmr1DZBLfMGSfCJDS4guE,13448
12
+ scdataloader-2.0.9.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
13
+ scdataloader-2.0.9.dist-info/entry_points.txt,sha256=VXAN1m_CjbdLJ6SKYR0sBLGDV4wvv31ri7fWWuwbpno,60
14
+ scdataloader-2.0.9.dist-info/licenses/LICENSE,sha256=rGy_eYmnxtbOvKs7qt5V0czSWxJwgX_MlgMyTZwDHbc,1073
15
+ scdataloader-2.0.9.dist-info/RECORD,,
scdataloader/data.json DELETED
@@ -1,384 +0,0 @@
1
- 'lung': 42877488
2
- 'blood': 34180713
3
- 'brain': 29530595
4
- 'colon': 25830811
5
- 'unknown': 23810521
6
- 'pancreas': 21597602
7
- 'embryo': 16976623
8
- 'skin': 12513695
9
- 'liver': 10683313
10
- 'breast': 10539762
11
- 'marrow': 9543512
12
- 'kidney': 8213043
13
- 'heart': 4800567
14
- 'immune system': 4687291
15
- 'eye': 4677504
16
- # 'other (310 tissues)': 87200000
17
- 'UBERON:0002106': 4559701
18
- 'UBERON:0000029': 4314740
19
- 'UBERON:0000945': 4244624
20
- 'UBERON:0001295': 3485757
21
- 'UBERON:0002049': 3350893
22
- 'UBERON:0002108': 3226581
23
- 'UBERON:0001255': 2961628
24
- 'UBERON:0001434': 2823653
25
- 'UBERON:0001004': 2588530
26
- 'UBERON:0002037': 2101623
27
- 'UBERON:0001015': 1920181
28
- 'UBERON:0000995': 1864451
29
- 'UBERON:0001013': 1853702
30
- 'UBERON:0001017': 1818861
31
- 'UBERON:0002240': 1704269
32
- 'UBERON:0009834': 1631471
33
- 'UBERON:0000002': 1596082
34
- 'UBERON:0001893': 1565863
35
- 'UBERON:0001851': 1513857
36
- 'UBERON:0002771': 1317262
37
- 'UBERON:0002367': 1301074
38
- 'UBERON:0002369': 1279963
39
- 'UBERON:0000956': 1252422
40
- 'UBERON:0013682': 1195809
41
- 'UBERON:0000160': 1156931
42
- 'UBERON:0001987': 1127465
43
- 'UBERON:0001043': 1015278
44
- 'UBERON:0001032': 1011915
45
- 'UBERON:0000992': 948464
46
- 'UBERON:0000010': 897312
47
- 'UBERON:0000473': 838235
48
- 'UBERON:0002368': 814761
49
- 'UBERON:0002084': 761793
50
- 'UBERON:0001870': 730928
51
- 'UBERON:0000344': 655220
52
- 'UBERON:0001384': 625538
53
- 'UBERON:0000966': 598023
54
- 'UBERON:0002421': 578257
55
- 'UBERON:0001225': 542410
56
- 'UBERON:0000991': 507126
57
- 'UBERON:0002116': 506392
58
- 'UBERON:8440012': 494325
59
- 'UBERON:0001898': 483451
60
- 'UBERON:0000990': 440483
61
- 'UBERON:0002370': 405221
62
- 'UBERON:0002436': 399661
63
- 'UBERON:0001965': 387483
64
- 'UBERON:0000006': 362898
65
- 'UBERON:0001005': 358584
66
- 'UBERON:0010225': 343320
67
- 'UBERON:0002102': 335470
68
- 'UBERON:0008946': 334444
69
- 'UBERON:0000053': 310875
70
- 'UBERON:0008933': 310653
71
- 'UBERON:0001117': 308212
72
- 'UBERON:0001007': 307511
73
- 'UBERON:0000059': 288232
74
- 'UBERON:0002080': 286423
75
- 'UBERON:0002094': 284283
76
- 'UBERON:0000362': 283305
77
- 'UBERON:0002365': 282382
78
- 'UBERON:0002103': 264447
79
- 'UBERON:0001891': 249305
80
- 'UBERON:0001894': 249065
81
- 'UBERON:0000411': 236082
82
- 'UBERON:0002728': 221808
83
- 'UBERON:0000451': 219655
84
- 'UBERON:0001161': 214938
85
- 'UBERON:0000030': 207230
86
- 'UBERON:0009835': 206197
87
- 'UBERON:0000988': 204168
88
- 'UBERON:0001707': 198868
89
- 'UBERON:0016538': 192147
90
- 'UBERON:0002450': 189973
91
- 'UBERON:0016540': 187526
92
- 'UBERON:0000977': 185185
93
- 'UBERON:0001913': 183332
94
- 'UBERON:0001786': 179309
95
- 'UBERON:0034751': 145757
96
- 'UBERON:0001040': 144260
97
- 'UBERON:0016530': 141537
98
- 'UBERON:0001238': 136529
99
- 'UBERON:0003889': 132985
100
- 'UBERON:0000453': 125672
101
- 'UBERON:0001723': 125523
102
- 'UBERON:0002098': 120306
103
- 'UBERON:0016525': 118039
104
- 'UBERON:0000004': 114936
105
- 'UBERON:0002686': 110752
106
- 'UBERON:0022352': 110671
107
- 'UBERON:0002351': 107573
108
- 'UBERON:0001828': 105079
109
- 'UBERON:0001003': 104237
110
- 'UBERON:0002067': 100247
111
- 'UBERON:0002190': 99711
112
- 'UBERON:0005290': 99110
113
- 'UBERON:0002811': 98536
114
- 'UBERON:0001871': 97537
115
- 'UBERON:0003688': 92733
116
- 'UBERON:0010410': 91248
117
- 'UBERON:0000403': 90975
118
- 'UBERON:0000175': 90741
119
- 'UBERON:0001976': 87947
120
- 'UBERON:0002822': 87214
121
- 'UBERON:0001890': 86604
122
- 'UBERON:0014454': 86134
123
- 'UBERON:0002810': 85525
124
- 'UBERON:0002079': 82676
125
- 'UBERON:0010414': 80085
126
- 'UBERON:0003126': 79986
127
- 'UBERON:0003027': 75807
128
- 'UBERON:0008345': 73634
129
- 'UBERON:0002352': 73045
130
- 'UBERON:8410025': 72957
131
- 'UBERON:0001393': 72549
132
- 'UBERON:0010412': 71752
133
- 'UBERON:0001159': 69025
134
- 'UBERON:0014918': 65710
135
- 'UBERON:0002078': 64486
136
- 'UBERON:0007650': 64093
137
- 'UBERON:0002808': 63227
138
- 'UBERON:0007644': 62647
139
- 'UBERON:8410010': 61264
140
- 'UBERON:0001157': 60907
141
- 'UBERON:0002185': 59377
142
- 'UBERON:0002114': 59363
143
- 'UBERON:0000916': 59001
144
- 'UBERON:0014455': 58536
145
- 'UBERON:0002661': 58226
146
- 'UBERON:0001873': 57571
147
- 'UBERON:0001769': 57422
148
- 'UBERON:0008953': 56919
149
- 'UBERON:0002372': 56899
150
- 'UBERON:0002509': 55694
151
- 'UBERON:0000397': 55666
152
- 'UBERON:0001156': 54735
153
- 'UBERON:0023787': 54666
154
- 'UBERON:0002299': 54085
155
- 'UBERON:0034893': 53956
156
- 'UBERON:0001872': 53269
157
- 'UBERON:0007177': 49990
158
- 'UBERON:0004026': 48185
159
- 'UBERON:0012648': 48068
160
- 'UBERON:0001630': 47291
161
- 'UBERON:0002803': 47176
162
- 'UBERON:0016632': 45130
163
- 'UBERON:0008803': 43943
164
- 'UBERON:0001049': 43485
165
- 'UBERON:0016475': 42323
166
- 'UBERON:0002363': 42319
167
- 'UBERON:0001874': 41277
168
- 'UBERON:0000964': 40990
169
- 'UBERON:0011189': 40231
170
- 'UBERON:0036288': 36574
171
- 'UBERON:0008954': 35284
172
- 'UBERON:0018131': 34687
173
- 'UBERON:0004648': 34173
174
- 'UBERON:0034891': 34153
175
- 'UBERON:0001775': 34132
176
- 'UBERON:0018707': 33610
177
- 'UBERON:0003661': 32722
178
- 'UBERON:0003403': 32138
179
- 'UBERON:0035328': 31696
180
- 'UBERON:0001728': 31325
181
- 'UBERON:0001388': 30877
182
- 'UBERON:0008952': 29895
183
- 'UBERON:0000080': 29606
184
- 'UBERON:0004024': 29064
185
- 'UBERON:0035886': 28873
186
- 'UBERON:0004023': 28857
187
- 'UBERON:0013473': 28621
188
- 'UBERON:0018105': 28367
189
- 'UBERON:0005969': 27736
190
- 'UBERON:0012168': 27154
191
- 'UBERON:0001886': 27092
192
- 'UBERON:0000400': 27087
193
- 'UBERON:0001911': 26952
194
- 'UBERON:0000088': 26853
195
- 'UBERON:0001153': 25865
196
- 'UBERON:0001471': 24982
197
- 'UBERON:0001085': 24807
198
- 'UBERON:0000057': 24700
199
- 'UBERON:0006761': 24573
200
- 'UBERON:0002809': 24445
201
- 'UBERON:0001158': 23887
202
- 'UBERON:0008972': 23110
203
- 'UBERON:0002807': 22796
204
- 'UBERON:0010506': 22652
205
- 'UBERON:0001459': 21633
206
- 'UBERON:8410000': 21592
207
- 'UBERON:0001831': 21003
208
- 'UBERON:0003544': 20212
209
- 'UBERON:0002110': 19880
210
- 'UBERON:0014614': 19650
211
- 'UBERON:8300000': 19408
212
- 'UBERON:0035895': 18814
213
- 'UBERON:0035213': 18775
214
- 'UBERON:0001162': 18404
215
- 'UBERON:0000056': 18354
216
- 'UBERON:0001228': 17958
217
- 'UBERON:0008971': 17831
218
- 'UBERON:0013756': 17625
219
- 'UBERON:0001052': 16913
220
- 'UBERON:0012474': 16607
221
- 'UBERON:0039167': 16527
222
- 'UBERON:0002317': 15963
223
- 'UBERON:0002115': 15762
224
- 'UBERON:0014648': 15580
225
- 'UBERON:8480028': 15307
226
- 'UBERON:0000014': 15215
227
- 'UBERON:0002489': 15127
228
- 'UBERON:0001836': 14502
229
- 'UBERON:0005343': 14336
230
- 'UBERON:8410026': 14090
231
- 'UBERON:0002132': 13953
232
- 'UBERON:0000965': 13900
233
- 'UBERON:0010415': 12330
234
- 'UBERON:0000017': 11977
235
- 'UBERON:0018303': 11937
236
- 'UBERON:0002382': 11898
237
- 'UBERON:0002046': 11840
238
- 'UBERON:0001087': 11702
239
- 'UBERON:0009958': 11377
240
- 'UBERON:0005616': 11243
241
- 'UBERON:8480009': 10533
242
- 'UBERON:0013535': 9915
243
- 'UBERON:0007106': 9898
244
- 'UBERON:0001513': 9887
245
- 'UBERON:0015790': 9816
246
- 'UBERON:0001068': 9773
247
- 'UBERON:0035894': 9667
248
- 'UBERON:0015476': 9656
249
- 'UBERON:0001637': 9652
250
- 'UBERON:0002129': 9649
251
- 'UBERON:0010033': 9467
252
- 'UBERON:0000947': 9290
253
- 'UBERON:0001511': 9288
254
- 'UBERON:0004946': 9195
255
- 'UBERON:0016435': 9097
256
- 'UBERON:0002420': 9000
257
- 'UBERON:0001868': 8799
258
- 'UBERON:0002021': 8799
259
- 'UBERON:0001542': 8683
260
- 'UBERON:0002081': 8279
261
- 'UBERON:0004070': 8033
262
- 'UBERON:0008612': 7825
263
- 'UBERON:0001901': 7228
264
- 'UBERON:0004929': 7133
265
- 'UBERON:0003528': 6670
266
- 'UBERON:0002427': 6529
267
- 'UBERON:0004025': 6448
268
- 'UBERON:0009472': 6348
269
- 'UBERON:0002756': 6279
270
- 'UBERON:0001832': 6196
271
- 'UBERON:0002378': 6142
272
- 'UBERON:0004167': 5874
273
- 'UBERON:0002228': 5725
274
- 'UBERON:0003968': 5625
275
- 'UBERON:0001154': 5515
276
- 'UBERON:0001046': 5420
277
- 'UBERON:0010032': 5367
278
- 'UBERON:0004339': 4969
279
- 'UBERON:0002385': 4881
280
- 'UBERON:0001621': 4867
281
- 'UBERON:0001416': 4808
282
- 'UBERON:0001638': 4395
283
- 'UBERON:0002429': 4355
284
- 'UBERON:0001165': 4028
285
- 'UBERON:0008989': 3997
286
- 'UBERON:0001902': 3883
287
- 'UBERON:0003532': 3443
288
- 'UBERON:0003428': 3406
289
- 'UBERON:0002082': 3226
290
- 'UBERON:0001296': 3025
291
- 'UBERON:0015143': 3014
292
- 'UBERON:0000074': 3011
293
- 'UBERON:0002245': 2971
294
- 'UBERON:0001293': 2693
295
- 'UBERON:0007625': 2348
296
- 'UBERON:0003547': 2344
297
- 'UBERON:0022277': 2333
298
- 'UBERON:0001554': 2272
299
- 'UBERON:0001348': 2223
300
- 'UBERON:0005406': 2121
301
- 'UBERON:0001811': 2084
302
- 'UBERON:0013531': 2055
303
- 'UBERON:0008934': 1866
304
- 'UBERON:0001103': 1858
305
- 'UBERON:0005636': 1651
306
- 'UBERON:0007225': 1594
307
- 'UBERON:0007224': 1564
308
- 'UBERON:0000016': 1520
309
- 'UBERON:8440075': 1482
310
- 'UBERON:0004264': 1475
311
- 'UBERON:0001773': 1431
312
- 'UBERON:0013706': 1150
313
- 'UBERON:0023852': 1143
314
- 'UBERON:0001294': 849
315
- 'UBERON:0001134': 835
316
- 'UBERON:0003902': 675
317
- 'UBERON:0001224': 569
318
- 'UBERON:0005564': 399
319
- 'UBERON:0001817': 248
320
- 'UBERON:0002802': 163
321
- 'UBERON:0003072': 146
322
- 'UBERON:0000926': 139
323
- 'UBERON:0000416': 82
324
- 'UBERON:0003517': 48
325
- 'UBERON:0001483': 37
326
- 'UBERON:1000021': 37
327
- 'UBERON:0002023': 1
328
-
329
-
330
-
331
- human: 230822337
332
- mouse: 102753838
333
- # other 14070000
334
- macaque: 3161179
335
- zebrafish: 3117237
336
- pig: 1641179
337
- thale cress: 1564711
338
- drosophila: 1564460
339
- chicken: 606680
340
- nake mole rat: 441042
341
- rabbit: 417091
342
- cow: 390675
343
- corn: 336166
344
- chimpanzee: 293472
345
- c. elegans: 251759
346
- sheep: 177871
347
- marmoset: 115584
348
-
349
- 10x 3': 146909525
350
- 10x 3' v2: 12021763
351
- 10x 3' v3: 121241920
352
- # 10x 3': 280173208
353
- 10x 5' v1: 4716686
354
- 10x 5' v2: 3104399
355
- 10x 5': 28919386
356
- # 10x 5': 36740471
357
- 10x multiome: 6942314
358
- 10x (vdj): 1620181
359
- 10x (CITE): 1580477
360
- # 10x multiome: 10142972
361
- sciRNA-seq: 14924076
362
- slide-seq: 1867286
363
- scalebio: 685024
364
- microwell: 599459
365
- sciPlex: 581480
366
- drop-seq: 517799
367
- # other (17 assays): 1423506
368
- EFO:0010961: 286087
369
- EFO:0008931: 192101
370
- EFO:0700003: 146278
371
- EFO:0700010: 133430
372
- EFO:0700016: 128855
373
- EFO:0030007: 105584
374
- EFO:0008919: 87181
375
- EFO:0009901: 76022
376
- EFO:0008796: 68645
377
- EFO:0009919: 68544
378
- EFO:0700011: 58981
379
- EFO:0030019: 31775
380
- EFO:0008780: 25652
381
- EFO:0010010: 5231
382
- EFO:0008953: 4693
383
- EFO:0008720: 2768
384
- EFO:0008930: 1679
@@ -1,16 +0,0 @@
1
- scdataloader/__init__.py,sha256=Z5HURehoWw1GrecImmTXIkv4ih8Q5RxNQWPm8zjjXOA,226
2
- scdataloader/__main__.py,sha256=xPOtrEpQQQZUGTnm8KTvsQcA_jR45oMG_VHqd0Ny7_M,8677
3
- scdataloader/base.py,sha256=M1gD59OffRdLOgS1vHKygOomUoAMuzjpRtAfM3SBKF8,338
4
- scdataloader/collator.py,sha256=VcFJcVAIeKvYkG1DPRXzoBaw2wQ6D_0lsv5Mcv-9USI,17419
5
- scdataloader/config.py,sha256=wGlCR3tWyEVa69ajovJKYc86CTCJR8e1xC7BTlUOJQE,34582
6
- scdataloader/data.json,sha256=Zb8c27yk3rwMgtAU8kkiWWAyUwYBrlCqKUyEtaAx9i8,8785
7
- scdataloader/data.py,sha256=fMW1OgllPCz87si3DpkzOSoqnufgKlh8aW5rEVmeC_c,25133
8
- scdataloader/datamodule.py,sha256=ojX0zr2cpGLoKGjWE1S_bHAEdwbFg0Ljl55hqTagW1k,43600
9
- scdataloader/mapped.py,sha256=h9YKQ8SG9tyZL8c6_Wu5Xov5ODGK6FzVuFopz58xwN4,29887
10
- scdataloader/preprocess.py,sha256=oAGMilgdIgggyp9B9c9627kdo6SCco2tnFhhIHY4-yc,39642
11
- scdataloader/utils.py,sha256=Xic6QI2phQlDvYfEPhIxfL8RWiBVvdgN5BKD1U_AN6A,27509
12
- scdataloader-2.0.7.dist-info/METADATA,sha256=zQc0Czwk1aaqWMjKTWCctFXWxuwnWBg6u5KFu4ejFqY,10314
13
- scdataloader-2.0.7.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
14
- scdataloader-2.0.7.dist-info/entry_points.txt,sha256=VXAN1m_CjbdLJ6SKYR0sBLGDV4wvv31ri7fWWuwbpno,60
15
- scdataloader-2.0.7.dist-info/licenses/LICENSE,sha256=rGy_eYmnxtbOvKs7qt5V0czSWxJwgX_MlgMyTZwDHbc,1073
16
- scdataloader-2.0.7.dist-info/RECORD,,