scdataloader 0.0.3-py3-none-any.whl → 1.0.1-py3-none-any.whl

This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in the public registry.
scdataloader/utils.py CHANGED
@@ -11,9 +11,50 @@ from django.db import IntegrityError
  from scipy.sparse import csr_matrix
  from scipy.stats import median_abs_deviation
  from functools import lru_cache
+ from collections import Counter
+ from torch import Tensor
+ import torch

+ from typing import Union, List, Optional

- def createFoldersFor(filepath):
+ from anndata import AnnData
+
+
+ def downsample_profile(mat: Tensor, dropout: float):
+ """
+ This function downsamples the expression profile of a given single cell RNA matrix.
+
+ The noise is applied based on the renoise parameter,
+ the total counts of the matrix, and the number of genes. The function first calculates the noise
+ threshold (scaler) based on the renoise parameter. It then generates an initial matrix count by
+ applying a Poisson distribution to a random tensor scaled by the total counts and the number of genes.
+ The function then models the sampling zeros by applying a Poisson distribution to a random tensor
+ scaled by the noise threshold, the total counts, and the number of genes. The function also models
+ the technical zeros by generating a random tensor and comparing it to the noise threshold. The final
+ matrix count is calculated by subtracting the sampling zeros from the initial matrix count and
+ multiplying by the technical zeros. The function ensures that the final matrix count is not less
+ than zero by taking the maximum of the final matrix count and a tensor of zeros. The function
+ returns the final matrix count.
+
+ Args:
+ mat (torch.Tensor): The input matrix.
+ dropout (float): The renoise parameter.
+
+ Returns:
+ torch.Tensor: The matrix count after applying noise.
+ """
+ batch = mat.shape[0]
+ ngenes = mat.shape[1]
+ dropout = dropout * 1.1
+ # we model the sampling zeros (dropping 30% of the reads)
+ res = torch.poisson((mat * (dropout / 2))).int()
+ # we model the technical zeros (dropping 50% of the genes)
+ notdrop = (torch.rand((batch, ngenes), device=mat.device) >= (dropout / 2)).int()
+ mat = (mat - res) * notdrop
+ return torch.maximum(mat, torch.zeros((1, 1), device=mat.device, dtype=torch.int))
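A minimal usage sketch for the new `downsample_profile` helper just added above. This is not taken from the package; the matrix shape and `dropout` value below are illustrative assumptions only:

```python
import torch

from scdataloader.utils import downsample_profile

# toy count matrix: 8 cells x 200 genes (illustrative values only)
counts = torch.poisson(torch.rand(8, 200) * 5)
# re-sample roughly 30% of the reads away and randomly zero out genes
noisy = downsample_profile(counts, dropout=0.3)
print(noisy.shape)  # same (cells, genes) shape as the input, with extra zeros
```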
+
+
+ def createFoldersFor(filepath: str):
  """
  will recursively create folders if needed until having all the folders required to save the file in this filepath
  """
@@ -24,19 +65,22 @@ def createFoldersFor(filepath):
  os.mkdir(prevval)


- def _fetchFromServer(ensemble_server, attributes):
+ def _fetchFromServer(
+ ensemble_server: str, attributes: list, database: str = "hsapiens_gene_ensembl"
+ ):
  """
  Fetches data from the specified ensemble server.

  Args:
  ensemble_server (str): The URL of the ensemble server to fetch data from.
  attributes (list): The list of attributes to fetch from the server.
+ database (str): The database to fetch data from.

  Returns:
  pd.DataFrame: A pandas DataFrame containing the fetched data.
  """
  server = BiomartServer(ensemble_server)
- ensmbl = server.datasets["hsapiens_gene_ensembl"]
+ ensmbl = server.datasets[database]
  print(attributes)
  res = pd.read_csv(
  io.StringIO(
@@ -48,11 +92,12 @@ def _fetchFromServer(ensemble_server, attributes):


  def getBiomartTable(
- ensemble_server="http://jul2023.archive.ensembl.org/biomart",
- useCache=False,
- cache_folder="/tmp/biomart/",
- attributes=[],
- bypass_attributes=False,
+ ensemble_server: str = "http://jul2023.archive.ensembl.org/biomart",
+ useCache: bool = False,
+ cache_folder: str = "/tmp/biomart/",
+ attributes: List[str] = [],
+ bypass_attributes: bool = False,
+ database: str = "hsapiens_gene_ensembl",
  ):
  """generate a genelist dataframe from ensembl's biomart

@@ -60,6 +105,9 @@ def getBiomartTable(
  ensemble_server (str, optional): the biomart server. Defaults to "http://jul2023.archive.ensembl.org/biomart".
  useCache (bool, optional): whether to use the cache or not. Defaults to False.
  cache_folder (str, optional): the cache folder. Defaults to "/tmp/biomart/".
+ attributes (List[str], optional): the attributes to fetch. Defaults to [].
+ bypass_attributes (bool, optional): whether to bypass the attributes or not. Defaults to False.
+ database (str, optional): the database to fetch from. Defaults to "hsapiens_gene_ensembl".

  Raises:
  ValueError: should be a dataframe (when the result from the server is something else)
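`getBiomartTable` (via `_fetchFromServer`) now accepts a `database` argument. A hedged sketch of how a call against the mouse dataset might look; the extra attribute names below are standard Ensembl biomart fields and are only illustrative:

```python
from scdataloader.utils import getBiomartTable

# hypothetical call: fetch a mouse gene table instead of the default human one
genes = getBiomartTable(
    attributes=["start_position", "end_position"],  # assumed biomart attribute names
    database="mmusculus_gene_ensembl",
)
print(genes.head())
```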
@@ -88,21 +136,22 @@ def getBiomartTable(
  else:
  print("downloading gene names from biomart")

- res = _fetchFromServer(ensemble_server, attr + attributes)
+ res = _fetchFromServer(ensemble_server, attr + attributes, database=database)
  res.to_csv(cachefile, index=False)

  res.columns = attr + attributes
  if type(res) is not type(pd.DataFrame()):
  raise ValueError("should be a dataframe")
- res = res[~(res["ensembl_gene_id"].isna() & res["hgnc_symbol"].isna())]
- res.loc[res[res.hgnc_symbol.isna()].index, "hgnc_symbol"] = res[
- res.hgnc_symbol.isna()
- ]["ensembl_gene_id"]
-
+ res = res[~(res["ensembl_gene_id"].isna())]
+ if "hgnc_symbol" in res.columns:
+ res = res[res["hgnc_symbol"].isna()]
+ res.loc[res[res.hgnc_symbol.isna()].index, "hgnc_symbol"] = res[
+ res.hgnc_symbol.isna()
+ ]["ensembl_gene_id"]
  return res


- def validate(adata, organism):
+ def validate(adata: AnnData, organism: str):
  """
  validate checks if the adata object is valid for lamindb

@@ -144,9 +193,6 @@ def validate(adata, organism):
  raise ValueError(
  f"Column '{val}' is missing in the provided anndata object."
  )
- bionty_source = bt.PublicSource.filter(
- entity="DevelopmentalStage", organism=organism
- ).one()

  if not bt.Ethnicity.validate(
  adata.obs["self_reported_ethnicity_ontology_term_id"],
@@ -169,14 +215,10 @@ def validate(adata, organism):
  adata.obs["cell_type_ontology_term_id"], field="ontology_id"
  ).all():
  raise ValueError("Invalid cell type ontology term id found")
- if (
- not bt.DevelopmentalStage.filter(bionty_source=bionty_source)
- .validate(
- adata.obs["development_stage_ontology_term_id"],
- field="ontology_id",
- )
- .all()
- ):
+ if not bt.DevelopmentalStage.validate(
+ adata.obs["development_stage_ontology_term_id"],
+ field="ontology_id",
+ ).all():
  raise ValueError("Invalid dev stage ontology term id found")
  if not bt.Tissue.validate(
  adata.obs["tissue_ontology_term_id"], field="ontology_id"
@@ -186,18 +228,16 @@ def validate(adata, organism):
  adata.obs["assay_ontology_term_id"], field="ontology_id"
  ).all():
  raise ValueError("Invalid assay ontology term id found")
- if (
- not bt.Gene.filter(organism=bt.settings.organism)
- .validate(adata.var.index, field="ensembl_gene_id")
- .all()
- ):
+ if not bt.Gene.validate(
+ adata.var.index, field="ensembl_gene_id", organism=organism
+ ).all():
  raise ValueError("Invalid gene ensembl id found")
  return True


  # setting a cache of 200 elements
  # @lru_cache(maxsize=200)
- def get_all_ancestors(val, df):
+ def get_all_ancestors(val: str, df: pd.DataFrame):
  if val not in df.index:
  return set()
  parents = df.loc[val].parents__ontology_id
@@ -207,7 +247,17 @@ def get_all_ancestors(val, df):
  return set.union(set(parents), *[get_all_ancestors(val, df) for val in parents])


- def get_ancestry_mapping(all_elem, onto_df):
+ # setting a cache of 200 elements
+ # @lru_cache(maxsize=200)
+ def get_descendants(val, df):
+ ontos = set(df[df.parents__ontology_id.str.contains(val)].index.tolist())
+ r_onto = set()
+ for onto in ontos:
+ r_onto |= get_descendants(onto, df)
+ return r_onto | ontos
+
+
+ def get_ancestry_mapping(all_elem: list, onto_df: pd.DataFrame):
  """
  This function generates a mapping of all elements to their ancestors in the ontology dataframe.

@@ -242,12 +292,12 @@ def get_ancestry_mapping(all_elem, onto_df):


  def load_dataset_local(
- remote_dataset,
- download_folder,
- name,
- description,
- use_cache=True,
- only=None,
+ remote_dataset: ln.Collection,
+ download_folder: str,
+ name: str,
+ description: str,
+ use_cache: bool = True,
+ only: Optional[List[int]] = None,
  ):
  """
  This function loads a remote lamindb dataset to local.
@@ -303,7 +353,7 @@ def load_dataset_local(
  return dataset


- def load_genes(organisms):
+ def load_genes(organisms: Union[str, list] = "NCBITaxon:9606"): # "NCBITaxon:10090",
  organismdf = []
  if type(organisms) == str:
  organisms = [organisms]
@@ -313,7 +363,7 @@ def load_genes(organisms):
  ).df()
  genesdf = genesdf[~genesdf["public_source_id"].isna()]
  genesdf = genesdf.drop_duplicates(subset="ensembl_gene_id")
- genesdf = genesdf.set_index("ensembl_gene_id")
+ genesdf = genesdf.set_index("ensembl_gene_id").sort_index()
  # mitochondrial genes
  genesdf["mt"] = genesdf.symbol.astype(str).str.startswith("MT-")
  # ribosomal genes
@@ -326,14 +376,14 @@ def load_genes(organisms):


  def populate_my_ontology(
- organisms=["NCBITaxon:10090", "NCBITaxon:9606"],
- sex=["PATO:0000384", "PATO:0000383"],
- celltypes=[],
- ethnicities=[],
- assays=[],
- tissues=[],
- diseases=[],
- dev_stages=[],
+ organisms: List[str] = ["NCBITaxon:10090", "NCBITaxon:9606"],
+ sex: List[str] = ["PATO:0000384", "PATO:0000383"],
+ celltypes: List[str] = [],
+ ethnicities: List[str] = [],
+ assays: List[str] = [],
+ tissues: List[str] = [],
+ diseases: List[str] = [],
+ dev_stages: List[str] = [],
  ):
  """
  creates a local version of the lamin ontologies and add the required missing values in base ontologies
@@ -360,20 +410,20 @@ def populate_my_ontology(
  dev_stages (list, optional): List of developmental stages. Defaults to [].
  """

- names = bt.CellType.from_public().df().index if not celltypes else celltypes
+ names = bt.CellType.public().df().index if not celltypes else celltypes
  records = bt.CellType.from_values(names, field="ontology_id")
- ln.save(records)
+ ln.save(records, parents=bool(celltypes))
  bt.CellType(name="unknown", ontology_id="unknown").save()
  # Organism
- names = bt.Organism.from_public().df().index if not organisms else organisms
+ names = bt.Organism.public().df().index if not organisms else organisms
  records = [
  i[0] if type(i) is list else i
  for i in [bt.Organism.from_public(ontology_id=i) for i in names]
  ]
- ln.save(records)
+ ln.save(records, parents=bool(organisms))
  bt.Organism(name="unknown", ontology_id="unknown").save()
  # Phenotype
- names = bt.Phenotype.from_public().df().index if not sex else sex
+ names = bt.Phenotype.public().df().index if not sex else sex
  records = [
  bt.Phenotype.from_public(
  ontology_id=i,
@@ -383,38 +433,47 @@ def populate_my_ontology(
  )
  for i in names
  ]
- ln.save(records)
+ ln.save(records, parents=bool(sex))
  bt.Phenotype(name="unknown", ontology_id="unknown").save()
  # ethnicity
- names = bt.Ethnicity.from_public().df().index if not ethnicities else ethnicities
+ names = bt.Ethnicity.public().df().index if not ethnicities else ethnicities
  records = bt.Ethnicity.from_values(names, field="ontology_id")
- ln.save(records)
+ ln.save(records, parents=bool(ethnicities))
  bt.Ethnicity(
  name="unknown", ontology_id="unknown"
  ).save() # multi ethnic will have to get renamed
  # ExperimentalFactor
- names = bt.ExperimentalFactor.from_public().df().index if not assays else assays
+ names = bt.ExperimentalFactor.public().df().index if not assays else assays
  records = bt.ExperimentalFactor.from_values(names, field="ontology_id")
- ln.save(records)
+ ln.save(records, parents=bool(assays))
  bt.ExperimentalFactor(name="unknown", ontology_id="unknown").save()
  # lookup = bt.ExperimentalFactor.lookup()
  # lookup.smart_seq_v4.parents.add(lookup.smart_like)
  # Tissue
- names = bt.Tissue.from_public().df().index if not tissues else tissues
+ names = bt.Tissue.public().df().index if not tissues else tissues
  records = bt.Tissue.from_values(names, field="ontology_id")
- ln.save(records)
+ ln.save(records, parents=bool(tissues))
  bt.Tissue(name="unknown", ontology_id="unknown").save()
  # DevelopmentalStage
- names = (
- bt.DevelopmentalStage.from_public().df().index if not dev_stages else dev_stages
- )
+ names = bt.DevelopmentalStage.public().df().index if not dev_stages else dev_stages
  records = bt.DevelopmentalStage.from_values(names, field="ontology_id")
- ln.save(records)
+ ln.save(records, parents=bool(dev_stages))
  bt.DevelopmentalStage(name="unknown", ontology_id="unknown").save()
+
+ names = bt.DevelopmentalStage.public(organism="mouse").df().name
+ bionty_source = bt.PublicSource.filter(
+ entity="DevelopmentalStage", organism="mouse"
+ ).one()
+ records = [
+ bt.DevelopmentalStage.from_public(name=i, public_source=bionty_source)
+ for i in names.tolist()
+ ]
+ records[-4] = records[-4][0]
+ ln.save(records)
  # Disease
- names = bt.Disease.from_public().df().index if not diseases else diseases
+ names = bt.Disease.public().df().index if not diseases else diseases
  records = bt.Disease.from_values(names, field="ontology_id")
- ln.save(records)
+ ln.save(records, parents=bool(diseases))
  bt.Disease(name="normal", ontology_id="PATO:0000461").save()
  bt.Disease(name="unknown", ontology_id="unknown").save()
  # genes
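`populate_my_ontology` now uses the `.public()` accessors and forwards a `parents=` flag to `ln.save()`. A minimal sketch of a restricted call, assuming a lamindb instance initialized with the bionty schema; the argument values are illustrative, not prescribed by the package:

```python
from scdataloader.utils import populate_my_ontology

# populate only human organism records and the two sex terms;
# empty lists (the defaults) would pull the full public ontologies instead
populate_my_ontology(
    organisms=["NCBITaxon:9606"],
    sex=["PATO:0000384", "PATO:0000383"],
)
```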
@@ -430,7 +489,7 @@ def populate_my_ontology(
  ln.save(records)


- def is_outlier(adata, metric: str, nmads: int):
+ def is_outlier(adata: AnnData, metric: str, nmads: int):
  """
  is_outlier detects outliers in adata.obs[metric]

@@ -449,7 +508,7 @@ def is_outlier(adata, metric: str, nmads: int):
  return outlier


- def length_normalize(adata, gene_lengths):
+ def length_normalize(adata: AnnData, gene_lengths: list):
  """
  length_normalize normalizes the counts by the gene length

@@ -464,7 +523,7 @@ def length_normalize(adata, gene_lengths):
  return adata


- def pd_load_cached(url, loc="/tmp/", cache=True, **kwargs):
+ def pd_load_cached(url: str, loc: str = "/tmp/", cache: bool = True, **kwargs):
  """
  pd_load_cached downloads a file from a url and loads it as a pandas dataframe

@@ -482,3 +541,36 @@ def pd_load_cached(url, loc="/tmp/", cache=True, **kwargs):
  urllib.request.urlretrieve(url, loc)
  # Load the data from the file
  return pd.read_csv(loc, **kwargs)
+
+
+ def translate(
+ val: Union[str, list, set, Counter, dict], t: str = "cell_type_ontology_term_id"
+ ):
+ """
+ translate translates the ontology term id to the name
+
+ Args:
+ val (str, dict, set, list, dict): the object to translate
+ t (flat, optional): the type of ontology terms.
+ one of cell_type_ontology_term_id, assay_ontology_term_id, tissue_ontology_term_id.
+ Defaults to "cell_type_ontology_term_id".
+
+ Returns:
+ dict: the mapping for the translation
+ """
+ if t == "cell_type_ontology_term_id":
+ obj = bt.CellType.public(organism="all")
+ elif t == "assay_ontology_term_id":
+ obj = bt.ExperimentalFactor.public()
+ elif t == "tissue_ontology_term_id":
+ obj = bt.Tissue.public()
+ else:
+ return None
+ if type(val) is str:
+ return {val: obj.search(val, field=obj.ontology_id).name.iloc[0]}
+ elif type(val) is list or type(val) is set:
+ return {i: obj.search(i, field=obj.ontology_id).name.iloc[0] for i in set(val)}
+ elif type(val) is dict or type(val) is Counter:
+ return {
+ obj.search(k, field=obj.ontology_id).name.iloc[0]: v for k, v in val.items()
+ }
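The new `translate` helper maps ontology term IDs to readable names through the bionty public ontologies. A small sketch, assuming a configured lamindb/bionty instance; the term IDs below are ordinary Cell Ontology and Uberon IDs chosen for illustration:

```python
from scdataloader.utils import translate

# single term -> {"CL:0000057": "fibroblast"}
print(translate("CL:0000057", t="cell_type_ontology_term_id"))

# a set (or list) of tissue terms is translated the same way
print(translate({"UBERON:0002107", "UBERON:0000955"}, t="tissue_ontology_term_id"))
```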
scdataloader-0.0.3.dist-info/METADATA → scdataloader-1.0.1.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: scdataloader
- Version: 0.0.3
+ Version: 1.0.1
  Summary: a dataloader for single cell data in lamindb
  Home-page: https://github.com/jkobject/scDataLoader
  License: GPL3
@@ -34,12 +34,16 @@ Description-Content-Type: text/markdown

  [![codecov](https://codecov.io/gh/jkobject/scDataLoader/branch/main/graph/badge.svg?token=scDataLoader_token_here)](https://codecov.io/gh/jkobject/scDataLoader)
  [![CI](https://github.com/jkobject/scDataLoader/actions/workflows/main.yml/badge.svg)](https://github.com/jkobject/scDataLoader/actions/workflows/main.yml)
+ [![PyPI version](https://badge.fury.io/py/scDataLoader.svg)](https://badge.fury.io/py/scDataLoader)
+ [![Documentation Status](https://readthedocs.org/projects/scDataLoader/badge/?version=latest)](https://scDataLoader.readthedocs.io/en/latest/?badge=latest)
+ [![Downloads](https://pepy.tech/badge/scDataLoader)](https://pepy.tech/project/scDataLoader)
+ [![Downloads](https://pepy.tech/badge/scDataLoader/month)](https://pepy.tech/project/scDataLoader)
+ [![Downloads](https://pepy.tech/badge/scDataLoader/week)](https://pepy.tech/project/scDataLoader)
+ [![GitHub issues](https://img.shields.io/github/issues/jkobject/scDataLoader)](https://img.shields.io/github/issues/jkobject/scDataLoader)
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
+ [![DOI](https://img.shields.io/badge/DOI-10.1101%2F2024.07.29.605556-blue)](https://doi.org/10.1101/2024.07.29.605556)

- Awesome single cell dataloader created by @jkobject
-
- built on top of `lamindb` and the `.mapped()` function by Sergey: https://github.com/Koncopd
-
- This data loader is designed to be used with:
+ This single cell pytorch dataloader / lighting datamodule is designed to be used with:

  - [lamindb](https://lamin.ai/)

@@ -55,18 +59,13 @@ It allows you to:
  3. create a more complex single cell dataset
  4. extend it to your need

- ## About
-
- the idea is to use it to train models like scGPT / GeneFormer (and soon, scPrint ;)). It is:
+ built on top of `lamindb` and the `.mapped()` function by Sergey: https://github.com/Koncopd

- 1. loading from lamin
- 2. doing some dataset specific preprocessing if needed
- 3. creating a dataset object on top of .mapped() (that is needed for mapping genes, cell labels etc..)
- 4. passing it to a dataloader object that can work with it correctly
+ ## More

- Currently one would have to use the preprocess function to make the dataset fit for different tools like scGPT / Geneformer. But I would want to enable it through different Collators. This is still missing and a WIP... (please do contribute!)
+ I needed to create this Data Loader for my PhD project. I am using it to load & preprocess thousands of datasets containing millions of cells in a few seconds. I believed that individuals employing AI for single-cell RNA sequencing and other sequencing datasets would eagerly utilize and desire such a tool, which presently does not exist.

- ![](docs/scdataloader.drawio.png)
+ ![scdataloader.drawio.png](docs/scdataloader.drawio.png)

  ## Install it from PyPI

@@ -85,15 +84,85 @@ then run the notebooks with the poetry installed environment

  ## Usage

- see the notebooks in [docs](https://jkobject.github.io/scDataLoader/):
+ ### Direct Usage
+
+ ```python
+ # initialize a local lamin database
+ # !lamin init --storage ~/scdataloader --schema bionty
+
+ from scdataloader import utils
+ from scdataloader.preprocess import LaminPreprocessor, additional_postprocess, additional_preprocess
+
+ # preprocess datasets
+ DESCRIPTION='preprocessed by scDataLoader'
+
+ cx_dataset = ln.Collection.using(instance="laminlabs/cellxgene").filter(name="cellxgene-census", version='2023-12-15').one()
+ cx_dataset, len(cx_dataset.artifacts.all())

- 1. [load a dataset](https://jkobject.github.io/scDataLoader/notebooks/01_load_dataset.html)
- 2. [create a dataset](https://jkobject.github.io/scDataLoader/notebooks/02_create_dataset.html)
+
+ do_preprocess = LaminPreprocessor(additional_postprocess=additional_postprocess, additional_preprocess=additional_preprocess, skip_validate=True, subset_hvg=0)
+
+ preprocessed_dataset = do_preprocess(cx_dataset, name=DESCRIPTION, description=DESCRIPTION, start_at=6, version="2")
+
+ # create dataloaders
+ from scdataloader import DataModule
+ import tqdm
+
+ datamodule = DataModule(
+ collection_name="preprocessed dataset",
+ organisms=["NCBITaxon:9606"], #organism that we will work on
+ how="most expr", # for the collator (most expr genes only will be selected)
+ max_len=1000, # only the 1000 most expressed
+ batch_size=64,
+ num_workers=1,
+ validation_split=0.1,
+ test_split=0)
+
+ for i in tqdm.tqdm(datamodule.train_dataloader()):
+ # pass #or do pass
+ print(i)
+ break
+
+ # with lightning:
+ # Trainer(model, datamodule)
+
+ ```
+
+ see the notebooks in [docs](https://www.jkobject.com/scDataLoader/):
+
+ 1. [load a dataset](https://www.jkobject.com/scDataLoader/notebooks/1_download_and_preprocess/)
+ 2. [create a dataset](https://www.jkobject.com/scDataLoader/notebooks/2_create_dataloader/)
+
+ ### command line preprocessing
+
+ You can use the command line to preprocess a large database of datasets like here for cellxgene. this allows parallelizing and easier usage.
+
+ ```bash
+ scdataloader --instance "laminlabs/cellxgene" --name "cellxgene-census" --version "2023-12-15" --description "preprocessed for scprint" --new_name "scprint main" --start_at 10 >> scdataloader.out
+ ```
+
+ ### command line usage
+
+ The main way to use
+
+ > please refer to the [scPRINT documentation](https://www.jkobject.com/scPRINT/) and [lightning documentation](https://lightning.ai/docs/pytorch/stable/cli/lightning_cli_intermediate.html) for more information on command line usage

  ## Development

  Read the [CONTRIBUTING.md](CONTRIBUTING.md) file.

+ ## License
+
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+ ## Acknowledgments
+
+ - [lamin.ai](https://lamin.ai/)
+ - [scanpy](https://scanpy.readthedocs.io/en/stable/)
+ - [anndata](https://anndata.readthedocs.io/en/latest/)
+ - [scprint](https://www.jkobject.com/scPRINT/)
+
+ Awesome single cell dataloader created by @jkobject
  GNU GENERAL PUBLIC LICENSE
  Version 3, 29 June 2007

scdataloader-1.0.1.dist-info/RECORD ADDED
@@ -0,0 +1,16 @@
+ scdataloader/VERSION,sha256=WYVJhIUxBN9cNT4vaBoV_HkkdC-aLkaMKa8kjc5FzgM,6
+ scdataloader/__init__.py,sha256=NIlE4oTUPRZ3uSW_maozoEHp470I7PV1vMOJ4XpSmL4,122
+ scdataloader/__main__.py,sha256=db_upDq3tNEtcDH17mPIczToAqGkSKfLy0Qbj6B4YmE,6385
+ scdataloader/base.py,sha256=M1gD59OffRdLOgS1vHKygOomUoAMuzjpRtAfM3SBKF8,338
+ scdataloader/collator.py,sha256=zkFdxirTDub1dJ1OJXO0p48kvd2r2ncKMdevAKIdTTc,13447
+ scdataloader/config.py,sha256=rrW2DZxG4J2_pmpDbXXsaKJkpNC57w5dIlItiFbANYw,2905
+ scdataloader/data.py,sha256=VugtHo9T9PqoJSv3lkJJAB89KD-fRwdVw1D76gnCc9c,12584
+ scdataloader/datamodule.py,sha256=WLEWcDMcC1G3VD5tORfhfqRRHcTscpI0EzPikg3udbI,16881
+ scdataloader/mapped.py,sha256=yF9l3obuRWbQjW8QZGRSKhc50fizXTWf3Pe1m542fW8,19481
+ scdataloader/preprocess.py,sha256=noynYWuy9clhFu9UnN-vSvAHJHwakDttkI5aj1e_T98,29055
+ scdataloader/utils.py,sha256=xyDsWaqkjhzlVBP8FiYdBUWHsel3twcVWmI53PhKqTM,21888
+ scdataloader-1.0.1.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+ scdataloader-1.0.1.dist-info/METADATA,sha256=2Xd8M1dq_JmvmFjmrrzn-1U4eOtwU6L51Y_7MCkGxvY,41327
+ scdataloader-1.0.1.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
+ scdataloader-1.0.1.dist-info/entry_points.txt,sha256=nLqucZaa5wiF7-1FCgMXO916WDQ9Qm0TcxQp0f1DwE4,59
+ scdataloader-1.0.1.dist-info/RECORD,,