scdataloader 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scdataloader/utils.py CHANGED
@@ -11,9 +11,14 @@ from django.db import IntegrityError
  from scipy.sparse import csr_matrix
  from scipy.stats import median_abs_deviation
  from functools import lru_cache
+ from collections import Counter

+ from typing import Union, List, Optional

- def createFoldersFor(filepath):
+ from anndata import AnnData
+
+
+ def createFoldersFor(filepath: str):
  """
  will recursively create folders if needed until having all the folders required to save the file in this filepath
  """
@@ -24,7 +29,9 @@ def createFoldersFor(filepath)
  os.mkdir(prevval)


- def _fetchFromServer(ensemble_server, attributes):
+ def _fetchFromServer(
+ ensemble_server: str, attributes: list, database: str = "hsapiens_gene_ensembl"
+ ):
  """
  Fetches data from the specified ensemble server.

@@ -36,7 +43,7 @@ def _fetchFromServer(ensemble_server, attributes)
  pd.DataFrame: A pandas DataFrame containing the fetched data.
  """
  server = BiomartServer(ensemble_server)
- ensmbl = server.datasets["hsapiens_gene_ensembl"]
+ ensmbl = server.datasets[database]
  print(attributes)
  res = pd.read_csv(
  io.StringIO(
@@ -48,11 +55,12 @@ def _fetchFromServer(ensemble_server, attributes)


  def getBiomartTable(
- ensemble_server="http://jul2023.archive.ensembl.org/biomart",
- useCache=False,
- cache_folder="/tmp/biomart/",
- attributes=[],
- bypass_attributes=False,
+ ensemble_server: str = "http://jul2023.archive.ensembl.org/biomart",
+ useCache: bool = False,
+ cache_folder: str = "/tmp/biomart/",
+ attributes: List[str] = [],
+ bypass_attributes: bool = False,
+ database: str = "hsapiens_gene_ensembl",
  ):
  """generate a genelist dataframe from ensembl's biomart

@@ -88,7 +96,7 @@ def getBiomartTable(
  else:
  print("downloading gene names from biomart")

- res = _fetchFromServer(ensemble_server, attr + attributes)
+ res = _fetchFromServer(ensemble_server, attr + attributes, database=database)
  res.to_csv(cachefile, index=False)

  res.columns = attr + attributes
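For orientation, a minimal sketch of what the new `database` parameter enables. The mouse dataset name below is an assumption about Ensembl Biomart's naming, not something stated in this diff, and a reachable Biomart server is required:

```python
from scdataloader.utils import getBiomartTable

# default behaviour is unchanged: the human dataset is used
human_table = getBiomartTable()

# hypothetical: query another Ensembl Biomart dataset instead (dataset name
# assumed; organism-specific attributes may also need to be adjusted)
mouse_table = getBiomartTable(database="mmusculus_gene_ensembl")
```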
@@ -102,7 +110,7 @@ def getBiomartTable(
  return res


- def validate(adata, organism):
+ def validate(adata: AnnData, organism: str):
  """
  validate checks if the adata object is valid for lamindb

@@ -144,9 +152,6 @@ def validate(adata, organism)
  raise ValueError(
  f"Column '{val}' is missing in the provided anndata object."
  )
- bionty_source = bt.PublicSource.filter(
- entity="DevelopmentalStage", organism=organism
- ).one()

  if not bt.Ethnicity.validate(
  adata.obs["self_reported_ethnicity_ontology_term_id"],
@@ -169,14 +174,10 @@ def validate(adata, organism)
  adata.obs["cell_type_ontology_term_id"], field="ontology_id"
  ).all():
  raise ValueError("Invalid cell type ontology term id found")
- if (
- not bt.DevelopmentalStage.filter(bionty_source=bionty_source)
- .validate(
- adata.obs["development_stage_ontology_term_id"],
- field="ontology_id",
- )
- .all()
- ):
+ if not bt.DevelopmentalStage.validate(
+ adata.obs["development_stage_ontology_term_id"],
+ field="ontology_id",
+ ).all():
  raise ValueError("Invalid dev stage ontology term id found")
  if not bt.Tissue.validate(
  adata.obs["tissue_ontology_term_id"], field="ontology_id"
@@ -186,18 +187,16 @@ def validate(adata, organism)
  adata.obs["assay_ontology_term_id"], field="ontology_id"
  ).all():
  raise ValueError("Invalid assay ontology term id found")
- if (
- not bt.Gene.filter(organism=bt.settings.organism)
- .validate(adata.var.index, field="ensembl_gene_id")
- .all()
- ):
+ if not bt.Gene.validate(
+ adata.var.index, field="ensembl_gene_id", organism=organism
+ ).all():
  raise ValueError("Invalid gene ensembl id found")
  return True


  # setting a cache of 200 elements
  # @lru_cache(maxsize=200)
- def get_all_ancestors(val, df):
+ def get_all_ancestors(val: str, df: pd.DataFrame):
  if val not in df.index:
  return set()
  parents = df.loc[val].parents__ontology_id
@@ -207,7 +206,7 @@ def get_all_ancestors(val, df)
  return set.union(set(parents), *[get_all_ancestors(val, df) for val in parents])


- def get_ancestry_mapping(all_elem, onto_df):
+ def get_ancestry_mapping(all_elem: list, onto_df: pd.DataFrame):
  """
  This function generates a mapping of all elements to their ancestors in the ontology dataframe.

@@ -242,12 +241,12 @@ def get_ancestry_mapping(all_elem, onto_df)


  def load_dataset_local(
- remote_dataset,
- download_folder,
- name,
- description,
- use_cache=True,
- only=None,
+ remote_dataset: ln.Collection,
+ download_folder: str,
+ name: str,
+ description: str,
+ use_cache: bool = True,
+ only: Optional[List[int]] = None,
  ):
  """
  This function loads a remote lamindb dataset to local.
@@ -303,7 +302,7 @@ def load_dataset_local(
  return dataset


- def load_genes(organisms):
+ def load_genes(organisms: Union[str, list] = "NCBITaxon:9606"): # "NCBITaxon:10090",
  organismdf = []
  if type(organisms) == str:
  organisms = [organisms]
@@ -313,7 +312,7 @@ def load_genes(organisms)
  ).df()
  genesdf = genesdf[~genesdf["public_source_id"].isna()]
  genesdf = genesdf.drop_duplicates(subset="ensembl_gene_id")
- genesdf = genesdf.set_index("ensembl_gene_id")
+ genesdf = genesdf.set_index("ensembl_gene_id").sort_index()
  # mitochondrial genes
  genesdf["mt"] = genesdf.symbol.astype(str).str.startswith("MT-")
  # ribosomal genes
@@ -326,14 +325,14 @@ def load_genes(organisms)


  def populate_my_ontology(
- organisms=["NCBITaxon:10090", "NCBITaxon:9606"],
- sex=["PATO:0000384", "PATO:0000383"],
- celltypes=[],
- ethnicities=[],
- assays=[],
- tissues=[],
- diseases=[],
- dev_stages=[],
+ organisms: List[str] = ["NCBITaxon:10090", "NCBITaxon:9606"],
+ sex: List[str] = ["PATO:0000384", "PATO:0000383"],
+ celltypes: List[str] = [],
+ ethnicities: List[str] = [],
+ assays: List[str] = [],
+ tissues: List[str] = [],
+ diseases: List[str] = [],
+ dev_stages: List[str] = [],
  ):
  """
  creates a local version of the lamin ontologies and add the required missing values in base ontologies
@@ -360,20 +359,20 @@ def populate_my_ontology(
  dev_stages (list, optional): List of developmental stages. Defaults to [].
  """

- names = bt.CellType.from_public().df().index if not celltypes else celltypes
+ names = bt.CellType.public().df().index if not celltypes else celltypes
  records = bt.CellType.from_values(names, field="ontology_id")
- ln.save(records)
+ ln.save(records, parents=bool(celltypes))
  bt.CellType(name="unknown", ontology_id="unknown").save()
  # Organism
- names = bt.Organism.from_public().df().index if not organisms else organisms
+ names = bt.Organism.public().df().index if not organisms else organisms
  records = [
  i[0] if type(i) is list else i
  for i in [bt.Organism.from_public(ontology_id=i) for i in names]
  ]
- ln.save(records)
+ ln.save(records, parents=bool(organisms))
  bt.Organism(name="unknown", ontology_id="unknown").save()
  # Phenotype
- names = bt.Phenotype.from_public().df().index if not sex else sex
+ names = bt.Phenotype.public().df().index if not sex else sex
  records = [
  bt.Phenotype.from_public(
  ontology_id=i,
@@ -383,38 +382,49 @@ def populate_my_ontology(
  )
  for i in names
  ]
- ln.save(records)
+ ln.save(records, parents=bool(sex))
  bt.Phenotype(name="unknown", ontology_id="unknown").save()
  # ethnicity
- names = bt.Ethnicity.from_public().df().index if not ethnicities else ethnicities
+ names = bt.Ethnicity.public().df().index if not ethnicities else ethnicities
  records = bt.Ethnicity.from_values(names, field="ontology_id")
- ln.save(records)
+ ln.save(records, parents=bool(ethnicities))
  bt.Ethnicity(
  name="unknown", ontology_id="unknown"
  ).save() # multi ethnic will have to get renamed
  # ExperimentalFactor
- names = bt.ExperimentalFactor.from_public().df().index if not assays else assays
+ names = bt.ExperimentalFactor.public().df().index if not assays else assays
  records = bt.ExperimentalFactor.from_values(names, field="ontology_id")
- ln.save(records)
+ ln.save(records, parents=bool(assays))
  bt.ExperimentalFactor(name="unknown", ontology_id="unknown").save()
  # lookup = bt.ExperimentalFactor.lookup()
  # lookup.smart_seq_v4.parents.add(lookup.smart_like)
  # Tissue
- names = bt.Tissue.from_public().df().index if not tissues else tissues
+ names = bt.Tissue.public().df().index if not tissues else tissues
  records = bt.Tissue.from_values(names, field="ontology_id")
- ln.save(records)
+ ln.save(records, parents=bool(tissues))
  bt.Tissue(name="unknown", ontology_id="unknown").save()
  # DevelopmentalStage
  names = (
- bt.DevelopmentalStage.from_public().df().index if not dev_stages else dev_stages
+ bt.DevelopmentalStage.public().df().index if not dev_stages else dev_stages
  )
  records = bt.DevelopmentalStage.from_values(names, field="ontology_id")
- ln.save(records)
+ ln.save(records, parents=bool(dev_stages))
  bt.DevelopmentalStage(name="unknown", ontology_id="unknown").save()
+
+ names = bt.DevelopmentalStage.public(organism="mouse").df().name
+ bionty_source = bt.PublicSource.filter(
+ entity="DevelopmentalStage", organism="mouse"
+ ).one()
+ records = [
+ bt.DevelopmentalStage.from_public(name=i, public_source=bionty_source)
+ for i in names.tolist()
+ ]
+ records[-4] = records[-4][0]
+ ln.save(records)
  # Disease
- names = bt.Disease.from_public().df().index if not diseases else diseases
+ names = bt.Disease.public().df().index if not diseases else diseases
  records = bt.Disease.from_values(names, field="ontology_id")
- ln.save(records)
+ ln.save(records, parents=bool(diseases))
  bt.Disease(name="normal", ontology_id="PATO:0000461").save()
  bt.Disease(name="unknown", ontology_id="unknown").save()
  # genes
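As a rough illustration of the updated `populate_my_ontology` signature above, a call could look like the sketch below. It assumes an already initialized lamindb instance with the bionty schema; per the code shown, empty lists mean the full public ontology is loaded:

```python
from scdataloader.utils import populate_my_ontology

# load human only plus the two default PATO sex terms; the other ontologies
# (cell types, tissues, ...) default to their full public versions
populate_my_ontology(
    organisms=["NCBITaxon:9606"],
    sex=["PATO:0000384", "PATO:0000383"],
)
```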
@@ -430,7 +440,7 @@ def populate_my_ontology(
  ln.save(records)


- def is_outlier(adata, metric: str, nmads: int):
+ def is_outlier(adata: AnnData, metric: str, nmads: int):
  """
  is_outlier detects outliers in adata.obs[metric]

@@ -449,7 +459,7 @@ def is_outlier(adata, metric: str, nmads: int)
  return outlier


- def length_normalize(adata, gene_lengths):
+ def length_normalize(adata: AnnData, gene_lengths: list):
  """
  length_normalize normalizes the counts by the gene length

@@ -464,7 +474,7 @@ def length_normalize(adata, gene_lengths)
  return adata


- def pd_load_cached(url, loc="/tmp/", cache=True, **kwargs):
+ def pd_load_cached(url: str, loc: str = "/tmp/", cache: bool = True, **kwargs):
  """
  pd_load_cached downloads a file from a url and loads it as a pandas dataframe

@@ -482,3 +492,36 @@ def pd_load_cached(url, loc="/tmp/", cache=True, **kwargs)
  urllib.request.urlretrieve(url, loc)
  # Load the data from the file
  return pd.read_csv(loc, **kwargs)
+
+
+ def translate(
+ val: Union[str, list, set, Counter, dict], t: str = "cell_type_ontology_term_id"
+ ):
+ """
+ translate translates the ontology term id to the name
+
+ Args:
+ val (str, dict, set, list, dict): the object to translate
+ t (flat, optional): the type of ontology terms.
+ one of cell_type_ontology_term_id, assay_ontology_term_id, tissue_ontology_term_id.
+ Defaults to "cell_type_ontology_term_id".
+
+ Returns:
+ dict: the mapping for the translation
+ """
+ if t == "cell_type_ontology_term_id":
+ obj = bt.CellType.public(organism="all")
+ elif t == "assay_ontology_term_id":
+ obj = bt.ExperimentalFactor.public()
+ elif t == "tissue_ontology_term_id":
+ obj = bt.Tissue.public()
+ else:
+ return None
+ if type(val) is str:
+ return {val: obj.search(val, field=obj.ontology_id).name.iloc[0]}
+ elif type(val) is list or type(val) is set:
+ return {i: obj.search(i, field=obj.ontology_id).name.iloc[0] for i in set(val)}
+ elif type(val) is dict or type(val) is Counter:
+ return {
+ obj.search(k, field=obj.ontology_id).name.iloc[0]: v for k, v in val.items()
+ }
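A minimal usage sketch for the newly added `translate` helper. The ontology ids are illustrative, and a lamindb/bionty setup with the public ontologies available is assumed:

```python
from collections import Counter
from scdataloader.utils import translate

# single term id -> {id: name}
translate("CL:0000236", t="cell_type_ontology_term_id")

# list/set of term ids -> one entry per unique id
translate({"CL:0000236", "CL:0000057"}, t="cell_type_ontology_term_id")

# Counter/dict of term ids -> counts re-keyed by the resolved names
translate(Counter({"UBERON:0002048": 12}), t="tissue_ontology_term_id")
```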
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: scdataloader
- Version: 0.0.3
+ Version: 0.0.4
  Summary: a dataloader for single cell data in lamindb
  Home-page: https://github.com/jkobject/scDataLoader
  License: GPL3
@@ -34,6 +34,8 @@ Description-Content-Type: text/markdown

  [![codecov](https://codecov.io/gh/jkobject/scDataLoader/branch/main/graph/badge.svg?token=scDataLoader_token_here)](https://codecov.io/gh/jkobject/scDataLoader)
  [![CI](https://github.com/jkobject/scDataLoader/actions/workflows/main.yml/badge.svg)](https://github.com/jkobject/scDataLoader/actions/workflows/main.yml)
+ [![DOI](https://zenodo.org/badge/731248665.svg)](https://zenodo.org/doi/10.5281/zenodo.10573143)
+

  Awesome single cell dataloader created by @jkobject

@@ -66,7 +68,7 @@ the idea is to use it to train models like scGPT / GeneFormer (and soon, scPrint

  Currently one would have to use the preprocess function to make the dataset fit for different tools like scGPT / Geneformer. But I would want to enable it through different Collators. This is still missing and a WIP... (please do contribute!)

- ![](docs/scdataloader.drawio.png)
+ ![docs/scdataloader.drawio.png](docs/scdataloader.drawio.png)

  ## Install it from PyPI

@@ -85,6 +87,48 @@ then run the notebooks with the poetry installed environment

  ## Usage

+ ```python
+ # initialize a local lamin database
+ # !lamin init --storage ~/scdataloader --schema bionty
+
+ from scdataloader import utils
+ from scdataloader.preprocess import LaminPreprocessor, additional_postprocess, additional_preprocess
+
+ # preprocess datasets
+ DESCRIPTION='preprocessed by scDataLoader'
+
+ cx_dataset = ln.Collection.using(instance="laminlabs/cellxgene").filter(name="cellxgene-census", version='2023-12-15').one()
+ cx_dataset, len(cx_dataset.artifacts.all())
+
+
+ do_preprocess = LaminPreprocessor(additional_postprocess=additional_postprocess, additional_preprocess=additional_preprocess, skip_validate=True, subset_hvg=0)
+
+ preprocessed_dataset = do_preprocess(cx_dataset, name=DESCRIPTION, description=DESCRIPTION, start_at=6, version="2")
+
+ # create dataloaders
+ from scdataloader import DataModule
+ import tqdm
+
+ datamodule = DataModule(
+ collection_name="preprocessed dataset",
+ organisms=["NCBITaxon:9606"], #organism that we will work on
+ how="most expr", # for the collator (most expr genes only will be selected)
+ max_len=1000, # only the 1000 most expressed
+ batch_size=64,
+ num_workers=1,
+ validation_split=0.1,
+ test_split=0)
+
+ for i in tqdm.tqdm(datamodule.train_dataloader()):
+ # pass #or do pass
+ print(i)
+ break
+
+ # with lightning:
+ # Trainer(model, datamodule)
+
+ ```
+
  see the notebooks in [docs](https://jkobject.github.io/scDataLoader/):

  1. [load a dataset](https://jkobject.github.io/scDataLoader/notebooks/01_load_dataset.html)
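The `# with lightning:` hint in the usage block above can be expanded into a small end-to-end sketch. The toy LightningModule, the `lightning.pytorch` import path, and the assumption that each batch is a dict of tensors are all illustrative rather than part of scDataLoader's documented API:

```python
import torch
import lightning.pytorch as pl  # may be `pytorch_lightning` depending on your install
from scdataloader import DataModule

class TinyModule(pl.LightningModule):
    """Placeholder model: just enough to show the Trainer/DataModule wiring."""

    def __init__(self):
        super().__init__()
        self.scale = torch.nn.Parameter(torch.ones(1))

    def training_step(self, batch, batch_idx):
        # assumption: the collator yields a dict of tensors; grab the first one
        x = next(v for v in batch.values() if torch.is_tensor(v)) if isinstance(batch, dict) else batch
        return (self.scale * x.float()).mean()  # placeholder loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)

datamodule = DataModule(
    collection_name="preprocessed dataset",
    organisms=["NCBITaxon:9606"],
    how="most expr",
    max_len=1000,
    batch_size=64,
)
pl.Trainer(max_epochs=1, limit_train_batches=10).fit(TinyModule(), datamodule=datamodule)
```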
@@ -0,0 +1,16 @@
+ scdataloader/VERSION,sha256=ln2a-xATRmZxZvLnboGRC8GQSI19QdUMoAcunZLwDjI,6
+ scdataloader/__init__.py,sha256=NIlE4oTUPRZ3uSW_maozoEHp470I7PV1vMOJ4XpSmL4,122
+ scdataloader/__main__.py,sha256=UyXtFHgWxE-ecJmM_oEDLlzBDBbH-uEKAVj1A7BkwmM,6297
+ scdataloader/base.py,sha256=M1gD59OffRdLOgS1vHKygOomUoAMuzjpRtAfM3SBKF8,338
+ scdataloader/collator.py,sha256=Ykjdw24GUvHdbowWUDtp28YTkaF3w65SiWTU2PKBzy4,11714
+ scdataloader/config.py,sha256=0_LoIblgdZZ19yM2qvPE-padMGQzdhuaxX20zYrhWq0,2780
+ scdataloader/data.py,sha256=faJWN--06N7irWBKcjeU6fcX5NbzyEPXs2_EVGxfBpw,12292
+ scdataloader/datamodule.py,sha256=OhHPb3jhGG5HbvahzTGxgzJ_lxbVJ4PfZspVW9h7SZk,14789
+ scdataloader/mapped.py,sha256=rhE11Xl3x_wIKu3m_wu8Is6mYsXdblu3nQpT5lNqr60,13301
+ scdataloader/preprocess.py,sha256=67ewe6b4HIjz_vTDjlOAJ4lMe4K2oCw2HHHUS-7S77M,38205
+ scdataloader/utils.py,sha256=6eKU3_cotEaQcxONMrCWzMx7U8DybabteNhk-vNqfUQ,19365
+ scdataloader-0.0.4.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+ scdataloader-0.0.4.dist-info/METADATA,sha256=Bf8UjMwRcqSbWW8VbWrLhSb7qKQYdjZtJ7d6Oz4-rn8,39733
+ scdataloader-0.0.4.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
+ scdataloader-0.0.4.dist-info/entry_points.txt,sha256=nLqucZaa5wiF7-1FCgMXO916WDQ9Qm0TcxQp0f1DwE4,59
+ scdataloader-0.0.4.dist-info/RECORD,,