scdataloader 1.6.3__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scdataloader/utils.py CHANGED
@@ -147,7 +147,7 @@ def getBiomartTable(
147
147
  return res
148
148
 
149
149
 
150
- def validate(adata: AnnData, organism: str):
150
+ def validate(adata: AnnData, organism: str, need_all=True):
151
151
  """
152
152
  validate checks if the adata object is valid for lamindb
153
153
 
@@ -185,7 +185,7 @@ def validate(adata: AnnData, organism: str):
185
185
  "tissue_ontology_term_id",
186
186
  "assay_ontology_term_id",
187
187
  ]:
188
- if val not in adata.obs.columns:
188
+ if val not in adata.obs.columns and need_all:
189
189
  raise ValueError(
190
190
  f"Column '{val}' is missing in the provided anndata object."
191
191
  )
@@ -193,7 +193,9 @@ def validate(adata: AnnData, organism: str):
193
193
  if not bt.Ethnicity.validate(
194
194
  adata.obs["self_reported_ethnicity_ontology_term_id"],
195
195
  field="ontology_id",
196
- ).all():
196
+ ).all() and not set(adata.obs["self_reported_ethnicity_ontology_term_id"]) == set(
197
+ ["unknown"]
198
+ ):
197
199
  raise ValueError("Invalid ethnicity ontology term id found")
198
200
  if not bt.Organism.validate(
199
201
  adata.obs["organism_ontology_term_id"], field="ontology_id"
@@ -201,28 +203,40 @@ def validate(adata: AnnData, organism: str):
201
203
  raise ValueError("Invalid organism ontology term id found")
202
204
  if not bt.Phenotype.validate(
203
205
  adata.obs["sex_ontology_term_id"], field="ontology_id"
204
- ).all():
206
+ ).all() and not set(adata.obs["self_reported_ethnicity_ontology_term_id"]) == set(
207
+ ["unknown"]
208
+ ):
205
209
  raise ValueError("Invalid sex ontology term id found")
206
210
  if not bt.Disease.validate(
207
211
  adata.obs["disease_ontology_term_id"], field="ontology_id"
208
- ).all():
212
+ ).all() and not set(adata.obs["self_reported_ethnicity_ontology_term_id"]) == set(
213
+ ["unknown"]
214
+ ):
209
215
  raise ValueError("Invalid disease ontology term id found")
210
216
  if not bt.CellType.validate(
211
217
  adata.obs["cell_type_ontology_term_id"], field="ontology_id"
212
- ).all():
218
+ ).all() and not set(adata.obs["self_reported_ethnicity_ontology_term_id"]) == set(
219
+ ["unknown"]
220
+ ):
213
221
  raise ValueError("Invalid cell type ontology term id found")
214
222
  if not bt.DevelopmentalStage.validate(
215
223
  adata.obs["development_stage_ontology_term_id"],
216
224
  field="ontology_id",
217
- ).all():
225
+ ).all() and not set(adata.obs["self_reported_ethnicity_ontology_term_id"]) == set(
226
+ ["unknown"]
227
+ ):
218
228
  raise ValueError("Invalid dev stage ontology term id found")
219
229
  if not bt.Tissue.validate(
220
230
  adata.obs["tissue_ontology_term_id"], field="ontology_id"
221
- ).all():
231
+ ).all() and not set(adata.obs["self_reported_ethnicity_ontology_term_id"]) == set(
232
+ ["unknown"]
233
+ ):
222
234
  raise ValueError("Invalid tissue ontology term id found")
223
235
  if not bt.ExperimentalFactor.validate(
224
236
  adata.obs["assay_ontology_term_id"], field="ontology_id"
225
- ).all():
237
+ ).all() and not set(adata.obs["self_reported_ethnicity_ontology_term_id"]) == set(
238
+ ["unknown"]
239
+ ):
226
240
  raise ValueError("Invalid assay ontology term id found")
227
241
  if not bt.Gene.validate(
228
242
  adata.var.index, field="ensembl_gene_id", organism=organism
@@ -378,6 +392,169 @@ def load_genes(organisms: Union[str, list] = "NCBITaxon:9606"): # "NCBITaxon:10
378
392
  ]:
379
393
  if col in organismdf.columns:
380
394
  organismdf.drop(columns=[col], inplace=True)
395
+ # temp fix
396
+ drop = {
397
+ "ENSG00000112096",
398
+ "ENSG00000137808",
399
+ "ENSG00000161149",
400
+ "ENSG00000182230",
401
+ "ENSG00000203812",
402
+ "ENSG00000204092",
403
+ "ENSG00000205485",
404
+ "ENSG00000212951",
405
+ "ENSG00000215271",
406
+ "ENSG00000221995",
407
+ "ENSG00000224739",
408
+ "ENSG00000224745",
409
+ "ENSG00000225178",
410
+ "ENSG00000225932",
411
+ "ENSG00000226377",
412
+ "ENSG00000226380",
413
+ "ENSG00000226403",
414
+ "ENSG00000227021",
415
+ "ENSG00000227220",
416
+ "ENSG00000227902",
417
+ "ENSG00000228139",
418
+ "ENSG00000228206",
419
+ "ENSG00000228906",
420
+ "ENSG00000229352",
421
+ "ENSG00000231575",
422
+ "ENSG00000232196",
423
+ "ENSG00000232295",
424
+ "ENSG00000233776",
425
+ "ENSG00000236166",
426
+ "ENSG00000236673",
427
+ "ENSG00000236740",
428
+ "ENSG00000236886",
429
+ "ENSG00000236996",
430
+ "ENSG00000237133",
431
+ "ENSG00000237513",
432
+ "ENSG00000237548",
433
+ "ENSG00000237838",
434
+ "ENSG00000239446",
435
+ "ENSG00000239467",
436
+ "ENSG00000239665",
437
+ "ENSG00000244693",
438
+ "ENSG00000244952",
439
+ "ENSG00000249860",
440
+ "ENSG00000251044",
441
+ "ENSG00000253878",
442
+ "ENSG00000254561",
443
+ "ENSG00000254740",
444
+ "ENSG00000255633",
445
+ "ENSG00000255823",
446
+ "ENSG00000256045",
447
+ "ENSG00000256222",
448
+ "ENSG00000256374",
449
+ "ENSG00000256427",
450
+ "ENSG00000256618",
451
+ "ENSG00000256863",
452
+ "ENSG00000256892",
453
+ "ENSG00000258414",
454
+ "ENSG00000258808",
455
+ "ENSG00000258861",
456
+ "ENSG00000259444",
457
+ "ENSG00000259820",
458
+ "ENSG00000259834",
459
+ "ENSG00000259855",
460
+ "ENSG00000260461",
461
+ "ENSG00000261068",
462
+ "ENSG00000261438",
463
+ "ENSG00000261490",
464
+ "ENSG00000261534",
465
+ "ENSG00000261737",
466
+ "ENSG00000261773",
467
+ "ENSG00000261963",
468
+ "ENSG00000262668",
469
+ "ENSG00000263464",
470
+ "ENSG00000267637",
471
+ "ENSG00000268955",
472
+ "ENSG00000269028",
473
+ "ENSG00000269900",
474
+ "ENSG00000269933",
475
+ "ENSG00000269966",
476
+ "ENSG00000270188",
477
+ "ENSG00000270394",
478
+ "ENSG00000270672",
479
+ "ENSG00000271043",
480
+ "ENSG00000271409",
481
+ "ENSG00000271734",
482
+ "ENSG00000271870",
483
+ "ENSG00000272040",
484
+ "ENSG00000272196",
485
+ "ENSG00000272267",
486
+ "ENSG00000272354",
487
+ "ENSG00000272370",
488
+ "ENSG00000272551",
489
+ "ENSG00000272567",
490
+ "ENSG00000272880",
491
+ "ENSG00000272904",
492
+ "ENSG00000272934",
493
+ "ENSG00000273301",
494
+ "ENSG00000273370",
495
+ "ENSG00000273496",
496
+ "ENSG00000273576",
497
+ "ENSG00000273614",
498
+ "ENSG00000273837",
499
+ "ENSG00000273888",
500
+ "ENSG00000273923",
501
+ "ENSG00000276612",
502
+ "ENSG00000276814",
503
+ "ENSG00000277050",
504
+ "ENSG00000277077",
505
+ "ENSG00000277352",
506
+ "ENSG00000277666",
507
+ "ENSG00000277761",
508
+ "ENSG00000278198",
509
+ "ENSG00000278782",
510
+ "ENSG00000278927",
511
+ "ENSG00000278955",
512
+ "ENSG00000279226",
513
+ "ENSG00000279765",
514
+ "ENSG00000279769",
515
+ "ENSG00000279948",
516
+ "ENSG00000280058",
517
+ "ENSG00000280095",
518
+ "ENSG00000280250",
519
+ "ENSG00000280346",
520
+ "ENSG00000280374",
521
+ "ENSG00000280710",
522
+ "ENSG00000282080",
523
+ "ENSG00000282246",
524
+ "ENSG00000282965",
525
+ "ENSG00000283486",
526
+ "ENSG00000284299",
527
+ "ENSG00000284741",
528
+ "ENSG00000285106",
529
+ "ENSG00000285162",
530
+ "ENSG00000285476",
531
+ "ENSG00000285762",
532
+ "ENSG00000286065",
533
+ "ENSG00000286228",
534
+ "ENSG00000286601",
535
+ "ENSG00000286699",
536
+ "ENSG00000286949",
537
+ "ENSG00000286996",
538
+ "ENSG00000287116",
539
+ "ENSG00000287388",
540
+ "ENSG00000288541",
541
+ "ENSG00000288546",
542
+ "ENSG00000288630",
543
+ "ENSG00000288639",
544
+ "ENSMUSG00000069518",
545
+ "ENSMUSG00000073682",
546
+ "ENSMUSG00000075014",
547
+ "ENSMUSG00000075015",
548
+ "ENSMUSG00000078091",
549
+ "ENSMUSG00000094958",
550
+ "ENSMUSG00000095547",
551
+ "ENSMUSG00000095891",
552
+ "ENSMUSG00000096385",
553
+ "ENSMUSG00000096519",
554
+ "ENSMUSG00000096923",
555
+ "ENSMUSG00000097078",
556
+ }
557
+ organismdf = organismdf[~organismdf.index.isin(drop)]
381
558
  return organismdf
382
559
 
383
560
 
@@ -419,7 +596,7 @@ def populate_my_ontology(
419
596
  # cell type
420
597
  if celltypes is not None:
421
598
  if len(celltypes) == 0:
422
- bt.CellType.import_from_source(update=True)
599
+ bt.CellType.import_source()
423
600
  else:
424
601
  names = bt.CellType.public().df().index if not celltypes else celltypes
425
602
  records = bt.CellType.from_values(names, field="ontology_id")
@@ -434,9 +611,9 @@ def populate_my_ontology(
434
611
  )
435
612
  source = bt.PublicSource.filter(name="ensembl", organism=organism_clade).last()
436
613
  records = [
437
- i[0] if type(i) is list else i
438
- for i in [
439
- bt.Organism.from_source(ontology_id=i, source=source) for i in names
614
+ organism_or_organismlist if isinstance(organism_or_organismlist, bt.Organism) else organism_or_organismlist[0]
615
+ for organism_or_organismlist in [
616
+ bt.Organism.from_source(ontology_id=name, source=source) for name in names
440
617
  ]
441
618
  ]
442
619
  ln.save(records)
@@ -453,7 +630,7 @@ def populate_my_ontology(
453
630
  # ethnicity
454
631
  if ethnicities is not None:
455
632
  if len(ethnicities) == 0:
456
- bt.Ethnicity.import_from_source(update=True)
633
+ bt.Ethnicity.import_source()
457
634
  else:
458
635
  names = bt.Ethnicity.public().df().index if not ethnicities else ethnicities
459
636
  records = bt.Ethnicity.from_values(names, field="ontology_id")
@@ -464,7 +641,7 @@ def populate_my_ontology(
464
641
  # ExperimentalFactor
465
642
  if assays is not None:
466
643
  if len(assays) == 0:
467
- bt.ExperimentalFactor.import_from_source(update=True)
644
+ bt.ExperimentalFactor.import_source()
468
645
  else:
469
646
  names = bt.ExperimentalFactor.public().df().index if not assays else assays
470
647
  records = bt.ExperimentalFactor.from_values(names, field="ontology_id")
@@ -475,7 +652,7 @@ def populate_my_ontology(
475
652
  # Tissue
476
653
  if tissues is not None:
477
654
  if len(tissues) == 0:
478
- bt.Tissue.import_from_source(update=True)
655
+ bt.Tissue.import_source()
479
656
  else:
480
657
  names = bt.Tissue.public().df().index if not tissues else tissues
481
658
  records = bt.Tissue.from_values(names, field="ontology_id")
@@ -484,9 +661,9 @@ def populate_my_ontology(
484
661
  # DevelopmentalStage
485
662
  if dev_stages is not None:
486
663
  if len(dev_stages) == 0:
487
- bt.DevelopmentalStage.import_from_source(update=True)
664
+ bt.DevelopmentalStage.import_source()
488
665
  source = bt.PublicSource.filter(organism="mouse", name="mmusdv").last()
489
- bt.DevelopmentalStage.import_from_source(source=source)
666
+ bt.DevelopmentalStage.import_source(source=source)
490
667
  else:
491
668
  names = (
492
669
  bt.DevelopmentalStage.public().df().index
@@ -500,7 +677,7 @@ def populate_my_ontology(
500
677
  # Disease
501
678
  if diseases is not None:
502
679
  if len(diseases) == 0:
503
- bt.Disease.import_from_source(update=True)
680
+ bt.Disease.import_source()
504
681
  else:
505
682
  names = bt.Disease.public().df().index if not diseases else diseases
506
683
  records = bt.Disease.from_values(names, field="ontology_id")
@@ -575,18 +752,26 @@ def translate(
575
752
  dict: the mapping for the translation
576
753
  """
577
754
  if t == "cell_type_ontology_term_id":
578
- obj = bt.CellType.public(organism="all")
755
+ obj = bt.CellType
579
756
  elif t == "assay_ontology_term_id":
580
- obj = bt.ExperimentalFactor.public()
757
+ obj = bt.ExperimentalFactor
581
758
  elif t == "tissue_ontology_term_id":
582
- obj = bt.Tissue.public()
759
+ obj = bt.Tissue
760
+ elif t in [
761
+ "development_stage_ontology_term_id",
762
+ "simplified_dev_stage",
763
+ "age_group",
764
+ ]:
765
+ obj = bt.DevelopmentalStage
766
+ elif t == "disease_ontology_term_id":
767
+ obj = bt.Disease
768
+ elif t == "self_reported_ethnicity_ontology_term_id":
769
+ obj = bt.Ethnicity
583
770
  else:
584
771
  return None
585
772
  if type(val) is str:
586
- return {val: obj.search(val, field=obj.ontology_id).name.iloc[0]}
773
+ return {val: obj.filter(ontology_id=val).one().name}
587
774
  elif type(val) is list or type(val) is set:
588
- return {i: obj.search(i, field=obj.ontology_id).name.iloc[0] for i in set(val)}
775
+ return {i: obj.filter(ontology_id=i).one().name for i in set(val)}
589
776
  elif type(val) is dict or type(val) is Counter:
590
- return {
591
- obj.search(k, field=obj.ontology_id).name.iloc[0]: v for k, v in val.items()
592
- }
777
+ return {obj.filter(ontology_id=k).one().name: v for k, v in val.items()}
@@ -1,23 +1,24 @@
1
- Metadata-Version: 2.3
1
+ Metadata-Version: 2.4
2
2
  Name: scdataloader
3
- Version: 1.6.3
3
+ Version: 1.7.0
4
4
  Summary: a dataloader for single cell data in lamindb
5
5
  Project-URL: repository, https://github.com/jkobject/scDataLoader
6
6
  Author-email: jkobject <jkobject@gmail.com>
7
- License: MIT
7
+ License-Expression: MIT
8
+ License-File: LICENSE
8
9
  Keywords: dataloader,lamindb,pytorch,scPRINT,scRNAseq
9
- Requires-Python: <3.11,>=3.10
10
+ Requires-Python: <3.14,>=3.10
10
11
  Requires-Dist: anndata>=0.9.0
11
12
  Requires-Dist: biomart>=0.9.0
12
13
  Requires-Dist: cellxgene-census>=0.1.0
13
14
  Requires-Dist: django>=4.0.0
14
15
  Requires-Dist: harmonypy>=0.0.10
15
16
  Requires-Dist: ipykernel>=6.20.0
16
- Requires-Dist: lamindb[bionty]==0.76.12
17
+ Requires-Dist: lamindb[bionty]==0.77.2
17
18
  Requires-Dist: leidenalg>=0.8.0
18
19
  Requires-Dist: lightning>=2.0.0
19
20
  Requires-Dist: matplotlib>=3.5.0
20
- Requires-Dist: numpy>=1.26.0
21
+ Requires-Dist: numpy==1.26.0
21
22
  Requires-Dist: palantir>=1.3.3
22
23
  Requires-Dist: pandas>=2.0.0
23
24
  Requires-Dist: scikit-misc>=0.5.0
@@ -50,6 +51,8 @@ Description-Content-Type: text/markdown
50
51
  [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
51
52
  [![DOI](https://img.shields.io/badge/DOI-10.1101%2F2024.07.29.605556-blue)](https://doi.org/10.1101/2024.07.29.605556)
52
53
 
54
+ <img src="scdataloader.png" width="600">
55
+
53
56
  This single cell pytorch dataloader / lightning datamodule is designed to be used with:
54
57
 
55
58
  - [lamindb](https://lamin.ai/)
@@ -0,0 +1,15 @@
1
+ scdataloader/VERSION,sha256=u3Mg2DHnoVGqkBw15zJsdS-i71Ak8wdoxCMZuL7Rce0,6
2
+ scdataloader/__init__.py,sha256=4sSZSnNM-gtyiB28M_FM3o8lNabmsofct9SWWry1_zA,170
3
+ scdataloader/__main__.py,sha256=CcvUnvgnF2d1QQHjkIhhzeK9vgplbhdHiGMawmxhq6g,7454
4
+ scdataloader/base.py,sha256=M1gD59OffRdLOgS1vHKygOomUoAMuzjpRtAfM3SBKF8,338
5
+ scdataloader/collator.py,sha256=NmbMAxkFZLufWpn0yBY6d1me2nUKdV0VG11Js8rgghU,11560
6
+ scdataloader/config.py,sha256=tu9hkUiU2HfaIiVzdmrjbzt73yV4zP-t8lDuJqyGcDA,6546
7
+ scdataloader/data.py,sha256=K0r_RlLBza3WsWQVzybZjskKDfwFe8qMqLcJwdZ1yuw,15172
8
+ scdataloader/datamodule.py,sha256=-GumOkOXDn7DJnqo2yhmPpEcIZUtw0LulFOnl3nkouw,20193
9
+ scdataloader/mapped.py,sha256=u3vo7vcE4Q72qY0j7uHpZvlTTYr4yc3RaRrwE7AAhaE,27122
10
+ scdataloader/preprocess.py,sha256=feaXGQYNfChbISZCWCnIZL1qwmzfwmNygbL-xVTwC0o,34595
11
+ scdataloader/utils.py,sha256=MRuqbRcCkb45k_G4QCwog0C6-Az4ZcklVPn47aZJLGs,27870
12
+ scdataloader-1.7.0.dist-info/METADATA,sha256=r0oXvOe1kqoRlbYJim4MTqgRADjP3t_xDaxXGrGomkM,9907
13
+ scdataloader-1.7.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
14
+ scdataloader-1.7.0.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
15
+ scdataloader-1.7.0.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.26.3
2
+ Generator: hatchling 1.27.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -1,14 +0,0 @@
1
- scdataloader/VERSION,sha256=hvj6gyp2NYIB1uL88LtHcn7-LbI69zDbZM6tZSd3a-o,6
2
- scdataloader/__init__.py,sha256=5y9VzRhOAUWeYMn2MrRRRlzgdiMjRFytr7gcn-I6IkE,147
3
- scdataloader/__main__.py,sha256=VXrt2IykBypnIXWydwA7NfF7LtRGc-0Khjtm5OIBNpI,6527
4
- scdataloader/base.py,sha256=M1gD59OffRdLOgS1vHKygOomUoAMuzjpRtAfM3SBKF8,338
5
- scdataloader/collator.py,sha256=gzHiuixUwK8JClhAbG12kgWMU_VTKkowibA-tDFpbwo,11341
6
- scdataloader/config.py,sha256=rrW2DZxG4J2_pmpDbXXsaKJkpNC57w5dIlItiFbANYw,2905
7
- scdataloader/data.py,sha256=3dCp-lIAfOkCi76SH5W3iSqFmAWZslwARkN9v5mylz8,14907
8
- scdataloader/datamodule.py,sha256=B-udBevPSPF__hfy0pOz1dGovgE95K2pxPupjB7RblI,16936
9
- scdataloader/preprocess.py,sha256=pH4EPrcRqH34o3t5X3A4kETiYdCZngih5SdP_PPfgOo,29178
10
- scdataloader/utils.py,sha256=7tgt3sPj_XTKb-UlJDAZWvQr0_DG9VTC6ioiLdBWFFE,22498
11
- scdataloader-1.6.3.dist-info/METADATA,sha256=iBh6pruWqZArL8vFjEEuc6FL2m1amZixVLTwQ5mpXcM,9833
12
- scdataloader-1.6.3.dist-info/WHEEL,sha256=C2FUgwZgiLbznR-k0b_5k3Ai_1aASOXDss3lzCUsUug,87
13
- scdataloader-1.6.3.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
14
- scdataloader-1.6.3.dist-info/RECORD,,