scdataloader 2.0.4__py3-none-any.whl → 2.0.5__py3-none-any.whl

This diff shows the differences between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions exactly as they appear in their respective public registries.
@@ -190,6 +190,7 @@ class DataModule(L.LightningDataModule):
190
190
  self.classes = {k: len(v) for k, v in mdataset.class_topred.items()}
191
191
  # we might want not to order the genes by expression (or do it?)
192
192
  # we might want to not introduce zeros and
193
+
193
194
  if use_default_col:
194
195
  kwargs["collate_fn"] = Collator(
195
196
  organisms=mdataset.organisms if organisms is None else organisms,
scdataloader/utils.py CHANGED
@@ -20,6 +20,8 @@ from scipy.sparse import csr_matrix
20
20
  from scipy.stats import median_abs_deviation
21
21
  from torch import Tensor
22
22
 
23
+ from .config import DROP
24
+
23
25
 
24
26
  def fileToList(filename: str, strconv: callable = lambda x: x) -> list:
25
27
  """
@@ -442,168 +444,7 @@ def load_genes(
442
444
  if col in organismdf.columns:
443
445
  organismdf.drop(columns=[col], inplace=True)
444
446
  # temp fix
445
- drop = {
446
- "ENSG00000112096",
447
- "ENSG00000137808",
448
- "ENSG00000161149",
449
- "ENSG00000182230",
450
- "ENSG00000203812",
451
- "ENSG00000204092",
452
- "ENSG00000205485",
453
- "ENSG00000212951",
454
- "ENSG00000215271",
455
- "ENSG00000221995",
456
- "ENSG00000224739",
457
- "ENSG00000224745",
458
- "ENSG00000225178",
459
- "ENSG00000225932",
460
- "ENSG00000226377",
461
- "ENSG00000226380",
462
- "ENSG00000226403",
463
- "ENSG00000227021",
464
- "ENSG00000227220",
465
- "ENSG00000227902",
466
- "ENSG00000228139",
467
- "ENSG00000228206",
468
- "ENSG00000228906",
469
- "ENSG00000229352",
470
- "ENSG00000231575",
471
- "ENSG00000232196",
472
- "ENSG00000232295",
473
- "ENSG00000233776",
474
- "ENSG00000236166",
475
- "ENSG00000236673",
476
- "ENSG00000236740",
477
- "ENSG00000236886",
478
- "ENSG00000236996",
479
- "ENSG00000237133",
480
- "ENSG00000237513",
481
- "ENSG00000237548",
482
- "ENSG00000237838",
483
- "ENSG00000239446",
484
- "ENSG00000239467",
485
- "ENSG00000239665",
486
- "ENSG00000244693",
487
- "ENSG00000244952",
488
- "ENSG00000249860",
489
- "ENSG00000251044",
490
- "ENSG00000253878",
491
- "ENSG00000254561",
492
- "ENSG00000254740",
493
- "ENSG00000255633",
494
- "ENSG00000255823",
495
- "ENSG00000256045",
496
- "ENSG00000256222",
497
- "ENSG00000256374",
498
- "ENSG00000256427",
499
- "ENSG00000256618",
500
- "ENSG00000256863",
501
- "ENSG00000256892",
502
- "ENSG00000258414",
503
- "ENSG00000258808",
504
- "ENSG00000258861",
505
- "ENSG00000259444",
506
- "ENSG00000259820",
507
- "ENSG00000259834",
508
- "ENSG00000259855",
509
- "ENSG00000260461",
510
- "ENSG00000261068",
511
- "ENSG00000261438",
512
- "ENSG00000261490",
513
- "ENSG00000261534",
514
- "ENSG00000261737",
515
- "ENSG00000261773",
516
- "ENSG00000261963",
517
- "ENSG00000262668",
518
- "ENSG00000263464",
519
- "ENSG00000267637",
520
- "ENSG00000268955",
521
- "ENSG00000269028",
522
- "ENSG00000269900",
523
- "ENSG00000269933",
524
- "ENSG00000269966",
525
- "ENSG00000270188",
526
- "ENSG00000270394",
527
- "ENSG00000270672",
528
- "ENSG00000271043",
529
- "ENSG00000271409",
530
- "ENSG00000271734",
531
- "ENSG00000271870",
532
- "ENSG00000272040",
533
- "ENSG00000272196",
534
- "ENSG00000272267",
535
- "ENSG00000272354",
536
- "ENSG00000272370",
537
- "ENSG00000272551",
538
- "ENSG00000272567",
539
- "ENSG00000272880",
540
- "ENSG00000272904",
541
- "ENSG00000272934",
542
- "ENSG00000273301",
543
- "ENSG00000273370",
544
- "ENSG00000273496",
545
- "ENSG00000273576",
546
- "ENSG00000273614",
547
- "ENSG00000273837",
548
- "ENSG00000273888",
549
- "ENSG00000273923",
550
- "ENSG00000276612",
551
- "ENSG00000276814",
552
- "ENSG00000277050",
553
- "ENSG00000277077",
554
- "ENSG00000277352",
555
- "ENSG00000277666",
556
- "ENSG00000277761",
557
- "ENSG00000278198",
558
- "ENSG00000278782",
559
- "ENSG00000278927",
560
- "ENSG00000278955",
561
- "ENSG00000279226",
562
- "ENSG00000279765",
563
- "ENSG00000279769",
564
- "ENSG00000279948",
565
- "ENSG00000280058",
566
- "ENSG00000280095",
567
- "ENSG00000280250",
568
- "ENSG00000280346",
569
- "ENSG00000280374",
570
- "ENSG00000280710",
571
- "ENSG00000282080",
572
- "ENSG00000282246",
573
- "ENSG00000282965",
574
- "ENSG00000283486",
575
- "ENSG00000284299",
576
- "ENSG00000284741",
577
- "ENSG00000285106",
578
- "ENSG00000285162",
579
- "ENSG00000285476",
580
- "ENSG00000285762",
581
- "ENSG00000286065",
582
- "ENSG00000286228",
583
- "ENSG00000286601",
584
- "ENSG00000286699",
585
- "ENSG00000286949",
586
- "ENSG00000286996",
587
- "ENSG00000287116",
588
- "ENSG00000287388",
589
- "ENSG00000288541",
590
- "ENSG00000288546",
591
- "ENSG00000288630",
592
- "ENSG00000288639",
593
- "ENSMUSG00000069518",
594
- "ENSMUSG00000073682",
595
- "ENSMUSG00000075014",
596
- "ENSMUSG00000075015",
597
- "ENSMUSG00000078091",
598
- "ENSMUSG00000094958",
599
- "ENSMUSG00000095547",
600
- "ENSMUSG00000095891",
601
- "ENSMUSG00000096385",
602
- "ENSMUSG00000096519",
603
- "ENSMUSG00000096923",
604
- "ENSMUSG00000097078",
605
- }
606
- organismdf = organismdf[~organismdf.index.isin(drop)]
447
+ organismdf = organismdf[~organismdf.index.isin(DROP)]
607
448
  return organismdf
608
449
 
609
450
 
@@ -656,15 +497,16 @@ def _adding_scbasecamp_genes(
656
497
 
657
498
 
658
499
  def populate_my_ontology(
659
- sex: List[str] = ["PATO:0000384", "PATO:0000383"],
660
- celltypes: List[str] = [],
661
- ethnicities: List[str] = [],
662
- assays: List[str] = [],
663
- tissues: List[str] = [],
664
- diseases: List[str] = [],
665
- dev_stages: List[str] = [],
666
- organisms_clade: List[str] = ["vertebrates", "plants", "metazoa"],
667
- genes_from: List[str] = ["NCBITaxon:10090", "NCBITaxon:9606"],
500
+ sex: Optional[List[str]] = ["PATO:0000384", "PATO:0000383"],
501
+ celltypes: Optional[List[str]] = [],
502
+ ethnicities: Optional[List[str]] = [],
503
+ assays: Optional[List[str]] = [],
504
+ tissues: Optional[List[str]] = [],
505
+ diseases: Optional[List[str]] = [],
506
+ dev_stages: Optional[List[str]] = [],
507
+ organisms_clade: Optional[List[str]] = ["vertebrates"], # "plants", "metazoa"],
508
+ organisms: Optional[List[str]] = ["NCBITaxon:10090", "NCBITaxon:9606"],
509
+ genes_from: Optional[List[str]] = ["NCBITaxon:10090", "NCBITaxon:9606"],
668
510
  ):
669
511
  """
670
512
  creates a local version of the lamin ontologies and add the required missing values in base ontologies
@@ -697,8 +539,10 @@ def populate_my_ontology(
697
539
  names = bt.CellType.public().df().index if not celltypes else celltypes
698
540
  records = bt.CellType.from_values(names, field="ontology_id")
699
541
  ln.save(records)
700
- bt.CellType(name="unknown", ontology_id="unknown").save()
542
+ elem = bt.CellType(name="unknown", ontology_id="unknown")
543
+ ln.save([elem], ignore_conflicts=True)
701
544
  # OrganismClade
545
+ nrecords = []
702
546
  if organisms_clade is not None:
703
547
  records = []
704
548
  for organism_clade in organisms_clade:
@@ -709,7 +553,7 @@ def populate_my_ontology(
709
553
  records.append(bt.Organism.from_source(name=name, source=source))
710
554
  except DoesNotExist:
711
555
  print(f"Organism {name} not found in source {source}")
712
- nrecords = []
556
+
713
557
  prevrec = set()
714
558
  for rec in records:
715
559
  if rec is None:
@@ -717,10 +561,15 @@ def populate_my_ontology(
717
561
  if not isinstance(rec, bt.Organism):
718
562
  rec = rec[0]
719
563
  if rec.uid not in prevrec:
564
+ if organisms is not None:
565
+ if rec.ontology_id not in organisms:
566
+ continue
720
567
  nrecords.append(rec)
721
568
  prevrec.add(rec.uid)
569
+
722
570
  ln.save(nrecords)
723
- bt.Organism(name="unknown", ontology_id="unknown").save()
571
+ elem = bt.Organism(name="unknown", ontology_id="unknown").save()
572
+ ln.save([elem], ignore_conflicts=True)
724
573
  # Phenotype
725
574
  if sex is not None:
726
575
  names = bt.Phenotype.public().df().index if not sex else sex
@@ -729,7 +578,8 @@ def populate_my_ontology(
729
578
  bt.Phenotype.from_source(ontology_id=i, source=source) for i in names
730
579
  ]
731
580
  ln.save(records)
732
- bt.Phenotype(name="unknown", ontology_id="unknown").save()
581
+ elem = bt.Phenotype(name="unknown", ontology_id="unknown").save()
582
+ ln.save([elem], ignore_conflicts=True)
733
583
  # ethnicity
734
584
  if ethnicities is not None:
735
585
  if len(ethnicities) == 0:
@@ -738,9 +588,8 @@ def populate_my_ontology(
738
588
  names = bt.Ethnicity.public().df().index if not ethnicities else ethnicities
739
589
  records = bt.Ethnicity.from_values(names, field="ontology_id")
740
590
  ln.save(records)
741
- bt.Ethnicity(
742
- name="unknown", ontology_id="unknown"
743
- ).save() # multi ethnic will have to get renamed
591
+ elem = bt.Ethnicity(name="unknown", ontology_id="unknown")
592
+ ln.save([elem], ignore_conflicts=True)
744
593
  # ExperimentalFactor
745
594
  if assays is not None:
746
595
  if len(assays) == 0:
@@ -749,7 +598,8 @@ def populate_my_ontology(
749
598
  names = bt.ExperimentalFactor.public().df().index if not assays else assays
750
599
  records = bt.ExperimentalFactor.from_values(names, field="ontology_id")
751
600
  ln.save(records)
752
- bt.ExperimentalFactor(name="unknown", ontology_id="unknown").save()
601
+ elem = bt.ExperimentalFactor(name="unknown", ontology_id="unknown").save()
602
+ ln.save([elem], ignore_conflicts=True)
753
603
  # lookup = bt.ExperimentalFactor.lookup()
754
604
  # lookup.smart_seq_v4.parents.add(lookup.smart_like)
755
605
  # Tissue
@@ -760,7 +610,8 @@ def populate_my_ontology(
760
610
  names = bt.Tissue.public().df().index if not tissues else tissues
761
611
  records = bt.Tissue.from_values(names, field="ontology_id")
762
612
  ln.save(records)
763
- bt.Tissue(name="unknown", ontology_id="unknown").save()
613
+ elem = bt.Tissue(name="unknown", ontology_id="unknown").save()
614
+ ln.save([elem], ignore_conflicts=True)
764
615
  # DevelopmentalStage
765
616
  if dev_stages is not None:
766
617
  if len(dev_stages) == 0:
@@ -775,7 +626,6 @@ def populate_my_ontology(
775
626
  )
776
627
  records = bt.DevelopmentalStage.from_values(names, field="ontology_id")
777
628
  ln.save(records)
778
- bt.DevelopmentalStage(name="unknown", ontology_id="unknown").save()
779
629
 
780
630
  # Disease
781
631
  if diseases is not None:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: scdataloader
3
- Version: 2.0.4
3
+ Version: 2.0.5
4
4
  Summary: a dataloader for single cell data in lamindb
5
5
  Project-URL: repository, https://github.com/jkobject/scDataLoader
6
6
  Author-email: jkobject <jkobject@gmail.com>
@@ -2,15 +2,15 @@ scdataloader/__init__.py,sha256=Z5HURehoWw1GrecImmTXIkv4ih8Q5RxNQWPm8zjjXOA,226
2
2
  scdataloader/__main__.py,sha256=xPOtrEpQQQZUGTnm8KTvsQcA_jR45oMG_VHqd0Ny7_M,8677
3
3
  scdataloader/base.py,sha256=M1gD59OffRdLOgS1vHKygOomUoAMuzjpRtAfM3SBKF8,338
4
4
  scdataloader/collator.py,sha256=VcFJcVAIeKvYkG1DPRXzoBaw2wQ6D_0lsv5Mcv-9USI,17419
5
- scdataloader/config.py,sha256=nM8J11z2-lornryy1KxDE9675Rcxge4RGhdmpeiMhuI,7173
5
+ scdataloader/config.py,sha256=wGlCR3tWyEVa69ajovJKYc86CTCJR8e1xC7BTlUOJQE,34582
6
6
  scdataloader/data.json,sha256=Zb8c27yk3rwMgtAU8kkiWWAyUwYBrlCqKUyEtaAx9i8,8785
7
7
  scdataloader/data.py,sha256=fMW1OgllPCz87si3DpkzOSoqnufgKlh8aW5rEVmeC_c,25133
8
- scdataloader/datamodule.py,sha256=6B5nwo8NG_b8dNGPDRtDyFt5Hj095xiHDFa3ga0_s-Y,43599
8
+ scdataloader/datamodule.py,sha256=ojX0zr2cpGLoKGjWE1S_bHAEdwbFg0Ljl55hqTagW1k,43600
9
9
  scdataloader/mapped.py,sha256=h9YKQ8SG9tyZL8c6_Wu5Xov5ODGK6FzVuFopz58xwN4,29887
10
10
  scdataloader/preprocess.py,sha256=oAGMilgdIgggyp9B9c9627kdo6SCco2tnFhhIHY4-yc,39642
11
- scdataloader/utils.py,sha256=Z6td0cIphrYDLVrPrV8q4jUC_HtwGQmi-NcbpdbWrns,31034
12
- scdataloader-2.0.4.dist-info/METADATA,sha256=--g4uHOlhQ2Y_Jkxo9LOr--tH0BPL_sxODaLhUCMcw8,10314
13
- scdataloader-2.0.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
14
- scdataloader-2.0.4.dist-info/entry_points.txt,sha256=VXAN1m_CjbdLJ6SKYR0sBLGDV4wvv31ri7fWWuwbpno,60
15
- scdataloader-2.0.4.dist-info/licenses/LICENSE,sha256=rGy_eYmnxtbOvKs7qt5V0czSWxJwgX_MlgMyTZwDHbc,1073
16
- scdataloader-2.0.4.dist-info/RECORD,,
11
+ scdataloader/utils.py,sha256=2zIgmQHPVKHOFWqLX56Ihqtqci3_rOfCcOs642CPnX4,27183
12
+ scdataloader-2.0.5.dist-info/METADATA,sha256=doiHkl9Iv_n4X5ifDm3cppsIkhPGEXE1zzWynFS1NHI,10314
13
+ scdataloader-2.0.5.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
14
+ scdataloader-2.0.5.dist-info/entry_points.txt,sha256=VXAN1m_CjbdLJ6SKYR0sBLGDV4wvv31ri7fWWuwbpno,60
15
+ scdataloader-2.0.5.dist-info/licenses/LICENSE,sha256=rGy_eYmnxtbOvKs7qt5V0czSWxJwgX_MlgMyTZwDHbc,1073
16
+ scdataloader-2.0.5.dist-info/RECORD,,