scdataloader-2.0.11.tar.gz → scdataloader-2.0.12.tar.gz
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- {scdataloader-2.0.11 → scdataloader-2.0.12}/PKG-INFO +4 -4
- {scdataloader-2.0.11 → scdataloader-2.0.12}/README.md +2 -2
- {scdataloader-2.0.11 → scdataloader-2.0.12}/pyproject.toml +2 -2
- {scdataloader-2.0.11 → scdataloader-2.0.12}/scdataloader/data.py +18 -6
- {scdataloader-2.0.11 → scdataloader-2.0.12}/scdataloader/preprocess.py +10 -4
- {scdataloader-2.0.11 → scdataloader-2.0.12}/scdataloader/utils.py +41 -15
- {scdataloader-2.0.11 → scdataloader-2.0.12}/.gitignore +0 -0
- {scdataloader-2.0.11 → scdataloader-2.0.12}/LICENSE +0 -0
- {scdataloader-2.0.11 → scdataloader-2.0.12}/scdataloader/__init__.py +0 -0
- {scdataloader-2.0.11 → scdataloader-2.0.12}/scdataloader/__main__.py +0 -0
- {scdataloader-2.0.11 → scdataloader-2.0.12}/scdataloader/base.py +0 -0
- {scdataloader-2.0.11 → scdataloader-2.0.12}/scdataloader/collator.py +0 -0
- {scdataloader-2.0.11 → scdataloader-2.0.12}/scdataloader/config.py +0 -0
- {scdataloader-2.0.11 → scdataloader-2.0.12}/scdataloader/datamodule.py +0 -0
- {scdataloader-2.0.11 → scdataloader-2.0.12}/scdataloader/mapped.py +0 -0
{scdataloader-2.0.11 → scdataloader-2.0.12}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scdataloader
-Version: 2.0.11
+Version: 2.0.12
 Summary: a dataloader for single cell data in lamindb
 Project-URL: repository, https://github.com/jkobject/scDataLoader
 Author-email: jkobject <jkobject@gmail.com>
@@ -14,7 +14,7 @@ Requires-Dist: cellxgene-census>=0.1.0
 Requires-Dist: django>=4.0.0
 Requires-Dist: ipykernel>=6.20.0
 Requires-Dist: jupytext>=1.16.0
-Requires-Dist: lamindb[…
+Requires-Dist: lamindb[gcp]==2.0.1
 Requires-Dist: leidenalg>=0.8.0
 Requires-Dist: lightning>=2.3.0
 Requires-Dist: matplotlib>=3.5.0
@@ -162,7 +162,7 @@ adata = preprocessor(adata)
 
 art = ln.Artifact(adata, description="test")
 art.save()
-ln.Collection(art, …
+ln.Collection(art, key="test", description="test").save()
 
 datamodule = DataModule(
     collection_name="test",
@@ -376,7 +376,7 @@ using absolute paths. You need to do 3 things:
 
 ```python
 import lamin as ln
-ln.Storage.…
+ln.Storage.to_dataframe(limit=None)
 # view what is your current storage id (in my case it was GZgLW1TQ)
 ln.Storage.filter(uid="GZgLW1TI").update(
     root=Path("your_new_locations").as_posix().rstrip("/")
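The two documentation changes above (Collection creation now takes `key=`, and storage listing goes through `to_dataframe()`) recur below in README.md, which PKG-INFO embeds. A minimal sketch of the new calls in isolation, assuming an initialized lamindb instance with lamindb[gcp]==2.0.1 as pinned in this release; the toy AnnData is a placeholder:

```python
import anndata as ad
import lamindb as ln
import numpy as np

# Placeholder dataset standing in for the preprocessed AnnData.
adata = ad.AnnData(np.random.poisson(1.0, (10, 5)).astype(np.float32))

# Save the AnnData as an Artifact, then register it in a Collection.
# lamindb 2.x addresses Collections by `key` (per the diff above).
art = ln.Artifact(adata, description="test")
art.save()
ln.Collection(art, key="test", description="test").save()

# List every registered storage location as a pandas DataFrame;
# limit=None returns all rows instead of a truncated preview.
print(ln.Storage.to_dataframe(limit=None))
```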
{scdataloader-2.0.11 → scdataloader-2.0.12}/README.md

@@ -120,7 +120,7 @@ adata = preprocessor(adata)
 
 art = ln.Artifact(adata, description="test")
 art.save()
-ln.Collection(art, …
+ln.Collection(art, key="test", description="test").save()
 
 datamodule = DataModule(
     collection_name="test",
@@ -334,7 +334,7 @@ using absolute paths. You need to do 3 things:
 
 ```python
 import lamin as ln
-ln.Storage.…
+ln.Storage.to_dataframe(limit=None)
 # view what is your current storage id (in my case it was GZgLW1TQ)
 ln.Storage.filter(uid="GZgLW1TI").update(
     root=Path("your_new_locations").as_posix().rstrip("/")
{scdataloader-2.0.11 → scdataloader-2.0.12}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "scdataloader"
-version = "2.0.11"
+version = "2.0.12"
 description = "a dataloader for single cell data in lamindb"
 authors = [
     {name = "jkobject", email = "jkobject@gmail.com"}
@@ -11,7 +11,7 @@ requires-python = ">=3.10,<3.13"
 keywords = ["scRNAseq", "dataloader", "pytorch", "lamindb", "scPRINT"]
 dependencies = [
     "numpy<=2.2.0",
-    "lamindb[…
+    "lamindb[gcp]==2.0.1",
     "cellxgene-census>=0.1.0",
     "torch>=2.2.0",
     "pytorch-lightning>=2.3.0",
{scdataloader-2.0.11 → scdataloader-2.0.12}/scdataloader/data.py

@@ -282,19 +282,25 @@ class Dataset(torchDataset):
         elif clss == "cell_type_ontology_term_id":
             parentdf = (
                 bt.CellType.filter()
-                .…
+                .to_dataframe(
+                    limit=None, include=["parents__ontology_id", "ontology_id"]
+                )
                 .set_index("ontology_id")
             )
         elif clss == "tissue_ontology_term_id":
             parentdf = (
                 bt.Tissue.filter()
-                .…
+                .to_dataframe(
+                    limit=None, include=["parents__ontology_id", "ontology_id"]
+                )
                 .set_index("ontology_id")
             )
         elif clss == "disease_ontology_term_id":
             parentdf = (
                 bt.Disease.filter()
-                .…
+                .to_dataframe(
+                    limit=None, include=["parents__ontology_id", "ontology_id"]
+                )
                 .set_index("ontology_id")
             )
         elif clss in [
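All six ontology branches in this file (three above, three below) switch from a truncated legacy call to `to_dataframe(limit=None, include=[...])`. A sketch of the pattern in isolation, assuming a lamindb instance whose bionty CellType registry is populated:

```python
import bionty as bt

# Pull every CellType record into a pandas DataFrame. `include` joins the
# many-to-many parents relation so each row carries its parents' ontology
# ids; limit=None disables row truncation; rows are indexed by ontology id.
parentdf = (
    bt.CellType.filter()
    .to_dataframe(limit=None, include=["parents__ontology_id", "ontology_id"])
    .set_index("ontology_id")
)

# Hypothetical lookup for illustration: the parent terms of "T cell".
print(parentdf.loc["CL:0000084", "parents__ontology_id"])
```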
@@ -304,19 +310,25 @@ class Dataset(torchDataset):
         ]:
             parentdf = (
                 bt.DevelopmentalStage.filter()
-                .…
+                .to_dataframe(
+                    limit=None, include=["parents__ontology_id", "ontology_id"]
+                )
                 .set_index("ontology_id")
             )
         elif clss == "assay_ontology_term_id":
             parentdf = (
                 bt.ExperimentalFactor.filter()
-                .…
+                .to_dataframe(
+                    limit=None, include=["parents__ontology_id", "ontology_id"]
+                )
                 .set_index("ontology_id")
             )
         elif clss == "self_reported_ethnicity_ontology_term_id":
             parentdf = (
                 bt.Ethnicity.filter()
-                .…
+                .to_dataframe(
+                    limit=None, include=["parents__ontology_id", "ontology_id"]
+                )
                 .set_index("ontology_id")
             )
 
{scdataloader-2.0.11 → scdataloader-2.0.12}/scdataloader/preprocess.py

@@ -654,7 +654,7 @@ class LaminPreprocessor(Preprocessor):
         # Reconstruct collection using keys
         dataset = ln.Collection(
             [ln.Artifact.filter(key=k).one() for k in files],
-            …
+            key=name,
             description=description,
         )
         dataset.save()
@@ -819,9 +819,15 @@ def additional_postprocess(adata):
     # else:
     print("starting post processing")
     sc.pp.neighbors(adata, use_rep="X_pca")
-    sc.tl.leiden(…
-    …
-    …
+    sc.tl.leiden(
+        adata, key_added="leiden_2", resolution=2.0, flavor="igraph", n_iterations=2
+    )
+    sc.tl.leiden(
+        adata, key_added="leiden_1", resolution=1.0, flavor="igraph", n_iterations=2
+    )
+    sc.tl.leiden(
+        adata, key_added="leiden_0.5", resolution=0.5, flavor="igraph", n_iterations=2
+    )
     sc.tl.umap(adata)
     mid = adata.uns["dataset_id"] if "dataset_id" in adata.uns else "unknown_id"
     sc.pl.umap(
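The post-processing step now clusters at three Leiden resolutions. A compact equivalent of the three calls, assuming `adata` is an AnnData that already carries an `X_pca` embedding (the loop is a rewrite for illustration, not the source's code):

```python
import scanpy as sc

# Build the kNN graph on the PCA embedding, then cluster at three
# resolutions; a higher resolution yields more, smaller clusters.
sc.pp.neighbors(adata, use_rep="X_pca")
for res in (2.0, 1.0, 0.5):
    sc.tl.leiden(
        adata,
        key_added=f"leiden_{res:g}",  # "leiden_2", "leiden_1", "leiden_0.5"
        resolution=res,
        flavor="igraph",  # scanpy's igraph backend
        n_iterations=2,
    )
```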
{scdataloader-2.0.11 → scdataloader-2.0.12}/scdataloader/utils.py

@@ -389,7 +389,7 @@ def load_dataset_local(
         except IntegrityError:
             print(f"File {file.key} already exists in storage")
             saved_files.append(file)
-    dataset = ln.Collection(saved_files, …
+    dataset = ln.Collection(saved_files, key=name, description=description)
     dataset.save()
     return dataset
 
@@ -412,7 +412,7 @@ def load_genes(
     for organism in organisms:
         genesdf = bt.Gene.filter(
             organism_id=bt.Organism.filter(ontology_id=organism).first().id
-        ).…
+        ).to_dataframe(limit=None)
         genesdf.loc[genesdf.ensembl_gene_id.isna(), "ensembl_gene_id"] = genesdf.loc[
             genesdf.ensembl_gene_id.isna(), "stable_id"
         ]
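Same rename in `load_genes`: the per-organism gene table is now materialized with `to_dataframe(limit=None)`. A sketch for a single organism, with `NCBITaxon:9606` (human) as an example value:

```python
import bionty as bt

# Resolve the organism record, then fetch its full gene table.
org = bt.Organism.filter(ontology_id="NCBITaxon:9606").first()
genesdf = bt.Gene.filter(organism_id=org.id).to_dataframe(limit=None)

# As load_genes does: where ensembl_gene_id is missing, fall back to stable_id.
missing = genesdf.ensembl_gene_id.isna()
genesdf.loc[missing, "ensembl_gene_id"] = genesdf.loc[missing, "stable_id"]
```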
@@ -453,12 +453,16 @@ def _adding_scbasecamp_genes(
 ):
     if len(species) == 0:
         species = set(
-            bt.Organism.using("laminlabs/arc-virtual-cell-atlas")…
+            bt.Organism.using("laminlabs/arc-virtual-cell-atlas")
+            .to_dataframe(limit=None)
+            .ontology_id
         ) - set(["NCBITaxon:10090", "NCBITaxon:9606"])
     species = list(species)
     for i in set(
-        bt.Organism.using("laminlabs/arc-virtual-cell-atlas")…
-        …
+        bt.Organism.using("laminlabs/arc-virtual-cell-atlas")
+        .to_dataframe(limit=None)
+        .ontology_id
+    ) - set(bt.Organism.filter().to_dataframe(limit=None).ontology_id):
         print(i)
         rec = (
             bt.Organism.using("laminlabs/arc-virtual-cell-atlas")
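`.using()` points a registry query at another lamindb instance. The sketch below reproduces the species bookkeeping above, assuming read access to the public laminlabs/arc-virtual-cell-atlas instance:

```python
import bionty as bt

# All organism ontology ids known to the remote atlas instance.
remote_ids = set(
    bt.Organism.using("laminlabs/arc-virtual-cell-atlas")
    .to_dataframe(limit=None)
    .ontology_id
)

# Drop mouse and human, which are handled separately.
species = list(remote_ids - {"NCBITaxon:10090", "NCBITaxon:9606"})

# Organisms present remotely but not yet registered locally.
missing_locally = remote_ids - set(
    bt.Organism.filter().to_dataframe(limit=None).ontology_id
)
```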
@@ -485,7 +489,7 @@ def _adding_scbasecamp_genes(
             bt.Gene.using("laminlabs/arc-virtual-cell-atlas")
             .filter(organism__ontology_id=i)
             .all()
-            .…
+            .to_dataframe(limit=None)
         )
         genes = []
         org = bt.Organism.filter(ontology_id=i).one()
@@ -545,7 +549,11 @@ def populate_my_ontology(
     if len(celltypes) == 0:
         bt.CellType.import_source()
     else:
-        names = …
+        names = (
+            bt.CellType.public().to_dataframe().index
+            if not celltypes
+            else celltypes
+        )
         records = bt.CellType.from_values(names, field="ontology_id")
         ln.save(records)
     elem = bt.CellType(name="unknown", ontology_id="unknown")
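This "default to the full public ontology" pattern repeats for every registry below (Ethnicity, ExperimentalFactor, Tissue, DevelopmentalStage, Disease). A sketch of one branch, assuming an initialized instance; `celltypes` stands in for the function's user argument:

```python
import bionty as bt
import lamindb as ln

# Hypothetical user argument; empty means "take every term in the
# public Cell Ontology".
celltypes: list[str] = []

names = (
    bt.CellType.public().to_dataframe().index  # all public ontology ids
    if not celltypes
    else celltypes
)
records = bt.CellType.from_values(names, field="ontology_id")
ln.save(records)

# Catch-all record used when a cell's type is not annotated.
bt.CellType(name="unknown", ontology_id="unknown").save()
```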
@@ -555,7 +563,7 @@ def populate_my_ontology(
     if organisms_clade is not None:
         records = []
         for organism_clade in organisms_clade:
-            names = bt.Organism.public(organism=organism_clade).…
+            names = bt.Organism.public(organism=organism_clade).to_dataframe().index
             source = bt.Source.filter(name="ensembl", organism=organism_clade).last()
             for name in names:
                 try:
@@ -581,7 +589,7 @@ def populate_my_ontology(
     ln.save([elem], ignore_conflicts=True)
     # Phenotype
     if sex is not None:
-        names = bt.Phenotype.public().…
+        names = bt.Phenotype.public().to_dataframe().index if not sex else sex
         source = bt.Source.filter(name="pato").first()
         records = [
             bt.Phenotype.from_source(ontology_id=i, source=source) for i in names
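Unlike the `from_values` branches, the sex labels go through `from_source`, which resolves each term against an explicit Source record (here PATO) rather than the registry default. A sketch under the same assumptions as above; `sex` stands in for the function's user argument:

```python
import bionty as bt
import lamindb as ln

# Default to every term in the public Phenotype ontology when no
# explicit sex labels are passed (hypothetical user argument).
sex: list[str] = []
names = bt.Phenotype.public().to_dataframe().index if not sex else sex

# Resolve each ontology id against the PATO source explicitly.
source = bt.Source.filter(name="pato").first()
records = [bt.Phenotype.from_source(ontology_id=i, source=source) for i in names]
ln.save(records)
```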
@@ -594,7 +602,11 @@ def populate_my_ontology(
     if len(ethnicities) == 0:
         bt.Ethnicity.import_source()
     else:
-        names = …
+        names = (
+            bt.Ethnicity.public().to_dataframe().index
+            if not ethnicities
+            else ethnicities
+        )
         records = bt.Ethnicity.from_values(names, field="ontology_id")
         ln.save(records)
     elem = bt.Ethnicity(name="unknown", ontology_id="unknown")
@@ -604,7 +616,11 @@ def populate_my_ontology(
     if len(assays) == 0:
         bt.ExperimentalFactor.import_source()
     else:
-        names = …
+        names = (
+            bt.ExperimentalFactor.public().to_dataframe().index
+            if not assays
+            else assays
+        )
         records = bt.ExperimentalFactor.from_values(names, field="ontology_id")
         ln.save(records)
     elem = bt.ExperimentalFactor(name="unknown", ontology_id="unknown").save()
@@ -616,7 +632,11 @@ def populate_my_ontology(
     if len(tissues) == 0:
         bt.Tissue.import_source()
     else:
-        names = …
+        names = (
+            bt.Tissue.public().to_dataframe().index
+            if not tissues
+            else tissues
+        )
         records = bt.Tissue.from_values(names, field="ontology_id")
         ln.save(records)
     elem = bt.Tissue(name="unknown", ontology_id="unknown").save()
@@ -629,7 +649,7 @@ def populate_my_ontology(
         bt.DevelopmentalStage.import_source(source=source)
     else:
         names = (
-            bt.DevelopmentalStage.public().…
+            bt.DevelopmentalStage.public().to_dataframe().index
             if not dev_stages
             else dev_stages
         )
@@ -641,7 +661,11 @@ def populate_my_ontology(
     if len(diseases) == 0:
         bt.Disease.import_source()
     else:
-        names = …
+        names = (
+            bt.Disease.public().to_dataframe().index
+            if not diseases
+            else diseases
+        )
         records = bt.Disease.from_values(names, field="ontology_id")
         ln.save(records)
     bt.Disease(name="normal", ontology_id="PATO:0000461").save()
@@ -650,7 +674,9 @@ def populate_my_ontology(
     for organism in genes_from:
         # convert onto to name
         organism = bt.Organism.filter(ontology_id=organism).one().name
-        names = bt.Gene.public(organism=organism).…
+        names = bt.Gene.public(organism=organism).to_dataframe()[
+            "ensembl_gene_id"
+        ]
 
         # Process names in blocks of 10,000 elements
         block_size = 10000
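The gene ids are then processed in blocks of 10,000; the loop body is not part of this hunk, so the chunking below is a hypothetical reconstruction around the `block_size` the source defines, and the `field="ensembl_gene_id"` argument is an assumption by analogy with the `from_values(..., field="ontology_id")` calls above:

```python
import bionty as bt
import lamindb as ln

# Full ensembl id column for one organism ("human" is the bionty name
# resolved from the ontology id earlier in the function).
names = bt.Gene.public(organism="human").to_dataframe()["ensembl_gene_id"]

# Register in blocks of 10,000 to keep each bulk insert small.
block_size = 10000
for start in range(0, len(names), block_size):
    block = names[start : start + block_size].dropna()
    ln.save(bt.Gene.from_values(block, field="ensembl_gene_id"))
```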
All other files listed above are unchanged (+0 -0).