scdataloader 2.0.11__tar.gz → 2.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scdataloader-2.0.11 → scdataloader-2.1.0}/PKG-INFO +4 -4
- {scdataloader-2.0.11 → scdataloader-2.1.0}/README.md +2 -2
- {scdataloader-2.0.11 → scdataloader-2.1.0}/pyproject.toml +2 -2
- {scdataloader-2.0.11 → scdataloader-2.1.0}/scdataloader/data.py +18 -6
- {scdataloader-2.0.11 → scdataloader-2.1.0}/scdataloader/preprocess.py +13 -6
- {scdataloader-2.0.11 → scdataloader-2.1.0}/scdataloader/utils.py +42 -15
- {scdataloader-2.0.11 → scdataloader-2.1.0}/.gitignore +0 -0
- {scdataloader-2.0.11 → scdataloader-2.1.0}/LICENSE +0 -0
- {scdataloader-2.0.11 → scdataloader-2.1.0}/scdataloader/__init__.py +0 -0
- {scdataloader-2.0.11 → scdataloader-2.1.0}/scdataloader/__main__.py +0 -0
- {scdataloader-2.0.11 → scdataloader-2.1.0}/scdataloader/base.py +0 -0
- {scdataloader-2.0.11 → scdataloader-2.1.0}/scdataloader/collator.py +0 -0
- {scdataloader-2.0.11 → scdataloader-2.1.0}/scdataloader/config.py +0 -0
- {scdataloader-2.0.11 → scdataloader-2.1.0}/scdataloader/datamodule.py +0 -0
- {scdataloader-2.0.11 → scdataloader-2.1.0}/scdataloader/mapped.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: scdataloader
|
|
3
|
-
Version: 2.0.11
|
|
3
|
+
Version: 2.1.0
|
|
4
4
|
Summary: a dataloader for single cell data in lamindb
|
|
5
5
|
Project-URL: repository, https://github.com/jkobject/scDataLoader
|
|
6
6
|
Author-email: jkobject <jkobject@gmail.com>
|
|
@@ -14,7 +14,7 @@ Requires-Dist: cellxgene-census>=0.1.0
|
|
|
14
14
|
Requires-Dist: django>=4.0.0
|
|
15
15
|
Requires-Dist: ipykernel>=6.20.0
|
|
16
16
|
Requires-Dist: jupytext>=1.16.0
|
|
17
|
-
Requires-Dist: lamindb[
|
|
17
|
+
Requires-Dist: lamindb[gcp]==2.1.1
|
|
18
18
|
Requires-Dist: leidenalg>=0.8.0
|
|
19
19
|
Requires-Dist: lightning>=2.3.0
|
|
20
20
|
Requires-Dist: matplotlib>=3.5.0
|
|
@@ -162,7 +162,7 @@ adata = preprocessor(adata)
|
|
|
162
162
|
|
|
163
163
|
art = ln.Artifact(adata, description="test")
|
|
164
164
|
art.save()
|
|
165
|
-
ln.Collection(art,
|
|
165
|
+
ln.Collection(art, key="test", description="test").save()
|
|
166
166
|
|
|
167
167
|
datamodule = DataModule(
|
|
168
168
|
collection_name="test",
|
|
@@ -376,7 +376,7 @@ using absolute paths. You need to do 3 things:
|
|
|
376
376
|
|
|
377
377
|
```python
|
|
378
378
|
import lamin as ln
|
|
379
|
-
ln.Storage.
|
|
379
|
+
ln.Storage.to_dataframe(limit=None)
|
|
380
380
|
# view what is your current storage id (in my case it was GZgLW1TQ)
|
|
381
381
|
ln.Storage.filter(uid="GZgLW1TI").update(
|
|
382
382
|
root=Path("your_new_locations").as_posix().rstrip("/")
|
|
@@ -120,7 +120,7 @@ adata = preprocessor(adata)
|
|
|
120
120
|
|
|
121
121
|
art = ln.Artifact(adata, description="test")
|
|
122
122
|
art.save()
|
|
123
|
-
ln.Collection(art,
|
|
123
|
+
ln.Collection(art, key="test", description="test").save()
|
|
124
124
|
|
|
125
125
|
datamodule = DataModule(
|
|
126
126
|
collection_name="test",
|
|
@@ -334,7 +334,7 @@ using absolute paths. You need to do 3 things:
|
|
|
334
334
|
|
|
335
335
|
```python
|
|
336
336
|
import lamin as ln
|
|
337
|
-
ln.Storage.
|
|
337
|
+
ln.Storage.to_dataframe(limit=None)
|
|
338
338
|
# view what is your current storage id (in my case it was GZgLW1TQ)
|
|
339
339
|
ln.Storage.filter(uid="GZgLW1TI").update(
|
|
340
340
|
root=Path("your_new_locations").as_posix().rstrip("/")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "scdataloader"
|
|
3
|
-
version = "2.0.11"
|
|
3
|
+
version = "2.1.0"
|
|
4
4
|
description = "a dataloader for single cell data in lamindb"
|
|
5
5
|
authors = [
|
|
6
6
|
{name = "jkobject", email = "jkobject@gmail.com"}
|
|
@@ -11,7 +11,7 @@ requires-python = ">=3.10,<3.13"
|
|
|
11
11
|
keywords = ["scRNAseq", "dataloader", "pytorch", "lamindb", "scPRINT"]
|
|
12
12
|
dependencies = [
|
|
13
13
|
"numpy<=2.2.0",
|
|
14
|
-
"lamindb[
|
|
14
|
+
"lamindb[gcp]==2.1.1",
|
|
15
15
|
"cellxgene-census>=0.1.0",
|
|
16
16
|
"torch>=2.2.0",
|
|
17
17
|
"pytorch-lightning>=2.3.0",
|
|
@@ -282,19 +282,25 @@ class Dataset(torchDataset):
|
|
|
282
282
|
elif clss == "cell_type_ontology_term_id":
|
|
283
283
|
parentdf = (
|
|
284
284
|
bt.CellType.filter()
|
|
285
|
-
.
|
|
285
|
+
.to_dataframe(
|
|
286
|
+
limit=None, include=["parents__ontology_id", "ontology_id"]
|
|
287
|
+
)
|
|
286
288
|
.set_index("ontology_id")
|
|
287
289
|
)
|
|
288
290
|
elif clss == "tissue_ontology_term_id":
|
|
289
291
|
parentdf = (
|
|
290
292
|
bt.Tissue.filter()
|
|
291
|
-
.
|
|
293
|
+
.to_dataframe(
|
|
294
|
+
limit=None, include=["parents__ontology_id", "ontology_id"]
|
|
295
|
+
)
|
|
292
296
|
.set_index("ontology_id")
|
|
293
297
|
)
|
|
294
298
|
elif clss == "disease_ontology_term_id":
|
|
295
299
|
parentdf = (
|
|
296
300
|
bt.Disease.filter()
|
|
297
|
-
.
|
|
301
|
+
.to_dataframe(
|
|
302
|
+
limit=None, include=["parents__ontology_id", "ontology_id"]
|
|
303
|
+
)
|
|
298
304
|
.set_index("ontology_id")
|
|
299
305
|
)
|
|
300
306
|
elif clss in [
|
|
@@ -304,19 +310,25 @@ class Dataset(torchDataset):
|
|
|
304
310
|
]:
|
|
305
311
|
parentdf = (
|
|
306
312
|
bt.DevelopmentalStage.filter()
|
|
307
|
-
.
|
|
313
|
+
.to_dataframe(
|
|
314
|
+
limit=None, include=["parents__ontology_id", "ontology_id"]
|
|
315
|
+
)
|
|
308
316
|
.set_index("ontology_id")
|
|
309
317
|
)
|
|
310
318
|
elif clss == "assay_ontology_term_id":
|
|
311
319
|
parentdf = (
|
|
312
320
|
bt.ExperimentalFactor.filter()
|
|
313
|
-
.
|
|
321
|
+
.to_dataframe(
|
|
322
|
+
limit=None, include=["parents__ontology_id", "ontology_id"]
|
|
323
|
+
)
|
|
314
324
|
.set_index("ontology_id")
|
|
315
325
|
)
|
|
316
326
|
elif clss == "self_reported_ethnicity_ontology_term_id":
|
|
317
327
|
parentdf = (
|
|
318
328
|
bt.Ethnicity.filter()
|
|
319
|
-
.
|
|
329
|
+
.to_dataframe(
|
|
330
|
+
limit=None, include=["parents__ontology_id", "ontology_id"]
|
|
331
|
+
)
|
|
320
332
|
.set_index("ontology_id")
|
|
321
333
|
)
|
|
322
334
|
|
|
@@ -264,8 +264,9 @@ class Preprocessor:
|
|
|
264
264
|
# For genes that are already ENS IDs, use them directly
|
|
265
265
|
prev_size = adata.shape[1]
|
|
266
266
|
# Handle symbol genes
|
|
267
|
+
cols_to_use = adata.var.columns.difference(genesdf.columns)
|
|
267
268
|
if self.is_symbol:
|
|
268
|
-
new_var = adata.var.merge(
|
|
269
|
+
new_var = adata.var[cols_to_use].merge(
|
|
269
270
|
genesdf.drop_duplicates("symbol").set_index("symbol", drop=False),
|
|
270
271
|
left_index=True,
|
|
271
272
|
right_index=True,
|
|
@@ -275,7 +276,7 @@ class Preprocessor:
|
|
|
275
276
|
adata = adata[:, new_var.index]
|
|
276
277
|
new_var.index = new_var["ensembl_gene_id"]
|
|
277
278
|
else:
|
|
278
|
-
new_var = adata.var.merge(
|
|
279
|
+
new_var = adata.var[cols_to_use].merge(
|
|
279
280
|
genesdf, left_index=True, right_index=True, how="inner"
|
|
280
281
|
)
|
|
281
282
|
adata = adata[:, new_var.index]
|
|
@@ -654,7 +655,7 @@ class LaminPreprocessor(Preprocessor):
|
|
|
654
655
|
# Reconstruct collection using keys
|
|
655
656
|
dataset = ln.Collection(
|
|
656
657
|
[ln.Artifact.filter(key=k).one() for k in files],
|
|
657
|
-
|
|
658
|
+
key=name,
|
|
658
659
|
description=description,
|
|
659
660
|
)
|
|
660
661
|
dataset.save()
|
|
@@ -819,9 +820,15 @@ def additional_postprocess(adata):
|
|
|
819
820
|
# else:
|
|
820
821
|
print("starting post processing")
|
|
821
822
|
sc.pp.neighbors(adata, use_rep="X_pca")
|
|
822
|
-
sc.tl.leiden(
|
|
823
|
-
|
|
824
|
-
|
|
823
|
+
sc.tl.leiden(
|
|
824
|
+
adata, key_added="leiden_2", resolution=2.0, flavor="igraph", n_iterations=2
|
|
825
|
+
)
|
|
826
|
+
sc.tl.leiden(
|
|
827
|
+
adata, key_added="leiden_1", resolution=1.0, flavor="igraph", n_iterations=2
|
|
828
|
+
)
|
|
829
|
+
sc.tl.leiden(
|
|
830
|
+
adata, key_added="leiden_0.5", resolution=0.5, flavor="igraph", n_iterations=2
|
|
831
|
+
)
|
|
825
832
|
sc.tl.umap(adata)
|
|
826
833
|
mid = adata.uns["dataset_id"] if "dataset_id" in adata.uns else "unknown_id"
|
|
827
834
|
sc.pl.umap(
|
|
@@ -389,7 +389,7 @@ def load_dataset_local(
|
|
|
389
389
|
except IntegrityError:
|
|
390
390
|
print(f"File {file.key} already exists in storage")
|
|
391
391
|
saved_files.append(file)
|
|
392
|
-
dataset = ln.Collection(saved_files,
|
|
392
|
+
dataset = ln.Collection(saved_files, key=name, description=description)
|
|
393
393
|
dataset.save()
|
|
394
394
|
return dataset
|
|
395
395
|
|
|
@@ -412,7 +412,7 @@ def load_genes(
|
|
|
412
412
|
for organism in organisms:
|
|
413
413
|
genesdf = bt.Gene.filter(
|
|
414
414
|
organism_id=bt.Organism.filter(ontology_id=organism).first().id
|
|
415
|
-
).
|
|
415
|
+
).to_dataframe(limit=None)
|
|
416
416
|
genesdf.loc[genesdf.ensembl_gene_id.isna(), "ensembl_gene_id"] = genesdf.loc[
|
|
417
417
|
genesdf.ensembl_gene_id.isna(), "stable_id"
|
|
418
418
|
]
|
|
@@ -440,6 +440,7 @@ def load_genes(
|
|
|
440
440
|
"ncbi_gene_ids",
|
|
441
441
|
"synonyms",
|
|
442
442
|
"description",
|
|
443
|
+
"abbr",
|
|
443
444
|
]:
|
|
444
445
|
if col in organismdf.columns:
|
|
445
446
|
organismdf.drop(columns=[col], inplace=True)
|
|
@@ -453,12 +454,16 @@ def _adding_scbasecamp_genes(
|
|
|
453
454
|
):
|
|
454
455
|
if len(species) == 0:
|
|
455
456
|
species = set(
|
|
456
|
-
bt.Organism.using("laminlabs/arc-virtual-cell-atlas")
|
|
457
|
+
bt.Organism.using("laminlabs/arc-virtual-cell-atlas")
|
|
458
|
+
.to_dataframe(limit=None)
|
|
459
|
+
.ontology_id
|
|
457
460
|
) - set(["NCBITaxon:10090", "NCBITaxon:9606"])
|
|
458
461
|
species = list(species)
|
|
459
462
|
for i in set(
|
|
460
|
-
bt.Organism.using("laminlabs/arc-virtual-cell-atlas")
|
|
461
|
-
|
|
463
|
+
bt.Organism.using("laminlabs/arc-virtual-cell-atlas")
|
|
464
|
+
.to_dataframe(limit=None)
|
|
465
|
+
.ontology_id
|
|
466
|
+
) - set(bt.Organism.filter().to_dataframe(limit=None).ontology_id):
|
|
462
467
|
print(i)
|
|
463
468
|
rec = (
|
|
464
469
|
bt.Organism.using("laminlabs/arc-virtual-cell-atlas")
|
|
@@ -485,7 +490,7 @@ def _adding_scbasecamp_genes(
|
|
|
485
490
|
bt.Gene.using("laminlabs/arc-virtual-cell-atlas")
|
|
486
491
|
.filter(organism__ontology_id=i)
|
|
487
492
|
.all()
|
|
488
|
-
.
|
|
493
|
+
.to_dataframe(limit=None)
|
|
489
494
|
)
|
|
490
495
|
genes = []
|
|
491
496
|
org = bt.Organism.filter(ontology_id=i).one()
|
|
@@ -545,7 +550,11 @@ def populate_my_ontology(
|
|
|
545
550
|
if len(celltypes) == 0:
|
|
546
551
|
bt.CellType.import_source()
|
|
547
552
|
else:
|
|
548
|
-
names =
|
|
553
|
+
names = (
|
|
554
|
+
bt.CellType.public().to_dataframe().index
|
|
555
|
+
if not celltypes
|
|
556
|
+
else celltypes
|
|
557
|
+
)
|
|
549
558
|
records = bt.CellType.from_values(names, field="ontology_id")
|
|
550
559
|
ln.save(records)
|
|
551
560
|
elem = bt.CellType(name="unknown", ontology_id="unknown")
|
|
@@ -555,7 +564,7 @@ def populate_my_ontology(
|
|
|
555
564
|
if organisms_clade is not None:
|
|
556
565
|
records = []
|
|
557
566
|
for organism_clade in organisms_clade:
|
|
558
|
-
names = bt.Organism.public(organism=organism_clade).
|
|
567
|
+
names = bt.Organism.public(organism=organism_clade).to_dataframe().index
|
|
559
568
|
source = bt.Source.filter(name="ensembl", organism=organism_clade).last()
|
|
560
569
|
for name in names:
|
|
561
570
|
try:
|
|
@@ -581,7 +590,7 @@ def populate_my_ontology(
|
|
|
581
590
|
ln.save([elem], ignore_conflicts=True)
|
|
582
591
|
# Phenotype
|
|
583
592
|
if sex is not None:
|
|
584
|
-
names = bt.Phenotype.public().
|
|
593
|
+
names = bt.Phenotype.public().to_dataframe().index if not sex else sex
|
|
585
594
|
source = bt.Source.filter(name="pato").first()
|
|
586
595
|
records = [
|
|
587
596
|
bt.Phenotype.from_source(ontology_id=i, source=source) for i in names
|
|
@@ -594,7 +603,11 @@ def populate_my_ontology(
|
|
|
594
603
|
if len(ethnicities) == 0:
|
|
595
604
|
bt.Ethnicity.import_source()
|
|
596
605
|
else:
|
|
597
|
-
names =
|
|
606
|
+
names = (
|
|
607
|
+
bt.Ethnicity.public().to_dataframe().index
|
|
608
|
+
if not ethnicities
|
|
609
|
+
else ethnicities
|
|
610
|
+
)
|
|
598
611
|
records = bt.Ethnicity.from_values(names, field="ontology_id")
|
|
599
612
|
ln.save(records)
|
|
600
613
|
elem = bt.Ethnicity(name="unknown", ontology_id="unknown")
|
|
@@ -604,7 +617,11 @@ def populate_my_ontology(
|
|
|
604
617
|
if len(assays) == 0:
|
|
605
618
|
bt.ExperimentalFactor.import_source()
|
|
606
619
|
else:
|
|
607
|
-
names =
|
|
620
|
+
names = (
|
|
621
|
+
bt.ExperimentalFactor.public().to_dataframe().index
|
|
622
|
+
if not assays
|
|
623
|
+
else assays
|
|
624
|
+
)
|
|
608
625
|
records = bt.ExperimentalFactor.from_values(names, field="ontology_id")
|
|
609
626
|
ln.save(records)
|
|
610
627
|
elem = bt.ExperimentalFactor(name="unknown", ontology_id="unknown").save()
|
|
@@ -616,7 +633,11 @@ def populate_my_ontology(
|
|
|
616
633
|
if len(tissues) == 0:
|
|
617
634
|
bt.Tissue.import_source()
|
|
618
635
|
else:
|
|
619
|
-
names =
|
|
636
|
+
names = (
|
|
637
|
+
bt.Tissue.public().to_dataframe().index
|
|
638
|
+
if not tissues
|
|
639
|
+
else tissues
|
|
640
|
+
)
|
|
620
641
|
records = bt.Tissue.from_values(names, field="ontology_id")
|
|
621
642
|
ln.save(records)
|
|
622
643
|
elem = bt.Tissue(name="unknown", ontology_id="unknown").save()
|
|
@@ -629,7 +650,7 @@ def populate_my_ontology(
|
|
|
629
650
|
bt.DevelopmentalStage.import_source(source=source)
|
|
630
651
|
else:
|
|
631
652
|
names = (
|
|
632
|
-
bt.DevelopmentalStage.public().
|
|
653
|
+
bt.DevelopmentalStage.public().to_dataframe().index
|
|
633
654
|
if not dev_stages
|
|
634
655
|
else dev_stages
|
|
635
656
|
)
|
|
@@ -641,7 +662,11 @@ def populate_my_ontology(
|
|
|
641
662
|
if len(diseases) == 0:
|
|
642
663
|
bt.Disease.import_source()
|
|
643
664
|
else:
|
|
644
|
-
names =
|
|
665
|
+
names = (
|
|
666
|
+
bt.Disease.public().to_dataframe().index
|
|
667
|
+
if not diseases
|
|
668
|
+
else diseases
|
|
669
|
+
)
|
|
645
670
|
records = bt.Disease.from_values(names, field="ontology_id")
|
|
646
671
|
ln.save(records)
|
|
647
672
|
bt.Disease(name="normal", ontology_id="PATO:0000461").save()
|
|
@@ -650,7 +675,9 @@ def populate_my_ontology(
|
|
|
650
675
|
for organism in genes_from:
|
|
651
676
|
# convert onto to name
|
|
652
677
|
organism = bt.Organism.filter(ontology_id=organism).one().name
|
|
653
|
-
names = bt.Gene.public(organism=organism).
|
|
678
|
+
names = bt.Gene.public(organism=organism).to_dataframe()[
|
|
679
|
+
"ensembl_gene_id"
|
|
680
|
+
]
|
|
654
681
|
|
|
655
682
|
# Process names in blocks of 10,000 elements
|
|
656
683
|
block_size = 10000
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|