scdataloader 1.6.4__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scdataloader/VERSION +1 -1
- scdataloader/__init__.py +2 -0
- scdataloader/__main__.py +38 -8
- scdataloader/collator.py +6 -2
- scdataloader/config.py +99 -0
- scdataloader/data.py +42 -35
- scdataloader/datamodule.py +123 -40
- scdataloader/mapped.py +700 -0
- scdataloader/preprocess.py +229 -86
- scdataloader/utils.py +49 -27
- {scdataloader-1.6.4.dist-info → scdataloader-1.7.0.dist-info}/METADATA +7 -6
- scdataloader-1.7.0.dist-info/RECORD +15 -0
- {scdataloader-1.6.4.dist-info → scdataloader-1.7.0.dist-info}/WHEEL +1 -1
- scdataloader-1.6.4.dist-info/RECORD +0 -14
- {scdataloader-1.6.4.dist-info → scdataloader-1.7.0.dist-info}/licenses/LICENSE +0 -0
scdataloader/preprocess.py
CHANGED
@@ -8,8 +8,9 @@ import pandas as pd
 import scanpy as sc
 from anndata import AnnData
 from scipy.sparse import csr_matrix
-
+from anndata import read_h5ad
 from scdataloader import utils as data_utils
+from upath import UPath
 
 FULL_LENGTH_ASSAYS = [
     "EFO: 0700016",

@@ -31,7 +32,7 @@ class Preprocessor:
         filter_gene_by_counts: Union[int, bool] = False,
         filter_cell_by_counts: Union[int, bool] = False,
         normalize_sum: float = 1e4,
-
+        n_hvg_for_postp: int = 0,
         use_layer: Optional[str] = None,
         is_symbol: bool = False,
         hvg_flavor: str = "seurat_v3",

@@ -45,7 +46,13 @@ class Preprocessor:
         maxdropamount: int = 50,
         madoutlier: int = 5,
         pct_mt_outlier: int = 8,
-
+        batch_keys: list[str] = [
+            "assay_ontology_term_id",
+            "self_reported_ethnicity_ontology_term_id",
+            "sex_ontology_term_id",
+            "donor_id",
+            "suspension_type",
+        ],
         skip_validate: bool = False,
         additional_preprocess: Optional[Callable[[AnnData], AnnData]] = None,
         additional_postprocess: Optional[Callable[[AnnData], AnnData]] = None,

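
The batch covariates used during post-processing are now a constructor parameter rather than a hardcoded list (the old list is visible in the removed block of the `-310,38` hunk further down). A minimal usage sketch, assuming only the constructor API shown in this diff; keys absent from `adata.obs` are skipped at runtime:

```python
from scdataloader.preprocess import Preprocessor

# illustrative configuration: define batches from assay and donor only
pp = Preprocessor(
    n_hvg_for_postp=2000,  # new in 1.7.0: HVG count used for the post-processing PCA
    batch_keys=["assay_ontology_term_id", "donor_id"],
)
# adata = pp(adata, dataset_id="my_dataset_uid")  # dataset_id is stored in adata.uns
```
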
@@ -65,7 +72,7 @@ class Preprocessor:
             Defaults to 1e4.
         log1p (bool, optional): Determines whether to apply log1p transform to the normalized data.
             Defaults to True.
-
+        n_hvg_for_postp (int or bool, optional): Determines whether to subset to highly variable genes for the PCA.
             Defaults to False.
         hvg_flavor (str, optional): Specifies the flavor of highly variable genes selection.
             See :func:`scanpy.pp.highly_variable_genes` for more details. Defaults to "seurat_v3".

@@ -96,7 +103,6 @@ class Preprocessor:
         self.filter_gene_by_counts = filter_gene_by_counts
         self.filter_cell_by_counts = filter_cell_by_counts
         self.normalize_sum = normalize_sum
-        self.subset_hvg = subset_hvg
         self.hvg_flavor = hvg_flavor
         self.binning = binning
         self.organisms = organisms

@@ -109,8 +115,9 @@ class Preprocessor:
         self.min_nnz_genes = min_nnz_genes
         self.maxdropamount = maxdropamount
         self.madoutlier = madoutlier
+        self.n_hvg_for_postp = n_hvg_for_postp
         self.pct_mt_outlier = pct_mt_outlier
-        self.
+        self.batch_keys = batch_keys
         self.length_normalize = length_normalize
         self.skip_validate = skip_validate
         self.use_layer = use_layer

@@ -118,7 +125,7 @@ class Preprocessor:
         self.do_postp = do_postp
         self.use_raw = use_raw
 
-    def __call__(self, adata) -> AnnData:
+    def __call__(self, adata, dataset_id=None) -> AnnData:
        if adata[0].obs.organism_ontology_term_id.iloc[0] not in self.organisms:
            raise ValueError(
                "we cannot work with this organism",

@@ -144,10 +151,6 @@ class Preprocessor:
             del adata.obsm
         if len(adata.obsp.keys()) > 0 and self.do_postp:
             del adata.obsp
-        if len(adata.uns.keys()) > 0:
-            del adata.uns
-        if len(adata.varp.keys()) > 0:
-            del adata.varp
         # check that it is a count
         print("checking raw counts")
         if np.abs(

@@ -209,9 +212,9 @@ class Preprocessor:
             )
         )
 
-        if self.is_symbol or not adata.var.index.str.contains("
-            if not adata.var.index.str.contains("
-                print("No
+        if self.is_symbol or not adata.var.index.str.contains("ENS").any():
+            if not adata.var.index.str.contains("ENS").any():
+                print("No ENS genes found, assuming gene symbols...")
             genesdf["ensembl_gene_id"] = genesdf.index
             var = (
                 adata.var.merge(

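
The identifier check now matches the broader prefix "ENS", so non-human Ensembl IDs (e.g. mouse `ENSMUSG...`) are also recognized as Ensembl rather than falling through to the gene-symbol path. The heuristic in isolation (plain pandas, illustrative IDs):

```python
import pandas as pd

symbols = pd.Index(["CD4", "FOXP3", "GAPDH"])
human_ids = pd.Index(["ENSG00000111640"])
mouse_ids = pd.Index(["ENSMUSG00000057666"])

# the 1.7.0 test: any entry containing "ENS" means the index holds Ensembl IDs
for idx in (symbols, human_ids, mouse_ids):
    print(idx.str.contains("ENS").any())  # False, True, True
```
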
@@ -243,9 +246,13 @@ class Preprocessor:
         adata = ad.concat([adata, emptyda], axis=1, join="outer", merge="only")
         # do a validation function
         adata.uns["unseen_genes"] = list(unseen)
+        if dataset_id is not None:
+            adata.uns["dataset_id"] = dataset_id
         if not self.skip_validate:
             print("validating")
-            data_utils.validate(
+            data_utils.validate(
+                adata, organism=adata.obs.organism_ontology_term_id[0], need_all=False
+            )
         # length normalization
         if (
             adata.obs["assay_ontology_term_id"].isin(FULL_LENGTH_ASSAYS).any()

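
`dataset_id` is threaded from `__call__` into `adata.uns`, and validation is now lenient about missing metadata columns (`need_all=False`, see the `utils.py` change below). A sketch of the flow; `preprocessor` here is an illustrative `Preprocessor` instance:

```python
adata = preprocessor(adata, dataset_id="abcd1234")
# -> adata.uns["dataset_id"] == "abcd1234"

# additional_postprocess (further down in this file) reuses it to name figures:
mid = adata.uns.get("dataset_id", "unknown_id")
# sc.pl.umap(..., save="_" + mid + ".png")  # e.g. figures/umap_abcd1234.png
```
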
@@ -310,38 +317,42 @@ class Preprocessor:
             )["X"]
         )
         # step 5: subset hvg
-
-
-
-
-                batch_key=self.batch_key,
-                flavor=self.hvg_flavor,
-                subset=False,
-            )
-            sc.pp.log1p(adata, layer="norm")
-            sc.pp.pca(
-                adata,
-                layer="norm",
-                n_comps=200 if adata.shape[0] > 200 else adata.shape[0] - 2,
-            )
-            sc.pp.neighbors(adata, use_rep="X_pca")
-            sc.tl.leiden(adata, key_added="leiden_2", resolution=2.0)
-            sc.tl.leiden(adata, key_added="leiden_1", resolution=1.0)
-            sc.tl.leiden(adata, key_added="leiden_0.5", resolution=0.5)
-            batches = [
-                "assay_ontology_term_id",
-                "self_reported_ethnicity_ontology_term_id",
-                "sex_ontology_term_id",
-                "development_stage_ontology_term_id",
-            ]
-            if "donor_id" in adata.obs.columns:
-                batches.append("donor_id")
-            if "suspension_type" in adata.obs.columns:
-                batches.append("suspension_type")
+            batches = []
+            for i in self.batch_keys:
+                if i in adata.obs.columns:
+                    batches.append(i)
             adata.obs["batches"] = adata.obs[batches].apply(
                 lambda x: ",".join(x.dropna().astype(str)), axis=1
             )
-
+            if self.n_hvg_for_postp:
+                try:
+                    sc.pp.highly_variable_genes(
+                        adata,
+                        n_top_genes=self.n_hvg_for_postp,
+                        batch_key="batches",
+                        flavor=self.hvg_flavor,
+                        subset=False,
+                        layer="norm",
+                    )
+                except (ValueError, ZeroDivisionError) as e:
+                    print("retrying with span")
+                    sc.pp.highly_variable_genes(
+                        adata,
+                        n_top_genes=self.n_hvg_for_postp,
+                        # batch_key="batches",
+                        flavor=self.hvg_flavor,
+                        span=0.5,
+                        subset=False,
+                        layer="norm",
+                    )
+
+            adata.obsm["X_pca"] = sc.pp.pca(
+                adata.layers["norm"][:, adata.var.highly_variable]
+                if "highly_variable" in adata.var.columns
+                else adata.layers["norm"],
+                n_comps=200 if adata.shape[0] > 200 else adata.shape[0] - 2,
+            )
+
         # additional
         if self.additional_postprocess is not None:
             adata = self.additional_postprocess(adata)

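
HVG selection is now optional (`n_hvg_for_postp`), runs on the `norm` layer against the combined `batches` column, and retries batch-free with a wider loess `span` when the per-batch seurat_v3 fit fails (scanpy surfaces this as `ValueError` or `ZeroDivisionError` on small or degenerate batches); the PCA then restricts to the selected genes. The retry shape in isolation, as a hedged sketch:

```python
import scanpy as sc

def hvg_with_fallback(adata, n_top_genes, flavor="seurat_v3"):
    """Sketch of the 1.7.0 retry: batch-aware first, batch-free with span=0.5 on failure."""
    try:
        sc.pp.highly_variable_genes(
            adata, n_top_genes=n_top_genes, batch_key="batches",
            flavor=flavor, subset=False, layer="norm",
        )
    except (ValueError, ZeroDivisionError):
        # some batches may be too small for the per-batch loess fit
        sc.pp.highly_variable_genes(
            adata, n_top_genes=n_top_genes, flavor=flavor,
            span=0.5, subset=False, layer="norm",
        )
```
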
@@ -393,6 +404,7 @@ class Preprocessor:
         adata.layers[self.result_binned_key] = np.stack(binned_rows)
         adata.obsm["bin_edges"] = np.stack(bin_edges)
         print("done")
+        print(adata)
         return adata
 
 

@@ -401,22 +413,22 @@ class LaminPreprocessor(Preprocessor):
         self,
         *args,
         cache: bool = True,
-        stream: bool = False,
         keep_files: bool = True,
+        force_preloaded: bool = False,
         **kwargs,
     ):
         super().__init__(*args, **kwargs)
         self.cache = cache
-        self.stream = stream
         self.keep_files = keep_files
+        self.force_preloaded = force_preloaded
 
     def __call__(
         self,
         data: Union[ln.Collection, AnnData] = None,
-        name="preprocessed dataset",
-        description="preprocessed dataset using scprint",
-        start_at=0,
-        version=2,
+        name: str = "preprocessed dataset",
+        description: str = "preprocessed dataset using scprint",
+        start_at: int = 0,
+        version: str = "2",
     ):
         """
         format controls the different input value wrapping, including categorical

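
The `stream` option is removed and `force_preloaded` takes its place: when set, artifacts are opened from their local cache path (via the new `cache_path` helper at the bottom of this file) instead of going through `file.cache()`, which may hit the network. Note that `version` is now a string. A usage sketch with illustrative values:

```python
import lamindb as ln
from scdataloader.preprocess import LaminPreprocessor

pp = LaminPreprocessor(force_preloaded=True)  # assume files are already downloaded
# collection = ln.Collection.filter(name="my collection").one()
# pp(collection, name="preprocessed dataset", start_at=0, version="2")
```
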
@@ -437,12 +449,15 @@ class LaminPreprocessor(Preprocessor):
         elif isinstance(data, ln.Collection):
             for i, file in enumerate(data.artifacts.all()[start_at:]):
                 # use the counts matrix
-
+                i = i + start_at
+                print(i)
                 if file.stem_uid in all_ready_processed_keys:
                     print(f"{file.stem_uid} is already processed... not preprocessing")
                     continue
                 print(file)
-
+
+                path = cache_path(file) if self.force_preloaded else file.cache()
+                backed = read_h5ad(path, backed="r")
                 if backed.obs.is_primary_data.sum() == 0:
                     print(f"{file.key} only contains non primary cells.. dropping")
                     # Save the stem_uid to a file to avoid loading it again

@@ -455,12 +470,11 @@ class LaminPreprocessor(Preprocessor):
                     )
                     continue
                 if file.size <= MAXFILESIZE:
-                    adata =
+                    adata = backed.to_memory()
                     print(adata)
                 else:
                     badata = backed
                     print(badata)
-
                 try:
                     if file.size > MAXFILESIZE:
                         print(

@@ -472,16 +486,26 @@ class LaminPreprocessor(Preprocessor):
                         )
                         print("num blocks ", num_blocks)
                         for j in range(num_blocks):
+                            if j == 0 and i == 390:
+                                continue
                             start_index = j * block_size
                             end_index = min((j + 1) * block_size, badata.shape[0])
                             block = badata[start_index:end_index].to_memory()
                             print(block)
-                            block = super().__call__(
-
+                            block = super().__call__(
+                                block, dataset_id=file.stem_uid + "_p" + str(j)
+                            )
+                            myfile = ln.Artifact.from_anndata(
                                 block,
-
-
-
+                                description=description
+                                + " n"
+                                + str(i)
+                                + " p"
+                                + str(j)
+                                + " ( revises file "
+                                + str(file.key)
+                                + " )",
+                                version=version,
                             )
                             myfile.save()
                             if self.keep_files:

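
Oversized files are still processed block by block, but each block now receives its own `dataset_id` (`<stem_uid>_p<j>`) and a description recording both the file index and the block index. The `j == 0 and i == 390` guard looks like a hardcoded skip for one problematic block of a specific run. The slicing pattern in isolation, with illustrative names:

```python
import math
from anndata import read_h5ad

backed = read_h5ad("big_file.h5ad", backed="r")  # illustrative path
block_size = 100_000
num_blocks = math.ceil(backed.shape[0] / block_size)
for j in range(num_blocks):
    start = j * block_size
    end = min((j + 1) * block_size, backed.shape[0])
    block = backed[start:end].to_memory()  # only this slice is loaded into RAM
    # ... preprocess and save block with dataset_id=f"{stem_uid}_p{j}"
```
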
@@ -491,16 +515,12 @@ class LaminPreprocessor(Preprocessor):
                             del block
 
                 else:
-                    adata = super().__call__(adata)
-
-                        sc.pl.umap(adata, color=["cell_type"])
-                    except Exception:
-                        sc.pl.umap(adata, color=["cell_type_ontology_term_id"])
-                    myfile = ln.from_anndata(
+                    adata = super().__call__(adata, dataset_id=file.stem_uid)
+                    myfile = ln.Artifact.from_anndata(
                         adata,
                         revises=file,
-                        description=description,
-                        version=
+                        description=description + " p" + str(i),
+                        version=version,
                     )
                     myfile.save()
                     if self.keep_files:

@@ -672,35 +692,158 @@ def additional_preprocess(adata):
 
 
 def additional_postprocess(adata):
-    import palantir
+    # import palantir
 
     # define the "up to" 10 neighbors for each cells and add to obs
     # compute neighbors
     # need to be connectivities and same labels [cell type, assay, dataset, disease]
     # define the "neighbor" up to 10(N) cells and add to obs
     # define the "next time point" up to 5(M) cells and add to obs # step 1: filter genes
-
-
-    sc.
-
+    # if len(adata.obs["batches"].unique()) > 1:
+    #     sc.external.pp.harmony_integrate(adata, key="batches")
+    #     sc.pp.neighbors(adata, use_rep="X_pca_harmony")
+    # else:
+    sc.pp.neighbors(adata, use_rep="X_pca")
+    sc.tl.leiden(adata, key_added="leiden_2", resolution=2.0)
+    sc.tl.leiden(adata, key_added="leiden_1", resolution=1.0)
+    sc.tl.leiden(adata, key_added="leiden_0.5", resolution=0.5)
     sc.tl.umap(adata)
+    mid = adata.uns["dataset_id"] if "dataset_id" in adata.uns else "unknown_id"
     sc.pl.umap(
         adata,
+        ncols=1,
         color=["cell_type", "batches"],
+        save="_" + mid + ".png",
     )
-
-
-
-
-
-
+    COL = "cell_type_ontology_term_id"
+    NEWOBS = "clust_cell_type"
+    MINCELLS = 10
+    MAXSIM = 0.94
+    from collections import Counter
+
+    from .config import MAIN_HUMAN_MOUSE_DEV_STAGE_MAP
+
+    adata.obs[NEWOBS] = (
+        adata.obs[COL].astype(str) + "_" + adata.obs["leiden_1"].astype(str)
     )
-
-
-    for
-
-
-
-
-
+    coun = Counter(adata.obs[NEWOBS])
+    relab = {}
+    for i in adata.obs[COL].unique():
+        num = 0
+        for n, c in sorted(coun.items(), key=lambda x: x[1], reverse=True):
+            if i in n:
+                if c < MINCELLS or num == 0:
+                    relab[n] = i
+                else:
+                    relab[n] = i + "_" + str(num)
+                num += 1
+
+    adata.obs[NEWOBS] = adata.obs[NEWOBS].map(relab)
+
+    cluster_means = pd.DataFrame(
+        np.array(
+            [
+                adata.X[adata.obs[NEWOBS] == i].mean(axis=0)
+                for i in adata.obs[NEWOBS].unique()
+            ]
+        )[:, 0, :],
+        index=adata.obs[NEWOBS].unique(),
+    )
+
+    # Calculate correlation matrix between clusters
+    cluster_similarity = cluster_means.T.corr()
+    cluster_similarity.values[np.tril_indices(len(cluster_similarity), -1)] = 0
+
+    # Get pairs with similarity > 0.95
+    high_sim_pairs = []
+    for i in range(len(cluster_similarity)):
+        for j in range(i + 1, len(cluster_similarity)):
+            if (
+                cluster_similarity.iloc[i, j] > MAXSIM
+                and cluster_similarity.columns[i].split("_")[0]
+                == cluster_similarity.columns[j].split("_")[0]
+            ):
+                high_sim_pairs.append(
+                    (
+                        cluster_similarity.index[i],
+                        cluster_similarity.columns[j],
+                    )
+                )
+    # Create mapping for merging similar clusters
+    merge_mapping = {}
+    for pair in high_sim_pairs:
+        if pair[0] not in merge_mapping:
+            merge_mapping[pair[1]] = pair[0]
+        else:
+            merge_mapping[pair[1]] = merge_mapping[pair[0]]
+
+    # Apply merging
+    adata.obs[NEWOBS] = adata.obs[NEWOBS].map(merge_mapping).fillna(adata.obs[NEWOBS])
+    adata.obs[NEWOBS] = adata.obs[NEWOBS].astype(str)
+    coun = Counter(adata.obs[NEWOBS]).most_common()
+    merge_mapping = {}
+    for i in adata.obs[COL].unique():
+        num = 0
+        for j, c in coun:
+            if i in j:
+                merge_mapping[j] = i + "_" + str(num) if num > 0 else i
+                num += 1
+    adata.obs[NEWOBS] = adata.obs[NEWOBS].map(merge_mapping).fillna(adata.obs[NEWOBS])
+
+    import bionty as bt
+
+    stages = adata.obs["development_stage_ontology_term_id"].unique()
+    if adata.obs.organism_ontology_term_id.unique() == ["NCBITaxon:9606"]:
+        relabel = {i: i for i in stages}
+        for stage in stages:
+            stage_obj = bt.DevelopmentalStage.filter(ontology_id=stage).first()
+            parents = set([i.ontology_id for i in stage_obj.parents.filter()])
+            parents = parents - set(
+                [
+                    "HsapDv:0010000",
+                    "HsapDv:0000204",
+                    "HsapDv:0000227",
+                ]
+            )
+            if len(parents) > 0:
+                for p in parents:
+                    if p in MAIN_HUMAN_MOUSE_DEV_STAGE_MAP:
+                        relabel[stage] = p
+        adata.obs["simplified_dev_stage"] = adata.obs[
+            "development_stage_ontology_term_id"
+        ].map(relabel)
+    elif adata.obs.organism_ontology_term_id.unique() == ["NCBITaxon:10090"]:
+        rename_mapping = {
+            k: v for v, j in MAIN_HUMAN_MOUSE_DEV_STAGE_MAP.items() for k in j
+        }
+        relabel = {i: "unknown" for i in stages}
+        for stage in stages:
+            if stage in rename_mapping:
+                relabel[stage] = rename_mapping[stage]
+        adata.obs["simplified_dev_stage"] = adata.obs[
+            "development_stage_ontology_term_id"
+        ].map(relabel)
+    else:
+        raise ValueError("organism not supported")
+    # palantir.utils.run_diffusion_maps(adata, n_components=20)
+    # palantir.utils.determine_multiscale_space(adata)
+    # terminal_states = palantir.utils.find_terminal_states(
+    #     adata,
+    #     celltypes=adata.obs.cell_type_ontology_term_id.unique(),
+    #     celltype_column="cell_type_ontology_term_id",
+    # )
+    # sc.tl.diffmap(adata)
+    # adata.obs["heat_diff"] = 1
+    # for terminal_state in terminal_states.index.tolist():
+    #     adata.uns["iroot"] = np.where(adata.obs.index == terminal_state)[0][0]
+    #     sc.tl.dpt(adata)
+    #     adata.obs["heat_diff"] = np.minimum(
+    #         adata.obs["heat_diff"], adata.obs["dpt_pseudotime"]
+    #     )
     return adata
+
+
+def cache_path(artifact):
+    cloud_path = UPath(artifact.storage.root) / artifact.key
+    cache_path = ln.setup.settings.paths.cloud_to_local_no_update(cloud_path)
+    return cache_path

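
Beyond moving the neighbors/Leiden/UMAP steps into `additional_postprocess`, this hunk adds a `clust_cell_type` label (cell type refined by Leiden cluster, with highly correlated clusters merged above `MAXSIM`) and a `simplified_dev_stage` derived from ontology parents via `MAIN_HUMAN_MOUSE_DEV_STAGE_MAP`. The new `cache_path` helper resolves an artifact's local cache location without network access, which is what `force_preloaded` relies on; a hedged usage sketch:

```python
import lamindb as ln
from anndata import read_h5ad
from scdataloader.preprocess import cache_path

artifact = ln.Artifact.filter(key="cellxgene/my_file.h5ad").first()  # illustrative key
local = cache_path(artifact)           # pure path resolution, never downloads
backed = read_h5ad(local, backed="r")  # assumes the file is already cached locally
```
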
scdataloader/utils.py
CHANGED
@@ -147,7 +147,7 @@ def getBiomartTable(
     return res
 
 
-def validate(adata: AnnData, organism: str):
+def validate(adata: AnnData, organism: str, need_all=True):
     """
     validate checks if the adata object is valid for lamindb
 

@@ -185,7 +185,7 @@ def validate(adata: AnnData, organism: str):
         "tissue_ontology_term_id",
         "assay_ontology_term_id",
     ]:
-        if val not in adata.obs.columns:
+        if val not in adata.obs.columns and need_all:
             raise ValueError(
                 f"Column '{val}' is missing in the provided anndata object."
             )

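
`need_all` relaxes the column-presence requirement: with the default `True` a missing required obs column still raises, while `need_all=False` (how `Preprocessor.__call__` now invokes it) tolerates absent columns and only validates the ones present. Illustrative calls:

```python
from scdataloader import utils as data_utils

# strict, the 1.6.4 behaviour and still the default:
# data_utils.validate(adata, organism=adata.obs.organism_ontology_term_id[0])

# lenient, as the preprocessor now calls it:
# data_utils.validate(
#     adata, organism=adata.obs.organism_ontology_term_id[0], need_all=False
# )
```
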
@@ -193,7 +193,9 @@ def validate(adata: AnnData, organism: str):
     if not bt.Ethnicity.validate(
         adata.obs["self_reported_ethnicity_ontology_term_id"],
         field="ontology_id",
-    ).all():
+    ).all() and not set(adata.obs["self_reported_ethnicity_ontology_term_id"]) == set(
+        ["unknown"]
+    ):
         raise ValueError("Invalid ethnicity ontology term id found")
     if not bt.Organism.validate(
         adata.obs["organism_ontology_term_id"], field="ontology_id"

@@ -201,28 +203,40 @@ def validate(adata: AnnData, organism: str):
         raise ValueError("Invalid organism ontology term id found")
     if not bt.Phenotype.validate(
         adata.obs["sex_ontology_term_id"], field="ontology_id"
-    ).all():
+    ).all() and not set(adata.obs["self_reported_ethnicity_ontology_term_id"]) == set(
+        ["unknown"]
+    ):
         raise ValueError("Invalid sex ontology term id found")
     if not bt.Disease.validate(
         adata.obs["disease_ontology_term_id"], field="ontology_id"
-    ).all():
+    ).all() and not set(adata.obs["self_reported_ethnicity_ontology_term_id"]) == set(
+        ["unknown"]
+    ):
         raise ValueError("Invalid disease ontology term id found")
     if not bt.CellType.validate(
         adata.obs["cell_type_ontology_term_id"], field="ontology_id"
-    ).all():
+    ).all() and not set(adata.obs["self_reported_ethnicity_ontology_term_id"]) == set(
+        ["unknown"]
+    ):
         raise ValueError("Invalid cell type ontology term id found")
     if not bt.DevelopmentalStage.validate(
         adata.obs["development_stage_ontology_term_id"],
         field="ontology_id",
-    ).all():
+    ).all() and not set(adata.obs["self_reported_ethnicity_ontology_term_id"]) == set(
+        ["unknown"]
+    ):
         raise ValueError("Invalid dev stage ontology term id found")
     if not bt.Tissue.validate(
         adata.obs["tissue_ontology_term_id"], field="ontology_id"
-    ).all():
+    ).all() and not set(adata.obs["self_reported_ethnicity_ontology_term_id"]) == set(
+        ["unknown"]
+    ):
         raise ValueError("Invalid tissue ontology term id found")
     if not bt.ExperimentalFactor.validate(
         adata.obs["assay_ontology_term_id"], field="ontology_id"
-    ).all():
+    ).all() and not set(adata.obs["self_reported_ethnicity_ontology_term_id"]) == set(
+        ["unknown"]
+    ):
         raise ValueError("Invalid assay ontology term id found")
     if not bt.Gene.validate(
         adata.var.index, field="ensembl_gene_id", organism=organism

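
Each ontology check gains an escape hatch: a column consisting solely of the `"unknown"` sentinel no longer fails validation. Note that every one of these new conditions inspects `self_reported_ethnicity_ontology_term_id` rather than the column being validated, which reads like a copy-paste carry-over from the ethnicity check. A sketch of the pattern as applied per column (our assumption of the intent, not what 1.7.0 ships):

```python
import pandas as pd

def _all_unknown(values: pd.Series) -> bool:
    # the sentinel test used above: the column contains only "unknown"
    return set(values) == {"unknown"}

# hypothetical per-column variant of the released pattern:
# if not registry.validate(col, field="ontology_id").all() and not _all_unknown(col):
#     raise ValueError(f"Invalid {name} ontology term id found")
```
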
@@ -582,7 +596,7 @@ def populate_my_ontology(
     # cell type
     if celltypes is not None:
         if len(celltypes) == 0:
-            bt.CellType.
+            bt.CellType.import_source()
         else:
             names = bt.CellType.public().df().index if not celltypes else celltypes
             records = bt.CellType.from_values(names, field="ontology_id")

@@ -597,9 +611,9 @@ def populate_my_ontology(
         )
         source = bt.PublicSource.filter(name="ensembl", organism=organism_clade).last()
         records = [
-
-            for
-            bt.Organism.from_source(ontology_id=
+            organism_or_organismlist if isinstance(organism_or_organismlist, bt.Organism) else organism_or_organismlist[0]
+            for organism_or_organismlist in [
+                bt.Organism.from_source(ontology_id=name, source=source) for name in names
             ]
         ]
         ln.save(records)

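
The rebuilt comprehension accounts for `bt.Organism.from_source` returning either a single record or a list of candidate records; the outer comprehension normalizes each result to one record. The same idiom unrolled, assuming `names` and `source` as defined just above:

```python
import bionty as bt

records = []
for name in names:
    rec = bt.Organism.from_source(ontology_id=name, source=source)
    # from_source may hand back a list of matching records; keep the first
    records.append(rec if isinstance(rec, bt.Organism) else rec[0])
```
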
@@ -616,7 +630,7 @@ def populate_my_ontology(
     # ethnicity
     if ethnicities is not None:
         if len(ethnicities) == 0:
-            bt.Ethnicity.
+            bt.Ethnicity.import_source()
         else:
             names = bt.Ethnicity.public().df().index if not ethnicities else ethnicities
             records = bt.Ethnicity.from_values(names, field="ontology_id")

@@ -627,7 +641,7 @@ def populate_my_ontology(
     # ExperimentalFactor
     if assays is not None:
         if len(assays) == 0:
-            bt.ExperimentalFactor.
+            bt.ExperimentalFactor.import_source()
         else:
             names = bt.ExperimentalFactor.public().df().index if not assays else assays
             records = bt.ExperimentalFactor.from_values(names, field="ontology_id")

@@ -638,7 +652,7 @@ def populate_my_ontology(
     # Tissue
     if tissues is not None:
         if len(tissues) == 0:
-            bt.Tissue.
+            bt.Tissue.import_source()
         else:
             names = bt.Tissue.public().df().index if not tissues else tissues
             records = bt.Tissue.from_values(names, field="ontology_id")

@@ -647,9 +661,9 @@ def populate_my_ontology(
     # DevelopmentalStage
     if dev_stages is not None:
         if len(dev_stages) == 0:
-            bt.DevelopmentalStage.
+            bt.DevelopmentalStage.import_source()
             source = bt.PublicSource.filter(organism="mouse", name="mmusdv").last()
-            bt.DevelopmentalStage.
+            bt.DevelopmentalStage.import_source(source=source)
         else:
             names = (
                 bt.DevelopmentalStage.public().df().index

@@ -663,7 +677,7 @@ def populate_my_ontology(
     # Disease
     if diseases is not None:
         if len(diseases) == 0:
-            bt.Disease.
+            bt.Disease.import_source()
         else:
             names = bt.Disease.public().df().index if not diseases else diseases
             records = bt.Disease.from_values(names, field="ontology_id")

@@ -738,18 +752,26 @@ def translate(
         dict: the mapping for the translation
     """
     if t == "cell_type_ontology_term_id":
-        obj = bt.CellType
+        obj = bt.CellType
     elif t == "assay_ontology_term_id":
-        obj = bt.ExperimentalFactor
+        obj = bt.ExperimentalFactor
     elif t == "tissue_ontology_term_id":
-        obj = bt.Tissue
+        obj = bt.Tissue
+    elif t in [
+        "development_stage_ontology_term_id",
+        "simplified_dev_stage",
+        "age_group",
+    ]:
+        obj = bt.DevelopmentalStage
+    elif t == "disease_ontology_term_id":
+        obj = bt.Disease
+    elif t == "self_reported_ethnicity_ontology_term_id":
+        obj = bt.Ethnicity
     else:
         return None
     if type(val) is str:
-        return {val: obj.
+        return {val: obj.filter(ontology_id=val).one().name}
     elif type(val) is list or type(val) is set:
-        return {i: obj.
+        return {i: obj.filter(ontology_id=i).one().name for i in set(val)}
     elif type(val) is dict or type(val) is Counter:
-        return {
-            obj.search(k, field=obj.ontology_id).name.iloc[0]: v for k, v in val.items()
-        }
+        return {obj.filter(ontology_id=k).one().name: v for k, v in val.items()}
