scdataloader 1.9.2__py3-none-any.whl → 2.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scdataloader/__main__.py +4 -5
- scdataloader/collator.py +76 -78
- scdataloader/config.py +25 -9
- scdataloader/data.json +384 -0
- scdataloader/data.py +134 -77
- scdataloader/datamodule.py +638 -245
- scdataloader/mapped.py +104 -43
- scdataloader/preprocess.py +136 -110
- scdataloader/utils.py +158 -52
- {scdataloader-1.9.2.dist-info → scdataloader-2.0.2.dist-info}/METADATA +6 -7
- scdataloader-2.0.2.dist-info/RECORD +16 -0
- {scdataloader-1.9.2.dist-info → scdataloader-2.0.2.dist-info}/WHEEL +1 -1
- scdataloader-2.0.2.dist-info/licenses/LICENSE +21 -0
- scdataloader/VERSION +0 -1
- scdataloader-1.9.2.dist-info/RECORD +0 -16
- scdataloader-1.9.2.dist-info/licenses/LICENSE +0 -674
- {scdataloader-1.9.2.dist-info → scdataloader-2.0.2.dist-info}/entry_points.txt +0 -0
scdataloader/preprocess.py
CHANGED
@@ -1,5 +1,6 @@
 import gc
-
+import time
+from typing import Callable, List, Optional, Union
 from uuid import uuid4
 
 import anndata as ad
@@ -8,13 +9,14 @@ import numpy as np
 import pandas as pd
 import scanpy as sc
 from anndata import AnnData, read_h5ad
+from django.db.utils import OperationalError
 from scipy.sparse import csr_matrix
 from upath import UPath
 
 from scdataloader import utils as data_utils
 
 FULL_LENGTH_ASSAYS = [
-    "EFO:
+    "EFO:0700016",
     "EFO:0008930",
     "EFO:0008931",
 ]
@@ -47,20 +49,21 @@ class Preprocessor:
         maxdropamount: int = 50,
         madoutlier: int = 5,
         pct_mt_outlier: int = 8,
-        batch_keys:
+        batch_keys: List[str] = [
             "assay_ontology_term_id",
             "self_reported_ethnicity_ontology_term_id",
             "sex_ontology_term_id",
             "donor_id",
             "suspension_type",
         ],
-        skip_validate: bool =
+        skip_validate: bool = True,
         additional_preprocess: Optional[Callable[[AnnData], AnnData]] = None,
         additional_postprocess: Optional[Callable[[AnnData], AnnData]] = None,
         do_postp: bool = True,
-        organisms:
+        organisms: List[str] = ["NCBITaxon:9606", "NCBITaxon:10090"],
         use_raw: bool = True,
         keepdata: bool = False,
+        drop_non_primary: bool = False,
     ) -> None:
         """
         Initializes the preprocessor and configures the workflow steps.
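The 2.0 signature types `batch_keys` and `organisms` as `List[str]` and adds the `drop_non_primary` flag. A minimal usage sketch, assuming only that `Preprocessor` is imported from this module as defined above (the h5ad path and dataset id are placeholders):

```python
# Usage sketch; argument values mirror the new 2.0 defaults from the hunk above.
import scanpy as sc

from scdataloader.preprocess import Preprocessor

prep = Preprocessor(
    batch_keys=[
        "assay_ontology_term_id",
        "self_reported_ethnicity_ontology_term_id",
        "sex_ontology_term_id",
        "donor_id",
        "suspension_type",
    ],
    skip_validate=True,  # new default in 2.0
    organisms=["NCBITaxon:9606", "NCBITaxon:10090"],
    drop_non_primary=False,  # new flag: non-primary cells are kept by default
)
adata = sc.read_h5ad("my_dataset.h5ad")  # placeholder path
adata = prep(adata, dataset_id="my_dataset")  # __call__ runs the pipeline
```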
@@ -108,6 +111,8 @@ class Preprocessor:
                 Defaults to False.
             keepdata (bool, optional): Determines whether to keep the data in the AnnData object.
                 Defaults to False.
+            drop_non_primary (bool, optional): Determines whether to drop non-primary cells.
+                Defaults to False.
         """
         self.filter_gene_by_counts = filter_gene_by_counts
         self.filter_cell_by_counts = filter_cell_by_counts
@@ -123,6 +128,7 @@ class Preprocessor:
         self.min_valid_genes_id = min_valid_genes_id
         self.min_nnz_genes = min_nnz_genes
         self.maxdropamount = maxdropamount
+        self.drop_non_primary = drop_non_primary
         self.madoutlier = madoutlier
         self.n_hvg_for_postp = n_hvg_for_postp
         self.pct_mt_outlier = pct_mt_outlier
@@ -139,13 +145,14 @@ class Preprocessor:
         if self.additional_preprocess is not None:
             adata = self.additional_preprocess(adata)
         if "organism_ontology_term_id" not in adata[0].obs.columns:
-
-            "organism_ontology_term_id
-
-
-
-
-
+            if "organism_ontology_term_id" in adata.uns:
+                adata.obs["organism_ontology_term_id"] = adata.uns[
+                    "organism_ontology_term_id"
+                ]
+            else:
+                raise ValueError(
+                    "organism_ontology_term_id not found in adata.obs, you need to add an ontology term id for the organism of your anndata"
+                )
         if adata.obs["organism_ontology_term_id"].iloc[0] not in self.organisms:
             raise ValueError(
                 "we cannot work with this organism",
@@ -161,8 +168,8 @@ class Preprocessor:
             if np.abs(adata[:50_000].X.astype(int) - adata[:50_000].X).sum():
                 print("X was not raw counts, using 'counts' layer")
                 adata.X = adata.layers["counts"].copy()
-            print("Dropping layers: ", adata.layers.keys())
         if not self.keepdata:
+            print("Dropping layers: ", adata.layers.keys())
             del adata.layers
         if len(adata.varm.keys()) > 0 and not self.keepdata:
             del adata.varm
@@ -170,6 +177,8 @@ class Preprocessor:
             del adata.obsm
         if len(adata.obsp.keys()) > 0 and not self.keepdata:
             del adata.obsp
+        if len(adata.varp.keys()) > 0 and not self.keepdata:
+            del adata.varp
         # check that it is a count
 
         print("checking raw counts")
@@ -188,7 +197,7 @@ class Preprocessor:
         # if not available count drop
         prevsize = adata.shape[0]
         # dropping non primary
-        if "is_primary_data" in adata.obs.columns:
+        if "is_primary_data" in adata.obs.columns and self.drop_non_primary:
             adata = adata[adata.obs.is_primary_data]
             if adata.shape[0] < self.min_dataset_size:
                 raise Exception("Dataset dropped due to too many secondary cells")
@@ -213,13 +222,10 @@ class Preprocessor:
             min_genes=self.min_nnz_genes,
         )
         # if lost > 50% of the dataset, drop dataset
-
-        genesdf = data_utils.load_genes(adata.obs.organism_ontology_term_id.iloc[0])
-
-        if prevsize / adata.shape[0] > self.maxdropamount:
+        if prevsize / (adata.shape[0] + 1) > self.maxdropamount:
             raise Exception(
                 "Dataset dropped due to low expressed genes and unexpressed cells: factor of "
-                + str(prevsize / adata.shape[0])
+                + str(prevsize / (adata.shape[0] + 1))
             )
         if adata.shape[0] < self.min_dataset_size:
             raise Exception(
@@ -232,60 +238,39 @@ class Preprocessor:
             )
         )
 
-        #
-
-        all_ens = adata.var.index.str.match(r"ENS.*\d{6,}$").all()
-
-        if not has_ens:
-            print("No ENS genes found, assuming gene symbols...")
-        elif not all_ens:
-            print("Mix of ENS and gene symbols found, converting all to ENS IDs...")
-
+        # load the genes
+        genesdf = data_utils.load_genes(adata.obs.organism_ontology_term_id.iloc[0])
         genesdf["ensembl_gene_id"] = genesdf.index
 
         # For genes that are already ENS IDs, use them directly
-
-        symbol_mask = ~ens_mask
-
+        prev_size = adata.shape[1]
         # Handle symbol genes
-        if
-
+        if self.is_symbol:
+            new_var = adata.var.merge(
                 genesdf.drop_duplicates("symbol").set_index("symbol", drop=False),
                 left_index=True,
                 right_index=True,
                 how="inner",
             )
-
-
-
-
+            new_var["symbol"] = new_var.index
+            adata = adata[:, new_var.index]
+            new_var.index = new_var["ensembl_gene_id"]
+        else:
+            new_var = adata.var.merge(
                 genesdf, left_index=True, right_index=True, how="inner"
             )
+        adata = adata[:, new_var.index]
+        print(f"Removed {prev_size - adata.shape[1]} genes not known to the ontology")
+        prev_size = adata.shape[1]
 
-
-        if symbol_mask.any() and ens_mask.any():
-            var = pd.concat([symbol_var, ens_var])
-        elif symbol_mask.any():
-            var = symbol_var
-        else:
-            var = ens_var
-
-        adata = adata[:, var.index]
-        # var = var.sort_values(by="ensembl_gene_id").set_index("ensembl_gene_id")
-        # Update adata with combined genes
-        if "ensembl_gene_id" in var.columns:
-            adata.var = var.set_index("ensembl_gene_id")
-        else:
-            adata.var = var
+        adata.var = new_var
         # Drop duplicate genes, keeping first occurrence
         adata = adata[:, ~adata.var.index.duplicated(keep="first")]
+        print(f"Removed {prev_size - adata.shape[1]} duplicate genes")
 
-
-        print(f"Removed {len(adata.var.index) - len(intersect_genes)} genes.")
-        if len(intersect_genes) < self.min_valid_genes_id:
+        if adata.shape[1] < self.min_valid_genes_id:
             raise Exception("Dataset dropped due to too many genes not mapping to it")
-
-        # marking unseen genes
+
         unseen = set(genesdf.index) - set(adata.var.index)
         # adding them to adata
         emptyda = ad.AnnData(
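The rewritten mapping block replaces the old mask-and-concat logic with a single inner `merge` against the ontology gene table, keyed either on symbols or on Ensembl IDs. A self-contained sketch of the same join pattern on toy data (the table contents below are illustrative, not what `load_genes` returns):

```python
import anndata as ad
import numpy as np
import pandas as pd

# Toy ontology table indexed by Ensembl gene ID, in the spirit of load_genes().
genesdf = pd.DataFrame(
    {"symbol": ["TP53", "BRCA1"]}, index=["ENSG00000141510", "ENSG00000012048"]
)
genesdf["ensembl_gene_id"] = genesdf.index

# Toy AnnData whose var index holds gene symbols, one of them unmappable.
adata = ad.AnnData(
    X=np.ones((2, 3), dtype=np.float32),
    var=pd.DataFrame(index=["TP53", "BRCA1", "UNKNOWN"]),
)

# Inner-join var on symbols: unknown genes drop out, Ensembl IDs come in.
new_var = adata.var.merge(
    genesdf.drop_duplicates("symbol").set_index("symbol", drop=False),
    left_index=True,
    right_index=True,
    how="inner",
)
new_var["symbol"] = new_var.index
adata = adata[:, new_var.index].copy()  # subset to the mapped genes
new_var.index = new_var["ensembl_gene_id"]
adata.var = new_var  # var is now keyed by Ensembl ID
print(adata.var)
```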
@@ -293,6 +278,9 @@ class Preprocessor:
             var=pd.DataFrame(index=list(unseen)),
             obs=pd.DataFrame(index=adata.obs.index),
         )
+        print(
+            f"Added {len(unseen)} genes in the ontology but not present in the dataset"
+        )
         adata = ad.concat([adata, emptyda], axis=1, join="outer", merge="only")
         # do a validation function
         adata.uns["unseen_genes"] = list(unseen)
@@ -330,7 +318,7 @@ class Preprocessor:
         # QC
 
         adata.var[genesdf.columns] = genesdf.loc[adata.var.index]
-        print("
+        print("starting QC")
         sc.pp.calculate_qc_metrics(
             adata, qc_vars=["mt", "ribo", "hb"], inplace=True, percent_top=[20]
         )
@@ -348,7 +336,7 @@ class Preprocessor:
         )
         total_outliers = (adata.obs["outlier"] | adata.obs["mt_outlier"]).sum()
         total_cells = adata.shape[0]
-        percentage_outliers = (total_outliers / total_cells) * 100
+        percentage_outliers = (total_outliers / (total_cells + 1)) * 100
         print(
             f"Seeing {total_outliers} outliers ({percentage_outliers:.2f}% of total dataset):"
         )
@@ -374,6 +362,8 @@ class Preprocessor:
             adata.obs["batches"] = adata.obs[batches].apply(
                 lambda x: ",".join(x.dropna().astype(str)), axis=1
             )
+        if "highly_variable" in adata.var.columns:
+            adata.var = adata.var.drop(columns=["highly_variable"])
         if self.n_hvg_for_postp:
             try:
                 sc.pp.highly_variable_genes(
@@ -395,12 +385,15 @@ class Preprocessor:
                     subset=False,
                     layer="norm",
                 )
-
+            print("starting PCA")
             adata.obsm["X_pca"] = sc.pp.pca(
-
-
-
-
+                (
+                    adata.layers["norm"][:, adata.var["highly_variable"]]
+                    if "highly_variable" in adata.var.columns
+                    else adata.layers["norm"]
+                ),
+                n_comps=50 if adata.shape[0] > 1000 else adata.shape[0] // 20,
+                chunked=adata.shape[0] > 100_000,
             )
 
             # additional
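The new PCA call subsets the normalized layer to highly variable genes when that annotation exists, shrinks `n_comps` for very small datasets, and switches to chunked (incremental) PCA above 100k cells. A rough standalone equivalent, assuming `adata.layers["norm"]` was already populated by the earlier normalization step:

```python
import scanpy as sc

def pca_like_preprocessor(adata):
    # Prefer the highly-variable subset of the normalized layer when present.
    X = (
        adata.layers["norm"][:, adata.var["highly_variable"].values]
        if "highly_variable" in adata.var.columns
        else adata.layers["norm"]
    )
    adata.obsm["X_pca"] = sc.pp.pca(
        X,
        n_comps=50 if adata.shape[0] > 1000 else adata.shape[0] // 20,
        chunked=adata.shape[0] > 100_000,  # incremental PCA for very large data
    )
    return adata
```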
@@ -464,13 +457,15 @@ class LaminPreprocessor(Preprocessor):
         *args,
         cache: bool = True,
         keep_files: bool = True,
-
+        force_lamin_cache: bool = False,
+        assays_to_drop: List[str] = ["EFO:0008939"],
         **kwargs,
     ):
         super().__init__(*args, **kwargs)
         self.cache = cache
         self.keep_files = keep_files
-        self.
+        self.force_lamin_cache = force_lamin_cache
+        self.assays_to_drop = assays_to_drop
 
     def __call__(
         self,
@@ -505,19 +500,25 @@ class LaminPreprocessor(Preprocessor):
                 print(f"{file.stem_uid} is already processed... not preprocessing")
                 continue
             print(file)
+            if self.force_lamin_cache:
+                path = cache_path(file)
+                backed = read_h5ad(path, backed="r")
+            else:
+                # file.cache()
+                backed = file.open()
 
-            _ = cache_path(file) if self.force_preloaded else file.cache()
-            backed = file.open()
-            # backed = read_h5ad(path, backed="r")
             if "is_primary_data" in backed.obs.columns:
                 if backed.obs.is_primary_data.sum() == 0:
                     print(f"{file.key} only contains non primary cells.. dropping")
                     # Save the stem_uid to a file to avoid loading it again
-
-
-
+                    with open("nonprimary.txt", "a") as f:
+                        f.write(f"{file.stem_uid}\n")
+                    continue
             else:
                 print("Warning: couldn't check unicity from is_primary_data column")
+            if backed.obs.assay_ontology_term_id[0] in self.assays_to_drop:
+                print(f"{file.key} is in the assay drop list.. dropping")
+                continue
             if backed.shape[1] < 1000:
                 print(
                     f"{file.key} only contains less than 1000 genes and is likely not scRNAseq... dropping"
@@ -556,37 +557,52 @@ class LaminPreprocessor(Preprocessor):
                             block,
                             dataset_id=file.stem_uid + "_p" + str(j),
                         )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                        saved = False
+                        while not saved:
+                            try:
+                                myfile = ln.Artifact.from_anndata(
+                                    block,
+                                    description=description
+                                    + " n"
+                                    + str(i)
+                                    + " p"
+                                    + str(j)
+                                    + " ( revises file "
+                                    + str(file.stem_uid)
+                                    + " )",
+                                    version=version,
+                                )
+                                myfile.save()
+                                saved = True
+                            except OperationalError:
+                                print(
+                                    "Database locked, waiting 30 seconds and retrying..."
+                                )
+                                time.sleep(10)
                         if self.keep_files:
                             files.append(myfile)
                             del block
                         else:
                             del myfile
                             del block
-                        gc.collect()
-
                     else:
                         adata = super().__call__(adata, dataset_id=file.stem_uid)
-
-
-
-
-
-
-
+                        saved = False
+                        while not saved:
+                            try:
+                                myfile = ln.Artifact.from_anndata(
+                                    adata,
+                                    # revises=file,
+                                    description=description + " p" + str(i),
+                                    version=version,
+                                )
+                                myfile.save()
+                                saved = True
+                            except OperationalError:
+                                print(
+                                    "Database locked, waiting 10 seconds and retrying..."
+                                )
+                                time.sleep(10)
                         if self.keep_files:
                             files.append(myfile)
                             del adata
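Both save paths now wrap `ln.Artifact.from_anndata(...).save()` in a retry loop on `django.db.utils.OperationalError`, which SQLite-backed Lamin instances typically raise as "database is locked" under concurrent writers (note the first branch prints a 30-second wait but actually sleeps 10). The pattern in isolation, with a generic `save` callable standing in for the Artifact save:

```python
import time

from django.db.utils import OperationalError

def save_with_retry(save, wait_seconds=10, max_tries=None):
    """Call `save()` until it stops raising OperationalError (locked DB)."""
    tries = 0
    while True:
        try:
            return save()
        except OperationalError:
            tries += 1
            if max_tries is not None and tries >= max_tries:
                raise  # give up after max_tries attempts
            print(f"Database locked, waiting {wait_seconds} seconds and retrying...")
            time.sleep(wait_seconds)
```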
@@ -606,7 +622,7 @@ class LaminPreprocessor(Preprocessor):
                     continue
                 else:
                     raise e
-
+            gc.collect()
         # issues with KLlggfw6I6lvmbqiZm46
         if self.keep_files:
             # Reconstruct collection using keys
@@ -716,7 +732,7 @@ def additional_preprocess(adata):
             }
         }
     )  # multi ethnic will have to get renamed
-    adata.obs["cell_culture"] = False
+    adata.obs["cell_culture"] = "False"
     # if cell_type contains the word "(cell culture)" then it is a cell culture and we mark it as so and remove this from the cell type
     loc = adata.obs["cell_type_ontology_term_id"].str.contains(
         "(cell culture)", regex=False
@@ -725,7 +741,7 @@ def additional_preprocess(adata):
         adata.obs["cell_type_ontology_term_id"] = adata.obs[
             "cell_type_ontology_term_id"
         ].astype(str)
-        adata.obs.loc[loc, "cell_culture"] = True
+        adata.obs.loc[loc, "cell_culture"] = "True"
         adata.obs.loc[loc, "cell_type_ontology_term_id"] = adata.obs.loc[
             loc, "cell_type_ontology_term_id"
         ].str.replace(" (cell culture)", "")
@@ -734,7 +750,7 @@ def additional_preprocess(adata):
         "(cell culture)", regex=False
     )
     if loc.sum() > 0:
-        adata.obs.loc[loc, "cell_culture"] = True
+        adata.obs.loc[loc, "cell_culture"] = "True"
     adata.obs["tissue_ontology_term_id"] = adata.obs[
         "tissue_ontology_term_id"
     ].astype(str)
@@ -744,7 +760,7 @@ def additional_preprocess(adata):
 
     loc = adata.obs["tissue_ontology_term_id"].str.contains("(organoid)", regex=False)
     if loc.sum() > 0:
-        adata.obs.loc[loc, "cell_culture"] = True
+        adata.obs.loc[loc, "cell_culture"] = "True"
     adata.obs["tissue_ontology_term_id"] = adata.obs[
         "tissue_ontology_term_id"
     ].astype(str)
@@ -773,6 +789,7 @@ def additional_postprocess(adata):
     # sc.external.pp.harmony_integrate(adata, key="batches")
    # sc.pp.neighbors(adata, use_rep="X_pca_harmony")
    # else:
+    print("starting post processing")
     sc.pp.neighbors(adata, use_rep="X_pca")
     sc.tl.leiden(adata, key_added="leiden_2", resolution=2.0)
     sc.tl.leiden(adata, key_added="leiden_1", resolution=1.0)
@@ -791,8 +808,12 @@ def additional_postprocess(adata):
     MAXSIM = 0.94
     from collections import Counter
 
+    import bionty as bt
+
     from .config import MAIN_HUMAN_MOUSE_DEV_STAGE_MAP
 
+    remap_stages = {u: k for k, v in MAIN_HUMAN_MOUSE_DEV_STAGE_MAP.items() for u in v}
+
     adata.obs[NEWOBS] = (
         adata.obs[COL].astype(str) + "_" + adata.obs["leiden_1"].astype(str)
     )
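`remap_stages` inverts the one-to-many `MAIN_HUMAN_MOUSE_DEV_STAGE_MAP` (group key → list of member stage IDs) so each member maps back to its group; a later hunk uses it in the human branch to collapse `age_group` labels. A toy illustration of the same comprehension, with made-up ontology IDs:

```python
# Hypothetical stand-in for MAIN_HUMAN_MOUSE_DEV_STAGE_MAP: group -> members.
MAIN_MAP = {
    "HsapDv:GROUP_A": ["HsapDv:0000001", "HsapDv:0000002"],
    "HsapDv:GROUP_B": ["HsapDv:0000003"],
}

# Same comprehension as in the hunk: member stage -> its group key.
remap_stages = {u: k for k, v in MAIN_MAP.items() for u in v}
assert remap_stages["HsapDv:0000002"] == "HsapDv:GROUP_A"
```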
@@ -860,18 +881,17 @@ def additional_postprocess(adata):
             num += 1
     adata.obs[NEWOBS] = adata.obs[NEWOBS].map(merge_mapping).fillna(adata.obs[NEWOBS])
 
-    import bionty as bt
-
     stages = adata.obs["development_stage_ontology_term_id"].unique()
     if adata.obs.organism_ontology_term_id.unique() == ["NCBITaxon:9606"]:
         relabel = {i: i for i in stages}
         for stage in stages:
+            if stage in MAIN_HUMAN_MOUSE_DEV_STAGE_MAP.keys():
+                continue
             stage_obj = bt.DevelopmentalStage.filter(ontology_id=stage).first()
             parents = set([i.ontology_id for i in stage_obj.parents.filter()])
             parents = parents - set(
                 [
                     "HsapDv:0010000",
-                    "HsapDv:0000204",
                     "HsapDv:0000227",
                 ]
             )
@@ -879,9 +899,14 @@ def additional_postprocess(adata):
             for p in parents:
                 if p in MAIN_HUMAN_MOUSE_DEV_STAGE_MAP:
                     relabel[stage] = p
-        adata.obs["
-
-
+        adata.obs["age_group"] = adata.obs["development_stage_ontology_term_id"].map(
+            relabel
+        )
+        for stage in adata.obs["age_group"].unique():
+            if stage in remap_stages.keys():
+                adata.obs["age_group"] = adata.obs["age_group"].map(
+                    lambda x: remap_stages[x] if x == stage else x
+                )
     elif adata.obs.organism_ontology_term_id.unique() == ["NCBITaxon:10090"]:
         rename_mapping = {
             k: v for v, j in MAIN_HUMAN_MOUSE_DEV_STAGE_MAP.items() for k in j
@@ -890,11 +915,12 @@ def additional_postprocess(adata):
         for stage in stages:
             if stage in rename_mapping:
                 relabel[stage] = rename_mapping[stage]
-        adata.obs["
-
-
+        adata.obs["age_group"] = adata.obs["development_stage_ontology_term_id"].map(
+            relabel
+        )
     else:
-        raise ValueError("organism not supported")
+        # raise ValueError("organism not supported")
+        print("organism not supported for age labels")
     # palantir.utils.run_diffusion_maps(adata, n_components=20)
     # palantir.utils.determine_multiscale_space(adata)
     # terminal_states = palantir.utils.find_terminal_states(