pertpy 0.7.0__py3-none-any.whl → 0.9.1__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- pertpy/__init__.py +2 -1
- pertpy/data/__init__.py +61 -0
- pertpy/data/_dataloader.py +27 -23
- pertpy/data/_datasets.py +58 -0
- pertpy/metadata/__init__.py +2 -0
- pertpy/metadata/_cell_line.py +39 -70
- pertpy/metadata/_compound.py +3 -4
- pertpy/metadata/_drug.py +2 -6
- pertpy/metadata/_look_up.py +38 -51
- pertpy/metadata/_metadata.py +7 -10
- pertpy/metadata/_moa.py +2 -6
- pertpy/plot/__init__.py +0 -5
- pertpy/preprocessing/__init__.py +2 -0
- pertpy/preprocessing/_guide_rna.py +6 -7
- pertpy/tools/__init__.py +67 -6
- pertpy/tools/_augur.py +14 -15
- pertpy/tools/_cinemaot.py +2 -2
- pertpy/tools/_coda/_base_coda.py +118 -142
- pertpy/tools/_coda/_sccoda.py +16 -15
- pertpy/tools/_coda/_tasccoda.py +21 -22
- pertpy/tools/_dialogue.py +18 -23
- pertpy/tools/_differential_gene_expression/__init__.py +20 -0
- pertpy/tools/_differential_gene_expression/_base.py +657 -0
- pertpy/tools/_differential_gene_expression/_checks.py +41 -0
- pertpy/tools/_differential_gene_expression/_dge_comparison.py +86 -0
- pertpy/tools/_differential_gene_expression/_edger.py +125 -0
- pertpy/tools/_differential_gene_expression/_formulaic.py +189 -0
- pertpy/tools/_differential_gene_expression/_pydeseq2.py +95 -0
- pertpy/tools/_differential_gene_expression/_simple_tests.py +162 -0
- pertpy/tools/_differential_gene_expression/_statsmodels.py +72 -0
- pertpy/tools/_distances/_distance_tests.py +21 -16
- pertpy/tools/_distances/_distances.py +406 -70
- pertpy/tools/_enrichment.py +10 -15
- pertpy/tools/_kernel_pca.py +1 -1
- pertpy/tools/_milo.py +77 -54
- pertpy/tools/_mixscape.py +15 -11
- pertpy/tools/_perturbation_space/_clustering.py +5 -2
- pertpy/tools/_perturbation_space/_comparison.py +112 -0
- pertpy/tools/_perturbation_space/_discriminator_classifiers.py +21 -23
- pertpy/tools/_perturbation_space/_perturbation_space.py +23 -21
- pertpy/tools/_perturbation_space/_simple.py +3 -3
- pertpy/tools/_scgen/__init__.py +1 -1
- pertpy/tools/_scgen/_base_components.py +2 -3
- pertpy/tools/_scgen/_scgen.py +33 -28
- pertpy/tools/_scgen/_utils.py +2 -2
- {pertpy-0.7.0.dist-info → pertpy-0.9.1.dist-info}/METADATA +32 -14
- pertpy-0.9.1.dist-info/RECORD +57 -0
- {pertpy-0.7.0.dist-info → pertpy-0.9.1.dist-info}/WHEEL +1 -1
- pertpy/plot/_augur.py +0 -171
- pertpy/plot/_coda.py +0 -601
- pertpy/plot/_guide_rna.py +0 -64
- pertpy/plot/_milopy.py +0 -209
- pertpy/plot/_mixscape.py +0 -355
- pertpy/tools/_differential_gene_expression.py +0 -325
- pertpy-0.7.0.dist-info/RECORD +0 -53
- {pertpy-0.7.0.dist-info → pertpy-0.9.1.dist-info}/licenses/LICENSE +0 -0
pertpy/metadata/_look_up.py
CHANGED
@@ -3,11 +3,11 @@ from __future__ import annotations
|
|
3
3
|
from collections import namedtuple
|
4
4
|
from typing import TYPE_CHECKING, Literal
|
5
5
|
|
6
|
+
from lamin_utils import logger
|
7
|
+
|
6
8
|
if TYPE_CHECKING:
|
7
9
|
from collections.abc import Sequence
|
8
10
|
|
9
|
-
from rich import print
|
10
|
-
|
11
11
|
if TYPE_CHECKING:
|
12
12
|
import pandas as pd
|
13
13
|
|
@@ -24,10 +24,9 @@ class LookUp:
|
|
24
24
|
):
|
25
25
|
"""
|
26
26
|
Args:
|
27
|
-
type: Metadata type for annotation. One of 'cell_line', 'compound', 'moa' or 'drug.
|
27
|
+
type: Metadata type for annotation. One of 'cell_line', 'compound', 'moa' or 'drug.
|
28
28
|
transfer_metadata: DataFrames used to generate Lookup object.
|
29
29
|
This is currently set to None for CompoundMetaData which does not require any dataframes for transfer.
|
30
|
-
Defaults to 'cell_line'.
|
31
30
|
"""
|
32
31
|
self.type = type
|
33
32
|
if type == "cell_line":
|
@@ -285,12 +284,11 @@ class LookUp:
|
|
285
284
|
"""A brief summary of cell line metadata.
|
286
285
|
|
287
286
|
Args:
|
288
|
-
cell_line_source: the source of cell line annotation, DepMap or Cancerrxgene.
|
287
|
+
cell_line_source: the source of cell line annotation, DepMap or Cancerrxgene.
|
289
288
|
reference_id: The type of cell line identifier in the meta data, e.g. ModelID, CellLineName or StrippedCellLineName.
|
290
|
-
If fetch cell line metadata from Cancerrxgene, it is recommended to choose
|
291
|
-
"stripped_cell_line_name". Defaults to "ModelID".
|
289
|
+
If fetch cell line metadata from Cancerrxgene, it is recommended to choose "stripped_cell_line_name".
|
292
290
|
query_id_list: Unique cell line identifiers to test the number of matched ids present in the
|
293
|
-
metadata. If set to None, the query of metadata identifiers will be disabled.
|
291
|
+
metadata. If set to None, the query of metadata identifiers will be disabled.
|
294
292
|
"""
|
295
293
|
if self.type != "cell_line":
|
296
294
|
raise ValueError("This is not a LookUp object specifically for CellLineMetaData!")
|
@@ -313,8 +311,8 @@ class LookUp:
|
|
313
311
|
)
|
314
312
|
not_matched_identifiers = list(set(query_id_list) - set(self.cl_cancer_project_meta[reference_id]))
|
315
313
|
|
316
|
-
|
317
|
-
|
314
|
+
logger.info(f"{len(not_matched_identifiers)} cell lines are not found in the metadata.")
|
315
|
+
logger.info(f"{identifier_num_all - len(not_matched_identifiers)} cell lines are found! ")
|
318
316
|
|
319
317
|
def available_bulk_rna(
|
320
318
|
self,
|
@@ -324,9 +322,9 @@ class LookUp:
|
|
324
322
|
"""A brief summary of bulk RNA expression data.
|
325
323
|
|
326
324
|
Args:
|
327
|
-
cell_line_source: the source of RNA-seq data, broad or sanger.
|
325
|
+
cell_line_source: the source of RNA-seq data, broad or sanger.
|
328
326
|
query_id_list: Unique cell line identifiers to test the number of matched ids present in the
|
329
|
-
metadata. If set to None, the query of metadata identifiers will be disabled.
|
327
|
+
metadata. If set to None, the query of metadata identifiers will be disabled.
|
330
328
|
"""
|
331
329
|
if self.type != "cell_line":
|
332
330
|
raise ValueError("This is not a LookUp object specific for CellLineMetaData!")
|
@@ -340,8 +338,8 @@ class LookUp:
|
|
340
338
|
identifier_num_all = len(query_id_list)
|
341
339
|
not_matched_identifiers = list(set(query_id_list) - set(bulk_rna.index))
|
342
340
|
|
343
|
-
|
344
|
-
|
341
|
+
logger.info(f"{len(not_matched_identifiers)} cell lines are not found in the metadata.")
|
342
|
+
logger.info(f"{identifier_num_all - len(not_matched_identifiers)} cell lines are found! ")
|
345
343
|
|
346
344
|
def available_protein_expression(
|
347
345
|
self,
|
@@ -352,9 +350,8 @@ class LookUp:
|
|
352
350
|
|
353
351
|
Args:
|
354
352
|
reference_id: The type of cell line identifier in the meta data, model_name or model_id.
|
355
|
-
Defaults to "model_name".
|
356
353
|
query_id_list: Unique cell line identifiers to test the number of matched ids present in the
|
357
|
-
metadata. If set to None, the query of metadata identifiers will be disabled.
|
354
|
+
metadata. If set to None, the query of metadata identifiers will be disabled.
|
358
355
|
"""
|
359
356
|
if self.type != "cell_line":
|
360
357
|
raise ValueError("This is not a LookUp object specific for CellLineMetaData!")
|
@@ -367,8 +364,8 @@ class LookUp:
|
|
367
364
|
f"The specified `reference_id` {reference_id} is not available in the proteomics data. "
|
368
365
|
)
|
369
366
|
not_matched_identifiers = list(set(query_id_list) - set(self.proteomics_data[reference_id]))
|
370
|
-
|
371
|
-
|
367
|
+
logger.info(f"{len(not_matched_identifiers)} cell lines are not found in the metadata.")
|
368
|
+
logger.info(f"{identifier_num_all - len(not_matched_identifiers)} cell lines are found! ")
|
372
369
|
|
373
370
|
def available_drug_response(
|
374
371
|
self,
|
@@ -381,20 +378,16 @@ class LookUp:
|
|
381
378
|
"""A brief summary of drug response data.
|
382
379
|
|
383
380
|
Args:
|
384
|
-
gdsc_dataset: The GDSC dataset, 1 or 2.
|
381
|
+
gdsc_dataset: The GDSC dataset, 1 or 2.
|
385
382
|
The GDSC1 dataset updates previous releases with additional drug screening data from the Wellcome Sanger Institute and Massachusetts General Hospital.
|
386
383
|
It covers 970 Cell lines and 403 Compounds with 333292 IC50s.
|
387
384
|
GDSC2 is new and has 243,466 IC50 results from the latest screening at the Wellcome Sanger Institute using improved experimental procedures.
|
388
385
|
reference_id: The type of cell line identifier in the meta data, cell_line_name, sanger_model_id or cosmic_id.
|
389
|
-
Defaults to 'cell_line_name'.
|
390
386
|
query_id_list: Unique cell line identifiers to test the number of matched ids present in the metadata.
|
391
387
|
If set to None, the query of metadata identifiers will be disabled.
|
392
|
-
Defaults to None.
|
393
388
|
reference_perturbation: The perturbation information in the meta data, drug_name or drug_id.
|
394
|
-
Defaults to 'drug_name'.
|
395
389
|
query_perturbation_list: Unique perturbation types to test the number of matched ones present in the metadata.
|
396
390
|
If set to None, the query of perturbation types will be disabled.
|
397
|
-
Defaults to None.
|
398
391
|
"""
|
399
392
|
if self.type != "cell_line":
|
400
393
|
raise ValueError("This is not a LookUp object specific for CellLineMetaData!")
|
@@ -410,8 +403,8 @@ class LookUp:
|
|
410
403
|
)
|
411
404
|
identifier_num_all = len(query_id_list)
|
412
405
|
not_matched_identifiers = list(set(query_id_list) - set(gdsc_data[reference_id]))
|
413
|
-
|
414
|
-
|
406
|
+
logger.info(f"{len(not_matched_identifiers)} cell lines are not found in the metadata.")
|
407
|
+
logger.info(f"{identifier_num_all - len(not_matched_identifiers)} cell lines are found! ")
|
415
408
|
|
416
409
|
if query_perturbation_list is not None:
|
417
410
|
if reference_perturbation not in gdsc_data.columns:
|
@@ -420,8 +413,8 @@ class LookUp:
|
|
420
413
|
)
|
421
414
|
identifier_num_all = len(query_perturbation_list)
|
422
415
|
not_matched_identifiers = list(set(query_perturbation_list) - set(gdsc_data[reference_perturbation]))
|
423
|
-
|
424
|
-
|
416
|
+
logger.info(f"{len(not_matched_identifiers)} perturbation types are not found in the metadata.")
|
417
|
+
logger.info(f"{identifier_num_all - len(not_matched_identifiers)} perturbation types are found! ")
|
425
418
|
|
426
419
|
def available_genes_annotation(
|
427
420
|
self,
|
@@ -432,22 +425,20 @@ class LookUp:
|
|
432
425
|
|
433
426
|
Args:
|
434
427
|
reference_id: The type of gene identifier in the meta data, gene_id, ensembl_gene_id, hgnc_id, hgnc_symbol.
|
435
|
-
Defaults to "ensembl_gene_id".
|
436
428
|
query_id_list: Unique gene identifiers to test the number of matched ids present in the metadata.
|
437
|
-
Defaults to None.
|
438
429
|
"""
|
439
430
|
if self.type != "cell_line":
|
440
431
|
raise ValueError("This is not a LookUp object specific for CellLineMetaData!")
|
441
432
|
|
442
|
-
|
443
|
-
|
444
|
-
|
433
|
+
logger.info("To summarize: in the DepMap_Sanger gene annotation file, you can find: ")
|
434
|
+
logger.info(f"{len(self.gene_annotation.index)} driver genes")
|
435
|
+
logger.info(
|
445
436
|
f"{len(self.gene_annotation.columns)} meta data including: ",
|
446
437
|
*list(self.gene_annotation.columns.values),
|
447
438
|
sep="\n- ",
|
448
439
|
)
|
449
|
-
|
450
|
-
|
440
|
+
logger.info("Overview of gene annotation: ")
|
441
|
+
logger.info(self.gene_annotation.head().to_string())
|
451
442
|
"""
|
452
443
|
#not implemented yet
|
453
444
|
print("Default parameters to annotate gene annotation: ")
|
@@ -472,26 +463,24 @@ class LookUp:
|
|
472
463
|
Args:
|
473
464
|
query_id_list: Unique perturbagens to test the number of matched ones present in the metadata.
|
474
465
|
If set to None, the query of metadata perturbagens will be disabled.
|
475
|
-
Defaults to None.
|
476
466
|
target_list: Unique molecular targets to test the number of matched ones present in the metadata.
|
477
467
|
If set to None, the comparison of molecular targets in the query of metadata perturbagens will be disabled.
|
478
|
-
Defaults to None.
|
479
468
|
"""
|
480
|
-
if self.type != "moa":
|
481
|
-
raise ValueError("This is not a LookUp object specific for MoaMetaData!")
|
482
469
|
if query_id_list is not None:
|
470
|
+
if self.type != "moa":
|
471
|
+
raise ValueError("This is not a LookUp object specific for MoaMetaData!")
|
483
472
|
identifier_num_all = len(query_id_list)
|
484
473
|
not_matched_identifiers = list(set(query_id_list) - set(self.moa_meta.pert_iname))
|
485
|
-
|
486
|
-
|
474
|
+
logger.info(f"{len(not_matched_identifiers)} perturbagens are not found in the metadata.")
|
475
|
+
logger.info(f"{identifier_num_all - len(not_matched_identifiers)} perturbagens are found! ")
|
487
476
|
|
488
477
|
if target_list is not None:
|
489
478
|
targets = self.moa_meta.target.astype(str).apply(lambda x: x.split("|"))
|
490
479
|
all_targets = [t for tl in targets for t in tl]
|
491
480
|
identifier_num_all = len(target_list)
|
492
481
|
not_matched_identifiers = list(set(target_list) - set(all_targets))
|
493
|
-
|
494
|
-
|
482
|
+
logger.info(f"{len(not_matched_identifiers)} molecular targets are not found in the metadata.")
|
483
|
+
logger.info(f"{identifier_num_all - len(not_matched_identifiers)} molecular targets are found! ")
|
495
484
|
|
496
485
|
def available_compounds(
|
497
486
|
self,
|
@@ -503,8 +492,7 @@ class LookUp:
|
|
503
492
|
Args:
|
504
493
|
query_id_list: Unique compounds to test the number of matched ones present in the metadata.
|
505
494
|
If set to None, query of compound identifiers will be disabled.
|
506
|
-
|
507
|
-
query_id_type: The type of compound identifiers, name or cid. Defaults to 'name'.
|
495
|
+
query_id_type: The type of compound identifiers, name or cid.
|
508
496
|
"""
|
509
497
|
if self.type != "compound":
|
510
498
|
raise ValueError("This is not a LookUp object specific for CompoundData!")
|
@@ -523,8 +511,8 @@ class LookUp:
|
|
523
511
|
except pcp.BadRequestError:
|
524
512
|
not_matched_identifiers.append(compound)
|
525
513
|
|
526
|
-
|
527
|
-
|
514
|
+
logger.info(f"{len(not_matched_identifiers)} compounds are not found in the metadata.")
|
515
|
+
logger.info(f"{identifier_num_all - len(not_matched_identifiers)} compounds are found! ")
|
528
516
|
|
529
517
|
def available_drug_annotation(
|
530
518
|
self,
|
@@ -535,11 +523,10 @@ class LookUp:
|
|
535
523
|
"""A brief summary of drug annotation.
|
536
524
|
|
537
525
|
Args:
|
538
|
-
drug_annotation_source: the source of drug annotation data, chembl, dgidb or pharmgkb.
|
526
|
+
drug_annotation_source: the source of drug annotation data, chembl, dgidb or pharmgkb.
|
539
527
|
query_id_list: Unique target or compound names to test the number of matched ones present in the metadata.
|
540
528
|
If set to None, query of compound identifiers will be disabled.
|
541
|
-
|
542
|
-
query_id_type: The type of identifiers, target, compound and disease(pharmgkb only). Defaults to 'target'.
|
529
|
+
query_id_type: The type of identifiers, target, compound and disease(pharmgkb only).
|
543
530
|
"""
|
544
531
|
if self.type != "drug":
|
545
532
|
raise ValueError("This is not a LookUp object specific for DrugMetaData!")
|
@@ -578,5 +565,5 @@ class LookUp:
|
|
578
565
|
diseases = self.pharmgkb[self.pharmgkb["Type"] == "Disease"]
|
579
566
|
not_matched_identifiers = list(set(query_id_list) - set(diseases["Compound|Disease"]))
|
580
567
|
|
581
|
-
|
582
|
-
|
568
|
+
logger.info(f"{len(not_matched_identifiers)} {query_id_type}s are not found in the metadata.")
|
569
|
+
logger.info(f"{identifier_num_all - len(not_matched_identifiers)} {query_id_type}s are found! ")
|
pertpy/metadata/_metadata.py
CHANGED
@@ -2,6 +2,8 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
from typing import TYPE_CHECKING, Literal
|
4
4
|
|
5
|
+
from lamin_utils import logger
|
6
|
+
|
5
7
|
if TYPE_CHECKING:
|
6
8
|
from collections.abc import Sequence
|
7
9
|
|
@@ -31,12 +33,10 @@ class MetaData:
|
|
31
33
|
total_identifiers: The total number of identifiers in the `adata` object.
|
32
34
|
unmatched_identifiers: Unmatched identifiers in the `adata` object.
|
33
35
|
query_id: The column of `.obs` with cell line information.
|
34
|
-
reference_id: The type of cell line identifier in the
|
36
|
+
reference_id: The type of cell line identifier in the metadata.
|
35
37
|
metadata_type: The type of metadata where some identifiers are not matched during annotation such as
|
36
38
|
cell line, protein expression, bulk RNA expression, drug response, moa or compound.
|
37
|
-
Defaults to 'cell line'.
|
38
39
|
verbosity: The number of unmatched identifiers to print, can be either non-negative values or 'all'.
|
39
|
-
Defaults to 5.
|
40
40
|
"""
|
41
41
|
if isinstance(verbosity, str):
|
42
42
|
if verbosity != "all":
|
@@ -60,14 +60,11 @@ class MetaData:
|
|
60
60
|
if isinstance(verbosity, int) and verbosity >= 0:
|
61
61
|
verbosity = min(verbosity, len(unmatched_identifiers))
|
62
62
|
if verbosity > 0:
|
63
|
-
|
64
|
-
f"
|
63
|
+
logger.info(
|
64
|
+
f"There are {total_identifiers} identifiers in `adata.obs`."
|
65
65
|
f"However, {len(unmatched_identifiers)} identifiers can't be found in the {metadata_type} annotation,"
|
66
|
-
"leading to the presence of NA values for their respective metadata.\n"
|
67
|
-
"Please check again: "
|
68
|
-
*unmatched_identifiers[:verbosity],
|
69
|
-
"...",
|
70
|
-
sep="\n- ",
|
66
|
+
"leading to the presence of NA values for their respective metadata.\n"
|
67
|
+
f"Please check again: *unmatched_identifiers[:verbosity]..."
|
71
68
|
)
|
72
69
|
else:
|
73
70
|
raise ValueError("Only 'all' or a non-negative value is accepted.")
|
pertpy/metadata/_moa.py
CHANGED
@@ -5,7 +5,6 @@ from typing import TYPE_CHECKING
|
|
5
5
|
|
6
6
|
import numpy as np
|
7
7
|
import pandas as pd
|
8
|
-
from rich import print
|
9
8
|
from scanpy import settings
|
10
9
|
|
11
10
|
from pertpy.data._dataloader import _download
|
@@ -26,7 +25,6 @@ class Moa(MetaData):
|
|
26
25
|
def _download_clue(self) -> None:
|
27
26
|
clue_path = Path(settings.cachedir) / "repurposing_drugs_20200324.txt"
|
28
27
|
if not Path(clue_path).exists():
|
29
|
-
print("[bold yellow]No metadata file was found for clue. Starting download now.")
|
30
28
|
_download(
|
31
29
|
url="https://s3.amazonaws.com/data.clue.io/repurposing/downloads/repurposing_drugs_20200324.txt",
|
32
30
|
output_file_name="repurposing_drugs_20200324.txt",
|
@@ -51,12 +49,10 @@ class Moa(MetaData):
|
|
51
49
|
|
52
50
|
Args:
|
53
51
|
adata: The data object to annotate.
|
54
|
-
query_id: The column of `.obs` with the name of a perturbagen.
|
52
|
+
query_id: The column of `.obs` with the name of a perturbagen.
|
55
53
|
target: The column of `.obs` with target information. If set to None, all MoAs are retrieved without comparing molecular targets.
|
56
|
-
Defaults to None.
|
57
54
|
verbosity: The number of unmatched identifiers to print, can be either non-negative values or 'all'.
|
58
|
-
|
59
|
-
copy: Determines whether a copy of the `adata` is returned. Defaults to False.
|
55
|
+
copy: Determines whether a copy of the `adata` is returned.
|
60
56
|
|
61
57
|
Returns:
|
62
58
|
Returns an AnnData object with MoA annotation.
|
pertpy/plot/__init__.py
CHANGED
pertpy/preprocessing/__init__.py
CHANGED
@@ -34,9 +34,8 @@ class GuideAssignment:
|
|
34
34
|
assignment_threshold: The count threshold that is required for an assignment to be viable.
|
35
35
|
layer: Key to the layer containing raw count values of the gRNAs.
|
36
36
|
adata.X is used if layer is None. Expects count data.
|
37
|
-
output_layer: Assigned guide will be saved on adata.layers[output_key].
|
37
|
+
output_layer: Assigned guide will be saved on adata.layers[output_key].
|
38
38
|
only_return_results: If True, input AnnData is not modified and the result is returned as an np.ndarray.
|
39
|
-
Defaults to False.
|
40
39
|
|
41
40
|
Examples:
|
42
41
|
Each cell is assigned to gRNA that occurs at least 5 times in the respective cell.
|
@@ -49,7 +48,7 @@ class GuideAssignment:
|
|
49
48
|
"""
|
50
49
|
counts = adata.X if layer is None else adata.layers[layer]
|
51
50
|
if scipy.sparse.issparse(counts):
|
52
|
-
counts = counts.
|
51
|
+
counts = counts.toarray()
|
53
52
|
|
54
53
|
assigned_grnas = np.where(counts >= assignment_threshold, 1, 0)
|
55
54
|
assigned_grnas = scipy.sparse.csr_matrix(assigned_grnas)
|
@@ -93,7 +92,7 @@ class GuideAssignment:
|
|
93
92
|
"""
|
94
93
|
counts = adata.X if layer is None else adata.layers[layer]
|
95
94
|
if scipy.sparse.issparse(counts):
|
96
|
-
counts = counts.
|
95
|
+
counts = counts.toarray()
|
97
96
|
|
98
97
|
assigned_grna = np.where(
|
99
98
|
counts.max(axis=1).squeeze() >= assignment_threshold,
|
@@ -127,7 +126,7 @@ class GuideAssignment:
|
|
127
126
|
adata: Annotated data matrix containing gRNA values
|
128
127
|
layer: Key to the layer containing log normalized count values of the gRNAs.
|
129
128
|
adata.X is used if layer is None.
|
130
|
-
order_by: The order of cells in y axis.
|
129
|
+
order_by: The order of cells in y axis.
|
131
130
|
If None, cells will be reordered to have a nice sparse representation.
|
132
131
|
If a string is provided, adata.obs[order_by] will be used as the order.
|
133
132
|
If a numpy array is provided, the array will be used for ordering.
|
@@ -153,9 +152,9 @@ class GuideAssignment:
|
|
153
152
|
|
154
153
|
if order_by is None:
|
155
154
|
if scipy.sparse.issparse(data):
|
156
|
-
max_values = data.max(axis=1).
|
155
|
+
max_values = data.max(axis=1).toarray().squeeze()
|
157
156
|
data_argmax = data.argmax(axis=1).A.squeeze()
|
158
|
-
max_guide_index = np.where(max_values != data.min(axis=1).
|
157
|
+
max_guide_index = np.where(max_values != data.min(axis=1).toarray().squeeze(), data_argmax, -1)
|
159
158
|
else:
|
160
159
|
max_guide_index = np.where(
|
161
160
|
data.max(axis=1).squeeze() != data.min(axis=1).squeeze(), data.argmax(axis=1).squeeze(), -1
|
pertpy/tools/__init__.py
CHANGED
@@ -1,19 +1,80 @@
|
|
1
|
+
from importlib import import_module
|
2
|
+
|
3
|
+
|
4
|
+
def lazy_import(module_path, class_name, extras):
|
5
|
+
def _import():
|
6
|
+
try:
|
7
|
+
for extra in extras:
|
8
|
+
import_module(extra)
|
9
|
+
except ImportError as e:
|
10
|
+
raise ImportError(
|
11
|
+
f"Extra dependencies required: {', '.join(extras)}. "
|
12
|
+
f"Please install with: pip install {' '.join(extras)}"
|
13
|
+
) from e
|
14
|
+
module = import_module(module_path)
|
15
|
+
return getattr(module, class_name)
|
16
|
+
|
17
|
+
return _import
|
18
|
+
|
19
|
+
|
1
20
|
from pertpy.tools._augur import Augur
|
2
21
|
from pertpy.tools._cinemaot import Cinemaot
|
3
|
-
from pertpy.tools._coda._sccoda import Sccoda
|
4
|
-
from pertpy.tools._coda._tasccoda import Tasccoda
|
5
22
|
from pertpy.tools._dialogue import Dialogue
|
6
|
-
from pertpy.tools._differential_gene_expression import DifferentialGeneExpression
|
7
23
|
from pertpy.tools._distances._distance_tests import DistanceTest
|
8
24
|
from pertpy.tools._distances._distances import Distance
|
9
25
|
from pertpy.tools._enrichment import Enrichment
|
10
26
|
from pertpy.tools._milo import Milo
|
11
27
|
from pertpy.tools._mixscape import Mixscape
|
12
28
|
from pertpy.tools._perturbation_space._clustering import ClusteringSpace
|
29
|
+
from pertpy.tools._perturbation_space._comparison import PerturbationComparison
|
13
30
|
from pertpy.tools._perturbation_space._discriminator_classifiers import (
|
14
|
-
DiscriminatorClassifierSpace,
|
15
31
|
LRClassifierSpace,
|
16
32
|
MLPClassifierSpace,
|
17
33
|
)
|
18
|
-
from pertpy.tools._perturbation_space._simple import
|
19
|
-
|
34
|
+
from pertpy.tools._perturbation_space._simple import (
|
35
|
+
CentroidSpace,
|
36
|
+
DBSCANSpace,
|
37
|
+
KMeansSpace,
|
38
|
+
PseudobulkSpace,
|
39
|
+
)
|
40
|
+
from pertpy.tools._scgen import Scgen
|
41
|
+
|
42
|
+
# from pertpy.tools._differential_gene_expression import DGEEVAL
|
43
|
+
|
44
|
+
CODA_EXTRAS = ["toytree", "arviz", "ete3"] # also pyqt5 technically
|
45
|
+
Sccoda = lazy_import("pertpy.tools._coda._sccoda", "Sccoda", CODA_EXTRAS)
|
46
|
+
Tasccoda = lazy_import("pertpy.tools._coda._tasccoda", "Tasccoda", CODA_EXTRAS)
|
47
|
+
|
48
|
+
DE_EXTRAS = ["formulaic", "pydeseq2"]
|
49
|
+
EdgeR = lazy_import("pertpy.tools._differential_gene_expression", "EdgeR", DE_EXTRAS + ["edger"])
|
50
|
+
PyDESeq2 = lazy_import("pertpy.tools._differential_gene_expression", "PyDESeq2", DE_EXTRAS)
|
51
|
+
Statsmodels = lazy_import("pertpy.tools._differential_gene_expression", "Statsmodels", DE_EXTRAS + ["statsmodels"])
|
52
|
+
TTest = lazy_import("pertpy.tools._differential_gene_expression", "TTest", DE_EXTRAS)
|
53
|
+
WilcoxonTest = lazy_import("pertpy.tools._differential_gene_expression", "WilcoxonTest", DE_EXTRAS)
|
54
|
+
|
55
|
+
__all__ = [
|
56
|
+
"Augur",
|
57
|
+
"Cinemaot",
|
58
|
+
"Sccoda",
|
59
|
+
"Tasccoda",
|
60
|
+
"Dialogue",
|
61
|
+
"EdgeR",
|
62
|
+
"PyDESeq2",
|
63
|
+
"WilcoxonTest",
|
64
|
+
"TTest",
|
65
|
+
"Statsmodels",
|
66
|
+
"DistanceTest",
|
67
|
+
"Distance",
|
68
|
+
"Enrichment",
|
69
|
+
"Milo",
|
70
|
+
"Mixscape",
|
71
|
+
"ClusteringSpace",
|
72
|
+
"LRClassifierSpace",
|
73
|
+
"MLPClassifierSpace",
|
74
|
+
"CentroidSpace",
|
75
|
+
"DBSCANSpace",
|
76
|
+
"KMeansSpace",
|
77
|
+
"PseudobulkSpace",
|
78
|
+
"Scgen",
|
79
|
+
"DGEEVAL",
|
80
|
+
]
|
pertpy/tools/_augur.py
CHANGED
@@ -14,6 +14,7 @@ import scanpy as sc
|
|
14
14
|
import statsmodels.api as sm
|
15
15
|
from anndata import AnnData
|
16
16
|
from joblib import Parallel, delayed
|
17
|
+
from lamin_utils import logger
|
17
18
|
from rich import print
|
18
19
|
from rich.progress import track
|
19
20
|
from scipy import sparse, stats
|
@@ -127,7 +128,7 @@ class Augur:
|
|
127
128
|
_ = input[cell_type_col]
|
128
129
|
_ = input[label_col]
|
129
130
|
except KeyError:
|
130
|
-
|
131
|
+
logger.error("No column names matching cell_type_col and label_col.")
|
131
132
|
|
132
133
|
label = input[label_col] if meta is None else meta[label_col]
|
133
134
|
cell_type = input[cell_type_col] if meta is None else meta[cell_type_col]
|
@@ -140,7 +141,7 @@ class Augur:
|
|
140
141
|
if adata.obs["label"].dtype.name == "category":
|
141
142
|
# filter samples according to label
|
142
143
|
if condition_label is not None and treatment_label is not None:
|
143
|
-
|
144
|
+
logger.info(f"Filtering samples with {condition_label} and {treatment_label} labels.")
|
144
145
|
adata = ad.concat(
|
145
146
|
[adata[adata.obs["label"] == condition_label], adata[adata.obs["label"] == treatment_label]]
|
146
147
|
)
|
@@ -556,7 +557,7 @@ class Augur:
|
|
556
557
|
try:
|
557
558
|
sc.pp.highly_variable_genes(adata)
|
558
559
|
except ValueError:
|
559
|
-
|
560
|
+
logger.warn("Data not normalized. Normalizing now using scanpy log1p normalize.")
|
560
561
|
sc.pp.log1p(adata)
|
561
562
|
sc.pp.highly_variable_genes(adata)
|
562
563
|
|
@@ -608,7 +609,7 @@ class Augur:
|
|
608
609
|
var_quantile: The quantile below which features will be filtered, based on their residuals in a loess model.
|
609
610
|
filter_negative_residuals: if `True`, filter residuals at a fixed threshold of zero, instead of `var_quantile`
|
610
611
|
span: Smoothing factor, as a fraction of the number of points to take into account.
|
611
|
-
Should be in the range (0, 1].
|
612
|
+
Should be in the range (0, 1].
|
612
613
|
|
613
614
|
Return:
|
614
615
|
AnnData object with additional select_variance column in var.
|
@@ -700,13 +701,11 @@ class Augur:
|
|
700
701
|
feature_perc: proportion of genes that are randomly selected as features for input to the classifier in each
|
701
702
|
subsample using the random gene filter
|
702
703
|
var_quantile: The quantile below which features will be filtered, based on their residuals in a loess model.
|
703
|
-
Defaults to 0.5.
|
704
704
|
span: Smoothing factor, as a fraction of the number of points to take into account. Should be in the range (0, 1].
|
705
|
-
Defaults to 0.75.
|
706
705
|
filter_negative_residuals: if `True`, filter residuals at a fixed threshold of zero, instead of `var_quantile`
|
707
706
|
n_threads: number of threads to use for parallelization
|
708
707
|
select_variance_features: Whether to select genes based on the original Augur implementation (True)
|
709
|
-
or using scanpy's highly_variable_genes (False).
|
708
|
+
or using scanpy's highly_variable_genes (False).
|
710
709
|
key_added: Key to add results to in .uns
|
711
710
|
augur_mode: One of 'default', 'velocity' or 'permute'. Setting augur_mode = "velocity" disables feature selection,
|
712
711
|
assuming feature selection has been performed by the RNA velocity procedure to produce the input matrix,
|
@@ -751,8 +750,8 @@ class Augur:
|
|
751
750
|
"full_results": defaultdict(list),
|
752
751
|
}
|
753
752
|
if select_variance_features:
|
754
|
-
|
755
|
-
|
753
|
+
logger.warning("Set smaller span value in the case of a `segmentation fault` error.")
|
754
|
+
logger.warning("Set larger span in case of svddc or other near singularities error.")
|
756
755
|
adata.obs["augur_score"] = nan
|
757
756
|
for cell_type in track(adata.obs["cell_type"].unique(), description="Processing data..."):
|
758
757
|
cell_type_subsample = adata[adata.obs["cell_type"] == cell_type].copy()
|
@@ -768,8 +767,8 @@ class Augur:
|
|
768
767
|
)
|
769
768
|
)
|
770
769
|
if len(cell_type_subsample) < min_cells:
|
771
|
-
|
772
|
-
f"
|
770
|
+
logger.warning(
|
771
|
+
f"Skipping {cell_type} cell type - {len(cell_type_subsample)} samples is less than min_cells {min_cells}."
|
773
772
|
)
|
774
773
|
elif (
|
775
774
|
cell_type_subsample.obs.groupby(
|
@@ -778,8 +777,8 @@ class Augur:
|
|
778
777
|
).y_.count()
|
779
778
|
< subsample_size
|
780
779
|
).any():
|
781
|
-
|
782
|
-
f"
|
780
|
+
logger.warning(
|
781
|
+
f"Skipping {cell_type} cell type - the number of samples for at least one class type is less than "
|
783
782
|
f"subsample size {subsample_size}."
|
784
783
|
)
|
785
784
|
else:
|
@@ -821,7 +820,7 @@ class Augur:
|
|
821
820
|
results["full_results"]["cell_type"].extend([cell_type] * folds * n_subsamples)
|
822
821
|
# make sure one cell type worked
|
823
822
|
if len(results) <= 2:
|
824
|
-
|
823
|
+
logger.warning("No cells types had more than min_cells needed. Please adjust data or min_cells parameter.")
|
825
824
|
|
826
825
|
results["summary_metrics"] = pd.DataFrame(results["summary_metrics"])
|
827
826
|
results["feature_importances"] = pd.DataFrame(results["feature_importances"])
|
@@ -850,7 +849,7 @@ class Augur:
|
|
850
849
|
augur2: Augurpy results from condition 2, obtained from `predict()[1]`
|
851
850
|
permuted1: permuted Augurpy results from condition 1, obtained from `predict()` with argument `augur_mode=permute`
|
852
851
|
permuted2: permuted Augurpy results from condition 2, obtained from `predict()` with argument `augur_mode=permute`
|
853
|
-
n_subsamples: number of subsamples to pool when calculating the mean augur score for each permutation
|
852
|
+
n_subsamples: number of subsamples to pool when calculating the mean augur score for each permutation.
|
854
853
|
n_permutations: the total number of mean augur scores to calculate from a background distribution
|
855
854
|
|
856
855
|
Returns:
|
pertpy/tools/_cinemaot.py
CHANGED
@@ -338,7 +338,7 @@ class Cinemaot:
|
|
338
338
|
sc.tl.leiden(adata, resolution=cf_resolution)
|
339
339
|
df["ct"] = adata.obs["leiden"].astype(str)
|
340
340
|
df["ptb"] = "control"
|
341
|
-
df[
|
341
|
+
df.loc[adata.obs[pert_key] != control, "ptb"] = de.obs["leiden"].astype(str)
|
342
342
|
label_list.append("ptb")
|
343
343
|
df = df.groupby(label_list).sum()
|
344
344
|
new_index = df.index.map(lambda x: "_".join(map(str, x)))
|
@@ -432,7 +432,7 @@ class Cinemaot:
|
|
432
432
|
expr_label = "control"
|
433
433
|
|
434
434
|
adata_.obs["ct"] = ref_label
|
435
|
-
adata_.obs[
|
435
|
+
adata_.obs.loc[adata_.obs[pert_key] == control, "ct"] = expr_label
|
436
436
|
pert_key = "ct"
|
437
437
|
z = np.zeros(adata_.shape[0]) + 1
|
438
438
|
|