pertpy 0.7.0__py3-none-any.whl → 0.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pertpy/__init__.py +2 -1
- pertpy/data/__init__.py +61 -0
- pertpy/data/_dataloader.py +27 -23
- pertpy/data/_datasets.py +58 -0
- pertpy/metadata/__init__.py +2 -0
- pertpy/metadata/_cell_line.py +39 -70
- pertpy/metadata/_compound.py +3 -4
- pertpy/metadata/_drug.py +2 -6
- pertpy/metadata/_look_up.py +38 -51
- pertpy/metadata/_metadata.py +7 -10
- pertpy/metadata/_moa.py +2 -6
- pertpy/plot/__init__.py +0 -5
- pertpy/preprocessing/__init__.py +2 -0
- pertpy/preprocessing/_guide_rna.py +6 -7
- pertpy/tools/__init__.py +67 -6
- pertpy/tools/_augur.py +14 -15
- pertpy/tools/_cinemaot.py +2 -2
- pertpy/tools/_coda/_base_coda.py +118 -142
- pertpy/tools/_coda/_sccoda.py +16 -15
- pertpy/tools/_coda/_tasccoda.py +21 -22
- pertpy/tools/_dialogue.py +18 -23
- pertpy/tools/_differential_gene_expression/__init__.py +20 -0
- pertpy/tools/_differential_gene_expression/_base.py +657 -0
- pertpy/tools/_differential_gene_expression/_checks.py +41 -0
- pertpy/tools/_differential_gene_expression/_dge_comparison.py +86 -0
- pertpy/tools/_differential_gene_expression/_edger.py +125 -0
- pertpy/tools/_differential_gene_expression/_formulaic.py +189 -0
- pertpy/tools/_differential_gene_expression/_pydeseq2.py +95 -0
- pertpy/tools/_differential_gene_expression/_simple_tests.py +162 -0
- pertpy/tools/_differential_gene_expression/_statsmodels.py +72 -0
- pertpy/tools/_distances/_distance_tests.py +21 -16
- pertpy/tools/_distances/_distances.py +406 -70
- pertpy/tools/_enrichment.py +10 -15
- pertpy/tools/_kernel_pca.py +1 -1
- pertpy/tools/_milo.py +77 -54
- pertpy/tools/_mixscape.py +15 -11
- pertpy/tools/_perturbation_space/_clustering.py +5 -2
- pertpy/tools/_perturbation_space/_comparison.py +112 -0
- pertpy/tools/_perturbation_space/_discriminator_classifiers.py +21 -23
- pertpy/tools/_perturbation_space/_perturbation_space.py +23 -21
- pertpy/tools/_perturbation_space/_simple.py +3 -3
- pertpy/tools/_scgen/__init__.py +1 -1
- pertpy/tools/_scgen/_base_components.py +2 -3
- pertpy/tools/_scgen/_scgen.py +33 -28
- pertpy/tools/_scgen/_utils.py +2 -2
- {pertpy-0.7.0.dist-info → pertpy-0.9.1.dist-info}/METADATA +32 -14
- pertpy-0.9.1.dist-info/RECORD +57 -0
- {pertpy-0.7.0.dist-info → pertpy-0.9.1.dist-info}/WHEEL +1 -1
- pertpy/plot/_augur.py +0 -171
- pertpy/plot/_coda.py +0 -601
- pertpy/plot/_guide_rna.py +0 -64
- pertpy/plot/_milopy.py +0 -209
- pertpy/plot/_mixscape.py +0 -355
- pertpy/tools/_differential_gene_expression.py +0 -325
- pertpy-0.7.0.dist-info/RECORD +0 -53
- {pertpy-0.7.0.dist-info → pertpy-0.9.1.dist-info}/licenses/LICENSE +0 -0
pertpy/metadata/_look_up.py
CHANGED
@@ -3,11 +3,11 @@ from __future__ import annotations
|
|
3
3
|
from collections import namedtuple
|
4
4
|
from typing import TYPE_CHECKING, Literal
|
5
5
|
|
6
|
+
from lamin_utils import logger
|
7
|
+
|
6
8
|
if TYPE_CHECKING:
|
7
9
|
from collections.abc import Sequence
|
8
10
|
|
9
|
-
from rich import print
|
10
|
-
|
11
11
|
if TYPE_CHECKING:
|
12
12
|
import pandas as pd
|
13
13
|
|
@@ -24,10 +24,9 @@ class LookUp:
|
|
24
24
|
):
|
25
25
|
"""
|
26
26
|
Args:
|
27
|
-
type: Metadata type for annotation. One of 'cell_line', 'compound', 'moa' or 'drug.
|
27
|
+
type: Metadata type for annotation. One of 'cell_line', 'compound', 'moa' or 'drug.
|
28
28
|
transfer_metadata: DataFrames used to generate Lookup object.
|
29
29
|
This is currently set to None for CompoundMetaData which does not require any dataframes for transfer.
|
30
|
-
Defaults to 'cell_line'.
|
31
30
|
"""
|
32
31
|
self.type = type
|
33
32
|
if type == "cell_line":
|
@@ -285,12 +284,11 @@ class LookUp:
|
|
285
284
|
"""A brief summary of cell line metadata.
|
286
285
|
|
287
286
|
Args:
|
288
|
-
cell_line_source: the source of cell line annotation, DepMap or Cancerrxgene.
|
287
|
+
cell_line_source: the source of cell line annotation, DepMap or Cancerrxgene.
|
289
288
|
reference_id: The type of cell line identifier in the meta data, e.g. ModelID, CellLineName or StrippedCellLineName.
|
290
|
-
If fetch cell line metadata from Cancerrxgene, it is recommended to choose
|
291
|
-
"stripped_cell_line_name". Defaults to "ModelID".
|
289
|
+
If fetch cell line metadata from Cancerrxgene, it is recommended to choose "stripped_cell_line_name".
|
292
290
|
query_id_list: Unique cell line identifiers to test the number of matched ids present in the
|
293
|
-
metadata. If set to None, the query of metadata identifiers will be disabled.
|
291
|
+
metadata. If set to None, the query of metadata identifiers will be disabled.
|
294
292
|
"""
|
295
293
|
if self.type != "cell_line":
|
296
294
|
raise ValueError("This is not a LookUp object specifically for CellLineMetaData!")
|
@@ -313,8 +311,8 @@ class LookUp:
|
|
313
311
|
)
|
314
312
|
not_matched_identifiers = list(set(query_id_list) - set(self.cl_cancer_project_meta[reference_id]))
|
315
313
|
|
316
|
-
|
317
|
-
|
314
|
+
logger.info(f"{len(not_matched_identifiers)} cell lines are not found in the metadata.")
|
315
|
+
logger.info(f"{identifier_num_all - len(not_matched_identifiers)} cell lines are found! ")
|
318
316
|
|
319
317
|
def available_bulk_rna(
|
320
318
|
self,
|
@@ -324,9 +322,9 @@ class LookUp:
|
|
324
322
|
"""A brief summary of bulk RNA expression data.
|
325
323
|
|
326
324
|
Args:
|
327
|
-
cell_line_source: the source of RNA-seq data, broad or sanger.
|
325
|
+
cell_line_source: the source of RNA-seq data, broad or sanger.
|
328
326
|
query_id_list: Unique cell line identifiers to test the number of matched ids present in the
|
329
|
-
metadata. If set to None, the query of metadata identifiers will be disabled.
|
327
|
+
metadata. If set to None, the query of metadata identifiers will be disabled.
|
330
328
|
"""
|
331
329
|
if self.type != "cell_line":
|
332
330
|
raise ValueError("This is not a LookUp object specific for CellLineMetaData!")
|
@@ -340,8 +338,8 @@ class LookUp:
|
|
340
338
|
identifier_num_all = len(query_id_list)
|
341
339
|
not_matched_identifiers = list(set(query_id_list) - set(bulk_rna.index))
|
342
340
|
|
343
|
-
|
344
|
-
|
341
|
+
logger.info(f"{len(not_matched_identifiers)} cell lines are not found in the metadata.")
|
342
|
+
logger.info(f"{identifier_num_all - len(not_matched_identifiers)} cell lines are found! ")
|
345
343
|
|
346
344
|
def available_protein_expression(
|
347
345
|
self,
|
@@ -352,9 +350,8 @@ class LookUp:
|
|
352
350
|
|
353
351
|
Args:
|
354
352
|
reference_id: The type of cell line identifier in the meta data, model_name or model_id.
|
355
|
-
Defaults to "model_name".
|
356
353
|
query_id_list: Unique cell line identifiers to test the number of matched ids present in the
|
357
|
-
metadata. If set to None, the query of metadata identifiers will be disabled.
|
354
|
+
metadata. If set to None, the query of metadata identifiers will be disabled.
|
358
355
|
"""
|
359
356
|
if self.type != "cell_line":
|
360
357
|
raise ValueError("This is not a LookUp object specific for CellLineMetaData!")
|
@@ -367,8 +364,8 @@ class LookUp:
|
|
367
364
|
f"The specified `reference_id` {reference_id} is not available in the proteomics data. "
|
368
365
|
)
|
369
366
|
not_matched_identifiers = list(set(query_id_list) - set(self.proteomics_data[reference_id]))
|
370
|
-
|
371
|
-
|
367
|
+
logger.info(f"{len(not_matched_identifiers)} cell lines are not found in the metadata.")
|
368
|
+
logger.info(f"{identifier_num_all - len(not_matched_identifiers)} cell lines are found! ")
|
372
369
|
|
373
370
|
def available_drug_response(
|
374
371
|
self,
|
@@ -381,20 +378,16 @@ class LookUp:
|
|
381
378
|
"""A brief summary of drug response data.
|
382
379
|
|
383
380
|
Args:
|
384
|
-
gdsc_dataset: The GDSC dataset, 1 or 2.
|
381
|
+
gdsc_dataset: The GDSC dataset, 1 or 2.
|
385
382
|
The GDSC1 dataset updates previous releases with additional drug screening data from the Wellcome Sanger Institute and Massachusetts General Hospital.
|
386
383
|
It covers 970 Cell lines and 403 Compounds with 333292 IC50s.
|
387
384
|
GDSC2 is new and has 243,466 IC50 results from the latest screening at the Wellcome Sanger Institute using improved experimental procedures.
|
388
385
|
reference_id: The type of cell line identifier in the meta data, cell_line_name, sanger_model_id or cosmic_id.
|
389
|
-
Defaults to 'cell_line_name'.
|
390
386
|
query_id_list: Unique cell line identifiers to test the number of matched ids present in the metadata.
|
391
387
|
If set to None, the query of metadata identifiers will be disabled.
|
392
|
-
Defaults to None.
|
393
388
|
reference_perturbation: The perturbation information in the meta data, drug_name or drug_id.
|
394
|
-
Defaults to 'drug_name'.
|
395
389
|
query_perturbation_list: Unique perturbation types to test the number of matched ones present in the metadata.
|
396
390
|
If set to None, the query of perturbation types will be disabled.
|
397
|
-
Defaults to None.
|
398
391
|
"""
|
399
392
|
if self.type != "cell_line":
|
400
393
|
raise ValueError("This is not a LookUp object specific for CellLineMetaData!")
|
@@ -410,8 +403,8 @@ class LookUp:
|
|
410
403
|
)
|
411
404
|
identifier_num_all = len(query_id_list)
|
412
405
|
not_matched_identifiers = list(set(query_id_list) - set(gdsc_data[reference_id]))
|
413
|
-
|
414
|
-
|
406
|
+
logger.info(f"{len(not_matched_identifiers)} cell lines are not found in the metadata.")
|
407
|
+
logger.info(f"{identifier_num_all - len(not_matched_identifiers)} cell lines are found! ")
|
415
408
|
|
416
409
|
if query_perturbation_list is not None:
|
417
410
|
if reference_perturbation not in gdsc_data.columns:
|
@@ -420,8 +413,8 @@ class LookUp:
|
|
420
413
|
)
|
421
414
|
identifier_num_all = len(query_perturbation_list)
|
422
415
|
not_matched_identifiers = list(set(query_perturbation_list) - set(gdsc_data[reference_perturbation]))
|
423
|
-
|
424
|
-
|
416
|
+
logger.info(f"{len(not_matched_identifiers)} perturbation types are not found in the metadata.")
|
417
|
+
logger.info(f"{identifier_num_all - len(not_matched_identifiers)} perturbation types are found! ")
|
425
418
|
|
426
419
|
def available_genes_annotation(
|
427
420
|
self,
|
@@ -432,22 +425,20 @@ class LookUp:
|
|
432
425
|
|
433
426
|
Args:
|
434
427
|
reference_id: The type of gene identifier in the meta data, gene_id, ensembl_gene_id, hgnc_id, hgnc_symbol.
|
435
|
-
Defaults to "ensembl_gene_id".
|
436
428
|
query_id_list: Unique gene identifiers to test the number of matched ids present in the metadata.
|
437
|
-
Defaults to None.
|
438
429
|
"""
|
439
430
|
if self.type != "cell_line":
|
440
431
|
raise ValueError("This is not a LookUp object specific for CellLineMetaData!")
|
441
432
|
|
442
|
-
|
443
|
-
|
444
|
-
|
433
|
+
logger.info("To summarize: in the DepMap_Sanger gene annotation file, you can find: ")
|
434
|
+
logger.info(f"{len(self.gene_annotation.index)} driver genes")
|
435
|
+
logger.info(
|
445
436
|
f"{len(self.gene_annotation.columns)} meta data including: ",
|
446
437
|
*list(self.gene_annotation.columns.values),
|
447
438
|
sep="\n- ",
|
448
439
|
)
|
449
|
-
|
450
|
-
|
440
|
+
logger.info("Overview of gene annotation: ")
|
441
|
+
logger.info(self.gene_annotation.head().to_string())
|
451
442
|
"""
|
452
443
|
#not implemented yet
|
453
444
|
print("Default parameters to annotate gene annotation: ")
|
@@ -472,26 +463,24 @@ class LookUp:
|
|
472
463
|
Args:
|
473
464
|
query_id_list: Unique perturbagens to test the number of matched ones present in the metadata.
|
474
465
|
If set to None, the query of metadata perturbagens will be disabled.
|
475
|
-
Defaults to None.
|
476
466
|
target_list: Unique molecular targets to test the number of matched ones present in the metadata.
|
477
467
|
If set to None, the comparison of molecular targets in the query of metadata perturbagens will be disabled.
|
478
|
-
Defaults to None.
|
479
468
|
"""
|
480
|
-
if self.type != "moa":
|
481
|
-
raise ValueError("This is not a LookUp object specific for MoaMetaData!")
|
482
469
|
if query_id_list is not None:
|
470
|
+
if self.type != "moa":
|
471
|
+
raise ValueError("This is not a LookUp object specific for MoaMetaData!")
|
483
472
|
identifier_num_all = len(query_id_list)
|
484
473
|
not_matched_identifiers = list(set(query_id_list) - set(self.moa_meta.pert_iname))
|
485
|
-
|
486
|
-
|
474
|
+
logger.info(f"{len(not_matched_identifiers)} perturbagens are not found in the metadata.")
|
475
|
+
logger.info(f"{identifier_num_all - len(not_matched_identifiers)} perturbagens are found! ")
|
487
476
|
|
488
477
|
if target_list is not None:
|
489
478
|
targets = self.moa_meta.target.astype(str).apply(lambda x: x.split("|"))
|
490
479
|
all_targets = [t for tl in targets for t in tl]
|
491
480
|
identifier_num_all = len(target_list)
|
492
481
|
not_matched_identifiers = list(set(target_list) - set(all_targets))
|
493
|
-
|
494
|
-
|
482
|
+
logger.info(f"{len(not_matched_identifiers)} molecular targets are not found in the metadata.")
|
483
|
+
logger.info(f"{identifier_num_all - len(not_matched_identifiers)} molecular targets are found! ")
|
495
484
|
|
496
485
|
def available_compounds(
|
497
486
|
self,
|
@@ -503,8 +492,7 @@ class LookUp:
|
|
503
492
|
Args:
|
504
493
|
query_id_list: Unique compounds to test the number of matched ones present in the metadata.
|
505
494
|
If set to None, query of compound identifiers will be disabled.
|
506
|
-
|
507
|
-
query_id_type: The type of compound identifiers, name or cid. Defaults to 'name'.
|
495
|
+
query_id_type: The type of compound identifiers, name or cid.
|
508
496
|
"""
|
509
497
|
if self.type != "compound":
|
510
498
|
raise ValueError("This is not a LookUp object specific for CompoundData!")
|
@@ -523,8 +511,8 @@ class LookUp:
|
|
523
511
|
except pcp.BadRequestError:
|
524
512
|
not_matched_identifiers.append(compound)
|
525
513
|
|
526
|
-
|
527
|
-
|
514
|
+
logger.info(f"{len(not_matched_identifiers)} compounds are not found in the metadata.")
|
515
|
+
logger.info(f"{identifier_num_all - len(not_matched_identifiers)} compounds are found! ")
|
528
516
|
|
529
517
|
def available_drug_annotation(
|
530
518
|
self,
|
@@ -535,11 +523,10 @@ class LookUp:
|
|
535
523
|
"""A brief summary of drug annotation.
|
536
524
|
|
537
525
|
Args:
|
538
|
-
drug_annotation_source: the source of drug annotation data, chembl, dgidb or pharmgkb.
|
526
|
+
drug_annotation_source: the source of drug annotation data, chembl, dgidb or pharmgkb.
|
539
527
|
query_id_list: Unique target or compound names to test the number of matched ones present in the metadata.
|
540
528
|
If set to None, query of compound identifiers will be disabled.
|
541
|
-
|
542
|
-
query_id_type: The type of identifiers, target, compound and disease(pharmgkb only). Defaults to 'target'.
|
529
|
+
query_id_type: The type of identifiers, target, compound and disease(pharmgkb only).
|
543
530
|
"""
|
544
531
|
if self.type != "drug":
|
545
532
|
raise ValueError("This is not a LookUp object specific for DrugMetaData!")
|
@@ -578,5 +565,5 @@ class LookUp:
|
|
578
565
|
diseases = self.pharmgkb[self.pharmgkb["Type"] == "Disease"]
|
579
566
|
not_matched_identifiers = list(set(query_id_list) - set(diseases["Compound|Disease"]))
|
580
567
|
|
581
|
-
|
582
|
-
|
568
|
+
logger.info(f"{len(not_matched_identifiers)} {query_id_type}s are not found in the metadata.")
|
569
|
+
logger.info(f"{identifier_num_all - len(not_matched_identifiers)} {query_id_type}s are found! ")
|
pertpy/metadata/_metadata.py
CHANGED
@@ -2,6 +2,8 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
from typing import TYPE_CHECKING, Literal
|
4
4
|
|
5
|
+
from lamin_utils import logger
|
6
|
+
|
5
7
|
if TYPE_CHECKING:
|
6
8
|
from collections.abc import Sequence
|
7
9
|
|
@@ -31,12 +33,10 @@ class MetaData:
|
|
31
33
|
total_identifiers: The total number of identifiers in the `adata` object.
|
32
34
|
unmatched_identifiers: Unmatched identifiers in the `adata` object.
|
33
35
|
query_id: The column of `.obs` with cell line information.
|
34
|
-
reference_id: The type of cell line identifier in the
|
36
|
+
reference_id: The type of cell line identifier in the metadata.
|
35
37
|
metadata_type: The type of metadata where some identifiers are not matched during annotation such as
|
36
38
|
cell line, protein expression, bulk RNA expression, drug response, moa or compound.
|
37
|
-
Defaults to 'cell line'.
|
38
39
|
verbosity: The number of unmatched identifiers to print, can be either non-negative values or 'all'.
|
39
|
-
Defaults to 5.
|
40
40
|
"""
|
41
41
|
if isinstance(verbosity, str):
|
42
42
|
if verbosity != "all":
|
@@ -60,14 +60,11 @@ class MetaData:
|
|
60
60
|
if isinstance(verbosity, int) and verbosity >= 0:
|
61
61
|
verbosity = min(verbosity, len(unmatched_identifiers))
|
62
62
|
if verbosity > 0:
|
63
|
-
|
64
|
-
f"
|
63
|
+
logger.info(
|
64
|
+
f"There are {total_identifiers} identifiers in `adata.obs`."
|
65
65
|
f"However, {len(unmatched_identifiers)} identifiers can't be found in the {metadata_type} annotation,"
|
66
|
-
"leading to the presence of NA values for their respective metadata.\n"
|
67
|
-
"Please check again: "
|
68
|
-
*unmatched_identifiers[:verbosity],
|
69
|
-
"...",
|
70
|
-
sep="\n- ",
|
66
|
+
"leading to the presence of NA values for their respective metadata.\n"
|
67
|
+
f"Please check again: *unmatched_identifiers[:verbosity]..."
|
71
68
|
)
|
72
69
|
else:
|
73
70
|
raise ValueError("Only 'all' or a non-negative value is accepted.")
|
pertpy/metadata/_moa.py
CHANGED
@@ -5,7 +5,6 @@ from typing import TYPE_CHECKING
|
|
5
5
|
|
6
6
|
import numpy as np
|
7
7
|
import pandas as pd
|
8
|
-
from rich import print
|
9
8
|
from scanpy import settings
|
10
9
|
|
11
10
|
from pertpy.data._dataloader import _download
|
@@ -26,7 +25,6 @@ class Moa(MetaData):
|
|
26
25
|
def _download_clue(self) -> None:
|
27
26
|
clue_path = Path(settings.cachedir) / "repurposing_drugs_20200324.txt"
|
28
27
|
if not Path(clue_path).exists():
|
29
|
-
print("[bold yellow]No metadata file was found for clue. Starting download now.")
|
30
28
|
_download(
|
31
29
|
url="https://s3.amazonaws.com/data.clue.io/repurposing/downloads/repurposing_drugs_20200324.txt",
|
32
30
|
output_file_name="repurposing_drugs_20200324.txt",
|
@@ -51,12 +49,10 @@ class Moa(MetaData):
|
|
51
49
|
|
52
50
|
Args:
|
53
51
|
adata: The data object to annotate.
|
54
|
-
query_id: The column of `.obs` with the name of a perturbagen.
|
52
|
+
query_id: The column of `.obs` with the name of a perturbagen.
|
55
53
|
target: The column of `.obs` with target information. If set to None, all MoAs are retrieved without comparing molecular targets.
|
56
|
-
Defaults to None.
|
57
54
|
verbosity: The number of unmatched identifiers to print, can be either non-negative values or 'all'.
|
58
|
-
|
59
|
-
copy: Determines whether a copy of the `adata` is returned. Defaults to False.
|
55
|
+
copy: Determines whether a copy of the `adata` is returned.
|
60
56
|
|
61
57
|
Returns:
|
62
58
|
Returns an AnnData object with MoA annotation.
|
pertpy/plot/__init__.py
CHANGED
pertpy/preprocessing/__init__.py
CHANGED
pertpy/preprocessing/_guide_rna.py
CHANGED
@@ -34,9 +34,8 @@ class GuideAssignment:
|
|
34
34
|
assignment_threshold: The count threshold that is required for an assignment to be viable.
|
35
35
|
layer: Key to the layer containing raw count values of the gRNAs.
|
36
36
|
adata.X is used if layer is None. Expects count data.
|
37
|
-
output_layer: Assigned guide will be saved on adata.layers[output_key].
|
37
|
+
output_layer: Assigned guide will be saved on adata.layers[output_key].
|
38
38
|
only_return_results: If True, input AnnData is not modified and the result is returned as an np.ndarray.
|
39
|
-
Defaults to False.
|
40
39
|
|
41
40
|
Examples:
|
42
41
|
Each cell is assigned to gRNA that occurs at least 5 times in the respective cell.
|
@@ -49,7 +48,7 @@ class GuideAssignment:
|
|
49
48
|
"""
|
50
49
|
counts = adata.X if layer is None else adata.layers[layer]
|
51
50
|
if scipy.sparse.issparse(counts):
|
52
|
-
counts = counts.A
|
51
|
+
counts = counts.toarray()
|
53
52
|
|
54
53
|
assigned_grnas = np.where(counts >= assignment_threshold, 1, 0)
|
55
54
|
assigned_grnas = scipy.sparse.csr_matrix(assigned_grnas)
|
@@ -93,7 +92,7 @@ class GuideAssignment:
|
|
93
92
|
"""
|
94
93
|
counts = adata.X if layer is None else adata.layers[layer]
|
95
94
|
if scipy.sparse.issparse(counts):
|
96
|
-
counts = counts.A
|
95
|
+
counts = counts.toarray()
|
97
96
|
|
98
97
|
assigned_grna = np.where(
|
99
98
|
counts.max(axis=1).squeeze() >= assignment_threshold,
|
@@ -127,7 +126,7 @@ class GuideAssignment:
|
|
127
126
|
adata: Annotated data matrix containing gRNA values
|
128
127
|
layer: Key to the layer containing log normalized count values of the gRNAs.
|
129
128
|
adata.X is used if layer is None.
|
130
|
-
order_by: The order of cells in y axis.
|
129
|
+
order_by: The order of cells in y axis.
|
131
130
|
If None, cells will be reordered to have a nice sparse representation.
|
132
131
|
If a string is provided, adata.obs[order_by] will be used as the order.
|
133
132
|
If a numpy array is provided, the array will be used for ordering.
|
@@ -153,9 +152,9 @@ class GuideAssignment:
|
|
153
152
|
|
154
153
|
if order_by is None:
|
155
154
|
if scipy.sparse.issparse(data):
|
156
|
-
max_values = data.max(axis=1).A.squeeze()
|
155
|
+
max_values = data.max(axis=1).toarray().squeeze()
|
157
156
|
data_argmax = data.argmax(axis=1).A.squeeze()
|
158
|
-
max_guide_index = np.where(max_values != data.min(axis=1).A.squeeze(), data_argmax, -1)
|
157
|
+
max_guide_index = np.where(max_values != data.min(axis=1).toarray().squeeze(), data_argmax, -1)
|
159
158
|
else:
|
160
159
|
max_guide_index = np.where(
|
161
160
|
data.max(axis=1).squeeze() != data.min(axis=1).squeeze(), data.argmax(axis=1).squeeze(), -1
|
pertpy/tools/__init__.py
CHANGED
@@ -1,19 +1,80 @@
|
|
1
|
+
from importlib import import_module
|
2
|
+
|
3
|
+
|
4
|
+
def lazy_import(module_path, class_name, extras):
|
5
|
+
def _import():
|
6
|
+
try:
|
7
|
+
for extra in extras:
|
8
|
+
import_module(extra)
|
9
|
+
except ImportError as e:
|
10
|
+
raise ImportError(
|
11
|
+
f"Extra dependencies required: {', '.join(extras)}. "
|
12
|
+
f"Please install with: pip install {' '.join(extras)}"
|
13
|
+
) from e
|
14
|
+
module = import_module(module_path)
|
15
|
+
return getattr(module, class_name)
|
16
|
+
|
17
|
+
return _import
|
18
|
+
|
19
|
+
|
1
20
|
from pertpy.tools._augur import Augur
|
2
21
|
from pertpy.tools._cinemaot import Cinemaot
|
3
|
-
from pertpy.tools._coda._sccoda import Sccoda
|
4
|
-
from pertpy.tools._coda._tasccoda import Tasccoda
|
5
22
|
from pertpy.tools._dialogue import Dialogue
|
6
|
-
from pertpy.tools._differential_gene_expression import DifferentialGeneExpression
|
7
23
|
from pertpy.tools._distances._distance_tests import DistanceTest
|
8
24
|
from pertpy.tools._distances._distances import Distance
|
9
25
|
from pertpy.tools._enrichment import Enrichment
|
10
26
|
from pertpy.tools._milo import Milo
|
11
27
|
from pertpy.tools._mixscape import Mixscape
|
12
28
|
from pertpy.tools._perturbation_space._clustering import ClusteringSpace
|
29
|
+
from pertpy.tools._perturbation_space._comparison import PerturbationComparison
|
13
30
|
from pertpy.tools._perturbation_space._discriminator_classifiers import (
|
14
|
-
DiscriminatorClassifierSpace,
|
15
31
|
LRClassifierSpace,
|
16
32
|
MLPClassifierSpace,
|
17
33
|
)
|
18
|
-
from pertpy.tools._perturbation_space._simple import
|
19
|
-
|
34
|
+
from pertpy.tools._perturbation_space._simple import (
|
35
|
+
CentroidSpace,
|
36
|
+
DBSCANSpace,
|
37
|
+
KMeansSpace,
|
38
|
+
PseudobulkSpace,
|
39
|
+
)
|
40
|
+
from pertpy.tools._scgen import Scgen
|
41
|
+
|
42
|
+
# from pertpy.tools._differential_gene_expression import DGEEVAL
|
43
|
+
|
44
|
+
CODA_EXTRAS = ["toytree", "arviz", "ete3"] # also pyqt5 technically
|
45
|
+
Sccoda = lazy_import("pertpy.tools._coda._sccoda", "Sccoda", CODA_EXTRAS)
|
46
|
+
Tasccoda = lazy_import("pertpy.tools._coda._tasccoda", "Tasccoda", CODA_EXTRAS)
|
47
|
+
|
48
|
+
DE_EXTRAS = ["formulaic", "pydeseq2"]
|
49
|
+
EdgeR = lazy_import("pertpy.tools._differential_gene_expression", "EdgeR", DE_EXTRAS + ["edger"])
|
50
|
+
PyDESeq2 = lazy_import("pertpy.tools._differential_gene_expression", "PyDESeq2", DE_EXTRAS)
|
51
|
+
Statsmodels = lazy_import("pertpy.tools._differential_gene_expression", "Statsmodels", DE_EXTRAS + ["statsmodels"])
|
52
|
+
TTest = lazy_import("pertpy.tools._differential_gene_expression", "TTest", DE_EXTRAS)
|
53
|
+
WilcoxonTest = lazy_import("pertpy.tools._differential_gene_expression", "WilcoxonTest", DE_EXTRAS)
|
54
|
+
|
55
|
+
__all__ = [
|
56
|
+
"Augur",
|
57
|
+
"Cinemaot",
|
58
|
+
"Sccoda",
|
59
|
+
"Tasccoda",
|
60
|
+
"Dialogue",
|
61
|
+
"EdgeR",
|
62
|
+
"PyDESeq2",
|
63
|
+
"WilcoxonTest",
|
64
|
+
"TTest",
|
65
|
+
"Statsmodels",
|
66
|
+
"DistanceTest",
|
67
|
+
"Distance",
|
68
|
+
"Enrichment",
|
69
|
+
"Milo",
|
70
|
+
"Mixscape",
|
71
|
+
"ClusteringSpace",
|
72
|
+
"LRClassifierSpace",
|
73
|
+
"MLPClassifierSpace",
|
74
|
+
"CentroidSpace",
|
75
|
+
"DBSCANSpace",
|
76
|
+
"KMeansSpace",
|
77
|
+
"PseudobulkSpace",
|
78
|
+
"Scgen",
|
79
|
+
"DGEEVAL",
|
80
|
+
]
|
pertpy/tools/_augur.py
CHANGED
@@ -14,6 +14,7 @@ import scanpy as sc
|
|
14
14
|
import statsmodels.api as sm
|
15
15
|
from anndata import AnnData
|
16
16
|
from joblib import Parallel, delayed
|
17
|
+
from lamin_utils import logger
|
17
18
|
from rich import print
|
18
19
|
from rich.progress import track
|
19
20
|
from scipy import sparse, stats
|
@@ -127,7 +128,7 @@ class Augur:
|
|
127
128
|
_ = input[cell_type_col]
|
128
129
|
_ = input[label_col]
|
129
130
|
except KeyError:
|
130
|
-
|
131
|
+
logger.error("No column names matching cell_type_col and label_col.")
|
131
132
|
|
132
133
|
label = input[label_col] if meta is None else meta[label_col]
|
133
134
|
cell_type = input[cell_type_col] if meta is None else meta[cell_type_col]
|
@@ -140,7 +141,7 @@ class Augur:
|
|
140
141
|
if adata.obs["label"].dtype.name == "category":
|
141
142
|
# filter samples according to label
|
142
143
|
if condition_label is not None and treatment_label is not None:
|
143
|
-
|
144
|
+
logger.info(f"Filtering samples with {condition_label} and {treatment_label} labels.")
|
144
145
|
adata = ad.concat(
|
145
146
|
[adata[adata.obs["label"] == condition_label], adata[adata.obs["label"] == treatment_label]]
|
146
147
|
)
|
@@ -556,7 +557,7 @@ class Augur:
|
|
556
557
|
try:
|
557
558
|
sc.pp.highly_variable_genes(adata)
|
558
559
|
except ValueError:
|
559
|
-
|
560
|
+
logger.warn("Data not normalized. Normalizing now using scanpy log1p normalize.")
|
560
561
|
sc.pp.log1p(adata)
|
561
562
|
sc.pp.highly_variable_genes(adata)
|
562
563
|
|
@@ -608,7 +609,7 @@ class Augur:
|
|
608
609
|
var_quantile: The quantile below which features will be filtered, based on their residuals in a loess model.
|
609
610
|
filter_negative_residuals: if `True`, filter residuals at a fixed threshold of zero, instead of `var_quantile`
|
610
611
|
span: Smoothing factor, as a fraction of the number of points to take into account.
|
611
|
-
Should be in the range (0, 1].
|
612
|
+
Should be in the range (0, 1].
|
612
613
|
|
613
614
|
Return:
|
614
615
|
AnnData object with additional select_variance column in var.
|
@@ -700,13 +701,11 @@ class Augur:
|
|
700
701
|
feature_perc: proportion of genes that are randomly selected as features for input to the classifier in each
|
701
702
|
subsample using the random gene filter
|
702
703
|
var_quantile: The quantile below which features will be filtered, based on their residuals in a loess model.
|
703
|
-
Defaults to 0.5.
|
704
704
|
span: Smoothing factor, as a fraction of the number of points to take into account. Should be in the range (0, 1].
|
705
|
-
Defaults to 0.75.
|
706
705
|
filter_negative_residuals: if `True`, filter residuals at a fixed threshold of zero, instead of `var_quantile`
|
707
706
|
n_threads: number of threads to use for parallelization
|
708
707
|
select_variance_features: Whether to select genes based on the original Augur implementation (True)
|
709
|
-
or using scanpy's highly_variable_genes (False).
|
708
|
+
or using scanpy's highly_variable_genes (False).
|
710
709
|
key_added: Key to add results to in .uns
|
711
710
|
augur_mode: One of 'default', 'velocity' or 'permute'. Setting augur_mode = "velocity" disables feature selection,
|
712
711
|
assuming feature selection has been performed by the RNA velocity procedure to produce the input matrix,
|
@@ -751,8 +750,8 @@ class Augur:
|
|
751
750
|
"full_results": defaultdict(list),
|
752
751
|
}
|
753
752
|
if select_variance_features:
|
754
|
-
|
755
|
-
|
753
|
+
logger.warning("Set smaller span value in the case of a `segmentation fault` error.")
|
754
|
+
logger.warning("Set larger span in case of svddc or other near singularities error.")
|
756
755
|
adata.obs["augur_score"] = nan
|
757
756
|
for cell_type in track(adata.obs["cell_type"].unique(), description="Processing data..."):
|
758
757
|
cell_type_subsample = adata[adata.obs["cell_type"] == cell_type].copy()
|
@@ -768,8 +767,8 @@ class Augur:
|
|
768
767
|
)
|
769
768
|
)
|
770
769
|
if len(cell_type_subsample) < min_cells:
|
771
|
-
|
772
|
-
f"
|
770
|
+
logger.warning(
|
771
|
+
f"Skipping {cell_type} cell type - {len(cell_type_subsample)} samples is less than min_cells {min_cells}."
|
773
772
|
)
|
774
773
|
elif (
|
775
774
|
cell_type_subsample.obs.groupby(
|
@@ -778,8 +777,8 @@ class Augur:
|
|
778
777
|
).y_.count()
|
779
778
|
< subsample_size
|
780
779
|
).any():
|
781
|
-
|
782
|
-
f"
|
780
|
+
logger.warning(
|
781
|
+
f"Skipping {cell_type} cell type - the number of samples for at least one class type is less than "
|
783
782
|
f"subsample size {subsample_size}."
|
784
783
|
)
|
785
784
|
else:
|
@@ -821,7 +820,7 @@ class Augur:
|
|
821
820
|
results["full_results"]["cell_type"].extend([cell_type] * folds * n_subsamples)
|
822
821
|
# make sure one cell type worked
|
823
822
|
if len(results) <= 2:
|
824
|
-
|
823
|
+
logger.warning("No cells types had more than min_cells needed. Please adjust data or min_cells parameter.")
|
825
824
|
|
826
825
|
results["summary_metrics"] = pd.DataFrame(results["summary_metrics"])
|
827
826
|
results["feature_importances"] = pd.DataFrame(results["feature_importances"])
|
@@ -850,7 +849,7 @@ class Augur:
|
|
850
849
|
augur2: Augurpy results from condition 2, obtained from `predict()[1]`
|
851
850
|
permuted1: permuted Augurpy results from condition 1, obtained from `predict()` with argument `augur_mode=permute`
|
852
851
|
permuted2: permuted Augurpy results from condition 2, obtained from `predict()` with argument `augur_mode=permute`
|
853
|
-
n_subsamples: number of subsamples to pool when calculating the mean augur score for each permutation
|
852
|
+
n_subsamples: number of subsamples to pool when calculating the mean augur score for each permutation.
|
854
853
|
n_permutations: the total number of mean augur scores to calculate from a background distribution
|
855
854
|
|
856
855
|
Returns:
|
pertpy/tools/_cinemaot.py
CHANGED
@@ -338,7 +338,7 @@ class Cinemaot:
|
|
338
338
|
sc.tl.leiden(adata, resolution=cf_resolution)
|
339
339
|
df["ct"] = adata.obs["leiden"].astype(str)
|
340
340
|
df["ptb"] = "control"
|
341
|
-
df[
|
341
|
+
df.loc[adata.obs[pert_key] != control, "ptb"] = de.obs["leiden"].astype(str)
|
342
342
|
label_list.append("ptb")
|
343
343
|
df = df.groupby(label_list).sum()
|
344
344
|
new_index = df.index.map(lambda x: "_".join(map(str, x)))
|
@@ -432,7 +432,7 @@ class Cinemaot:
|
|
432
432
|
expr_label = "control"
|
433
433
|
|
434
434
|
adata_.obs["ct"] = ref_label
|
435
|
-
adata_.obs[
|
435
|
+
adata_.obs.loc[adata_.obs[pert_key] == control, "ct"] = expr_label
|
436
436
|
pert_key = "ct"
|
437
437
|
z = np.zeros(adata_.shape[0]) + 1
|
438
438
|
|