pertpy 0.6.0__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pertpy/__init__.py +4 -2
 - pertpy/data/__init__.py +66 -1
 - pertpy/data/_dataloader.py +28 -26
 - pertpy/data/_datasets.py +261 -92
 - pertpy/metadata/__init__.py +6 -0
 - pertpy/metadata/_cell_line.py +795 -0
 - pertpy/metadata/_compound.py +128 -0
 - pertpy/metadata/_drug.py +238 -0
 - pertpy/metadata/_look_up.py +569 -0
 - pertpy/metadata/_metadata.py +70 -0
 - pertpy/metadata/_moa.py +125 -0
 - pertpy/plot/__init__.py +0 -13
 - pertpy/preprocessing/__init__.py +2 -0
 - pertpy/preprocessing/_guide_rna.py +89 -6
 - pertpy/tools/__init__.py +48 -15
 - pertpy/tools/_augur.py +329 -32
 - pertpy/tools/_cinemaot.py +145 -6
 - pertpy/tools/_coda/_base_coda.py +1237 -116
 - pertpy/tools/_coda/_sccoda.py +66 -36
 - pertpy/tools/_coda/_tasccoda.py +46 -39
 - pertpy/tools/_dialogue.py +180 -77
 - pertpy/tools/_differential_gene_expression/__init__.py +20 -0
 - pertpy/tools/_differential_gene_expression/_base.py +657 -0
 - pertpy/tools/_differential_gene_expression/_checks.py +41 -0
 - pertpy/tools/_differential_gene_expression/_dge_comparison.py +86 -0
 - pertpy/tools/_differential_gene_expression/_edger.py +125 -0
 - pertpy/tools/_differential_gene_expression/_formulaic.py +189 -0
 - pertpy/tools/_differential_gene_expression/_pydeseq2.py +95 -0
 - pertpy/tools/_differential_gene_expression/_simple_tests.py +162 -0
 - pertpy/tools/_differential_gene_expression/_statsmodels.py +72 -0
 - pertpy/tools/_distances/_distance_tests.py +29 -24
 - pertpy/tools/_distances/_distances.py +584 -98
 - pertpy/tools/_enrichment.py +460 -0
 - pertpy/tools/_kernel_pca.py +1 -1
 - pertpy/tools/_milo.py +406 -49
 - pertpy/tools/_mixscape.py +677 -55
 - pertpy/tools/_perturbation_space/_clustering.py +10 -3
 - pertpy/tools/_perturbation_space/_comparison.py +112 -0
 - pertpy/tools/_perturbation_space/_discriminator_classifiers.py +524 -0
 - pertpy/tools/_perturbation_space/_perturbation_space.py +146 -52
 - pertpy/tools/_perturbation_space/_simple.py +52 -11
 - pertpy/tools/_scgen/__init__.py +1 -1
 - pertpy/tools/_scgen/_base_components.py +2 -3
 - pertpy/tools/_scgen/_scgen.py +706 -0
 - pertpy/tools/_scgen/_utils.py +3 -5
 - pertpy/tools/decoupler_LICENSE +674 -0
 - {pertpy-0.6.0.dist-info → pertpy-0.8.0.dist-info}/METADATA +48 -20
 - pertpy-0.8.0.dist-info/RECORD +57 -0
 - {pertpy-0.6.0.dist-info → pertpy-0.8.0.dist-info}/WHEEL +1 -1
 - pertpy/plot/_augur.py +0 -234
 - pertpy/plot/_cinemaot.py +0 -81
 - pertpy/plot/_coda.py +0 -1001
 - pertpy/plot/_dialogue.py +0 -91
 - pertpy/plot/_guide_rna.py +0 -82
 - pertpy/plot/_milopy.py +0 -284
 - pertpy/plot/_mixscape.py +0 -594
 - pertpy/plot/_scgen.py +0 -337
 - pertpy/tools/_differential_gene_expression.py +0 -99
 - pertpy/tools/_metadata/__init__.py +0 -0
 - pertpy/tools/_metadata/_cell_line.py +0 -613
 - pertpy/tools/_metadata/_look_up.py +0 -342
 - pertpy/tools/_perturbation_space/_discriminator_classifier.py +0 -381
 - pertpy/tools/_scgen/_jax_scgen.py +0 -370
 - pertpy-0.6.0.dist-info/RECORD +0 -50
 - /pertpy/tools/_scgen/{_jax_scgenvae.py → _scgenvae.py} +0 -0
 - {pertpy-0.6.0.dist-info → pertpy-0.8.0.dist-info}/licenses/LICENSE +0 -0
 
| 
         @@ -0,0 +1,569 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            from __future__ import annotations
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            from collections import namedtuple
         
     | 
| 
      
 4 
     | 
    
         
            +
            from typing import TYPE_CHECKING, Literal
         
     | 
| 
      
 5 
     | 
    
         
            +
             
     | 
| 
      
 6 
     | 
    
         
            +
            from lamin_utils import logger
         
     | 
| 
      
 7 
     | 
    
         
            +
             
     | 
| 
      
 8 
     | 
    
         
            +
            if TYPE_CHECKING:
         
     | 
| 
      
 9 
     | 
    
         
            +
                from collections.abc import Sequence
         
     | 
| 
      
 10 
     | 
    
         
            +
             
     | 
| 
      
 11 
     | 
    
         
            +
            if TYPE_CHECKING:
         
     | 
| 
      
 12 
     | 
    
         
            +
                import pandas as pd
         
     | 
| 
      
 13 
     | 
    
         
            +
             
     | 
| 
      
 14 
     | 
    
         
            +
            import pubchempy as pcp
         
     | 
| 
      
 15 
     | 
    
         
            +
             
     | 
| 
      
 16 
     | 
    
         
            +
             
     | 
| 
      
 17 
     | 
    
         
            +
            class LookUp:
         
     | 
| 
      
 18 
     | 
    
         
            +
                """Generate LookUp object for different type of metadata."""
         
     | 
| 
      
 19 
     | 
    
         
            +
             
     | 
| 
      
 20 
     | 
    
         
            +
                def __init__(
         
     | 
| 
      
 21 
     | 
    
         
            +
                    self,
         
     | 
| 
      
 22 
     | 
    
         
            +
                    type: Literal["cell_line", "moa", "compound", "drug"] = "cell_line",
         
     | 
| 
      
 23 
     | 
    
         
            +
                    transfer_metadata: Sequence[pd.DataFrame] | None = None,
         
     | 
| 
      
 24 
     | 
    
         
            +
                ):
         
     | 
| 
      
 25 
     | 
    
         
            +
                    """
         
     | 
| 
      
 26 
     | 
    
         
            +
                    Args:
         
     | 
| 
      
 27 
     | 
    
         
            +
                        type: Metadata type for annotation. One of 'cell_line', 'compound', 'moa' or 'drug'.
         
     | 
| 
      
 28 
     | 
    
         
            +
                        transfer_metadata: DataFrames used to generate Lookup object.
         
     | 
| 
      
 29 
     | 
    
         
            +
                                           This is currently set to None for CompoundMetaData which does not require any dataframes for transfer.
         
     | 
| 
      
 30 
     | 
    
         
            +
                    """
         
     | 
| 
      
 31 
     | 
    
         
            +
                    self.type = type
         
     | 
| 
      
 32 
     | 
    
         
            +
                    if type == "cell_line":
         
     | 
| 
      
 33 
     | 
    
         
            +
                        self.cell_line_meta = transfer_metadata[0]
         
     | 
| 
      
 34 
     | 
    
         
            +
                        self.cl_cancer_project_meta = transfer_metadata[1]
         
     | 
| 
      
 35 
     | 
    
         
            +
                        self.gene_annotation = transfer_metadata[2]
         
     | 
| 
      
 36 
     | 
    
         
            +
                        self.bulk_rna_sanger = transfer_metadata[3]
         
     | 
| 
      
 37 
     | 
    
         
            +
                        self.bulk_rna_broad = transfer_metadata[4]
         
     | 
| 
      
 38 
     | 
    
         
            +
                        self.proteomics_data = transfer_metadata[5]
         
     | 
| 
      
 39 
     | 
    
         
            +
                        self.drug_response_gdsc1 = transfer_metadata[6]
         
     | 
| 
      
 40 
     | 
    
         
            +
                        self.drug_response_gdsc2 = transfer_metadata[7]
         
     | 
| 
      
 41 
     | 
    
         
            +
             
     | 
| 
      
 42 
     | 
    
         
            +
                        cell_line_annotation = namedtuple(
         
     | 
| 
      
 43 
     | 
    
         
            +
                            "cell_line_annotation",
         
     | 
| 
      
 44 
     | 
    
         
            +
                            "n_cell_line cell_line n_metadata metadata reference_id reference_id_example default_parameter",
         
     | 
| 
      
 45 
     | 
    
         
            +
                        )
         
     | 
| 
      
 46 
     | 
    
         
            +
                        cell_lines = namedtuple("cell_lines", ["depmap", "cancerrxgene"])
         
     | 
| 
      
 47 
     | 
    
         
            +
             
     | 
| 
      
 48 
     | 
    
         
            +
                        depmap_data = {
         
     | 
| 
      
 49 
     | 
    
         
            +
                            "n_cell_line": len(self.cell_line_meta.index),
         
     | 
| 
      
 50 
     | 
    
         
            +
                            "n_metadata": len(self.cell_line_meta.columns),
         
     | 
| 
      
 51 
     | 
    
         
            +
                            "cell_line": self.cell_line_meta.ModelID.values,
         
     | 
| 
      
 52 
     | 
    
         
            +
                            "metadata": self.cell_line_meta.columns.values,
         
     | 
| 
      
 53 
     | 
    
         
            +
                            "reference_id": [
         
     | 
| 
      
 54 
     | 
    
         
            +
                                "ModelID",
         
     | 
| 
      
 55 
     | 
    
         
            +
                                "CellLineName",
         
     | 
| 
      
 56 
     | 
    
         
            +
                                "StrippedCellLineName",
         
     | 
| 
      
 57 
     | 
    
         
            +
                                "CCLE_Name",
         
     | 
| 
      
 58 
     | 
    
         
            +
                            ],
         
     | 
| 
      
 59 
     | 
    
         
            +
                            "reference_id_example": "ModelID: ACH-000001 | CellLineName: NIH:OVCAR-3 | StrippedCellLineName: NIHOVCAR3 | CCLEName: NIHOVCAR3_OVARY",
         
     | 
| 
      
 60 
     | 
    
         
            +
                            "default_parameter": {
         
     | 
| 
      
 61 
     | 
    
         
            +
                                "cell_line_source": "DepMap",
         
     | 
| 
      
 62 
     | 
    
         
            +
                                "query_id": "DepMap_ID",
         
     | 
| 
      
 63 
     | 
    
         
            +
                                "reference_id": "ModelID",
         
     | 
| 
      
 64 
     | 
    
         
            +
                                "fetch": "None",
         
     | 
| 
      
 65 
     | 
    
         
            +
                            },
         
     | 
| 
      
 66 
     | 
    
         
            +
                        }
         
     | 
| 
      
 67 
     | 
    
         
            +
                        depmap_record = cell_line_annotation(**depmap_data)
         
     | 
| 
      
 68 
     | 
    
         
            +
             
     | 
| 
      
 69 
     | 
    
         
            +
                        cancerrxgene_data = {
         
     | 
| 
      
 70 
     | 
    
         
            +
                            "n_cell_line": len(self.cl_cancer_project_meta.index),
         
     | 
| 
      
 71 
     | 
    
         
            +
                            "n_metadata": len(self.cl_cancer_project_meta.columns),
         
     | 
| 
      
 72 
     | 
    
         
            +
                            "cell_line": self.cl_cancer_project_meta.stripped_cell_line_name.values,
         
     | 
| 
      
 73 
     | 
    
         
            +
                            "metadata": self.cl_cancer_project_meta.columns.values,
         
     | 
| 
      
 74 
     | 
    
         
            +
                            "reference_id": [
         
     | 
| 
      
 75 
     | 
    
         
            +
                                "cell_line_name",
         
     | 
| 
      
 76 
     | 
    
         
            +
                                "stripped_cell_line_name",
         
     | 
| 
      
 77 
     | 
    
         
            +
                                "Model ID",
         
     | 
| 
      
 78 
     | 
    
         
            +
                                "COSMIC ID",
         
     | 
| 
      
 79 
     | 
    
         
            +
                            ],
         
     | 
| 
      
 80 
     | 
    
         
            +
                            "reference_id_example": "cell_line_name: SNU-283 | stripped_cell_line_name: SNU283 | Model ID: SIDM00215 | COSMIC ID: 1659929",
         
     | 
| 
      
 81 
     | 
    
         
            +
                            "default_parameter": {
         
     | 
| 
      
 82 
     | 
    
         
            +
                                "query_id": "stripped_cell_line_name",
         
     | 
| 
      
 83 
     | 
    
         
            +
                                "reference_id": "stripped_cell_line_name",
         
     | 
| 
      
 84 
     | 
    
         
            +
                                "fetch": "None",
         
     | 
| 
      
 85 
     | 
    
         
            +
                            },
         
     | 
| 
      
 86 
     | 
    
         
            +
                        }
         
     | 
| 
      
 87 
     | 
    
         
            +
                        cancerrxgene_record = cell_line_annotation(**cancerrxgene_data)
         
     | 
| 
      
 88 
     | 
    
         
            +
                        self.cell_lines = cell_lines(depmap_record, cancerrxgene_record)
         
     | 
| 
      
 89 
     | 
    
         
            +
             
     | 
| 
      
 90 
     | 
    
         
            +
                        bulk_rna_annotation = namedtuple(
         
     | 
| 
      
 91 
     | 
    
         
            +
                            "bulk_rna_annotation",
         
     | 
| 
      
 92 
     | 
    
         
            +
                            "n_cell_line cell_line n_gene gene reference_id reference_id_example default_parameter",
         
     | 
| 
      
 93 
     | 
    
         
            +
                        )
         
     | 
| 
      
 94 
     | 
    
         
            +
                        bulk_rna_expression = namedtuple("bulk_rna_expression", ["broad", "sanger"])
         
     | 
| 
      
 95 
     | 
    
         
            +
             
     | 
| 
      
 96 
     | 
    
         
            +
                        broad_data = {
         
     | 
| 
      
 97 
     | 
    
         
            +
                            "n_cell_line": len(self.bulk_rna_broad.index),
         
     | 
| 
      
 98 
     | 
    
         
            +
                            "n_gene": len(self.bulk_rna_broad.columns),
         
     | 
| 
      
 99 
     | 
    
         
            +
                            "cell_line": self.bulk_rna_broad.index.values,
         
     | 
| 
      
 100 
     | 
    
         
            +
                            "gene": self.bulk_rna_broad.columns.values,
         
     | 
| 
      
 101 
     | 
    
         
            +
                            "reference_id": "DepMap_ID",
         
     | 
| 
      
 102 
     | 
    
         
            +
                            "reference_id_example": "DepMap_ID: ACH-001113",
         
     | 
| 
      
 103 
     | 
    
         
            +
                            "default_parameter": {
         
     | 
| 
      
 104 
     | 
    
         
            +
                                "query_id": "DepMap_ID",
         
     | 
| 
      
 105 
     | 
    
         
            +
                                "cell_line_source": "broad",
         
     | 
| 
      
 106 
     | 
    
         
            +
                            },
         
     | 
| 
      
 107 
     | 
    
         
            +
                        }
         
     | 
| 
      
 108 
     | 
    
         
            +
                        broad_record = bulk_rna_annotation(**broad_data)
         
     | 
| 
      
 109 
     | 
    
         
            +
             
     | 
| 
      
 110 
     | 
    
         
            +
                        sanger_data = {
         
     | 
| 
      
 111 
     | 
    
         
            +
                            "n_cell_line": len(self.bulk_rna_sanger.index),
         
     | 
| 
      
 112 
     | 
    
         
            +
                            "n_gene": len(self.bulk_rna_sanger.columns),
         
     | 
| 
      
 113 
     | 
    
         
            +
                            "cell_line": self.bulk_rna_sanger.index.values,
         
     | 
| 
      
 114 
     | 
    
         
            +
                            "gene": self.bulk_rna_sanger.columns.values,
         
     | 
| 
      
 115 
     | 
    
         
            +
                            "reference_id": "model_name",
         
     | 
| 
      
 116 
     | 
    
         
            +
                            "reference_id_example": "model_name: MEC-1",
         
     | 
| 
      
 117 
     | 
    
         
            +
                            "default_parameter": {
         
     | 
| 
      
 118 
     | 
    
         
            +
                                "query_id": "cell_line_name",
         
     | 
| 
      
 119 
     | 
    
         
            +
                                "cell_line_source": "sanger",
         
     | 
| 
      
 120 
     | 
    
         
            +
                            },
         
     | 
| 
      
 121 
     | 
    
         
            +
                        }
         
     | 
| 
      
 122 
     | 
    
         
            +
                        sanger_record = bulk_rna_annotation(**sanger_data)
         
     | 
| 
      
 123 
     | 
    
         
            +
                        self.bulk_rna = bulk_rna_expression(broad_record, sanger_record)
         
     | 
| 
      
 124 
     | 
    
         
            +
             
     | 
| 
      
 125 
     | 
    
         
            +
                        proteomics = namedtuple(
         
     | 
| 
      
 126 
     | 
    
         
            +
                            "proteomics",
         
     | 
| 
      
 127 
     | 
    
         
            +
                            "n_cell_line cell_line n_protein protein metadata reference_id reference_id_example default_parameter",
         
     | 
| 
      
 128 
     | 
    
         
            +
                        )
         
     | 
| 
      
 129 
     | 
    
         
            +
                        proteomics_data = {
         
     | 
| 
      
 130 
     | 
    
         
            +
                            "n_cell_line": len(self.proteomics_data["model_name"].unique()),
         
     | 
| 
      
 131 
     | 
    
         
            +
                            "n_protein": len(self.proteomics_data.uniprot_id.unique()),
         
     | 
| 
      
 132 
     | 
    
         
            +
                            "cell_line": self.proteomics_data["model_name"].unique(),
         
     | 
| 
      
 133 
     | 
    
         
            +
                            "protein": self.proteomics_data.uniprot_id.unique(),
         
     | 
| 
      
 134 
     | 
    
         
            +
                            "metadata": self.proteomics_data.columns.values,
         
     | 
| 
      
 135 
     | 
    
         
            +
                            "reference_id": ["model_id", "model_name"],
         
     | 
| 
      
 136 
     | 
    
         
            +
                            "reference_id_example": "model_id: SIDM00483 | model_name: SK-GT-4",
         
     | 
| 
      
 137 
     | 
    
         
            +
                            "default_parameter": {
         
     | 
| 
      
 138 
     | 
    
         
            +
                                "query_id": "cell_line_name",
         
     | 
| 
      
 139 
     | 
    
         
            +
                                "reference_id": "model_name",
         
     | 
| 
      
 140 
     | 
    
         
            +
                                "bulk_rna_information": "read_count",
         
     | 
| 
      
 141 
     | 
    
         
            +
                                "protein_information": "protein_intensity",
         
     | 
| 
      
 142 
     | 
    
         
            +
                                "protein_id": "uniprot_id",
         
     | 
| 
      
 143 
     | 
    
         
            +
                            },
         
     | 
| 
      
 144 
     | 
    
         
            +
                        }
         
     | 
| 
      
 145 
     | 
    
         
            +
                        self.proteomics = proteomics(**proteomics_data)
         
     | 
| 
      
 146 
     | 
    
         
            +
             
     | 
| 
      
 147 
     | 
    
         
            +
                        drug_response_annotation = namedtuple(
         
     | 
| 
      
 148 
     | 
    
         
            +
                            "drug_response_annotation",
         
     | 
| 
      
 149 
     | 
    
         
            +
                            "n_cell_line cell_line n_drug drug_name metadata reference_id reference_id_example default_parameter",
         
     | 
| 
      
 150 
     | 
    
         
            +
                        )
         
     | 
| 
      
 151 
     | 
    
         
            +
                        drug_response = namedtuple("drug_response", ["gdsc1", "gdsc2"])
         
     | 
| 
      
 152 
     | 
    
         
            +
             
     | 
| 
      
 153 
     | 
    
         
            +
                        gdsc1_data = {
         
     | 
| 
      
 154 
     | 
    
         
            +
                            "n_cell_line": len(self.drug_response_gdsc1["cell_line_name"].unique()),
         
     | 
| 
      
 155 
     | 
    
         
            +
                            "n_drug": len(self.drug_response_gdsc1.drug_name.unique()),
         
     | 
| 
      
 156 
     | 
    
         
            +
                            "cell_line": self.drug_response_gdsc1.cell_line_name.unique(),
         
     | 
| 
      
 157 
     | 
    
         
            +
                            "drug_name": self.drug_response_gdsc1.drug_name.unique(),
         
     | 
| 
      
 158 
     | 
    
         
            +
                            "metadata": self.drug_response_gdsc1.columns.values,
         
     | 
| 
      
 159 
     | 
    
         
            +
                            "reference_id": ["cell_line_name", "sanger_model_id", "cosmic_id"],
         
     | 
| 
      
 160 
     | 
    
         
            +
                            "reference_id_example": "cell_line_name: ES5 | sanger_model_id: SIDM00263 | cosmic_id: 684057",
         
     | 
| 
      
 161 
     | 
    
         
            +
                            "default_parameter": {
         
     | 
| 
      
 162 
     | 
    
         
            +
                                "gdsc_dataset": "1",
         
     | 
| 
      
 163 
     | 
    
         
            +
                                "query_id": "cell_line_name",
         
     | 
| 
      
 164 
     | 
    
         
            +
                                "reference_id": "cell_line_name",
         
     | 
| 
      
 165 
     | 
    
         
            +
                                "query_perturbation": "perturbation",
         
     | 
| 
      
 166 
     | 
    
         
            +
                                "reference_perturbation": "drug_name",
         
     | 
| 
      
 167 
     | 
    
         
            +
                            },
         
     | 
| 
      
 168 
     | 
    
         
            +
                        }
         
     | 
| 
      
 169 
     | 
    
         
            +
                        gdsc1_dict = drug_response_annotation(**gdsc1_data)
         
     | 
| 
      
 170 
     | 
    
         
            +
             
     | 
| 
      
 171 
     | 
    
         
            +
                        gdsc2_data = {
         
     | 
| 
      
 172 
     | 
    
         
            +
                            "n_cell_line": len(self.drug_response_gdsc2["cell_line_name"].unique()),
         
     | 
| 
      
 173 
     | 
    
         
            +
                            "n_drug": len(self.drug_response_gdsc2.drug_name.unique()),
         
     | 
| 
      
 174 
     | 
    
         
            +
                            "cell_line": self.drug_response_gdsc2.cell_line_name.unique(),
         
     | 
| 
      
 175 
     | 
    
         
            +
                            "drug_name": self.drug_response_gdsc2.drug_name.unique(),
         
     | 
| 
      
 176 
     | 
    
         
            +
                            "metadata": self.drug_response_gdsc2.columns.values,
         
     | 
| 
      
 177 
     | 
    
         
            +
                            "reference_id": ["cell_line_name", "sanger_model_id", "cosmic_id"],
         
     | 
| 
      
 178 
     | 
    
         
            +
                            "reference_id_example": "cell_line_name: PFSK-1 | sanger_model_id: SIDM01132 | cosmic_id: 683667",
         
     | 
| 
      
 179 
     | 
    
         
            +
                            "default_parameter": {
         
     | 
| 
      
 180 
     | 
    
         
            +
                                "gdsc_dataset": "1",
         
     | 
| 
      
 181 
     | 
    
         
            +
                                "query_id": "cell_line_name",
         
     | 
| 
      
 182 
     | 
    
         
            +
                                "reference_id": "cell_line_name",
         
     | 
| 
      
 183 
     | 
    
         
            +
                                "query_perturbation": "perturbation",
         
     | 
| 
      
 184 
     | 
    
         
            +
                                "reference_perturbation": "drug_name",
         
     | 
| 
      
 185 
     | 
    
         
            +
                            },
         
     | 
| 
      
 186 
     | 
    
         
            +
                        }
         
     | 
| 
      
 187 
     | 
    
         
            +
                        gdsc2_dict = drug_response_annotation(**gdsc2_data)
         
     | 
| 
      
 188 
     | 
    
         
            +
             
     | 
| 
      
 189 
     | 
    
         
            +
                        self.drug_response = drug_response(gdsc1_dict, gdsc2_dict)
         
     | 
| 
      
 190 
     | 
    
         
            +
             
     | 
| 
      
 191 
     | 
    
         
            +
                    elif type == "moa":
         
     | 
| 
      
 192 
     | 
    
         
            +
                        self.moa_meta = transfer_metadata[0]
         
     | 
| 
      
 193 
     | 
    
         
            +
                        moa_annotation = namedtuple(
         
     | 
| 
      
 194 
     | 
    
         
            +
                            "moa_annotation",
         
     | 
| 
      
 195 
     | 
    
         
            +
                            "n_pert n_moa query_id query_id_example target_example default_parameter",
         
     | 
| 
      
 196 
     | 
    
         
            +
                        )
         
     | 
| 
      
 197 
     | 
    
         
            +
                        moa_data = {
         
     | 
| 
      
 198 
     | 
    
         
            +
                            "n_pert": len(self.moa_meta.pert_iname.unique()),
         
     | 
| 
      
 199 
     | 
    
         
            +
                            "n_moa": len(self.moa_meta.moa.unique()),
         
     | 
| 
      
 200 
     | 
    
         
            +
                            "query_id": "pert_iname",
         
     | 
| 
      
 201 
     | 
    
         
            +
                            "query_id_example": [
         
     | 
| 
      
 202 
     | 
    
         
            +
                                "(R)-(-)-apomorphine",
         
     | 
| 
      
 203 
     | 
    
         
            +
                                "9-aminocamptothecin",
         
     | 
| 
      
 204 
     | 
    
         
            +
                                "A-803467",
         
     | 
| 
      
 205 
     | 
    
         
            +
                            ],
         
     | 
| 
      
 206 
     | 
    
         
            +
                            "target_example": [
         
     | 
| 
      
 207 
     | 
    
         
            +
                                "ADRA2A|ADRA2B|ADRA2C|CALY|DRD1|DRD2|DRD3|DRD4|DRD5|HTR1A|HTR1B|HTR1D|HTR2A|HTR2B|HTR2C|HTR5A",
         
     | 
| 
      
 208 
     | 
    
         
            +
                                "SCN10A",
         
     | 
| 
      
 209 
     | 
    
         
            +
                                "TOP1",
         
     | 
| 
      
 210 
     | 
    
         
            +
                            ],
         
     | 
| 
      
 211 
     | 
    
         
            +
                            "default_parameter": {
         
     | 
| 
      
 212 
     | 
    
         
            +
                                "query_id": "pert_iname",
         
     | 
| 
      
 213 
     | 
    
         
            +
                                "target": None,
         
     | 
| 
      
 214 
     | 
    
         
            +
                            },
         
     | 
| 
      
 215 
     | 
    
         
            +
                        }
         
     | 
| 
      
 216 
     | 
    
         
            +
                        self.moa = moa_annotation(**moa_data)
         
     | 
| 
      
 217 
     | 
    
         
            +
             
     | 
| 
      
 218 
     | 
    
         
            +
                    elif type == "compound":
         
     | 
| 
      
 219 
     | 
    
         
            +
                        compound_annotation = namedtuple("compound_annotation", "query_id query_id_example default_parameter")
         
     | 
| 
      
 220 
     | 
    
         
            +
                        compound_data = {
         
     | 
| 
      
 221 
     | 
    
         
            +
                            "query_id_type": ["name", "cid"],
         
     | 
| 
      
 222 
     | 
    
         
            +
                            "query_id_example": "name: ACH-000016 | cid: SLR 21",
         
     | 
| 
      
 223 
     | 
    
         
            +
                            "default_parameter": {
         
     | 
| 
      
 224 
     | 
    
         
            +
                                "query_id": "perturbation",
         
     | 
| 
      
 225 
     | 
    
         
            +
                                "query_id_type": "name",
         
     | 
| 
      
 226 
     | 
    
         
            +
                            },
         
     | 
| 
      
 227 
     | 
    
         
            +
                        }
         
     | 
| 
      
 228 
     | 
    
         
            +
                        self.compound = compound_annotation(**compound_data)
         
     | 
| 
      
 229 
     | 
    
         
            +
             
     | 
| 
      
 230 
     | 
    
         
            +
                    elif type == "drug":
         
     | 
| 
      
 231 
     | 
    
         
            +
                        self.chembl = transfer_metadata[0]
         
     | 
| 
      
 232 
     | 
    
         
            +
                        self.dgidb = transfer_metadata[1]
         
     | 
| 
      
 233 
     | 
    
         
            +
                        self.pharmgkb = transfer_metadata[2]
         
     | 
| 
      
 234 
     | 
    
         
            +
             
     | 
| 
      
 235 
     | 
    
         
            +
                        drug_annotation = namedtuple(
         
     | 
| 
      
 236 
     | 
    
         
            +
                            "drug_annotation",
         
     | 
| 
      
 237 
     | 
    
         
            +
                            "n_compound compound_example n_target target_example n_disease disease_example",
         
     | 
| 
      
 238 
     | 
    
         
            +
                        )
         
     | 
| 
      
 239 
     | 
    
         
            +
                        drugs = namedtuple("drugs", ["chembl", "dgidb", "pharmgkb"])
         
     | 
| 
      
 240 
     | 
    
         
            +
             
     | 
| 
      
 241 
     | 
    
         
            +
                        dgidb_data = {
         
     | 
| 
      
 242 
     | 
    
         
            +
                            "n_compound": len(self.dgidb.drug_claim_name.unique()),
         
     | 
| 
      
 243 
     | 
    
         
            +
                            "n_target": len(self.dgidb.gene_claim_name.unique()),
         
     | 
| 
      
 244 
     | 
    
         
            +
                            "compound_example": self.dgidb.drug_claim_name.values[0:5],
         
     | 
| 
      
 245 
     | 
    
         
            +
                            "target_example": self.dgidb.gene_claim_name.unique()[0:5],
         
     | 
| 
      
 246 
     | 
    
         
            +
                            "n_disease": 0,
         
     | 
| 
      
 247 
     | 
    
         
            +
                            "disease_example": "",
         
     | 
| 
      
 248 
     | 
    
         
            +
                        }
         
     | 
| 
      
 249 
     | 
    
         
            +
                        dgidb_record = drug_annotation(**dgidb_data)
         
     | 
| 
      
 250 
     | 
    
         
            +
             
     | 
| 
      
 251 
     | 
    
         
            +
                        chembl_targets = list(
         
     | 
| 
      
 252 
     | 
    
         
            +
                            {t for target in self.chembl.targets.tolist() for t in target}
         
     | 
| 
      
 253 
     | 
    
         
            +
                        )  # flatten the target column and remove duplicates
         
     | 
| 
      
 254 
     | 
    
         
            +
                        chembl_data = {
         
     | 
| 
      
 255 
     | 
    
         
            +
                            "n_compound": len(self.chembl.compounds),
         
     | 
| 
      
 256 
     | 
    
         
            +
                            "n_target": len(chembl_targets),
         
     | 
| 
      
 257 
     | 
    
         
            +
                            "compound_example": self.chembl.compounds.values[0:5],
         
     | 
| 
      
 258 
     | 
    
         
            +
                            "target_example": chembl_targets[0:5],
         
     | 
| 
      
 259 
     | 
    
         
            +
                            "n_disease": 0,
         
     | 
| 
      
 260 
     | 
    
         
            +
                            "disease_example": "",
         
     | 
| 
      
 261 
     | 
    
         
            +
                        }
         
     | 
| 
      
 262 
     | 
    
         
            +
                        chembl_record = drug_annotation(**chembl_data)
         
     | 
| 
      
 263 
     | 
    
         
            +
             
     | 
| 
      
 264 
     | 
    
         
            +
                        pharmgkb_data = {
         
     | 
| 
      
 265 
     | 
    
         
            +
                            "n_compound": len(self.pharmgkb[self.pharmgkb.Type == "Chemical"]["Compound|Disease"].unique()),
         
     | 
| 
      
 266 
     | 
    
         
            +
                            "n_target": len(self.pharmgkb.Gene.unique()),
         
     | 
| 
      
 267 
     | 
    
         
            +
                            "compound_example": self.pharmgkb[self.pharmgkb.Type == "Chemical"]["Compound|Disease"].unique()[0:5],
         
     | 
| 
      
 268 
     | 
    
         
            +
                            "target_example": self.pharmgkb.Gene.unique()[0:5],
         
     | 
| 
      
 269 
     | 
    
         
            +
                            "n_disease": len(self.pharmgkb[self.pharmgkb.Type == "Disease"]["Compound|Disease"].unique()),
         
     | 
| 
      
 270 
     | 
    
         
            +
                            "disease_example": self.pharmgkb[self.pharmgkb.Type == "Disease"]["Compound|Disease"].unique()[0:5],
         
     | 
| 
      
 271 
     | 
    
         
            +
                        }
         
     | 
| 
      
 272 
     | 
    
         
            +
                        pharmgkb_record = drug_annotation(**pharmgkb_data)
         
     | 
| 
      
 273 
     | 
    
         
            +
                        self.drugs = drugs(chembl_record, dgidb_record, pharmgkb_record)
         
     | 
| 
      
 274 
     | 
    
         
            +
             
     | 
| 
      
 275 
     | 
    
         
            +
                    else:
         
     | 
| 
      
 276 
     | 
    
         
            +
                        raise NotImplementedError
         
     | 
| 
      
 277 
     | 
    
         
            +
             
     | 
| 
      
 278 
     | 
    
         
            +
                def available_cell_lines(
         
     | 
| 
      
 279 
     | 
    
         
            +
                    self,
         
     | 
| 
      
 280 
     | 
    
         
            +
                    cell_line_source: Literal["DepMap", "Cancerrxgene"] = "DepMap",
         
     | 
| 
      
 281 
     | 
    
         
            +
                    reference_id: str = "ModelID",
         
     | 
| 
      
 282 
     | 
    
         
            +
                    query_id_list: Sequence[str] | None = None,
         
     | 
| 
      
 283 
     | 
    
         
            +
                ) -> None:
         
     | 
| 
      
 284 
     | 
    
         
            +
                    """A brief summary of cell line metadata.
         
     | 
| 
      
 285 
     | 
    
         
            +
             
     | 
| 
      
 286 
     | 
    
         
            +
                    Args:
         
     | 
| 
      
 287 
     | 
    
         
            +
                        cell_line_source: the source of cell line annotation, DepMap or Cancerrxgene.
         
     | 
| 
      
 288 
     | 
    
         
            +
                        reference_id: The type of cell line identifier in the meta data, e.g. ModelID, CellLineName	or StrippedCellLineName.
         
     | 
| 
      
 289 
     | 
    
         
            +
                            If fetch cell line metadata from Cancerrxgene, it is recommended to choose "stripped_cell_line_name".
         
     | 
| 
      
 290 
     | 
    
         
            +
                        query_id_list: Unique cell line identifiers to test the number of matched ids present in the
         
     | 
| 
      
 291 
     | 
    
         
            +
                            metadata. If set to None, the query of metadata identifiers will be disabled.
         
     | 
| 
      
 292 
     | 
    
         
            +
                    """
         
     | 
| 
      
 293 
     | 
    
         
            +
                    if self.type != "cell_line":
         
     | 
| 
      
 294 
     | 
    
         
            +
                        raise ValueError("This is not a LookUp object specifically for CellLineMetaData!")
         
     | 
| 
      
 295 
     | 
    
         
            +
             
     | 
| 
      
 296 
     | 
    
         
            +
                    if query_id_list is not None:
         
     | 
| 
      
 297 
     | 
    
         
            +
                        identifier_num_all = len(query_id_list)
         
     | 
| 
      
 298 
     | 
    
         
            +
                        if cell_line_source == "DepMap":
         
     | 
| 
      
 299 
     | 
    
         
            +
                            if reference_id not in self.cell_line_meta.columns:
         
     | 
| 
      
 300 
     | 
    
         
            +
                                raise ValueError(
         
     | 
| 
      
 301 
     | 
    
         
            +
                                    f"The specified `reference_id` {reference_id} is not available in the DepMap cell line annotation data. "
         
     | 
| 
      
 302 
     | 
    
         
            +
                                )
         
     | 
| 
      
 303 
     | 
    
         
            +
                            not_matched_identifiers = list(set(query_id_list) - set(self.cell_line_meta[reference_id]))
         
     | 
| 
      
 304 
     | 
    
         
            +
                        else:
         
     | 
| 
      
 305 
     | 
    
         
            +
                            if reference_id == "ModelID":
         
     | 
| 
      
 306 
     | 
    
         
            +
                                reference_id = "stripped_cell_line_name"
         
     | 
| 
      
 307 
     | 
    
         
            +
                            if reference_id not in self.cl_cancer_project_meta.columns:
         
     | 
| 
      
 308 
     | 
    
         
            +
                                raise ValueError(
         
     | 
| 
      
 309 
     | 
    
         
            +
                                    f"The specified `reference_id` {reference_id} is not available "
         
     | 
| 
      
 310 
     | 
    
         
            +
                                    f"in the cell line annotation from the project Genomics of Drug Sensitivity in Cancer. "
         
     | 
| 
      
 311 
     | 
    
         
            +
                                )
         
     | 
| 
      
 312 
     | 
    
         
            +
                            not_matched_identifiers = list(set(query_id_list) - set(self.cl_cancer_project_meta[reference_id]))
         
     | 
| 
      
 313 
     | 
    
         
            +
             
     | 
| 
      
 314 
     | 
    
         
            +
                        logger.info(f"{len(not_matched_identifiers)} cell lines are not found in the metadata.")
         
     | 
| 
      
 315 
     | 
    
         
            +
                        logger.info(f"{identifier_num_all - len(not_matched_identifiers)} cell lines are found! ")
         
     | 
| 
      
 316 
     | 
    
         
            +
             
     | 
| 
      
 317 
     | 
    
         
            +
                def available_bulk_rna(
         
     | 
| 
      
 318 
     | 
    
         
            +
                    self,
         
     | 
| 
      
 319 
     | 
    
         
            +
                    cell_line_source: Literal["broad", "sanger"] = "sanger",
         
     | 
| 
      
 320 
     | 
    
         
            +
                    query_id_list: Sequence[str] | None = None,
         
     | 
| 
      
 321 
     | 
    
         
            +
                ) -> None:
         
     | 
| 
      
 322 
     | 
    
         
            +
                    """A brief summary of bulk RNA expression data.
         
     | 
| 
      
 323 
     | 
    
         
            +
             
     | 
| 
      
 324 
     | 
    
         
            +
                    Args:
         
     | 
| 
      
 325 
     | 
    
         
            +
                        cell_line_source: the source of RNA-seq data, broad or sanger.
         
     | 
| 
      
 326 
     | 
    
         
            +
                        query_id_list: Unique cell line identifiers to test the number of matched ids present in the
         
     | 
| 
      
 327 
     | 
    
         
            +
                            metadata. If set to None, the query of metadata identifiers will be disabled.
         
     | 
| 
      
 328 
     | 
    
         
            +
                    """
         
     | 
| 
      
 329 
     | 
    
         
            +
                    if self.type != "cell_line":
         
     | 
| 
      
 330 
     | 
    
         
            +
                        raise ValueError("This is not a LookUp object specific for CellLineMetaData!")
         
     | 
| 
      
 331 
     | 
    
         
            +
             
     | 
| 
      
 332 
     | 
    
         
            +
                    if cell_line_source == "broad":
         
     | 
| 
      
 333 
     | 
    
         
            +
                        bulk_rna = self.bulk_rna_broad
         
     | 
| 
      
 334 
     | 
    
         
            +
                    else:
         
     | 
| 
      
 335 
     | 
    
         
            +
                        bulk_rna = self.bulk_rna_sanger
         
     | 
| 
      
 336 
     | 
    
         
            +
             
     | 
| 
      
 337 
     | 
    
         
            +
                    if query_id_list is not None:
         
     | 
| 
      
 338 
     | 
    
         
            +
                        identifier_num_all = len(query_id_list)
         
     | 
| 
      
 339 
     | 
    
         
            +
                        not_matched_identifiers = list(set(query_id_list) - set(bulk_rna.index))
         
     | 
| 
      
 340 
     | 
    
         
            +
             
     | 
| 
      
 341 
     | 
    
         
            +
                        logger.info(f"{len(not_matched_identifiers)} cell lines are not found in the metadata.")
         
     | 
| 
      
 342 
     | 
    
         
            +
                        logger.info(f"{identifier_num_all - len(not_matched_identifiers)} cell lines are found! ")
         
     | 
| 
      
 343 
     | 
    
         
            +
             
     | 
| 
      
 344 
     | 
    
         
            +
                def available_protein_expression(
         
     | 
| 
      
 345 
     | 
    
         
            +
                    self,
         
     | 
| 
      
 346 
     | 
    
         
            +
                    reference_id: Literal["model_name", "model_id"] = "model_name",
         
     | 
| 
      
 347 
     | 
    
         
            +
                    query_id_list: Sequence[str] | None = None,
         
     | 
| 
      
 348 
     | 
    
         
            +
                ) -> None:
         
     | 
| 
      
 349 
     | 
    
         
            +
                    """A brief summary of protein expression data.
         
     | 
| 
      
 350 
     | 
    
         
            +
             
     | 
| 
      
 351 
     | 
    
         
            +
                    Args:
         
     | 
| 
      
 352 
     | 
    
         
            +
                        reference_id: The type of cell line identifier in the meta data, model_name or model_id.
         
     | 
| 
      
 353 
     | 
    
         
            +
                        query_id_list: Unique cell line identifiers to test the number of matched ids present in the
         
     | 
| 
      
 354 
     | 
    
         
            +
                            metadata. If set to None, the query of metadata identifiers will be disabled.
         
     | 
| 
      
 355 
     | 
    
         
            +
                    """
         
     | 
| 
      
 356 
     | 
    
         
            +
                    if self.type != "cell_line":
         
     | 
| 
      
 357 
     | 
    
         
            +
                        raise ValueError("This is not a LookUp object specific for CellLineMetaData!")
         
     | 
| 
      
 358 
     | 
    
         
            +
             
     | 
| 
      
 359 
     | 
    
         
            +
                    if query_id_list is not None:
         
     | 
| 
      
 360 
     | 
    
         
            +
                        identifier_num_all = len(query_id_list)
         
     | 
| 
      
 361 
     | 
    
         
            +
             
     | 
| 
      
 362 
     | 
    
         
            +
                        if reference_id not in self.proteomics_data.columns:
         
     | 
| 
      
 363 
     | 
    
         
            +
                            raise ValueError(
         
     | 
| 
      
 364 
     | 
    
         
            +
                                f"The specified `reference_id` {reference_id} is not available in the proteomics data. "
         
     | 
| 
      
 365 
     | 
    
         
            +
                            )
         
     | 
| 
      
 366 
     | 
    
         
            +
                        not_matched_identifiers = list(set(query_id_list) - set(self.proteomics_data[reference_id]))
         
     | 
| 
      
 367 
     | 
    
         
            +
                        logger.info(f"{len(not_matched_identifiers)} cell lines are not found in the metadata.")
         
     | 
| 
      
 368 
     | 
    
         
            +
                        logger.info(f"{identifier_num_all - len(not_matched_identifiers)} cell lines are found! ")
         
     | 
| 
      
 369 
     | 
    
         
            +
             
     | 
| 
      
 370 
     | 
    
         
            +
                def available_drug_response(
         
     | 
| 
      
 371 
     | 
    
         
            +
                    self,
         
     | 
| 
      
 372 
     | 
    
         
            +
                    gdsc_dataset: Literal[1, 2] = 1,
         
     | 
| 
      
 373 
     | 
    
         
            +
                    reference_id: Literal["cell_line_name", "sanger_model_id", "cosmic_id"] = "cell_line_name",
         
     | 
| 
      
 374 
     | 
    
         
            +
                    query_id_list: Sequence[str] | None = None,
         
     | 
| 
      
 375 
     | 
    
         
            +
                    reference_perturbation: Literal["drug_name", "drug_id"] = "drug_name",
         
     | 
| 
      
 376 
     | 
    
         
            +
                    query_perturbation_list: Sequence[str] | None = None,
         
     | 
| 
      
 377 
     | 
    
         
            +
                ) -> None:
         
     | 
| 
      
 378 
     | 
    
         
            +
                    """A brief summary of drug response data.
         
     | 
| 
      
 379 
     | 
    
         
            +
             
     | 
| 
      
 380 
     | 
    
         
            +
                    Args:
         
     | 
| 
      
 381 
     | 
    
         
            +
                        gdsc_dataset: The GDSC dataset, 1 or 2.
         
     | 
| 
      
 382 
     | 
    
         
            +
                                      The GDSC1 dataset updates previous releases with additional drug screening data from the Wellcome Sanger Institute and Massachusetts General Hospital.
         
     | 
| 
      
 383 
     | 
    
         
            +
                                      It covers 970 Cell lines and 403 Compounds with 333292 IC50s.
         
     | 
| 
      
 384 
     | 
    
         
            +
                                      GDSC2 is new and has 243,466 IC50 results from the latest screening at the Wellcome Sanger Institute using improved experimental procedures.
         
     | 
| 
      
 385 
     | 
    
         
            +
                        reference_id: The type of cell line identifier in the meta data, cell_line_name, sanger_model_id or cosmic_id.
         
     | 
| 
      
 386 
     | 
    
         
            +
                        query_id_list: Unique cell line identifiers to test the number of matched ids present in the metadata.
         
     | 
| 
      
 387 
     | 
    
         
            +
                                       If set to None, the query of metadata identifiers will be disabled.
         
     | 
| 
      
 388 
     | 
    
         
            +
                        reference_perturbation: The perturbation information in the meta data, drug_name or drug_id.
         
     | 
| 
      
 389 
     | 
    
         
            +
                        query_perturbation_list: Unique perturbation types to test the number of matched ones present in the metadata.
         
     | 
| 
      
 390 
     | 
    
         
            +
                                                 If set to None, the query of perturbation types will be disabled.
         
     | 
| 
      
 391 
     | 
    
         
            +
                    """
         
     | 
| 
      
 392 
     | 
    
         
            +
                    if self.type != "cell_line":
         
     | 
| 
      
 393 
     | 
    
         
            +
                        raise ValueError("This is not a LookUp object specific for CellLineMetaData!")
         
     | 
| 
      
 394 
     | 
    
         
            +
                    if gdsc_dataset == 1:
         
     | 
| 
      
 395 
     | 
    
         
            +
                        gdsc_data = self.drug_response_gdsc1
         
     | 
| 
      
 396 
     | 
    
         
            +
                    else:
         
     | 
| 
      
 397 
     | 
    
         
            +
                        gdsc_data = self.drug_response_gdsc2
         
     | 
| 
      
 398 
     | 
    
         
            +
             
     | 
| 
      
 399 
     | 
    
         
            +
                    if query_id_list is not None:
         
     | 
| 
      
 400 
     | 
    
         
            +
                        if reference_id not in gdsc_data.columns:
         
     | 
| 
      
 401 
     | 
    
         
            +
                            raise ValueError(
         
     | 
| 
      
 402 
     | 
    
         
            +
                                f"The specified `reference_id` {reference_id} is not available in the GDSC drug response data. "
         
     | 
| 
      
 403 
     | 
    
         
            +
                            )
         
     | 
| 
      
 404 
     | 
    
         
            +
                        identifier_num_all = len(query_id_list)
         
     | 
| 
      
 405 
     | 
    
         
            +
                        not_matched_identifiers = list(set(query_id_list) - set(gdsc_data[reference_id]))
         
     | 
| 
      
 406 
     | 
    
         
            +
                        logger.info(f"{len(not_matched_identifiers)} cell lines are not found in the metadata.")
         
     | 
| 
      
 407 
     | 
    
         
            +
                        logger.info(f"{identifier_num_all - len(not_matched_identifiers)} cell lines are found! ")
         
     | 
| 
      
 408 
     | 
    
         
            +
             
     | 
| 
      
 409 
     | 
    
         
            +
                    if query_perturbation_list is not None:
         
     | 
| 
      
 410 
     | 
    
         
            +
                        if reference_perturbation not in gdsc_data.columns:
         
     | 
| 
      
 411 
     | 
    
         
            +
                            raise ValueError(
         
     | 
| 
      
 412 
     | 
    
         
            +
                                f"The specified `reference_perturbation` {reference_perturbation} is not available in the GDSC drug response data. "
         
     | 
| 
      
 413 
     | 
    
         
            +
                            )
         
     | 
| 
      
 414 
     | 
    
         
            +
                        identifier_num_all = len(query_perturbation_list)
         
     | 
| 
      
 415 
     | 
    
         
            +
                        not_matched_identifiers = list(set(query_perturbation_list) - set(gdsc_data[reference_perturbation]))
         
     | 
| 
      
 416 
     | 
    
         
            +
                        logger.info(f"{len(not_matched_identifiers)} perturbation types are not found in the metadata.")
         
     | 
| 
      
 417 
     | 
    
         
            +
                        logger.info(f"{identifier_num_all - len(not_matched_identifiers)} perturbation types are found! ")
         
     | 
| 
      
 418 
     | 
    
         
            +
             
     | 
| 
      
 419 
     | 
    
         
            +
                def available_genes_annotation(
         
     | 
| 
      
 420 
     | 
    
         
            +
                    self,
         
     | 
| 
      
 421 
     | 
    
         
            +
                    reference_id: Literal["gene_id", "ensembl_gene_id", "hgnc_id", "hgnc_symbol"] = "ensembl_gene_id",
         
     | 
| 
      
 422 
     | 
    
         
            +
                    query_id_list: Sequence[str] | None = None,
         
     | 
| 
      
 423 
     | 
    
         
            +
                ) -> None:
         
     | 
| 
      
 424 
     | 
    
         
            +
                    """A brief summary of gene annotation metadata
         
     | 
| 
      
 425 
     | 
    
         
            +
             
     | 
| 
      
 426 
     | 
    
         
            +
                    Args:
         
     | 
| 
      
 427 
     | 
    
         
            +
                        reference_id: The type of gene identifier in the meta data, gene_id, ensembl_gene_id, hgnc_id, hgnc_symbol.
         
     | 
| 
      
 428 
     | 
    
         
            +
                        query_id_list: Unique gene identifiers to test the number of matched ids present in the metadata.
         
     | 
| 
      
 429 
     | 
    
         
            +
                    """
         
     | 
| 
      
 430 
     | 
    
         
            +
                    if self.type != "cell_line":
         
     | 
| 
      
 431 
     | 
    
         
            +
                        raise ValueError("This is not a LookUp object specific for CellLineMetaData!")
         
     | 
| 
      
 432 
     | 
    
         
            +
             
     | 
| 
      
 433 
     | 
    
         
            +
                    logger.info("To summarize: in the DepMap_Sanger gene annotation file, you can find: ")
         
     | 
| 
      
 434 
     | 
    
         
            +
                    logger.info(f"{len(self.gene_annotation.index)} driver genes")
         
     | 
| 
      
 435 
     | 
    
         
            +
                    logger.info(
         
     | 
| 
      
 436 
     | 
    
         
            +
                        f"{len(self.gene_annotation.columns)} meta data including: ",
         
     | 
| 
      
 437 
     | 
    
         
            +
                        *list(self.gene_annotation.columns.values),
         
     | 
| 
      
 438 
     | 
    
         
            +
                        sep="\n- ",
         
     | 
| 
      
 439 
     | 
    
         
            +
                    )
         
     | 
| 
      
 440 
     | 
    
         
            +
                    logger.info("Overview of gene annotation: ")
         
     | 
| 
      
 441 
     | 
    
         
            +
                    logger.info(self.gene_annotation.head().to_string())
         
     | 
| 
      
 442 
     | 
    
         
            +
                    """
         
     | 
| 
      
 443 
     | 
    
         
            +
                    #not implemented yet
         
     | 
| 
      
 444 
     | 
    
         
            +
                    print("Default parameters to annotate gene annotation: ")
         
     | 
| 
      
 445 
     | 
    
         
            +
                    default_param = {
         
     | 
| 
      
 446 
     | 
    
         
            +
                        "query_id": "ensembl_gene_id",
         
     | 
| 
      
 447 
     | 
    
         
            +
                    }
         
     | 
| 
      
 448 
     | 
    
         
            +
                    print("\n".join(f"- {k}: {v}" for k, v in default_param.items()))
         
     | 
| 
      
 449 
     | 
    
         
            +
                    if query_id_list is not None:
         
     | 
| 
      
 450 
     | 
    
         
            +
                        identifier_num_all = len(query_id_list)
         
     | 
| 
      
 451 
     | 
    
         
            +
                        not_matched_identifiers = list(set(query_id_list) - set(self.gene_annotation[reference_id]))
         
     | 
| 
      
 452 
     | 
    
         
            +
                        print(f"{len(not_matched_identifiers)} genes are not found in the metadata.")
         
     | 
| 
      
 453 
     | 
    
         
            +
                        print(f"{identifier_num_all - len(not_matched_identifiers)} genes are found! ")
         
     | 
| 
      
 454 
     | 
    
         
            +
                    """
         
     | 
| 
      
 455 
     | 
    
         
            +
             
     | 
| 
      
 456 
     | 
    
         
            +
                def available_moa(
                    self,
                    query_id_list: Sequence[str] | None = None,
                    target_list: Sequence[str] | None = None,
                ) -> None:
                    """A brief summary of MoA annotation.

                    Logs how many of the given perturbagens and/or molecular targets are present in the MoA metadata.

                    Args:
                        query_id_list: Unique perturbagens to test the number of matched ones present in the metadata.
                                       If set to None, the query of metadata perturbagens will be disabled.
                        target_list: Unique molecular targets to test the number of matched ones present in the metadata.
                                     If set to None, the comparison of molecular targets in the query of metadata perturbagens will be disabled.

                    Raises:
                        ValueError: If this LookUp object was not created for MoA metadata.
                    """
                    # Validate once, up front: both queries below read `self.moa_meta`, so a
                    # LookUp of another type must be rejected regardless of which argument is given.
                    if self.type != "moa":
                        raise ValueError("This is not a LookUp object specific for MoaMetaData!")

                    if query_id_list is not None:
                        identifier_num_all = len(query_id_list)
                        not_matched_identifiers = list(set(query_id_list) - set(self.moa_meta.pert_iname))
                        logger.info(f"{len(not_matched_identifiers)} perturbagens are not found in the metadata.")
                        logger.info(f"{identifier_num_all - len(not_matched_identifiers)} perturbagens are found! ")

                    if target_list is not None:
                        # Each entry of the target column is split on "|" into its individual targets,
                        # which are then flattened into a single set of known targets.
                        targets = self.moa_meta.target.astype(str).apply(lambda x: x.split("|"))
                        all_targets = {t for tl in targets for t in tl}
                        identifier_num_all = len(target_list)
                        not_matched_identifiers = list(set(target_list) - all_targets)
                        logger.info(f"{len(not_matched_identifiers)} molecular targets are not found in the metadata.")
                        logger.info(f"{identifier_num_all - len(not_matched_identifiers)} molecular targets are found! ")
         
     | 
| 
      
 484 
     | 
    
         
            +
             
     | 
| 
      
 485 
     | 
    
         
            +
                def available_compounds(
                    self,
                    query_id_list: Sequence[str] | None = None,
                    query_id_type: Literal["name", "cid"] = "name",
                ) -> None:
                    """Summarize how many compound identifiers can be resolved.

                    Each identifier is looked up through `pcp`; the numbers of unresolved and
                    resolved identifiers are then logged.

                    Args:
                        query_id_list: Unique compounds to check. If None, nothing is queried.
                        query_id_type: How the identifiers are interpreted, either "name" or "cid".

                    Raises:
                        ValueError: If this LookUp object was not created for compound metadata.
                    """
                    if self.type != "compound":
                        raise ValueError("This is not a LookUp object specific for CompoundData!")
                    if query_id_list is None:
                        return

                    def _is_unresolved(identifier) -> bool:
                        """Return True when the lookup for `identifier` yields no compound."""
                        if query_id_type == "name":
                            hits = pcp.get_compounds(identifier, "name")
                            return len(hits) == 0  # search did not work
                        try:
                            pcp.Compound.from_cid(identifier)
                        except pcp.BadRequestError:
                            return True
                        return False

                    n_queried = len(query_id_list)
                    unresolved = [identifier for identifier in query_id_list if _is_unresolved(identifier)]
                    n_unresolved = len(unresolved)

                    logger.info(f"{n_unresolved} compounds are not found in the metadata.")
                    logger.info(f"{n_queried - n_unresolved} compounds are found! ")
         
     | 
| 
      
 516 
     | 
    
         
            +
             
     | 
| 
      
 517 
     | 
    
         
            +
                def available_drug_annotation(
                    self,
                    drug_annotation_source: Literal["chembl", "dgidb", "pharmgkb"] = "chembl",
                    query_id_list: Sequence[str] | None = None,
                    query_id_type: Literal["target", "compound", "disease"] = "target",
                ) -> None:
                    """A brief summary of drug annotation.

                    Logs how many of the given identifiers are present in the selected drug annotation source.

                    Args:
                        drug_annotation_source: the source of drug annotation data, chembl, dgidb or pharmgkb.
                        query_id_list: Unique target or compound names to test the number of matched ones present in the metadata.
                                    If set to None, query of compound identifiers will be disabled.
                        query_id_type: The type of identifiers, target, compound and disease(pharmgkb only).

                    Raises:
                        ValueError: If this LookUp object was not created for drug metadata, or if
                            `query_id_type` is "disease" for the chembl or dgidb source.
                    """
                    if self.type != "drug":
                        raise ValueError("This is not a LookUp object specific for DrugMetaData!")
                    if query_id_list is not None:
                        identifier_num_all = len(query_id_list)
                        not_matched_identifiers = []

                        if drug_annotation_source == "chembl":
                            if query_id_type == "target":
                                # flatten the target column and remove duplicates
                                chembl_targets = {t for target in self.chembl.targets.tolist() for t in target}
                                not_matched_identifiers = list(set(query_id_list) - chembl_targets)
                            elif query_id_type == "compound":
                                # The column must be turned into a set first: subtracting the raw
                                # column from a set is not a set difference.
                                not_matched_identifiers = list(set(query_id_list) - set(self.chembl["compounds"]))
                            else:
                                raise ValueError(
                                    "Gene-disease association is not available in chembl dataset, please try with pharmgkb."
                                )

                        elif drug_annotation_source == "dgidb":
                            if query_id_type == "target":
                                not_matched_identifiers = list(set(query_id_list) - set(self.dgidb["gene_claim_name"]))
                            elif query_id_type == "compound":
                                not_matched_identifiers = list(set(query_id_list) - set(self.dgidb["drug_claim_name"]))
                            else:
                                raise ValueError(
                                    "Gene-disease association is not available in dgidb dataset, please try with pharmgkb."
                                )
                        else:
                            # Any source other than chembl/dgidb is handled as pharmgkb.
                            if query_id_type == "target":
                                not_matched_identifiers = list(set(query_id_list) - set(self.pharmgkb["Gene"]))
                            elif query_id_type == "compound":
                                # Compounds and diseases share one column; "Type" tells them apart.
                                compounds = self.pharmgkb[self.pharmgkb["Type"] == "Chemical"]
                                not_matched_identifiers = list(set(query_id_list) - set(compounds["Compound|Disease"]))
                            else:
                                diseases = self.pharmgkb[self.pharmgkb["Type"] == "Disease"]
                                not_matched_identifiers = list(set(query_id_list) - set(diseases["Compound|Disease"]))

                        logger.info(f"{len(not_matched_identifiers)} {query_id_type}s are not found in the metadata.")
                        logger.info(f"{identifier_num_all - len(not_matched_identifiers)} {query_id_type}s are found! ")
         
     | 
| 
         @@ -0,0 +1,70 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            from __future__ import annotations
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            from typing import TYPE_CHECKING, Literal
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
            from lamin_utils import logger
         
     | 
| 
      
 6 
     | 
    
         
            +
             
     | 
| 
      
 7 
     | 
    
         
            +
            if TYPE_CHECKING:
         
     | 
| 
      
 8 
     | 
    
         
            +
                from collections.abc import Sequence
         
     | 
| 
      
 9 
     | 
    
         
            +
             
     | 
| 
      
 10 
     | 
    
         
            +
             
     | 
| 
      
 11 
     | 
    
         
            +
            class MetaData:
                """Superclass for pertpy's MetaData components."""

                def _warn_unmatch(
                    self,
                    total_identifiers: int,
                    unmatched_identifiers: Sequence[str],
                    query_id: str,
                    reference_id: str,
                    metadata_type: Literal[
                        "cell line",
                        "protein expression",
                        "bulk RNA",
                        "drug response",
                        "moa",
                        "compound",
                    ] = "cell line",
                    verbosity: int | str = 5,
                ) -> None:
                    """Helper function to print out the unmatched identifiers.

                    Args:
                        total_identifiers: The total number of identifiers in the `adata` object.
                        unmatched_identifiers: Unmatched identifiers in the `adata` object.
                        query_id: The column of `.obs` with cell line information.
                        reference_id: The type of cell line identifier in the metadata.
                        metadata_type: The type of metadata where some identifiers are not matched during annotation such as
                                       cell line, protein expression, bulk RNA expression, drug response, moa or compound.
                        verbosity: The number of unmatched identifiers to print, can be either non-negative values or 'all'.

                    Raises:
                        ValueError: If `verbosity` is neither 'all' nor a non-negative integer, or if
                            none of the identifiers could be matched.
                    """
                    if isinstance(verbosity, str):
                        if verbosity != "all":
                            raise ValueError("Only a non-negative value or 'all' is accepted.")
                        # 'all' means: report every unmatched identifier.
                        verbosity = len(unmatched_identifiers)

                    if len(unmatched_identifiers) == total_identifiers:
                        hint = ""
                        if metadata_type in ["protein expression", "bulk RNA", "drug response"]:
                            hint = "Additionally, call the `CellLineMetaData.annotate()` function to acquire more possible query IDs that can be used for cell line annotation purposes."
                        raise ValueError(
                            f"Attempting to match the query id {query_id} in 'adata.obs' to the reference id {reference_id} in the metadata.\n"
                            f"However, none of the query IDs could be found in the {metadata_type} annotation data.\n"
                            "To resolve this issue, call the `lookup()` function to create a LookUp object.\n"
                            "This enables obtaining the count of matched identifiers in the AnnData object for different types of reference and query IDs.\n"
                            f"{hint}"
                        )
                    if len(unmatched_identifiers) == 0:
                        return
                    if isinstance(verbosity, int) and verbosity >= 0:
                        # Never try to show more identifiers than are actually unmatched.
                        verbosity = min(verbosity, len(unmatched_identifiers))
                        if verbosity > 0:
                            shown_identifiers = ", ".join(str(identifier) for identifier in unmatched_identifiers[:verbosity])
                            logger.info(
                                f"There are {total_identifiers} identifiers in `adata.obs`. "
                                f"However, {len(unmatched_identifiers)} identifiers can't be found in the {metadata_type} annotation, "
                                "leading to the presence of NA values for their respective metadata.\n"
                                f"Please check again: {shown_identifiers}..."
                            )
                    else:
                        raise ValueError("Only 'all' or a non-negative value is accepted.")
         
     |