PyPI - pertpy - Versions diffs - 0.9.5__py3-none-any.whl → 0.11.0__py3-none-any.whl - Mend

pertpy 0.9.5__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

pertpy/__init__.py +5 -1
pertpy/_doc.py +2 -5
pertpy/_types.py +6 -0
pertpy/data/_dataloader.py +68 -24
pertpy/data/_datasets.py +9 -9
pertpy/metadata/__init__.py +2 -1
pertpy/metadata/_cell_line.py +136 -30
pertpy/metadata/_look_up.py +13 -19
pertpy/metadata/_moa.py +1 -1
pertpy/preprocessing/_guide_rna.py +221 -39
pertpy/preprocessing/_guide_rna_mixture.py +177 -0
pertpy/tools/__init__.py +1 -1
pertpy/tools/_augur.py +138 -142
pertpy/tools/_cinemaot.py +75 -117
pertpy/tools/_coda/_base_coda.py +150 -174
pertpy/tools/_coda/_sccoda.py +66 -69
pertpy/tools/_coda/_tasccoda.py +71 -79
pertpy/tools/_dialogue.py +60 -56
pertpy/tools/_differential_gene_expression/_base.py +25 -43
pertpy/tools/_differential_gene_expression/_checks.py +4 -6
pertpy/tools/_differential_gene_expression/_dge_comparison.py +5 -6
pertpy/tools/_differential_gene_expression/_edger.py +6 -10
pertpy/tools/_differential_gene_expression/_pydeseq2.py +1 -1
pertpy/tools/_differential_gene_expression/_simple_tests.py +3 -3
pertpy/tools/_differential_gene_expression/_statsmodels.py +8 -5
pertpy/tools/_distances/_distance_tests.py +1 -2
pertpy/tools/_distances/_distances.py +86 -92
pertpy/tools/_enrichment.py +8 -25
pertpy/tools/_milo.py +23 -27
pertpy/tools/_mixscape.py +261 -175
pertpy/tools/_perturbation_space/_clustering.py +4 -4
pertpy/tools/_perturbation_space/_comparison.py +4 -4
pertpy/tools/_perturbation_space/_discriminator_classifiers.py +83 -32
pertpy/tools/_perturbation_space/_perturbation_space.py +10 -10
pertpy/tools/_perturbation_space/_simple.py +13 -17
pertpy/tools/_scgen/_scgen.py +17 -20
pertpy/tools/_scgen/_scgenvae.py +2 -2
pertpy/tools/_scgen/_utils.py +3 -1
{pertpy-0.9.5.dist-info → pertpy-0.11.0.dist-info}/METADATA +37 -21
pertpy-0.11.0.dist-info/RECORD +58 -0
{pertpy-0.9.5.dist-info → pertpy-0.11.0.dist-info}/licenses/LICENSE +1 -0
pertpy/tools/_kernel_pca.py +0 -50
pertpy-0.9.5.dist-info/RECORD +0 -57
{pertpy-0.9.5.dist-info → pertpy-0.11.0.dist-info}/WHEEL +0 -0

pertpy/__init__.py CHANGED Viewed

@@ -2,7 +2,7 @@
 __author__ = "Lukas Heumos"
 __email__ = "lukas.heumos@posteo.net"
-__version__ = "0.9.5"
+__version__ = "0.11.0"
 import warnings
@@ -14,6 +14,10 @@ warnings.filterwarnings("ignore", category=MatplotlibDeprecationWarning)
 warnings.filterwarnings("ignore", category=SyntaxWarning)
 warnings.filterwarnings("ignore", category=UserWarning, module="scvi._settings")
+import mudata
+mudata.set_options(pull_on_update=False)
 from . import data as dt
 from . import metadata as md
 from . import plot as pl

pertpy/_doc.py CHANGED Viewed

@@ -2,9 +2,7 @@ from textwrap import dedent
 def _doc_params(**kwds):  # pragma: no cover
-    """\
-    Docstrings should start with "\" in the first line for proper formatting.
-    """
+    r"""Docstrings should start with "\" in the first line for proper formatting."""
     def dec(obj):
         obj.__orig_doc__ = obj.__doc__
@@ -15,6 +13,5 @@ def _doc_params(**kwds):  # pragma: no cover
 doc_common_plot_args = """\
-show: if `True`, shows the plot.
-            return_fig: if `True`, returns figure of the plot.\
+return_fig: if `True`, returns figure of the plot, that can be used for saving.\
 """

pertpy/_types.py ADDED Viewed

@@ -0,0 +1,6 @@
+from scipy import sparse
+CSBase = sparse.csr_matrix | sparse.csc_matrix
+CSRBase = sparse.csr_matrix
+CSCBase = sparse.csc_matrix
+SpBase = sparse.spmatrix

pertpy/data/_dataloader.py CHANGED Viewed

@@ -1,4 +1,6 @@
+import shutil
 import tempfile
+import time
 from pathlib import Path
 from random import choice
 from string import ascii_lowercase
@@ -7,6 +9,7 @@ from zipfile import ZipFile
 import requests
 from filelock import FileLock
 from lamin_utils import logger
+from requests.exceptions import RequestException
 from rich.progress import Progress
@@ -17,7 +20,10 @@ def _download(  # pragma: no cover
     block_size: int = 1024,
     overwrite: bool = False,
     is_zip: bool = False,
-) -> None:
+    timeout: int = 30,
+    max_retries: int = 3,
+    retry_delay: int = 5,
+) -> Path:
     """Downloads a dataset irrespective of the format.
     Args:
@@ -27,6 +33,9 @@ def _download(  # pragma: no cover
         block_size: Block size for downloads in bytes.
         overwrite: Whether to overwrite existing files.
         is_zip: Whether the downloaded file needs to be unzipped.
+        timeout: Request timeout in seconds.
+        max_retries: Maximum number of retry attempts.
+        retry_delay: Delay between retries in seconds.
     """
     if output_file_name is None:
         letters = ascii_lowercase
@@ -35,36 +44,71 @@ def _download(  # pragma: no cover
     if output_path is None:
         output_path = tempfile.gettempdir()
-    download_to_path = (
-        f"{output_path}{output_file_name}" if str(output_path).endswith("/") else f"{output_path}/{output_file_name}"
-    )
+    download_to_path = Path(output_path) / output_file_name
     Path(output_path).mkdir(parents=True, exist_ok=True)
-    lock_path = f"{output_path}/{output_file_name}.lock"
-    with FileLock(lock_path):
+    lock_path = Path(output_path) / f"{output_file_name}.lock"
+    with FileLock(lock_path, timeout=300):
         if Path(download_to_path).exists() and not overwrite:
             logger.warning(f"File {download_to_path} already exists!")
-            return
+            return download_to_path
+        temp_file_name = Path(f"{download_to_path}.part")
+        retry_count = 0
+        while retry_count <= max_retries:
+            try:
+                head_response = requests.head(url, timeout=timeout)
+                head_response.raise_for_status()
+                content_length = int(head_response.headers.get("content-length", 0))
+                free_space = shutil.disk_usage(output_path).free
+                if content_length > free_space:
+                    raise OSError(
+                        f"Insufficient disk space. Need {content_length} bytes, but only {free_space} available."
+                    )
+                response = requests.get(url, stream=True)
+                response.raise_for_status()
+                total = int(response.headers.get("content-length", 0))
-        temp_file_name = f"{download_to_path}.part"
+                with Progress(refresh_per_second=5) as progress:
+                    task = progress.add_task("[red]Downloading...", total=total)
+                    with Path(temp_file_name).open("wb") as file:
+                        for data in response.iter_content(block_size):
+                            file.write(data)
+                            progress.update(task, advance=len(data))
+                        progress.update(task, completed=total, refresh=True)
-        response = requests.get(url, stream=True)
-        total = int(response.headers.get("content-length", 0))
+                Path(temp_file_name).replace(download_to_path)
-        with Progress(refresh_per_second=100) as progress:
-            task = progress.add_task("[red]Downloading...", total=total)
-            with Path(temp_file_name).open("wb") as file:
-                for data in response.iter_content(block_size):
-                    file.write(data)
-                    progress.update(task, advance=block_size)
-            progress.update(task, completed=total, refresh=True)
+                if is_zip:
+                    with ZipFile(download_to_path, "r") as zip_obj:
+                        zip_obj.extractall(path=output_path)
+                    return Path(output_path)
-        Path(temp_file_name).replace(download_to_path)
+                return download_to_path
+            except (OSError, RequestException) as e:
+                retry_count += 1
+                if retry_count <= max_retries:
+                    logger.warning(
+                        f"Download attempt {retry_count}/{max_retries} failed: {str(e)}. Retrying in {retry_delay} seconds..."
+                    )
+                    time.sleep(retry_delay)
+                else:
+                    logger.error(f"Download failed after {max_retries} attempts: {str(e)}")
+                    if Path(temp_file_name).exists():
+                        Path(temp_file_name).unlink(missing_ok=True)
+                    raise
-        if is_zip:
-            output_path = output_path or tempfile.gettempdir()
-            with ZipFile(download_to_path, "r") as zip_obj:
-                zip_obj.extractall(path=output_path)
-                zip_obj.namelist()
+            except Exception as e:
+                logger.error(f"Download failed: {str(e)}")
+                if Path(temp_file_name).exists():
+                    Path(temp_file_name).unlink(missing_ok=True)
+                raise
+            finally:
+                if Path(temp_file_name).exists():
+                    Path(temp_file_name).unlink(missing_ok=True)
-    Path(lock_path).unlink()
+        return Path(download_to_path)

pertpy/data/_datasets.py CHANGED Viewed

@@ -37,7 +37,7 @@ def papalexi_2021() -> MuData:  # pragma: no cover
     Returns:
         :class:`~mudata.MuData` object of the ECCITE-seq dataset
     """
-    import muon as mu
+    import mudata as md
     output_file_name = "papalexi_2021.h5mu"
     output_file_path = settings.datasetdir / output_file_name
@@ -48,9 +48,11 @@ def papalexi_2021() -> MuData:  # pragma: no cover
             output_path=settings.datasetdir,
             is_zip=False,
         )
-    mudata = mu.read(output_file_path)
+    mdata = md.read_h5mu(output_file_path)
+    mdata.pull_obs()
+    mdata.pull_var()
-    return mudata
+    return mdata
 def sc_sim_augur() -> AnnData:  # pragma: no cover
@@ -408,7 +410,7 @@ def kang_2018() -> AnnData:  # pragma: no cover
 def stephenson_2021_subsampled() -> AnnData:  # pragma: no cover
-    """Processed 10X 5' scRNA-seq data from PBMC of COVID-19 patients and healthy donors
+    """Processed 10X 5' scRNA-seq data from PBMC of COVID-19 patients and healthy donors.
     The study profiled peripheral blood mononuclear cells from 90 COVID-19 patients with different disease severity and 23 healthy control donors.
     Here the dataset was downsampled to approx. 500 cells per donor and cells were mapped to a reference atlas of healthy PBMCs from 12 studies
@@ -453,7 +455,7 @@ def haber_2017_regions() -> AnnData:  # pragma: no cover
     output_file_path = settings.datasetdir / output_file_name
     if not Path(output_file_path).exists():
         _download(
-            url="https://figshare.com/ndownloader/files/38169900",
+            url="https://figshare.com/ndownloader/files/54169301",
             output_file_name=output_file_name,
             output_path=settings.datasetdir,
             is_zip=False,
@@ -650,7 +652,7 @@ def datlinger_2021() -> AnnData:  # pragma: no cover
         Publication: https://doi.org/10.1038/s41592-021-01153-z \
         Obtained from scperturb: http://projects.sanderlab.org/scperturb/
-     Returns:
+    Returns:
          :class:`~anndata.AnnData` object of scPerturb prepared single-cell perturbation data
     """
     output_file_name = "datlinger_2021.h5ad"
@@ -1516,9 +1518,7 @@ def combosciplex() -> AnnData:  # pragma: no cover
 def sciplex_gxe1() -> AnnData:  # pragma: no cover
-    """sci-Plex-GxE combined chemical and genetic profiling of A172 dCas9-KRAB cells
-    genetically perturbed for HPRT1 or mismtach repair genes exposed to 6-thioguanine and temozolomide,
-    respectively, and A172 dCas9-SunTag cells genetically perturbed for HPRT1 exposed to 6-thioguanine.
+    """sci-Plex-GxE profiling of A172 dCas9-KRAB (HPRT1 or MMR knockout) with 6-TG/TMZ and A172 dCas9-SunTag (HPRT1 knockout) with 6-TG.
     References:
         McFaline-Figueroa JL et al., Trapnell C. Multiplex single-cell chemical genomics reveals

pertpy/metadata/__init__.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from pertpy.metadata._cell_line import CellLine
 from pertpy.metadata._compound import Compound
 from pertpy.metadata._drug import Drug
+from pertpy.metadata._look_up import LookUp
 from pertpy.metadata._moa import Moa
-__all__ = ["CellLine", "Compound", "Drug", "Moa"]
+__all__ = ["CellLine", "Compound", "Drug", "Moa", "LookUp"]

pertpy/metadata/_cell_line.py CHANGED Viewed

@@ -39,6 +39,7 @@ class CellLine(MetaData):
         self.proteomics = None
         self.drug_response_gdsc1 = None
         self.drug_response_gdsc2 = None
+        self.drug_response_prism = None
     def _download_cell_line(self, cell_line_source: Literal["DepMap", "Cancerrxgene"] = "DepMap") -> None:
         if cell_line_source == "DepMap":
@@ -54,6 +55,7 @@ class CellLine(MetaData):
                     is_zip=False,
                 )
             self.depmap = pd.read_csv(depmap_cell_line_path)
+            self.depmap = self.depmap.reset_index().rename(columns={"CellLineName": "cell_line_name"})
         else:
             # Download cell line metadata from The Genomics of Drug Sensitivity in Cancer Project
             # Source: https://www.cancerrxgene.org/celllines
@@ -157,7 +159,7 @@ class CellLine(MetaData):
     def _download_gdsc(self, gdsc_dataset: Literal[1, 2] = 1) -> None:
         if gdsc_dataset == 1:
             # Download GDSC drug response data
-            # Source: https://www.cancerrxgene.org/downloads/bulk_download (Drug Screening - IC50s)
+            # Source: https://www.cancerrxgene.org/downloads/bulk_download (Drug Screening - IC50s and AUC)
             # URL: https://cog.sanger.ac.uk/cancerrxgene/GDSC_release8.4/GDSC1_fitted_dose_response_24Jul22.xlsx
             drug_response_gdsc1_file_path = Path(settings.cachedir) / "gdsc1_info.csv"
             if not Path(drug_response_gdsc1_file_path).exists():
@@ -181,6 +183,23 @@ class CellLine(MetaData):
                 )
             self.drug_response_gdsc2 = pd.read_csv(drug_response_gdsc2_file_path, index_col=0)
+    def _download_prism(self) -> None:
+        # Download PRISM drug response data
+        # Source: DepMap PRISM Repurposing 19Q4 secondary screen dose response curve parameters
+        drug_response_prism_file_path = Path(settings.cachedir) / "prism_info.csv"
+        if not Path(drug_response_prism_file_path).exists():
+            _download(
+                url="https://figshare.com/ndownloader/files/20237739",
+                output_file_name="prism_info.csv",
+                output_path=settings.cachedir,
+                block_size=4096,
+                is_zip=False,
+            )
+        df = pd.read_csv(drug_response_prism_file_path, index_col=0)[["depmap_id", "name", "ic50", "ec50", "auc"]]
+        df = df.dropna(subset=["depmap_id", "name"])
+        df = df.groupby(["depmap_id", "name"]).mean().reset_index()
+        self.drug_response_prism = df
     def annotate(
         self,
         adata: AnnData,
@@ -197,13 +216,13 @@ class CellLine(MetaData):
         Args:
             adata: The data object to annotate.
-            query_id: The column of `.obs` with cell line information.
+            query_id: The column of ``.obs`` with cell line information.
             reference_id: The type of cell line identifier in the metadata, e.g. ModelID, CellLineName	or StrippedCellLineName.
                           If fetching cell line metadata from Cancerrxgene, it is recommended to choose "stripped_cell_line_name".
             fetch: The metadata to fetch.
             cell_line_source: The source of cell line metadata, DepMap or Cancerrxgene.
             verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all".
-            copy: Determines whether a copy of the `adata` is returned.
+            copy: Determines whether a copy of ``adata`` is returned.
         Returns:
             Returns an AnnData object with cell line annotation.
@@ -216,7 +235,7 @@ class CellLine(MetaData):
             >>> adata_annotated = pt_metadata.annotate(adata=adata,
             >>>                                        reference_id='cell_line_name',
             >>>                                        query_id='cell_line_name',
-            >>>                                        fetch=["cell_line_name", "age", "primary_disease"],
+            >>>                                        fetch=["cell_line_name", "Age", "OncotreePrimaryDisease"],
             >>>                                        copy=True)
         """
         if copy:
@@ -304,7 +323,7 @@ class CellLine(MetaData):
     def annotate_bulk_rna(
         self,
         adata: AnnData,
-        query_id: str = "cell_line_name",
+        query_id: str = None,
         cell_line_source: Literal["broad", "sanger"] = "sanger",
         verbosity: int | str = 5,
         gene_identifier: Literal["gene_name", "gene_ID", "both"] = "gene_ID",
@@ -316,9 +335,11 @@ class CellLine(MetaData):
         Args:
             adata: The data object to annotate.
-            query_id: The column of `.obs` with cell line information. Defaults to "cell_line_name" if `cell_line_source` is sanger, otherwise "DepMap_ID".
+            query_id: The column of `.obs` with cell line information.
+                Defaults to "cell_line_name" if `cell_line_source` is sanger, otherwise "DepMap_ID".
             cell_line_source: The bulk rna expression data from either broad or sanger cell line.
             verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all".
+            gene_identifier: The type of gene identifier saved in the fetched meta data, 'gene_name', 'gene_ID' or 'both'.
             copy: Determines whether a copy of the `adata` is returned.
         Returns:
@@ -339,7 +360,7 @@ class CellLine(MetaData):
         # Make sure that the specified `cell_line_type` can be found in the bulk rna expression data,
         # then we can compare these keys and fetch the corresponding metadata.
-        if query_id not in adata.obs.columns:
+        if query_id not in adata.obs.columns and query_id is not None:
             raise ValueError(
                 f"The specified `query_id` {query_id} can't be found in the `adata.obs`. \n"
                 "Ensure that you are using one of the available query IDs present in the adata.obs for the annotation."
@@ -347,25 +368,33 @@ class CellLine(MetaData):
                 "using the `annotate()` function before calling 'annotate_bulk_rna()'. "
                 "This ensures that the required query ID is included in your data, e.g. stripped_cell_line_name, DepMap ID."
             )
+        if query_id is None:
+            query_id = "cell_line_name" if cell_line_source == "sanger" else "DepMap_ID"
         identifier_num_all = len(adata.obs[query_id].unique())
         # Lazily download the bulk rna expression data
         if cell_line_source == "sanger":
+            if query_id not in adata.obs.columns:
+                raise ValueError(
+                    "To annotate bulk RNA data from Wellcome Sanger Institute, `cell_line_name` is used as default reference and query identifier if no `query_id` is given."
+                    "Ensure that you have column `cell_line_name` in `adata.obs` or specify column name in which cell line name is stored."
+                    "If cell line name isn't available in 'adata.obs', use `annotate()` to annotate the cell line first."
+                )
             if self.bulk_rna_sanger is None:
                 self._download_bulk_rna(cell_line_source="sanger")
             reference_id = "model_name"
             not_matched_identifiers = list(set(adata.obs[query_id]) - set(self.bulk_rna_sanger.index))
         else:
+            if query_id not in adata.obs.columns:
+                raise ValueError(
+                    "To annotate bulk RNA data from Broad Institue, `DepMap_ID` is used as default reference and query identifier if no `query_id` is given."
+                    "Ensure that you have column `DepMap_ID` in `adata.obs` or specify column name in which DepMap ID is stored."
+                    "If DepMap ID isn't available in 'adata.obs', use `annotate()` to annotate the cell line first."
+                )
             reference_id = "DepMap_ID"
-            logger.warning(
-                "To annotate bulk RNA data from Broad Institue, `DepMap_ID` is used as default reference and query identifier if no `reference_id` is given."
-                "If `DepMap_ID` isn't available in 'adata.obs', use `annotate()` to annotate the cell line first."
-            )
             if self.bulk_rna_broad is None:
                 self._download_bulk_rna(cell_line_source="broad")
-            if query_id == "cell_line_name":
-                query_id = "DepMap_ID"
             not_matched_identifiers = list(set(adata.obs[query_id]) - set(self.bulk_rna_broad.index))
         self._warn_unmatch(
@@ -474,7 +503,8 @@ class CellLine(MetaData):
         adata.obsm["proteomics_" + protein_information] = (
             self.proteomics[[reference_id, protein_id, protein_information]]
             .pivot(index=reference_id, columns=protein_id, values=protein_information)
-            .reindex(adata.obs.index)
+            .reindex(adata.obs[query_id])
+            .set_index(adata.obs.index)
         )
         return adata
@@ -491,7 +521,7 @@ class CellLine(MetaData):
     ) -> AnnData:
         """Fetch drug response data from GDSC.
-        For each cell, we fetch drug response data as natural log of the fitted IC50 for its
+        For each cell, we fetch drug response data as natural log of the fitted IC50 and AUC for its
         corresponding cell line and perturbation from GDSC fitted data results file.
         Args:
@@ -554,13 +584,86 @@ class CellLine(MetaData):
         adata.obs = (
             adata.obs.reset_index()
             .set_index([query_id, query_perturbation])
-            .assign(ln_ic50=gdsc_data.set_index([reference_id, reference_perturbation]).ln_ic50)
+            .assign(ln_ic50_gdsc=gdsc_data.set_index([reference_id, reference_perturbation]).ln_ic50)
+            .assign(auc_gdsc=gdsc_data.set_index([reference_id, reference_perturbation]).auc)
             .reset_index()
             .set_index(old_index_name)
         )
         return adata
+    def annotate_from_prism(
+        self,
+        adata: AnnData,
+        query_id: str = "DepMap_ID",
+        query_perturbation: str = "perturbation",
+        verbosity: int | str = 5,
+        copy: bool = False,
+    ) -> AnnData:
+        """Fetch drug response data from PRISM.
+        For each cell, we fetch drug response data as IC50, EC50 and AUC for its
+        corresponding cell line and perturbation from PRISM fitted data results file.
+        Note that all rows where either `depmap_id` or `name` is missing will be dropped.
+        Args:
+            adata: The data object to annotate.
+            query_id: The column of `.obs` with cell line information.
+            query_perturbation: The column of `.obs` with perturbation information.
+            verbosity: The number of unmatched identifiers to print, can be either non-negative values or 'all'.
+            copy: Determines whether a copy of the `adata` is returned.
+        Returns:
+            Returns an AnnData object with drug response annotation.
+        Examples:
+            >>> import pertpy as pt
+            >>> adata = pt.dt.mcfarland_2020()
+            >>> pt_metadata = pt.md.CellLine()
+            >>> pt_metadata.annotate_from_prism(adata, query_id="DepMap_ID")
+        """
+        if copy:
+            adata = adata.copy()
+        if query_id not in adata.obs.columns:
+            raise ValueError(
+                f"The specified `query_id` {query_id} can't be found in the `adata.obs`. \n"
+                "Ensure that you are using one of the available query IDs present in 'adata.obs' for the annotation.\n"
+                "If the desired query ID is not available, you can fetch the cell line metadata "
+                "using the `annotate()` function before calling `annotate_from_prism()`. "
+                "This ensures that the required query ID is included in your data."
+            )
+        if self.drug_response_prism is None:
+            self._download_prism()
+        prism_data = self.drug_response_prism
+        # PRISM starts most drug names with a lowercase letter, so we want to make it case-insensitive
+        prism_data["name_lower"] = prism_data["name"].str.lower()
+        adata.obs["perturbation_lower"] = adata.obs[query_perturbation].str.lower()
+        identifier_num_all = len(adata.obs[query_id].unique())
+        not_matched_identifiers = list(set(adata.obs[query_id]) - set(prism_data["depmap_id"]))
+        self._warn_unmatch(
+            total_identifiers=identifier_num_all,
+            unmatched_identifiers=not_matched_identifiers,
+            query_id=query_id,
+            reference_id="depmap_id",
+            metadata_type="drug response",
+            verbosity=verbosity,
+        )
+        old_index_name = "index" if adata.obs.index.name is None else adata.obs.index.name
+        adata.obs = (
+            adata.obs.reset_index()
+            .set_index([query_id, "perturbation_lower"])
+            .assign(ic50_prism=prism_data.set_index(["depmap_id", "name"]).ic50)
+            .assign(ec50_prism=prism_data.set_index(["depmap_id", "name"]).ec50)
+            .assign(auc_prism=prism_data.set_index(["depmap_id", "name"]).auc)
+            .reset_index()
+            .set_index(old_index_name)
+            .drop(columns="perturbation_lower")
+        )
+        return adata
     def lookup(self) -> LookUp:
         """Generate LookUp object for CellLineMetaData.
@@ -577,7 +680,7 @@ class CellLine(MetaData):
             >>> pt_metadata = pt.md.CellLine()
             >>> lookup = pt_metadata.lookup()
         """
-        # Fetch the metadata if it hasn't beed downloaded yet
+        # Fetch the metadata if it hasn't been downloaded yet
         if self.depmap is None:
             self._download_cell_line(cell_line_source="DepMap")
         if self.cancerxgene is None:
@@ -594,6 +697,8 @@ class CellLine(MetaData):
             self._download_gdsc(gdsc_dataset=1)
         if self.drug_response_gdsc2 is None:
             self._download_gdsc(gdsc_dataset=2)
+        if self.drug_response_prism is None:
+            self._download_prism()
         # Transfer the data
         return LookUp(
@@ -607,6 +712,7 @@ class CellLine(MetaData):
                 self.proteomics,
                 self.drug_response_gdsc1,
                 self.drug_response_gdsc2,
+                self.drug_response_prism,
             ],
         )
@@ -663,12 +769,14 @@ class CellLine(MetaData):
             raise ValueError(
                 "Dimensions of adata.X do not match those of metadata. Ensure that they have the same gene list."
             )
-        if isinstance(adata.obsm[metadata_key], pd.DataFrame):
-            # Give warning if the genes are not the same
-            if sum(adata.obsm[metadata_key].columns != adata.var.index.values) > 0:
-                logger.warning(
-                    "Column name of metadata is not the same as the index of adata.var. Ensure that the genes are in the same order."
-                )
+        # Raise error if the genes are not the same
+        if (
+            isinstance(adata.obsm[metadata_key], pd.DataFrame)
+            and sum(adata.obsm[metadata_key].columns != adata.var.index.values) > 0
+        ):
+            raise ValueError(
+                "Column name of metadata is not the same as the index of adata.var. Ensure that the genes are in the same order."
+            )
         # Divide cell lines into those are present and not present in the metadata
         overlapped_cl = adata[~adata.obsm[metadata_key].isna().all(axis=1), :]
@@ -693,7 +801,7 @@ class CellLine(MetaData):
         return corr, pvals, new_corr, new_pvals
     @_doc_params(common_plot_args=doc_common_plot_args)
-    def plot_correlation(
+    def plot_correlation(  # noqa: D417
         self,
         adata: AnnData,
         corr: pd.DataFrame,
@@ -703,7 +811,6 @@ class CellLine(MetaData):
         metadata_key: str = "bulk_rna_broad",
         category: str = "cell line",
         subset_identifier: str | int | Iterable[str] | Iterable[int] | None = None,
-        show: bool = True,
         return_fig: bool = False,
     ) -> Figure | None:
         """Visualise the correlation of cell lines with annotated metadata.
@@ -747,7 +854,7 @@ class CellLine(MetaData):
                 if all(isinstance(id, str) for id in subset_identifier_list):
                     if set(subset_identifier_list).issubset(adata.obs[identifier].unique()):
                         subset_identifier_list = np.where(
-                            np.in1d(adata.obs[identifier].values, subset_identifier_list)
+                            np.isin(adata.obs[identifier].values, subset_identifier_list)
                         )[0]
                     else:
                         raise ValueError("`Subset_identifier` must be found in adata.obs.`identifier`.")
@@ -798,10 +905,9 @@ class CellLine(MetaData):
                 },
             )
-            if show:
-                plt.show()
             if return_fig:
                 return plt.gcf()
+            plt.show()
             return None
         else:
-            raise NotImplementedError
+            raise NotImplementedError("Only 'cell line' category is supported for correlation comparison.")

pertpy/metadata/_look_up.py CHANGED Viewed

@@ -22,11 +22,12 @@ class LookUp:
         type: Literal["cell_line", "moa", "compound", "drug"] = "cell_line",
         transfer_metadata: Sequence[pd.DataFrame] | None = None,
     ):
-        """
+        """Lookup object for different type of metadata.
         Args:
             type: Metadata type for annotation. One of 'cell_line', 'compound', 'moa' or 'drug.
             transfer_metadata: DataFrames used to generate Lookup object.
-                               This is currently set to None for CompoundMetaData which does not require any dataframes for transfer.
+                           This is currently set to None for CompoundMetaData which does not require any dataframes for transfer.
         """
         self.type = type
         if type == "cell_line":
@@ -329,10 +330,7 @@ class LookUp:
         if self.type != "cell_line":
             raise ValueError("This is not a LookUp object specific for CellLineMetaData!")
-        if cell_line_source == "broad":
-            bulk_rna = self.bulk_rna_broad
-        else:
-            bulk_rna = self.bulk_rna_sanger
+        bulk_rna = self.bulk_rna_broad if cell_line_source == "broad" else self.bulk_rna_sanger
         if query_id_list is not None:
             identifier_num_all = len(query_id_list)
@@ -391,10 +389,7 @@ class LookUp:
         """
         if self.type != "cell_line":
             raise ValueError("This is not a LookUp object specific for CellLineMetaData!")
-        if gdsc_dataset == 1:
-            gdsc_data = self.drug_response_gdsc1
-        else:
-            gdsc_data = self.drug_response_gdsc2
+        gdsc_data = self.drug_response_gdsc1 if gdsc_dataset == 1 else self.drug_response_gdsc2
         if query_id_list is not None:
             if reference_id not in gdsc_data.columns:
@@ -421,7 +416,7 @@ class LookUp:
         reference_id: Literal["gene_id", "ensembl_gene_id", "hgnc_id", "hgnc_symbol"] = "ensembl_gene_id",
         query_id_list: Sequence[str] | None = None,
     ) -> None:
-        """A brief summary of gene annotation metadata
+        """A brief summary of gene annotation metadata.
         Args:
             reference_id: The type of gene identifier in the meta data, gene_id, ensembl_gene_id, hgnc_id, hgnc_symbol.
@@ -555,15 +550,14 @@ class LookUp:
                     raise ValueError(
                         "Gene-disease association is not available in dgidb dataset, please try with pharmgkb."
                     )
+            elif query_id_type == "target":
+                not_matched_identifiers = list(set(query_id_list) - set(self.pharmgkb["Gene"]))
+            elif query_id_type == "compound":
+                compounds = self.pharmgkb[self.pharmgkb["Type"] == "Chemical"]
+                not_matched_identifiers = list(set(query_id_list) - set(compounds["Compound|Disease"]))
             else:
-                if query_id_type == "target":
-                    not_matched_identifiers = list(set(query_id_list) - set(self.pharmgkb["Gene"]))
-                elif query_id_type == "compound":
-                    compounds = self.pharmgkb[self.pharmgkb["Type"] == "Chemical"]
-                    not_matched_identifiers = list(set(query_id_list) - set(compounds["Compound|Disease"]))
-                else:
-                    diseases = self.pharmgkb[self.pharmgkb["Type"] == "Disease"]
-                    not_matched_identifiers = list(set(query_id_list) - set(diseases["Compound|Disease"]))
+                diseases = self.pharmgkb[self.pharmgkb["Type"] == "Disease"]
+                not_matched_identifiers = list(set(query_id_list) - set(diseases["Compound|Disease"]))
             logger.info(f"{len(not_matched_identifiers)} {query_id_type}s are not found in the metadata.")
             logger.info(f"{identifier_num_all - len(not_matched_identifiers)} {query_id_type}s are found! ")

pertpy/metadata/_moa.py CHANGED Viewed

@@ -61,7 +61,7 @@ class Moa(MetaData):
             adata = adata.copy()
         if query_id not in adata.obs.columns:
-            raise ValueError(f"The requested query_id {query_id} is not in `adata.obs`.\n" "Please check again.")
+            raise ValueError(f"The requested query_id {query_id} is not in `adata.obs`.\nPlease check again.")
         if self.clue is None:
             self._download_clue()

pertpy 0.9.5__py3-none-any.whl → 0.11.0__py3-none-any.whl

pertpy 0.9.5__py3-none-any.whl → 0.11.0__py3-none-any.whl