microarray 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. microarray/__init__.py +15 -0
  2. microarray/_version.py +3 -0
  3. microarray/datasets/__init__.py +3 -0
  4. microarray/datasets/_arrayexpress.py +1 -0
  5. microarray/datasets/_cdf_files.py +35 -0
  6. microarray/datasets/_geo.py +1 -0
  7. microarray/datasets/_utils.py +143 -0
  8. microarray/io/__init__.py +17 -0
  9. microarray/io/_anndata_converter.py +198 -0
  10. microarray/io/_cdf.py +575 -0
  11. microarray/io/_cel.py +591 -0
  12. microarray/io/_read.py +127 -0
  13. microarray/plotting/__init__.py +28 -0
  14. microarray/plotting/_base.py +253 -0
  15. microarray/plotting/_cel.py +75 -0
  16. microarray/plotting/_de_plots.py +239 -0
  17. microarray/plotting/_diagnostic_plots.py +268 -0
  18. microarray/plotting/_heatmap.py +279 -0
  19. microarray/plotting/_ma_plots.py +136 -0
  20. microarray/plotting/_pca.py +320 -0
  21. microarray/plotting/_qc_plots.py +335 -0
  22. microarray/plotting/_score.py +38 -0
  23. microarray/plotting/_top_table_heatmap.py +98 -0
  24. microarray/plotting/_utils.py +280 -0
  25. microarray/preprocessing/__init__.py +39 -0
  26. microarray/preprocessing/_background.py +862 -0
  27. microarray/preprocessing/_log2.py +77 -0
  28. microarray/preprocessing/_normalize.py +1292 -0
  29. microarray/preprocessing/_rma.py +243 -0
  30. microarray/preprocessing/_robust.py +170 -0
  31. microarray/preprocessing/_summarize.py +318 -0
  32. microarray/py.typed +0 -0
  33. microarray/tools/__init__.py +26 -0
  34. microarray/tools/_biomart.py +416 -0
  35. microarray/tools/_empirical_bayes.py +401 -0
  36. microarray/tools/_fdist.py +171 -0
  37. microarray/tools/_linear_models.py +387 -0
  38. microarray/tools/_mds.py +101 -0
  39. microarray/tools/_pca.py +88 -0
  40. microarray/tools/_score.py +86 -0
  41. microarray/tools/_toptable.py +360 -0
  42. microarray-0.1.0.dist-info/METADATA +75 -0
  43. microarray-0.1.0.dist-info/RECORD +44 -0
  44. microarray-0.1.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,416 @@
1
+ from collections.abc import Generator
2
+ from dataclasses import dataclass
3
+ from io import StringIO
4
+ from pathlib import Path
5
+ from xml.etree import ElementTree
6
+
7
+ import pandas as pd
8
+ import requests
9
+ from anndata import AnnData
10
+
11
+
12
@dataclass
class Attribute:
    """Biomart dataset attribute.

    Attributes:
        name (str): Attribute name (Biomart ``internalName``).
        display_name (str): Human-readable attribute name.
        description (str): Attribute description.
        default (bool): Whether the attribute is a default attribute
            (i.e. returned by queries when no attributes are requested).
    """

    # Field order matches the original __init__ signature, so positional
    # construction (Attribute("name", "disp", "desc", True)) is unchanged.
    name: str
    display_name: str = ""
    description: str = ""
    default: bool = False
35
+
36
@dataclass
class Filter:
    """Biomart dataset filter.

    Attributes:
        name (str): Filter name (Biomart ``internalName``).
        type (str): Filter type as reported by Biomart (e.g. ``"boolean"``).
            Named ``type`` (shadowing the builtin) to keep the public
            keyword-argument interface unchanged.
        description (str): Filter description.
    """

    # The original class carried @dataclass but defined __init__ by hand,
    # leaving the decorator inert (no annotated fields). Declaring real
    # fields keeps the exact same constructor signature while gaining the
    # generated __repr__ and __eq__.
    name: str
    type: str
    description: str = ""
52
+
53
class BiomartDataset:
    """Client for a single Biomart dataset served through the martservice API."""

    def __init__(
        self,
        name: str,
        display_name: str = "",
        schema: str = "default",
        host: str = "http://www.ensembl.org",
        path: str = "/biomart/martservice",
        port: int = 80,
        use_cache: bool = False,
    ):
        """BiomartDataset constructor.

        Args:
            name (str): Dataset name.
            display_name (str): Dataset display name.
            schema (str): Dataset schema.
            host (str): Biomart host.
            path (str): Biomart path.
            port (int): Biomart port.
            use_cache (bool): Whether to use caching.

        Raises:
            NotImplementedError: If ``use_cache`` is True (not implemented yet).
        """
        # Add http prefix and remove trailing slash.
        if not host.startswith("http://") and not host.startswith("https://"):
            host = "http://" + host
        if host.endswith("/"):
            host = host[:-1]

        # Ensure path starts with slash.
        if not path.startswith("/"):
            path = "/" + path

        self.host: str = host
        self.path: str = path
        self.port: int = port
        self.use_cache: bool = use_cache

        self.name = name
        self.display_name = display_name
        self.schema = schema

        # Lazily-fetched configuration caches; populated on first access to
        # the filters/attributes properties.
        self._filters: dict[str, Filter] | None = None
        self._attributes: dict[str, Attribute] | None = None
        self._default_attributes: dict[str, Attribute] | None = None

        if use_cache:
            raise NotImplementedError("Caching is not implemented yet.")

    @property
    def url(self) -> str:
        """Url used to connect to the biomart service.

        Returns:
            str: Url used to connect to the biomart service.
        """
        return f"{self.host}:{self.port}{self.path}"

    def get(self, **params) -> requests.models.Response:
        """Performs get request to the biomart service.

        Args:
            **params: Parameters to be sent in the get request.

        Returns:
            requests.models.Response: Response from the biomart service.

        Raises:
            requests.HTTPError: If the service returns an error status code.
        """
        # NOTE(review): no timeout is set, so a stalled server blocks forever;
        # adding one would change the public behavior, so it is left as-is.
        response = requests.get(self.url, params=params)
        response.raise_for_status()
        return response

    @property
    def filters(self) -> dict[str, Filter]:
        """List of filters available for the dataset."""
        if self._filters is None:
            self._filters, self._attributes = self._fetch_configuration()
        return self._filters

    @property
    def attributes(self) -> dict[str, Attribute]:
        """List of attributes available for the dataset (cached)."""
        if self._attributes is None:
            self._filters, self._attributes = self._fetch_configuration()
        return self._attributes

    @property
    def default_attributes(self) -> dict[str, Attribute]:
        """List of default attributes for the dataset."""
        if self._default_attributes is None:
            self._default_attributes = {name: attr for name, attr in self.attributes.items() if attr.default is True}
        return self._default_attributes

    def list_attributes(self) -> pd.DataFrame:
        """Lists available attributes in a readable DataFrame format.

        Returns:
            pd.DataFrame: Frame listing available attributes.
        """

        def _row_gen(attributes: dict[str, Attribute]):
            for attr in attributes.values():
                yield (attr.name, attr.display_name, attr.description)

        return pd.DataFrame.from_records(
            _row_gen(self.attributes),
            columns=["name", "display_name", "description"],
        )

    def list_filters(self) -> pd.DataFrame:
        """Lists available filters in a readable DataFrame format.

        Returns:
            pd.DataFrame: Frame listing available filters.
        """

        def _row_gen(filters: dict[str, Filter]):
            for filt in filters.values():
                yield (filt.name, filt.type, filt.description)

        return pd.DataFrame.from_records(
            _row_gen(self.filters),
            columns=["name", "type", "description"],
        )

    def _fetch_configuration(self) -> tuple[dict[str, Filter], dict[str, Attribute]]:
        """Fetches and parses the dataset's filter/attribute configuration."""
        # Get datasets using biomart.
        response = self.get(type="configuration", dataset=self.name)

        # Check response for problems; the service reports configuration
        # errors in the body with a 200 status, so the text must be inspected.
        if "Problem retrieving configuration" in response.text:
            raise RuntimeError("Failed to retrieve dataset configuration, check the dataset name and schema.")

        # Get filters and attributes from xml.
        xml = ElementTree.fromstring(response.content)

        filters = {f.name: f for f in self._filters_from_xml(xml)}
        attributes = {a.name: a for a in self._attributes_from_xml(xml)}

        return filters, attributes

    @staticmethod
    def _filters_from_xml(xml: ElementTree.Element) -> Generator[Filter, None, None]:
        """Yields Filter objects from a configuration XML tree."""
        for node in xml.iter("FilterDescription"):
            attrib = node.attrib
            yield Filter(name=attrib["internalName"], type=attrib.get("type", ""))

    @staticmethod
    def _attributes_from_xml(xml: ElementTree.Element) -> Generator[Attribute, None, None]:
        """Yields Attribute objects from a configuration XML tree."""
        for page_index, page in enumerate(xml.iter("AttributePage")):
            for desc in page.iter("AttributeDescription"):
                attrib = desc.attrib

                # Default attributes can only be from the first page.
                default = page_index == 0 and attrib.get("default", "") == "true"

                yield Attribute(
                    name=attrib["internalName"],
                    display_name=attrib.get("displayName", ""),
                    description=attrib.get("description", ""),
                    default=default,
                )

    def query(
        self,
        attributes: list[str] | None = None,
        filters: dict[str, str | bool | list | tuple] | None = None,
        only_unique: bool = True,
        use_attr_names: bool = False,
        dtypes: dict[str, type] | None = None,
    ) -> pd.DataFrame:
        """Queries the dataset to retrieve the contained data.

        Args:
            attributes (list[str] | None): List of attribute names to retrieve, if None default attributes are used.
            filters (dict[str, str | bool | list | tuple] | None): Dictionary of filter name to filter value, if None no filters are applied.
            only_unique (bool): Whether to only return unique rows.
            use_attr_names (bool): Whether to use attribute names instead of display names as column names in the result.
            dtypes (dict[str, type] | None): Optional dictionary mapping attribute names to data types for the resulting DataFrame.

        Returns:
            pandas.DataFrame: DataFrame containing the query results.

        Raises:
            KeyError: If an unknown attribute or filter name is requested.
            RuntimeError: If the service reports a query error.
            ValueError: If ``dtypes`` contains a type pandas does not understand.
        """
        # Setup query element.
        root = ElementTree.Element("Query")
        root.set("virtualSchemaName", self.schema)
        root.set("formatter", "TSV")
        root.set("header", "1")
        root.set("uniqueRows", str(int(only_unique)))
        root.set("datasetConfigVersion", "0.6")

        # Add dataset element.
        dataset = ElementTree.SubElement(root, "Dataset")
        dataset.set("name", self.name)
        dataset.set("interface", "default")

        # Default to default attributes if none requested.
        if attributes is None:
            attributes = list(self.default_attributes.keys())

        # Add attribute elements.
        for name in attributes:
            try:
                attr = self.attributes[name]
                self._add_attr_node(dataset, attr)
            except KeyError as err:
                raise KeyError(
                    f"Unknown attribute {name}, check dataset attributes for a list of valid attributes."
                ) from err

        if filters is not None:
            # Add filter elements.
            for name, value in filters.items():
                try:
                    filter_ = self.filters[name]
                    self._add_filter_node(dataset, filter_, value)
                except KeyError as err:
                    raise KeyError(
                        f"Unknown filter {name}, check dataset filters for a list of valid filters."
                    ) from err

        # Fetch response.
        response = self.get(query=ElementTree.tostring(root))

        # Raise exception if an error occurred; the service reports query
        # errors in the body with a 200 status.
        if "Query ERROR" in response.text:
            raise RuntimeError(response.text)

        # Parse results into a DataFrame.
        try:
            result = pd.read_csv(StringIO(response.text), sep="\t", dtype=dtypes)
        # Type error is raised if a data type is not understood by pandas
        except TypeError as err:
            raise ValueError("Non valid data type is used in dtypes") from err

        if use_attr_names:
            # Rename columns with attribute names instead of display names.
            column_map = {self.attributes[attr].display_name: attr for attr in attributes}
            result.rename(columns=column_map, inplace=True)

        return result

    @staticmethod
    def _add_attr_node(
        root: ElementTree.Element,
        attr: Attribute,
    ) -> None:
        """Adds attribute xml node to root."""
        attr_el = ElementTree.SubElement(root, "Attribute")
        attr_el.set("name", attr.name)

    @staticmethod
    def _add_filter_node(
        root: ElementTree.Element,
        filter_: Filter,
        value: str | bool | list | tuple,
    ) -> None:
        """Adds filter xml node to root.

        Raises:
            ValueError: If a boolean filter receives a value that is neither a
                bool nor one of the strings "included", "only", "excluded".
        """
        filter_el = ElementTree.SubElement(root, "Filter")
        filter_el.set("name", filter_.name)

        # Set filter value depending on type.
        if filter_.type == "boolean":
            # Boolean case. Bug fix: the original evaluated value.lower()
            # before the `value is False` check, so passing False raised
            # AttributeError (bool has no .lower()) instead of reaching the
            # intended "excluded" branch. Guard string methods with
            # isinstance so bools take their dedicated branches.
            if value is True or (isinstance(value, str) and value.lower() in {"included", "only"}):
                filter_el.set("excluded", "0")
            elif value is False or (isinstance(value, str) and value.lower() == "excluded"):
                filter_el.set("excluded", "1")
            else:
                raise ValueError(f"Invalid value for boolean filter ({value})")
        elif isinstance(value, (list, tuple)):
            # List case.
            filter_el.set("value", ",".join(map(str, value)))
        else:
            # Default case.
            filter_el.set("value", str(value))
333
+
334
def annotate(
    adata: AnnData,
    entrez_key: str = "gene_id",
    symbol_key: str = "gene_symbol",
    dataset_name: str = "hsapiens_gene_ensembl",
    cache_dir: str = ".cache/biomart",
    remove_unannotated: bool = False,
    copy: bool = False,
    use_cache: bool = True,
    set_index: bool = True,
) -> AnnData | None:
    """Annotate Entrez gene IDs with symbols using Biomart.

    Args:
        adata (AnnData): AnnData object with feature metadata in ``.var``.
        entrez_key (str): Column in ``adata.var`` containing Entrez gene IDs.
        symbol_key (str): Output column in ``adata.var`` for gene symbols.
        dataset_name (str): Biomart dataset name.
        cache_dir (str): Directory where annotation tables are cached as CSV.
        remove_unannotated (bool): If True, drop features without symbol annotation.
        copy (bool): If True, return a new AnnData object.
        use_cache (bool): Whether to use cached annotation tables if available.
        set_index (bool): Whether to set the resulting symbol column as the index of .var.

    Returns:
        AnnData | None: Annotated AnnData if ``copy=True``, otherwise ``None``.

    Raises:
        ValueError: If ``remove_unannotated=True`` with ``copy=False``.
        KeyError: If ``entrez_key`` is missing from ``adata.var``.
    """
    # Removing features requires subsetting, which cannot be done in-place.
    if remove_unannotated is True and copy is False:
        raise ValueError(
            "Cannot remove unannotated features when copy=False, as this would modify the input AnnData in-place."
        )

    if entrez_key not in adata.var.columns:
        raise KeyError(f"AnnData .var has no '{entrez_key}' column")

    # With copy=False, all further mutations apply directly to the caller's object.
    adata = adata.copy() if copy else adata

    # Nullable "string" dtype keeps missing IDs as pd.NA rather than "nan".
    entrez = adata.var[entrez_key].astype("string")
    valid_mask = entrez.notna() & (entrez.str.strip() != "")

    # Short-circuit: no usable IDs at all — emit an all-NA symbol column and
    # skip the Biomart query entirely.
    if not valid_mask.any():
        adata.var[symbol_key] = pd.Series(pd.NA, index=adata.var.index, dtype="string")
        if remove_unannotated:
            # All symbols are NA here, so this drops every feature.
            adata = adata[:, adata.var[symbol_key].notna()].copy()
        return adata if copy else None

    # Biomart attribute names for the two columns we fetch.
    entrez_col, symbol_col = "entrezgene_id", "hgnc_symbol"
    fetch_entries = [entrez_col, symbol_col]

    cache_path = Path(cache_dir) / f"{dataset_name}_entrez_symbol.csv"
    if cache_path.exists() and use_cache:
        # Read IDs as float so blank cells parse as NaN; they are converted to
        # nullable Int64 below. NOTE(review): assumes Entrez IDs fit exactly in
        # a float64 (true for current ID ranges < 2**53).
        annotations = pd.read_csv(
            cache_path,
            dtype={
                "entrezgene_id": float,
                "hgnc_symbol": str,
            },
        )
    else:
        # Cache miss (or caching disabled): query Biomart and refresh the cache.
        biomart = BiomartDataset(name=dataset_name, port=80)
        annotations = biomart.query(attributes=fetch_entries)
        # query() returns display-name columns; rename to attribute names so
        # both branches yield the same schema.
        annotations.columns = fetch_entries
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        annotations.to_csv(cache_path, index=False)

    # Clean the mapping table: drop missing IDs/symbols, normalize dtypes.
    annotations = annotations[[entrez_col, symbol_col]].copy()
    annotations = annotations[annotations[entrez_col].notna() & (annotations[entrez_col] != "")]
    # float -> Int64 -> string yields canonical integer strings ("1234", not
    # "1234.0"), matching how plain integer IDs in .var stringify.
    # NOTE(review): assumes adata.var IDs are plain integer strings — a float
    # formatted source column (e.g. "1234.0") would not match; verify callers.
    annotations[entrez_col] = annotations[entrez_col].astype("Int64").astype("string")
    annotations[symbol_col] = annotations[symbol_col].astype("string").str.strip()
    annotations = annotations[annotations[symbol_col].notna() & (annotations[symbol_col] != "")]
    # One symbol per Entrez ID; first occurrence wins.
    annotations = annotations.drop_duplicates(subset=[entrez_col], keep="first")

    # Map IDs to symbols; unmapped IDs become pd.NA in the new column.
    symbol_map = dict(zip(annotations[entrez_col], annotations[symbol_col], strict=False))
    adata.var[symbol_key] = entrez.map(symbol_map).astype("string")

    if remove_unannotated is True:
        # Keep only features that received a symbol (copy=True guaranteed above).
        adata = adata[:, adata.var[symbol_key].notnull()].copy()

    if set_index is True:
        # NOTE(review): with remove_unannotated=False the index may contain NA
        # and duplicate symbols; downstream .loc lookups should account for that.
        adata.var.set_index(symbol_key, inplace=True)
        adata.var.index.name = None

    return adata if copy else None