PyPI - microarray - Versions diffs - 0.1.0__py3-none-any.whl - Mend

microarray 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

microarray/__init__.py +15 -0
microarray/_version.py +3 -0
microarray/datasets/__init__.py +3 -0
microarray/datasets/_arrayexpress.py +1 -0
microarray/datasets/_cdf_files.py +35 -0
microarray/datasets/_geo.py +1 -0
microarray/datasets/_utils.py +143 -0
microarray/io/__init__.py +17 -0
microarray/io/_anndata_converter.py +198 -0
microarray/io/_cdf.py +575 -0
microarray/io/_cel.py +591 -0
microarray/io/_read.py +127 -0
microarray/plotting/__init__.py +28 -0
microarray/plotting/_base.py +253 -0
microarray/plotting/_cel.py +75 -0
microarray/plotting/_de_plots.py +239 -0
microarray/plotting/_diagnostic_plots.py +268 -0
microarray/plotting/_heatmap.py +279 -0
microarray/plotting/_ma_plots.py +136 -0
microarray/plotting/_pca.py +320 -0
microarray/plotting/_qc_plots.py +335 -0
microarray/plotting/_score.py +38 -0
microarray/plotting/_top_table_heatmap.py +98 -0
microarray/plotting/_utils.py +280 -0
microarray/preprocessing/__init__.py +39 -0
microarray/preprocessing/_background.py +862 -0
microarray/preprocessing/_log2.py +77 -0
microarray/preprocessing/_normalize.py +1292 -0
microarray/preprocessing/_rma.py +243 -0
microarray/preprocessing/_robust.py +170 -0
microarray/preprocessing/_summarize.py +318 -0
microarray/py.typed +0 -0
microarray/tools/__init__.py +26 -0
microarray/tools/_biomart.py +416 -0
microarray/tools/_empirical_bayes.py +401 -0
microarray/tools/_fdist.py +171 -0
microarray/tools/_linear_models.py +387 -0
microarray/tools/_mds.py +101 -0
microarray/tools/_pca.py +88 -0
microarray/tools/_score.py +86 -0
microarray/tools/_toptable.py +360 -0
microarray-0.1.0.dist-info/METADATA +75 -0
microarray-0.1.0.dist-info/RECORD +44 -0
microarray-0.1.0.dist-info/WHEEL +4 -0

microarray/io/_cdf.py ADDED Viewed

@@ -0,0 +1,575 @@
+import gzip
+import re
+import warnings
+from dataclasses import dataclass, field
+import numpy as np
+import pandas as pd
+# Watson-Crick complement of each standard DNA base. CdfCell.is_pm looks up the
+# upper-cased context base here to decide whether a probe is PM or MM; bases
+# that are not keys of this mapping take the fallback path in that property.
+_BASE_COMPLEMENTS: dict[str, str] = {"A": "T", "T": "A", "G": "C", "C": "G"}
+@dataclass
+class CdfCell:
+    """A single cell (probe) located at position (x, y) on the microarray.
+    Only ``x``, ``y`` and ``probe`` are required; every other field defaults to
+    ``None`` and is filled in only when the corresponding column was supplied.
+    """
+    x: int
+    y: int
+    probe: str
+    plen: int | None = None
+    atom: int | None = None
+    index: int | None = None
+    match: int | None = None
+    bg: int | None = None
+    # Additional fields for regular units
+    feat: str | None = None
+    qual: str | None = None
+    expos: int | None = None
+    pos: int | None = None
+    cbase: str | None = None
+    pbase: str | None = None
+    tbase: str | None = None
+    codonind: int | None = None
+    codon: int | None = None
+    regiontype: int | None = None
+    region: str | None = None
+    @property
+    def is_pm(self) -> bool | None:
+        """Tell whether this cell is a Perfect Match (True) or Mismatch (False) probe.
+        When both ``pbase`` and ``cbase`` are set, the cell is PM if ``pbase`` is
+        the Watson-Crick complement of ``cbase`` (A↔T, G↔C), compared
+        case-insensitively.  If ``cbase`` is not one of A/T/G/C, the cell is PM
+        when the two bases differ.
+        Otherwise the ``match`` field decides (1 = PM, any other value = MM).
+        Returns ``None`` when neither source of information is available.
+        """
+        probe_base, context_base = self.pbase, self.cbase
+        if probe_base is None or context_base is None:
+            # No base pair to compare: rely on the explicit match flag, if any.
+            return None if self.match is None else self.match == 1
+        probe_base = probe_base.upper()
+        context_base = context_base.upper()
+        expected = _BASE_COMPLEMENTS.get(context_base)
+        if expected is None:
+            # Non-standard context base: fall back to a plain inequality check.
+            return probe_base != context_base
+        return probe_base == expected
+@dataclass
+class CdfBlock:
+    """A block within a unit (probeset), together with the cells it contains."""
+    name: str
+    block_number: int
+    num_atoms: int
+    num_cells: int
+    start_position: int
+    stop_position: int
+    cells: list[CdfCell] = field(default_factory=list)
+    def _cells_where_pm_is(self, flag: bool) -> list[CdfCell]:
+        """Return the cells whose ``is_pm`` is exactly ``flag`` (``None`` never matches)."""
+        return [cell for cell in self.cells if cell.is_pm is flag]
+    @property
+    def pm_cells(self) -> list[CdfCell]:
+        """Perfect Match (PM) probe cells of this block, in stored order."""
+        return self._cells_where_pm_is(True)
+    @property
+    def mm_cells(self) -> list[CdfCell]:
+        """Mismatch (MM) probe cells of this block, in stored order."""
+        return self._cells_where_pm_is(False)
+@dataclass
+class CdfUnit:
+    """Represents a unit (probeset) in the CDF file.
+    Instances are created by ``CdfFile._parse_unit`` from a ``[UnitN]`` section;
+    the unit's blocks are appended to ``blocks`` as they are parsed.
+    """
+    # Value of the "Name" key ("NONE" when the key is absent)
+    name: str
+    # N taken from the "[UnitN]" section header
+    unit_number: int
+    # Value of the "Direction" key; None when the key is absent
+    direction: int | None = None
+    # Value of the "NumAtoms" key (0 when absent)
+    num_atoms: int = 0
+    # Value of the "NumCells" key (0 when absent)
+    num_cells: int = 0
+    # Value of the "UnitType" key; None when the key is absent
+    unit_type: int | None = None
+    # Value of the "NumberBlocks" key (0 when absent); not checked against len(blocks)
+    number_blocks: int = 0
+    # Blocks parsed from the "[UnitN_Block...]" sections that follow the unit
+    blocks: list[CdfBlock] = field(default_factory=list)
+@dataclass
+class CdfQCUnit:
+    """Represents a quality control unit.
+    Instances are created by ``CdfFile._parse_qc_unit`` from a ``[QCn]`` section.
+    """
+    # n taken from the "[QCn]" section header
+    qc_number: int
+    # Value of the "Type" key (0 when absent)
+    unit_type: int
+    # Value of the "NumberCells" key (0 when absent); not checked against len(cells)
+    number_cells: int
+    # Cells parsed from the section's "CellN=" entries
+    cells: list[CdfCell] = field(default_factory=list)
+@dataclass
+class CdfChipInfo:
+    """Chip metadata from the CDF file.
+    Filled by ``CdfFile._parse_chip_section`` from the ``[Chip]`` section; keys
+    missing from the section fall back to ``""`` (text) or ``0`` (numbers).
+    """
+    # "Name" key
+    name: str
+    # "Rows" key; used as the first dimension of the annotated array
+    rows: int
+    # "Cols" key; used as the second dimension of the annotated array
+    cols: int
+    # "NumberOfUnits" key
+    number_of_units: int
+    # "MaxUnit" key
+    max_unit: int
+    # "NumQCUnits" key
+    num_qc_units: int
+    # "ChipReference" key
+    chip_reference: str = ""
+class CdfFile:
+    """Parser and container for CDF (Chip Definition File) data."""
+    def __init__(self):
+        """Create an empty container; ``read`` fills it by parsing a file."""
+        # "Version" value from the [CDF] section
+        self.version: str = ""
+        # Set from the [Chip] section; stays None until one has been parsed
+        self.chip_info: CdfChipInfo | None = None
+        # One entry per [QCn] section, in file order
+        self.qc_units: list[CdfQCUnit] = []
+        # One entry per [UnitN] section, in file order
+        self.units: list[CdfUnit] = []
+        # Block (probeset) name -> owning unit; blocks named "NONE" are not indexed
+        self.units_by_name: dict[str, CdfUnit] = {}
+        # Table with probe_id / gene_id / suffix columns, built by _create_probeset_info
+        self.probeset_info: pd.DataFrame | None = None
+        # Non-empty suffixes actually found among the probeset names
+        self.recognized_suffixes: set[str] = set()
+    @classmethod
+    def read(cls, filepath: str) -> "CdfFile":
+        """Read and parse a CDF file (plain text or gzipped).
+        Parameters
+        ----------
+        filepath : str
+            Path to the CDF file (.cdf or .cdf.gz).  ``os.PathLike`` objects such
+            as ``pathlib.Path`` are accepted as well.  A name ending in ``.gz``
+            (compared case-insensitively) is opened through ``gzip``.
+        Returns:
+        -------
+        CdfFile
+            Parsed CDF file object
+        """
+        import os  # local import so this method does not depend on a file-level one
+        # Normalise first: calling ``endswith`` on a Path object raises AttributeError.
+        path = os.fspath(filepath)
+        opener = gzip.open if path.lower().endswith(".gz") else open
+        # Text mode for both openers, so line endings are normalised the same way.
+        with opener(path, "rt") as f:
+            content = f.read()
+        cdf = cls()
+        cdf._parse(content)
+        cdf._create_probeset_info()
+        return cdf
+    def _parse(self, content: str):
+        """Walk the file line by line and hand each section to its parser.
+        Section parsers return the index of the first line they did not consume,
+        which becomes the next line examined here.  Lines that do not start a
+        recognised section are skipped.
+        """
+        lines = content.split("\n")
+        total = len(lines)
+        pos = 0
+        while pos < total:
+            header = lines[pos].strip()
+            if header == "[CDF]":
+                pos = self._parse_cdf_section(lines, pos + 1)
+                continue
+            if header == "[Chip]":
+                pos = self._parse_chip_section(lines, pos + 1)
+                continue
+            if header.startswith("[QC"):
+                # QC parser reads its own header line to extract the number.
+                pos = self._parse_qc_unit(lines, pos)
+                continue
+            if header.startswith("[Unit") and "_Block" not in header:
+                # Unit parser also consumes the unit's block sections.
+                pos = self._parse_unit(lines, pos)
+                continue
+            pos += 1
+    def _create_probeset_info(self):
+        """Create a DataFrame with probeset information including parsed suffixes.
+        Extracts gene annotation (prefix) and probe type suffix from probe set names
+        and stores the result in ``self.probeset_info`` (columns ``probe_id``,
+        ``gene_id``, ``suffix``; indexed by ``probe_id``).  The suffixes actually
+        found are stored in ``self.recognized_suffixes``.
+        Common Affymetrix suffixes include:
+        - '_at': standard probe set
+        - '_s_at': consensus/similar sequences
+        - '_x_at': cross-hybridizing sequences
+        - '_a_at': alternative/ambiguous sequences
+        - '_g_at': gene-level probe set
+        - '_i_at': intronic probe set
+        - '_st': sense target (newer arrays)
+        When no probesets were parsed the table is empty but still has all three
+        columns.
+        Warnings:
+        --------
+        Issues warnings if probes with unrecognized suffixes are detected.
+        """
+        # One anchored alternation; the leftmost match is the most specific suffix
+        # (e.g. "_s_at" wins over "_at" because it starts earlier in the name).
+        suffix_re = re.compile(r"(_s_at|_x_at|_a_at|_g_at|_i_at|_st|_at)$")
+        probeset_data = []
+        unrecognized_probes = []
+        for probeset_name in self.units_by_name:
+            if probeset_name == "NONE":
+                continue
+            match = suffix_re.search(probeset_name)
+            if match:
+                suffix = match.group(1)
+                gene_id = probeset_name[: match.start()]
+            else:
+                # No known suffix: keep the full name as gene_id and report it below.
+                suffix = ""
+                gene_id = probeset_name
+                unrecognized_probes.append(probeset_name)
+            probeset_data.append({"probe_id": probeset_name, "gene_id": gene_id, "suffix": suffix})
+        # Explicit columns keep the frame well-formed when there are no probesets;
+        # without them set_index("probe_id") raises KeyError on an empty frame.
+        self.probeset_info = pd.DataFrame(probeset_data, columns=["probe_id", "gene_id", "suffix"])
+        # Set probe_id as index for fast lookup
+        self.probeset_info = self.probeset_info.set_index("probe_id", drop=False)
+        # Store the set of recognized suffixes that were actually found
+        self.recognized_suffixes = set(self.probeset_info[self.probeset_info["suffix"] != ""]["suffix"].unique())
+        # Warn if there are unrecognized suffixes
+        total_probes = len(probeset_data)
+        unrecognized_count = len(unrecognized_probes)
+        if unrecognized_count > 0:
+            percentage = (unrecognized_count / total_probes) * 100
+            warnings.warn(
+                f"Found {unrecognized_count} probe(s) ({percentage:.1f}%) with unrecognized suffixes. "
+                f"Examples: {unrecognized_probes[:5]}. "
+                f"Recognized suffixes in this CDF: {sorted(self.recognized_suffixes)}",
+                category=UserWarning,
+                stacklevel=3,
+            )
+    def _parse_cdf_section(self, lines: list[str], start_idx: int) -> int:
+        """Read the body of the [CDF] section and record its ``Version`` value.
+        Scanning starts at ``start_idx`` and stops at the first line that begins
+        with ``[`` (the next section header) or at the end of ``lines``.
+        Returns the index of the line where scanning stopped.
+        """
+        idx = start_idx
+        n_lines = len(lines)
+        while idx < n_lines:
+            raw = lines[idx]
+            if raw.startswith("["):
+                break
+            text = raw.strip()
+            if text.startswith("Version="):
+                # Everything after the first "=" is the version string.
+                self.version = text.partition("=")[2]
+            idx += 1
+        return idx
+    def _parse_chip_section(self, lines: list[str], start_idx: int) -> int:
+        """Read the body of the [Chip] section into ``self.chip_info``.
+        Collects ``key=value`` lines from ``start_idx`` up to the next line that
+        begins with ``[`` (or the end of ``lines``).  Missing keys default to
+        ``""`` for text fields and ``0`` for numeric ones.
+        Returns the index of the line where scanning stopped.
+        """
+        idx = start_idx
+        n_lines = len(lines)
+        fields = {}
+        while idx < n_lines and not lines[idx].startswith("["):
+            key, sep, value = lines[idx].strip().partition("=")
+            if sep:
+                fields[key] = value
+            idx += 1
+        def number(key):
+            # Numeric chip properties fall back to 0 when the key is absent.
+            return int(fields.get(key, 0))
+        self.chip_info = CdfChipInfo(
+            name=fields.get("Name", ""),
+            rows=number("Rows"),
+            cols=number("Cols"),
+            number_of_units=number("NumberOfUnits"),
+            max_unit=number("MaxUnit"),
+            num_qc_units=number("NumQCUnits"),
+            chip_reference=fields.get("ChipReference", ""),
+        )
+        return idx
+    def _parse_qc_unit(self, lines: list[str], start_idx: int) -> int:
+        """Parse one [QCn] section and append the result to ``self.qc_units``.
+        ``start_idx`` must point at the section header itself.  Returns the index
+        of the first line after the section body (the next line starting with
+        ``[``, or ``len(lines)``).
+        """
+        header = lines[start_idx].strip()
+        # Drop the leading "[QC" and the trailing "]" to get the QC number.
+        qc_num = int(header[3:-1])
+        idx = start_idx + 1
+        n_lines = len(lines)
+        props = {}
+        while idx < n_lines and not lines[idx].startswith("["):
+            key, sep, value = lines[idx].strip().partition("=")
+            if sep:
+                props[key] = value
+            idx += 1
+        unit_type = int(props.get("Type", 0))
+        number_cells = int(props.get("NumberCells", 0))
+        qc_unit = CdfQCUnit(qc_number=qc_num, unit_type=unit_type, number_cells=number_cells)
+        # Column names for the cell rows; "CellN" keys (N numeric) hold the rows.
+        columns = props.get("CellHeader", "").split()
+        qc_unit.cells.extend(
+            self._parse_cell(row, columns, is_qc=True)
+            for key, row in props.items()
+            if key.startswith("Cell") and key[4:].isdigit()
+        )
+        self.qc_units.append(qc_unit)
+        return idx
+    def _parse_unit(self, lines: list[str], start_idx: int) -> int:
+        """Parse one [UnitN] section together with its [UnitN_Block...] sections.
+        ``start_idx`` must point at the unit header.  The unit is appended to
+        ``self.units`` and registered in ``self.units_by_name`` under the name of
+        each of its blocks, except blocks named ``"NONE"``.
+        Returns the index of the first line not consumed.
+        """
+        header = lines[start_idx].strip()
+        # Drop the leading "[Unit" and the trailing "]" to get the unit number.
+        unit_num = int(header[5:-1])
+        idx = start_idx + 1
+        n_lines = len(lines)
+        props = {}
+        while idx < n_lines and not lines[idx].startswith("["):
+            key, sep, value = lines[idx].strip().partition("=")
+            if sep:
+                props[key] = value
+            idx += 1
+        # Direction and UnitType stay None when absent; the counters default to 0.
+        direction = int(props["Direction"]) if "Direction" in props else None
+        num_atoms = int(props.get("NumAtoms", 0))
+        num_cells = int(props.get("NumCells", 0))
+        unit_type = int(props["UnitType"]) if "UnitType" in props else None
+        number_blocks = int(props.get("NumberBlocks", 0))
+        unit = CdfUnit(
+            name=props.get("Name", "NONE"),
+            unit_number=unit_num,
+            direction=direction,
+            num_atoms=num_atoms,
+            num_cells=num_cells,
+            unit_type=unit_type,
+            number_blocks=number_blocks,
+        )
+        # Consume the consecutive block sections that belong to this unit.
+        block_prefix = f"[Unit{unit_num}_Block"
+        while idx < n_lines and lines[idx].startswith(block_prefix):
+            idx = self._parse_block(lines, idx, unit)
+        self.units.append(unit)
+        # Index by block name (probeset name)
+        self.units_by_name.update({blk.name: unit for blk in unit.blocks if blk.name != "NONE"})
+        return idx
+    def _parse_block(self, lines: list[str], start_idx: int, unit: CdfUnit) -> int:
+        """Parse one unit block section and append it to ``unit.blocks``.
+        ``start_idx`` must point at the block header, which is skipped.  Returns
+        the index of the first line after the block body (the next line starting
+        with ``[``, or ``len(lines)``).
+        """
+        idx = start_idx + 1
+        n_lines = len(lines)
+        props = {}
+        while idx < n_lines and not lines[idx].startswith("["):
+            key, sep, value = lines[idx].strip().partition("=")
+            if sep:
+                props[key] = value
+            idx += 1
+        def number(key):
+            # Numeric block properties fall back to 0 when the key is absent.
+            return int(props.get(key, 0))
+        block = CdfBlock(
+            name=props.get("Name", ""),
+            block_number=number("BlockNumber"),
+            num_atoms=number("NumAtoms"),
+            num_cells=number("NumCells"),
+            start_position=number("StartPosition"),
+            stop_position=number("StopPosition"),
+        )
+        # Column names for the cell rows; "CellN" keys (N numeric) hold the rows.
+        columns = props.get("CellHeader", "").split()
+        block.cells.extend(
+            self._parse_cell(row, columns, is_qc=False)
+            for key, row in props.items()
+            if key.startswith("Cell") and key[4:].isdigit()
+        )
+        unit.blocks.append(block)
+        return idx
+    def _parse_cell(self, cell_line: str, header: list[str], is_qc: bool) -> CdfCell:
+        """Turn one cell row into a CdfCell.
+        ``cell_line`` is split on tabs; if that yields fewer values than there are
+        ``header`` columns, it is re-split on any whitespace.  Values are matched
+        to the lower-cased header names by position, and columns without a value
+        are treated as missing.  ``is_qc`` selects which optional fields are
+        copied onto the cell: the QC set or the regular-unit set.
+        """
+        values = cell_line.split("\t")
+        if len(values) < len(header):
+            values = cell_line.split()
+        # Pad with None so every header column has an entry (no-op when not short).
+        padded = values + [None] * (len(header) - len(values))
+        cell_dict = {column.lower(): value for column, value in zip(header, padded)}
+        # Create cell with common fields
+        cell = CdfCell(x=int(cell_dict.get("x", 0)), y=int(cell_dict.get("y", 0)), probe=cell_dict.get("probe", "N"))
+        # (field name, converter) pairs, processed in this order.
+        if is_qc:
+            wanted = (
+                ("plen", int),
+                ("atom", int),
+                ("index", int),
+                ("match", int),
+                ("bg", int),
+            )
+        else:
+            wanted = (
+                ("feat", str),
+                ("qual", str),
+                ("expos", int),
+                ("pos", int),
+                ("cbase", str),
+                ("pbase", str),
+                ("tbase", str),
+                ("atom", int),
+                ("index", int),
+                ("codonind", int),
+                ("codon", int),
+                ("regiontype", int),
+                ("region", str),
+            )
+        for attr, convert in wanted:
+            raw = cell_dict.get(attr)
+            # Missing or empty values leave the field at its None default.
+            if raw:
+                setattr(cell, attr, convert(raw))
+        return cell
+    def get_probeset_indices(self, probeset_name: str) -> list[tuple]:
+        """Return the (X, Y) position of every cell in the named probeset.
+        Parameters
+        ----------
+        probeset_name : str
+            Name of the probeset (e.g., 'AFFX-BioB-5_at')
+        Returns:
+        -------
+        List[tuple]
+            List of (x, y) coordinate tuples, block by block in stored order;
+            empty when the probeset name is unknown.
+        """
+        try:
+            unit = self.units_by_name[probeset_name]
+        except KeyError:
+            return []
+        return [(cell.x, cell.y) for block in unit.blocks for cell in block.cells]
+    def get_pm_indices(self, probeset_name: str) -> list[tuple]:
+        """Return the (X, Y) positions of the Perfect Match (PM) probes of a probeset.
+        Parameters
+        ----------
+        probeset_name : str
+            Name of the probeset (e.g., 'AFFX-BioB-5_at')
+        Returns:
+        -------
+        list[tuple]
+            List of (x, y) coordinate tuples for PM probes; empty when the
+            probeset name is unknown.
+        """
+        coords = []
+        if probeset_name in self.units_by_name:
+            for block in self.units_by_name[probeset_name].blocks:
+                coords.extend((cell.x, cell.y) for cell in block.pm_cells)
+        return coords
+    def get_mm_indices(self, probeset_name: str) -> list[tuple]:
+        """Return the (X, Y) positions of the Mismatch (MM) probes of a probeset.
+        Parameters
+        ----------
+        probeset_name : str
+            Name of the probeset (e.g., 'AFFX-BioB-5_at')
+        Returns:
+        -------
+        list[tuple]
+            List of (x, y) coordinate tuples for MM probes; empty when the
+            probeset name is unknown.
+        """
+        coords = []
+        if probeset_name in self.units_by_name:
+            for block in self.units_by_name[probeset_name].blocks:
+                coords.extend((cell.x, cell.y) for cell in block.mm_cells)
+        return coords
+    def get_pm_mm_map(self) -> dict[tuple, str]:
+        """Map every (X, Y) position with a known probe type to ``'pm'`` or ``'mm'``.
+        Cells whose ``is_pm`` is neither ``True`` nor ``False`` are left out.  When
+        several cells share a position, the one visited last wins.
+        Returns:
+        -------
+        dict[tuple, str]
+            Dictionary mapping (x, y) tuples to ``'pm'`` or ``'mm'``.
+        """
+        pm_mm: dict[tuple, str] = {}
+        for unit in self.units:
+            for block in unit.blocks:
+                for cell in block.cells:
+                    flag = cell.is_pm
+                    if flag is True:
+                        label = "pm"
+                    elif flag is False:
+                        label = "mm"
+                    else:
+                        # Probe type cannot be determined for this cell.
+                        continue
+                    pm_mm[(cell.x, cell.y)] = label
+        return pm_mm
+    def get_xy_to_probeset_map(self) -> dict[tuple, str]:
+        """Map every cell position to the name of the block (probeset) holding it.
+        Blocks named ``"NONE"`` are skipped.  When several cells share a position,
+        the one visited last wins.
+        Returns:
+        -------
+        Dict[tuple, str]
+            Dictionary mapping (x, y) tuples to probeset names
+        """
+        xy_map = {}
+        for unit in self.units:
+            for block in unit.blocks:
+                if block.name == "NONE":
+                    continue
+                xy_map.update({(cell.x, cell.y): block.name for cell in block.cells})
+        return xy_map
+    def __repr__(self):
+        """Summarise the file: version, chip name ('N/A' if unknown) and unit counts."""
+        chip_name = self.chip_info.name if self.chip_info else "N/A"
+        unit_count = len(self.units)
+        qc_count = len(self.qc_units)
+        return f"CdfFile(version='{self.version}', chip='{chip_name}', units={unit_count}, qc_units={qc_count})"
+    def get_annotated_array(self) -> np.ndarray:
+        """Generate numpy array in size of the microarray plate containing annotations.
+        The array has shape ``(chip_info.rows, chip_info.cols)`` and dtype
+        ``object``.  Element ``[y, x]`` holds the probeset name of the cell at
+        position (x, y), or ``None`` where no named probeset cell is located.
+        Positions outside the chip grid, including negative coordinates, are
+        skipped.  Requires ``chip_info`` to be set, i.e. a [Chip] section must
+        have been parsed.
+        Returns:
+        -------
+        np.ndarray
+            Object array of probeset names.
+        """
+        rows, cols = self.chip_info.rows, self.chip_info.cols
+        array = np.full((rows, cols), fill_value=None, dtype=object)
+        for (x, y), probeset in self.get_xy_to_probeset_map().items():
+            # Check bounds before touching the array: indexing first raises
+            # IndexError past the grid and lets negative coordinates wrap around.
+            if not (0 <= x < cols and 0 <= y < rows):
+                continue
+            if array[y, x] is not None:
+                warnings.warn(
+                    f"Warning: Overwriting existing probeset at ({x}, {y})",
+                    category=UserWarning,
+                    stacklevel=2,
+                )
+            array[y, x] = probeset
+        return array
+def parse_cdf(filepath: str) -> CdfFile:
+    """Convenience function to parse a CDF file.
+    Equivalent to ``CdfFile.read(filepath)``: takes the path of a plain-text or
+    gzipped CDF file and returns the parsed ``CdfFile``.
+    """
+    return CdfFile.read(filepath)