PyPI - microarray - Versions diffs - 0.1.0__py3-none-any.whl - Mend

microarray 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

microarray/__init__.py +15 -0
microarray/_version.py +3 -0
microarray/datasets/__init__.py +3 -0
microarray/datasets/_arrayexpress.py +1 -0
microarray/datasets/_cdf_files.py +35 -0
microarray/datasets/_geo.py +1 -0
microarray/datasets/_utils.py +143 -0
microarray/io/__init__.py +17 -0
microarray/io/_anndata_converter.py +198 -0
microarray/io/_cdf.py +575 -0
microarray/io/_cel.py +591 -0
microarray/io/_read.py +127 -0
microarray/plotting/__init__.py +28 -0
microarray/plotting/_base.py +253 -0
microarray/plotting/_cel.py +75 -0
microarray/plotting/_de_plots.py +239 -0
microarray/plotting/_diagnostic_plots.py +268 -0
microarray/plotting/_heatmap.py +279 -0
microarray/plotting/_ma_plots.py +136 -0
microarray/plotting/_pca.py +320 -0
microarray/plotting/_qc_plots.py +335 -0
microarray/plotting/_score.py +38 -0
microarray/plotting/_top_table_heatmap.py +98 -0
microarray/plotting/_utils.py +280 -0
microarray/preprocessing/__init__.py +39 -0
microarray/preprocessing/_background.py +862 -0
microarray/preprocessing/_log2.py +77 -0
microarray/preprocessing/_normalize.py +1292 -0
microarray/preprocessing/_rma.py +243 -0
microarray/preprocessing/_robust.py +170 -0
microarray/preprocessing/_summarize.py +318 -0
microarray/py.typed +0 -0
microarray/tools/__init__.py +26 -0
microarray/tools/_biomart.py +416 -0
microarray/tools/_empirical_bayes.py +401 -0
microarray/tools/_fdist.py +171 -0
microarray/tools/_linear_models.py +387 -0
microarray/tools/_mds.py +101 -0
microarray/tools/_pca.py +88 -0
microarray/tools/_score.py +86 -0
microarray/tools/_toptable.py +360 -0
microarray-0.1.0.dist-info/METADATA +75 -0
microarray-0.1.0.dist-info/RECORD +44 -0
microarray-0.1.0.dist-info/WHEEL +4 -0

microarray/io/_cel.py ADDED Viewed

@@ -0,0 +1,591 @@
+import gzip
+import os
+import struct
+from io import FileIO
+from typing import TYPE_CHECKING
+import numpy as np
+from anndata import AnnData
+if TYPE_CHECKING:
+    from microarray.io._cdf import CdfFile
+# Section names of a version 3 (text) CEL file that the text parser recognizes.
+__supported_sections = [
+    "HEADER",
+    "INTENSITY",
+    "MASKS",
+    "OUTLIERS",
+    "MODIFIED",
+]
+# Header keys whose value is converted with int(); each maps to the name of
+# the CelFile attribute that receives the result.
+__header_integer_keys = {
+    "Cols": "ncols",
+    "Rows": "nrows",
+    "TotalX": "total_x",
+    "TotalY": "total_y",
+    "OffsetX": "offset_x",
+    "OffsetY": "offset_y",
+}
+# Header keys whose value is a space-separated list of integers; the parsers
+# store it as a tuple of ints on the CelFile attribute named here.
+__header_integer_tuple_keys = {
+    "GridCornerUL": "grid_corner_ul",
+    "GridCornerUR": "grid_corner_ur",
+    "GridCornerLL": "grid_corner_ll",
+    "GridCornerLR": "grid_corner_lr",
+}
+# Header keys holding an integer flag, converted with bool(int(value)).
+# NOTE(review): "Axis-invertX" is spelled differently from "AxisInvertY";
+# presumably this mirrors the key names written in CEL files - confirm
+# against a real file before normalizing the spelling.
+__header_boolean_keys = {
+    "Axis-invertX": "axis_invert_x",
+    "AxisInvertY": "axis_invert_y",
+    "swapXY": "swap_xy",
+}
+# Header keys whose value is stored verbatim as a string.
+__header_string_keys = {
+    "Algorithm": "algorithm",
+}
+class CelFile:
+    """Class representing the contents of a CEL file."""
+    def __init__(self):
+        self.version: int
+        self.nrows: int | None = None
+        self.ncols: int | None = None
+        self.total_x: int | None = None
+        self.total_y: int | None = None
+        self.offset_x: int | None = None
+        self.offset_y: int | None = None
+        self.grid_corner_ul: tuple[int, int] | None = None
+        self.grid_corner_ur: tuple[int, int] | None = None
+        self.grid_corner_ll: tuple[int, int] | None = None
+        self.grid_corner_lr: tuple[int, int] | None = None
+        self.algorithm: str | None = None
+        self.axis_invert_x: bool = False
+        self.axis_invert_y: bool = False
+        self.swap_xy: bool = False
+        self.dat_header: dict[str, str] = {}
+        self.algorithm_parameters: dict[str, str] = {}
+        # Numpy arrays - initialized after dimensions are known
+        self.intensities: np.ndarray | None = None
+        self.stdevs: np.ndarray | None = None
+        self.npixels: np.ndarray | None = None
+        self.masks: np.ndarray | None = None
+        self.outliers: np.ndarray | None = None
+        self.modified: np.ndarray | None = None
+        # Probe annotation array - populated by apply_probe_annotation()
+        self.probe_annotation: np.ndarray | None = None
+    def summary(self):
+        """Print a summary of the CEL file contents."""
+        print(f"Version: {self.version}")
+        print(f"Dimensions: {self.ncols} x {self.nrows}")
+        print(f"Algorithm: {self.algorithm}")
+        print(f"Algorithm Parameters: {self.algorithm_parameters}")
+        if self.intensities is not None:
+            print("\nData Summary:")
+            print(f"\tIntensity array shape: {self.intensities.shape}")
+            print(f"\tNon-zero intensity cells: {np.count_nonzero(self.intensities)}")
+            print(f"\tMasked cells: {np.sum(self.masks)}")
+            print(f"\tOutlier cells: {np.sum(self.outliers)}")
+            print(f"\tModified cells: {np.sum(~np.isnan(self.modified))}")
+            print("\nIntensity statistics:")
+            print(f"\tMean: {np.mean(self.intensities):.2f}")
+            print(f"\tStd: {np.std(self.intensities):.2f}")
+            print(f"\tMin: {np.min(self.intensities):.2f}")
+            print(f"\tMax: {np.max(self.intensities):.2f}")
+    def __str__(self):
+        return f"CelFile(version={self.version}, dimensions=({self.ncols} x {self.nrows}), algorithm={self.algorithm})"
+    def __repr__(self):
+        return self.__str__()
+def read_cel(
+    path: str,
+    cdf_path: str | None = None,
+    as_anndata: bool = True,
+    **kwargs,
+) -> AnnData | CelFile:
+    """Read a CEL file and return a probe-level AnnData object or CelFile object.
+    Parameters
+    ----------
+    path : str
+        Path to the CEL file.
+    cdf_path : str | None
+        Path to the CDF file (required if as_anndata=True).
+    as_anndata : bool
+        If ``True`` (default), return a probe-level AnnData object (requires
+        *cdf_path*).  If ``False``, return a :class:`CelFile` object.
+    **kwargs
+        Additional keyword arguments forwarded to
+        :func:`microarray.io.cel_to_anndata` (``sample_name``).
+    Returns:
+    -------
+    AnnData | CelFile
+        If ``as_anndata=True``: probe-level AnnData with shape (1, n_cells).
+        If ``as_anndata=False``: :class:`CelFile` object.
+    Raises:
+    ------
+    ValueError
+        If ``as_anndata=True`` but *cdf_path* is not provided.
+    """
+    if as_anndata:
+        if cdf_path is None:
+            raise ValueError(
+                "cdf_path is required when as_anndata=True. Set as_anndata=False to get a CelFile object instead."
+            )
+        from microarray.io._anndata_converter import cel_to_anndata
+        # Forward only the kwargs accepted by the new cel_to_anndata signature
+        allowed = {"sample_name"}
+        filtered_kwargs = {k: v for k, v in kwargs.items() if k in allowed}
+        return cel_to_anndata(path, cdf_path, **filtered_kwargs)
+    else:
+        return parse_cel(path)
+def apply_probe_annotation(cel_file: "CelFile", cdf_file: "CdfFile") -> None:
+    """Apply probe annotation from a CDF file to a CelFile instance.
+    Populates the ``probe_annotation`` attribute of *cel_file* with a
+    2-D numpy array of shape ``(nrows, ncols)`` and dtype ``object``.  Each
+    element contains the probeset (unit) name for that array position as
+    defined in the CDF file, or ``None`` for positions not covered by the CDF.
+    Parameters
+    ----------
+    cel_file : CelFile
+        A parsed CEL file instance.  Must have valid ``nrows`` and ``ncols``
+        attributes (i.e. :func:`parse_cel` must have been called first).
+    cdf_file : CdfFile
+        A parsed CDF file instance.
+    Returns:
+    --------
+    None
+        The ``probe_annotation`` attribute of *cel_file* is updated in-place.
+    Raises:
+    -------
+    ValueError
+        If *cel_file* does not have valid dimensions.
+    """
+    if cel_file.nrows is None or cel_file.ncols is None:
+        raise ValueError(
+            "CelFile does not have valid dimensions. "
+            "Make sure parse_cel was called successfully before applying annotation."
+        )
+    probe_annotation = cdf_file.get_annotated_array()
+    assert probe_annotation.shape == (cel_file.nrows, cel_file.ncols), (
+        f"Probe annotation shape {probe_annotation.shape} does not match CEL file dimensions "
+        f"({cel_file.nrows}, {cel_file.ncols})."
+    )
+    cel_file.probe_annotation = probe_annotation
+def parse_cel(path: str | os.PathLike | FileIO) -> CelFile:
+    """Parse a CEL file and return a CelFile object.
+    Supports both version 3 (text) and version 4 (binary) CEL formats.
+    Parameters
+    ----------
+    path : str | os.PathLike | FileIO
+        Path to the CEL file.
+    Returns:
+    --------
+    CelFile
+        A CelFile object containing the parsed contents of the CEL file.
+    """
+    # Read the entire file as binary
+    if isinstance(path, (str, os.PathLike)):
+        if str(path).endswith(".gz"):
+            with gzip.open(path, "rb") as f:
+                file_data = f.read()
+        else:
+            with open(path, "rb") as f:
+                file_data = f.read()
+    elif isinstance(path, FileIO):
+        file_data = path.read()
+    else:
+        raise TypeError("Unsupported path type")
+    # Detect version
+    is_v4 = False
+    if len(file_data) >= 8:
+        try:
+            magic, version = struct.unpack("<II", file_data[:8])
+            if magic == 64:
+                is_v4 = True
+        except struct.error:
+            pass
+    if is_v4:
+        return _parse_cel_v4(file_data)
+    # Default to version 3 (text format)
+    return _parse_cel_v3(file_data)
+def _parse_cel_v3(file_data: bytes) -> CelFile:
+    """Parse version 3 (text format) CEL file."""
+    try:
+        content = file_data.decode("utf-8").splitlines()
+    except UnicodeDecodeError:
+        content = file_data.decode("latin-1").splitlines()
+    assert content[0].strip() == "[CEL]", f"Expected '[CEL]' header, got: {content[0].strip()}"
+    def _parse_line_key_value(line: str) -> tuple[str, str]:
+        """Parse a line in the format 'key=value' and return the key and value as a tuple."""
+        key, value = line.strip().split("=", 1)
+        return key, value
+    cel_file = CelFile()
+    # Parse version from second line
+    version = _parse_line_key_value(content[1])[1]
+    cel_file.version = int(version)
+    last_section = None
+    for line in content[2:]:
+        clean_line = line.strip("\r\n")
+        if not clean_line:
+            continue
+        if clean_line.startswith("[") and clean_line.endswith("]"):
+            section_name = clean_line[1:-1]
+            if section_name in __supported_sections:
+                last_section = section_name
+        else:
+            assert last_section is not None, "Data line found before any section header"
+            if last_section == "HEADER":
+                key, value = _parse_line_key_value(clean_line)
+                if key in __header_integer_keys:
+                    setattr(cel_file, __header_integer_keys[key], int(value))
+                    # Initialize numpy arrays once we have dimensions
+                    if cel_file.ncols is not None and cel_file.nrows is not None:
+                        if cel_file.intensities is None:
+                            cel_file.intensities = np.zeros((cel_file.nrows, cel_file.ncols), dtype=np.float32)
+                            cel_file.stdevs = np.zeros((cel_file.nrows, cel_file.ncols), dtype=np.float32)
+                            cel_file.npixels = np.zeros((cel_file.nrows, cel_file.ncols), dtype=np.int32)
+                            cel_file.masks = np.zeros((cel_file.nrows, cel_file.ncols), dtype=bool)
+                            cel_file.outliers = np.zeros((cel_file.nrows, cel_file.ncols), dtype=bool)
+                            cel_file.modified = np.full((cel_file.nrows, cel_file.ncols), np.nan, dtype=np.float32)
+                elif key in __header_integer_tuple_keys:
+                    tuple_value = tuple(map(int, value.split(" ")))
+                    setattr(cel_file, __header_integer_tuple_keys[key], tuple_value)
+                elif key in __header_boolean_keys:
+                    setattr(cel_file, __header_boolean_keys[key], bool(int(value)))
+                elif key in __header_string_keys:
+                    setattr(cel_file, __header_string_keys[key], value)
+                elif key == "DatHeader":
+                    # Parse DatHeader - complex format with multiple fields
+                    cel_file.dat_header["_raw"] = value
+                    # Split by first colon to separate identifier from the rest
+                    if ":" in value:
+                        parts = value.split(":", 1)
+                        # The part before first colon often contains identifier info
+                        # identifier_part = parts[0]
+                        data_part = parts[1] if len(parts) > 1 else ""
+                        # Extract key=value pairs from the data part
+                        tokens = data_part.split()
+                        for token in tokens:
+                            if "=" in token:
+                                try:
+                                    dat_key, dat_value = token.split("=", 1)
+                                    # Try to convert to int or float
+                                    if dat_value.isdecimal():
+                                        dat_value = int(dat_value)
+                                    elif "." in dat_value and dat_value.replace(".", "", 1).isdecimal():
+                                        dat_value = float(dat_value)
+                                    cel_file.dat_header[dat_key] = dat_value
+                                except ValueError:
+                                    # Skip malformed pairs
+                                    pass
+                elif key == "AlgorithmParameters":
+                    # Parse algorithm parameters in format "key1:value1;key2:value2"
+                    pairs = value.split(";")
+                    for pair in pairs:
+                        if ":" in pair:
+                            param_key, param_value = pair.split(":", 1)
+                            # Try to convert to int or float
+                            if param_value.isdecimal():
+                                param_value = int(param_value)
+                            elif "." in param_value and param_value.replace(".", "", 1).isdecimal():
+                                param_value = float(param_value)
+                            cel_file.algorithm_parameters[param_key] = param_value
+                else:
+                    # Skip unknown header keys or store them
+                    pass
+            elif last_section == "INTENSITY":
+                # Skip section headers like "NumberCells=..." and "CellHeader=..."
+                if "=" in clean_line and not clean_line[0].isdigit():
+                    continue
+                # Parse intensity data: X Y MEAN STDV NPIXELS
+                parts = clean_line.split()
+                if len(parts) == 5:
+                    x, y, mean, stdv, npix = parts
+                    x, y = int(x), int(y)
+                    cel_file.intensities[y, x] = float(mean)
+                    cel_file.stdevs[y, x] = float(stdv)
+                    cel_file.npixels[y, x] = int(npix)
+            elif last_section == "MASKS":
+                # Skip section headers
+                if "=" in clean_line and not clean_line[0].isdigit():
+                    continue
+                # Parse mask data: X Y
+                parts = clean_line.split()
+                if len(parts) == 2:
+                    x, y = int(parts[0]), int(parts[1])
+                    cel_file.masks[y, x] = True
+            elif last_section == "OUTLIERS":
+                # Skip section headers
+                if "=" in clean_line and not clean_line[0].isdigit():
+                    continue
+                # Parse outlier data: X Y
+                parts = clean_line.split()
+                if len(parts) == 2:
+                    x, y = int(parts[0]), int(parts[1])
+                    cel_file.outliers[y, x] = True
+            elif last_section == "MODIFIED":
+                # Skip section headers
+                if "=" in clean_line and not clean_line[0].isdigit():
+                    continue
+                # Parse modified data: X Y ORIGMEAN
+                parts = clean_line.split()
+                if len(parts) == 3:
+                    x, y = int(parts[0]), int(parts[1])
+                    origmean = float(parts[2])
+                    cel_file.modified[y, x] = origmean
+    return cel_file
+def _parse_cel_v4(file_data: bytes) -> CelFile:
+    """Parse version 4 (binary format) CEL file.
+    Version 4 CEL format:
+    - 64-byte binary header
+    - Text metadata section
+    - Binary algorithm name and parameters (length-prefixed strings)
+    - Masks section (count + coordinate pairs)
+    - Outliers section (count + coordinate pairs)
+    - Intensity data (mean, stdv, npixels per cell in row-major order)
+    Parameters
+    ----------
+    file_data : bytes
+        The raw binary data of the CEL file
+    Returns:
+    -------
+    CelFile
+        Parsed CEL file object
+    """
+    cel_file = CelFile()
+    # Parse 64-byte binary header
+    magic, version, ncols, nrows, ncells, header_len = struct.unpack("<IIIIII", file_data[:24])
+    cel_file.version = version
+    cel_file.ncols = ncols
+    cel_file.nrows = nrows
+    cel_file.total_x = ncols
+    cel_file.total_y = nrows
+    # Initialize numpy arrays
+    cel_file.intensities = np.zeros((nrows, ncols), dtype=np.float32)
+    cel_file.stdevs = np.zeros((nrows, ncols), dtype=np.float32)
+    cel_file.npixels = np.zeros((nrows, ncols), dtype=np.int32)
+    cel_file.masks = np.zeros((nrows, ncols), dtype=bool)
+    cel_file.outliers = np.zeros((nrows, ncols), dtype=bool)
+    cel_file.modified = np.full((nrows, ncols), np.nan, dtype=np.float32)
+    # Parse text metadata section (starts at byte 24, length is header_len bytes)
+    text_metadata = file_data[24 : 24 + header_len].decode("latin-1", errors="ignore")
+    # Parse key=value pairs from text metadata
+    for line in text_metadata.splitlines():
+        line = line.strip()
+        if not line or "=" not in line:
+            continue
+        try:
+            key, value = line.split("=", 1)
+            if key in __header_integer_keys:
+                # Skip since we already have dims from binary header
+                pass
+            elif key in __header_integer_tuple_keys:
+                tuple_value = tuple(map(int, value.split(" ")))
+                setattr(cel_file, __header_integer_tuple_keys[key], tuple_value)
+            elif key in __header_boolean_keys:
+                setattr(cel_file, __header_boolean_keys[key], bool(int(value)))
+            elif key in __header_string_keys:
+                setattr(cel_file, __header_string_keys[key], value)
+            elif key == "DatHeader":
+                # Parse DatHeader - complex format with multiple fields
+                # Format: [range] scanner_info:CLS=... RWS=... XIN=... YIN=... VE=... date time ... chip_type ...
+                cel_file.dat_header["_raw"] = value
+                # Extract pixel range [min..max]
+                import re
+                range_match = re.match(r"\[(\d+)\.\.(\d+)\]", value)
+                if range_match:
+                    cel_file.dat_header["pixel_min"] = int(range_match.group(1))
+                    cel_file.dat_header["pixel_max"] = int(range_match.group(2))
+                # Split by first colon to separate scanner info from the rest
+                if ":" in value:
+                    parts = value.split(":", 1)
+                    # Extract scanner info (between ] and :)
+                    scanner_part = parts[0]
+                    if "]" in scanner_part:
+                        scanner_info = scanner_part.split("]", 1)[1].strip()
+                        if scanner_info:
+                            cel_file.dat_header["scanner_info"] = scanner_info
+                    data_part = parts[1] if len(parts) > 1 else ""
+                    tokens = data_part.split()
+                    non_kv_tokens = []
+                    for token in tokens:
+                        if "=" in token:
+                            try:
+                                dat_key, dat_value = token.split("=", 1)
+                                if dat_value.isdecimal():
+                                    dat_value = int(dat_value)
+                                elif "." in dat_value and dat_value.replace(".", "", 1).isdecimal():
+                                    dat_value = float(dat_value)
+                                cel_file.dat_header[dat_key] = dat_value
+                            except ValueError:
+                                pass
+                        else:
+                            non_kv_tokens.append(token)
+                    # Try to extract date/time and chip type from remaining tokens
+                    date_pattern = re.compile(r"\d{1,2}/\d{1,2}/\d{2,4}")
+                    time_pattern = re.compile(r"\d{1,2}:\d{2}:\d{2}")
+                    for i, token in enumerate(non_kv_tokens):
+                        if date_pattern.match(token):
+                            cel_file.dat_header["scan_date"] = token
+                            if i + 1 < len(non_kv_tokens) and time_pattern.match(non_kv_tokens[i + 1]):
+                                cel_file.dat_header["scan_time"] = non_kv_tokens[i + 1]
+                        elif "_" in token or "." in token:
+                            if any(c.isalpha() for c in token):
+                                cel_file.dat_header["chip_type"] = token
+            elif key == "Algorithm":
+                cel_file.algorithm = value
+            elif key == "AlgorithmParameters":
+                pairs = value.split(";")
+                for pair in pairs:
+                    if ":" in pair:
+                        param_key, param_value = pair.split(":", 1)
+                        try:
+                            if param_value.isdecimal():
+                                param_value = int(param_value)
+                            elif "." in param_value and param_value.replace(".", "", 1).isdecimal():
+                                param_value = float(param_value)
+                            cel_file.algorithm_parameters[param_key] = param_value
+                        except ValueError:
+                            cel_file.algorithm_parameters[param_key] = param_value
+        except ValueError:
+            # Skip malformed lines
+            continue
+    # Position after text metadata
+    pos = 24 + header_len
+    # Read algorithm name (length-prefixed string)
+    algo_len = struct.unpack("<I", file_data[pos : pos + 4])[0]
+    pos += 4
+    if cel_file.algorithm is None and algo_len < 1000:
+        cel_file.algorithm = file_data[pos : pos + algo_len].decode("latin-1", errors="ignore")
+    pos += algo_len
+    # Read algorithm parameters (length-prefixed string)
+    param_len = struct.unpack("<I", file_data[pos : pos + 4])[0]
+    pos += 4
+    pos += param_len  # Skip, already parsed from text section
+    # Read masks section
+    nmasks = struct.unpack("<I", file_data[pos : pos + 4])[0]
+    pos += 4
+    for _ in range(nmasks):
+        if pos + 4 <= len(file_data):
+            x, y = struct.unpack("<HH", file_data[pos : pos + 4])
+            if y < nrows and x < ncols:
+                cel_file.masks[y, x] = True
+            pos += 4
+    # Read outliers section
+    noutliers = struct.unpack("<I", file_data[pos : pos + 4])[0]
+    pos += 4
+    for _ in range(noutliers):
+        if pos + 4 <= len(file_data):
+            x, y = struct.unpack("<HH", file_data[pos : pos + 4])
+            if y < nrows and x < ncols:
+                cel_file.outliers[y, x] = True
+            pos += 4
+    # Read modified section (if exists)
+    # Note: Modified section format is: count (4 bytes) + entries (x, y, orig_value)
+    # Each entry is 8 bytes: x (uint16), y (uint16), orig_value (float32)
+    if pos + 4 <= len(file_data):
+        nmodified = struct.unpack("<I", file_data[pos : pos + 4])[0]
+        # Sanity check: nmodified should be reasonable
+        if 0 < nmodified < ncells and pos + 4 + nmodified * 8 < len(file_data):
+            pos += 4
+            for _ in range(nmodified):
+                if pos + 8 <= len(file_data):
+                    x, y, orig_val = struct.unpack("<HHf", file_data[pos : pos + 8])
+                    if y < nrows and x < ncols:
+                        cel_file.modified[y, x] = orig_val
+                    pos += 8
+                else:
+                    break
+        elif nmodified == 0:
+            pos += 4  # Skip the zero count
+    # Read intensity data
+    # Format: mean (float32), stdv (float32), npixels (uint16) per cell
+    # Data is in row-major order (cell 0 = (0,0), cell 1 = (0,1), etc.)
+    for i in range(ncells):
+        if pos + 10 <= len(file_data):
+            mean, stdv, npix = struct.unpack("<ffH", file_data[pos : pos + 10])
+            y = i // ncols
+            x = i % ncols
+            cel_file.intensities[y, x] = mean
+            cel_file.stdevs[y, x] = stdv
+            cel_file.npixels[y, x] = npix
+            pos += 10
+        else:
+            # If we run out of data, stop reading
+            break
+    return cel_file