microarray 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. microarray-0.1.0/PKG-INFO +75 -0
  2. microarray-0.1.0/README.md +21 -0
  3. microarray-0.1.0/pyproject.toml +107 -0
  4. microarray-0.1.0/src/microarray/__init__.py +15 -0
  5. microarray-0.1.0/src/microarray/_version.py +3 -0
  6. microarray-0.1.0/src/microarray/datasets/__init__.py +3 -0
  7. microarray-0.1.0/src/microarray/datasets/_arrayexpress.py +1 -0
  8. microarray-0.1.0/src/microarray/datasets/_cdf_files.py +35 -0
  9. microarray-0.1.0/src/microarray/datasets/_geo.py +1 -0
  10. microarray-0.1.0/src/microarray/datasets/_utils.py +143 -0
  11. microarray-0.1.0/src/microarray/io/__init__.py +17 -0
  12. microarray-0.1.0/src/microarray/io/_anndata_converter.py +198 -0
  13. microarray-0.1.0/src/microarray/io/_cdf.py +575 -0
  14. microarray-0.1.0/src/microarray/io/_cel.py +591 -0
  15. microarray-0.1.0/src/microarray/io/_read.py +127 -0
  16. microarray-0.1.0/src/microarray/plotting/__init__.py +28 -0
  17. microarray-0.1.0/src/microarray/plotting/_base.py +253 -0
  18. microarray-0.1.0/src/microarray/plotting/_cel.py +75 -0
  19. microarray-0.1.0/src/microarray/plotting/_de_plots.py +239 -0
  20. microarray-0.1.0/src/microarray/plotting/_diagnostic_plots.py +268 -0
  21. microarray-0.1.0/src/microarray/plotting/_heatmap.py +279 -0
  22. microarray-0.1.0/src/microarray/plotting/_ma_plots.py +136 -0
  23. microarray-0.1.0/src/microarray/plotting/_pca.py +320 -0
  24. microarray-0.1.0/src/microarray/plotting/_qc_plots.py +335 -0
  25. microarray-0.1.0/src/microarray/plotting/_score.py +38 -0
  26. microarray-0.1.0/src/microarray/plotting/_top_table_heatmap.py +98 -0
  27. microarray-0.1.0/src/microarray/plotting/_utils.py +280 -0
  28. microarray-0.1.0/src/microarray/preprocessing/__init__.py +39 -0
  29. microarray-0.1.0/src/microarray/preprocessing/_background.py +862 -0
  30. microarray-0.1.0/src/microarray/preprocessing/_log2.py +77 -0
  31. microarray-0.1.0/src/microarray/preprocessing/_normalize.py +1292 -0
  32. microarray-0.1.0/src/microarray/preprocessing/_rma.py +243 -0
  33. microarray-0.1.0/src/microarray/preprocessing/_robust.py +170 -0
  34. microarray-0.1.0/src/microarray/preprocessing/_summarize.py +318 -0
  35. microarray-0.1.0/src/microarray/py.typed +0 -0
  36. microarray-0.1.0/src/microarray/tools/__init__.py +26 -0
  37. microarray-0.1.0/src/microarray/tools/_biomart.py +416 -0
  38. microarray-0.1.0/src/microarray/tools/_empirical_bayes.py +401 -0
  39. microarray-0.1.0/src/microarray/tools/_fdist.py +171 -0
  40. microarray-0.1.0/src/microarray/tools/_linear_models.py +387 -0
  41. microarray-0.1.0/src/microarray/tools/_mds.py +101 -0
  42. microarray-0.1.0/src/microarray/tools/_pca.py +88 -0
  43. microarray-0.1.0/src/microarray/tools/_score.py +86 -0
  44. microarray-0.1.0/src/microarray/tools/_toptable.py +360 -0
@@ -0,0 +1,75 @@
1
+ Metadata-Version: 2.4
2
+ Name: microarray
3
+ Version: 0.1.0
4
+ Summary: Microarray analysis tools
5
+ Author: harryhaller001
6
+ Author-email: harryhaller001 <harryhaller001@gmail.com>
7
+ License-Expression: MIT
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Intended Audience :: Science/Research
10
+ Classifier: Natural Language :: English
11
+ Classifier: Operating System :: OS Independent
12
+ Classifier: Programming Language :: Python :: 3 :: Only
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Classifier: Programming Language :: Python :: 3.14
17
+ Classifier: Typing :: Typed
18
+ Requires-Dist: adjusttext>=1.3
19
+ Requires-Dist: anndata
20
+ Requires-Dist: click
21
+ Requires-Dist: matplotlib
22
+ Requires-Dist: requests
23
+ Requires-Dist: scikit-learn
24
+ Requires-Dist: scipy
25
+ Requires-Dist: statsmodels
26
+ Requires-Dist: ipython ; extra == 'docs'
27
+ Requires-Dist: myst-parser ; extra == 'docs'
28
+ Requires-Dist: nbsphinx ; extra == 'docs'
29
+ Requires-Dist: sphinx ; extra == 'docs'
30
+ Requires-Dist: sphinx-autoapi ; extra == 'docs'
31
+ Requires-Dist: sphinx-autodoc-typehints ; extra == 'docs'
32
+ Requires-Dist: sphinx-book-theme ; extra == 'docs'
33
+ Requires-Dist: decoupler>=2.1.4 ; extra == 'test'
34
+ Requires-Dist: ipykernel ; extra == 'test'
35
+ Requires-Dist: ipython ; extra == 'test'
36
+ Requires-Dist: ipywidgets ; extra == 'test'
37
+ Requires-Dist: pre-commit ; extra == 'test'
38
+ Requires-Dist: pytest ; extra == 'test'
39
+ Requires-Dist: pytest-cov ; extra == 'test'
40
+ Requires-Dist: responses ; extra == 'test'
41
+ Requires-Dist: ruff ; extra == 'test'
42
+ Requires-Dist: scanpy>=1.11.5 ; extra == 'test'
43
+ Requires-Dist: tqdm>=4.67.3 ; extra == 'test'
44
+ Requires-Dist: twine ; extra == 'test'
45
+ Requires-Dist: ty>=0.0.16 ; extra == 'test'
46
+ Requires-Dist: types-requests ; extra == 'test'
47
+ Maintainer: harryhaller001
48
+ Maintainer-email: harryhaller001 <harryhaller001@gmail.com>
49
+ Requires-Python: >=3.11
50
+ Project-URL: Source, https://github.com/harryhaller001/microarray
51
+ Provides-Extra: docs
52
+ Provides-Extra: test
53
+ Description-Content-Type: text/markdown
54
+
55
+ # microarray
56
+ Processing microarray data in Python
57
+
58
+
59
+ ## Installation
60
+
61
+ Use `pip` to install the microarray package:
62
+
63
+ `pip install microarray`
64
+
65
+ ## Usage
66
+
67
+ ```python
68
+ import microarray as ma
69
+ ```
70
+
71
+ See tutorial notebook for detailed workflow examples.
72
+
73
+ ## License
74
+
75
+ MIT
@@ -0,0 +1,21 @@
1
+ # microarray
2
+ Processing microarray data in Python
3
+
4
+
5
+ ## Installation
6
+
7
+ Use `pip` to install the microarray package:
8
+
9
+ `pip install microarray`
10
+
11
+ ## Usage
12
+
13
+ ```python
14
+ import microarray as ma
15
+ ```
16
+
17
+ See tutorial notebook for detailed workflow examples.
18
+
19
+ ## License
20
+
21
+ MIT
@@ -0,0 +1,107 @@
1
+ [build-system]
2
+ build-backend = "uv_build"
3
+ requires = [ "uv-build>=0.9.28,<0.10" ]
4
+
5
+ [project]
6
+ name = "microarray"
7
+ version = "0.1.0"
8
+ description = "Microarray analysis tools"
9
+ readme = { file = "README.md", content-type = "text/markdown" }
10
+ license = "MIT"
11
+ maintainers = [ { name = "harryhaller001", email = "harryhaller001@gmail.com" } ]
12
+ authors = [ { name = "harryhaller001", email = "harryhaller001@gmail.com" } ]
13
+ requires-python = ">=3.11"
14
+ classifiers = [
15
+ "Development Status :: 3 - Alpha",
16
+ "Intended Audience :: Science/Research",
17
+ "Natural Language :: English",
18
+ "Operating System :: OS Independent",
19
+ "Programming Language :: Python :: 3 :: Only",
20
+ "Programming Language :: Python :: 3.11",
21
+ "Programming Language :: Python :: 3.12",
22
+ "Programming Language :: Python :: 3.13",
23
+ "Programming Language :: Python :: 3.14",
24
+ "Typing :: Typed",
25
+ ]
26
+ dependencies = [
27
+ "adjusttext>=1.3",
28
+ "anndata",
29
+ "click",
30
+ "matplotlib",
31
+ "requests",
32
+ "scikit-learn",
33
+ "scipy",
34
+ "statsmodels",
35
+ ]
36
+ optional-dependencies.docs = [
37
+ "ipython", # Required for syntax highlighting (https://github.com/spatialaudio/nbsphinx/issues/24)
38
+ "myst-parser",
39
+ "nbsphinx",
40
+ "sphinx",
41
+ "sphinx-autoapi",
42
+ "sphinx-autodoc-typehints",
43
+ "sphinx-book-theme",
44
+ ]
45
+ optional-dependencies.test = [
46
+ "decoupler>=2.1.4",
47
+ "ipykernel",
48
+ "ipython",
49
+ "ipywidgets",
50
+ "pre-commit",
51
+ "pytest",
52
+ "pytest-cov",
53
+ "responses",
54
+ "ruff",
55
+ "scanpy>=1.11.5",
56
+ "tqdm>=4.67.3",
57
+ "twine",
58
+ "ty>=0.0.16",
59
+ "types-requests",
60
+ ]
61
+ # urls.Documentation = "TODO"
62
+ urls.Source = "https://github.com/harryhaller001/microarray"
63
+
64
+ [tool.uv]
65
+ package = true
66
+
67
+ [tool.ruff]
68
+ line-length = 120
69
+ format.docstring-code-format = true
70
+ lint.select = [ "B", "BLE", "C4", "D", "E", "F", "I", "RUF100", "TID", "UP", "W" ]
71
+ lint.ignore = [ "B008", "C408", "D100", "D104", "D105", "D107", "D203", "D213", "D400", "D401", "E501", "E731", "E741" ]
72
+ lint.per-file-ignores."*/__init__.py" = [ "F401" ]
73
+ lint.per-file-ignores."docs/*" = [ "I" ]
74
+ lint.per-file-ignores."test/*" = [ "D" ]
75
+ lint.pydocstyle.convention = "google"
76
+
77
+ [tool.pyproject-fmt]
78
+ column_width = 120 # column width after which arrays/dicts are split across multiple lines; 1 forces one entry per line
79
+ indent = 4
80
+ keep_full_version = false # if false will remove unnecessary trailing ``.0``'s from version specifiers
81
+ max_supported_python = "3.14" # maximum Python version to use when generating version specifiers
82
+
83
+ [tool.pytest]
84
+ ini_options.minversion = "7.0"
85
+ ini_options.log_format = "%(asctime)s %(levelname)s %(message)s"
86
+ ini_options.log_date_format = "%Y-%m-%d %H:%M:%S"
87
+ ini_options.log_level = "INFO"
88
+ ini_options.log_cli = true
89
+ ini_options.python_files = "test_*.py"
90
+ ini_options.testpaths = [ "tests" ]
91
+ ini_options.xfail_strict = true
92
+ ini_options.addopts = [
93
+ "--import-mode=importlib", # allow using test files with same name
94
+ "--cov=src/microarray",
95
+ "--cov-report=html:coverage_report",
96
+ "--cov-report=term",
97
+ ]
98
+
99
+ [tool.coverage]
100
+ run.omit = [ "*/tests/*" ]
101
+ run.source = [ "src/microarray" ]
102
+ report.exclude_lines = [ "raise" ]
103
+ report.ignore_errors = true
104
+ html.directory = "coverage_report"
105
+
106
+ [tool.ty]
107
+ src.include = [ "src", "tests", "docs" ]
@@ -0,0 +1,15 @@
1
+ import microarray.datasets as datasets
2
+ import microarray.io as io
3
+ import microarray.plotting as pl
4
+ import microarray.preprocessing as pp
5
+ import microarray.tools as tl
6
+ from microarray._version import __version__
7
+
8
+ __all__ = [
9
+ "__version__",
10
+ "io",
11
+ "pl",
12
+ "pp",
13
+ "tl",
14
+ "datasets",
15
+ ]
@@ -0,0 +1,3 @@
1
from importlib.metadata import version

# Resolve the version from the installed distribution metadata so it is
# defined in exactly one place (pyproject.toml) and never drifts from the
# published package version.
__version__: str = version("microarray")
@@ -0,0 +1,3 @@
1
+ from microarray.datasets._cdf_files import hgu133a_cdf, hgu133plus2_cdf
2
+
3
+ __all__ = ["hgu133a_cdf", "hgu133plus2_cdf"]
@@ -0,0 +1 @@
1
+ # TODO: download dataset utils for arrayexpress datasets, e.g. from https://www.ebi.ac.uk/arrayexpress/experiments/E-GEOD-2109/files/
@@ -0,0 +1,35 @@
1
+ import os
2
+
3
+ from microarray.io._cdf import CdfFile, parse_cdf
4
+
5
+
6
def _get_cache_dir() -> str:
    """Returns the path to the cache directory for CDF files.

    Returns:
        str: Path to ``<cwd>/.cache/cdf``.

    Raises:
        FileNotFoundError: If the cache directory does not exist.
    """
    cache_dir = os.path.join(os.getcwd(), ".cache", "cdf")
    # Raise instead of `assert`: assertions are stripped under `python -O`,
    # so environment validation must not rely on them.
    # TODO: create the directory (os.makedirs) once downloads are implemented.
    if not os.path.isdir(cache_dir):
        raise FileNotFoundError(f"Cache directory not found: {cache_dir}")
    return cache_dir
12
+
13
+
14
def hgu133a_cdf() -> CdfFile:
    """Load the CDF definition for the hgu133a microarray platform.

    Looks for the cached ENTREZG custom CDF file and parses it.

    Returns:
        CdfFile: Parsed CDF file.

    Raises:
        FileNotFoundError: If the cached CDF file does not exist.
    """
    cdf_path = os.path.join(_get_cache_dir(), "GPL24120_HGU133A_Hs_ENTREZG.cdf.gz")

    if os.path.isfile(cdf_path):
        return parse_cdf(cdf_path)

    # TODO: download from GEO if not found
    raise FileNotFoundError(f"CDF file not found: {cdf_path}")
24
+
25
+
26
def hgu133plus2_cdf() -> CdfFile:
    """Load the CDF definition for the hgu133plus2 microarray platform.

    Looks for the cached ENTREZG custom CDF file and parses it.

    Returns:
        CdfFile: Parsed CDF file.

    Raises:
        FileNotFoundError: If the cached CDF file does not exist.
    """
    cdf_path = os.path.join(_get_cache_dir(), "GPL22945_HGU133Plus2_Hs_ENTREZG.cdf.gz")

    if os.path.isfile(cdf_path):
        return parse_cdf(cdf_path)

    # TODO: download from GEO if not found
    raise FileNotFoundError(f"CDF file not found: {cdf_path}")
@@ -0,0 +1 @@
1
+ # TODO: Download utils from GEO datasets, e.g. from https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE2109
@@ -0,0 +1,143 @@
1
+ import os
2
+ from typing import Any, BinaryIO, Literal
3
+ from urllib.parse import urlencode, urlunparse
4
+
5
+ import requests
6
+
7
+ TIMEOUT = 1000
8
+
9
+
10
+ def build_url(
11
+ hostname: str,
12
+ path: str,
13
+ scheme: Literal["http", "https", "ftp"] | str = "https",
14
+ query: dict[str, str | int | float] | None = None,
15
+ fragment: str | None = None,
16
+ ) -> str:
17
+ """Build url from parts.
18
+
19
+ Args:
20
+ hostname (str): Hostname of url (e.g. "example.com").
21
+ path (str): Path of url.
22
+ scheme (Literal["http", "https", "ftp"] | str, optional): Schema of url. Defaults to `https`.
23
+ query (dict[str, str | int | float] | None, optional): Query parameter. Defaults to `None`.
24
+ fragment (str | None, optional): Fragment of url. Defaults to `None`.
25
+
26
+ Returns:
27
+ str: Full url as string.
28
+ """
29
+ query_encoded: str = ""
30
+
31
+ if query is not None:
32
+ query_encoded = urlencode(query)
33
+
34
+ url = urlunparse(
35
+ [
36
+ scheme,
37
+ hostname,
38
+ path,
39
+ "",
40
+ query_encoded,
41
+ "" if fragment is None else fragment,
42
+ ]
43
+ )
44
+ return url
45
+
46
+
47
+ def _file_or_buffer(
48
+ output_file: str | BinaryIO,
49
+ overwrite: bool = False,
50
+ ) -> BinaryIO:
51
+ """Harmonize file name and buffer.
52
+
53
+ Args:
54
+ output_file (str | BinaryIO): Filename to save object to or file object.
55
+ overwrite (bool, optional): Flag to overwrite, if output file already exist. Defaults to `False`.
56
+
57
+ Returns:
58
+ Memory buffer or opened File buffer.
59
+
60
+ Raises:
61
+ FileExistsError: If `overwrite=False` and output file already exists.
62
+ """
63
+ file_object: BinaryIO
64
+
65
+ if isinstance(output_file, str):
66
+ if overwrite is False and os.path.isfile(output_file):
67
+ raise FileExistsError(f"Output file '{output_file}' already exists.")
68
+
69
+ file_object = open(output_file, "wb")
70
+ else:
71
+ file_object = output_file
72
+
73
+ return file_object
74
+
75
+
76
def download_file_stream(
    url: str,
    output_file: str | BinaryIO,
    overwrite: bool = False,
    chunk_size: int = 8192,
    request_kwargs: dict[str, Any] | None = None,
) -> None:
    """Download file as stream. Source: https://stackoverflow.com/questions/16694907/download-large-file-in-python-with-requests.

    Args:
        url (str): URL to download.
        output_file (str | BinaryIO): Filename to save object to or file object.
        overwrite (bool, optional): Flag to overwrite, if output file already exist. Defaults to `False`.
        chunk_size (int, optional): Size of downloaded chunk. Defaults to `8192`.
        request_kwargs (dict[str, Any] | None, optional): Extra arguments passed to `requests.get` function. Defaults to `None`.

    Raises:
        FileExistsError: If `overwrite=False` and output file already exists.
    """
    file_object = _file_or_buffer(
        output_file=output_file,
        overwrite=overwrite,
    )
    # We own (and must close) the handle only when we opened it from a path;
    # a caller-supplied buffer is left open for the caller to manage.
    owns_handle = isinstance(output_file, str)

    # Copy before inserting the default timeout so the caller's dict is not
    # mutated as a side effect.
    request_kwargs = dict(request_kwargs or {})
    request_kwargs.setdefault("timeout", TIMEOUT)

    try:
        with requests.get(url, stream=True, **request_kwargs) as requests_content:
            requests_content.raise_for_status()
            for chunk in requests_content.iter_content(chunk_size=chunk_size):
                file_object.write(chunk)
    finally:
        if owns_handle:
            # Close even on error so a partially written file is not leaked.
            file_object.close()
110
+
111
+
112
def download_file(
    url: str,
    output_file: str | BinaryIO,
    overwrite: bool = False,
    request_kwargs: dict[str, Any] | None = None,
) -> None:
    """Download file.

    Args:
        url (str): URL to download.
        output_file (str | BinaryIO): Filename to save object to or file object.
        overwrite (bool, optional): Flag to overwrite, if output file already exist. Defaults to `False`.
        request_kwargs (dict[str, Any] | None, optional): Extra arguments passed to `requests.get` function. Defaults to `None`.

    Raises:
        FileExistsError: If `overwrite=False` and output file already exists.
    """
    file_object = _file_or_buffer(
        output_file=output_file,
        overwrite=overwrite,
    )
    # We own (and must close) the handle only when we opened it from a path;
    # a caller-supplied buffer is left open for the caller to manage.
    owns_handle = isinstance(output_file, str)

    # Copy before inserting the default timeout so the caller's dict is not
    # mutated as a side effect.
    request_kwargs = dict(request_kwargs or {})
    request_kwargs.setdefault("timeout", TIMEOUT)

    try:
        response = requests.get(url, **request_kwargs)
        response.raise_for_status()
        file_object.write(response.content)
    finally:
        if owns_handle:
            # Close even on error so a partially written file is not leaked.
            file_object.close()
@@ -0,0 +1,17 @@
1
+ from microarray.io._anndata_converter import (
2
+ cel_to_anndata,
3
+ )
4
+ from microarray.io._cdf import CdfFile, parse_cdf
5
+ from microarray.io._cel import CelFile, apply_probe_annotation, parse_cel
6
+ from microarray.io._read import read_cel, read_cel_batch
7
+
8
+ __all__ = [
9
+ "apply_probe_annotation",
10
+ "parse_cel",
11
+ "parse_cdf",
12
+ "read_cel",
13
+ "read_cel_batch",
14
+ "CelFile",
15
+ "CdfFile",
16
+ "cel_to_anndata",
17
+ ]
@@ -0,0 +1,198 @@
1
+ """AnnData converter for microarray CEL/CDF data."""
2
+
3
+ import os
4
+ from collections import defaultdict
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ from anndata import AnnData
9
+
10
+ from microarray.io._cdf import CdfFile, parse_cdf
11
+ from microarray.io._cel import CelFile, parse_cel
12
+
13
+
14
def cel_to_anndata(
    cel_path: str | CelFile,
    cdf_path: str | CdfFile,
    sample_name: str | None = None,
) -> AnnData:
    """Convert a single CEL file to a probe-level AnnData object.

    Creates an AnnData with shape ``(1, n_cells)`` where each variable
    represents an individual CDF cell (probe):

    - ``.X`` contains per-cell intensities.
    - ``.var`` contains ``probeset_id``, ``probe_index`` (0-based rank within
      probeset by ascending ``expos``, so 0 = 3' end), ``probe_type``
      (``"pm"``, ``"mm"``, or ``"unknown"``) and, when the CDF provides
      probeset annotation, ``gene_id`` (gene identifier extracted from the
      probeset name) and ``suffix`` (Affymetrix probe type suffix such as
      ``"_at"`` for antisense target).
    - ``.obs`` contains sample metadata from the CEL header.
    - ``.layers`` contains per-cell QC arrays: ``stdevs``, ``npixels``,
      ``masks``, ``outliers``, ``modified``.
    - ``.uns`` contains CDF chip info and CEL file metadata.

    Args:
        cel_path (str | CelFile): Path to the CEL file or a parsed
            :class:`~microarray.io.CelFile` instance.
        cdf_path (str | CdfFile): Path to the CDF file or a parsed
            :class:`~microarray.io.CdfFile` instance.
        sample_name (str | None, optional): Name used as the observation
            index. Defaults to the CEL file's basename, or ``"unknown"``
            when a :class:`~microarray.io.CelFile` instance is supplied
            without a path.

    Returns:
        AnnData: Probe-level AnnData with shape ``(1, n_cells)``.
    """
    # Parse input files if they are paths; keep the path (when given) so it
    # can serve as the default sample name below.
    if isinstance(cel_path, str):
        cel_file_path = cel_path
        cel_file = parse_cel(cel_path)
    else:
        cel_file_path = None
        cel_file = cel_path

    if isinstance(cdf_path, str):
        cdf_file = parse_cdf(cdf_path)
    else:
        cdf_file = cdf_path

    if sample_name is None:
        sample_name = os.path.basename(cel_file_path) if cel_file_path else "unknown"

    # Single-row obs: one observation per CEL file / sample.
    obs = pd.DataFrame(
        {
            "sample_name": [sample_name],
            "cel_version": [cel_file.version],
            "nrows": [cel_file.nrows],
            "ncols": [cel_file.ncols],
            "algorithm": [cel_file.algorithm],
        },
        index=[sample_name],
    )

    # File-level metadata that has neither per-probe nor per-sample shape.
    uns = {
        "cel_metadata": {
            "total_x": cel_file.total_x,
            "total_y": cel_file.total_y,
            "offset_x": cel_file.offset_x,
            "offset_y": cel_file.offset_y,
            "grid_corner_ul": cel_file.grid_corner_ul,
            "grid_corner_ur": cel_file.grid_corner_ur,
            "grid_corner_ll": cel_file.grid_corner_ll,
            "grid_corner_lr": cel_file.grid_corner_lr,
            "axis_invert_x": cel_file.axis_invert_x,
            "axis_invert_y": cel_file.axis_invert_y,
            "swap_xy": cel_file.swap_xy,
            "algorithm_parameters": cel_file.algorithm_parameters,
            "dat_header": cel_file.dat_header,
        },
        "cdf_metadata": {
            "chip_name": cdf_file.chip_info.name,
            "chip_rows": cdf_file.chip_info.rows,
            "chip_cols": cdf_file.chip_info.cols,
            "number_of_units": cdf_file.chip_info.number_of_units,
            "max_unit": cdf_file.chip_info.max_unit,
            "num_qc_units": cdf_file.chip_info.num_qc_units,
        },
    }

    # Collect cells per probeset to compute ordinal ranks.
    # Structure: {probeset_id: [(expos, x, y, is_pm), ...]}
    ps_cells: dict[str, list[tuple]] = defaultdict(list)

    for unit in cdf_file.units:
        for block in unit.blocks:
            # Prefer the block name; fall back to the unit name when the
            # block name is empty or the "NONE" placeholder.
            probeset_id = block.name if block.name and block.name != "NONE" else unit.name
            for cell in block.cells:
                # Missing expos sorts first (treated as 0) so such cells
                # rank at the 3' end.
                expos_val = cell.expos if cell.expos is not None else 0
                ps_cells[probeset_id].append((expos_val, cell.x, cell.y, cell.is_pm))

    # Sort by expos within each probeset and assign ordinal rank.
    # Smallest expos = 3' end = rank 0.
    probeset_ids: list[str] = []
    probe_indices: list[int] = []
    probe_types: list[str] = []
    probe_var_ids: list[str] = []
    intensities_list: list[float] = []
    stdevs_list: list[float] = []
    npixels_list: list[float] = []
    masks_list: list[float] = []
    outliers_list: list[float] = []
    modified_list: list[float] = []

    for probeset_id, cells in ps_cells.items():
        cells_sorted = sorted(cells, key=lambda c: c[0])
        for ordinal_rank, (_, cx, cy, cell_is_pm) in enumerate(cells_sorted):
            # CEL arrays are indexed [y, x]; any missing QC array yields NaN
            # for all of its cells.
            intensity = float(cel_file.intensities[cy, cx]) if cel_file.intensities is not None else float("nan")
            stdev = float(cel_file.stdevs[cy, cx]) if cel_file.stdevs is not None else float("nan")
            npixels = float(cel_file.npixels[cy, cx]) if cel_file.npixels is not None else float("nan")
            masks = float(cel_file.masks[cy, cx]) if cel_file.masks is not None else float("nan")
            outliers = float(cel_file.outliers[cy, cx]) if cel_file.outliers is not None else float("nan")
            modified = float(cel_file.modified[cy, cx]) if cel_file.modified is not None else float("nan")

            # is_pm is tri-state: True (perfect match), False (mismatch),
            # None (unknown).
            if cell_is_pm is True:
                probe_type = "pm"
            elif cell_is_pm is False:
                probe_type = "mm"
            else:
                probe_type = "unknown"

            # Coordinates make the var index unique even within a probeset.
            var_id = f"{probeset_id}:{cx}:{cy}"
            probeset_ids.append(probeset_id)
            probe_indices.append(ordinal_rank)
            probe_types.append(probe_type)
            probe_var_ids.append(var_id)
            intensities_list.append(intensity)
            stdevs_list.append(stdev)
            npixels_list.append(npixels)
            masks_list.append(masks)
            outliers_list.append(outliers)
            modified_list.append(modified)

    n_cells = len(probeset_ids)
    X = np.array(intensities_list, dtype=np.float32).reshape(1, n_cells)

    var = pd.DataFrame(
        {
            "probeset_id": probeset_ids,
            "probe_index": probe_indices,
            "probe_type": probe_types,
        },
        index=probe_var_ids,
    )

    # Merge probeset_info (gene_id and suffix) into var
    if cdf_file.probeset_info is not None:
        # Map probeset_id to gene_id and suffix by looking up in probeset_info
        # NOTE(review): assumes the probeset_info index has unique entries so
        # `.loc[ps_id]` returns a single row — confirm in parse_cdf.
        gene_ids = []
        suffixes = []
        for ps_id in probeset_ids:
            if ps_id in cdf_file.probeset_info.index:
                info = cdf_file.probeset_info.loc[ps_id]
                gene_ids.append(info["gene_id"])
                suffixes.append(info["suffix"])
            else:
                # Fallback if probeset not in probeset_info
                gene_ids.append(ps_id)
                suffixes.append("")

        var["gene_id"] = gene_ids
        var["suffix"] = suffixes

    # Per-cell QC arrays mirror the shape of X.
    layers = {
        "stdevs": np.array(stdevs_list, dtype=np.float32).reshape(1, n_cells),
        "npixels": np.array(npixels_list, dtype=np.float32).reshape(1, n_cells),
        "masks": np.array(masks_list, dtype=np.float32).reshape(1, n_cells),
        "outliers": np.array(outliers_list, dtype=np.float32).reshape(1, n_cells),
        "modified": np.array(modified_list, dtype=np.float32).reshape(1, n_cells),
    }

    return AnnData(X=X, obs=obs, var=var, layers=layers, uns=uns)