microarray 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- microarray-0.1.0/PKG-INFO +75 -0
- microarray-0.1.0/README.md +21 -0
- microarray-0.1.0/pyproject.toml +107 -0
- microarray-0.1.0/src/microarray/__init__.py +15 -0
- microarray-0.1.0/src/microarray/_version.py +3 -0
- microarray-0.1.0/src/microarray/datasets/__init__.py +3 -0
- microarray-0.1.0/src/microarray/datasets/_arrayexpress.py +1 -0
- microarray-0.1.0/src/microarray/datasets/_cdf_files.py +35 -0
- microarray-0.1.0/src/microarray/datasets/_geo.py +1 -0
- microarray-0.1.0/src/microarray/datasets/_utils.py +143 -0
- microarray-0.1.0/src/microarray/io/__init__.py +17 -0
- microarray-0.1.0/src/microarray/io/_anndata_converter.py +198 -0
- microarray-0.1.0/src/microarray/io/_cdf.py +575 -0
- microarray-0.1.0/src/microarray/io/_cel.py +591 -0
- microarray-0.1.0/src/microarray/io/_read.py +127 -0
- microarray-0.1.0/src/microarray/plotting/__init__.py +28 -0
- microarray-0.1.0/src/microarray/plotting/_base.py +253 -0
- microarray-0.1.0/src/microarray/plotting/_cel.py +75 -0
- microarray-0.1.0/src/microarray/plotting/_de_plots.py +239 -0
- microarray-0.1.0/src/microarray/plotting/_diagnostic_plots.py +268 -0
- microarray-0.1.0/src/microarray/plotting/_heatmap.py +279 -0
- microarray-0.1.0/src/microarray/plotting/_ma_plots.py +136 -0
- microarray-0.1.0/src/microarray/plotting/_pca.py +320 -0
- microarray-0.1.0/src/microarray/plotting/_qc_plots.py +335 -0
- microarray-0.1.0/src/microarray/plotting/_score.py +38 -0
- microarray-0.1.0/src/microarray/plotting/_top_table_heatmap.py +98 -0
- microarray-0.1.0/src/microarray/plotting/_utils.py +280 -0
- microarray-0.1.0/src/microarray/preprocessing/__init__.py +39 -0
- microarray-0.1.0/src/microarray/preprocessing/_background.py +862 -0
- microarray-0.1.0/src/microarray/preprocessing/_log2.py +77 -0
- microarray-0.1.0/src/microarray/preprocessing/_normalize.py +1292 -0
- microarray-0.1.0/src/microarray/preprocessing/_rma.py +243 -0
- microarray-0.1.0/src/microarray/preprocessing/_robust.py +170 -0
- microarray-0.1.0/src/microarray/preprocessing/_summarize.py +318 -0
- microarray-0.1.0/src/microarray/py.typed +0 -0
- microarray-0.1.0/src/microarray/tools/__init__.py +26 -0
- microarray-0.1.0/src/microarray/tools/_biomart.py +416 -0
- microarray-0.1.0/src/microarray/tools/_empirical_bayes.py +401 -0
- microarray-0.1.0/src/microarray/tools/_fdist.py +171 -0
- microarray-0.1.0/src/microarray/tools/_linear_models.py +387 -0
- microarray-0.1.0/src/microarray/tools/_mds.py +101 -0
- microarray-0.1.0/src/microarray/tools/_pca.py +88 -0
- microarray-0.1.0/src/microarray/tools/_score.py +86 -0
- microarray-0.1.0/src/microarray/tools/_toptable.py +360 -0
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: microarray
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Microarray analysis tools
|
|
5
|
+
Author: harryhaller001
|
|
6
|
+
Author-email: harryhaller001 <harryhaller001@gmail.com>
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Science/Research
|
|
10
|
+
Classifier: Natural Language :: English
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
17
|
+
Classifier: Typing :: Typed
|
|
18
|
+
Requires-Dist: adjusttext>=1.3
|
|
19
|
+
Requires-Dist: anndata
|
|
20
|
+
Requires-Dist: click
|
|
21
|
+
Requires-Dist: matplotlib
|
|
22
|
+
Requires-Dist: requests
|
|
23
|
+
Requires-Dist: scikit-learn
|
|
24
|
+
Requires-Dist: scipy
|
|
25
|
+
Requires-Dist: statsmodels
|
|
26
|
+
Requires-Dist: ipython ; extra == 'docs'
|
|
27
|
+
Requires-Dist: myst-parser ; extra == 'docs'
|
|
28
|
+
Requires-Dist: nbsphinx ; extra == 'docs'
|
|
29
|
+
Requires-Dist: sphinx ; extra == 'docs'
|
|
30
|
+
Requires-Dist: sphinx-autoapi ; extra == 'docs'
|
|
31
|
+
Requires-Dist: sphinx-autodoc-typehints ; extra == 'docs'
|
|
32
|
+
Requires-Dist: sphinx-book-theme ; extra == 'docs'
|
|
33
|
+
Requires-Dist: decoupler>=2.1.4 ; extra == 'test'
|
|
34
|
+
Requires-Dist: ipykernel ; extra == 'test'
|
|
35
|
+
Requires-Dist: ipython ; extra == 'test'
|
|
36
|
+
Requires-Dist: ipywidgets ; extra == 'test'
|
|
37
|
+
Requires-Dist: pre-commit ; extra == 'test'
|
|
38
|
+
Requires-Dist: pytest ; extra == 'test'
|
|
39
|
+
Requires-Dist: pytest-cov ; extra == 'test'
|
|
40
|
+
Requires-Dist: responses ; extra == 'test'
|
|
41
|
+
Requires-Dist: ruff ; extra == 'test'
|
|
42
|
+
Requires-Dist: scanpy>=1.11.5 ; extra == 'test'
|
|
43
|
+
Requires-Dist: tqdm>=4.67.3 ; extra == 'test'
|
|
44
|
+
Requires-Dist: twine ; extra == 'test'
|
|
45
|
+
Requires-Dist: ty>=0.0.16 ; extra == 'test'
|
|
46
|
+
Requires-Dist: types-requests ; extra == 'test'
|
|
47
|
+
Maintainer: harryhaller001
|
|
48
|
+
Maintainer-email: harryhaller001 <harryhaller001@gmail.com>
|
|
49
|
+
Requires-Python: >=3.11
|
|
50
|
+
Project-URL: Source, https://github.com/harryhaller001/microarray
|
|
51
|
+
Provides-Extra: docs
|
|
52
|
+
Provides-Extra: test
|
|
53
|
+
Description-Content-Type: text/markdown
|
|
54
|
+
|
|
55
|
+
# microarray
|
|
56
|
+
Processing microarray data in Python
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
## Installation
|
|
60
|
+
|
|
61
|
+
Use `pip` to install the microarray package:
|
|
62
|
+
|
|
63
|
+
`pip install microarray`
|
|
64
|
+
|
|
65
|
+
## Usage
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
import microarray as ma
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
See tutorial notebook for detailed workflow examples.
|
|
72
|
+
|
|
73
|
+
## License
|
|
74
|
+
|
|
75
|
+
MIT
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# microarray
|
|
2
|
+
Processing microarray data in Python
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
Use `pip` to install the microarray package:
|
|
8
|
+
|
|
9
|
+
`pip install microarray`
|
|
10
|
+
|
|
11
|
+
## Usage
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
import microarray as ma
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
See tutorial notebook for detailed workflow examples.
|
|
18
|
+
|
|
19
|
+
## License
|
|
20
|
+
|
|
21
|
+
MIT
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
build-backend = "uv_build"
|
|
3
|
+
requires = [ "uv-build>=0.9.28,<0.10" ]
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "microarray"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Microarray analysis tools"
|
|
9
|
+
readme = { file = "README.md", content-type = "text/markdown" }
|
|
10
|
+
license = "MIT"
|
|
11
|
+
maintainers = [ { name = "harryhaller001", email = "harryhaller001@gmail.com" } ]
|
|
12
|
+
authors = [ { name = "harryhaller001", email = "harryhaller001@gmail.com" } ]
|
|
13
|
+
requires-python = ">=3.11"
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 3 - Alpha",
|
|
16
|
+
"Intended Audience :: Science/Research",
|
|
17
|
+
"Natural Language :: English",
|
|
18
|
+
"Operating System :: OS Independent",
|
|
19
|
+
"Programming Language :: Python :: 3 :: Only",
|
|
20
|
+
"Programming Language :: Python :: 3.11",
|
|
21
|
+
"Programming Language :: Python :: 3.12",
|
|
22
|
+
"Programming Language :: Python :: 3.13",
|
|
23
|
+
"Programming Language :: Python :: 3.14",
|
|
24
|
+
"Typing :: Typed",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"adjusttext>=1.3",
|
|
28
|
+
"anndata",
|
|
29
|
+
"click",
|
|
30
|
+
"matplotlib",
|
|
31
|
+
"requests",
|
|
32
|
+
"scikit-learn",
|
|
33
|
+
"scipy",
|
|
34
|
+
"statsmodels",
|
|
35
|
+
]
|
|
36
|
+
optional-dependencies.docs = [
|
|
37
|
+
"ipython", # Required for syntax highlighing (https://github.com/spatialaudio/nbsphinx/issues/24)
|
|
38
|
+
"myst-parser",
|
|
39
|
+
"nbsphinx",
|
|
40
|
+
"sphinx",
|
|
41
|
+
"sphinx-autoapi",
|
|
42
|
+
"sphinx-autodoc-typehints",
|
|
43
|
+
"sphinx-book-theme",
|
|
44
|
+
]
|
|
45
|
+
optional-dependencies.test = [
|
|
46
|
+
"decoupler>=2.1.4",
|
|
47
|
+
"ipykernel",
|
|
48
|
+
"ipython",
|
|
49
|
+
"ipywidgets",
|
|
50
|
+
"pre-commit",
|
|
51
|
+
"pytest",
|
|
52
|
+
"pytest-cov",
|
|
53
|
+
"responses",
|
|
54
|
+
"ruff",
|
|
55
|
+
"scanpy>=1.11.5",
|
|
56
|
+
"tqdm>=4.67.3",
|
|
57
|
+
"twine",
|
|
58
|
+
"ty>=0.0.16",
|
|
59
|
+
"types-requests",
|
|
60
|
+
]
|
|
61
|
+
# urls.Documentation = "TODO"
|
|
62
|
+
urls.Source = "https://github.com/harryhaller001/microarray"
|
|
63
|
+
|
|
64
|
+
[tool.uv]
|
|
65
|
+
package = true
|
|
66
|
+
|
|
67
|
+
[tool.ruff]
|
|
68
|
+
line-length = 120
|
|
69
|
+
format.docstring-code-format = true
|
|
70
|
+
lint.select = [ "B", "BLE", "C4", "D", "E", "F", "I", "RUF100", "TID", "UP", "W" ]
|
|
71
|
+
lint.ignore = [ "B008", "C408", "D100", "D104", "D105", "D107", "D203", "D213", "D400", "D401", "E501", "E731", "E741" ]
|
|
72
|
+
lint.per-file-ignores."*/__init__.py" = [ "F401" ]
|
|
73
|
+
lint.per-file-ignores."docs/*" = [ "I" ]
|
|
74
|
+
lint.per-file-ignores."test/*" = [ "D" ]
|
|
75
|
+
lint.pydocstyle.convention = "google"
|
|
76
|
+
|
|
77
|
+
[tool.pyproject-fmt]
|
|
78
|
+
column_width = 120 # after how many column width split arrays/dicts into multiple lines, 1 will force always
|
|
79
|
+
indent = 4
|
|
80
|
+
keep_full_version = false # if false will remove unnecessary trailing ``.0``'s from version specifiers
|
|
81
|
+
max_supported_python = "3.14" # maximum Python version to use when generating version specifiers
|
|
82
|
+
|
|
83
|
+
[tool.pytest]
|
|
84
|
+
ini_options.minversion = "7.0"
|
|
85
|
+
ini_options.log_format = "%(asctime)s %(levelname)s %(message)s"
|
|
86
|
+
ini_options.log_date_format = "%Y-%m-%d %H:%M:%S"
|
|
87
|
+
ini_options.log_level = "INFO"
|
|
88
|
+
ini_options.log_cli = true
|
|
89
|
+
ini_options.python_files = "test_*.py"
|
|
90
|
+
ini_options.testpaths = [ "tests" ]
|
|
91
|
+
ini_options.xfail_strict = true
|
|
92
|
+
ini_options.addopts = [
|
|
93
|
+
"--import-mode=importlib", # allow using test files with same name
|
|
94
|
+
"--cov=src/microarray",
|
|
95
|
+
"--cov-report=html:coverage_report",
|
|
96
|
+
"--cov-report=term",
|
|
97
|
+
]
|
|
98
|
+
|
|
99
|
+
[tool.coverage]
|
|
100
|
+
run.omit = [ "*/tests/*" ]
|
|
101
|
+
run.source = [ "src/microarray" ]
|
|
102
|
+
report.exclude_lines = [ "raise" ]
|
|
103
|
+
report.ignore_errors = true
|
|
104
|
+
html.directory = "coverage_report"
|
|
105
|
+
|
|
106
|
+
[tool.ty]
|
|
107
|
+
src.include = [ "src", "tests", "docs" ]
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import microarray.datasets as datasets
|
|
2
|
+
import microarray.io as io
|
|
3
|
+
import microarray.plotting as pl
|
|
4
|
+
import microarray.preprocessing as pp
|
|
5
|
+
import microarray.tools as tl
|
|
6
|
+
from microarray._version import __version__
|
|
7
|
+
|
|
8
|
+
# Names exported by `from microarray import *`: the short submodule aliases
# (io, pl, pp, tl), the datasets subpackage, and the package version string.
__all__ = [
    "__version__",
    "io",
    "pl",
    "pp",
    "tl",
    "datasets",
]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# TODO: download dataset utils for arrayexpress datasets, e.g. from https://www.ebi.ac.uk/arrayexpress/experiments/E-GEOD-2109/files/
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
from microarray.io._cdf import CdfFile, parse_cdf
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _get_cache_dir() -> str:
|
|
7
|
+
"""Returns the path to the cache directory for CDF files."""
|
|
8
|
+
cache_dir = os.path.join(os.getcwd(), ".cache", "cdf")
|
|
9
|
+
# os.makedirs(cache_dir, exist_ok=True)
|
|
10
|
+
assert os.path.isdir(cache_dir), f"Cache directory not found: {cache_dir}"
|
|
11
|
+
return cache_dir
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def hgu133a_cdf() -> CdfFile:
    """Returns the CDF file for the hgu133a microarray platform."""
    cdf_path = os.path.join(_get_cache_dir(), "GPL24120_HGU133A_Hs_ENTREZG.cdf.gz")

    # Parse the cached file when present; otherwise fail loudly.
    if os.path.isfile(cdf_path):
        return parse_cdf(cdf_path)

    # TODO: download from GEO if not found
    raise FileNotFoundError(f"CDF file not found: {cdf_path}")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def hgu133plus2_cdf() -> CdfFile:
    """Returns the CDF file for the hgu133plus2 microarray platform."""
    filename = "GPL22945_HGU133Plus2_Hs_ENTREZG.cdf.gz"
    cdf_path = os.path.join(_get_cache_dir(), filename)

    if not os.path.isfile(cdf_path):
        # TODO: download from GEO if not found
        raise FileNotFoundError(f"CDF file not found: {cdf_path}")

    return parse_cdf(cdf_path)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# TODO: Download utils from GEO datasets, e.g. from https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE2109
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import Any, BinaryIO, Literal
|
|
3
|
+
from urllib.parse import urlencode, urlunparse
|
|
4
|
+
|
|
5
|
+
import requests
|
|
6
|
+
|
|
7
|
+
# Default timeout (seconds, as interpreted by requests) injected into
# `requests.get` calls when the caller does not supply one via `request_kwargs`.
TIMEOUT = 1000
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def build_url(
|
|
11
|
+
hostname: str,
|
|
12
|
+
path: str,
|
|
13
|
+
scheme: Literal["http", "https", "ftp"] | str = "https",
|
|
14
|
+
query: dict[str, str | int | float] | None = None,
|
|
15
|
+
fragment: str | None = None,
|
|
16
|
+
) -> str:
|
|
17
|
+
"""Build url from parts.
|
|
18
|
+
|
|
19
|
+
Args:
|
|
20
|
+
hostname (str): Hostname of url (e.g. "example.com").
|
|
21
|
+
path (str): Path of url.
|
|
22
|
+
scheme (Literal["http", "https", "ftp"] | str, optional): Schema of url. Defaults to `https`.
|
|
23
|
+
query (dict[str, str | int | float] | None, optional): Query parameter. Defaults to `None`.
|
|
24
|
+
fragment (str | None, optional): Fragment of url. Defaults to `None`.
|
|
25
|
+
|
|
26
|
+
Returns:
|
|
27
|
+
str: Full url as string.
|
|
28
|
+
"""
|
|
29
|
+
query_encoded: str = ""
|
|
30
|
+
|
|
31
|
+
if query is not None:
|
|
32
|
+
query_encoded = urlencode(query)
|
|
33
|
+
|
|
34
|
+
url = urlunparse(
|
|
35
|
+
[
|
|
36
|
+
scheme,
|
|
37
|
+
hostname,
|
|
38
|
+
path,
|
|
39
|
+
"",
|
|
40
|
+
query_encoded,
|
|
41
|
+
"" if fragment is None else fragment,
|
|
42
|
+
]
|
|
43
|
+
)
|
|
44
|
+
return url
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _file_or_buffer(
|
|
48
|
+
output_file: str | BinaryIO,
|
|
49
|
+
overwrite: bool = False,
|
|
50
|
+
) -> BinaryIO:
|
|
51
|
+
"""Harmonize file name and buffer.
|
|
52
|
+
|
|
53
|
+
Args:
|
|
54
|
+
output_file (str | BinaryIO): Filename to save object to or file object.
|
|
55
|
+
overwrite (bool, optional): Flag to overwrite, if output file already exist. Defaults to `False`.
|
|
56
|
+
|
|
57
|
+
Returns:
|
|
58
|
+
Memory buffer or opened File buffer.
|
|
59
|
+
|
|
60
|
+
Raises:
|
|
61
|
+
FileExistsError: If `overwrite=False` and output file already exists.
|
|
62
|
+
"""
|
|
63
|
+
file_object: BinaryIO
|
|
64
|
+
|
|
65
|
+
if isinstance(output_file, str):
|
|
66
|
+
if overwrite is False and os.path.isfile(output_file):
|
|
67
|
+
raise FileExistsError(f"Output file '{output_file}' already exists.")
|
|
68
|
+
|
|
69
|
+
file_object = open(output_file, "wb")
|
|
70
|
+
else:
|
|
71
|
+
file_object = output_file
|
|
72
|
+
|
|
73
|
+
return file_object
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def download_file_stream(
    url: str,
    output_file: str | BinaryIO,
    overwrite: bool = False,
    chunk_size: int = 8192,
    request_kwargs: dict[str, Any] | None = None,
) -> None:
    """Download file as stream. Source: https://stackoverflow.com/questions/16694907/download-large-file-in-python-with-requests.

    Args:
        url (str): URL to download.
        output_file (str | BinaryIO): Filename to save object to or file object.
        overwrite (bool, optional): Flag to overwrite, if output file already exist. Defaults to `False`.
        chunk_size (int, optional): Size of downloaded chunk. Defaults to `8192`.
        request_kwargs (dict[str, Any] | None, optional): Extra arguments passed to `requests.get` function. Defaults to `None`.

    Raises:
        FileExistsError: If `overwrite=False` and output file already exists.
    """
    file_object = _file_or_buffer(
        output_file=output_file,
        overwrite=overwrite,
    )

    # Copy kwargs before injecting the timeout so the caller's dict is never
    # mutated as a side effect.
    request_kwargs = dict(request_kwargs) if request_kwargs else {}
    request_kwargs.setdefault("timeout", TIMEOUT)

    try:
        with requests.get(url, stream=True, **request_kwargs) as requests_content:
            requests_content.raise_for_status()
            for chunk in requests_content.iter_content(chunk_size=chunk_size):
                file_object.write(chunk)
    finally:
        # Close only handles that _file_or_buffer opened on our behalf (path
        # input); caller-supplied buffers stay open and remain caller-owned.
        if isinstance(output_file, str):
            file_object.close()
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def download_file(
    url: str,
    output_file: str | BinaryIO,
    overwrite: bool = False,
    request_kwargs: dict[str, Any] | None = None,
) -> None:
    """Download file.

    Args:
        url (str): URL to download.
        output_file (str | BinaryIO): Filename to save object to or file object.
        overwrite (bool, optional): Flag to overwrite, if output file already exist. Defaults to `False`.
        request_kwargs (dict[str, Any] | None, optional): Extra arguments passed to `requests.get` function. Defaults to `None`.

    Raises:
        FileExistsError: If `overwrite=False` and output file already exists.
    """
    file_object = _file_or_buffer(
        output_file=output_file,
        overwrite=overwrite,
    )

    # Copy kwargs before injecting the timeout so the caller's dict is never
    # mutated as a side effect.
    request_kwargs = dict(request_kwargs) if request_kwargs else {}
    request_kwargs.setdefault("timeout", TIMEOUT)

    try:
        response = requests.get(url, **request_kwargs)
        response.raise_for_status()
        file_object.write(response.content)
    finally:
        # Close only handles that _file_or_buffer opened on our behalf (path
        # input); caller-supplied buffers stay open and remain caller-owned.
        if isinstance(output_file, str):
            file_object.close()
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from microarray.io._anndata_converter import (
|
|
2
|
+
cel_to_anndata,
|
|
3
|
+
)
|
|
4
|
+
from microarray.io._cdf import CdfFile, parse_cdf
|
|
5
|
+
from microarray.io._cel import CelFile, apply_probe_annotation, parse_cel
|
|
6
|
+
from microarray.io._read import read_cel, read_cel_batch
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"apply_probe_annotation",
|
|
10
|
+
"parse_cel",
|
|
11
|
+
"parse_cdf",
|
|
12
|
+
"read_cel",
|
|
13
|
+
"read_cel_batch",
|
|
14
|
+
"CelFile",
|
|
15
|
+
"CdfFile",
|
|
16
|
+
"cel_to_anndata",
|
|
17
|
+
]
|
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
"""AnnData converter for microarray CEL/CDF data."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from collections import defaultdict
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
import pandas as pd
|
|
8
|
+
from anndata import AnnData
|
|
9
|
+
|
|
10
|
+
from microarray.io._cdf import CdfFile, parse_cdf
|
|
11
|
+
from microarray.io._cel import CelFile, parse_cel
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def cel_to_anndata(
    cel_path: str | CelFile,
    cdf_path: str | CdfFile,
    sample_name: str | None = None,
) -> AnnData:
    """Convert a single CEL file to a probe-level AnnData object.

    Creates an AnnData with shape (1, n_cells) where each variable represents
    an individual CDF cell (probe):

    - ``.X`` contains per-cell intensities.
    - ``.var`` contains ``probeset_id``, ``probe_index`` (0-based rank within
      probeset by ascending ``expos``, so 0 = 3' end), ``probe_type``
      (``"pm"``, ``"mm"``, or ``"unknown"``), ``gene_id`` (gene identifier
      extracted from probeset name), and ``suffix`` (Affymetrix probe type
      suffix such as ``"_at"`` for antisense target).
    - ``.obs`` contains sample metadata from the CEL header.
    - ``.layers`` contains per-cell QC arrays: ``stdevs``, ``npixels``,
      ``masks``, ``outliers``, ``modified``.
    - ``.uns`` contains CDF chip info and CEL file metadata.


    Parameters
    ----------
    cel_path : str | CelFile
        Path to the CEL file or a parsed :class:`~microarray.io.CelFile`
        instance.
    cdf_path : str | CdfFile
        Path to the CDF file or a parsed :class:`~microarray.io.CdfFile`
        instance.
    sample_name : str | None
        Name used as the observation index. Defaults to the CEL file's
        basename, or ``"unknown"`` when a :class:`~microarray.io.CelFile`
        instance is supplied without a path.

    Returns
    -------
    AnnData
        Probe-level AnnData with shape ``(1, n_cells)``.
    """
    # Parse input files if they are paths; pre-parsed objects are used as-is.
    # The original path is remembered only to derive a default sample name.
    if isinstance(cel_path, str):
        cel_file_path = cel_path
        cel_file = parse_cel(cel_path)
    else:
        cel_file_path = None
        cel_file = cel_path

    if isinstance(cdf_path, str):
        cdf_file = parse_cdf(cdf_path)
    else:
        cdf_file = cdf_path

    if sample_name is None:
        sample_name = os.path.basename(cel_file_path) if cel_file_path else "unknown"

    # One observation (= one sample); header fields from the CEL file become
    # per-sample metadata columns.
    obs = pd.DataFrame(
        {
            "sample_name": [sample_name],
            "cel_version": [cel_file.version],
            "nrows": [cel_file.nrows],
            "ncols": [cel_file.ncols],
            "algorithm": [cel_file.algorithm],
        },
        index=[sample_name],
    )

    # Unstructured metadata: geometry/scan details from the CEL header and
    # chip-level info from the CDF file.
    uns = {
        "cel_metadata": {
            "total_x": cel_file.total_x,
            "total_y": cel_file.total_y,
            "offset_x": cel_file.offset_x,
            "offset_y": cel_file.offset_y,
            "grid_corner_ul": cel_file.grid_corner_ul,
            "grid_corner_ur": cel_file.grid_corner_ur,
            "grid_corner_ll": cel_file.grid_corner_ll,
            "grid_corner_lr": cel_file.grid_corner_lr,
            "axis_invert_x": cel_file.axis_invert_x,
            "axis_invert_y": cel_file.axis_invert_y,
            "swap_xy": cel_file.swap_xy,
            "algorithm_parameters": cel_file.algorithm_parameters,
            "dat_header": cel_file.dat_header,
        },
        "cdf_metadata": {
            "chip_name": cdf_file.chip_info.name,
            "chip_rows": cdf_file.chip_info.rows,
            "chip_cols": cdf_file.chip_info.cols,
            "number_of_units": cdf_file.chip_info.number_of_units,
            "max_unit": cdf_file.chip_info.max_unit,
            "num_qc_units": cdf_file.chip_info.num_qc_units,
        },
    }

    # Collect cells per probeset to compute ordinal ranks.
    # Structure: {probeset_id: [(expos, x, y, is_pm), ...]}
    ps_cells: dict[str, list[tuple]] = defaultdict(list)

    for unit in cdf_file.units:
        for block in unit.blocks:
            # Prefer the block name; fall back to the unit name when the block
            # name is empty or the literal placeholder "NONE".
            probeset_id = block.name if block.name and block.name != "NONE" else unit.name
            for cell in block.cells:
                # Missing expos values sort as 0 (i.e. treated as the 3' end).
                expos_val = cell.expos if cell.expos is not None else 0
                ps_cells[probeset_id].append((expos_val, cell.x, cell.y, cell.is_pm))

    # Sort by expos within each probeset and assign ordinal rank.
    # Smallest expos = 3' end = rank 0.
    probeset_ids: list[str] = []
    probe_indices: list[int] = []
    probe_types: list[str] = []
    probe_var_ids: list[str] = []
    intensities_list: list[float] = []
    stdevs_list: list[float] = []
    npixels_list: list[float] = []
    masks_list: list[float] = []
    outliers_list: list[float] = []
    modified_list: list[float] = []

    for probeset_id, cells in ps_cells.items():
        cells_sorted = sorted(cells, key=lambda c: c[0])
        for ordinal_rank, (_, cx, cy, cell_is_pm) in enumerate(cells_sorted):
            # CEL matrices are indexed [y, x]; any matrix absent from the CEL
            # file yields NaN for every probe rather than failing.
            intensity = float(cel_file.intensities[cy, cx]) if cel_file.intensities is not None else float("nan")
            stdev = float(cel_file.stdevs[cy, cx]) if cel_file.stdevs is not None else float("nan")
            npixels = float(cel_file.npixels[cy, cx]) if cel_file.npixels is not None else float("nan")
            masks = float(cel_file.masks[cy, cx]) if cel_file.masks is not None else float("nan")
            outliers = float(cel_file.outliers[cy, cx]) if cel_file.outliers is not None else float("nan")
            modified = float(cel_file.modified[cy, cx]) if cel_file.modified is not None else float("nan")

            # is_pm is a tri-state flag: True = perfect match, False = mismatch,
            # None (or anything else) = unknown.
            if cell_is_pm is True:
                probe_type = "pm"
            elif cell_is_pm is False:
                probe_type = "mm"
            else:
                probe_type = "unknown"

            # Variable id combines probeset and chip coordinates so every
            # physical cell gets a unique index entry.
            var_id = f"{probeset_id}:{cx}:{cy}"
            probeset_ids.append(probeset_id)
            probe_indices.append(ordinal_rank)
            probe_types.append(probe_type)
            probe_var_ids.append(var_id)
            intensities_list.append(intensity)
            stdevs_list.append(stdev)
            npixels_list.append(npixels)
            masks_list.append(masks)
            outliers_list.append(outliers)
            modified_list.append(modified)

    n_cells = len(probeset_ids)
    X = np.array(intensities_list, dtype=np.float32).reshape(1, n_cells)

    var = pd.DataFrame(
        {
            "probeset_id": probeset_ids,
            "probe_index": probe_indices,
            "probe_type": probe_types,
        },
        index=probe_var_ids,
    )

    # Merge probeset_info (gene_id and suffix) into var
    if cdf_file.probeset_info is not None:
        # Map probeset_id to gene_id and suffix by looking up in probeset_info
        gene_ids = []
        suffixes = []
        for ps_id in probeset_ids:
            if ps_id in cdf_file.probeset_info.index:
                info = cdf_file.probeset_info.loc[ps_id]
                gene_ids.append(info["gene_id"])
                suffixes.append(info["suffix"])
            else:
                # Fallback if probeset not in probeset_info
                gene_ids.append(ps_id)
                suffixes.append("")

        var["gene_id"] = gene_ids
        var["suffix"] = suffixes

    # QC matrices share the (1, n_cells) layout of X so they align per probe.
    layers = {
        "stdevs": np.array(stdevs_list, dtype=np.float32).reshape(1, n_cells),
        "npixels": np.array(npixels_list, dtype=np.float32).reshape(1, n_cells),
        "masks": np.array(masks_list, dtype=np.float32).reshape(1, n_cells),
        "outliers": np.array(outliers_list, dtype=np.float32).reshape(1, n_cells),
        "modified": np.array(modified_list, dtype=np.float32).reshape(1, n_cells),
    }

    return AnnData(X=X, obs=obs, var=var, layers=layers, uns=uns)
|