microarray 0.1.0__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {microarray-0.1.0 → microarray-0.2.0}/PKG-INFO +3 -2
- {microarray-0.1.0 → microarray-0.2.0}/pyproject.toml +5 -3
- microarray-0.2.0/src/microarray/datasets/__init__.py +35 -0
- microarray-0.2.0/src/microarray/datasets/_arrayexpress.py +587 -0
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/datasets/_cdf_files.py +21 -5
- microarray-0.2.0/src/microarray/datasets/_geo.py +263 -0
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/io/_cel.py +195 -7
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/io/_read.py +26 -8
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/plotting/__init__.py +2 -0
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/plotting/_cel.py +11 -1
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/plotting/_de_plots.py +49 -29
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/plotting/_qc_plots.py +6 -4
- microarray-0.2.0/src/microarray/plotting/_top_table_venn.py +188 -0
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/tools/_empirical_bayes.py +1 -1
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/tools/_linear_models.py +43 -63
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/tools/_toptable.py +28 -31
- microarray-0.1.0/src/microarray/datasets/__init__.py +0 -3
- microarray-0.1.0/src/microarray/datasets/_arrayexpress.py +0 -1
- microarray-0.1.0/src/microarray/datasets/_geo.py +0 -1
- {microarray-0.1.0 → microarray-0.2.0}/README.md +0 -0
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/__init__.py +0 -0
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/_version.py +0 -0
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/datasets/_utils.py +0 -0
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/io/__init__.py +0 -0
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/io/_anndata_converter.py +0 -0
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/io/_cdf.py +0 -0
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/plotting/_base.py +0 -0
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/plotting/_diagnostic_plots.py +0 -0
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/plotting/_heatmap.py +0 -0
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/plotting/_ma_plots.py +0 -0
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/plotting/_pca.py +0 -0
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/plotting/_score.py +0 -0
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/plotting/_top_table_heatmap.py +0 -0
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/plotting/_utils.py +0 -0
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/preprocessing/__init__.py +0 -0
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/preprocessing/_background.py +0 -0
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/preprocessing/_log2.py +0 -0
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/preprocessing/_normalize.py +0 -0
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/preprocessing/_rma.py +0 -0
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/preprocessing/_robust.py +0 -0
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/preprocessing/_summarize.py +0 -0
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/py.typed +0 -0
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/tools/__init__.py +0 -0
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/tools/_biomart.py +0 -0
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/tools/_fdist.py +0 -0
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/tools/_mds.py +0 -0
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/tools/_pca.py +0 -0
- {microarray-0.1.0 → microarray-0.2.0}/src/microarray/tools/_score.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: microarray
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: Microarray analysis tools
|
|
5
5
|
Author: harryhaller001
|
|
6
6
|
Author-email: harryhaller001 <harryhaller001@gmail.com>
|
|
@@ -17,12 +17,13 @@ Classifier: Programming Language :: Python :: 3.14
|
|
|
17
17
|
Classifier: Typing :: Typed
|
|
18
18
|
Requires-Dist: adjusttext>=1.3
|
|
19
19
|
Requires-Dist: anndata
|
|
20
|
-
Requires-Dist:
|
|
20
|
+
Requires-Dist: geofetch>=0.12.10
|
|
21
21
|
Requires-Dist: matplotlib
|
|
22
22
|
Requires-Dist: requests
|
|
23
23
|
Requires-Dist: scikit-learn
|
|
24
24
|
Requires-Dist: scipy
|
|
25
25
|
Requires-Dist: statsmodels
|
|
26
|
+
Requires-Dist: tqdm>=4.67.3
|
|
26
27
|
Requires-Dist: ipython ; extra == 'docs'
|
|
27
28
|
Requires-Dist: myst-parser ; extra == 'docs'
|
|
28
29
|
Requires-Dist: nbsphinx ; extra == 'docs'
|
|
@@ -4,7 +4,7 @@ requires = [ "uv-build>=0.9.28,<0.10" ]
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "microarray"
|
|
7
|
-
version = "0.1.0"
|
|
7
|
+
version = "0.2.0"
|
|
8
8
|
description = "Microarray analysis tools"
|
|
9
9
|
readme = { file = "README.md", content-type = "text/markdown" }
|
|
10
10
|
license = "MIT"
|
|
@@ -26,12 +26,13 @@ classifiers = [
|
|
|
26
26
|
dependencies = [
|
|
27
27
|
"adjusttext>=1.3",
|
|
28
28
|
"anndata",
|
|
29
|
-
"
|
|
29
|
+
"geofetch>=0.12.10",
|
|
30
30
|
"matplotlib",
|
|
31
31
|
"requests",
|
|
32
32
|
"scikit-learn",
|
|
33
33
|
"scipy",
|
|
34
34
|
"statsmodels",
|
|
35
|
+
"tqdm>=4.67.3",
|
|
35
36
|
]
|
|
36
37
|
optional-dependencies.docs = [
|
|
37
38
|
"ipython", # Required for syntax highlighing (https://github.com/spatialaudio/nbsphinx/issues/24)
|
|
@@ -104,4 +105,5 @@ report.ignore_errors = true
|
|
|
104
105
|
html.directory = "coverage_report"
|
|
105
106
|
|
|
106
107
|
[tool.ty]
|
|
107
|
-
src.include = [ "src", "
|
|
108
|
+
src.include = [ "src", "docs" ]
|
|
109
|
+
src.exclude = [ "tests", "examples" ]
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
from microarray.datasets._arrayexpress import (
|
|
2
|
+
ArrayExpressError,
|
|
3
|
+
ArrayExpressMetadata,
|
|
4
|
+
ArrayExpressRemoteFile,
|
|
5
|
+
ArrayExpressStudyFiles,
|
|
6
|
+
get_arrayexpress,
|
|
7
|
+
parse_adf,
|
|
8
|
+
parse_arrayexpress_metadata,
|
|
9
|
+
parse_idf,
|
|
10
|
+
parse_sdrf,
|
|
11
|
+
query_arrayexpress,
|
|
12
|
+
read_arrayexpress,
|
|
13
|
+
read_arrayexpress_metadata,
|
|
14
|
+
)
|
|
15
|
+
from microarray.datasets._cdf_files import hgu133a_cdf, hgu133plus2_cdf
|
|
16
|
+
from microarray.datasets._geo import read_geo, read_geo_metadata
|
|
17
|
+
|
|
18
|
+
__all__ = [
|
|
19
|
+
"ArrayExpressError",
|
|
20
|
+
"ArrayExpressMetadata",
|
|
21
|
+
"ArrayExpressRemoteFile",
|
|
22
|
+
"ArrayExpressStudyFiles",
|
|
23
|
+
"get_arrayexpress",
|
|
24
|
+
"hgu133a_cdf",
|
|
25
|
+
"hgu133plus2_cdf",
|
|
26
|
+
"read_arrayexpress",
|
|
27
|
+
"parse_adf",
|
|
28
|
+
"parse_arrayexpress_metadata",
|
|
29
|
+
"parse_idf",
|
|
30
|
+
"parse_sdrf",
|
|
31
|
+
"query_arrayexpress",
|
|
32
|
+
"read_arrayexpress_metadata",
|
|
33
|
+
"read_geo",
|
|
34
|
+
"read_geo_metadata",
|
|
35
|
+
]
|
|
@@ -0,0 +1,587 @@
|
|
|
1
|
+
"""Utilities for fetching ArrayExpress studies via BioStudies.
|
|
2
|
+
|
|
3
|
+
This module covers two tasks:
|
|
4
|
+
1. Discover studies through the ArrayExpress search endpoint.
|
|
5
|
+
2. Download study files (raw/processed/MAGE-TAB) and parse MAGE-TAB metadata.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import io
|
|
11
|
+
import os
|
|
12
|
+
import re
|
|
13
|
+
import zipfile
|
|
14
|
+
from collections.abc import Callable
|
|
15
|
+
from dataclasses import dataclass
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any, Literal
|
|
18
|
+
|
|
19
|
+
import pandas as pd
|
|
20
|
+
import requests
|
|
21
|
+
from anndata import AnnData, concat
|
|
22
|
+
|
|
23
|
+
from microarray.datasets._utils import TIMEOUT, build_url, download_file_stream
|
|
24
|
+
from microarray.io import CdfFile, read_cel
|
|
25
|
+
|
|
26
|
+
_BIOSTUDIES_HOST = "www.ebi.ac.uk"
|
|
27
|
+
_STUDIES_PATH = "/biostudies/api/v1/studies"
|
|
28
|
+
_SEARCH_PATH = "/biostudies/api/v1/arrayexpress/search"
|
|
29
|
+
|
|
30
|
+
_AE_ACCESSION_PATTERN = re.compile(r"^E-[A-Z0-9]+-[A-Z0-9]+$", re.IGNORECASE)
|
|
31
|
+
|
|
32
|
+
ArrayExpressType = Literal["raw", "processed", "mage", "full"]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class ArrayExpressError(RuntimeError):
    """Signal that an ArrayExpress/BioStudies request did not succeed.

    Derives from ``RuntimeError``, so handlers written for ``RuntimeError``
    also catch this exception.
    """
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass(frozen=True)
class ArrayExpressRemoteFile:
    """Describe a single file that a study offers for download."""

    # Study section the file was listed under.
    category: Literal["Raw Data", "Processed Data", "MAGE-TAB Files", "Array Designs"]
    # File path as listed for the study; may contain sub-folders.
    path: str
    # Address the file is fetched from.
    url: str

    @property
    def name(self) -> str:
        """Final component of ``path`` (the bare file name)."""
        _, tail = os.path.split(self.path)
        return tail
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclass(frozen=True)
class ArrayExpressStudyFiles:
    """Manifest of what was downloaded (and unpacked) for one accession."""

    # Normalized study accession.
    accession: str
    # Local folder the files were written to.
    destination: str
    # Download mode that was requested.
    data_type: ArrayExpressType
    # Remote file entries that were selected for download.
    files: tuple[ArrayExpressRemoteFile, ...]
    # Local paths of downloaded "Raw Data" files.
    raw_archives: tuple[str, ...]
    # Local paths of downloaded "Processed Data" files.
    processed_archives: tuple[str, ...]
    # Local paths of downloaded "MAGE-TAB Files" entries.
    mage_tab_files: tuple[str, ...]
    # Local paths of downloaded "Array Designs" (ADF) files.
    adf_files: tuple[str, ...]
    # Paths unpacked from raw-data zip archives.
    raw_files: tuple[str, ...]
    # Paths unpacked from processed-data zip archives.
    processed_files: tuple[str, ...]
    # First MAGE-TAB path ending in "idf.txt", if any.
    idf: str | None
    # First MAGE-TAB path ending in "sdrf.txt", if any.
    sdrf: str | None
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@dataclass(frozen=True)
class ArrayExpressMetadata:
    """Bundle of parsed MAGE-TAB tables belonging to one downloaded study."""

    # IDF keys mapped to their list of values.
    idf_map: dict[str, list[str]]
    # Long-form table of the IDF key/value pairs.
    idf_table: pd.DataFrame
    # Parsed SDRF table, or None when no SDRF file was available.
    sdrf: pd.DataFrame | None
    # Parsed ADF tables keyed by the ADF file path they were read from.
    adf_tables: dict[str, pd.DataFrame]
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _normalize_accession(accession: str) -> str:
    """Return ``accession`` trimmed and upper-cased once its shape is validated.

    Args:
        accession: Accession as supplied by the caller.

    Returns:
        The cleaned accession.

    Raises:
        ValueError: If the cleaned value does not match the accession pattern.
    """
    candidate = str(accession).strip().upper()
    if _AE_ACCESSION_PATTERN.match(candidate):
        return candidate
    # The message echoes the caller's input, not the cleaned form.
    raise ValueError(f"Invalid ArrayExpress accession '{accession}'. Expected format like 'E-MEXP-21'.")
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _normalize_type(value: ArrayExpressType | str) -> ArrayExpressType:
|
|
89
|
+
normalized = str(value).strip().lower()
|
|
90
|
+
options: tuple[ArrayExpressType, ...] = ("raw", "processed", "mage", "full")
|
|
91
|
+
if normalized not in options:
|
|
92
|
+
raise ValueError(f"Invalid type '{value}'. Expected one of {options}.")
|
|
93
|
+
return normalized # type: ignore[return-value]
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _request_json(url: str) -> dict[str, Any]:
    """GET ``url`` and return its JSON body, which must be a JSON object.

    Args:
        url: Fully built request URL.

    Returns:
        The decoded JSON object.

    Raises:
        ArrayExpressError: If the HTTP status is not 200 or the decoded
            payload is not a dict.
    """
    reply = requests.get(url, timeout=TIMEOUT)
    status = reply.status_code
    if status != 200:
        raise ArrayExpressError(f"Error running query. Received HTTP {status} from BioStudies for URL '{url}'.")

    decoded = reply.json()
    if isinstance(decoded, dict):
        return decoded
    raise ArrayExpressError(f"Unexpected JSON payload type for URL '{url}'.")
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def query_arrayexpress(
    keywords: str | None = None,
    page: int = 1,
    all_pages: bool = False,
    page_size: int = 100,
) -> pd.DataFrame:
    """Run a study search against the BioStudies ArrayExpress search endpoint.

    Args:
        keywords: Free-text search query; ``None`` is sent as an empty query.
        page: 1-based index of the first page to request.
        all_pages: When True, continue with following pages until a page has no hits.
        page_size: Number of rows requested per page.

    Returns:
        DataFrame built from the collected hits (a single page unless
        ``all_pages`` is True).

    Raises:
        ValueError: If ``page`` is smaller than 1.
    """
    if page < 1:
        raise ValueError("'page' must be >= 1.")

    search_text = "" if keywords is None else keywords
    rows_per_page = int(page_size)
    collected: list[dict[str, Any]] = []
    page_number = int(page)

    while True:
        params = {"query": search_text, "pageSize": rows_per_page, "page": page_number}
        url = build_url(hostname=_BIOSTUDIES_HOST, path=_SEARCH_PATH, query=params)
        hits = _request_json(url).get("hits", [])

        # A missing, malformed or empty hit list ends the paging.
        if not isinstance(hits, list) or not hits:
            break

        for hit in hits:
            if isinstance(hit, dict):
                collected.append(hit)

        if not all_pages:
            break
        page_number += 1

    return pd.DataFrame(collected)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _iter_section_nodes(nodes: list[Any]) -> list[dict[str, Any]]:
|
|
157
|
+
flattened: list[dict[str, Any]] = []
|
|
158
|
+
for node in nodes:
|
|
159
|
+
if isinstance(node, dict):
|
|
160
|
+
flattened.append(node)
|
|
161
|
+
elif isinstance(node, list):
|
|
162
|
+
flattened.extend(_iter_section_nodes(node))
|
|
163
|
+
return flattened
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _iter_assay_sections(study_payload: dict[str, Any]) -> list[dict[str, Any]]:
|
|
167
|
+
section = study_payload.get("section", {})
|
|
168
|
+
if not isinstance(section, dict):
|
|
169
|
+
return []
|
|
170
|
+
|
|
171
|
+
subsections = section.get("subsections", [])
|
|
172
|
+
if not isinstance(subsections, list):
|
|
173
|
+
return []
|
|
174
|
+
|
|
175
|
+
for node in _iter_section_nodes(subsections):
|
|
176
|
+
if node.get("type") == "Assays and Data":
|
|
177
|
+
assays = node.get("subsections", [])
|
|
178
|
+
if isinstance(assays, list):
|
|
179
|
+
return _iter_section_nodes(assays)
|
|
180
|
+
return []
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _iter_links(payload_section: dict[str, Any]) -> list[dict[str, Any]]:
|
|
184
|
+
links = payload_section.get("links", [])
|
|
185
|
+
if not isinstance(links, list):
|
|
186
|
+
return []
|
|
187
|
+
return _iter_section_nodes(links)
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def _iter_array_design_accessions(study_payload: dict[str, Any]) -> list[str]:
    """Collect the array design accessions linked from a study payload.

    Args:
        study_payload: Decoded study JSON object.

    Returns:
        Trimmed, non-blank ``url`` values of all links found in
        "Array Designs" sections, in payload order.
    """
    found: list[str] = []
    for section in _iter_assay_sections(study_payload):
        if section.get("type") == "Array Designs":
            for link in _iter_links(section):
                target = link.get("url")
                # Only non-blank strings count as accessions.
                if isinstance(target, str) and target.strip():
                    found.append(target.strip())
    return found
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def _resolve_arrayexpress_cdf(accession: str, cdf_file: str | CdfFile | None) -> str | CdfFile:
|
|
203
|
+
if cdf_file is not None:
|
|
204
|
+
return cdf_file
|
|
205
|
+
|
|
206
|
+
from microarray.datasets._cdf_files import hgu133a_cdf, hgu133plus2_cdf
|
|
207
|
+
|
|
208
|
+
cdf_loaders: dict[str, Callable[[], CdfFile]] = {
|
|
209
|
+
"A-AFFY-2": hgu133a_cdf,
|
|
210
|
+
"A-AFFY-44": hgu133plus2_cdf,
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
study_url = build_url(hostname=_BIOSTUDIES_HOST, path=f"{_STUDIES_PATH}/{accession}")
|
|
214
|
+
study_payload = _request_json(study_url)
|
|
215
|
+
designs = _iter_array_design_accessions(study_payload)
|
|
216
|
+
|
|
217
|
+
for design in designs:
|
|
218
|
+
loader = cdf_loaders.get(design)
|
|
219
|
+
if loader is not None:
|
|
220
|
+
return loader()
|
|
221
|
+
|
|
222
|
+
raise ValueError(
|
|
223
|
+
f"Unable to infer CDF for accession '{accession}'. Please pass 'cdf_file' explicitly to read_arrayexpress()."
|
|
224
|
+
)
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def _normalize_file_key(value: object) -> str | None:
|
|
228
|
+
if value is None:
|
|
229
|
+
return None
|
|
230
|
+
|
|
231
|
+
raw = str(value).strip()
|
|
232
|
+
if raw == "" or raw.lower() == "nan":
|
|
233
|
+
return None
|
|
234
|
+
|
|
235
|
+
base = os.path.basename(raw)
|
|
236
|
+
lower = base.lower()
|
|
237
|
+
if lower.endswith(".gz"):
|
|
238
|
+
lower = lower[:-3]
|
|
239
|
+
if lower.endswith(".cel"):
|
|
240
|
+
lower = lower[:-4]
|
|
241
|
+
|
|
242
|
+
return lower
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def _collect_cel_paths_recursive(folder: Path) -> list[Path]:
|
|
246
|
+
cel_paths = [
|
|
247
|
+
path for path in folder.rglob("*") if path.is_file() and path.name.lower().endswith((".cel", ".cel.gz"))
|
|
248
|
+
]
|
|
249
|
+
return sorted(cel_paths)
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def _collect_files(
    study_payload: dict[str, Any],
    files_base_url: str,
    data_type: ArrayExpressType,
) -> list[ArrayExpressRemoteFile]:
    """Build the list of remote files to download for one study.

    Args:
        study_payload: Decoded study JSON object.
        files_base_url: Base URL below which the study's ``Files/`` folder lives.
        data_type: Normalized download mode.

    Returns:
        Remote file entries. For ``"full"`` every collected entry is returned
        (including ADF files derived from array design links); otherwise only
        the entries of the category matching ``data_type``.
    """
    downloadable = {"Raw Data", "Processed Data", "MAGE-TAB Files", "Array Designs"}
    base = files_base_url.rstrip("/")
    selected: list[ArrayExpressRemoteFile] = []

    for assay in _iter_assay_sections(study_payload):
        assay_type = assay.get("type")
        if assay_type not in downloadable:
            continue

        files = assay.get("files", [])
        if isinstance(files, list):
            for file_item in _iter_section_nodes(files):
                rel_path = file_item.get("path")
                if not isinstance(rel_path, str) or rel_path.strip() == "":
                    continue
                url = f"{base}/Files/{rel_path.lstrip('/')}"
                selected.append(ArrayExpressRemoteFile(category=assay_type, path=rel_path, url=url))

        if assay_type == "Array Designs":
            for link in _iter_links(assay):
                accession = link.get("url")
                if not isinstance(accession, str):
                    continue
                # Use the trimmed accession for the file name and URL, the same
                # way _iter_array_design_accessions reports it; stray
                # whitespace would otherwise end up in both.
                accession = accession.strip()
                if accession == "":
                    continue
                adf_name = f"{accession}.adf.txt"
                adf_url = build_url(
                    hostname=_BIOSTUDIES_HOST,
                    path=f"/biostudies/files/{accession}/{adf_name}",
                )
                selected.append(ArrayExpressRemoteFile(category="Array Designs", path=adf_name, url=adf_url))

    if data_type == "full":
        return selected

    category_map = {
        "raw": "Raw Data",
        "processed": "Processed Data",
        "mage": "MAGE-TAB Files",
    }
    target = category_map[data_type]
    return [entry for entry in selected if entry.category == target]
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def _extract_zip_file(archive_path: Path) -> list[str]:
|
|
299
|
+
extracted_paths: list[str] = []
|
|
300
|
+
with zipfile.ZipFile(archive_path) as archive:
|
|
301
|
+
for member in archive.infolist():
|
|
302
|
+
if member.is_dir():
|
|
303
|
+
continue
|
|
304
|
+
archive.extract(member, path=archive_path.parent)
|
|
305
|
+
extracted_paths.append(str(archive_path.parent / member.filename))
|
|
306
|
+
return extracted_paths
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def get_arrayexpress(
    accession: str,
    destination: str,
    data_type: ArrayExpressType = "full",
    extract: bool = True,
    overwrite: bool = False,
) -> ArrayExpressStudyFiles:
    """Download one ArrayExpress study and return a file manifest.

    Args:
        accession: ArrayExpress accession (for example ``"E-MEXP-21"``).
        destination: Local destination folder; created when missing.
        data_type: Download mode: ``"raw"``, ``"processed"``, ``"mage"``, or ``"full"``.
        extract: Whether zip archives should be unpacked after download.
        overwrite: Overwrite existing local files when True.

    Returns:
        A manifest describing downloaded files and extracted contents.

    Raises:
        ValueError: If ``accession`` or ``data_type`` is invalid.
        ArrayExpressError: If a BioStudies request fails, the info payload has
            neither ``httpLink`` nor ``ftpLink``, or a remote file path would
            be stored outside ``destination``.
    """
    accession = _normalize_accession(accession)
    data_type = _normalize_type(data_type)
    destination_path = Path(destination).expanduser().resolve()
    destination_path.mkdir(parents=True, exist_ok=True)

    study_url = build_url(hostname=_BIOSTUDIES_HOST, path=f"{_STUDIES_PATH}/{accession}")
    info_url = build_url(hostname=_BIOSTUDIES_HOST, path=f"{_STUDIES_PATH}/{accession}/info")
    study_payload = _request_json(study_url)
    info_payload = _request_json(info_url)

    # Prefer the HTTP link; fall back to the FTP link when it is missing or blank.
    files_base_url = info_payload.get("httpLink")
    if not isinstance(files_base_url, str) or files_base_url.strip() == "":
        files_base_url = info_payload.get("ftpLink")

    if not isinstance(files_base_url, str) or files_base_url.strip() == "":
        raise ArrayExpressError(f"Missing httpLink/ftpLink in BioStudies info payload for accession '{accession}'.")

    if files_base_url.startswith("ftp://"):
        # Only the scheme is swapped; host and path of the FTP link are reused.
        files_base_url = f"https://{files_base_url[len('ftp://') :]}"

    remote_files = _collect_files(
        study_payload=study_payload,
        files_base_url=files_base_url,
        data_type=data_type,
    )

    raw_archives: list[str] = []
    processed_archives: list[str] = []
    mage_tab_files: list[str] = []
    adf_files: list[str] = []
    raw_files: list[str] = []
    processed_files: list[str] = []

    downloaded_by_category = {
        "Raw Data": raw_archives,
        "Processed Data": processed_archives,
        "MAGE-TAB Files": mage_tab_files,
        "Array Designs": adf_files,
    }
    extracted_by_category = {
        "Raw Data": raw_files,
        "Processed Data": processed_files,
    }

    for remote_file in remote_files:
        # The remote path comes from the server response. Treat it as relative
        # (the download URL is built the same way) and collapse ".." parts so a
        # leading slash or parent reference cannot place the file outside the
        # destination folder.
        relative = remote_file.path.lstrip("/")
        local_path = Path(os.path.normpath(destination_path / relative))
        if destination_path not in local_path.parents:
            raise ArrayExpressError(
                f"Refusing to store remote file '{remote_file.path}' outside destination '{destination_path}'."
            )

        local_path.parent.mkdir(parents=True, exist_ok=True)
        download_file_stream(url=remote_file.url, output_file=str(local_path), overwrite=overwrite)

        downloaded = downloaded_by_category.get(remote_file.category)
        if downloaded is not None:
            downloaded.append(str(local_path))

        if extract and local_path.suffix.lower() == ".zip":
            extracted = _extract_zip_file(local_path)
            # Only raw and processed archives have their contents recorded.
            unpacked = extracted_by_category.get(remote_file.category)
            if unpacked is not None:
                unpacked.extend(extracted)

    sdrf = next((x for x in mage_tab_files if x.lower().endswith("sdrf.txt")), None)
    idf = next((x for x in mage_tab_files if x.lower().endswith("idf.txt")), None)

    return ArrayExpressStudyFiles(
        accession=accession,
        destination=str(destination_path),
        data_type=data_type,
        files=tuple(remote_files),
        raw_archives=tuple(raw_archives),
        processed_archives=tuple(processed_archives),
        mage_tab_files=tuple(mage_tab_files),
        adf_files=tuple(adf_files),
        raw_files=tuple(raw_files),
        processed_files=tuple(processed_files),
        idf=idf,
        sdrf=sdrf,
    )
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
def parse_idf(idf_file: str) -> tuple[dict[str, list[str]], pd.DataFrame]:
    """Parse an IDF file and return key/value structures.

    Each non-blank line is split on tabs: the first field (trimmed) is the
    key, the remaining non-empty fields are its values.

    Args:
        idf_file: Path to ``*.idf.txt`` file.

    Returns:
        Tuple of ``(idf_map, idf_table)`` where ``idf_map`` stores key to list of values
        (a repeated key keeps the values of its last line), and ``idf_table`` is a
        long-form dataframe with columns ``key``, ``value_index``, and ``value``.
        The columns are present even when the file yields no values.
    """
    idf_map: dict[str, list[str]] = {}
    rows: list[dict[str, Any]] = []

    with open(idf_file, encoding="utf-8") as handle:
        for line in handle:
            stripped = line.rstrip("\n")
            if stripped.strip() == "":
                continue
            key, *fields = stripped.split("\t")
            key = key.strip()
            values = [value for value in fields if value != ""]
            idf_map[key] = values
            rows.extend({"key": key, "value_index": idx, "value": value} for idx, value in enumerate(values))

    # Name the columns explicitly so an empty result still has the documented
    # schema instead of a column-less frame.
    return idf_map, pd.DataFrame(rows, columns=["key", "value_index", "value"])
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
def parse_sdrf(sdrf_file: str, deduplicate_array_data_file: bool = True) -> pd.DataFrame:
    """Load a tab-separated SDRF file as an all-string DataFrame.

    Args:
        sdrf_file: Path to ``*.sdrf.txt`` file.
        deduplicate_array_data_file: When True and an "Array Data File" column
            exists, keep only the first row per value of that column.

    Returns:
        DataFrame with the SDRF contents and a fresh 0-based index.
    """
    # Everything is read as text; default NA markers (for example "NA") stay literal.
    table = pd.read_csv(sdrf_file, sep="\t", dtype=str, keep_default_na=False)

    has_file_column = "Array Data File" in table.columns
    if deduplicate_array_data_file and has_file_column:
        table = table.drop_duplicates(subset=["Array Data File"], keep="first")

    table = table.reset_index(drop=True)
    return table
|
|
444
|
+
|
|
445
|
+
|
|
446
|
+
def _find_adf_header_line(adf_text: str) -> int:
    """Return the 0-based index of the line that starts the ADF table.

    A line counts as the table header when, compared case-insensitively, it
    contains ``reporter name`` or contains all of ``block``, ``row`` and
    ``column``.

    Args:
        adf_text: Full text of the ADF file.

    Returns:
        Index of the first matching line.

    Raises:
        ValueError: If no line matches.
    """
    with io.StringIO(adf_text) as handle:
        for idx, line in enumerate(handle):
            lowered = line.lower()
            if "reporter name" in lowered:
                return idx
            if all(token in lowered for token in ("block", "row", "column")):
                return idx
    raise ValueError("Unable to detect ADF table header.")


def parse_adf(adf_file: str) -> pd.DataFrame:
    """Parse an ADF file by skipping pre-table metadata lines.

    Args:
        adf_file: Path to ``*.adf.txt`` file.

    Returns:
        DataFrame with parsed ADF rows (all columns read as strings).

    Raises:
        ValueError: If the table header line cannot be detected.
    """
    text = Path(adf_file).read_text(encoding="utf-8", errors="replace")
    skiprows = _find_adf_header_line(text)
    # Parse the text that was already decoded leniently instead of re-reading
    # the file: a second, strict read would raise UnicodeDecodeError on bytes
    # the header scan tolerated, and the line count used for ``skiprows`` is
    # guaranteed to refer to the same content.
    return pd.read_csv(io.StringIO(text), sep="\t", dtype=str, keep_default_na=False, skiprows=skiprows)
|
|
469
|
+
|
|
470
|
+
|
|
471
|
+
def parse_arrayexpress_metadata(files: ArrayExpressStudyFiles) -> ArrayExpressMetadata:
    """Turn the MAGE-TAB files of a download manifest into parsed tables.

    Args:
        files: Result from :func:`get_arrayexpress`.

    Returns:
        Parsed metadata bundle; ``sdrf`` is ``None`` when the manifest has no
        SDRF file, and ``adf_tables`` is keyed by ADF file path.

    Raises:
        FileNotFoundError: If the manifest has no IDF file.
    """
    idf_path = files.idf
    if idf_path is None:
        raise FileNotFoundError("IDF file was not found in downloaded MAGE-TAB files.")

    idf_map, idf_table = parse_idf(idf_path)
    sdrf_table = None if files.sdrf is None else parse_sdrf(files.sdrf)
    adf_tables: dict[str, pd.DataFrame] = {adf_path: parse_adf(adf_path) for adf_path in files.adf_files}

    return ArrayExpressMetadata(idf_map=idf_map, idf_table=idf_table, sdrf=sdrf_table, adf_tables=adf_tables)
|
|
491
|
+
|
|
492
|
+
|
|
493
|
+
def read_arrayexpress_metadata(accession: str, cache_dir: str) -> pd.DataFrame:
    """Return the SDRF sample table of a study, downloading it when not cached.

    The study folder is ``<cache_dir>/<ACCESSION>``. If it already holds a
    ``*.sdrf.txt`` file, the first one in sorted order is used; otherwise the
    study's MAGE-TAB files are downloaded into that folder.

    Args:
        accession: ArrayExpress accession (for example ``"E-MTAB-1944"``).
        cache_dir: Local cache root directory used for downloaded study files.

    Returns:
        A DataFrame with SDRF sample metadata.

    Raises:
        FileNotFoundError: If no SDRF file is available after the download.
    """
    accession = _normalize_accession(accession)
    study_dir = Path(cache_dir).expanduser().resolve() / accession
    study_dir.mkdir(parents=True, exist_ok=True)

    cached = sorted(study_dir.glob("*.sdrf.txt"))
    if cached:
        sdrf = str(cached[0])
    else:
        manifest = get_arrayexpress(
            accession=accession,
            destination=str(study_dir),
            data_type="mage",
            extract=False,
            overwrite=False,
        )
        sdrf = manifest.sdrf

    if sdrf is None:
        raise FileNotFoundError(f"SDRF file was not found for ArrayExpress accession '{accession}'.")

    return parse_sdrf(sdrf)
|
|
522
|
+
|
|
523
|
+
|
|
524
|
+
def read_arrayexpress(
    accession: str,
    cache_dir: str,
    cdf_file: str | CdfFile | None = None,
    verbose: bool = True,
) -> AnnData:
    """Load the raw CEL files of a study into one AnnData with SDRF metadata.

    CEL files are looked up below ``<cache_dir>/<ACCESSION>``; when none are
    present the raw data is downloaded and unpacked first. SDRF rows are
    matched to samples through a normalized file key (lower-case base name
    without ``.cel``/``.gz``) derived from the observation index and from the
    SDRF "Array Data File" column.

    Args:
        accession: ArrayExpress accession (for example ``"E-MTAB-1944"``).
        cache_dir: Local cache root directory used for downloaded study files.
        cdf_file: Optional CDF file path or CdfFile object. If omitted, infer from Array Design accession.
        verbose: Accepted for API compatibility.
            NOTE(review): this flag currently does not change behaviour; no
            progress output is produced while reading CEL files.

    Returns:
        Probe-level AnnData with sample metadata merged into ``adata.obs``.
        When the SDRF has no "Array Data File" column, ``adata.obs`` is left
        without merged metadata.

    Raises:
        FileNotFoundError: If no CEL files are found even after downloading.
    """
    accession = _normalize_accession(accession)
    study_dir = Path(cache_dir).expanduser().resolve() / accession
    study_dir.mkdir(parents=True, exist_ok=True)

    metadata = read_arrayexpress_metadata(accession=accession, cache_dir=cache_dir)

    cel_paths = _collect_cel_paths_recursive(study_dir)
    if not cel_paths:
        # Nothing cached yet: fetch and unpack the raw archives, then rescan.
        get_arrayexpress(
            accession=accession,
            destination=str(study_dir),
            data_type="raw",
            extract=True,
            overwrite=False,
        )
        cel_paths = _collect_cel_paths_recursive(study_dir)

    if not cel_paths:
        raise FileNotFoundError(f"No CEL files were found for ArrayExpress accession '{accession}'.")

    resolved_cdf = _resolve_arrayexpress_cdf(accession=accession, cdf_file=cdf_file)

    per_sample: list[AnnData] = [read_cel(str(cel_path), resolved_cdf) for cel_path in cel_paths]
    adata = concat(per_sample, axis=0, merge="same", join="outer")

    obs = adata.obs.copy()
    obs["_obs_file_key"] = [(_normalize_file_key(label) or "") for label in obs.index]

    meta = metadata.copy()
    array_data_col = next((col for col in meta.columns if col.lower() == "array data file"), None)
    if array_data_col is None:
        # No file column to join on: hand back the observations unchanged.
        adata.obs = obs.drop(columns=["_obs_file_key"], errors="ignore")
        return adata

    meta["_meta_file_key"] = meta[array_data_col].map(lambda x: _normalize_file_key(x) or "")
    keyed = meta.loc[meta["_meta_file_key"] != ""]
    # One metadata row per file key; the first occurrence wins.
    meta_by_file = keyed.drop_duplicates("_meta_file_key").set_index("_meta_file_key")

    merged = obs.join(meta_by_file, on="_obs_file_key", rsuffix="_meta")
    merged = merged.drop(columns=["_obs_file_key", "_meta_file_key"], errors="ignore")
    adata.obs = merged

    return adata
|