celltype-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- celltype_cli-0.1.0.dist-info/METADATA +267 -0
- celltype_cli-0.1.0.dist-info/RECORD +89 -0
- celltype_cli-0.1.0.dist-info/WHEEL +4 -0
- celltype_cli-0.1.0.dist-info/entry_points.txt +2 -0
- celltype_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
- ct/__init__.py +3 -0
- ct/agent/__init__.py +0 -0
- ct/agent/case_studies.py +426 -0
- ct/agent/config.py +523 -0
- ct/agent/doctor.py +544 -0
- ct/agent/knowledge.py +523 -0
- ct/agent/loop.py +99 -0
- ct/agent/mcp_server.py +478 -0
- ct/agent/orchestrator.py +733 -0
- ct/agent/runner.py +656 -0
- ct/agent/sandbox.py +481 -0
- ct/agent/session.py +145 -0
- ct/agent/system_prompt.py +186 -0
- ct/agent/trace_store.py +228 -0
- ct/agent/trajectory.py +169 -0
- ct/agent/types.py +182 -0
- ct/agent/workflows.py +462 -0
- ct/api/__init__.py +1 -0
- ct/api/app.py +211 -0
- ct/api/config.py +120 -0
- ct/api/engine.py +124 -0
- ct/cli.py +1448 -0
- ct/data/__init__.py +0 -0
- ct/data/compute_providers.json +59 -0
- ct/data/cro_database.json +395 -0
- ct/data/downloader.py +238 -0
- ct/data/loaders.py +252 -0
- ct/kb/__init__.py +5 -0
- ct/kb/benchmarks.py +147 -0
- ct/kb/governance.py +106 -0
- ct/kb/ingest.py +415 -0
- ct/kb/reasoning.py +129 -0
- ct/kb/schema_monitor.py +162 -0
- ct/kb/substrate.py +387 -0
- ct/models/__init__.py +0 -0
- ct/models/llm.py +370 -0
- ct/tools/__init__.py +195 -0
- ct/tools/_compound_resolver.py +297 -0
- ct/tools/biomarker.py +368 -0
- ct/tools/cellxgene.py +282 -0
- ct/tools/chemistry.py +1371 -0
- ct/tools/claude.py +390 -0
- ct/tools/clinical.py +1153 -0
- ct/tools/clue.py +249 -0
- ct/tools/code.py +1069 -0
- ct/tools/combination.py +397 -0
- ct/tools/compute.py +402 -0
- ct/tools/cro.py +413 -0
- ct/tools/data_api.py +2114 -0
- ct/tools/design.py +295 -0
- ct/tools/dna.py +575 -0
- ct/tools/experiment.py +604 -0
- ct/tools/expression.py +655 -0
- ct/tools/files.py +957 -0
- ct/tools/genomics.py +1387 -0
- ct/tools/http_client.py +146 -0
- ct/tools/imaging.py +319 -0
- ct/tools/intel.py +223 -0
- ct/tools/literature.py +743 -0
- ct/tools/network.py +422 -0
- ct/tools/notification.py +111 -0
- ct/tools/omics.py +3330 -0
- ct/tools/ops.py +1230 -0
- ct/tools/parity.py +649 -0
- ct/tools/pk.py +245 -0
- ct/tools/protein.py +678 -0
- ct/tools/regulatory.py +643 -0
- ct/tools/remote_data.py +179 -0
- ct/tools/report.py +181 -0
- ct/tools/repurposing.py +376 -0
- ct/tools/safety.py +1280 -0
- ct/tools/shell.py +178 -0
- ct/tools/singlecell.py +533 -0
- ct/tools/statistics.py +552 -0
- ct/tools/structure.py +882 -0
- ct/tools/target.py +901 -0
- ct/tools/translational.py +123 -0
- ct/tools/viability.py +218 -0
- ct/ui/__init__.py +0 -0
- ct/ui/markdown.py +31 -0
- ct/ui/status.py +258 -0
- ct/ui/suggestions.py +567 -0
- ct/ui/terminal.py +1456 -0
- ct/ui/traces.py +112 -0
ct/tools/omics.py
ADDED
|
@@ -0,0 +1,3330 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Omics data discovery, download, and inspection tools.
|
|
3
|
+
|
|
4
|
+
Provides search and fetch capabilities for major public omics repositories:
|
|
5
|
+
- NCBI GEO (Gene Expression Omnibus)
|
|
6
|
+
- CELLxGENE Discover (Chan Zuckerberg Initiative)
|
|
7
|
+
- TCGA/GDC (The Cancer Genome Atlas via Genomic Data Commons)
|
|
8
|
+
|
|
9
|
+
Also provides local dataset inspection for downloaded files.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import gzip
|
|
13
|
+
import logging
|
|
14
|
+
import re
|
|
15
|
+
import shutil
|
|
16
|
+
import tempfile
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
from ct.tools import registry
|
|
20
|
+
from ct.tools.http_client import request, request_json
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger("ct.tools.omics")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# ---------------------------------------------------------------------------
|
|
26
|
+
# Shared helpers
|
|
27
|
+
# ---------------------------------------------------------------------------
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _downloads_dir() -> Path:
    """Resolve the downloads directory, creating it if necessary.

    Honors the ``data.downloads_dir`` config key when set; otherwise
    falls back to ``~/.ct/downloads``.
    """
    from ct.agent.config import Config

    configured = Config.load().get("data.downloads_dir", None)
    if configured:
        target = Path(configured).expanduser()
    else:
        target = Path.home() / ".ct" / "downloads"
    target.mkdir(parents=True, exist_ok=True)
    return target
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _max_download_mb() -> int:
    """Return the download size cap in megabytes.

    Read from the ``data.max_download_mb`` config key; defaults to 500.
    """
    from ct.agent.config import Config

    return int(Config.load().get("data.max_download_mb", 500))
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _stream_download(url: str, dest_path: Path, max_mb: int | None = None) -> tuple[Path | None, str | None]:
    """Download *url* to *dest_path* in chunks, enforcing a size cap.

    The payload is first written to a ``.tmp`` sibling and moved into
    place only once complete, so a partial download never masquerades
    as a finished file.

    Returns:
        (path, None) on success, or (None, error_message) on failure.
    """
    import httpx

    limit_mb = _max_download_mb() if max_mb is None else max_mb
    limit_bytes = limit_mb * 1024 * 1024

    dest_path.parent.mkdir(parents=True, exist_ok=True)
    partial = dest_path.with_suffix(dest_path.suffix + ".tmp")

    try:
        with httpx.stream("GET", url, follow_redirects=True, timeout=120) as resp:
            resp.raise_for_status()

            # Fail fast when the server advertises an oversized payload.
            declared = resp.headers.get("content-length")
            if declared and int(declared) > limit_bytes:
                return None, (
                    f"File size ({int(declared) // (1024*1024)} MB) "
                    f"exceeds limit ({limit_mb} MB). "
                    f"Increase with: ct config set data.max_download_mb <value>"
                )

            # Stream to the temp file, re-checking the cap as bytes arrive
            # (Content-Length may be absent or wrong).
            received = 0
            with open(partial, "wb") as out:
                for chunk in resp.iter_bytes(chunk_size=65536):
                    received += len(chunk)
                    if received > limit_bytes:
                        partial.unlink(missing_ok=True)
                        return None, (
                            f"Download exceeded size limit ({limit_mb} MB). "
                            f"Increase with: ct config set data.max_download_mb <value>"
                        )
                    out.write(chunk)

        # Atomic rename of the finished temp file into place.
        shutil.move(str(partial), str(dest_path))
        return dest_path, None

    except httpx.HTTPStatusError as exc:
        partial.unlink(missing_ok=True)
        return None, f"HTTP {exc.response.status_code}: {str(exc)[:200]}"
    except Exception as exc:
        partial.unlink(missing_ok=True)
        return None, f"Download failed: {str(exc)[:200]}"
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _check_scanpy():
    """Return the imported scanpy module, or None if unavailable.

    Import failures are logged at debug level instead of raised so
    callers can degrade gracefully when scanpy is not installed.
    """
    try:
        import scanpy as sc
    except Exception as exc:
        logger.debug("scanpy unavailable or failed to import: %s", exc)
        return None
    return sc
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
# ---------------------------------------------------------------------------
|
|
115
|
+
# 1. omics.geo_search
|
|
116
|
+
# ---------------------------------------------------------------------------
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
@registry.register(
    name="omics.geo_search",
    description="Search NCBI GEO for datasets by keyword, organism, and study type",
    category="omics",
    parameters={
        "query": "Search terms (gene, disease, compound, etc.)",
        "organism": "Organism filter (default 'Homo sapiens')",
        "study_type": "Filter: 'scRNA-seq', 'bulk RNA-seq', 'methylation', 'ATAC-seq', 'ChIP-seq', or 'all'",
        "max_results": "Maximum results to return (default 10)",
    },
    usage_guide=(
        "Search NCBI GEO for public omics datasets. Use before omics.geo_fetch "
        "to find relevant accessions. Supports filtering by organism and study type."
    ),
)
def geo_search(
    query: str,
    organism: str = "Homo sapiens",
    study_type: str = "all",
    max_results: int = 10,
    **kwargs,
) -> dict:
    """Search NCBI GEO via the Entrez E-utilities (esearch, then esummary).

    Returns a dict with ``datasets`` (accession records), the effective
    Entrez ``query`` term, a ``count``, and a human-readable ``summary``.
    """
    if not query or not query.strip():
        return {"error": "Query is required", "summary": "No query provided"}

    # Assemble the Entrez term: free text, optional organism field tag,
    # and an optional study-type keyword.
    term_parts = [query.strip()]
    if organism and organism.lower() != "all":
        term_parts.append(f'"{organism}"[Organism]')

    study_type_keywords = {
        "scrna-seq": "single cell RNA-seq",
        "bulk rna-seq": "RNA-seq",
        "methylation": "methylation profiling",
        "atac-seq": "ATAC-seq",
        "chip-seq": "ChIP-seq",
    }
    normalized_type = study_type.lower().strip()
    if normalized_type != "all" and normalized_type in study_type_keywords:
        term_parts.append(study_type_keywords[normalized_type])

    search_term = " AND ".join(term_parts)

    # Step 1: esearch resolves the term to GDS UIDs.
    data, error = request_json(
        "GET",
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
        params={
            "db": "gds",
            "term": search_term,
            "retmax": str(min(max_results, 50)),
            "retmode": "json",
        },
        timeout=15,
    )
    if error:
        return {"error": f"GEO search failed: {error}", "summary": f"GEO search error: {error}"}

    uids = data.get("esearchresult", {}).get("idlist", [])
    if not uids:
        return {
            "datasets": [],
            "query": search_term,
            "count": 0,
            "summary": f"No GEO datasets found for '{query}' (organism={organism}, type={study_type})",
        }

    # Step 2: esummary expands the UIDs into dataset metadata.
    summary_data, error = request_json(
        "GET",
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi",
        params={
            "db": "gds",
            "id": ",".join(uids),
            "retmode": "json",
        },
        timeout=15,
    )
    if error:
        return {"error": f"GEO summary fetch failed: {error}", "summary": f"GEO summary error: {error}"}

    records = summary_data.get("result", {})
    datasets = []
    for uid in uids:
        entry = records.get(uid, {})
        if not entry or isinstance(entry, str):
            continue
        accession = entry.get("accession", "")
        # GDS-level entries may lack a GSE accession; derive one from the
        # linked series number when available.
        if not accession.startswith("GSE"):
            gse = entry.get("gse", "")
            if gse:
                accession = f"GSE{gse}"
        datasets.append({
            "accession": accession,
            "title": entry.get("title", ""),
            "summary": (entry.get("summary", "") or "")[:300],
            "organism": entry.get("taxon", ""),
            "platform": entry.get("gpl", ""),
            "sample_count": entry.get("n_samples", 0),
            "study_type": entry.get("gdstype", study_type),
            "date": entry.get("pdat", ""),
        })

    return {
        "datasets": datasets,
        "query": search_term,
        "count": len(datasets),
        "summary": (
            f"Found {len(datasets)} GEO dataset(s) for '{query}'. "
            + "; ".join(
                f"{d['accession']}: {d['title'][:60]}" for d in datasets[:3]
            )
        ),
    }
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
# ---------------------------------------------------------------------------
|
|
241
|
+
# 2. omics.geo_fetch
|
|
242
|
+
# ---------------------------------------------------------------------------
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
@registry.register(
    name="omics.geo_fetch",
    description="Download a GEO dataset (expression matrix or supplementary files)",
    category="omics",
    parameters={
        "accession": "GEO accession (e.g., 'GSE12345')",
        "file_type": "Type to download: 'matrix', 'h5ad', 'supplementary' (default 'matrix')",
    },
    usage_guide=(
        "Download data from NCBI GEO after finding accessions with omics.geo_search. "
        "Use 'matrix' for series matrix files, 'supplementary' for raw/processed supplements."
    ),
)
def geo_fetch(accession: str, file_type: str = "matrix", **kwargs) -> dict:
    """Download a GEO series dataset.

    Args:
        accession: GEO series accession such as "GSE12345" (case-insensitive).
        file_type: "matrix" for the series matrix file; "h5ad" or
            "supplementary" for supplementary files.

    Returns:
        Dict with ``path``/``size_mb`` (plus ``filename`` for supplements)
        on success, or ``error`` on failure; always includes ``summary``.
    """
    raw = accession
    # Bug fix: normalize BEFORE validating. Previously the uppercase-only
    # regex ran on the raw input, so lowercase accessions ("gse12345") were
    # rejected even though the code intended to upper-case them.
    accession = (accession or "").strip().upper()
    if not re.match(r"^GSE\d+$", accession):
        return {
            "error": f"Invalid GEO accession '{raw}'. Expected format: GSE12345",
            "summary": f"Invalid accession format: {raw}",
        }

    # GEO FTP shards series directories with the last three digits masked:
    # GSE12345 -> GSE12nnn
    prefix = accession[:len(accession) - 3] + "nnn"

    dest_dir = _downloads_dir() / "geo" / accession
    dest_dir.mkdir(parents=True, exist_ok=True)

    if file_type == "matrix":
        filename = f"{accession}_series_matrix.txt.gz"
        # Bug fix: the URL previously ended in a literal placeholder instead
        # of the series-matrix filename, so every matrix download failed.
        url = f"https://ftp.ncbi.nlm.nih.gov/geo/series/{prefix}/{accession}/matrix/{filename}"
        dest = dest_dir / filename

        # Idempotent: reuse a previously downloaded file.
        if dest.exists():
            size_mb = round(dest.stat().st_size / (1024 * 1024), 2)
            return {
                "path": str(dest),
                "accession": accession,
                "file_type": file_type,
                "size_mb": size_mb,
                "summary": f"Already downloaded: {dest.name} ({size_mb} MB)",
            }

        path, error = _stream_download(url, dest)
        if error:
            return {"error": error, "accession": accession, "summary": f"Download failed for {accession}: {error}"}

        size_mb = round(path.stat().st_size / (1024 * 1024), 2)
        return {
            "path": str(path),
            "accession": accession,
            "file_type": file_type,
            "size_mb": size_mb,
            "summary": f"Downloaded {accession} series matrix ({size_mb} MB) to {path}",
        }

    elif file_type in ("h5ad", "supplementary"):
        # Scrape the FTP HTML directory listing for downloadable files.
        suppl_url = f"https://ftp.ncbi.nlm.nih.gov/geo/series/{prefix}/{accession}/suppl/"
        resp, error = request("GET", suppl_url, timeout=15, raise_for_status=False)
        if error:
            return {"error": f"Could not list supplementary files: {error}", "summary": f"Supplementary listing failed for {accession}"}

        text = resp.text if hasattr(resp, "text") else str(resp)
        links = re.findall(r'href="([^"]+)"', text)
        # Drop sort-order query links, absolute paths, and the parent link.
        data_files = [l for l in links if not l.startswith("?") and not l.startswith("/") and l != "../"]

        if not data_files:
            return {
                "error": f"No supplementary files found for {accession}",
                "summary": f"No supplementary files available for {accession}",
            }

        # For h5ad, require an .h5ad asset; otherwise take the first file.
        target = None
        if file_type == "h5ad":
            h5ad_files = [f for f in data_files if f.endswith(".h5ad") or f.endswith(".h5ad.gz")]
            if h5ad_files:
                target = h5ad_files[0]
            else:
                return {
                    "error": f"No h5ad files found in {accession} supplementary files",
                    "files_available": data_files[:10],
                    "summary": f"No h5ad files in {accession}. Available: {', '.join(data_files[:5])}",
                }
        else:
            target = data_files[0]

        file_url = f"{suppl_url}{target}"
        dest = dest_dir / target

        if dest.exists():
            size_mb = round(dest.stat().st_size / (1024 * 1024), 2)
            return {
                "path": str(dest),
                "accession": accession,
                "file_type": file_type,
                "filename": target,
                "size_mb": size_mb,
                "summary": f"Already downloaded: {target} ({size_mb} MB)",
            }

        path, error = _stream_download(file_url, dest)
        if error:
            return {"error": error, "accession": accession, "summary": f"Download failed: {error}"}

        size_mb = round(path.stat().st_size / (1024 * 1024), 2)
        return {
            "path": str(path),
            "accession": accession,
            "file_type": file_type,
            "filename": target,
            "size_mb": size_mb,
            "summary": f"Downloaded {target} ({size_mb} MB) from {accession}",
        }

    else:
        return {
            "error": f"Invalid file_type '{file_type}'. Choose: matrix, h5ad, supplementary",
            "summary": f"Invalid file_type: {file_type}",
        }
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
# ---------------------------------------------------------------------------
|
|
371
|
+
# 3. omics.cellxgene_search
|
|
372
|
+
# ---------------------------------------------------------------------------
|
|
373
|
+
|
|
374
|
+
_CELLXGENE_API = "https://api.cellxgene.cziscience.com/curation/v1"
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
@registry.register(
    name="omics.cellxgene_search",
    description="Search CELLxGENE Discover for curated single-cell datasets",
    category="omics",
    parameters={
        "query": "Search terms (gene, disease, tissue, etc.)",
        "tissue": "Filter by tissue (optional)",
        "disease": "Filter by disease (optional)",
        "organism": "Filter by organism (default 'Homo sapiens')",
        "max_results": "Maximum results to return (default 10)",
    },
    usage_guide=(
        "Search the CZI CELLxGENE Discover portal for curated, analysis-ready "
        "single-cell datasets. Use before omics.cellxgene_fetch to get dataset IDs."
    ),
)
def cellxgene_search(
    query: str,
    tissue: str = "",
    disease: str = "",
    organism: str = "Homo sapiens",
    max_results: int = 10,
    **kwargs,
) -> dict:
    """Search CELLxGENE Discover collections for matching datasets.

    Fetches the full collections listing once, then filters client-side
    by query terms, organism, tissue, and disease labels.
    """
    if not query or not query.strip():
        return {"error": "Query is required", "summary": "No query provided"}

    def _labels(record: dict, key: str) -> list[str]:
        # Ontology-term fields are lists of {"label": ...} dicts; tolerate
        # missing or non-list values by returning an empty list.
        raw = record.get(key)
        if not isinstance(raw, list):
            return []
        return [item.get("label", "") for item in raw]

    data, error = request_json("GET", f"{_CELLXGENE_API}/collections", timeout=20)
    if error:
        return {"error": f"CELLxGENE search failed: {error}", "summary": f"CELLxGENE error: {error}"}
    if not isinstance(data, list):
        return {"error": "Unexpected CELLxGENE response format", "summary": "CELLxGENE returned unexpected format"}

    terms = query.lower().strip().split()
    matched = []

    for collection in data:
        col_title = (collection.get("name") or "").lower()
        col_desc = (collection.get("description") or "").lower()
        col_text = col_title + " " + col_desc
        collection_hit = any(t in col_text for t in terms)

        for dataset in collection.get("datasets", []):
            ds_title = (dataset.get("title") or dataset.get("name") or "").lower()

            # Keep the dataset if either the collection or the dataset text
            # matches any query term.
            if not collection_hit and not any(t in ds_title + " " + col_text for t in terms):
                continue

            organisms = _labels(dataset, "organism")
            # Organism filter is skipped when the dataset declares none.
            if organism and organisms and organism.lower() not in " ".join(organisms).lower():
                continue

            tissues = _labels(dataset, "tissue")
            if tissue and tissue.lower() not in " ".join(tissues).lower():
                continue

            diseases = _labels(dataset, "disease")
            if disease and disease.lower() not in " ".join(diseases).lower():
                continue

            assays = _labels(dataset, "assay")

            matched.append({
                "dataset_id": dataset.get("dataset_id", ""),
                "collection_id": collection.get("collection_id", ""),
                "title": dataset.get("title") or dataset.get("name") or col_title,
                "description": (col_desc[:200] if col_desc else ""),
                "tissue": ", ".join(tissues),
                "disease": ", ".join(diseases),
                "cell_count": dataset.get("cell_count", 0),
                "organism": ", ".join(organisms),
                "assay": ", ".join(assays),
            })

            if len(matched) >= max_results:
                break
        if len(matched) >= max_results:
            break

    return {
        "datasets": matched,
        "query": query,
        "count": len(matched),
        "summary": (
            f"Found {len(matched)} CELLxGENE dataset(s) for '{query}'. "
            + ("; ".join(f"{d['title'][:50]} ({d['cell_count']} cells)" for d in matched[:3]) if matched else "Try broader search terms.")
        ),
    }
|
|
490
|
+
|
|
491
|
+
|
|
492
|
+
# ---------------------------------------------------------------------------
|
|
493
|
+
# 4. omics.cellxgene_fetch
|
|
494
|
+
# ---------------------------------------------------------------------------
|
|
495
|
+
|
|
496
|
+
|
|
497
|
+
@registry.register(
    name="omics.cellxgene_fetch",
    description="Download an h5ad dataset from CELLxGENE Discover",
    category="omics",
    parameters={
        "dataset_id": "CELLxGENE dataset ID (from omics.cellxgene_search results)",
    },
    usage_guide=(
        "Download a single-cell dataset from CELLxGENE. Requires a dataset_id "
        "from omics.cellxgene_search results. Downloads as h5ad format."
    ),
)
def cellxgene_fetch(dataset_id: str, **kwargs) -> dict:
    """Download an h5ad asset for a CELLxGENE dataset.

    Args:
        dataset_id: Dataset ID as returned by omics.cellxgene_search.

    Returns:
        Dict with ``path``/``size_mb`` on success, or ``error`` on failure;
        always includes ``summary``.
    """
    if not dataset_id or not dataset_id.strip():
        return {"error": "dataset_id is required", "summary": "No dataset_id provided"}

    dataset_id = dataset_id.strip()

    # List the dataset's downloadable assets via the curation API.
    assets_url = f"{_CELLXGENE_API}/datasets/{dataset_id}/assets"
    assets, error = request_json("GET", assets_url, timeout=15)
    if error:
        return {"error": f"Failed to get assets: {error}", "summary": f"CELLxGENE asset lookup failed: {error}"}

    if not isinstance(assets, list) or not assets:
        return {
            "error": f"No downloadable assets found for dataset {dataset_id}",
            "summary": f"No assets for dataset {dataset_id}",
        }

    # Prefer an h5ad asset; the API reports the type under either
    # 'filetype' or 'file_type' depending on version.
    h5ad_asset = None
    for asset in assets:
        filetype = (asset.get("filetype") or asset.get("file_type") or "").lower()
        filename = (asset.get("filename") or "").lower()
        if "h5ad" in filetype or filename.endswith(".h5ad"):
            h5ad_asset = asset
            break

    if not h5ad_asset:
        # Fall back to the first asset when no h5ad is available.
        h5ad_asset = assets[0]

    download_url = h5ad_asset.get("presigned_url") or h5ad_asset.get("url", "")
    if not download_url:
        return {
            "error": "No download URL in asset metadata",
            "summary": "CELLxGENE asset has no download URL",
        }

    filename = h5ad_asset.get("filename", f"{dataset_id}.h5ad")
    dest_dir = _downloads_dir() / "cellxgene" / dataset_id
    dest = dest_dir / filename

    # Idempotent: reuse a previously downloaded file.
    if dest.exists():
        size_mb = round(dest.stat().st_size / (1024 * 1024), 2)
        return {
            "path": str(dest),
            "dataset_id": dataset_id,
            "size_mb": size_mb,
            # Bug fix: the summary previously contained a literal placeholder
            # instead of the downloaded filename.
            "summary": f"Already downloaded: {filename} ({size_mb} MB)",
        }

    path, error = _stream_download(download_url, dest)
    if error:
        return {"error": error, "dataset_id": dataset_id, "summary": f"Download failed: {error}"}

    size_mb = round(path.stat().st_size / (1024 * 1024), 2)
    return {
        "path": str(path),
        "dataset_id": dataset_id,
        "filename": filename,
        "size_mb": size_mb,
        "summary": f"Downloaded CELLxGENE dataset {dataset_id} ({size_mb} MB) to {path}",
    }
|
|
573
|
+
|
|
574
|
+
|
|
575
|
+
# ---------------------------------------------------------------------------
|
|
576
|
+
# 5. omics.tcga_search
|
|
577
|
+
# ---------------------------------------------------------------------------
|
|
578
|
+
|
|
579
|
+
_GDC_API = "https://api.gdc.cancer.gov"
|
|
580
|
+
|
|
581
|
+
|
|
582
|
+
@registry.register(
    name="omics.tcga_search",
    description="Search TCGA/GDC for cancer genomics projects and data files",
    category="omics",
    parameters={
        "query": "Search terms (cancer type, gene, etc.)",
        "data_type": "Filter: 'gene_expression', 'methylation', 'mutation', 'clinical' (default 'gene_expression')",
        "max_results": "Maximum results to return (default 10)",
    },
    usage_guide=(
        "Search the NCI Genomic Data Commons (GDC) for TCGA and other cancer "
        "genomics projects. Use before omics.tcga_fetch to find file UUIDs."
    ),
)
def tcga_search(
    query: str,
    data_type: str = "gene_expression",
    max_results: int = 10,
    **kwargs,
) -> dict:
    """Search GDC projects matching *query* and summarize per-category counts.

    Returns a dict with ``projects`` records, the original ``query``,
    the requested ``data_type``, a ``count``, and a ``summary``.
    """
    import json

    if not query or not query.strip():
        return {"error": "Query is required", "summary": "No query provided"}

    valid_types = {"gene_expression", "methylation", "mutation", "clinical"}
    if data_type not in valid_types:
        return {
            "error": f"Invalid data_type '{data_type}'. Choose from: {', '.join(valid_types)}",
            "summary": f"Invalid data_type: {data_type}",
        }

    # Map requested analysis type to GDC project summary data categories.
    # Project summaries expose category-level counts, not file-level data_type counts.
    gdc_data_category_map = {
        "gene_expression": "Transcriptome Profiling",
        "methylation": "DNA Methylation",
        "mutation": "Simple Nucleotide Variation",
        "clinical": "Clinical",
    }

    # Match the query against project id, name, disease type, or primary site.
    gdc_filters = {
        "op": "or",
        "content": [
            {"op": "in", "content": {"field": "project_id", "value": [query.upper()]}},
            {"op": "like", "content": {"field": "name", "value": f"*{query}*"}},
            {"op": "like", "content": {"field": "disease_type", "value": f"*{query}*"}},
            {"op": "like", "content": {"field": "primary_site", "value": f"*{query}*"}},
        ],
    }

    data, error = request_json(
        "GET",
        f"{_GDC_API}/projects",
        params={
            "filters": json.dumps(gdc_filters),
            "fields": "project_id,name,disease_type,primary_site,summary.case_count,summary.file_count,summary.data_categories.data_category,summary.data_categories.file_count",
            "size": str(min(max_results, 50)),
            "format": "json",
        },
        timeout=15,
    )
    if error:
        return {"error": f"GDC search failed: {error}", "summary": f"GDC error: {error}"}

    requested_category = gdc_data_category_map.get(data_type, "")
    projects = []
    for hit in data.get("data", {}).get("hits", []):
        summary = hit.get("summary", {})
        # Count files in the category most relevant to the requested data_type.
        category_file_count = 0
        available_categories = []
        for cat in summary.get("data_categories", []):
            cat_name = cat.get("data_category", "")
            if cat_name:
                available_categories.append(cat_name)
                if cat_name.lower() == requested_category.lower():
                    category_file_count = int(cat.get("file_count", 0) or 0)

        projects.append({
            "project_id": hit.get("project_id", ""),
            "name": hit.get("name", ""),
            "disease_type": hit.get("disease_type", ""),
            "primary_site": hit.get("primary_site", ""),
            "case_count": summary.get("case_count", 0),
            "file_count": summary.get("file_count", 0),
            "data_type": data_type,
            "matching_data_category": requested_category,
            "data_type_file_count": category_file_count,
            "available_data_categories": available_categories[:20],
            "count_method": "project_summary_data_category",
        })

    if not projects:
        return {
            "projects": [],
            "query": query,
            "count": 0,
            "summary": f"No TCGA/GDC projects found for '{query}'",
        }

    return {
        "projects": projects,
        "query": query,
        "data_type": data_type,
        "count": len(projects),
        "summary": (
            f"Found {len(projects)} GDC project(s) for '{query}'. "
            + "; ".join(f"{p['project_id']}: {p['name'][:40]} ({p['case_count']} cases)" for p in projects[:3])
        ),
    }
|
|
695
|
+
|
|
696
|
+
|
|
697
|
+
# ---------------------------------------------------------------------------
|
|
698
|
+
# 6. omics.tcga_fetch
|
|
699
|
+
# ---------------------------------------------------------------------------
|
|
700
|
+
|
|
701
|
+
|
|
702
|
+
@registry.register(
    name="omics.tcga_fetch",
    description="Download a data file from TCGA/GDC",
    category="omics",
    parameters={
        "file_id": "GDC file UUID to download",
        "project_id": "GDC project ID (optional, used to search for files if file_id not provided)",
    },
    usage_guide=(
        "Download a specific file from GDC by UUID. If only project_id is given, "
        "searches for the most relevant gene expression file and downloads it."
    ),
)
def tcga_fetch(file_id: str = "", project_id: str = "", **kwargs) -> dict:
    """Download a data file from TCGA/GDC."""
    import json

    if not (file_id or project_id):
        return {
            "error": "Either file_id or project_id is required",
            "summary": "No file_id or project_id provided",
        }

    if file_id:
        # Explicit UUID: the real name is unknown until download, use a generic one.
        file_name = f"{file_id}.gz"
    else:
        # No UUID given — look up one open-access gene-expression file in the project.
        search_url = f"{_GDC_API}/files"
        search_filters = {
            "op": "and",
            "content": [
                {"op": "=", "content": {"field": "cases.project.project_id", "value": project_id}},
                {"op": "=", "content": {"field": "data_type", "value": "Gene Expression Quantification"}},
                {"op": "=", "content": {"field": "access", "value": "open"}},
            ],
        }
        search_params = {
            "filters": json.dumps(search_filters),
            "fields": "file_id,file_name,file_size,data_type",
            "size": "1",
            "format": "json",
        }
        payload, error = request_json("GET", search_url, params=search_params, timeout=15)
        if error:
            return {"error": f"File search failed: {error}", "summary": f"GDC file search error: {error}"}

        matches = payload.get("data", {}).get("hits", [])
        if not matches:
            return {
                "error": f"No open-access files found for project {project_id}",
                "summary": f"No downloadable files for {project_id}",
            }
        top_hit = matches[0]
        file_id = top_hit.get("file_id", "")
        file_name = top_hit.get("file_name", f"{file_id}.gz")

    # Destination: downloads/tcga/<project or UUID prefix>/<file name>.
    download_url = f"{_GDC_API}/data/{file_id}"
    label = project_id or file_id[:12]
    dest = _downloads_dir() / "tcga" / label / file_name

    # Skip the transfer entirely when the file is already on disk.
    if dest.exists():
        size_mb = round(dest.stat().st_size / (1024 * 1024), 2)
        return {
            "path": str(dest),
            "file_id": file_id,
            "project_id": project_id,
            "size_mb": size_mb,
            "summary": f"Already downloaded: {file_name} ({size_mb} MB)",
        }

    path, error = _stream_download(download_url, dest)
    if error:
        return {"error": error, "file_id": file_id, "summary": f"Download failed: {error}"}

    size_mb = round(path.stat().st_size / (1024 * 1024), 2)
    return {
        "path": str(path),
        "file_id": file_id,
        "project_id": project_id,
        "filename": file_name,
        "size_mb": size_mb,
        "summary": f"Downloaded GDC file {file_name} ({size_mb} MB) to {path}",
    }
|
|
786
|
+
|
|
787
|
+
|
|
788
|
+
# ---------------------------------------------------------------------------
|
|
789
|
+
# 7. omics.dataset_info
|
|
790
|
+
# ---------------------------------------------------------------------------
|
|
791
|
+
|
|
792
|
+
|
|
793
|
+
@registry.register(
    name="omics.dataset_info",
    description="Inspect a downloaded dataset file and return metadata summary",
    category="omics",
    parameters={
        "path": "Path to the downloaded dataset file (h5ad, CSV, TSV, or matrix.txt.gz)",
    },
    usage_guide=(
        "Inspect a downloaded omics file before analysis. Returns shape, columns, "
        "metadata. Use after omics.*_fetch to understand the data before running "
        "singlecell.* or code.execute on it."
    ),
)
def dataset_info(path: str, **kwargs) -> dict:
    """Inspect a downloaded dataset file and return metadata."""
    if not path:
        return {"error": "Path is required", "summary": "No path provided"}

    filepath = Path(path).expanduser()
    if not filepath.exists():
        return {"error": f"File not found: {path}", "summary": f"File not found: {path}"}

    size_mb = round(filepath.stat().st_size / (1024 * 1024), 2)

    # Resolve compound extensions: ".gz" is combined with the inner
    # extension so e.g. "x.txt.gz" dispatches as ".txt.gz".
    ext = filepath.suffix.lower()
    if ext == ".gz":
        ext = Path(filepath.stem).suffix.lower() + ext

    try:
        if ext == ".h5ad":
            return _inspect_h5ad(filepath, size_mb)
        if ext in (".csv", ".tsv", ".txt"):
            delimiter = "," if ext == ".csv" else "\t"
            return _inspect_tabular(filepath, size_mb, sep=delimiter)
        if ext == ".txt.gz":
            return _inspect_matrix_gz(filepath, size_mb)
        return {
            "path": str(filepath),
            "file_type": ext,
            "size_mb": size_mb,
            "summary": f"File type '{ext}' not directly inspectable. Size: {size_mb} MB. Try loading with code.execute.",
        }
    except Exception as exc:
        # Inspection is best-effort: surface the failure instead of raising.
        return {
            "error": f"Inspection failed: {str(exc)[:200]}",
            "path": str(filepath),
            "size_mb": size_mb,
            "summary": f"Could not inspect {filepath.name}: {str(exc)[:100]}",
        }
|
|
844
|
+
|
|
845
|
+
|
|
846
|
+
def _inspect_h5ad(filepath: Path, size_mb: float) -> dict:
    """Inspect an h5ad file using scanpy."""
    sc = _check_scanpy()
    if sc is None:
        # Without scanpy the file cannot be opened; report size only.
        return {
            "path": str(filepath),
            "file_type": "h5ad",
            "size_mb": size_mb,
            "error": "scanpy not installed. Install with: pip install scanpy",
            "summary": f"h5ad file ({size_mb} MB) — install scanpy to inspect: pip install scanpy",
        }

    adata = sc.read_h5ad(filepath)
    cell_cols = list(adata.obs.columns)
    gene_cols = list(adata.var.columns)
    layer_names = list(adata.layers.keys()) if adata.layers else []

    # Show up to 5 unique values for the first few obs columns.
    preview = {}
    for col in cell_cols[:5]:
        preview[col] = list(adata.obs[col].unique()[:5])

    return {
        "path": str(filepath),
        "file_type": "h5ad",
        "size_mb": size_mb,
        "n_cells": adata.n_obs,
        "n_genes": adata.n_vars,
        "obs_columns": cell_cols[:20],
        "var_columns": gene_cols[:20],
        "layers": layer_names,
        "obs_preview": preview,
        "summary": (
            f"h5ad: {adata.n_obs:,} cells x {adata.n_vars:,} genes ({size_mb} MB). "
            f"Obs columns: {', '.join(cell_cols[:8])}. "
            f"Layers: {', '.join(layer_names) if layer_names else 'X only'}."
        ),
    }
|
|
879
|
+
|
|
880
|
+
|
|
881
|
+
def _inspect_tabular(filepath: Path, size_mb: float, sep: str = ",") -> dict:
|
|
882
|
+
"""Inspect a CSV/TSV file."""
|
|
883
|
+
import pandas as pd
|
|
884
|
+
|
|
885
|
+
# Read just the first rows to get shape info without loading everything
|
|
886
|
+
df_head = pd.read_csv(filepath, sep=sep, nrows=5, index_col=0)
|
|
887
|
+
# Get full shape by counting lines
|
|
888
|
+
with open(filepath) as f:
|
|
889
|
+
n_lines = sum(1 for _ in f) - 1 # subtract header
|
|
890
|
+
|
|
891
|
+
columns = list(df_head.columns)
|
|
892
|
+
dtypes = {col: str(dtype) for col, dtype in df_head.dtypes.items()}
|
|
893
|
+
|
|
894
|
+
return {
|
|
895
|
+
"path": str(filepath),
|
|
896
|
+
"file_type": "csv" if sep == "," else "tsv",
|
|
897
|
+
"size_mb": size_mb,
|
|
898
|
+
"shape": [n_lines, len(columns)],
|
|
899
|
+
"columns": columns[:30],
|
|
900
|
+
"dtypes": {k: v for k, v in list(dtypes.items())[:15]},
|
|
901
|
+
"head_preview": df_head.head(3).to_dict(),
|
|
902
|
+
"summary": (
|
|
903
|
+
f"Tabular: {n_lines:,} rows x {len(columns)} columns ({size_mb} MB). "
|
|
904
|
+
f"Columns: {', '.join(columns[:8])}"
|
|
905
|
+
),
|
|
906
|
+
}
|
|
907
|
+
|
|
908
|
+
|
|
909
|
+
def _inspect_matrix_gz(filepath: Path, size_mb: float) -> dict:
|
|
910
|
+
"""Inspect a GEO series matrix .txt.gz file."""
|
|
911
|
+
metadata = {}
|
|
912
|
+
n_rows = 0
|
|
913
|
+
columns = []
|
|
914
|
+
|
|
915
|
+
with gzip.open(filepath, "rt", errors="replace") as f:
|
|
916
|
+
for line in f:
|
|
917
|
+
if line.startswith("!"):
|
|
918
|
+
# Parse metadata lines
|
|
919
|
+
parts = line.strip().split("\t", 1)
|
|
920
|
+
if len(parts) == 2:
|
|
921
|
+
key = parts[0].lstrip("!").strip()
|
|
922
|
+
val = parts[1].strip().strip('"')
|
|
923
|
+
if key not in metadata:
|
|
924
|
+
metadata[key] = val
|
|
925
|
+
elif isinstance(metadata[key], list):
|
|
926
|
+
metadata[key].append(val)
|
|
927
|
+
else:
|
|
928
|
+
metadata[key] = [metadata[key], val]
|
|
929
|
+
elif line.startswith('"ID_REF"') or line.startswith("ID_REF"):
|
|
930
|
+
columns = [c.strip('"') for c in line.strip().split("\t")]
|
|
931
|
+
elif not line.startswith("!") and line.strip():
|
|
932
|
+
n_rows += 1
|
|
933
|
+
|
|
934
|
+
# Extract key metadata fields
|
|
935
|
+
title = metadata.get("Series_title", "")
|
|
936
|
+
organism = metadata.get("Series_organism", "")
|
|
937
|
+
n_samples = len(columns) - 1 if columns else 0
|
|
938
|
+
|
|
939
|
+
return {
|
|
940
|
+
"path": str(filepath),
|
|
941
|
+
"file_type": "matrix.txt.gz",
|
|
942
|
+
"size_mb": size_mb,
|
|
943
|
+
"title": title,
|
|
944
|
+
"organism": organism,
|
|
945
|
+
"n_probes_or_genes": n_rows,
|
|
946
|
+
"n_samples": n_samples,
|
|
947
|
+
"sample_ids": columns[1:11] if columns else [],
|
|
948
|
+
"metadata_keys": list(metadata.keys())[:15],
|
|
949
|
+
"summary": (
|
|
950
|
+
f"GEO matrix: {n_rows:,} probes/genes x {n_samples} samples ({size_mb} MB). "
|
|
951
|
+
f"Title: {title[:80]}. Organism: {organism}."
|
|
952
|
+
),
|
|
953
|
+
}
|
|
954
|
+
|
|
955
|
+
|
|
956
|
+
# ===========================================================================
|
|
957
|
+
# Analysis tools — modality-specific processing of downloaded data
|
|
958
|
+
# ===========================================================================
|
|
959
|
+
|
|
960
|
+
|
|
961
|
+
def _load_tabular(path: str, **read_kwargs) -> "tuple[pd.DataFrame | None, str | None]":
|
|
962
|
+
"""Load a tabular file, returning (df, error)."""
|
|
963
|
+
import pandas as pd
|
|
964
|
+
|
|
965
|
+
filepath = Path(path).expanduser()
|
|
966
|
+
if not filepath.exists():
|
|
967
|
+
return None, f"File not found: {path}"
|
|
968
|
+
suffix = filepath.suffix.lower()
|
|
969
|
+
kwargs = dict(read_kwargs)
|
|
970
|
+
try:
|
|
971
|
+
# Keep prior behavior by defaulting to first column as index,
|
|
972
|
+
# while allowing callers to override.
|
|
973
|
+
kwargs.setdefault("index_col", 0)
|
|
974
|
+
if suffix in {".xlsx", ".xls"}:
|
|
975
|
+
df = pd.read_excel(filepath, **kwargs)
|
|
976
|
+
return df, None
|
|
977
|
+
if suffix == ".csv":
|
|
978
|
+
df = pd.read_csv(filepath, sep=",", **kwargs)
|
|
979
|
+
return df, None
|
|
980
|
+
if suffix in {".tsv", ".tab"}:
|
|
981
|
+
df = pd.read_csv(filepath, sep="\t", **kwargs)
|
|
982
|
+
return df, None
|
|
983
|
+
if suffix == ".txt":
|
|
984
|
+
# Many omics count matrices are whitespace-delimited.
|
|
985
|
+
try:
|
|
986
|
+
df = pd.read_csv(filepath, sep=r"\s+", engine="python", **kwargs)
|
|
987
|
+
return df, None
|
|
988
|
+
except Exception:
|
|
989
|
+
df = pd.read_csv(filepath, sep="\t", **kwargs)
|
|
990
|
+
return df, None
|
|
991
|
+
# Generic fallback: delimiter sniffing for unknown text-like files.
|
|
992
|
+
df = pd.read_csv(filepath, sep=None, engine="python", **kwargs)
|
|
993
|
+
return df, None
|
|
994
|
+
except Exception as exc:
|
|
995
|
+
return None, f"Failed to read {filepath.name}: {str(exc)[:200]}"
|
|
996
|
+
|
|
997
|
+
|
|
998
|
+
def _parse_sample_groups(
|
|
999
|
+
df,
|
|
1000
|
+
group1: str = "",
|
|
1001
|
+
group2: str = "",
|
|
1002
|
+
*,
|
|
1003
|
+
auto_grouping: bool = False,
|
|
1004
|
+
min_group_size: int = 2,
|
|
1005
|
+
group_names: tuple[str, str] = ("group1", "group2"),
|
|
1006
|
+
) -> tuple[list[str], list[str], dict | None]:
|
|
1007
|
+
"""Resolve and validate group sample assignments for two-group comparisons."""
|
|
1008
|
+
all_samples = [str(c) for c in df.columns]
|
|
1009
|
+
g1_label, g2_label = group_names
|
|
1010
|
+
g1_samples = [s.strip() for s in group1.split(",") if s.strip()] if group1 else []
|
|
1011
|
+
g2_samples = [s.strip() for s in group2.split(",") if s.strip()] if group2 else []
|
|
1012
|
+
|
|
1013
|
+
# Require explicit groups unless user opts in to auto-splitting.
|
|
1014
|
+
if not g1_samples and not g2_samples:
|
|
1015
|
+
if not auto_grouping:
|
|
1016
|
+
return [], [], {
|
|
1017
|
+
"error": (
|
|
1018
|
+
f"Explicit sample groups are required. Provide {g1_label} and {g2_label} "
|
|
1019
|
+
"as comma-separated sample names. "
|
|
1020
|
+
"Set auto_grouping=True only for quick exploratory analysis."
|
|
1021
|
+
),
|
|
1022
|
+
"available_samples": all_samples[:30],
|
|
1023
|
+
"n_samples": len(all_samples),
|
|
1024
|
+
"summary": (
|
|
1025
|
+
f"No groups provided. Define {g1_label}/{g2_label} using sample names "
|
|
1026
|
+
f"(found {len(all_samples)} samples)."
|
|
1027
|
+
),
|
|
1028
|
+
}
|
|
1029
|
+
|
|
1030
|
+
if len(all_samples) < (min_group_size * 2):
|
|
1031
|
+
return [], [], {
|
|
1032
|
+
"error": (
|
|
1033
|
+
f"Need at least {min_group_size * 2} samples for auto_grouping "
|
|
1034
|
+
f"({min_group_size} per group), found {len(all_samples)}."
|
|
1035
|
+
),
|
|
1036
|
+
"available_samples": all_samples[:30],
|
|
1037
|
+
"summary": f"Too few samples for auto_grouping: {len(all_samples)}",
|
|
1038
|
+
}
|
|
1039
|
+
|
|
1040
|
+
mid = len(all_samples) // 2
|
|
1041
|
+
g1_samples = all_samples[:mid]
|
|
1042
|
+
g2_samples = all_samples[mid:]
|
|
1043
|
+
|
|
1044
|
+
elif (g1_samples and not g2_samples) or (g2_samples and not g1_samples):
|
|
1045
|
+
return [], [], {
|
|
1046
|
+
"error": f"Both {g1_label} and {g2_label} must be provided together.",
|
|
1047
|
+
"available_samples": all_samples[:30],
|
|
1048
|
+
"summary": f"Incomplete group definition: need both {g1_label} and {g2_label}",
|
|
1049
|
+
}
|
|
1050
|
+
|
|
1051
|
+
missing = [s for s in (g1_samples + g2_samples) if s not in all_samples]
|
|
1052
|
+
if missing:
|
|
1053
|
+
return [], [], {
|
|
1054
|
+
"error": f"Samples not found: {missing}",
|
|
1055
|
+
"available_samples": all_samples[:30],
|
|
1056
|
+
"summary": f"Sample names not found in matrix. Available: {', '.join(all_samples[:10])}",
|
|
1057
|
+
}
|
|
1058
|
+
|
|
1059
|
+
overlap = sorted(set(g1_samples).intersection(g2_samples))
|
|
1060
|
+
if overlap:
|
|
1061
|
+
return [], [], {
|
|
1062
|
+
"error": f"Samples cannot appear in both groups: {overlap}",
|
|
1063
|
+
"summary": "Group overlap detected",
|
|
1064
|
+
}
|
|
1065
|
+
|
|
1066
|
+
if len(g1_samples) < min_group_size or len(g2_samples) < min_group_size:
|
|
1067
|
+
return [], [], {
|
|
1068
|
+
"error": (
|
|
1069
|
+
f"Each group needs at least {min_group_size} samples. "
|
|
1070
|
+
f"Got {g1_label}={len(g1_samples)}, {g2_label}={len(g2_samples)}."
|
|
1071
|
+
),
|
|
1072
|
+
"summary": "Insufficient replicates per group",
|
|
1073
|
+
}
|
|
1074
|
+
|
|
1075
|
+
return g1_samples, g2_samples, None
|
|
1076
|
+
|
|
1077
|
+
|
|
1078
|
+
def _fdr_correct(pvalues):
|
|
1079
|
+
"""Benjamini-Hochberg FDR correction. Returns array of q-values."""
|
|
1080
|
+
import numpy as np
|
|
1081
|
+
|
|
1082
|
+
pvals = np.asarray(pvalues, dtype=float)
|
|
1083
|
+
n = len(pvals)
|
|
1084
|
+
if n == 0:
|
|
1085
|
+
return pvals
|
|
1086
|
+
ranked = pvals.argsort().argsort() + 1 # 1-based rank
|
|
1087
|
+
qvals = pvals * n / ranked
|
|
1088
|
+
# Enforce monotonicity (from largest p-value down)
|
|
1089
|
+
order = pvals.argsort()[::-1]
|
|
1090
|
+
qvals_sorted = qvals[order]
|
|
1091
|
+
for i in range(1, len(qvals_sorted)):
|
|
1092
|
+
if qvals_sorted[i] > qvals_sorted[i - 1]:
|
|
1093
|
+
qvals_sorted[i] = qvals_sorted[i - 1]
|
|
1094
|
+
qvals[order] = qvals_sorted
|
|
1095
|
+
return np.clip(qvals, 0, 1)
|
|
1096
|
+
|
|
1097
|
+
|
|
1098
|
+
# ---------------------------------------------------------------------------
|
|
1099
|
+
# 8. omics.methylation_diff
|
|
1100
|
+
# ---------------------------------------------------------------------------
|
|
1101
|
+
|
|
1102
|
+
|
|
1103
|
+
@registry.register(
    name="omics.methylation_diff",
    description="Differential methylation analysis between two sample groups",
    category="omics",
    parameters={
        "path": "Path to methylation beta-value matrix (rows=CpG sites, cols=samples)",
        "group1": "Comma-separated sample names for group 1",
        "group2": "Comma-separated sample names for group 2",
        "auto_grouping": "If true, splits samples by column order for exploratory use (default false)",
        "delta_beta_cutoff": "Minimum absolute delta-beta to call DMR (default 0.2)",
        "fdr_cutoff": "FDR significance threshold (default 0.05)",
    },
    usage_guide=(
        "Analyze differential methylation from beta-value matrices (e.g., Illumina 450K/EPIC). "
        "Requires a matrix with CpG sites as rows and samples as columns. "
        "Use after omics.geo_fetch or omics.tcga_fetch to download methylation data. "
        "For reliable analysis, provide explicit group1/group2 sample lists."
    ),
)
def methylation_diff(
    path: str,
    group1: str = "",
    group2: str = "",
    auto_grouping: bool = False,
    delta_beta_cutoff: float = 0.2,
    fdr_cutoff: float = 0.05,
    **kwargs,
) -> dict:
    """Differential methylation analysis between two groups.

    Runs a per-site Mann-Whitney U test on beta values, applies
    Benjamini-Hochberg FDR correction, and calls a site significant when
    FDR < fdr_cutoff AND |delta-beta| >= delta_beta_cutoff.

    Returns an error dict (with "error"/"summary" keys) on load or
    group-validation failure; otherwise a results dict with counts,
    top hits, and a human-readable summary.
    """
    import numpy as np
    from scipy import stats

    df, error = _load_tabular(path)
    if error:
        return {"error": error, "summary": f"Could not load methylation data: {error}"}

    g1_samples, g2_samples, group_error = _parse_sample_groups(
        df,
        group1=group1,
        group2=group2,
        auto_grouping=auto_grouping,
        min_group_size=2,
    )
    if group_error:
        return group_error

    # Restrict each group to sites with at least one observation, then
    # test only sites present in both groups.
    g1 = df[g1_samples].dropna(how="all")
    g2 = df[g2_samples].dropna(how="all")
    common_sites = g1.index.intersection(g2.index)
    g1 = g1.loc[common_sites]
    g2 = g2.loc[common_sites]

    # Effect size: difference of group mean beta values (group2 - group1).
    mean1 = g1.mean(axis=1)
    mean2 = g2.mean(axis=1)
    delta_beta = mean2 - mean1

    # Per-site p-values; list order must stay aligned with common_sites.
    pvals = []
    for site in common_sites:
        v1 = g1.loc[site].dropna().values
        v2 = g2.loc[site].dropna().values
        if len(v1) >= 2 and len(v2) >= 2:
            try:
                _, p = stats.mannwhitneyu(v1, v2, alternative="two-sided")
            except ValueError:
                # Older scipy raises when all observations are identical
                # (fully tied data); treat such degenerate sites as
                # non-significant instead of aborting the whole analysis.
                p = 1.0
            pvals.append(p)
        else:
            # Too few observations to test.
            pvals.append(1.0)

    pvals = np.array(pvals)
    qvals = _fdr_correct(pvals)

    # DMR calls require both statistical and effect-size thresholds.
    sig_mask = (qvals < fdr_cutoff) & (np.abs(delta_beta.values) >= delta_beta_cutoff)
    n_sig = int(sig_mask.sum())
    hyper = int(((qvals < fdr_cutoff) & (delta_beta.values >= delta_beta_cutoff)).sum())
    hypo = int(((qvals < fdr_cutoff) & (delta_beta.values <= -delta_beta_cutoff)).sum())

    import pandas as pd

    results_df = pd.DataFrame({
        "mean_group1": mean1,
        "mean_group2": mean2,
        "delta_beta": delta_beta,
        "pvalue": pvals,
        "fdr": qvals,
    }, index=common_sites)
    results_df = results_df.sort_values("fdr")
    top_hits = results_df.head(20).to_dict("index")

    return {
        "n_sites_tested": len(common_sites),
        "n_significant": n_sig,
        "n_hypermethylated": hyper,
        "n_hypomethylated": hypo,
        "group1_samples": g1_samples,
        "group2_samples": g2_samples,
        "auto_grouping_used": bool(auto_grouping and not group1 and not group2),
        "delta_beta_cutoff": delta_beta_cutoff,
        "fdr_cutoff": fdr_cutoff,
        "top_hits": top_hits,
        "summary": (
            f"Tested {len(common_sites):,} CpG sites: {n_sig} significant (FDR<{fdr_cutoff}, "
            f"|Δβ|≥{delta_beta_cutoff}). {hyper} hypermethylated, {hypo} hypomethylated."
        ),
    }
|
|
1208
|
+
|
|
1209
|
+
|
|
1210
|
+
# ---------------------------------------------------------------------------
|
|
1211
|
+
# 9. omics.methylation_profile
|
|
1212
|
+
# ---------------------------------------------------------------------------
|
|
1213
|
+
|
|
1214
|
+
|
|
1215
|
+
@registry.register(
    name="omics.methylation_profile",
    description="Summarize methylation landscape: distribution, variability, and global patterns",
    category="omics",
    parameters={
        "path": "Path to methylation beta-value matrix",
    },
    usage_guide=(
        "Get an overview of a methylation dataset: global methylation levels, "
        "bimodal distribution (typical of 450K/EPIC), most variable CpGs. "
        "Use as a first step before methylation_diff."
    ),
)
def methylation_profile(path: str, **kwargs) -> dict:
    """Summarize methylation dataset landscape."""
    import numpy as np

    df, error = _load_tabular(path)
    if error:
        return {"error": error, "summary": f"Could not load: {error}"}

    n_sites, n_samples = df.shape

    # Pool every non-missing beta value for the global distribution.
    betas = df.values.flatten()
    betas = betas[~np.isnan(betas)]

    global_mean = float(np.mean(betas))
    global_median = float(np.median(betas))
    # Beta values are typically bimodal: <0.2 unmethylated, >0.8 methylated.
    frac_low = float(np.mean(betas < 0.2))
    frac_mid = float(np.mean((betas >= 0.2) & (betas <= 0.8)))
    frac_high = float(np.mean(betas > 0.8))

    # Sites with the highest across-sample variance.
    variances = df.var(axis=1).dropna().sort_values(ascending=False)
    top_variable = list(variances.head(20).index)

    # Mean methylation per sample (QC signal for outlier samples).
    per_sample_mean = df.mean(axis=0).to_dict()

    return {
        "n_sites": n_sites,
        "n_samples": n_samples,
        "global_mean_beta": round(global_mean, 4),
        "global_median_beta": round(global_median, 4),
        "fraction_unmethylated": round(frac_low, 3),
        "fraction_intermediate": round(frac_mid, 3),
        "fraction_methylated": round(frac_high, 3),
        "top_variable_sites": top_variable,
        "sample_mean_betas": {k: round(v, 4) for k, v in list(per_sample_mean.items())[:20]},
        "summary": (
            f"Methylation profile: {n_sites:,} sites x {n_samples} samples. "
            f"Global mean β={global_mean:.3f}. "
            f"Distribution: {frac_low:.0%} low (<0.2), {frac_mid:.0%} intermediate, {frac_high:.0%} high (>0.8)."
        ),
    }
|
|
1270
|
+
|
|
1271
|
+
|
|
1272
|
+
# ---------------------------------------------------------------------------
|
|
1273
|
+
# 10. omics.proteomics_diff
|
|
1274
|
+
# ---------------------------------------------------------------------------
|
|
1275
|
+
|
|
1276
|
+
|
|
1277
|
+
@registry.register(
    name="omics.proteomics_diff",
    description="Differential protein abundance analysis between two groups",
    category="omics",
    parameters={
        "path": "Path to protein abundance matrix (rows=proteins, cols=samples)",
        "group1": "Comma-separated sample names for group 1",
        "group2": "Comma-separated sample names for group 2",
        "auto_grouping": "If true, splits samples by column order for exploratory use (default false)",
        "fc_cutoff": "Minimum absolute log2 fold-change (default 1.0)",
        "fdr_cutoff": "FDR significance threshold (default 0.05)",
    },
    usage_guide=(
        "Differential protein abundance from proteomics data (e.g., TMT, LFQ). "
        "Input is a protein x sample matrix of log2 abundances or intensities. "
        "Provide explicit group1/group2 sample lists for production analyses."
    ),
)
def proteomics_diff(
    path: str,
    group1: str = "",
    group2: str = "",
    auto_grouping: bool = False,
    fc_cutoff: float = 1.0,
    fdr_cutoff: float = 0.05,
    **kwargs,
) -> dict:
    """Differential protein abundance analysis.

    Runs a per-protein Mann-Whitney U test, applies Benjamini-Hochberg
    FDR correction, and calls a protein significant when
    FDR < fdr_cutoff AND |log2FC| >= fc_cutoff. The fold-change is the
    simple group-mean difference, so it is a log2 fold-change only if
    the input matrix is already log2-transformed.

    Returns an error dict (with "error"/"summary" keys) on load or
    group-validation failure; otherwise a results dict with counts,
    top hits, and a human-readable summary.
    """
    import numpy as np
    from scipy import stats

    df, error = _load_tabular(path)
    if error:
        return {"error": error, "summary": f"Could not load proteomics data: {error}"}

    g1_samples, g2_samples, group_error = _parse_sample_groups(
        df,
        group1=group1,
        group2=group2,
        auto_grouping=auto_grouping,
        min_group_size=2,
    )
    if group_error:
        return group_error

    g1 = df[g1_samples]
    g2 = df[g2_samples]

    mean1 = g1.mean(axis=1)
    mean2 = g2.mean(axis=1)
    log2fc = mean2 - mean1  # already log2 if input is log2

    # Per-protein p-values; list order must stay aligned with df.index.
    pvals = []
    for prot in df.index:
        v1 = g1.loc[prot].dropna().values
        v2 = g2.loc[prot].dropna().values
        if len(v1) >= 2 and len(v2) >= 2:
            try:
                _, p = stats.mannwhitneyu(v1, v2, alternative="two-sided")
            except ValueError:
                # Older scipy raises when all observations are identical
                # (fully tied data); treat such degenerate proteins as
                # non-significant instead of aborting the whole analysis.
                p = 1.0
            pvals.append(p)
        else:
            # Too few observations to test.
            pvals.append(1.0)

    pvals = np.array(pvals)
    qvals = _fdr_correct(pvals)

    # Significance requires both statistical and effect-size thresholds.
    sig_mask = (qvals < fdr_cutoff) & (np.abs(log2fc.values) >= fc_cutoff)
    n_sig = int(sig_mask.sum())
    n_up = int(((qvals < fdr_cutoff) & (log2fc.values >= fc_cutoff)).sum())
    n_down = int(((qvals < fdr_cutoff) & (log2fc.values <= -fc_cutoff)).sum())

    import pandas as pd

    results_df = pd.DataFrame({
        "mean_group1": mean1,
        "mean_group2": mean2,
        "log2fc": log2fc,
        "pvalue": pvals,
        "fdr": qvals,
    }, index=df.index)
    results_df = results_df.sort_values("fdr")

    return {
        "n_proteins_tested": len(df.index),
        "n_significant": n_sig,
        "n_upregulated": n_up,
        "n_downregulated": n_down,
        "group1_samples": g1_samples,
        "group2_samples": g2_samples,
        "auto_grouping_used": bool(auto_grouping and not group1 and not group2),
        "top_hits": results_df.head(20).to_dict("index"),
        "summary": (
            f"Tested {len(df.index):,} proteins: {n_sig} significant "
            f"(FDR<{fdr_cutoff}, |log2FC|≥{fc_cutoff}). {n_up} up, {n_down} down."
        ),
    }
|
|
1372
|
+
|
|
1373
|
+
|
|
1374
|
+
# ---------------------------------------------------------------------------
|
|
1375
|
+
# 11. omics.proteomics_enrich
|
|
1376
|
+
# ---------------------------------------------------------------------------
|
|
1377
|
+
|
|
1378
|
+
|
|
1379
|
+
def _parse_gene_list_file(path: str) -> tuple[set[str], str | None]:
|
|
1380
|
+
"""Load a gene list from text/CSV/TSV file and return an uppercase gene set."""
|
|
1381
|
+
import pandas as pd
|
|
1382
|
+
|
|
1383
|
+
fp = Path(path).expanduser()
|
|
1384
|
+
if not fp.exists():
|
|
1385
|
+
return set(), f"Background file not found: {path}"
|
|
1386
|
+
|
|
1387
|
+
suffix = fp.suffix.lower()
|
|
1388
|
+
try:
|
|
1389
|
+
if suffix == ".txt":
|
|
1390
|
+
genes = {
|
|
1391
|
+
line.strip().split("\t")[0].split(",")[0].strip().upper()
|
|
1392
|
+
for line in fp.read_text(errors="replace").splitlines()
|
|
1393
|
+
if line.strip()
|
|
1394
|
+
}
|
|
1395
|
+
return {g for g in genes if g}, None
|
|
1396
|
+
|
|
1397
|
+
if suffix in {".csv", ".tsv"}:
|
|
1398
|
+
sep = "," if suffix == ".csv" else "\t"
|
|
1399
|
+
df = pd.read_csv(fp, sep=sep)
|
|
1400
|
+
if df.empty:
|
|
1401
|
+
return set(), f"Background file is empty: {path}"
|
|
1402
|
+
first_col = df.columns[0]
|
|
1403
|
+
genes = {
|
|
1404
|
+
str(v).strip().upper()
|
|
1405
|
+
for v in df[first_col].dropna().tolist()
|
|
1406
|
+
if str(v).strip()
|
|
1407
|
+
}
|
|
1408
|
+
return genes, None
|
|
1409
|
+
|
|
1410
|
+
# Fallback: treat as newline-delimited text.
|
|
1411
|
+
genes = {
|
|
1412
|
+
line.strip().split("\t")[0].split(",")[0].strip().upper()
|
|
1413
|
+
for line in fp.read_text(errors="replace").splitlines()
|
|
1414
|
+
if line.strip()
|
|
1415
|
+
}
|
|
1416
|
+
return {g for g in genes if g}, None
|
|
1417
|
+
except Exception as exc:
|
|
1418
|
+
return set(), f"Failed to parse background file: {str(exc)[:200]}"
|
|
1419
|
+
|
|
1420
|
+
|
|
1421
|
+
def _enrichr_libraries_for_organism(organism: str) -> tuple[list[str] | None, str | None]:
|
|
1422
|
+
"""Map organism names to Enrichr libraries."""
|
|
1423
|
+
org = (organism or "Homo sapiens").strip().lower()
|
|
1424
|
+
human_aliases = {"human", "homo sapiens", "hs", "h. sapiens"}
|
|
1425
|
+
mouse_aliases = {"mouse", "mus musculus", "mm", "m. musculus"}
|
|
1426
|
+
|
|
1427
|
+
if org in human_aliases:
|
|
1428
|
+
return ["KEGG_2021_Human", "Reactome_2022", "GO_Biological_Process_2023"], None
|
|
1429
|
+
if org in mouse_aliases:
|
|
1430
|
+
return ["KEGG_2021_Mouse", "WikiPathway_2021_Mouse", "GO_Biological_Process_2023"], None
|
|
1431
|
+
return None, (
|
|
1432
|
+
f"Unsupported organism '{organism}'. "
|
|
1433
|
+
"Supported: Homo sapiens, Mus musculus."
|
|
1434
|
+
)
|
|
1435
|
+
|
|
1436
|
+
|
|
1437
|
+
@registry.register(
    name="omics.proteomics_enrich",
    description="Pathway enrichment analysis from a list of differentially abundant proteins",
    category="omics",
    parameters={
        "proteins": "Comma-separated list of protein/gene symbols",
        "background_path": "Path to full protein list (optional, for background set)",
        "organism": "Organism for gene set lookup (default 'Homo sapiens')",
    },
    usage_guide=(
        "Run over-representation analysis on a set of differentially expressed proteins. "
        "Uses Enrichr API for pathway databases (KEGG, Reactome, GO)."
    ),
)
def proteomics_enrich(
    proteins: str = "",
    background_path: str = "",
    organism: str = "Homo sapiens",
    **kwargs,
) -> dict:
    """Pathway enrichment for a protein list via Enrichr.

    Deduplicates the input symbols (case-insensitively, keeping the first
    spelling seen), optionally filters them against a background gene set,
    submits the list to the Enrichr web service, then queries each of the
    organism's libraries for over-represented terms.

    Args:
        proteins: Comma-separated protein/gene symbols.
        background_path: Optional path to a file listing the full measured
            proteome; input genes absent from it are dropped before
            submission (Enrichr has no custom-universe parameter).
        organism: Organism name resolved via _enrichr_libraries_for_organism.
        **kwargs: Accepted and ignored (registry call convention).

    Returns:
        dict with per-library "enrichment_results", any "library_errors",
        optional background bookkeeping keys, and a human-readable
        "summary". On failure, a dict with "error" and "summary" keys.
    """
    # Case-insensitive dedup that preserves input order and the
    # first-seen capitalization of each symbol.
    seen = set()
    gene_list = []
    for gene in (g.strip() for g in proteins.split(",") if g.strip()):
        key = gene.upper()
        if key not in seen:
            seen.add(key)
            gene_list.append(gene)
    if not gene_list:
        return {"error": "No proteins provided", "summary": "Empty protein list"}

    libraries, org_error = _enrichr_libraries_for_organism(organism)
    if org_error:
        return {"error": org_error, "summary": org_error}

    background_info = {}
    if background_path:
        background_genes, bg_error = _parse_gene_list_file(background_path)
        if bg_error:
            return {"error": bg_error, "summary": bg_error}
        if not background_genes:
            return {
                "error": "Background file contains no genes after parsing",
                "summary": f"Empty background set: {background_path}",
            }

        # Keep only input genes present in the background universe
        # (comparison is uppercase; _parse_gene_list_file uppercases too).
        original_n = len(gene_list)
        gene_list = [g for g in gene_list if g.upper() in background_genes]
        if not gene_list:
            return {
                "error": "None of the input genes were found in the provided background set",
                "background_gene_count": len(background_genes),
                "summary": "No overlap between input list and background set",
            }

        background_info = {
            "background_path": str(Path(background_path).expanduser()),
            "background_gene_count": len(background_genes),
            "n_proteins_before_background_filter": original_n,
            "n_proteins_after_background_filter": len(gene_list),
            # Enrichr endpoint has no custom-universe parameter; we apply background as input filter.
            "background_mode": "input_filter_only",
        }

    # Submit to Enrichr
    add_url = "https://maayanlab.cloud/Enrichr/addList"
    # (None, value) tuples make httpx send these as plain multipart form
    # fields (no filename), which is what the addList endpoint expects.
    payload = {"list": (None, "\n".join(gene_list)), "description": (None, "ct proteomics enrichment")}

    import httpx

    try:
        resp = httpx.post(add_url, files=payload, timeout=15)
        resp.raise_for_status()
        user_list_id = resp.json().get("userListId")
    except Exception as exc:
        return {"error": f"Enrichr submission failed: {str(exc)[:200]}", "summary": f"Enrichr error: {str(exc)[:100]}"}

    if not user_list_id:
        return {"error": "Enrichr did not return a list ID", "summary": "Enrichr submission failed"}

    # Query key libraries; a failure in one library is recorded but does
    # not abort the others.
    all_results = {}
    library_errors = {}

    for lib in libraries:
        enrich_url = f"https://maayanlab.cloud/Enrichr/enrich?userListId={user_list_id}&backgroundType={lib}"
        try:
            resp = httpx.get(enrich_url, timeout=15)
            resp.raise_for_status()
            data = resp.json()
            terms = data.get(lib, [])
            top_terms = []
            for term in terms[:10]:
                # Enrichr /enrich rows are positional lists; per the Enrichr
                # API docs: 1=term name, 2=p-value, 3=z-score, 5=overlapping
                # genes, 6=adjusted p-value.
                # NOTE(review): index 3 is documented by Enrichr as the
                # z-score, so the "odds_ratio" key looks mislabeled —
                # confirm before renaming (callers may rely on the key).
                top_terms.append({
                    "term": term[1],
                    "pvalue": term[2],
                    "adj_pvalue": term[6],
                    "odds_ratio": term[3],
                    "genes": term[5],
                })
            all_results[lib] = top_terms
        except Exception as exc:
            all_results[lib] = []
            library_errors[lib] = str(exc)[:200]

    # Flatten top hits: up to 3 significant terms per library (FDR < 0.05)
    # for the one-line summary.
    top_summary = []
    for lib, terms in all_results.items():
        for t in terms[:3]:
            if t["adj_pvalue"] < 0.05:
                top_summary.append(f"{t['term']} (q={t['adj_pvalue']:.2e})")

    return {
        "n_proteins_submitted": len(gene_list),
        "organism": organism,
        "libraries": libraries,
        "enrichment_results": all_results,
        "library_errors": library_errors,
        **background_info,
        "summary": (
            f"Enrichment of {len(gene_list)} proteins. "
            + (f"Top enriched: {'; '.join(top_summary[:5])}" if top_summary else "No significant enrichments (FDR<0.05).")
            + (" Background set applied as input filter." if background_path else "")
        ),
    }
|
|
1562
|
+
|
|
1563
|
+
|
|
1564
|
+
# ---------------------------------------------------------------------------
|
|
1565
|
+
# 12. omics.atac_peak_annotate
|
|
1566
|
+
# ---------------------------------------------------------------------------
|
|
1567
|
+
|
|
1568
|
+
|
|
1569
|
+
@registry.register(
|
|
1570
|
+
name="omics.atac_peak_annotate",
|
|
1571
|
+
description="Annotate ATAC-seq peaks by genomic features and summarize accessibility landscape",
|
|
1572
|
+
category="omics",
|
|
1573
|
+
parameters={
|
|
1574
|
+
"path": "Path to peak file (BED-like CSV/TSV with chr, start, end columns or peak count matrix)",
|
|
1575
|
+
},
|
|
1576
|
+
usage_guide=(
|
|
1577
|
+
"Summarize ATAC-seq peak data: genomic distribution, peak sizes, "
|
|
1578
|
+
"chromosome distribution. Works on BED-like files or peak count matrices. "
|
|
1579
|
+
"Use after omics.geo_fetch to download ATAC-seq data."
|
|
1580
|
+
),
|
|
1581
|
+
)
|
|
1582
|
+
def atac_peak_annotate(path: str, **kwargs) -> dict:
    """Annotate and summarize ATAC-seq peaks.

    Reads a CSV/TSV file and branches on its shape:

    * BED-like (chr/start/end columns, by name or position) — reports peak
      count, per-chromosome distribution, width statistics, and a
      width-based heuristic split into promoter-like / enhancer-like /
      broad peaks.
    * Anything else — treated as a peak count matrix; only its dimensions
      and leading columns are reported.

    Args:
        path: Path to the peak file. ``.csv`` is read comma-separated;
            any other extension is read tab-separated. Lines starting
            with ``#`` are skipped.
        **kwargs: Accepted and ignored (registry call convention).

    Returns:
        dict of summary statistics, or a dict with "error" and "summary"
        keys on failure.
    """
    import pandas as pd

    filepath = Path(path).expanduser()
    if not filepath.exists():
        return {"error": f"File not found: {path}", "summary": f"File not found: {path}"}

    suffix = filepath.suffix.lower()
    sep = "," if suffix == ".csv" else "\t"

    try:
        df = pd.read_csv(filepath, sep=sep, comment="#")
    except Exception as exc:
        return {"error": f"Failed to read: {str(exc)[:200]}", "summary": f"Parse error: {str(exc)[:100]}"}

    # Detect BED-like format: look for chr/start/end columns by name first.
    col_lower = {c.lower(): c for c in df.columns}
    chr_col = col_lower.get("chr") or col_lower.get("chrom") or col_lower.get("chromosome")
    start_col = col_lower.get("start") or col_lower.get("chromstart")
    end_col = col_lower.get("end") or col_lower.get("chromend")

    # Fallback: positional detection — treat the first 3 columns as
    # chr/start/end when most values in column 0 look like "chr*" names.
    if not chr_col and len(df.columns) >= 3:
        first_col_vals = df.iloc[:, 0].astype(str)
        if first_col_vals.str.startswith("chr").mean() > 0.5:
            chr_col = df.columns[0]
            start_col = df.columns[1]
            end_col = df.columns[2]

    if chr_col and start_col and end_col:
        # BED-like format: coerce coordinates to numbers, drop bad rows.
        peaks = df[[chr_col, start_col, end_col]].copy()
        peaks.columns = ["chr", "start", "end"]
        peaks["start"] = pd.to_numeric(peaks["start"], errors="coerce")
        peaks["end"] = pd.to_numeric(peaks["end"], errors="coerce")
        peaks = peaks.dropna()
        if peaks.empty:
            # Guard: with zero valid rows, min()/max() below would raise.
            return {
                "error": "No valid peaks after coordinate parsing",
                "summary": f"No parseable chr/start/end rows in {path}",
            }
        peaks["width"] = peaks["end"] - peaks["start"]

        n_peaks = len(peaks)
        chr_counts = peaks["chr"].value_counts().head(24).to_dict()
        width_stats = {
            "mean": round(float(peaks["width"].mean()), 0),
            "median": round(float(peaks["width"].median()), 0),
            "min": int(peaks["width"].min()),
            "max": int(peaks["width"].max()),
        }

        # Width-only heuristic for genomic feature classes (no annotation
        # database needed): narrow peaks tend to sit at promoters,
        # mid-width at enhancers, very wide ones are broad domains.
        promoter_like = int((peaks["width"] < 500).sum())
        enhancer_like = int(((peaks["width"] >= 500) & (peaks["width"] < 2000)).sum())
        broad_peaks = int((peaks["width"] >= 2000).sum())

        return {
            "n_peaks": n_peaks,
            "chromosome_distribution": chr_counts,
            "peak_width_stats": width_stats,
            "promoter_like_peaks": promoter_like,
            "enhancer_like_peaks": enhancer_like,
            "broad_peaks": broad_peaks,
            "summary": (
                f"ATAC-seq: {n_peaks:,} peaks. Median width: {width_stats['median']:.0f} bp. "
                f"Estimated: {promoter_like:,} promoter-like (<500bp), "
                f"{enhancer_like:,} enhancer-like (500-2000bp), {broad_peaks:,} broad (>2000bp). "
                f"Top chromosomes: {', '.join(f'{k}:{v}' for k, v in list(chr_counts.items())[:5])}"
            ),
        }
    else:
        # Peak count matrix (peaks x samples): report dimensions only.
        n_peaks, n_samples = df.shape
        return {
            "n_peaks": n_peaks,
            "n_samples": n_samples,
            "columns": list(df.columns[:20]),
            "summary": (
                f"ATAC-seq count matrix: {n_peaks:,} peaks x {n_samples} samples. "
                "Use omics.chromatin_accessibility for differential analysis."
            ),
        }
|
|
1662
|
+
|
|
1663
|
+
|
|
1664
|
+
# ---------------------------------------------------------------------------
|
|
1665
|
+
# 13. omics.chromatin_accessibility
|
|
1666
|
+
# ---------------------------------------------------------------------------
|
|
1667
|
+
|
|
1668
|
+
|
|
1669
|
+
@registry.register(
    name="omics.chromatin_accessibility",
    description="Differential chromatin accessibility analysis between two sample groups",
    category="omics",
    parameters={
        "path": "Path to peak count matrix (rows=peaks/genes, cols=samples)",
        "group1": "Comma-separated sample names for group 1",
        "group2": "Comma-separated sample names for group 2",
        "auto_grouping": "If true, splits samples by column order for exploratory use (default false)",
        "fdr_cutoff": "FDR threshold (default 0.05)",
    },
    usage_guide=(
        "Compare chromatin accessibility between groups from ATAC-seq count matrices. "
        "Works on peak-level or gene-level accessibility scores. "
        "Provide explicit group1/group2 sample lists for robust comparisons."
    ),
)
def chromatin_accessibility(
    path: str,
    group1: str = "",
    group2: str = "",
    auto_grouping: bool = False,
    fdr_cutoff: float = 0.05,
    **kwargs,
) -> dict:
    """Differential chromatin accessibility analysis.

    Loads a region x sample matrix, splits samples into two groups,
    computes a pseudocounted log2 fold-change of group means per region,
    tests each region with a two-sided Mann-Whitney U, and applies FDR
    correction before counting gained/lost regions at the cutoff.
    """
    import numpy as np
    from scipy import stats

    table, load_error = _load_tabular(path)
    if load_error:
        return {"error": load_error, "summary": f"Could not load: {load_error}"}

    samples_a, samples_b, grouping_problem = _parse_sample_groups(
        table,
        group1=group1,
        group2=group2,
        auto_grouping=auto_grouping,
        min_group_size=2,
    )
    if grouping_problem:
        return grouping_problem

    block_a = table[samples_a]
    block_b = table[samples_b]

    avg_a = block_a.mean(axis=1)
    avg_b = block_b.mean(axis=1)
    # Pseudocount of 1 keeps log2 defined when a group mean is zero.
    log2fc = np.log2((avg_b + 1) / (avg_a + 1))

    def _region_pvalue(region) -> float:
        # Mann-Whitney needs at least two observations per side; regions
        # with fewer get a non-significant p of 1.0.
        left = block_a.loc[region].dropna().values
        right = block_b.loc[region].dropna().values
        if len(left) < 2 or len(right) < 2:
            return 1.0
        return stats.mannwhitneyu(left, right, alternative="two-sided")[1]

    pvals = np.array([_region_pvalue(region) for region in table.index])
    qvals = _fdr_correct(pvals)

    significant = qvals < fdr_cutoff
    n_sig = int(significant.sum())
    n_gained = int((significant & (log2fc.values > 0)).sum())
    n_lost = int((significant & (log2fc.values < 0)).sum())

    import pandas as pd

    ranked = pd.DataFrame({
        "mean_group1": avg_a, "mean_group2": avg_b,
        "log2fc": log2fc, "pvalue": pvals, "fdr": qvals,
    }, index=table.index).sort_values("fdr")

    return {
        "n_regions_tested": len(table.index),
        "n_significant": n_sig,
        "n_more_accessible": n_gained,
        "n_less_accessible": n_lost,
        "group1_samples": samples_a,
        "group2_samples": samples_b,
        "auto_grouping_used": bool(auto_grouping and not group1 and not group2),
        "top_hits": ranked.head(20).to_dict("index"),
        "summary": (
            f"Tested {len(table.index):,} regions: {n_sig} differentially accessible "
            f"(FDR<{fdr_cutoff}). {n_gained} gained, {n_lost} lost accessibility."
        ),
    }
|
|
1759
|
+
|
|
1760
|
+
|
|
1761
|
+
# ---------------------------------------------------------------------------
|
|
1762
|
+
# 14. omics.chipseq_enrich
|
|
1763
|
+
# ---------------------------------------------------------------------------
|
|
1764
|
+
|
|
1765
|
+
|
|
1766
|
+
@registry.register(
|
|
1767
|
+
name="omics.chipseq_enrich",
|
|
1768
|
+
description="Enrichment analysis of ChIP-seq target genes",
|
|
1769
|
+
category="omics",
|
|
1770
|
+
parameters={
|
|
1771
|
+
"path": "Path to peak file with gene annotations (CSV/TSV with a gene column)",
|
|
1772
|
+
"gene_column": "Column name containing gene symbols (default auto-detect)",
|
|
1773
|
+
},
|
|
1774
|
+
usage_guide=(
|
|
1775
|
+
"Extract target genes from ChIP-seq peak annotations and run pathway "
|
|
1776
|
+
"enrichment. Works on peak files that include nearest-gene annotations."
|
|
1777
|
+
),
|
|
1778
|
+
)
|
|
1779
|
+
def chipseq_enrich(path: str, gene_column: str = "", **kwargs) -> dict:
    """Enrichment analysis of ChIP-seq target genes.

    Reads an annotated peak table, locates the gene-symbol column
    (explicit name, known candidates, then case-insensitive fallback),
    cleans the gene list, and delegates pathway enrichment to
    proteomics_enrich.
    """
    import pandas as pd

    peak_file = Path(path).expanduser()
    if not peak_file.exists():
        return {"error": f"File not found: {path}", "summary": f"File not found: {path}"}

    delimiter = "," if peak_file.suffix.lower() == ".csv" else "\t"
    try:
        table = pd.read_csv(peak_file, sep=delimiter, comment="#")
    except Exception as exc:
        return {"error": f"Failed to read: {str(exc)[:200]}", "summary": f"Parse error: {str(exc)[:100]}"}

    # Resolve the gene column: caller-specified name wins if present.
    gcol = gene_column if gene_column and gene_column in table.columns else None
    if gcol is None:
        # Exact-name candidates in priority order.
        for candidate in ("gene", "gene_name", "symbol", "gene_symbol", "nearest_gene",
                          "GENE", "Gene", "SYMBOL", "geneName"):
            if candidate in table.columns:
                gcol = candidate
                break
    if gcol is None:
        # Last resort: case-insensitive match on the common names.
        lowered = {c.lower(): c for c in table.columns}
        for candidate in ("gene", "gene_name", "symbol"):
            if candidate in lowered:
                gcol = lowered[candidate]
                break

    if gcol is None:
        return {
            "error": "No gene column found. Provide gene_column parameter.",
            "available_columns": list(table.columns[:20]),
            "summary": f"Could not auto-detect gene column. Columns: {', '.join(table.columns[:10])}",
        }

    # Unique, stripped symbols; drop blanks and literal "nan" artifacts.
    raw_values = table[gcol].dropna().unique().tolist()
    genes = [str(g).strip() for g in raw_values if str(g).strip() and str(g).strip().upper() != "NAN"]

    if not genes:
        return {"error": "No genes found in column", "summary": "Empty gene list after filtering"}

    # Delegate to Enrichr
    return proteomics_enrich(proteins=",".join(genes), **kwargs)
|
|
1828
|
+
|
|
1829
|
+
|
|
1830
|
+
# ---------------------------------------------------------------------------
|
|
1831
|
+
# 15. omics.spatial_cluster
|
|
1832
|
+
# ---------------------------------------------------------------------------
|
|
1833
|
+
|
|
1834
|
+
|
|
1835
|
+
@registry.register(
    name="omics.spatial_cluster",
    description="Spatial-aware clustering of spatial transcriptomics data",
    category="omics",
    parameters={
        "path": "Path to h5ad file with spatial coordinates in .obsm['spatial']",
        "resolution": "Leiden clustering resolution (default 1.0)",
        "n_neighbors": "Number of spatial neighbors (default 15)",
    },
    usage_guide=(
        "Cluster spatial transcriptomics data (Visium, MERFISH, etc.) using "
        "both expression similarity and spatial proximity. Requires scanpy; "
        "squidpy is optional for enhanced spatial analysis."
    ),
)
def spatial_cluster(
    path: str,
    resolution: float = 1.0,
    n_neighbors: int = 15,
    **kwargs,
) -> dict:
    """Spatial-aware clustering of spatial transcriptomics data.

    Loads an h5ad file, runs a standard scanpy pipeline (normalize if the
    data look like raw counts, HVG selection, PCA, expression neighbors),
    optionally blends in a squidpy spatial-neighbor graph, then Leiden
    clustering. Returns cluster sizes and, when possible, top marker
    genes per cluster.

    Args:
        path: Path to the ``.h5ad`` file.
        resolution: Leiden resolution passed to ``sc.tl.leiden``.
        n_neighbors: Neighbor count for both expression and spatial graphs.
        **kwargs: Accepted and ignored (registry call convention).

    Returns:
        dict with cluster statistics and a "summary"; on failure a dict
        with "error" and "summary" keys.
    """
    filepath = Path(path).expanduser()
    if not filepath.exists():
        return {"error": f"File not found: {path}", "summary": f"File not found: {path}"}

    sc = _check_scanpy()
    if sc is None:
        return {
            "error": "scanpy required. Install with: pip install scanpy",
            "summary": "Install scanpy for spatial clustering: pip install scanpy",
        }

    try:
        adata = sc.read_h5ad(filepath)
    except Exception as exc:
        return {"error": f"Failed to load h5ad: {str(exc)[:200]}", "summary": f"Could not read file: {str(exc)[:100]}"}

    has_spatial = "spatial" in (adata.obsm or {})

    # Standard preprocessing if raw
    # Heuristic: log-normalized values rarely exceed 50, raw counts do.
    # Assumes adata.X supports .max() — TODO confirm for backed/None X.
    if adata.X.max() > 50:  # likely raw counts
        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)

    sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
    # Cap n_comps so PCA is valid for small matrices.
    sc.tl.pca(adata, n_comps=min(50, adata.n_vars - 1, adata.n_obs - 1))
    sc.pp.neighbors(adata, n_neighbors=n_neighbors)

    # Try squidpy for spatial neighbors (optional dependency).
    sq = None
    try:
        import squidpy as sq_mod
        sq = sq_mod
    except ImportError:
        pass

    if has_spatial and sq:
        try:
            sq.gr.spatial_neighbors(adata, n_neighs=n_neighbors)
            # Combine spatial + expression connectivity as an equal-weight
            # average; Leiden below reads adata.obsp["connectivities"].
            from scipy.sparse import csr_matrix
            expr_conn = adata.obsp.get("connectivities")
            spatial_conn = adata.obsp.get("spatial_connectivities")
            if expr_conn is not None and spatial_conn is not None:
                combined = 0.5 * expr_conn + 0.5 * spatial_conn
                adata.obsp["connectivities"] = csr_matrix(combined)
        except Exception:
            pass  # Fall back to expression-only neighbors

    sc.tl.leiden(adata, resolution=resolution, key_added="spatial_cluster")

    clusters = adata.obs["spatial_cluster"].value_counts().to_dict()
    n_clusters = len(clusters)

    result = {
        "n_cells": adata.n_obs,
        "n_genes": adata.n_vars,
        "n_clusters": n_clusters,
        "cluster_sizes": clusters,
        "has_spatial_coords": has_spatial,
        "used_squidpy": sq is not None and has_spatial,
        "resolution": resolution,
        "summary": (
            f"Spatial clustering: {adata.n_obs:,} cells → {n_clusters} clusters "
            f"(resolution={resolution}). "
            f"{'Used spatial+expression neighbors (squidpy).' if sq and has_spatial else 'Expression-based neighbors only.'} "
            f"Largest cluster: {max(clusters.values()):,} cells."
        ),
    }

    # Try to find marker genes per cluster; best-effort, failures are
    # swallowed and the result simply omits "cluster_markers".
    try:
        sc.tl.rank_genes_groups(adata, "spatial_cluster", method="wilcoxon", n_genes=5)
        markers = {}
        for cl in adata.obs["spatial_cluster"].unique():
            genes = list(adata.uns["rank_genes_groups"]["names"][cl][:5])
            markers[str(cl)] = genes
        result["cluster_markers"] = markers
    except Exception:
        pass

    return result
|
|
1938
|
+
|
|
1939
|
+
|
|
1940
|
+
# ---------------------------------------------------------------------------
|
|
1941
|
+
# 16. omics.spatial_autocorrelation
|
|
1942
|
+
# ---------------------------------------------------------------------------
|
|
1943
|
+
|
|
1944
|
+
|
|
1945
|
+
@registry.register(
    name="omics.spatial_autocorrelation",
    description="Compute spatial autocorrelation (Moran's I) for gene expression patterns",
    category="omics",
    parameters={
        "path": "Path to h5ad file with spatial coordinates",
        "genes": "Comma-separated gene names to test (default: top variable genes)",
        "n_genes": "Number of top variable genes to test if genes not specified (default 50)",
    },
    usage_guide=(
        "Test whether gene expression shows spatial patterning using Moran's I. "
        "High Moran's I = spatially clustered expression. Requires scanpy."
    ),
)
def spatial_autocorrelation(
    path: str,
    genes: str = "",
    n_genes: int = 50,
    **kwargs,
) -> dict:
    """Compute Moran's I spatial autocorrelation for genes.

    Loads an h5ad file with spatial coordinates, builds a squidpy spatial
    neighbor graph, and runs ``sq.gr.spatial_autocorr`` (Moran's I) on
    either the caller-specified genes or the top highly variable genes.

    Args:
        path: Path to the ``.h5ad`` file; must have ``.obsm['spatial']``.
        genes: Optional comma-separated gene names to test.
        n_genes: Number of highly variable genes used when ``genes`` is empty.
        **kwargs: Accepted and ignored (registry call convention).

    Returns:
        dict with Moran's I results and a "summary"; on failure a dict
        with "error" and "summary" keys.
    """
    filepath = Path(path).expanduser()
    if not filepath.exists():
        return {"error": f"File not found: {path}", "summary": f"File not found: {path}"}

    sc = _check_scanpy()
    if sc is None:
        return {"error": "scanpy required", "summary": "Install scanpy: pip install scanpy"}

    try:
        adata = sc.read_h5ad(filepath)
    except Exception as exc:
        return {"error": f"Failed to load: {str(exc)[:200]}", "summary": f"Read error: {str(exc)[:100]}"}

    has_spatial = "spatial" in (adata.obsm or {})
    if not has_spatial:
        return {"error": "No spatial coordinates found in .obsm['spatial']", "summary": "Not spatial data: no coordinates found"}

    # Try squidpy — required here (unlike spatial_cluster, there is no fallback).
    try:
        import squidpy as sq
    except ImportError:
        return {"error": "squidpy required for Moran's I. Install: pip install squidpy", "summary": "Install squidpy: pip install squidpy"}

    # Preprocess if needed: values above 50 suggest raw counts.
    # Assumes adata.X supports .max() — TODO confirm for backed/None X.
    if adata.X.max() > 50:
        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)

    # Build spatial graph
    sq.gr.spatial_neighbors(adata)

    # Select genes: explicit list, else top highly variable genes.
    gene_list = [g.strip() for g in genes.split(",") if g.strip()] if genes else []
    if not gene_list:
        sc.pp.highly_variable_genes(adata, n_top_genes=min(n_genes, adata.n_vars))
        gene_list = list(adata.var_names[adata.var["highly_variable"]])[:n_genes]

    # Filter to genes present in data
    valid_genes = [g for g in gene_list if g in adata.var_names]
    if not valid_genes:
        return {"error": "None of the specified genes found in dataset", "summary": "No matching genes in data"}

    # Compute Moran's I; squidpy stores results in adata.uns["moranI"].
    sq.gr.spatial_autocorr(adata, mode="moran", genes=valid_genes)

    moranI = adata.uns.get("moranI")
    if moranI is None:
        return {"error": "Moran's I computation failed", "summary": "Spatial autocorrelation computation failed"}

    # Rank by the I statistic (higher = more spatially clustered expression).
    results = moranI.sort_values("I", ascending=False)
    top_spatial = results.head(20).to_dict("index")

    # Significance taken from squidpy's normal-approximation p-value column.
    highly_spatial = results[results["pval_norm"] < 0.05]
    n_spatial = len(highly_spatial)

    return {
        "n_genes_tested": len(valid_genes),
        "n_spatially_patterned": n_spatial,
        "top_spatial_genes": top_spatial,
        "summary": (
            f"Moran's I on {len(valid_genes)} genes: {n_spatial} show significant spatial "
            f"patterning (p<0.05). Top: "
            + ", ".join(f"{g} (I={row['I']:.3f})" for g, row in list(results.head(5).iterrows()))
        ),
    }
|
|
2031
|
+
|
|
2032
|
+
|
|
2033
|
+
# ---------------------------------------------------------------------------
|
|
2034
|
+
# 17. omics.cytof_cluster
|
|
2035
|
+
# ---------------------------------------------------------------------------
|
|
2036
|
+
|
|
2037
|
+
|
|
2038
|
+
@registry.register(
    name="omics.cytof_cluster",
    description="Cluster CyTOF or flow cytometry data and characterize marker expression per cluster",
    category="omics",
    parameters={
        "path": "Path to CyTOF/flow data (CSV with markers as columns, cells as rows)",
        "n_clusters": "Number of clusters for KMeans (default 10). Use 0 for auto (Leiden).",
        "markers": "Comma-separated marker columns to use (default: all numeric columns)",
    },
    usage_guide=(
        "Cluster mass/flow cytometry data. Input is a cells x markers matrix. "
        "Identifies cell populations and characterizes each by marker expression."
    ),
)
def cytof_cluster(
    path: str,
    n_clusters: int = 10,
    markers: str = "",
    **kwargs,
) -> dict:
    """Cluster CyTOF/flow cytometry data.

    Loads a cells x markers table, z-scores the selected marker columns,
    clusters with MiniBatchKMeans (or Leiden via scanpy when
    ``n_clusters == 0`` and scanpy is available), and reports per-cluster
    sizes, median marker profiles, and the top defining markers
    (largest z-score of the cluster median vs the global mean).

    Args:
        path: Path to the data file (CSV or TSV).
        n_clusters: KMeans cluster count; 0 requests Leiden clustering.
        markers: Optional comma-separated marker column names; defaults
            to every numeric column.
        **kwargs: Accepted and ignored (registry call convention).

    Returns:
        dict with cluster statistics and a "summary"; on failure a dict
        with "error" and "summary" keys.
    """
    import numpy as np

    df, error = _load_tabular(path)
    if error:
        # Try without index_col since CyTOF data often has no row names
        import pandas as pd

        filepath = Path(path).expanduser()
        if not filepath.exists():
            return {"error": error, "summary": f"Could not load: {error}"}
        suffix = filepath.suffix.lower()
        sep = "," if suffix == ".csv" else "\t"
        try:
            df = pd.read_csv(filepath, sep=sep)
        except Exception as exc2:
            return {"error": str(exc2), "summary": f"Could not load: {str(exc2)[:100]}"}

    marker_cols = [m.strip() for m in markers.split(",") if m.strip()] if markers else []
    if marker_cols:
        # Validate caller-specified markers up front instead of letting
        # df[marker_cols] raise a raw KeyError below.
        missing = [m for m in marker_cols if m not in df.columns]
        if missing:
            return {
                "error": f"Marker columns not found: {', '.join(missing[:10])}",
                "available_columns": list(df.columns[:20]),
                "summary": f"Unknown marker columns: {', '.join(missing[:5])}",
            }
    else:
        marker_cols = list(df.select_dtypes(include=[np.number]).columns)

    if not marker_cols:
        return {"error": "No numeric marker columns found", "summary": "No numeric columns in data"}

    data = df[marker_cols].dropna()
    n_cells = len(data)
    if n_cells < 10:
        return {"error": f"Too few cells ({n_cells}) for clustering", "summary": f"Only {n_cells} cells — need at least 10"}

    # Standardize each marker to zero mean / unit variance before clustering.
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    scaled = scaler.fit_transform(data.values)

    # Cluster
    if n_clusters > 0:
        from sklearn.cluster import MiniBatchKMeans

        model = MiniBatchKMeans(n_clusters=min(n_clusters, n_cells), random_state=42, n_init=3)
        labels = model.fit_predict(scaled)
    else:
        # Use Leiden via scanpy on a neighbors graph
        sc = _check_scanpy()
        if sc is not None:
            import anndata

            adata = anndata.AnnData(X=scaled)
            sc.pp.neighbors(adata, n_neighbors=15)
            sc.tl.leiden(adata, resolution=1.0)
            labels = adata.obs["leiden"].astype(int).values
        else:
            # scanpy unavailable: fall back to KMeans with the default k.
            from sklearn.cluster import MiniBatchKMeans

            model = MiniBatchKMeans(n_clusters=10, random_state=42, n_init=3)
            labels = model.fit_predict(scaled)

    data = data.copy()
    data["cluster"] = labels

    cluster_sizes = data["cluster"].value_counts().sort_index().to_dict()
    n_clusters_found = len(cluster_sizes)

    # Per-cluster marker expression (median)
    cluster_medians = data.groupby("cluster")[marker_cols].median()
    cluster_profiles = cluster_medians.to_dict("index")

    # Find defining markers per cluster (highest z-score of the cluster
    # median against the global mean; zero-variance markers use std=1).
    defining_markers = {}
    global_means = data[marker_cols].mean()
    global_stds = data[marker_cols].std().replace(0, 1)
    for cl in sorted(cluster_sizes.keys()):
        cl_means = cluster_medians.loc[cl]
        z_scores = ((cl_means - global_means) / global_stds).sort_values(ascending=False)
        defining_markers[str(cl)] = list(z_scores.head(5).index)

    return {
        "n_cells": n_cells,
        "n_markers": len(marker_cols),
        "n_clusters": n_clusters_found,
        "cluster_sizes": {str(k): v for k, v in cluster_sizes.items()},
        "defining_markers": defining_markers,
        "cluster_profiles": {str(k): {mk: round(v, 3) for mk, v in prof.items()} for k, prof in cluster_profiles.items()},
        "summary": (
            f"CyTOF clustering: {n_cells:,} cells x {len(marker_cols)} markers → "
            f"{n_clusters_found} clusters. Largest: {max(cluster_sizes.values()):,} cells. "
            "Top defining markers per cluster identified."
        ),
    }
|
|
2150
|
+
|
|
2151
|
+
|
|
2152
|
+
# ---------------------------------------------------------------------------
|
|
2153
|
+
# 18. omics.hic_compartments
|
|
2154
|
+
# ---------------------------------------------------------------------------
|
|
2155
|
+
|
|
2156
|
+
|
|
2157
|
+
@registry.register(
    name="omics.hic_compartments",
    description="Identify A/B compartments from Hi-C contact matrices",
    category="omics",
    parameters={
        "path": "Path to Hi-C contact matrix (CSV/TSV, symmetric matrix with genomic bins)",
        "resolution": "Bin resolution description (for reporting, e.g. '50kb')",
    },
    usage_guide=(
        "Identify chromatin A/B compartments from Hi-C contact frequency matrices. "
        "A compartments are gene-rich/active, B compartments are gene-poor/repressed. "
        "Input should be a symmetric bin x bin contact matrix."
    ),
)
def hic_compartments(path: str, resolution: str = "unknown", **kwargs) -> dict:
    """Call A/B chromatin compartments from a Hi-C contact matrix via PCA.

    Pipeline: observed/expected normalization of the raw contact matrix,
    bin-by-bin Pearson correlation of the O/E profiles, then the leading
    eigenvector (PC1) of that correlation matrix. Bins with positive PC1
    are labeled "A", the rest "B" (sign of PC1 is the eigen-solver's
    convention; without gene-density data the A/B orientation is arbitrary).
    Returns an error payload instead of raising when the input is unusable.
    """
    import numpy as np

    df, error = _load_tabular(path)
    if error:
        return {"error": error, "summary": f"Could not load: {error}"}

    n_bins, n_cols = df.shape
    if n_bins != n_cols:
        return {
            "error": f"Expected symmetric matrix, got {df.shape[0]}x{df.shape[1]}",
            "summary": "Hi-C contact matrix must be square (symmetric)",
        }
    if n_bins < 3:
        return {"error": f"Too few bins ({n_bins})", "summary": "Need at least 3 bins for compartment analysis"}

    # Raw contacts as floats; NaNs are treated as zero contact frequency.
    contacts = np.nan_to_num(df.values.astype(float), nan=0.0)

    # Observed/expected normalization using marginal coverage per bin.
    coverage = contacts.sum(axis=1)
    coverage[coverage == 0] = 1  # empty bins: avoid division by zero
    expected = np.outer(coverage, coverage) / coverage.sum()
    expected[expected == 0] = 1
    obs_exp = contacts / expected

    # Correlation of O/E row profiles; constant rows yield NaN -> force to 0.
    with np.errstate(divide="ignore", invalid="ignore"):
        corr = np.corrcoef(obs_exp)
    corr = np.nan_to_num(corr, nan=0.0)

    # eigh returns eigenvalues in ascending order, so the last column of the
    # eigenvector matrix is PC1 (largest eigenvalue).
    eigvals, eigvecs = np.linalg.eigh(corr)
    pc1 = eigvecs[:, -1]

    # Positive PC1 -> "A" compartment (see docstring caveat on orientation).
    labels = np.where(pc1 > 0, "A", "B")
    n_A = int((labels == "A").sum())
    n_B = int((labels == "B").sum())
    frac_A = n_A / n_bins

    # Count A<->B switches along the bin order (contiguous block boundaries).
    transitions = int((labels[1:] != labels[:-1]).sum())

    return {
        "n_bins": n_bins,
        "resolution": resolution,
        "n_compartment_A": n_A,
        "n_compartment_B": n_B,
        "fraction_A": round(frac_A, 3),
        "n_transitions": transitions,
        "pc1_values": pc1.tolist()[:50],
        "compartment_assignments": labels.tolist()[:50],
        "explained_variance": round(float(eigvals[-1] / eigvals.sum()), 4),
        "summary": (
            f"Hi-C compartments ({resolution} resolution): {n_bins} bins → "
            f"{n_A} A-compartment ({frac_A:.0%}), {n_B} B-compartment ({1-frac_A:.0%}). "
            f"{transitions} A/B transitions. PC1 explains {eigvals[-1]/eigvals.sum():.1%} of variance."
        ),
    }
|
|
2238
|
+
|
|
2239
|
+
|
|
2240
|
+
# ===========================================================================
|
|
2241
|
+
# Specialized library integrations (optional deps)
|
|
2242
|
+
# ===========================================================================
|
|
2243
|
+
|
|
2244
|
+
|
|
2245
|
+
def _check_pydeseq2():
    """Return True when the pyDESeq2 package can be imported, else False."""
    try:
        from pydeseq2.dds import DeseqDataSet  # noqa: F401 - availability probe
        from pydeseq2.ds import DeseqStats  # noqa: F401 - availability probe
    except Exception as exc:
        logger.debug("pyDESeq2 unavailable or failed to import: %s", exc)
        return False
    return True
|
|
2255
|
+
|
|
2256
|
+
|
|
2257
|
+
def _check_muon():
    """Return the imported muon module when muon and mudata are available, else None."""
    try:
        import muon
        import mudata  # noqa: F401 - both packages are required together
    except Exception as exc:
        logger.debug("muon unavailable or failed to import: %s", exc)
        return None
    return muon
|
|
2267
|
+
|
|
2268
|
+
|
|
2269
|
+
def _check_episcanpy():
    """Return the episcanpy API module when episcanpy is importable, else None."""
    try:
        import episcanpy.api as epi
    except Exception as exc:
        logger.debug("episcanpy unavailable or failed to import: %s", exc)
        return None
    return epi
|
|
2278
|
+
|
|
2279
|
+
|
|
2280
|
+
# ---------------------------------------------------------------------------
|
|
2281
|
+
# 19. omics.deseq2
|
|
2282
|
+
# ---------------------------------------------------------------------------
|
|
2283
|
+
|
|
2284
|
+
|
|
2285
|
+
@registry.register(
|
|
2286
|
+
name="omics.deseq2",
|
|
2287
|
+
description="Differential expression with DESeq2 (negative binomial model for count data)",
|
|
2288
|
+
category="omics",
|
|
2289
|
+
parameters={
|
|
2290
|
+
"counts_path": "Path to raw count matrix (genes as rows, samples as columns)",
|
|
2291
|
+
"metadata_path": "Path to sample metadata table (CSV/TSV/TXT/XLSX; must have a condition column; required unless infer_metadata=true)",
|
|
2292
|
+
"condition_col": "Column in metadata for the contrast (default 'condition')",
|
|
2293
|
+
"ref_level": "Reference level for contrast (default: alphabetically first)",
|
|
2294
|
+
"test_level": "Test level for contrast (default: alphabetically second)",
|
|
2295
|
+
"covariates": "Optional comma-separated covariates to include in design (e.g., 'sex,batch')",
|
|
2296
|
+
"infer_metadata": "If true, infer two groups from sample column order for exploratory use only (default false)",
|
|
2297
|
+
"alpha": "Significance threshold for adjusted p-values (default 0.05)",
|
|
2298
|
+
"use_r_deseq2": "If true, prefer R DESeq2 backend via rpy2 when available (default true)",
|
|
2299
|
+
"prefilter_min_count": "Optional prefilter threshold: minimum count per sample (default 0 disables prefilter)",
|
|
2300
|
+
"prefilter_min_samples": "Optional prefilter threshold: minimum number of samples meeting prefilter_min_count (default 1)",
|
|
2301
|
+
"lfc_shrink": "If true, apply apeglm LFC shrinkage when possible (default false)",
|
|
2302
|
+
"enrichment_library": "Optional gseapy/Enrichr library name (e.g., Reactome_2022) for post-DE enrichment",
|
|
2303
|
+
"pathway_term": "Optional pathway term to match and extract odds ratio from enrichment results",
|
|
2304
|
+
"gene_map_path": "Optional gene ID -> symbol mapping table used before enrichment",
|
|
2305
|
+
"gene_id_col": "Optional gene ID column name in mapping table",
|
|
2306
|
+
"gene_symbol_col": "Optional symbol column name in mapping table",
|
|
2307
|
+
"min_abs_lfc": "Optional absolute log2FC threshold for enrichment gene list",
|
|
2308
|
+
"min_base_mean": "Optional baseMean threshold for enrichment gene list",
|
|
2309
|
+
"target_gene": "Optional target gene symbol/ID to report explicitly (returns log2FoldChange/baseMean/padj even if not in top hits)",
|
|
2310
|
+
},
|
|
2311
|
+
usage_guide=(
|
|
2312
|
+
"Proper count-based differential expression using the DESeq2 negative binomial model. "
|
|
2313
|
+
"Preferred over Mann-Whitney for bulk RNA-seq count data. Requires pydeseq2: "
|
|
2314
|
+
"pip install pydeseq2. Falls back to scipy Mann-Whitney if not installed. "
|
|
2315
|
+
"Supports optional covariate-adjusted design, LFC shrinkage, and optional gseapy enrichment "
|
|
2316
|
+
"from DE genes (including pathway-specific odds ratio extraction). "
|
|
2317
|
+
"Use explicit sample metadata in production; inferred metadata is exploratory only."
|
|
2318
|
+
),
|
|
2319
|
+
)
|
|
2320
|
+
def deseq2(
|
|
2321
|
+
counts_path: str,
|
|
2322
|
+
metadata_path: str = "",
|
|
2323
|
+
condition_col: str = "condition",
|
|
2324
|
+
ref_level: str = "",
|
|
2325
|
+
test_level: str = "",
|
|
2326
|
+
covariates: str = "",
|
|
2327
|
+
infer_metadata: bool = False,
|
|
2328
|
+
alpha: float = 0.05,
|
|
2329
|
+
use_r_deseq2: bool = True,
|
|
2330
|
+
prefilter_min_count: int = 0,
|
|
2331
|
+
prefilter_min_samples: int = 1,
|
|
2332
|
+
lfc_shrink: bool = False,
|
|
2333
|
+
enrichment_library: str = "",
|
|
2334
|
+
pathway_term: str = "",
|
|
2335
|
+
gene_map_path: str = "",
|
|
2336
|
+
gene_id_col: str = "",
|
|
2337
|
+
gene_symbol_col: str = "",
|
|
2338
|
+
min_abs_lfc: float = 0.0,
|
|
2339
|
+
min_base_mean: float = 0.0,
|
|
2340
|
+
target_gene: str = "",
|
|
2341
|
+
**kwargs,
|
|
2342
|
+
) -> dict:
|
|
2343
|
+
"""Run DESeq2 differential expression on count data."""
|
|
2344
|
+
import pandas as pd
|
|
2345
|
+
|
|
2346
|
+
# Load counts
|
|
2347
|
+
df, error = _load_tabular(counts_path)
|
|
2348
|
+
if error:
|
|
2349
|
+
return {"error": error, "summary": f"Could not load counts: {error}"}
|
|
2350
|
+
|
|
2351
|
+
# Load or infer metadata
|
|
2352
|
+
if metadata_path:
|
|
2353
|
+
metadata, meta_error = _load_tabular(metadata_path)
|
|
2354
|
+
if meta_error:
|
|
2355
|
+
return {"error": meta_error, "summary": f"Metadata load failed: {meta_error}"}
|
|
2356
|
+
else:
|
|
2357
|
+
if not infer_metadata:
|
|
2358
|
+
samples = list(df.columns)
|
|
2359
|
+
return {
|
|
2360
|
+
"error": (
|
|
2361
|
+
"metadata_path is required for reliable DESeq2 analysis. "
|
|
2362
|
+
"Set infer_metadata=True only for quick exploratory analysis."
|
|
2363
|
+
),
|
|
2364
|
+
"available_samples": samples[:30],
|
|
2365
|
+
"summary": f"No metadata provided for {len(samples)} samples; cannot define conditions.",
|
|
2366
|
+
}
|
|
2367
|
+
|
|
2368
|
+
# Exploratory-only mode: split samples into two halves.
|
|
2369
|
+
samples = list(df.columns)
|
|
2370
|
+
mid = len(samples) // 2
|
|
2371
|
+
if mid < 2:
|
|
2372
|
+
return {"error": "Need at least 4 samples (2 per group) without metadata", "summary": "Too few samples"}
|
|
2373
|
+
metadata = pd.DataFrame(
|
|
2374
|
+
{"condition": ["control"] * mid + ["treatment"] * (len(samples) - mid)},
|
|
2375
|
+
index=samples,
|
|
2376
|
+
)
|
|
2377
|
+
|
|
2378
|
+
if condition_col not in metadata.columns:
|
|
2379
|
+
return {
|
|
2380
|
+
"error": f"Column '{condition_col}' not in metadata. Available: {list(metadata.columns)}",
|
|
2381
|
+
"summary": f"Missing condition column: {condition_col}",
|
|
2382
|
+
}
|
|
2383
|
+
|
|
2384
|
+
# Align samples (drop metadata-only and counts-only samples deterministically)
|
|
2385
|
+
common = df.columns.intersection(metadata.index)
|
|
2386
|
+
if len(common) < 4:
|
|
2387
|
+
return {"error": f"Need ≥4 shared samples, found {len(common)}", "summary": "Too few matching samples"}
|
|
2388
|
+
counts = df[common]
|
|
2389
|
+
metadata = metadata.loc[common]
|
|
2390
|
+
|
|
2391
|
+
# Optional pre-filtering to stabilize dispersion fitting and reduce noise.
|
|
2392
|
+
if prefilter_min_count > 0:
|
|
2393
|
+
required = max(int(prefilter_min_samples), 1)
|
|
2394
|
+
keep_mask = (counts >= int(prefilter_min_count)).sum(axis=1) >= required
|
|
2395
|
+
counts = counts.loc[keep_mask]
|
|
2396
|
+
if counts.empty:
|
|
2397
|
+
return {
|
|
2398
|
+
"error": "All genes were removed by prefilter.",
|
|
2399
|
+
"summary": "No genes left after prefilter; relax prefilter thresholds.",
|
|
2400
|
+
}
|
|
2401
|
+
|
|
2402
|
+
levels = sorted(metadata[condition_col].unique())
|
|
2403
|
+
if len(levels) < 2:
|
|
2404
|
+
return {"error": "Need at least 2 condition levels", "summary": "Only one condition level found"}
|
|
2405
|
+
ref = ref_level if ref_level else levels[0]
|
|
2406
|
+
test = test_level if test_level else levels[1]
|
|
2407
|
+
|
|
2408
|
+
def _resolve_level_name(requested: str, available_levels: list[str]) -> str:
|
|
2409
|
+
"""Best-effort map of user/planner shorthand labels to metadata factor levels."""
|
|
2410
|
+
import re
|
|
2411
|
+
|
|
2412
|
+
req = str(requested or "").strip()
|
|
2413
|
+
if not req:
|
|
2414
|
+
return req
|
|
2415
|
+
if req in available_levels:
|
|
2416
|
+
return req
|
|
2417
|
+
|
|
2418
|
+
def _norm(s: str) -> str:
|
|
2419
|
+
return re.sub(r"[^a-z0-9]+", "", s.lower())
|
|
2420
|
+
|
|
2421
|
+
req_norm = _norm(req)
|
|
2422
|
+
if not req_norm:
|
|
2423
|
+
return req
|
|
2424
|
+
|
|
2425
|
+
# 1) Exact normalized match.
|
|
2426
|
+
exact = [lvl for lvl in available_levels if _norm(str(lvl)) == req_norm]
|
|
2427
|
+
if len(exact) == 1:
|
|
2428
|
+
return exact[0]
|
|
2429
|
+
|
|
2430
|
+
# 2) Prefix/token containment match (e.g., "CBD" -> "CBD_IC50").
|
|
2431
|
+
token_like = [
|
|
2432
|
+
lvl
|
|
2433
|
+
for lvl in available_levels
|
|
2434
|
+
if _norm(str(lvl)).startswith(req_norm) or req_norm in _norm(str(lvl))
|
|
2435
|
+
]
|
|
2436
|
+
if len(token_like) == 1:
|
|
2437
|
+
return token_like[0]
|
|
2438
|
+
|
|
2439
|
+
# 3) Prefer non-combined condition when shorthand maps to several levels.
|
|
2440
|
+
if len(token_like) > 1:
|
|
2441
|
+
non_combo = [
|
|
2442
|
+
lvl
|
|
2443
|
+
for lvl in token_like
|
|
2444
|
+
if "serum_starvation" not in str(lvl).lower()
|
|
2445
|
+
and "cisplatin" not in str(lvl).lower()
|
|
2446
|
+
and "comb" not in str(lvl).lower()
|
|
2447
|
+
and "plus" not in str(lvl).lower()
|
|
2448
|
+
]
|
|
2449
|
+
if len(non_combo) == 1:
|
|
2450
|
+
return non_combo[0]
|
|
2451
|
+
return req
|
|
2452
|
+
|
|
2453
|
+
ref = _resolve_level_name(ref, levels)
|
|
2454
|
+
test = _resolve_level_name(test, levels)
|
|
2455
|
+
if ref not in levels or test not in levels:
|
|
2456
|
+
return {
|
|
2457
|
+
"error": f"Requested contrast levels not found. Levels available: {levels}",
|
|
2458
|
+
"summary": f"Invalid contrast levels: ref={ref}, test={test}",
|
|
2459
|
+
}
|
|
2460
|
+
if ref == test:
|
|
2461
|
+
return {
|
|
2462
|
+
"error": "ref_level and test_level must be different",
|
|
2463
|
+
"summary": "Invalid contrast: identical levels",
|
|
2464
|
+
}
|
|
2465
|
+
|
|
2466
|
+
n_ref = int((metadata[condition_col] == ref).sum())
|
|
2467
|
+
n_test = int((metadata[condition_col] == test).sum())
|
|
2468
|
+
if n_ref < 2 or n_test < 2:
|
|
2469
|
+
return {
|
|
2470
|
+
"error": f"Need at least 2 replicates per condition for {ref} vs {test} (found {n_ref} and {n_test})",
|
|
2471
|
+
"summary": "Insufficient biological replicates per condition",
|
|
2472
|
+
}
|
|
2473
|
+
|
|
2474
|
+
# Build design formula with optional covariates.
|
|
2475
|
+
if isinstance(covariates, (list, tuple)):
|
|
2476
|
+
covars = [str(c).strip() for c in covariates if str(c).strip()]
|
|
2477
|
+
else:
|
|
2478
|
+
raw = str(covariates or "").strip()
|
|
2479
|
+
if raw.startswith("[") and raw.endswith("]"):
|
|
2480
|
+
raw = raw[1:-1]
|
|
2481
|
+
covars = [c.strip().strip("'\"") for c in raw.split(",") if c.strip().strip("'\"")]
|
|
2482
|
+
missing_covars = [c for c in covars if c not in metadata.columns]
|
|
2483
|
+
if missing_covars:
|
|
2484
|
+
return {
|
|
2485
|
+
"error": f"Covariate column(s) not in metadata: {missing_covars}",
|
|
2486
|
+
"summary": f"Missing covariates: {', '.join(missing_covars)}",
|
|
2487
|
+
}
|
|
2488
|
+
design_terms = covars + [condition_col]
|
|
2489
|
+
design_formula = "~ " + " + ".join(design_terms)
|
|
2490
|
+
|
|
2491
|
+
target_gene = str(target_gene or "").strip()
|
|
2492
|
+
|
|
2493
|
+
def _resolve_target_gene(results_df: "pd.DataFrame") -> "dict | None":
|
|
2494
|
+
"""Resolve a user-requested target gene against DE results."""
|
|
2495
|
+
if not target_gene:
|
|
2496
|
+
return None
|
|
2497
|
+
|
|
2498
|
+
idx_series = pd.Series(results_df.index.astype(str), index=results_df.index)
|
|
2499
|
+
idx_no_ver = idx_series.str.split(".").str[0]
|
|
2500
|
+
tgt = target_gene
|
|
2501
|
+
tgt_no_ver = tgt.split(".")[0]
|
|
2502
|
+
tgt_lower = tgt.lower()
|
|
2503
|
+
tgt_no_ver_lower = tgt_no_ver.lower()
|
|
2504
|
+
|
|
2505
|
+
mask = (idx_series.str.lower() == tgt_lower) | (idx_no_ver.str.lower() == tgt_no_ver_lower)
|
|
2506
|
+
|
|
2507
|
+
# If the target appears to be a symbol and IDs are in results, use mapping if provided/discoverable.
|
|
2508
|
+
if not mask.any():
|
|
2509
|
+
mapper = None
|
|
2510
|
+
|
|
2511
|
+
def _build_mapper(gm_df: "pd.DataFrame") -> "dict[str, str]":
|
|
2512
|
+
nonlocal gene_id_col, gene_symbol_col
|
|
2513
|
+
id_col = gene_id_col or ("ENSG_ID" if "ENSG_ID" in gm_df.columns else gm_df.columns[0])
|
|
2514
|
+
sym_col = gene_symbol_col or (
|
|
2515
|
+
"gene_name"
|
|
2516
|
+
if "gene_name" in gm_df.columns
|
|
2517
|
+
else ("symbol" if "symbol" in gm_df.columns else gm_df.columns[-1])
|
|
2518
|
+
)
|
|
2519
|
+
gm2 = gm_df[[id_col, sym_col]].dropna().copy()
|
|
2520
|
+
gm2[id_col] = gm2[id_col].astype(str).str.split(".").str[0]
|
|
2521
|
+
gm2[sym_col] = gm2[sym_col].astype(str)
|
|
2522
|
+
return {
|
|
2523
|
+
symbol.lower(): gid
|
|
2524
|
+
for gid, symbol in zip(gm2[id_col], gm2[sym_col])
|
|
2525
|
+
if symbol
|
|
2526
|
+
}
|
|
2527
|
+
|
|
2528
|
+
if gene_map_path:
|
|
2529
|
+
gm, gm_err = _load_tabular(gene_map_path, index_col=None)
|
|
2530
|
+
if not gm_err and gm is not None and not gm.empty:
|
|
2531
|
+
mapper = _build_mapper(gm)
|
|
2532
|
+
else:
|
|
2533
|
+
# Best-effort auto-discovery for common capsule naming patterns.
|
|
2534
|
+
try:
|
|
2535
|
+
base_dir = Path(counts_path).expanduser().resolve().parent
|
|
2536
|
+
candidates = sorted(
|
|
2537
|
+
[
|
|
2538
|
+
p
|
|
2539
|
+
for p in base_dir.iterdir()
|
|
2540
|
+
if p.is_file()
|
|
2541
|
+
and p.suffix.lower() in {".csv", ".tsv", ".txt", ".xlsx", ".xls"}
|
|
2542
|
+
and ("gene" in p.name.lower() and ("meta" in p.name.lower() or "annot" in p.name.lower()))
|
|
2543
|
+
]
|
|
2544
|
+
)
|
|
2545
|
+
for cand in candidates:
|
|
2546
|
+
gm, gm_err = _load_tabular(str(cand), index_col=None)
|
|
2547
|
+
if gm_err or gm is None or gm.empty:
|
|
2548
|
+
continue
|
|
2549
|
+
mapper = _build_mapper(gm)
|
|
2550
|
+
if mapper:
|
|
2551
|
+
break
|
|
2552
|
+
except Exception:
|
|
2553
|
+
mapper = None
|
|
2554
|
+
|
|
2555
|
+
if mapper:
|
|
2556
|
+
mapped_id = mapper.get(tgt_lower)
|
|
2557
|
+
if mapped_id:
|
|
2558
|
+
mask = idx_no_ver.str.lower() == mapped_id.lower()
|
|
2559
|
+
|
|
2560
|
+
if not mask.any():
|
|
2561
|
+
return {
|
|
2562
|
+
"target_gene": target_gene,
|
|
2563
|
+
"found": False,
|
|
2564
|
+
}
|
|
2565
|
+
|
|
2566
|
+
row = results_df.loc[mask].iloc[0]
|
|
2567
|
+
return {
|
|
2568
|
+
"target_gene": target_gene,
|
|
2569
|
+
"found": True,
|
|
2570
|
+
"matched_gene_id": str(results_df.loc[mask].index[0]),
|
|
2571
|
+
"log2FoldChange": float(row.get("log2FoldChange")) if pd.notna(row.get("log2FoldChange")) else None,
|
|
2572
|
+
"baseMean": float(row.get("baseMean")) if pd.notna(row.get("baseMean")) else None,
|
|
2573
|
+
"padj": float(row.get("padj")) if pd.notna(row.get("padj")) else None,
|
|
2574
|
+
"pvalue": float(row.get("pvalue")) if "pvalue" in row and pd.notna(row.get("pvalue")) else None,
|
|
2575
|
+
}
|
|
2576
|
+
|
|
2577
|
+
# Ensure categorical encoding for design variables.
|
|
2578
|
+
# Coerce to string first so mixed numeric/string covariates (e.g., batch IDs)
|
|
2579
|
+
# convert cleanly through pandas2ri into R factors.
|
|
2580
|
+
metadata = metadata.copy()
|
|
2581
|
+
for col in design_terms:
|
|
2582
|
+
metadata[col] = metadata[col].astype(str).astype("category")
|
|
2583
|
+
if ref in metadata[condition_col].cat.categories:
|
|
2584
|
+
ordered_levels = [ref] + [x for x in metadata[condition_col].cat.categories if x != ref]
|
|
2585
|
+
metadata[condition_col] = metadata[condition_col].cat.reorder_categories(ordered_levels)
|
|
2586
|
+
|
|
2587
|
+
# Try native R DESeq2 first (when requested and available), then fall back
|
|
2588
|
+
# to pyDESeq2 for environments without DESeq2.
|
|
2589
|
+
if use_r_deseq2:
|
|
2590
|
+
try:
|
|
2591
|
+
import rpy2.robjects as ro
|
|
2592
|
+
from rpy2.robjects import pandas2ri
|
|
2593
|
+
from rpy2.robjects.conversion import localconverter
|
|
2594
|
+
from rpy2.robjects.packages import importr
|
|
2595
|
+
|
|
2596
|
+
# Ensure DESeq2 is available in either user or system R library.
|
|
2597
|
+
ro.r(".libPaths(c('~/R/library', .libPaths()))")
|
|
2598
|
+
importr("DESeq2")
|
|
2599
|
+
if lfc_shrink:
|
|
2600
|
+
try:
|
|
2601
|
+
importr("apeglm")
|
|
2602
|
+
except Exception:
|
|
2603
|
+
# Shrinkage is optional; proceed without if apeglm unavailable.
|
|
2604
|
+
lfc_shrink = False
|
|
2605
|
+
|
|
2606
|
+
counts_r = counts.astype(int)
|
|
2607
|
+
meta_r = metadata.copy()
|
|
2608
|
+
# R expects colData rownames to match countData colnames.
|
|
2609
|
+
meta_r.index = counts_r.columns
|
|
2610
|
+
|
|
2611
|
+
with localconverter(ro.default_converter + pandas2ri.converter):
|
|
2612
|
+
ro.globalenv["counts_df"] = counts_r
|
|
2613
|
+
ro.globalenv["meta_df"] = meta_r
|
|
2614
|
+
|
|
2615
|
+
ro.globalenv["design_formula_str"] = design_formula
|
|
2616
|
+
ro.globalenv["condition_col_str"] = condition_col
|
|
2617
|
+
ro.globalenv["test_level_str"] = test
|
|
2618
|
+
ro.globalenv["ref_level_str"] = ref
|
|
2619
|
+
ro.globalenv["alpha_val"] = float(alpha)
|
|
2620
|
+
ro.globalenv["do_shrink"] = bool(lfc_shrink)
|
|
2621
|
+
|
|
2622
|
+
r_script = """
|
|
2623
|
+
suppressPackageStartupMessages(library(DESeq2))
|
|
2624
|
+
if (isTRUE(do_shrink)) {
|
|
2625
|
+
suppressPackageStartupMessages(library(apeglm))
|
|
2626
|
+
}
|
|
2627
|
+
counts_mat <- as.matrix(counts_df)
|
|
2628
|
+
mode(counts_mat) <- "integer"
|
|
2629
|
+
meta <- as.data.frame(meta_df)
|
|
2630
|
+
cond_vals <- as.character(meta[[condition_col_str]])
|
|
2631
|
+
meta[[condition_col_str]] <- factor(cond_vals)
|
|
2632
|
+
meta[[condition_col_str]] <- relevel(meta[[condition_col_str]], ref = ref_level_str)
|
|
2633
|
+
dds <- DESeqDataSetFromMatrix(
|
|
2634
|
+
countData = counts_mat,
|
|
2635
|
+
colData = meta,
|
|
2636
|
+
design = as.formula(design_formula_str)
|
|
2637
|
+
)
|
|
2638
|
+
dds <- DESeq(dds, quiet = TRUE)
|
|
2639
|
+
res <- results(
|
|
2640
|
+
dds,
|
|
2641
|
+
contrast = c(condition_col_str, test_level_str, ref_level_str),
|
|
2642
|
+
alpha = alpha_val
|
|
2643
|
+
)
|
|
2644
|
+
shrink_coeff <- NA_character_
|
|
2645
|
+
if (isTRUE(do_shrink)) {
|
|
2646
|
+
rn <- resultsNames(dds)
|
|
2647
|
+
cand <- rn[grepl(paste0("^", condition_col_str, "_"), rn)]
|
|
2648
|
+
if (length(cand) > 0) {
|
|
2649
|
+
shrink_coeff <- cand[1]
|
|
2650
|
+
res <- lfcShrink(dds, coef = shrink_coeff, type = "apeglm")
|
|
2651
|
+
}
|
|
2652
|
+
}
|
|
2653
|
+
res_df <- as.data.frame(res)
|
|
2654
|
+
res_df$gene_id <- rownames(res_df)
|
|
2655
|
+
"""
|
|
2656
|
+
ro.r(r_script)
|
|
2657
|
+
with localconverter(ro.default_converter + pandas2ri.converter):
|
|
2658
|
+
res_df = ro.conversion.rpy2py(ro.globalenv["res_df"])
|
|
2659
|
+
shrink_coeff = str(ro.globalenv["shrink_coeff"][0]) if "shrink_coeff" in ro.globalenv else None
|
|
2660
|
+
if shrink_coeff in {"NA", "NA_character_", "None"}:
|
|
2661
|
+
shrink_coeff = None
|
|
2662
|
+
|
|
2663
|
+
# Normalize column names to match pyDESeq2-style payload.
|
|
2664
|
+
if "log2FoldChange" not in res_df.columns and "log2FoldChange" in [str(c) for c in res_df.columns]:
|
|
2665
|
+
pass
|
|
2666
|
+
if "baseMean" not in res_df.columns or "padj" not in res_df.columns:
|
|
2667
|
+
raise ValueError("R DESeq2 results missing required columns")
|
|
2668
|
+
res_df = res_df.set_index("gene_id")
|
|
2669
|
+
results = res_df.sort_values("padj")
|
|
2670
|
+
|
|
2671
|
+
n_sig = int((results["padj"] < alpha).sum())
|
|
2672
|
+
n_up = int(((results["padj"] < alpha) & (results["log2FoldChange"] > 0)).sum())
|
|
2673
|
+
n_down = int(((results["padj"] < alpha) & (results["log2FoldChange"] < 0)).sum())
|
|
2674
|
+
target_gene_result = _resolve_target_gene(results)
|
|
2675
|
+
target_gene_summary = ""
|
|
2676
|
+
if target_gene_result:
|
|
2677
|
+
if target_gene_result.get("found"):
|
|
2678
|
+
lfc_val = target_gene_result.get("log2FoldChange")
|
|
2679
|
+
lfc_txt = f"{lfc_val:.6g}" if lfc_val is not None else "NA"
|
|
2680
|
+
target_gene_summary = f" {target_gene} log2FoldChange={lfc_txt}."
|
|
2681
|
+
else:
|
|
2682
|
+
target_gene_summary = f" {target_gene} was not found in result gene IDs."
|
|
2683
|
+
return {
|
|
2684
|
+
"method": "DESeq2 (R via rpy2)",
|
|
2685
|
+
"n_genes_tested": len(results),
|
|
2686
|
+
"n_significant": n_sig,
|
|
2687
|
+
"n_upregulated": n_up,
|
|
2688
|
+
"n_downregulated": n_down,
|
|
2689
|
+
"contrast": f"{test} vs {ref}",
|
|
2690
|
+
"design": design_formula,
|
|
2691
|
+
"covariates": covars,
|
|
2692
|
+
"n_samples_ref": n_ref,
|
|
2693
|
+
"n_samples_test": n_test,
|
|
2694
|
+
"n_shared_samples": int(len(common)),
|
|
2695
|
+
"prefilter": {
|
|
2696
|
+
"min_count": int(prefilter_min_count),
|
|
2697
|
+
"min_samples": int(prefilter_min_samples),
|
|
2698
|
+
"n_genes_after": int(len(results)),
|
|
2699
|
+
},
|
|
2700
|
+
"metadata_inferred": bool(infer_metadata and not metadata_path),
|
|
2701
|
+
"alpha": alpha,
|
|
2702
|
+
"lfc_shrink": bool(lfc_shrink),
|
|
2703
|
+
"lfc_shrink_coeff": shrink_coeff,
|
|
2704
|
+
"top_hits": results.head(20).to_dict("index"),
|
|
2705
|
+
"target_gene_result": target_gene_result,
|
|
2706
|
+
"summary": (
|
|
2707
|
+
f"DESeq2 (R): {len(results):,} genes tested ({test} vs {ref}) with design {design_formula}. "
|
|
2708
|
+
f"{n_sig} significant (padj<{alpha}): {n_up} up, {n_down} down."
|
|
2709
|
+
+ (" Metadata was inferred from sample order (exploratory)." if infer_metadata and not metadata_path else "")
|
|
2710
|
+
+ target_gene_summary
|
|
2711
|
+
),
|
|
2712
|
+
}
|
|
2713
|
+
except Exception as exc:
|
|
2714
|
+
logger.warning("R DESeq2 backend failed, falling back to pyDESeq2: %s", exc)
|
|
2715
|
+
|
|
2716
|
+
# Try pyDESeq2
|
|
2717
|
+
if _check_pydeseq2():
|
|
2718
|
+
try:
|
|
2719
|
+
from pydeseq2.dds import DeseqDataSet
|
|
2720
|
+
from pydeseq2.ds import DeseqStats
|
|
2721
|
+
from pydeseq2.default_inference import DefaultInference
|
|
2722
|
+
|
|
2723
|
+
inference = DefaultInference(n_cpus=1)
|
|
2724
|
+
# pyDESeq2 wants samples as rows, genes as columns
|
|
2725
|
+
dds = DeseqDataSet(
|
|
2726
|
+
counts=counts.T,
|
|
2727
|
+
metadata=metadata,
|
|
2728
|
+
design=design_formula,
|
|
2729
|
+
refit_cooks=True,
|
|
2730
|
+
inference=inference,
|
|
2731
|
+
quiet=True,
|
|
2732
|
+
)
|
|
2733
|
+
dds.deseq2()
|
|
2734
|
+
|
|
2735
|
+
stat = DeseqStats(
|
|
2736
|
+
dds,
|
|
2737
|
+
contrast=[condition_col, test, ref],
|
|
2738
|
+
alpha=alpha,
|
|
2739
|
+
inference=inference,
|
|
2740
|
+
quiet=True,
|
|
2741
|
+
)
|
|
2742
|
+
stat.summary()
|
|
2743
|
+
|
|
2744
|
+
# Optional apeglm shrinkage on the requested condition coefficient.
|
|
2745
|
+
shrink_coeff = None
|
|
2746
|
+
if lfc_shrink and hasattr(dds, "varm") and "LFC" in dds.varm:
|
|
2747
|
+
coeffs = list(dds.varm["LFC"].columns)
|
|
2748
|
+
preferred = [
|
|
2749
|
+
c for c in coeffs
|
|
2750
|
+
if condition_col in c and (test in c or test.replace("-", "_") in c)
|
|
2751
|
+
]
|
|
2752
|
+
if preferred:
|
|
2753
|
+
shrink_coeff = preferred[0]
|
|
2754
|
+
try:
|
|
2755
|
+
stat.lfc_shrink(coeff=shrink_coeff)
|
|
2756
|
+
except Exception as exc:
|
|
2757
|
+
logger.warning("LFC shrinkage failed for coeff %s: %s", shrink_coeff, exc)
|
|
2758
|
+
|
|
2759
|
+
results = stat.results_df.sort_values("padj")
|
|
2760
|
+
n_sig = int((results["padj"] < alpha).sum())
|
|
2761
|
+
n_up = int(((results["padj"] < alpha) & (results["log2FoldChange"] > 0)).sum())
|
|
2762
|
+
n_down = int(((results["padj"] < alpha) & (results["log2FoldChange"] < 0)).sum())
|
|
2763
|
+
target_gene_result = _resolve_target_gene(results)
|
|
2764
|
+
target_gene_summary = ""
|
|
2765
|
+
if target_gene_result:
|
|
2766
|
+
if target_gene_result.get("found"):
|
|
2767
|
+
lfc_val = target_gene_result.get("log2FoldChange")
|
|
2768
|
+
lfc_txt = f"{lfc_val:.6g}" if lfc_val is not None else "NA"
|
|
2769
|
+
target_gene_summary = f" {target_gene} log2FoldChange={lfc_txt}."
|
|
2770
|
+
else:
|
|
2771
|
+
target_gene_summary = f" {target_gene} was not found in result gene IDs."
|
|
2772
|
+
|
|
2773
|
+
result_payload = {
|
|
2774
|
+
"method": "DESeq2 (pydeseq2)",
|
|
2775
|
+
"n_genes_tested": len(results),
|
|
2776
|
+
"n_significant": n_sig,
|
|
2777
|
+
"n_upregulated": n_up,
|
|
2778
|
+
"n_downregulated": n_down,
|
|
2779
|
+
"contrast": f"{test} vs {ref}",
|
|
2780
|
+
"design": design_formula,
|
|
2781
|
+
"covariates": covars,
|
|
2782
|
+
"n_samples_ref": n_ref,
|
|
2783
|
+
"n_samples_test": n_test,
|
|
2784
|
+
"n_shared_samples": int(len(common)),
|
|
2785
|
+
"prefilter": {
|
|
2786
|
+
"min_count": int(prefilter_min_count),
|
|
2787
|
+
"min_samples": int(prefilter_min_samples),
|
|
2788
|
+
"n_genes_after": int(len(results)),
|
|
2789
|
+
},
|
|
2790
|
+
"metadata_inferred": bool(infer_metadata and not metadata_path),
|
|
2791
|
+
"alpha": alpha,
|
|
2792
|
+
"lfc_shrink": bool(lfc_shrink),
|
|
2793
|
+
"lfc_shrink_coeff": shrink_coeff,
|
|
2794
|
+
"top_hits": results.head(20).to_dict("index"),
|
|
2795
|
+
"target_gene_result": target_gene_result,
|
|
2796
|
+
"summary": (
|
|
2797
|
+
f"DESeq2: {len(results):,} genes tested ({test} vs {ref}) with design {design_formula}. "
|
|
2798
|
+
f"{n_sig} significant (padj<{alpha}): {n_up} up, {n_down} down."
|
|
2799
|
+
+ (" Metadata was inferred from sample order (exploratory)." if infer_metadata and not metadata_path else "")
|
|
2800
|
+
+ target_gene_summary
|
|
2801
|
+
),
|
|
2802
|
+
}
|
|
2803
|
+
|
|
2804
|
+
# Optional enrichment over significant DEGs with effect filters.
|
|
2805
|
+
if enrichment_library:
|
|
2806
|
+
sig = results[results["padj"] < alpha].copy()
|
|
2807
|
+
if min_abs_lfc > 0:
|
|
2808
|
+
sig = sig[sig["log2FoldChange"].abs() >= float(min_abs_lfc)]
|
|
2809
|
+
if min_base_mean > 0 and "baseMean" in sig.columns:
|
|
2810
|
+
sig = sig[sig["baseMean"] >= float(min_base_mean)]
|
|
2811
|
+
|
|
2812
|
+
genes_for_enrichment = list(sig.index.astype(str))
|
|
2813
|
+
mapped_gene_count = None
|
|
2814
|
+
if gene_map_path:
|
|
2815
|
+
gm, gm_err = _load_tabular(gene_map_path, index_col=None)
|
|
2816
|
+
if gm_err:
|
|
2817
|
+
result_payload["enrichment_error"] = f"Gene map load failed: {gm_err}"
|
|
2818
|
+
else:
|
|
2819
|
+
id_col = gene_id_col or ("ENSG_ID" if "ENSG_ID" in gm.columns else gm.columns[0])
|
|
2820
|
+
sym_col = gene_symbol_col or (
|
|
2821
|
+
"gene_name" if "gene_name" in gm.columns else ("symbol" if "symbol" in gm.columns else gm.columns[-1])
|
|
2822
|
+
)
|
|
2823
|
+
gm2 = gm[[id_col, sym_col]].dropna().copy()
|
|
2824
|
+
gm2[id_col] = gm2[id_col].astype(str)
|
|
2825
|
+
gm2[sym_col] = gm2[sym_col].astype(str)
|
|
2826
|
+
mapper = dict(zip(gm2[id_col], gm2[sym_col]))
|
|
2827
|
+
mapped = []
|
|
2828
|
+
for gid in genes_for_enrichment:
|
|
2829
|
+
mapped_sym = mapper.get(gid, mapper.get(gid.split(".")[0]))
|
|
2830
|
+
if mapped_sym:
|
|
2831
|
+
mapped.append(mapped_sym)
|
|
2832
|
+
genes_for_enrichment = sorted(set(mapped))
|
|
2833
|
+
mapped_gene_count = len(genes_for_enrichment)
|
|
2834
|
+
|
|
2835
|
+
if genes_for_enrichment:
|
|
2836
|
+
try:
|
|
2837
|
+
import gseapy
|
|
2838
|
+
|
|
2839
|
+
enr = gseapy.enrichr(
|
|
2840
|
+
gene_list=genes_for_enrichment,
|
|
2841
|
+
gene_sets=enrichment_library,
|
|
2842
|
+
outdir=None,
|
|
2843
|
+
no_plot=True,
|
|
2844
|
+
)
|
|
2845
|
+
enr_df = enr.results.copy()
|
|
2846
|
+
result_payload["enrichment"] = {
|
|
2847
|
+
"library": enrichment_library,
|
|
2848
|
+
"n_input_genes": len(genes_for_enrichment),
|
|
2849
|
+
"mapped_gene_count": mapped_gene_count,
|
|
2850
|
+
"n_terms": int(len(enr_df)),
|
|
2851
|
+
"top_terms": enr_df.head(20).to_dict("records"),
|
|
2852
|
+
}
|
|
2853
|
+
|
|
2854
|
+
if pathway_term:
|
|
2855
|
+
terms = enr_df["Term"].astype(str)
|
|
2856
|
+
exact = enr_df[terms.str.lower() == pathway_term.lower()]
|
|
2857
|
+
target_df = exact if not exact.empty else enr_df[terms.str.contains(pathway_term, case=False, na=False)]
|
|
2858
|
+
if not target_df.empty:
|
|
2859
|
+
target = target_df.iloc[0].to_dict()
|
|
2860
|
+
result_payload["pathway_match"] = target
|
|
2861
|
+
result_payload["pathway_odds_ratio"] = target.get("Odds Ratio")
|
|
2862
|
+
result_payload["summary"] += (
|
|
2863
|
+
f" Enrichment: '{target.get('Term', pathway_term)}' odds ratio "
|
|
2864
|
+
f"{target.get('Odds Ratio')}."
|
|
2865
|
+
)
|
|
2866
|
+
else:
|
|
2867
|
+
result_payload["pathway_match"] = None
|
|
2868
|
+
result_payload["summary"] += f" Enrichment ran but pathway '{pathway_term}' was not found."
|
|
2869
|
+
except Exception as exc:
|
|
2870
|
+
result_payload["enrichment_error"] = str(exc)
|
|
2871
|
+
result_payload["summary"] += " Enrichment step failed."
|
|
2872
|
+
|
|
2873
|
+
return result_payload
|
|
2874
|
+
|
|
2875
|
+
except Exception as exc:
|
|
2876
|
+
logger.warning("pyDESeq2 failed, falling back to Mann-Whitney: %s", exc)
|
|
2877
|
+
|
|
2878
|
+
# Fallback: Mann-Whitney U
|
|
2879
|
+
import numpy as np
|
|
2880
|
+
from scipy import stats
|
|
2881
|
+
|
|
2882
|
+
g1_samples = metadata.index[metadata[condition_col] == ref].tolist()
|
|
2883
|
+
g2_samples = metadata.index[metadata[condition_col] == test].tolist()
|
|
2884
|
+
g1 = counts[g1_samples]
|
|
2885
|
+
g2 = counts[g2_samples]
|
|
2886
|
+
|
|
2887
|
+
log2fc = np.log2((g2.mean(axis=1) + 1) / (g1.mean(axis=1) + 1))
|
|
2888
|
+
pvals = []
|
|
2889
|
+
for gene in counts.index:
|
|
2890
|
+
v1 = g1.loc[gene].dropna().values
|
|
2891
|
+
v2 = g2.loc[gene].dropna().values
|
|
2892
|
+
if len(v1) >= 2 and len(v2) >= 2:
|
|
2893
|
+
_, p = stats.mannwhitneyu(v1, v2, alternative="two-sided")
|
|
2894
|
+
pvals.append(p)
|
|
2895
|
+
else:
|
|
2896
|
+
pvals.append(1.0)
|
|
2897
|
+
|
|
2898
|
+
pvals = np.array(pvals)
|
|
2899
|
+
qvals = _fdr_correct(pvals)
|
|
2900
|
+
|
|
2901
|
+
n_sig = int((qvals < alpha).sum())
|
|
2902
|
+
n_up = int(((qvals < alpha) & (log2fc.values > 0)).sum())
|
|
2903
|
+
n_down = int(((qvals < alpha) & (log2fc.values < 0)).sum())
|
|
2904
|
+
|
|
2905
|
+
results = pd.DataFrame({
|
|
2906
|
+
"log2FoldChange": log2fc, "pvalue": pvals, "padj": qvals,
|
|
2907
|
+
}, index=counts.index).sort_values("padj")
|
|
2908
|
+
target_gene_result = _resolve_target_gene(results)
|
|
2909
|
+
target_gene_summary = ""
|
|
2910
|
+
if target_gene_result:
|
|
2911
|
+
if target_gene_result.get("found"):
|
|
2912
|
+
lfc_val = target_gene_result.get("log2FoldChange")
|
|
2913
|
+
lfc_txt = f"{lfc_val:.6g}" if lfc_val is not None else "NA"
|
|
2914
|
+
target_gene_summary = f" {target_gene} log2FoldChange={lfc_txt}."
|
|
2915
|
+
else:
|
|
2916
|
+
target_gene_summary = f" {target_gene} was not found in result gene IDs."
|
|
2917
|
+
|
|
2918
|
+
return {
|
|
2919
|
+
"method": "Mann-Whitney U (fallback — install pydeseq2 for proper DESeq2)",
|
|
2920
|
+
"n_genes_tested": len(results),
|
|
2921
|
+
"n_significant": n_sig,
|
|
2922
|
+
"n_upregulated": n_up,
|
|
2923
|
+
"n_downregulated": n_down,
|
|
2924
|
+
"contrast": f"{test} vs {ref}",
|
|
2925
|
+
"n_samples_ref": n_ref,
|
|
2926
|
+
"n_samples_test": n_test,
|
|
2927
|
+
"metadata_inferred": bool(infer_metadata and not metadata_path),
|
|
2928
|
+
"alpha": alpha,
|
|
2929
|
+
"top_hits": results.head(20).to_dict("index"),
|
|
2930
|
+
"target_gene_result": target_gene_result,
|
|
2931
|
+
"summary": (
|
|
2932
|
+
f"Differential expression (Mann-Whitney fallback): {len(results):,} genes ({test} vs {ref}). "
|
|
2933
|
+
f"{n_sig} significant (FDR<{alpha}): {n_up} up, {n_down} down. "
|
|
2934
|
+
f"Install pydeseq2 for proper negative binomial modeling."
|
|
2935
|
+
+ (" Metadata was inferred from sample order (exploratory)." if infer_metadata and not metadata_path else "")
|
|
2936
|
+
+ target_gene_summary
|
|
2937
|
+
),
|
|
2938
|
+
}
|
|
2939
|
+
|
|
2940
|
+
|
|
2941
|
+
# ---------------------------------------------------------------------------
# 20. omics.multiomics_integrate
# ---------------------------------------------------------------------------


@registry.register(
    name="omics.multiomics_integrate",
    description="Integrate multiple omics modalities using MOFA+ (Multi-Omics Factor Analysis)",
    category="omics",
    parameters={
        "paths": "Comma-separated paths to h5ad files for each modality",
        "modality_names": "Comma-separated names for each modality (e.g., 'rna,atac,protein')",
        "n_factors": "Number of latent factors to learn (default 10)",
    },
    usage_guide=(
        "Integrate multiple omics datasets (RNA + ATAC, RNA + protein, etc.) into a shared "
        "latent space using MOFA+. Requires muon: pip install muon. Each modality should be "
        "an h5ad file with overlapping cell barcodes."
    ),
)
def multiomics_integrate(
    paths: str = "",
    modality_names: str = "",
    n_factors: int = 10,
    **kwargs,
) -> dict:
    """Integrate multiple omics modalities using MOFA+.

    Loads each h5ad file into an AnnData, wraps them in a MuData container,
    preprocesses each modality (normalize/log1p when values look like raw
    counts, then HVG selection), runs MOFA+ factor analysis, and clusters
    cells in the learned factor space with Leiden.

    Parameters:
        paths: Comma-separated h5ad file paths, one per modality (>= 2 required).
        modality_names: Comma-separated labels matching ``paths``; auto-named
            ``modality_i`` when empty.
        n_factors: Number of MOFA+ latent factors to learn.
        **kwargs: Ignored; absorbs extra registry-dispatch arguments.

    Returns:
        dict with integration stats and a human-readable ``summary``, or a
        dict containing ``error`` + ``summary`` on any failure (this tool
        never raises to the caller).
    """
    # Soft dependency checks: return an actionable error dict instead of raising.
    mu = _check_muon()
    if mu is None:
        return {
            "error": "muon required. Install with: pip install muon mudata",
            "summary": "Install muon for multi-omics integration: pip install muon mudata",
        }

    sc = _check_scanpy()
    if sc is None:
        return {"error": "scanpy required. Install with: pip install scanpy", "summary": "Install scanpy: pip install scanpy"}

    from mudata import MuData

    # Parse the comma-separated inputs, dropping empty fragments.
    path_list = [p.strip() for p in paths.split(",") if p.strip()]
    name_list = [n.strip() for n in modality_names.split(",") if n.strip()]

    if len(path_list) < 2:
        return {"error": "Need at least 2 modality paths", "summary": "Provide ≥2 h5ad paths for integration"}

    # Default names when none supplied; otherwise counts must line up.
    if not name_list:
        name_list = [f"modality_{i}" for i in range(len(path_list))]
    if len(name_list) != len(path_list):
        return {"error": "Number of names must match number of paths", "summary": "Mismatched path/name count"}

    # Load modalities
    modalities = {}
    for name, fpath in zip(name_list, path_list):
        fp = Path(fpath).expanduser()
        if not fp.exists():
            return {"error": f"File not found: {fpath}", "summary": f"Missing file: {fpath}"}
        try:
            adata = sc.read_h5ad(fp)
            modalities[name] = adata
        except Exception as exc:
            # Error text is truncated to keep the payload compact.
            return {"error": f"Failed to load {fpath}: {str(exc)[:200]}", "summary": f"Load error: {str(exc)[:100]}"}

    # Create MuData
    try:
        mdata = MuData(modalities)
    except Exception as exc:
        return {"error": f"MuData creation failed: {str(exc)[:200]}", "summary": f"Integration setup error: {str(exc)[:100]}"}

    # NOTE(review): reported as "shared cells", but MuData.n_obs reflects the
    # container's obs axis, which may be the union rather than the strict
    # intersection of barcodes — confirm against the muon version in use.
    n_shared = mdata.n_obs
    mod_shapes = {name: (ad.n_obs, ad.n_vars) for name, ad in modalities.items()}

    # Preprocess each modality
    for name in name_list:
        ad = mdata.mod[name]
        # Heuristic: log-normalized data rarely exceeds ~50, so a larger max
        # suggests raw counts that still need normalize_total + log1p.
        if ad.X.max() > 50:  # likely raw counts
            sc.pp.normalize_total(ad, target_sum=1e4)
            sc.pp.log1p(ad)
        # HVG selection with scanpy's classic default cutoffs; applied to
        # every modality (including non-RNA ones) before MOFA+.
        sc.pp.highly_variable_genes(ad, min_mean=0.0125, max_mean=3, min_disp=0.5)

    # Run MOFA+
    try:
        mu.tl.mofa(mdata, n_factors=n_factors, quiet=True)
    except Exception as exc:
        # Include what we already know (cell count, shapes) to aid debugging.
        return {
            "error": f"MOFA+ failed: {str(exc)[:200]}",
            "summary": f"MOFA+ integration failed: {str(exc)[:100]}",
            "n_shared_cells": n_shared,
            "modality_shapes": mod_shapes,
        }

    # Extract results
    has_mofa = "X_mofa" in mdata.obsm
    if not has_mofa:
        return {
            "error": "MOFA+ did not produce embeddings",
            "summary": "Integration ran but produced no factors",
        }

    # Downstream: neighbors + leiden on MOFA space
    sc.pp.neighbors(mdata, use_rep="X_mofa")
    sc.tl.leiden(mdata, resolution=1.0, key_added="joint_cluster")

    clusters = mdata.obs["joint_cluster"].value_counts().to_dict()
    n_clusters = len(clusters)

    return {
        "n_shared_cells": n_shared,
        "n_factors": n_factors,
        "modalities": name_list,
        "modality_shapes": mod_shapes,
        "n_joint_clusters": n_clusters,
        "joint_cluster_sizes": clusters,
        "summary": (
            f"MOFA+ integration of {len(name_list)} modalities "
            f"({', '.join(f'{n}: {s[0]}cells x {s[1]}features' for n, s in mod_shapes.items())}). "
            f"{n_shared:,} shared cells → {n_factors} factors → {n_clusters} joint clusters."
        ),
    }
|
|
3060
|
+
|
|
3061
|
+
|
|
3062
|
+
# ---------------------------------------------------------------------------
# 21. omics.methylation_cluster
# ---------------------------------------------------------------------------


@registry.register(
    name="omics.methylation_cluster",
    description="Cluster samples by methylation patterns using episcanpy",
    category="omics",
    parameters={
        "path": "Path to methylation matrix (h5ad or CSV, CpG sites as rows, samples as columns)",
        "n_top_features": "Number of most variable CpGs to use (default 5000)",
        "resolution": "Leiden clustering resolution (default 1.0)",
    },
    usage_guide=(
        "Cluster cells/samples by DNA methylation profiles. Uses episcanpy for "
        "methylation-aware preprocessing if available, falls back to scanpy/sklearn. "
        "Works on Illumina 450K/EPIC beta-value matrices or single-cell methylation h5ad."
    ),
)
def methylation_cluster(
    path: str,
    n_top_features: int = 5000,
    resolution: float = 1.0,
    **kwargs,
) -> dict:
    """Cluster samples by methylation patterns.

    Pipeline with graceful degradation across three backends:
      1. episcanpy (methylation-aware filtering + variable-feature selection),
      2. scanpy (generic HVG + PCA + Leiden),
      3. sklearn (StandardScaler + PCA + KMeans) as a last resort.

    Parameters:
        path: h5ad file, or a tabular file (CpG sites as rows, samples as
            columns) loadable by ``_load_tabular``.
        n_top_features: Cap on the number of most variable CpGs used.
        resolution: Leiden resolution (backends 1 and 2 only; the KMeans
            fallback ignores it).
        **kwargs: Ignored; absorbs extra registry-dispatch arguments.

    Returns:
        dict with ``method``, cluster counts/sizes, and a ``summary``; or a
        dict containing ``error`` + ``summary`` on load failure.
    """
    import numpy as np

    filepath = Path(path).expanduser()
    if not filepath.exists():
        return {"error": f"File not found: {path}", "summary": f"File not found: {path}"}

    # Either checker may return None; the code below handles every combination.
    epi = _check_episcanpy()
    sc = _check_scanpy()

    # Load data
    adata = None
    if filepath.suffix.lower() == ".h5ad":
        if sc is None and epi is None:
            return {"error": "scanpy or episcanpy required for h5ad", "summary": "Install scanpy or episcanpy"}
        # Prefer episcanpy's reader when available.
        reader = epi if epi else sc
        try:
            adata = reader.read_h5ad(filepath)
        except Exception as exc:
            return {"error": f"Failed to load h5ad: {str(exc)[:200]}", "summary": f"Load error: {str(exc)[:100]}"}
    else:
        # Tabular: load as AnnData
        import pandas as pd

        df, error = _load_tabular(str(filepath))
        if error:
            return {"error": error, "summary": f"Could not load: {error}"}
        try:
            import anndata

            # Transpose so samples are obs and CpGs are var
            adata = anndata.AnnData(X=df.T.values, obs=pd.DataFrame(index=df.columns), var=pd.DataFrame(index=df.index))
        except ImportError:
            return {"error": "anndata required: pip install anndata", "summary": "Install anndata"}

    n_obs, n_vars = adata.n_obs, adata.n_vars

    # Use episcanpy pipeline if available
    if epi is not None:
        try:
            # episcanpy variable feature selection
            # Keep features observed in at least 5% of samples (min 1 sample).
            epi.pp.filter_features(adata, min_cells=max(1, int(n_obs * 0.05)))
            epi.pp.select_var_feature(adata, nb_features=min(n_top_features, adata.n_vars))
            # select_var_feature is expected to flag "highly_variable"; fall
            # back to the full matrix if the column is absent.
            adata_use = adata[:, adata.var["highly_variable"]] if "highly_variable" in adata.var else adata
            # n_comps bounded by both matrix dimensions to keep PCA valid.
            epi.pp.pca(adata_use, n_comps=min(50, adata_use.n_vars - 1, adata_use.n_obs - 1))
            epi.pp.neighbors(adata_use, n_neighbors=15)
            epi.tl.leiden(adata_use, resolution=resolution)

            clusters = adata_use.obs["leiden"].value_counts().to_dict()

            result = {
                "method": "episcanpy",
                "n_samples": n_obs,
                "n_features_input": n_vars,
                "n_features_used": adata_use.n_vars,
                "n_clusters": len(clusters),
                "cluster_sizes": clusters,
                "summary": (
                    f"Methylation clustering (episcanpy): {n_obs} samples, {adata_use.n_vars} variable CpGs → "
                    f"{len(clusters)} clusters."
                ),
            }

            # Try to find marker CpGs
            try:
                epi.tl.rank_features(adata_use, groupby="leiden")
                markers = {}
                for cl in adata_use.obs["leiden"].unique():
                    # Top 5 ranked features per cluster.
                    markers[str(cl)] = list(adata_use.uns["rank_features_groups"]["names"][cl][:5])
                result["cluster_markers"] = markers
            except Exception:
                # Marker extraction is best-effort; clustering result stands alone.
                pass

            return result

        except Exception as exc:
            # Any episcanpy failure falls through to the scanpy branch below.
            logger.warning("episcanpy pipeline failed, falling back to scanpy: %s", exc)

    # Fallback: scanpy or sklearn
    if sc is not None:
        try:
            sc.pp.highly_variable_genes(adata, n_top_genes=min(n_top_features, adata.n_vars))
            # NOTE(review): this slice yields an AnnData view; scanpy tools
            # typically copy views implicitly, but confirm on sparse inputs.
            adata_use = adata[:, adata.var["highly_variable"]]
            sc.tl.pca(adata_use, n_comps=min(50, adata_use.n_vars - 1, adata_use.n_obs - 1))
            sc.pp.neighbors(adata_use, n_neighbors=15)
            sc.tl.leiden(adata_use, resolution=resolution)

            clusters = adata_use.obs["leiden"].value_counts().to_dict()
            return {
                "method": "scanpy (episcanpy not installed)",
                "n_samples": n_obs,
                "n_features_input": n_vars,
                "n_features_used": adata_use.n_vars,
                "n_clusters": len(clusters),
                "cluster_sizes": clusters,
                "summary": (
                    f"Methylation clustering (scanpy fallback): {n_obs} samples → "
                    f"{len(clusters)} clusters. Install episcanpy for methylation-specific analysis."
                ),
            }
        except Exception as exc:
            logger.warning("scanpy fallback failed: %s", exc)

    # Last resort: sklearn KMeans
    from sklearn.decomposition import PCA
    from sklearn.cluster import KMeans
    from sklearn.preprocessing import StandardScaler

    # NOTE(review): assumes adata.X is a dense array here — np.nan_to_num and
    # StandardScaler (with centering) would fail on scipy sparse matrices.
    X = adata.X
    X = np.nan_to_num(X, nan=0.0)
    X = StandardScaler().fit_transform(X)
    n_comps = min(50, X.shape[0] - 1, X.shape[1] - 1)
    X_pca = PCA(n_components=n_comps).fit_transform(X)
    # Up to 10 clusters, but never more than half the sample count; the
    # max(n_k, 2) below guarantees at least 2 clusters for KMeans.
    n_k = min(10, X.shape[0] // 2)
    labels = KMeans(n_clusters=max(n_k, 2), random_state=42, n_init=3).fit_predict(X_pca)

    import pandas as pd

    cluster_counts = pd.Series(labels).value_counts().to_dict()
    return {
        "method": "sklearn (install episcanpy or scanpy for better results)",
        "n_samples": n_obs,
        "n_features_input": n_vars,
        "n_clusters": len(cluster_counts),
        "cluster_sizes": {str(k): v for k, v in cluster_counts.items()},
        "summary": (
            f"Methylation clustering (sklearn fallback): {n_obs} samples → "
            f"{len(cluster_counts)} clusters. Install episcanpy for methylation-specific analysis."
        ),
    }
|
|
3218
|
+
|
|
3219
|
+
|
|
3220
|
+
# ---------------------------------------------------------------------------
# KEGG over-representation analysis (code-gen tool)
# ---------------------------------------------------------------------------

# System-prompt template consumed by omics.kegg_ora via
# _generate_and_execute_code. {namespace_description} and
# {data_files_description} are single-brace format placeholders (presumably
# filled by str.format in the code-gen helper — confirm there); literal braces
# inside the embedded example code are escaped as {{ }} so formatting leaves
# them intact. The string itself is runtime behavior: do not edit casually.
KEGG_ORA_SYSTEM_PROMPT = """You are an expert bioinformatics data analyst performing KEGG pathway over-representation analysis.

{namespace_description}

## Available Data
{data_files_description}

## DATA EXPLORATION (DO THIS FIRST)
```python
print("Columns:", df.columns.tolist())
print("Shape:", df.shape)
print("Head:\\n", df.head(3))
if 'Unnamed: 0' in df.columns:
    df = df.set_index('Unnamed: 0')
```

## KEGG ORA METHOD
### Step 1: Determine organism code
Common codes: 'hsa' (human), 'mmu' (mouse), 'eco' (E. coli), 'sce' (yeast).
Check https://rest.kegg.jp/list/organism for others.

### Step 2: Fetch gene-pathway mappings
- `/link/pathway/{{org}}` returns gene-to-pathway mapping (strip `path:` prefix from pathway IDs)
- `/list/pathway/{{org}}` returns pathway names (already without `path:` prefix)
- `/list/{{org}}` returns ALL genes (use as background universe — not just pathway-annotated genes)
- Pathway names include organism suffix; use substring matching when searching.

### ORA parameters
- **Background**: all genes from `/list/{{org}}` (typically much larger than the pathway-annotated subset)
- **Size filters**: skip pathways with < 5 or > 500 genes
- **Significance**: p < 0.05 and BH-adjusted p < 0.05

### Step 3: Fisher's exact test
```python
import urllib.request
from scipy.stats import fisher_exact
from statsmodels.stats.multitest import multipletests

def run_kegg_ora(gene_ids, all_kegg_genes, path2genes, path_names, min_size=5, max_size=500):
    deg_kegg = set(gene_ids) & all_kegg_genes
    N = len(all_kegg_genes)
    n = len(deg_kegg)
    if n == 0:
        return pd.DataFrame()
    results = []
    for pid, pgenes in path2genes.items():
        K = len(pgenes)
        if K < min_size or K > max_size:
            continue
        k = len(deg_kegg & pgenes)
        if k == 0:
            continue
        _, pval = fisher_exact([[k, n-k], [K-k, N-K-n+k]], alternative='greater')
        results.append({{'pathway': pid, 'name': path_names.get(pid, ''),
                        'overlap': k, 'pathway_size': K, 'pvalue': pval}})
    if not results:
        return pd.DataFrame()
    res_df = pd.DataFrame(results)
    _, res_df['padj'], _, _ = multipletests(res_df['pvalue'], method='fdr_bh')
    return res_df
```

### Step 4: Gene ID matching
KEGG uses its own gene IDs. Always print examples from both your DEG list and KEGG to
verify overlap. If overlap is low (< 10%), try stripping prefixes or case normalization.

### Directional analysis
When working with DEG results (log2FoldChange), run ORA separately on upregulated
(log2FC > threshold) and downregulated (log2FC < -threshold) genes. Combined analysis
mixes opposing signals and can produce different pathway results.

## Rules
1. Do NOT import libraries already in the namespace (pd, np, plt, sns, scipy_stats, etc.)
2. Save plots to OUTPUT_DIR: `plt.savefig(OUTPUT_DIR / "filename.png", dpi=150, bbox_inches="tight")`; `plt.close()`
3. Assign result: `result = {{"summary": "...", "answer": "PRECISE_ANSWER"}}`
4. Use print() for intermediate output to verify correctness.
5. If 0 results from a filter: print the column values and debug — do not return "N/A".

Write ONLY the Python code. No explanation, no markdown fences.
"""
|
|
3304
|
+
|
|
3305
|
+
|
|
3306
|
+
@registry.register(
    name="omics.kegg_ora",
    description=(
        "KEGG pathway over-representation analysis (ORA) on differentially expressed genes "
        "using KEGG REST API + Fisher's exact test + BH correction"
    ),
    category="omics",
    parameters={"goal": "ORA analysis to perform (include organism code if known, e.g. 'hsa' for human)"},
    usage_guide=(
        "Use when the question asks about KEGG pathway enrichment via ORA (not GSEA). "
        "Handles non-human organisms via KEGG REST API. Uses Fisher's exact test with "
        "Benjamini-Hochberg FDR correction. "
        "For human gene set enrichment with gseapy, use code.execute instead."
    ),
)
def kegg_ora(goal: str, _session=None, _prior_results=None, **kwargs) -> dict:
    """Run KEGG pathway over-representation analysis via generated code.

    Thin wrapper: delegates to the shared code-generation executor with the
    KEGG-ORA-specific system prompt. The goal string, session, and prior tool
    results are forwarded unchanged; extra kwargs from the registry dispatch
    are accepted and dropped.
    """
    # Imported lazily so merely loading this module stays cheap.
    from ct.tools.code import _generate_and_execute_code

    call_args = {
        "goal": goal,
        "system_prompt_template": KEGG_ORA_SYSTEM_PROMPT,
        "session": _session,
        "prior_results": _prior_results,
    }
    return _generate_and_execute_code(**call_args)
|