celltype-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. celltype_cli-0.1.0.dist-info/METADATA +267 -0
  2. celltype_cli-0.1.0.dist-info/RECORD +89 -0
  3. celltype_cli-0.1.0.dist-info/WHEEL +4 -0
  4. celltype_cli-0.1.0.dist-info/entry_points.txt +2 -0
  5. celltype_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
  6. ct/__init__.py +3 -0
  7. ct/agent/__init__.py +0 -0
  8. ct/agent/case_studies.py +426 -0
  9. ct/agent/config.py +523 -0
  10. ct/agent/doctor.py +544 -0
  11. ct/agent/knowledge.py +523 -0
  12. ct/agent/loop.py +99 -0
  13. ct/agent/mcp_server.py +478 -0
  14. ct/agent/orchestrator.py +733 -0
  15. ct/agent/runner.py +656 -0
  16. ct/agent/sandbox.py +481 -0
  17. ct/agent/session.py +145 -0
  18. ct/agent/system_prompt.py +186 -0
  19. ct/agent/trace_store.py +228 -0
  20. ct/agent/trajectory.py +169 -0
  21. ct/agent/types.py +182 -0
  22. ct/agent/workflows.py +462 -0
  23. ct/api/__init__.py +1 -0
  24. ct/api/app.py +211 -0
  25. ct/api/config.py +120 -0
  26. ct/api/engine.py +124 -0
  27. ct/cli.py +1448 -0
  28. ct/data/__init__.py +0 -0
  29. ct/data/compute_providers.json +59 -0
  30. ct/data/cro_database.json +395 -0
  31. ct/data/downloader.py +238 -0
  32. ct/data/loaders.py +252 -0
  33. ct/kb/__init__.py +5 -0
  34. ct/kb/benchmarks.py +147 -0
  35. ct/kb/governance.py +106 -0
  36. ct/kb/ingest.py +415 -0
  37. ct/kb/reasoning.py +129 -0
  38. ct/kb/schema_monitor.py +162 -0
  39. ct/kb/substrate.py +387 -0
  40. ct/models/__init__.py +0 -0
  41. ct/models/llm.py +370 -0
  42. ct/tools/__init__.py +195 -0
  43. ct/tools/_compound_resolver.py +297 -0
  44. ct/tools/biomarker.py +368 -0
  45. ct/tools/cellxgene.py +282 -0
  46. ct/tools/chemistry.py +1371 -0
  47. ct/tools/claude.py +390 -0
  48. ct/tools/clinical.py +1153 -0
  49. ct/tools/clue.py +249 -0
  50. ct/tools/code.py +1069 -0
  51. ct/tools/combination.py +397 -0
  52. ct/tools/compute.py +402 -0
  53. ct/tools/cro.py +413 -0
  54. ct/tools/data_api.py +2114 -0
  55. ct/tools/design.py +295 -0
  56. ct/tools/dna.py +575 -0
  57. ct/tools/experiment.py +604 -0
  58. ct/tools/expression.py +655 -0
  59. ct/tools/files.py +957 -0
  60. ct/tools/genomics.py +1387 -0
  61. ct/tools/http_client.py +146 -0
  62. ct/tools/imaging.py +319 -0
  63. ct/tools/intel.py +223 -0
  64. ct/tools/literature.py +743 -0
  65. ct/tools/network.py +422 -0
  66. ct/tools/notification.py +111 -0
  67. ct/tools/omics.py +3330 -0
  68. ct/tools/ops.py +1230 -0
  69. ct/tools/parity.py +649 -0
  70. ct/tools/pk.py +245 -0
  71. ct/tools/protein.py +678 -0
  72. ct/tools/regulatory.py +643 -0
  73. ct/tools/remote_data.py +179 -0
  74. ct/tools/report.py +181 -0
  75. ct/tools/repurposing.py +376 -0
  76. ct/tools/safety.py +1280 -0
  77. ct/tools/shell.py +178 -0
  78. ct/tools/singlecell.py +533 -0
  79. ct/tools/statistics.py +552 -0
  80. ct/tools/structure.py +882 -0
  81. ct/tools/target.py +901 -0
  82. ct/tools/translational.py +123 -0
  83. ct/tools/viability.py +218 -0
  84. ct/ui/__init__.py +0 -0
  85. ct/ui/markdown.py +31 -0
  86. ct/ui/status.py +258 -0
  87. ct/ui/suggestions.py +567 -0
  88. ct/ui/terminal.py +1456 -0
  89. ct/ui/traces.py +112 -0
ct/tools/omics.py ADDED
@@ -0,0 +1,3330 @@
1
+ """
2
+ Omics data discovery, download, and inspection tools.
3
+
4
+ Provides search and fetch capabilities for major public omics repositories:
5
+ - NCBI GEO (Gene Expression Omnibus)
6
+ - CELLxGENE Discover (Chan Zuckerberg Initiative)
7
+ - TCGA/GDC (The Cancer Genome Atlas via Genomic Data Commons)
8
+
9
+ Also provides local dataset inspection for downloaded files.
10
+ """
11
+
12
+ import gzip
13
+ import logging
14
+ import re
15
+ import shutil
16
+ import tempfile
17
+ from pathlib import Path
18
+
19
+ from ct.tools import registry
20
+ from ct.tools.http_client import request, request_json
21
+
22
+ logger = logging.getLogger("ct.tools.omics")
23
+
24
+
25
+ # ---------------------------------------------------------------------------
26
+ # Shared helpers
27
+ # ---------------------------------------------------------------------------
28
+
29
+
30
def _downloads_dir() -> Path:
    """Resolve the downloads directory (config override or ~/.ct/downloads), creating it if needed."""
    from ct.agent.config import Config

    configured = Config.load().get("data.downloads_dir", None)
    target = Path(configured).expanduser() if configured else Path.home() / ".ct" / "downloads"
    target.mkdir(parents=True, exist_ok=True)
    return target
42
+
43
+
44
def _max_download_mb() -> int:
    """Read the download size cap in MB from config (default 500)."""
    from ct.agent.config import Config

    return int(Config.load().get("data.max_download_mb", 500))
50
+
51
+
52
def _stream_download(url: str, dest_path: Path, max_mb: int | None = None) -> tuple[Path | None, str | None]:
    """Stream *url* to *dest_path* with a size cap, writing via a .tmp file.

    Returns (path, None) on success or (None, error_string) on failure.
    """
    import httpx

    limit_mb = _max_download_mb() if max_mb is None else max_mb
    limit_bytes = limit_mb * 1024 * 1024
    dest_path.parent.mkdir(parents=True, exist_ok=True)
    partial = dest_path.with_suffix(dest_path.suffix + ".tmp")

    try:
        with httpx.stream("GET", url, follow_redirects=True, timeout=120) as resp:
            resp.raise_for_status()

            # Reject early when the server declares an over-limit payload.
            declared = resp.headers.get("content-length")
            if declared and int(declared) > limit_bytes:
                return None, (
                    f"File size ({int(declared) // (1024*1024)} MB) "
                    f"exceeds limit ({limit_mb} MB). "
                    f"Increase with: ct config set data.max_download_mb <value>"
                )

            # No (trustworthy) Content-Length: enforce the cap while streaming.
            written = 0
            with open(partial, "wb") as out:
                for chunk in resp.iter_bytes(chunk_size=65536):
                    written += len(chunk)
                    if written > limit_bytes:
                        partial.unlink(missing_ok=True)
                        return None, (
                            f"Download exceeded size limit ({limit_mb} MB). "
                            f"Increase with: ct config set data.max_download_mb <value>"
                        )
                    out.write(chunk)

        # Move into place only once the full body has been written.
        shutil.move(str(partial), str(dest_path))
        return dest_path, None

    except httpx.HTTPStatusError as exc:
        partial.unlink(missing_ok=True)
        return None, f"HTTP {exc.response.status_code}: {str(exc)[:200]}"
    except Exception as exc:
        partial.unlink(missing_ok=True)
        return None, f"Download failed: {str(exc)[:200]}"
101
+
102
+
103
def _check_scanpy():
    """Return the scanpy module if it imports cleanly, otherwise None."""
    try:
        import scanpy
    except Exception as exc:
        # Broad catch on purpose: scanpy can fail at import time for reasons
        # other than ImportError (missing native deps, ABI mismatches).
        logger.debug("scanpy unavailable or failed to import: %s", exc)
        return None
    return scanpy
112
+
113
+
114
+ # ---------------------------------------------------------------------------
115
+ # 1. omics.geo_search
116
+ # ---------------------------------------------------------------------------
117
+
118
+
119
@registry.register(
    name="omics.geo_search",
    description="Search NCBI GEO for datasets by keyword, organism, and study type",
    category="omics",
    parameters={
        "query": "Search terms (gene, disease, compound, etc.)",
        "organism": "Organism filter (default 'Homo sapiens')",
        "study_type": "Filter: 'scRNA-seq', 'bulk RNA-seq', 'methylation', 'ATAC-seq', 'ChIP-seq', or 'all'",
        "max_results": "Maximum results to return (default 10)",
    },
    usage_guide=(
        "Search NCBI GEO for public omics datasets. Use before omics.geo_fetch "
        "to find relevant accessions. Supports filtering by organism and study type."
    ),
)
def geo_search(
    query: str,
    organism: str = "Homo sapiens",
    study_type: str = "all",
    max_results: int = 10,
    **kwargs,
) -> dict:
    """Query NCBI GEO via E-utilities (esearch then esummary) for datasets."""
    if not query or not query.strip():
        return {"error": "Query is required", "summary": "No query provided"}

    # Assemble the Entrez search expression from the user's filters.
    term_parts = [query.strip()]
    if organism and organism.lower() != "all":
        term_parts.append(f'"{organism}"[Organism]')

    type_keywords = {
        "scrna-seq": "single cell RNA-seq",
        "bulk rna-seq": "RNA-seq",
        "methylation": "methylation profiling",
        "atac-seq": "ATAC-seq",
        "chip-seq": "ChIP-seq",
    }
    normalized_type = study_type.lower().strip()
    if normalized_type != "all" and normalized_type in type_keywords:
        term_parts.append(type_keywords[normalized_type])

    search_term = " AND ".join(term_parts)

    # Step 1: esearch resolves the term to GDS UIDs.
    data, error = request_json(
        "GET",
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
        params={
            "db": "gds",
            "term": search_term,
            "retmax": str(min(max_results, 50)),
            "retmode": "json",
        },
        timeout=15,
    )
    if error:
        return {"error": f"GEO search failed: {error}", "summary": f"GEO search error: {error}"}

    uids = data.get("esearchresult", {}).get("idlist", [])
    if not uids:
        return {
            "datasets": [],
            "query": search_term,
            "count": 0,
            "summary": f"No GEO datasets found for '{query}' (organism={organism}, type={study_type})",
        }

    # Step 2: esummary fetches metadata for each UID.
    summary_data, error = request_json(
        "GET",
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi",
        params={
            "db": "gds",
            "id": ",".join(uids),
            "retmode": "json",
        },
        timeout=15,
    )
    if error:
        return {"error": f"GEO summary fetch failed: {error}", "summary": f"GEO summary error: {error}"}

    records = summary_data.get("result", {})
    datasets = []
    for uid in uids:
        entry = records.get(uid, {})
        if not entry or isinstance(entry, str):
            continue
        accession = entry.get("accession", "")
        # GDS records may carry the series id in a separate 'gse' field.
        if not accession.startswith("GSE"):
            gse = entry.get("gse", "")
            if gse:
                accession = f"GSE{gse}"
        datasets.append({
            "accession": accession,
            "title": entry.get("title", ""),
            "summary": (entry.get("summary", "") or "")[:300],
            "organism": entry.get("taxon", ""),
            "platform": entry.get("gpl", ""),
            "sample_count": entry.get("n_samples", 0),
            "study_type": entry.get("gdstype", study_type),
            "date": entry.get("pdat", ""),
        })

    return {
        "datasets": datasets,
        "query": search_term,
        "count": len(datasets),
        "summary": (
            f"Found {len(datasets)} GEO dataset(s) for '{query}'. "
            + "; ".join(
                f"{d['accession']}: {d['title'][:60]}" for d in datasets[:3]
            )
        ),
    }
238
+
239
+
240
+ # ---------------------------------------------------------------------------
241
+ # 2. omics.geo_fetch
242
+ # ---------------------------------------------------------------------------
243
+
244
+
245
@registry.register(
    name="omics.geo_fetch",
    description="Download a GEO dataset (expression matrix or supplementary files)",
    category="omics",
    parameters={
        "accession": "GEO accession (e.g., 'GSE12345')",
        "file_type": "Type to download: 'matrix', 'h5ad', 'supplementary' (default 'matrix')",
    },
    usage_guide=(
        "Download data from NCBI GEO after finding accessions with omics.geo_search. "
        "Use 'matrix' for series matrix files, 'supplementary' for raw/processed supplements."
    ),
)
def geo_fetch(accession: str, file_type: str = "matrix", **kwargs) -> dict:
    """Download a GEO series dataset.

    Args:
        accession: GEO series accession ('GSE' + digits). Case-insensitive;
            normalized to uppercase before use.
        file_type: 'matrix' for the series matrix file, 'h5ad' or
            'supplementary' for files from the suppl/ directory.

    Returns:
        Dict with 'path'/'size_mb' (and 'filename' for supplementary files)
        on success, or 'error' on failure; always includes a 'summary'.
    """
    # Validate the accession shape up front. IGNORECASE because the value is
    # normalized to uppercase immediately below anyway.
    if not accession or not re.match(r"^GSE\d+$", accession.strip(), re.IGNORECASE):
        return {
            "error": f"Invalid GEO accession '{accession}'. Expected format: GSE12345",
            "summary": f"Invalid accession format: {accession}",
        }

    accession = accession.strip().upper()
    # GEO FTP buckets series with the last 3 digits masked: GSE12345 -> GSE12nnn
    prefix = accession[:len(accession) - 3] + "nnn"

    dest_dir = _downloads_dir() / "geo" / accession
    dest_dir.mkdir(parents=True, exist_ok=True)

    if file_type == "matrix":
        filename = f"{accession}_series_matrix.txt.gz"
        # BUG FIX: the URL previously ended in a literal placeholder instead of
        # the series-matrix filename, so every matrix download 404'd.
        url = f"https://ftp.ncbi.nlm.nih.gov/geo/series/{prefix}/{accession}/matrix/{filename}"
        dest = dest_dir / filename

        if dest.exists():
            size_mb = round(dest.stat().st_size / (1024 * 1024), 2)
            return {
                "path": str(dest),
                "accession": accession,
                "file_type": file_type,
                "size_mb": size_mb,
                "summary": f"Already downloaded: {dest.name} ({size_mb} MB)",
            }

        path, error = _stream_download(url, dest)
        if error:
            return {"error": error, "accession": accession, "summary": f"Download failed for {accession}: {error}"}

        size_mb = round(path.stat().st_size / (1024 * 1024), 2)
        return {
            "path": str(path),
            "accession": accession,
            "file_type": file_type,
            "size_mb": size_mb,
            "summary": f"Downloaded {accession} series matrix ({size_mb} MB) to {path}",
        }

    elif file_type in ("h5ad", "supplementary"):
        # List the supplementary directory (served as an HTML index).
        suppl_url = f"https://ftp.ncbi.nlm.nih.gov/geo/series/{prefix}/{accession}/suppl/"
        resp, error = request("GET", suppl_url, timeout=15, raise_for_status=False)
        if error:
            return {"error": f"Could not list supplementary files: {error}", "summary": f"Supplementary listing failed for {accession}"}

        # Pull href targets out of the listing; drop sort links ("?..."),
        # absolute paths, and the parent-directory entry.
        text = resp.text if hasattr(resp, "text") else str(resp)
        links = re.findall(r'href="([^"]+)"', text)
        data_files = [l for l in links if not l.startswith("?") and not l.startswith("/") and l != "../"]

        if not data_files:
            return {
                "error": f"No supplementary files found for {accession}",
                "summary": f"No supplementary files available for {accession}",
            }

        # For h5ad, require an .h5ad(.gz) file; otherwise take the first entry.
        target = None
        if file_type == "h5ad":
            h5ad_files = [f for f in data_files if f.endswith(".h5ad") or f.endswith(".h5ad.gz")]
            if h5ad_files:
                target = h5ad_files[0]
            else:
                return {
                    "error": f"No h5ad files found in {accession} supplementary files",
                    "files_available": data_files[:10],
                    "summary": f"No h5ad files in {accession}. Available: {', '.join(data_files[:5])}",
                }
        else:
            target = data_files[0]

        file_url = f"{suppl_url}{target}"
        dest = dest_dir / target

        if dest.exists():
            size_mb = round(dest.stat().st_size / (1024 * 1024), 2)
            return {
                "path": str(dest),
                "accession": accession,
                "file_type": file_type,
                "filename": target,
                "size_mb": size_mb,
                "summary": f"Already downloaded: {target} ({size_mb} MB)",
            }

        path, error = _stream_download(file_url, dest)
        if error:
            return {"error": error, "accession": accession, "summary": f"Download failed: {error}"}

        size_mb = round(path.stat().st_size / (1024 * 1024), 2)
        return {
            "path": str(path),
            "accession": accession,
            "file_type": file_type,
            "filename": target,
            "size_mb": size_mb,
            "summary": f"Downloaded {target} ({size_mb} MB) from {accession}",
        }

    else:
        return {
            "error": f"Invalid file_type '{file_type}'. Choose: matrix, h5ad, supplementary",
            "summary": f"Invalid file_type: {file_type}",
        }
368
+
369
+
370
+ # ---------------------------------------------------------------------------
371
+ # 3. omics.cellxgene_search
372
+ # ---------------------------------------------------------------------------
373
+
374
+ _CELLXGENE_API = "https://api.cellxgene.cziscience.com/curation/v1"
375
+
376
+
377
@registry.register(
    name="omics.cellxgene_search",
    description="Search CELLxGENE Discover for curated single-cell datasets",
    category="omics",
    parameters={
        "query": "Search terms (gene, disease, tissue, etc.)",
        "tissue": "Filter by tissue (optional)",
        "disease": "Filter by disease (optional)",
        "organism": "Filter by organism (default 'Homo sapiens')",
        "max_results": "Maximum results to return (default 10)",
    },
    usage_guide=(
        "Search the CZI CELLxGENE Discover portal for curated, analysis-ready "
        "single-cell datasets. Use before omics.cellxgene_fetch to get dataset IDs."
    ),
)
def cellxgene_search(
    query: str,
    tissue: str = "",
    disease: str = "",
    organism: str = "Homo sapiens",
    max_results: int = 10,
    **kwargs,
) -> dict:
    """Keyword-match CELLxGENE collections/datasets and apply facet filters."""
    if not query or not query.strip():
        return {"error": "Query is required", "summary": "No query provided"}

    collections, error = request_json("GET", f"{_CELLXGENE_API}/collections", timeout=20)
    if error:
        return {"error": f"CELLxGENE search failed: {error}", "summary": f"CELLxGENE error: {error}"}

    if not isinstance(collections, list):
        return {"error": "Unexpected CELLxGENE response format", "summary": "CELLxGENE returned unexpected format"}

    terms = query.lower().strip().split()

    def facet_labels(dataset: dict, field: str) -> list[str]:
        # Facet fields, when present, are lists of {"label": ...} dicts.
        raw = dataset.get(field)
        entries = raw if isinstance(raw, list) else []
        return [entry.get("label", "") for entry in entries]

    matches = []
    for collection in collections:
        name_lower = (collection.get("name") or "").lower()
        desc_lower = (collection.get("description") or "").lower()
        collection_text = name_lower + " " + desc_lower
        collection_hit = any(t in collection_text for t in terms)

        for dataset in collection.get("datasets", []):
            title_lower = (dataset.get("title") or dataset.get("name") or "").lower()
            searchable = title_lower + " " + collection_text

            # A collection-level hit admits all of its datasets.
            if not collection_hit and not any(t in searchable for t in terms):
                continue

            # Organism filter only applies when the dataset declares organisms.
            organisms_lower = [label.lower() for label in facet_labels(dataset, "organism")]
            if organism and organisms_lower and organism.lower() not in " ".join(organisms_lower):
                continue

            tissues_lower = [label.lower() for label in facet_labels(dataset, "tissue")]
            if tissue and tissue.lower() not in " ".join(tissues_lower):
                continue

            diseases_lower = [label.lower() for label in facet_labels(dataset, "disease")]
            if disease and disease.lower() not in " ".join(diseases_lower):
                continue

            matches.append({
                "dataset_id": dataset.get("dataset_id", ""),
                "collection_id": collection.get("collection_id", ""),
                "title": dataset.get("title") or dataset.get("name") or name_lower,
                "description": desc_lower[:200] if desc_lower else "",
                "tissue": ", ".join(facet_labels(dataset, "tissue")),
                "disease": ", ".join(facet_labels(dataset, "disease")),
                "cell_count": dataset.get("cell_count", 0),
                "organism": ", ".join(facet_labels(dataset, "organism")),
                "assay": ", ".join(facet_labels(dataset, "assay")),
            })

            if len(matches) >= max_results:
                break
        if len(matches) >= max_results:
            break

    return {
        "datasets": matches,
        "query": query,
        "count": len(matches),
        "summary": (
            f"Found {len(matches)} CELLxGENE dataset(s) for '{query}'. "
            + ("; ".join(f"{d['title'][:50]} ({d['cell_count']} cells)" for d in matches[:3]) if matches else "Try broader search terms.")
        ),
    }
490
+
491
+
492
+ # ---------------------------------------------------------------------------
493
+ # 4. omics.cellxgene_fetch
494
+ # ---------------------------------------------------------------------------
495
+
496
+
497
@registry.register(
    name="omics.cellxgene_fetch",
    description="Download an h5ad dataset from CELLxGENE Discover",
    category="omics",
    parameters={
        "dataset_id": "CELLxGENE dataset ID (from omics.cellxgene_search results)",
    },
    usage_guide=(
        "Download a single-cell dataset from CELLxGENE. Requires a dataset_id "
        "from omics.cellxgene_search results. Downloads as h5ad format."
    ),
)
def cellxgene_fetch(dataset_id: str, **kwargs) -> dict:
    """Download an h5ad dataset from CELLxGENE Discover.

    Args:
        dataset_id: Dataset identifier from omics.cellxgene_search results.

    Returns:
        Dict with 'path', 'filename', 'size_mb' on success or 'error' on
        failure; always includes a human-readable 'summary'.
    """
    if not dataset_id or not dataset_id.strip():
        return {"error": "dataset_id is required", "summary": "No dataset_id provided"}

    dataset_id = dataset_id.strip()

    # Ask the curation API which downloadable assets exist for this dataset.
    assets_url = f"{_CELLXGENE_API}/datasets/{dataset_id}/assets"
    assets, error = request_json("GET", assets_url, timeout=15)
    if error:
        return {"error": f"Failed to get assets: {error}", "summary": f"CELLxGENE asset lookup failed: {error}"}

    if not isinstance(assets, list) or not assets:
        return {
            "error": f"No downloadable assets found for dataset {dataset_id}",
            "summary": f"No assets for dataset {dataset_id}",
        }

    # Prefer the h5ad asset; the type field is spelled differently across
    # API versions ('filetype' vs 'file_type').
    h5ad_asset = None
    for asset in assets:
        filetype = (asset.get("filetype") or asset.get("file_type") or "").lower()
        filename = (asset.get("filename") or "").lower()
        if "h5ad" in filetype or filename.endswith(".h5ad"):
            h5ad_asset = asset
            break

    if not h5ad_asset:
        # Fall back to the first asset rather than failing outright.
        h5ad_asset = assets[0]

    download_url = h5ad_asset.get("presigned_url") or h5ad_asset.get("url", "")
    if not download_url:
        return {
            "error": "No download URL in asset metadata",
            "summary": "CELLxGENE asset has no download URL",
        }

    filename = h5ad_asset.get("filename", f"{dataset_id}.h5ad")
    dest_dir = _downloads_dir() / "cellxgene" / dataset_id
    dest = dest_dir / filename

    if dest.exists():
        size_mb = round(dest.stat().st_size / (1024 * 1024), 2)
        return {
            "path": str(dest),
            "dataset_id": dataset_id,
            # Added for parity with the fresh-download return below.
            "filename": filename,
            "size_mb": size_mb,
            # BUG FIX: this summary previously contained a literal placeholder
            # instead of the cached file's name.
            "summary": f"Already downloaded: {dest.name} ({size_mb} MB)",
        }

    path, error = _stream_download(download_url, dest)
    if error:
        return {"error": error, "dataset_id": dataset_id, "summary": f"Download failed: {error}"}

    size_mb = round(path.stat().st_size / (1024 * 1024), 2)
    return {
        "path": str(path),
        "dataset_id": dataset_id,
        "filename": filename,
        "size_mb": size_mb,
        "summary": f"Downloaded CELLxGENE dataset {dataset_id} ({size_mb} MB) to {path}",
    }
573
+
574
+
575
+ # ---------------------------------------------------------------------------
576
+ # 5. omics.tcga_search
577
+ # ---------------------------------------------------------------------------
578
+
579
+ _GDC_API = "https://api.gdc.cancer.gov"
580
+
581
+
582
@registry.register(
    name="omics.tcga_search",
    description="Search TCGA/GDC for cancer genomics projects and data files",
    category="omics",
    parameters={
        "query": "Search terms (cancer type, gene, etc.)",
        "data_type": "Filter: 'gene_expression', 'methylation', 'mutation', 'clinical' (default 'gene_expression')",
        "max_results": "Maximum results to return (default 10)",
    },
    usage_guide=(
        "Search the NCI Genomic Data Commons (GDC) for TCGA and other cancer "
        "genomics projects. Use before omics.tcga_fetch to find file UUIDs."
    ),
)
def tcga_search(
    query: str,
    data_type: str = "gene_expression",
    max_results: int = 10,
    **kwargs,
) -> dict:
    """Search GDC projects by keyword and report per-data-category file counts."""
    import json

    if not query or not query.strip():
        return {"error": "Query is required", "summary": "No query provided"}

    valid_types = {"gene_expression", "methylation", "mutation", "clinical"}
    if data_type not in valid_types:
        return {
            "error": f"Invalid data_type '{data_type}'. Choose from: {', '.join(valid_types)}",
            "summary": f"Invalid data_type: {data_type}",
        }

    # Project summaries expose category-level counts, not file-level
    # data_type counts, so map the requested type onto a GDC data category.
    category_for_type = {
        "gene_expression": "Transcriptome Profiling",
        "methylation": "DNA Methylation",
        "mutation": "Simple Nucleotide Variation",
        "clinical": "Clinical",
    }

    # Match the query against project id, name, disease type, or primary site.
    search_filters = {
        "op": "or",
        "content": [
            {"op": "in", "content": {"field": "project_id", "value": [query.upper()]}},
            {"op": "like", "content": {"field": "name", "value": f"*{query}*"}},
            {"op": "like", "content": {"field": "disease_type", "value": f"*{query}*"}},
            {"op": "like", "content": {"field": "primary_site", "value": f"*{query}*"}},
        ],
    }

    data, error = request_json(
        "GET",
        f"{_GDC_API}/projects",
        params={
            "filters": json.dumps(search_filters),
            "fields": "project_id,name,disease_type,primary_site,summary.case_count,summary.file_count,summary.data_categories.data_category,summary.data_categories.file_count",
            "size": str(min(max_results, 50)),
            "format": "json",
        },
        timeout=15,
    )
    if error:
        return {"error": f"GDC search failed: {error}", "summary": f"GDC error: {error}"}

    wanted_category = category_for_type.get(data_type, "")
    projects = []
    for hit in data.get("data", {}).get("hits", []):
        summary = hit.get("summary", {})
        matched_count = 0
        category_names = []
        for cat in summary.get("data_categories", []):
            label = cat.get("data_category", "")
            if label:
                category_names.append(label)
                if label.lower() == wanted_category.lower():
                    matched_count = int(cat.get("file_count", 0) or 0)

        projects.append({
            "project_id": hit.get("project_id", ""),
            "name": hit.get("name", ""),
            "disease_type": hit.get("disease_type", ""),
            "primary_site": hit.get("primary_site", ""),
            "case_count": summary.get("case_count", 0),
            "file_count": summary.get("file_count", 0),
            "data_type": data_type,
            "matching_data_category": wanted_category,
            "data_type_file_count": matched_count,
            "available_data_categories": category_names[:20],
            "count_method": "project_summary_data_category",
        })

    if not projects:
        return {
            "projects": [],
            "query": query,
            "count": 0,
            "summary": f"No TCGA/GDC projects found for '{query}'",
        }

    return {
        "projects": projects,
        "query": query,
        "data_type": data_type,
        "count": len(projects),
        "summary": (
            f"Found {len(projects)} GDC project(s) for '{query}'. "
            + "; ".join(f"{p['project_id']}: {p['name'][:40]} ({p['case_count']} cases)" for p in projects[:3])
        ),
    }
695
+
696
+
697
+ # ---------------------------------------------------------------------------
698
+ # 6. omics.tcga_fetch
699
+ # ---------------------------------------------------------------------------
700
+
701
+
702
@registry.register(
    name="omics.tcga_fetch",
    description="Download a data file from TCGA/GDC",
    category="omics",
    parameters={
        "file_id": "GDC file UUID to download",
        "project_id": "GDC project ID (optional, used to search for files if file_id not provided)",
    },
    usage_guide=(
        "Download a specific file from GDC by UUID. If only project_id is given, "
        "searches for the most relevant gene expression file and downloads it."
    ),
)
def tcga_fetch(file_id: str = "", project_id: str = "", **kwargs) -> dict:
    """Download an open-access GDC file by UUID, or pick one from a project."""
    import json

    if not file_id and not project_id:
        return {
            "error": "Either file_id or project_id is required",
            "summary": "No file_id or project_id provided",
        }

    if file_id:
        file_name = f"{file_id}.gz"
    else:
        # No UUID given: take the first open-access gene expression file
        # from the requested project.
        query_filters = {
            "op": "and",
            "content": [
                {"op": "=", "content": {"field": "cases.project.project_id", "value": project_id}},
                {"op": "=", "content": {"field": "data_type", "value": "Gene Expression Quantification"}},
                {"op": "=", "content": {"field": "access", "value": "open"}},
            ],
        }
        data, error = request_json(
            "GET",
            f"{_GDC_API}/files",
            params={
                "filters": json.dumps(query_filters),
                "fields": "file_id,file_name,file_size,data_type",
                "size": "1",
                "format": "json",
            },
            timeout=15,
        )
        if error:
            return {"error": f"File search failed: {error}", "summary": f"GDC file search error: {error}"}

        hits = data.get("data", {}).get("hits", [])
        if not hits:
            return {
                "error": f"No open-access files found for project {project_id}",
                "summary": f"No downloadable files for {project_id}",
            }
        file_id = hits[0].get("file_id", "")
        file_name = hits[0].get("file_name", f"{file_id}.gz")

    # Bucket downloads under the project id when known, else a UUID prefix.
    label = project_id or file_id[:12]
    dest = _downloads_dir() / "tcga" / label / file_name

    if dest.exists():
        size_mb = round(dest.stat().st_size / (1024 * 1024), 2)
        return {
            "path": str(dest),
            "file_id": file_id,
            "project_id": project_id,
            "size_mb": size_mb,
            "summary": f"Already downloaded: {file_name} ({size_mb} MB)",
        }

    path, error = _stream_download(f"{_GDC_API}/data/{file_id}", dest)
    if error:
        return {"error": error, "file_id": file_id, "summary": f"Download failed: {error}"}

    size_mb = round(path.stat().st_size / (1024 * 1024), 2)
    return {
        "path": str(path),
        "file_id": file_id,
        "project_id": project_id,
        "filename": file_name,
        "size_mb": size_mb,
        "summary": f"Downloaded GDC file {file_name} ({size_mb} MB) to {path}",
    }
786
+
787
+
788
+ # ---------------------------------------------------------------------------
789
+ # 7. omics.dataset_info
790
+ # ---------------------------------------------------------------------------
791
+
792
+
793
@registry.register(
    name="omics.dataset_info",
    description="Inspect a downloaded dataset file and return metadata summary",
    category="omics",
    parameters={
        "path": "Path to the downloaded dataset file (h5ad, CSV, TSV, or matrix.txt.gz)",
    },
    usage_guide=(
        "Inspect a downloaded omics file before analysis. Returns shape, columns, "
        "metadata. Use after omics.*_fetch to understand the data before running "
        "singlecell.* or code.execute on it."
    ),
)
def dataset_info(path: str, **kwargs) -> dict:
    """Summarize a downloaded dataset file, dispatching on its extension."""
    if not path:
        return {"error": "Path is required", "summary": "No path provided"}

    target = Path(path).expanduser()
    if not target.exists():
        return {"error": f"File not found: {path}", "summary": f"File not found: {path}"}

    size_mb = round(target.stat().st_size / (1024 * 1024), 2)
    ext = target.suffix.lower()
    # Fold the inner extension into gzip names: "foo.txt.gz" -> ".txt.gz".
    if ext == ".gz":
        ext = Path(target.stem).suffix.lower() + ext

    try:
        if ext == ".h5ad":
            return _inspect_h5ad(target, size_mb)
        if ext in (".csv", ".tsv", ".txt"):
            return _inspect_tabular(target, size_mb, sep="," if ext == ".csv" else "\t")
        if ext == ".txt.gz":
            return _inspect_matrix_gz(target, size_mb)
        return {
            "path": str(target),
            "file_type": ext,
            "size_mb": size_mb,
            "summary": f"File type '{ext}' not directly inspectable. Size: {size_mb} MB. Try loading with code.execute.",
        }
    except Exception as exc:
        return {
            "error": f"Inspection failed: {str(exc)[:200]}",
            "path": str(target),
            "size_mb": size_mb,
            "summary": f"Could not inspect {target.name}: {str(exc)[:100]}",
        }
844
+
845
+
846
def _inspect_h5ad(filepath: Path, size_mb: float) -> dict:
    """Inspect an h5ad (AnnData) file using scanpy, if available."""
    sc = _check_scanpy()
    if sc is None:
        # Without scanpy we can only report the file's presence and size.
        return {
            "path": str(filepath),
            "file_type": "h5ad",
            "size_mb": size_mb,
            "error": "scanpy not installed. Install with: pip install scanpy",
            "summary": f"h5ad file ({size_mb} MB) — install scanpy to inspect: pip install scanpy",
        }

    adata = sc.read_h5ad(filepath)
    obs_columns = list(adata.obs.columns)
    var_columns = list(adata.var.columns)
    layer_names = list(adata.layers.keys()) if adata.layers else []

    # Preview up to 5 unique values from each of the first 5 obs columns.
    obs_preview = {name: list(adata.obs[name].unique()[:5]) for name in obs_columns[:5]}

    summary = (
        f"h5ad: {adata.n_obs:,} cells x {adata.n_vars:,} genes ({size_mb} MB). "
        f"Obs columns: {', '.join(obs_columns[:8])}. "
        f"Layers: {', '.join(layer_names) if layer_names else 'X only'}."
    )
    return {
        "path": str(filepath),
        "file_type": "h5ad",
        "size_mb": size_mb,
        "n_cells": adata.n_obs,
        "n_genes": adata.n_vars,
        "obs_columns": obs_columns[:20],
        "var_columns": var_columns[:20],
        "layers": layer_names,
        "obs_preview": obs_preview,
        "summary": summary,
    }
879
+
880
+
881
+ def _inspect_tabular(filepath: Path, size_mb: float, sep: str = ",") -> dict:
882
+ """Inspect a CSV/TSV file."""
883
+ import pandas as pd
884
+
885
+ # Read just the first rows to get shape info without loading everything
886
+ df_head = pd.read_csv(filepath, sep=sep, nrows=5, index_col=0)
887
+ # Get full shape by counting lines
888
+ with open(filepath) as f:
889
+ n_lines = sum(1 for _ in f) - 1 # subtract header
890
+
891
+ columns = list(df_head.columns)
892
+ dtypes = {col: str(dtype) for col, dtype in df_head.dtypes.items()}
893
+
894
+ return {
895
+ "path": str(filepath),
896
+ "file_type": "csv" if sep == "," else "tsv",
897
+ "size_mb": size_mb,
898
+ "shape": [n_lines, len(columns)],
899
+ "columns": columns[:30],
900
+ "dtypes": {k: v for k, v in list(dtypes.items())[:15]},
901
+ "head_preview": df_head.head(3).to_dict(),
902
+ "summary": (
903
+ f"Tabular: {n_lines:,} rows x {len(columns)} columns ({size_mb} MB). "
904
+ f"Columns: {', '.join(columns[:8])}"
905
+ ),
906
+ }
907
+
908
+
909
+ def _inspect_matrix_gz(filepath: Path, size_mb: float) -> dict:
910
+ """Inspect a GEO series matrix .txt.gz file."""
911
+ metadata = {}
912
+ n_rows = 0
913
+ columns = []
914
+
915
+ with gzip.open(filepath, "rt", errors="replace") as f:
916
+ for line in f:
917
+ if line.startswith("!"):
918
+ # Parse metadata lines
919
+ parts = line.strip().split("\t", 1)
920
+ if len(parts) == 2:
921
+ key = parts[0].lstrip("!").strip()
922
+ val = parts[1].strip().strip('"')
923
+ if key not in metadata:
924
+ metadata[key] = val
925
+ elif isinstance(metadata[key], list):
926
+ metadata[key].append(val)
927
+ else:
928
+ metadata[key] = [metadata[key], val]
929
+ elif line.startswith('"ID_REF"') or line.startswith("ID_REF"):
930
+ columns = [c.strip('"') for c in line.strip().split("\t")]
931
+ elif not line.startswith("!") and line.strip():
932
+ n_rows += 1
933
+
934
+ # Extract key metadata fields
935
+ title = metadata.get("Series_title", "")
936
+ organism = metadata.get("Series_organism", "")
937
+ n_samples = len(columns) - 1 if columns else 0
938
+
939
+ return {
940
+ "path": str(filepath),
941
+ "file_type": "matrix.txt.gz",
942
+ "size_mb": size_mb,
943
+ "title": title,
944
+ "organism": organism,
945
+ "n_probes_or_genes": n_rows,
946
+ "n_samples": n_samples,
947
+ "sample_ids": columns[1:11] if columns else [],
948
+ "metadata_keys": list(metadata.keys())[:15],
949
+ "summary": (
950
+ f"GEO matrix: {n_rows:,} probes/genes x {n_samples} samples ({size_mb} MB). "
951
+ f"Title: {title[:80]}. Organism: {organism}."
952
+ ),
953
+ }
954
+
955
+
956
+ # ===========================================================================
957
+ # Analysis tools — modality-specific processing of downloaded data
958
+ # ===========================================================================
959
+
960
+
961
+ def _load_tabular(path: str, **read_kwargs) -> "tuple[pd.DataFrame | None, str | None]":
962
+ """Load a tabular file, returning (df, error)."""
963
+ import pandas as pd
964
+
965
+ filepath = Path(path).expanduser()
966
+ if not filepath.exists():
967
+ return None, f"File not found: {path}"
968
+ suffix = filepath.suffix.lower()
969
+ kwargs = dict(read_kwargs)
970
+ try:
971
+ # Keep prior behavior by defaulting to first column as index,
972
+ # while allowing callers to override.
973
+ kwargs.setdefault("index_col", 0)
974
+ if suffix in {".xlsx", ".xls"}:
975
+ df = pd.read_excel(filepath, **kwargs)
976
+ return df, None
977
+ if suffix == ".csv":
978
+ df = pd.read_csv(filepath, sep=",", **kwargs)
979
+ return df, None
980
+ if suffix in {".tsv", ".tab"}:
981
+ df = pd.read_csv(filepath, sep="\t", **kwargs)
982
+ return df, None
983
+ if suffix == ".txt":
984
+ # Many omics count matrices are whitespace-delimited.
985
+ try:
986
+ df = pd.read_csv(filepath, sep=r"\s+", engine="python", **kwargs)
987
+ return df, None
988
+ except Exception:
989
+ df = pd.read_csv(filepath, sep="\t", **kwargs)
990
+ return df, None
991
+ # Generic fallback: delimiter sniffing for unknown text-like files.
992
+ df = pd.read_csv(filepath, sep=None, engine="python", **kwargs)
993
+ return df, None
994
+ except Exception as exc:
995
+ return None, f"Failed to read {filepath.name}: {str(exc)[:200]}"
996
+
997
+
998
+ def _parse_sample_groups(
999
+ df,
1000
+ group1: str = "",
1001
+ group2: str = "",
1002
+ *,
1003
+ auto_grouping: bool = False,
1004
+ min_group_size: int = 2,
1005
+ group_names: tuple[str, str] = ("group1", "group2"),
1006
+ ) -> tuple[list[str], list[str], dict | None]:
1007
+ """Resolve and validate group sample assignments for two-group comparisons."""
1008
+ all_samples = [str(c) for c in df.columns]
1009
+ g1_label, g2_label = group_names
1010
+ g1_samples = [s.strip() for s in group1.split(",") if s.strip()] if group1 else []
1011
+ g2_samples = [s.strip() for s in group2.split(",") if s.strip()] if group2 else []
1012
+
1013
+ # Require explicit groups unless user opts in to auto-splitting.
1014
+ if not g1_samples and not g2_samples:
1015
+ if not auto_grouping:
1016
+ return [], [], {
1017
+ "error": (
1018
+ f"Explicit sample groups are required. Provide {g1_label} and {g2_label} "
1019
+ "as comma-separated sample names. "
1020
+ "Set auto_grouping=True only for quick exploratory analysis."
1021
+ ),
1022
+ "available_samples": all_samples[:30],
1023
+ "n_samples": len(all_samples),
1024
+ "summary": (
1025
+ f"No groups provided. Define {g1_label}/{g2_label} using sample names "
1026
+ f"(found {len(all_samples)} samples)."
1027
+ ),
1028
+ }
1029
+
1030
+ if len(all_samples) < (min_group_size * 2):
1031
+ return [], [], {
1032
+ "error": (
1033
+ f"Need at least {min_group_size * 2} samples for auto_grouping "
1034
+ f"({min_group_size} per group), found {len(all_samples)}."
1035
+ ),
1036
+ "available_samples": all_samples[:30],
1037
+ "summary": f"Too few samples for auto_grouping: {len(all_samples)}",
1038
+ }
1039
+
1040
+ mid = len(all_samples) // 2
1041
+ g1_samples = all_samples[:mid]
1042
+ g2_samples = all_samples[mid:]
1043
+
1044
+ elif (g1_samples and not g2_samples) or (g2_samples and not g1_samples):
1045
+ return [], [], {
1046
+ "error": f"Both {g1_label} and {g2_label} must be provided together.",
1047
+ "available_samples": all_samples[:30],
1048
+ "summary": f"Incomplete group definition: need both {g1_label} and {g2_label}",
1049
+ }
1050
+
1051
+ missing = [s for s in (g1_samples + g2_samples) if s not in all_samples]
1052
+ if missing:
1053
+ return [], [], {
1054
+ "error": f"Samples not found: {missing}",
1055
+ "available_samples": all_samples[:30],
1056
+ "summary": f"Sample names not found in matrix. Available: {', '.join(all_samples[:10])}",
1057
+ }
1058
+
1059
+ overlap = sorted(set(g1_samples).intersection(g2_samples))
1060
+ if overlap:
1061
+ return [], [], {
1062
+ "error": f"Samples cannot appear in both groups: {overlap}",
1063
+ "summary": "Group overlap detected",
1064
+ }
1065
+
1066
+ if len(g1_samples) < min_group_size or len(g2_samples) < min_group_size:
1067
+ return [], [], {
1068
+ "error": (
1069
+ f"Each group needs at least {min_group_size} samples. "
1070
+ f"Got {g1_label}={len(g1_samples)}, {g2_label}={len(g2_samples)}."
1071
+ ),
1072
+ "summary": "Insufficient replicates per group",
1073
+ }
1074
+
1075
+ return g1_samples, g2_samples, None
1076
+
1077
+
1078
+ def _fdr_correct(pvalues):
1079
+ """Benjamini-Hochberg FDR correction. Returns array of q-values."""
1080
+ import numpy as np
1081
+
1082
+ pvals = np.asarray(pvalues, dtype=float)
1083
+ n = len(pvals)
1084
+ if n == 0:
1085
+ return pvals
1086
+ ranked = pvals.argsort().argsort() + 1 # 1-based rank
1087
+ qvals = pvals * n / ranked
1088
+ # Enforce monotonicity (from largest p-value down)
1089
+ order = pvals.argsort()[::-1]
1090
+ qvals_sorted = qvals[order]
1091
+ for i in range(1, len(qvals_sorted)):
1092
+ if qvals_sorted[i] > qvals_sorted[i - 1]:
1093
+ qvals_sorted[i] = qvals_sorted[i - 1]
1094
+ qvals[order] = qvals_sorted
1095
+ return np.clip(qvals, 0, 1)
1096
+
1097
+
1098
+ # ---------------------------------------------------------------------------
1099
+ # 8. omics.methylation_diff
1100
+ # ---------------------------------------------------------------------------
1101
+
1102
+
1103
@registry.register(
    name="omics.methylation_diff",
    description="Differential methylation analysis between two sample groups",
    category="omics",
    parameters={
        "path": "Path to methylation beta-value matrix (rows=CpG sites, cols=samples)",
        "group1": "Comma-separated sample names for group 1",
        "group2": "Comma-separated sample names for group 2",
        "auto_grouping": "If true, splits samples by column order for exploratory use (default false)",
        "delta_beta_cutoff": "Minimum absolute delta-beta to call DMR (default 0.2)",
        "fdr_cutoff": "FDR significance threshold (default 0.05)",
    },
    usage_guide=(
        "Analyze differential methylation from beta-value matrices (e.g., Illumina 450K/EPIC). "
        "Requires a matrix with CpG sites as rows and samples as columns. "
        "Use after omics.geo_fetch or omics.tcga_fetch to download methylation data. "
        "For reliable analysis, provide explicit group1/group2 sample lists."
    ),
)
def methylation_diff(
    path: str,
    group1: str = "",
    group2: str = "",
    auto_grouping: bool = False,
    delta_beta_cutoff: float = 0.2,
    fdr_cutoff: float = 0.05,
    **kwargs,
) -> dict:
    """Differential methylation analysis between two groups.

    Loads a beta-value matrix (rows = CpG sites, columns = samples),
    compares group2 vs group1 per site with a two-sided Mann-Whitney U
    test, applies Benjamini-Hochberg FDR correction via _fdr_correct,
    and reports sites passing both the FDR and |delta-beta| cutoffs.
    All failures are returned as "error"/"summary" dicts, not raised.
    """
    import numpy as np
    from scipy import stats

    df, error = _load_tabular(path)
    if error:
        return {"error": error, "summary": f"Could not load methylation data: {error}"}

    # Resolve/validate the two sample groups (explicit lists, or an
    # opt-in half/half column split when auto_grouping=True).
    g1_samples, g2_samples, group_error = _parse_sample_groups(
        df,
        group1=group1,
        group2=group2,
        auto_grouping=auto_grouping,
        min_group_size=2,
    )
    if group_error:
        return group_error

    # Drop all-NaN sites per group, then restrict both to shared sites.
    g1 = df[g1_samples].dropna(how="all")
    g2 = df[g2_samples].dropna(how="all")
    common_sites = g1.index.intersection(g2.index)
    g1 = g1.loc[common_sites]
    g2 = g2.loc[common_sites]

    # Calculate delta-beta and p-values (delta = group2 - group1)
    mean1 = g1.mean(axis=1)
    mean2 = g2.mean(axis=1)
    delta_beta = mean2 - mean1

    # Nonparametric per-site test; sites with <2 usable values in either
    # group get p=1.0 so they can never reach significance.
    # NOTE(review): g1.loc[site] assumes a unique CpG index — duplicate
    # site IDs would return a DataFrame here; verify upstream data.
    pvals = []
    for site in common_sites:
        v1 = g1.loc[site].dropna().values
        v2 = g2.loc[site].dropna().values
        if len(v1) >= 2 and len(v2) >= 2:
            _, p = stats.mannwhitneyu(v1, v2, alternative="two-sided")
            pvals.append(p)
        else:
            pvals.append(1.0)

    pvals = np.array(pvals)
    qvals = _fdr_correct(pvals)

    # Identify DMRs: a site must pass both the FDR and effect-size cutoffs.
    sig_mask = (qvals < fdr_cutoff) & (np.abs(delta_beta.values) >= delta_beta_cutoff)
    n_sig = int(sig_mask.sum())
    hyper = int(((qvals < fdr_cutoff) & (delta_beta.values >= delta_beta_cutoff)).sum())
    hypo = int(((qvals < fdr_cutoff) & (delta_beta.values <= -delta_beta_cutoff)).sum())

    # Top hits: the 20 lowest-FDR sites, keyed by site ID.
    import pandas as pd

    results_df = pd.DataFrame({
        "mean_group1": mean1,
        "mean_group2": mean2,
        "delta_beta": delta_beta,
        "pvalue": pvals,
        "fdr": qvals,
    }, index=common_sites)
    results_df = results_df.sort_values("fdr")
    top_hits = results_df.head(20).to_dict("index")

    return {
        "n_sites_tested": len(common_sites),
        "n_significant": n_sig,
        "n_hypermethylated": hyper,
        "n_hypomethylated": hypo,
        "group1_samples": g1_samples,
        "group2_samples": g2_samples,
        "auto_grouping_used": bool(auto_grouping and not group1 and not group2),
        "delta_beta_cutoff": delta_beta_cutoff,
        "fdr_cutoff": fdr_cutoff,
        "top_hits": top_hits,
        "summary": (
            f"Tested {len(common_sites):,} CpG sites: {n_sig} significant (FDR<{fdr_cutoff}, "
            f"|Δβ|≥{delta_beta_cutoff}). {hyper} hypermethylated, {hypo} hypomethylated."
        ),
    }
1208
+
1209
+
1210
+ # ---------------------------------------------------------------------------
1211
+ # 9. omics.methylation_profile
1212
+ # ---------------------------------------------------------------------------
1213
+
1214
+
1215
@registry.register(
    name="omics.methylation_profile",
    description="Summarize methylation landscape: distribution, variability, and global patterns",
    category="omics",
    parameters={
        "path": "Path to methylation beta-value matrix",
    },
    usage_guide=(
        "Get an overview of a methylation dataset: global methylation levels, "
        "bimodal distribution (typical of 450K/EPIC), most variable CpGs. "
        "Use as a first step before methylation_diff."
    ),
)
def methylation_profile(path: str, **kwargs) -> dict:
    """Summarize the methylation landscape of a beta-value matrix.

    Reports global beta statistics, the low/intermediate/high split of
    the beta distribution, the 20 most variable CpG sites, and
    per-sample mean methylation.
    """
    import numpy as np

    df, error = _load_tabular(path)
    if error:
        return {"error": error, "summary": f"Could not load: {error}"}

    n_sites, n_samples = df.shape

    # Pool every non-NaN beta value for the global distribution summary.
    betas = df.values.flatten()
    betas = betas[~np.isnan(betas)]

    global_mean = float(np.mean(betas))
    global_median = float(np.median(betas))
    # Beta-value bins: <0.2 unmethylated, 0.2-0.8 intermediate, >0.8 methylated.
    frac_low = float(np.mean(betas < 0.2))
    frac_mid = float(np.mean((betas >= 0.2) & (betas <= 0.8)))
    frac_high = float(np.mean(betas > 0.8))

    # Sites ranked by across-sample variance.
    variance_by_site = df.var(axis=1).dropna().sort_values(ascending=False)
    most_variable = list(variance_by_site.head(20).index)

    # Per-sample mean methylation (first 20 samples reported).
    per_sample_mean = df.mean(axis=0).to_dict()

    return {
        "n_sites": n_sites,
        "n_samples": n_samples,
        "global_mean_beta": round(global_mean, 4),
        "global_median_beta": round(global_median, 4),
        "fraction_unmethylated": round(frac_low, 3),
        "fraction_intermediate": round(frac_mid, 3),
        "fraction_methylated": round(frac_high, 3),
        "top_variable_sites": most_variable,
        "sample_mean_betas": {k: round(v, 4) for k, v in list(per_sample_mean.items())[:20]},
        "summary": (
            f"Methylation profile: {n_sites:,} sites x {n_samples} samples. "
            f"Global mean β={global_mean:.3f}. "
            f"Distribution: {frac_low:.0%} low (<0.2), {frac_mid:.0%} intermediate, {frac_high:.0%} high (>0.8)."
        ),
    }
1270
+
1271
+
1272
+ # ---------------------------------------------------------------------------
1273
+ # 10. omics.proteomics_diff
1274
+ # ---------------------------------------------------------------------------
1275
+
1276
+
1277
@registry.register(
    name="omics.proteomics_diff",
    description="Differential protein abundance analysis between two groups",
    category="omics",
    parameters={
        "path": "Path to protein abundance matrix (rows=proteins, cols=samples)",
        "group1": "Comma-separated sample names for group 1",
        "group2": "Comma-separated sample names for group 2",
        "auto_grouping": "If true, splits samples by column order for exploratory use (default false)",
        "fc_cutoff": "Minimum absolute log2 fold-change (default 1.0)",
        "fdr_cutoff": "FDR significance threshold (default 0.05)",
    },
    usage_guide=(
        "Differential protein abundance from proteomics data (e.g., TMT, LFQ). "
        "Input is a protein x sample matrix of log2 abundances or intensities. "
        "Provide explicit group1/group2 sample lists for production analyses."
    ),
)
def proteomics_diff(
    path: str,
    group1: str = "",
    group2: str = "",
    auto_grouping: bool = False,
    fc_cutoff: float = 1.0,
    fdr_cutoff: float = 0.05,
    **kwargs,
) -> dict:
    """Differential protein abundance analysis.

    Compares group2 vs group1 per protein with a two-sided Mann-Whitney U
    test and Benjamini-Hochberg FDR (_fdr_correct). Fold change is the
    difference of group means, which is a log2 fold-change only when the
    input matrix is already log2-transformed (see comment below).
    Errors are returned as "error"/"summary" dicts, not raised.
    """
    import numpy as np
    from scipy import stats

    df, error = _load_tabular(path)
    if error:
        return {"error": error, "summary": f"Could not load proteomics data: {error}"}

    # Resolve/validate sample groups (explicit lists, or opt-in
    # half/half column split when auto_grouping=True).
    g1_samples, g2_samples, group_error = _parse_sample_groups(
        df,
        group1=group1,
        group2=group2,
        auto_grouping=auto_grouping,
        min_group_size=2,
    )
    if group_error:
        return group_error

    g1 = df[g1_samples]
    g2 = df[g2_samples]

    mean1 = g1.mean(axis=1)
    mean2 = g2.mean(axis=1)
    log2fc = mean2 - mean1  # already log2 if input is log2

    # Per-protein nonparametric test; proteins with <2 non-NaN values in
    # either group get p=1.0 (never significant).
    # NOTE(review): g1.loc[prot] assumes a unique protein index —
    # duplicate IDs would return a DataFrame here; verify upstream data.
    pvals = []
    for prot in df.index:
        v1 = g1.loc[prot].dropna().values
        v2 = g2.loc[prot].dropna().values
        if len(v1) >= 2 and len(v2) >= 2:
            _, p = stats.mannwhitneyu(v1, v2, alternative="two-sided")
            pvals.append(p)
        else:
            pvals.append(1.0)

    pvals = np.array(pvals)
    qvals = _fdr_correct(pvals)

    # Significance requires both the FDR and fold-change cutoffs.
    sig_mask = (qvals < fdr_cutoff) & (np.abs(log2fc.values) >= fc_cutoff)
    n_sig = int(sig_mask.sum())
    n_up = int(((qvals < fdr_cutoff) & (log2fc.values >= fc_cutoff)).sum())
    n_down = int(((qvals < fdr_cutoff) & (log2fc.values <= -fc_cutoff)).sum())

    import pandas as pd

    # Per-protein results table, sorted by FDR for the top-hits preview.
    results_df = pd.DataFrame({
        "mean_group1": mean1,
        "mean_group2": mean2,
        "log2fc": log2fc,
        "pvalue": pvals,
        "fdr": qvals,
    }, index=df.index)
    results_df = results_df.sort_values("fdr")

    return {
        "n_proteins_tested": len(df.index),
        "n_significant": n_sig,
        "n_upregulated": n_up,
        "n_downregulated": n_down,
        "group1_samples": g1_samples,
        "group2_samples": g2_samples,
        "auto_grouping_used": bool(auto_grouping and not group1 and not group2),
        "top_hits": results_df.head(20).to_dict("index"),
        "summary": (
            f"Tested {len(df.index):,} proteins: {n_sig} significant "
            f"(FDR<{fdr_cutoff}, |log2FC|≥{fc_cutoff}). {n_up} up, {n_down} down."
        ),
    }
1372
+
1373
+
1374
+ # ---------------------------------------------------------------------------
1375
+ # 11. omics.proteomics_enrich
1376
+ # ---------------------------------------------------------------------------
1377
+
1378
+
1379
+ def _parse_gene_list_file(path: str) -> tuple[set[str], str | None]:
1380
+ """Load a gene list from text/CSV/TSV file and return an uppercase gene set."""
1381
+ import pandas as pd
1382
+
1383
+ fp = Path(path).expanduser()
1384
+ if not fp.exists():
1385
+ return set(), f"Background file not found: {path}"
1386
+
1387
+ suffix = fp.suffix.lower()
1388
+ try:
1389
+ if suffix == ".txt":
1390
+ genes = {
1391
+ line.strip().split("\t")[0].split(",")[0].strip().upper()
1392
+ for line in fp.read_text(errors="replace").splitlines()
1393
+ if line.strip()
1394
+ }
1395
+ return {g for g in genes if g}, None
1396
+
1397
+ if suffix in {".csv", ".tsv"}:
1398
+ sep = "," if suffix == ".csv" else "\t"
1399
+ df = pd.read_csv(fp, sep=sep)
1400
+ if df.empty:
1401
+ return set(), f"Background file is empty: {path}"
1402
+ first_col = df.columns[0]
1403
+ genes = {
1404
+ str(v).strip().upper()
1405
+ for v in df[first_col].dropna().tolist()
1406
+ if str(v).strip()
1407
+ }
1408
+ return genes, None
1409
+
1410
+ # Fallback: treat as newline-delimited text.
1411
+ genes = {
1412
+ line.strip().split("\t")[0].split(",")[0].strip().upper()
1413
+ for line in fp.read_text(errors="replace").splitlines()
1414
+ if line.strip()
1415
+ }
1416
+ return {g for g in genes if g}, None
1417
+ except Exception as exc:
1418
+ return set(), f"Failed to parse background file: {str(exc)[:200]}"
1419
+
1420
+
1421
+ def _enrichr_libraries_for_organism(organism: str) -> tuple[list[str] | None, str | None]:
1422
+ """Map organism names to Enrichr libraries."""
1423
+ org = (organism or "Homo sapiens").strip().lower()
1424
+ human_aliases = {"human", "homo sapiens", "hs", "h. sapiens"}
1425
+ mouse_aliases = {"mouse", "mus musculus", "mm", "m. musculus"}
1426
+
1427
+ if org in human_aliases:
1428
+ return ["KEGG_2021_Human", "Reactome_2022", "GO_Biological_Process_2023"], None
1429
+ if org in mouse_aliases:
1430
+ return ["KEGG_2021_Mouse", "WikiPathway_2021_Mouse", "GO_Biological_Process_2023"], None
1431
+ return None, (
1432
+ f"Unsupported organism '{organism}'. "
1433
+ "Supported: Homo sapiens, Mus musculus."
1434
+ )
1435
+
1436
+
1437
@registry.register(
    name="omics.proteomics_enrich",
    description="Pathway enrichment analysis from a list of differentially abundant proteins",
    category="omics",
    parameters={
        "proteins": "Comma-separated list of protein/gene symbols",
        "background_path": "Path to full protein list (optional, for background set)",
        "organism": "Organism for gene set lookup (default 'Homo sapiens')",
    },
    usage_guide=(
        "Run over-representation analysis on a set of differentially expressed proteins. "
        "Uses Enrichr API for pathway databases (KEGG, Reactome, GO)."
    ),
)
def proteomics_enrich(
    proteins: str = "",
    background_path: str = "",
    organism: str = "Homo sapiens",
    **kwargs,
) -> dict:
    """Pathway enrichment for a protein list via Enrichr.

    Deduplicates the input symbols (case-insensitively, preserving the
    first-seen spelling), optionally filters them against a background
    gene list, submits the list to Enrichr's addList endpoint, then
    queries organism-specific libraries. Network and parsing failures
    are returned as "error"/"summary" dicts, never raised.
    """
    # Case-insensitive dedupe preserving the caller's original casing/order.
    seen = set()
    gene_list = []
    for gene in (g.strip() for g in proteins.split(",") if g.strip()):
        key = gene.upper()
        if key not in seen:
            seen.add(key)
            gene_list.append(gene)
    if not gene_list:
        return {"error": "No proteins provided", "summary": "Empty protein list"}

    libraries, org_error = _enrichr_libraries_for_organism(organism)
    if org_error:
        return {"error": org_error, "summary": org_error}

    background_info = {}
    if background_path:
        background_genes, bg_error = _parse_gene_list_file(background_path)
        if bg_error:
            return {"error": bg_error, "summary": bg_error}
        if not background_genes:
            return {
                "error": "Background file contains no genes after parsing",
                "summary": f"Empty background set: {background_path}",
            }

        # Keep only input genes present in the background set.
        original_n = len(gene_list)
        gene_list = [g for g in gene_list if g.upper() in background_genes]
        if not gene_list:
            return {
                "error": "None of the input genes were found in the provided background set",
                "background_gene_count": len(background_genes),
                "summary": "No overlap between input list and background set",
            }

        background_info = {
            "background_path": str(Path(background_path).expanduser()),
            "background_gene_count": len(background_genes),
            "n_proteins_before_background_filter": original_n,
            "n_proteins_after_background_filter": len(gene_list),
            # Enrichr endpoint has no custom-universe parameter; we apply background as input filter.
            "background_mode": "input_filter_only",
        }

    # Submit to Enrichr (multipart form via httpx `files=`).
    add_url = "https://maayanlab.cloud/Enrichr/addList"
    payload = {"list": (None, "\n".join(gene_list)), "description": (None, "ct proteomics enrichment")}

    import httpx

    try:
        resp = httpx.post(add_url, files=payload, timeout=15)
        resp.raise_for_status()
        user_list_id = resp.json().get("userListId")
    except Exception as exc:
        return {"error": f"Enrichr submission failed: {str(exc)[:200]}", "summary": f"Enrichr error: {str(exc)[:100]}"}

    if not user_list_id:
        return {"error": "Enrichr did not return a list ID", "summary": "Enrichr submission failed"}

    # Query key libraries; per-library failures are collected, not fatal.
    all_results = {}
    library_errors = {}

    for lib in libraries:
        enrich_url = f"https://maayanlab.cloud/Enrichr/enrich?userListId={user_list_id}&backgroundType={lib}"
        try:
            resp = httpx.get(enrich_url, timeout=15)
            resp.raise_for_status()
            data = resp.json()
            terms = data.get(lib, [])
            top_terms = []
            # Enrichr returns each term as a positional list.
            # NOTE(review): Enrichr's documented tuple order is
            # [rank, term, p-value, z-score, combined score, genes,
            # adjusted p-value, ...]; index 3 is the z-score, so the
            # "odds_ratio" key below may be mislabeled — verify against
            # the Enrichr API docs before relying on it.
            for term in terms[:10]:
                top_terms.append({
                    "term": term[1],
                    "pvalue": term[2],
                    "adj_pvalue": term[6],
                    "odds_ratio": term[3],
                    "genes": term[5],
                })
            all_results[lib] = top_terms
        except Exception as exc:
            all_results[lib] = []
            library_errors[lib] = str(exc)[:200]

    # Flatten top hits: up to 3 significant terms per library for the summary.
    top_summary = []
    for lib, terms in all_results.items():
        for t in terms[:3]:
            if t["adj_pvalue"] < 0.05:
                top_summary.append(f"{t['term']} (q={t['adj_pvalue']:.2e})")

    return {
        "n_proteins_submitted": len(gene_list),
        "organism": organism,
        "libraries": libraries,
        "enrichment_results": all_results,
        "library_errors": library_errors,
        **background_info,
        "summary": (
            f"Enrichment of {len(gene_list)} proteins. "
            + (f"Top enriched: {'; '.join(top_summary[:5])}" if top_summary else "No significant enrichments (FDR<0.05).")
            + (" Background set applied as input filter." if background_path else "")
        ),
    }
1562
+
1563
+
1564
+ # ---------------------------------------------------------------------------
1565
+ # 12. omics.atac_peak_annotate
1566
+ # ---------------------------------------------------------------------------
1567
+
1568
+
1569
@registry.register(
    name="omics.atac_peak_annotate",
    description="Annotate ATAC-seq peaks by genomic features and summarize accessibility landscape",
    category="omics",
    parameters={
        "path": "Path to peak file (BED-like CSV/TSV with chr, start, end columns or peak count matrix)",
    },
    usage_guide=(
        "Summarize ATAC-seq peak data: genomic distribution, peak sizes, "
        "chromosome distribution. Works on BED-like files or peak count matrices. "
        "Use after omics.geo_fetch to download ATAC-seq data."
    ),
)
def atac_peak_annotate(path: str, **kwargs) -> dict:
    """Annotate and summarize ATAC-seq peaks.

    Reads the file as CSV (for .csv) or tab-delimited (everything else),
    then branches: if chr/start/end columns are found (by name or by a
    first column that mostly starts with "chr"), it summarizes peak
    widths and chromosome counts; otherwise it reports the matrix shape
    and defers to omics.chromatin_accessibility.
    """
    # NOTE(review): np appears unused in this function body — likely
    # imported for parity with sibling tools.
    import numpy as np
    import pandas as pd

    filepath = Path(path).expanduser()
    if not filepath.exists():
        return {"error": f"File not found: {path}", "summary": f"File not found: {path}"}

    suffix = filepath.suffix.lower()
    sep = "," if suffix == ".csv" else "\t"

    # NOTE(review): read_csv defaults to header=0, so a headerless BED
    # file loses its first peak row to the header — confirm inputs carry
    # a header line.
    try:
        df = pd.read_csv(filepath, sep=sep, comment="#")
    except Exception as exc:
        return {"error": f"Failed to read: {str(exc)[:200]}", "summary": f"Parse error: {str(exc)[:100]}"}

    # Detect BED-like format: look for chr/start/end columns
    col_lower = {c.lower(): c for c in df.columns}
    chr_col = col_lower.get("chr") or col_lower.get("chrom") or col_lower.get("chromosome")
    start_col = col_lower.get("start") or col_lower.get("chromstart")
    end_col = col_lower.get("end") or col_lower.get("chromend")

    # Also try positional (first 3 columns as chr, start, end) when a
    # majority of first-column values start with "chr".
    if not chr_col and len(df.columns) >= 3:
        first_col_vals = df.iloc[:, 0].astype(str)
        if first_col_vals.str.startswith("chr").mean() > 0.5:
            chr_col = df.columns[0]
            start_col = df.columns[1]
            end_col = df.columns[2]

    if chr_col and start_col and end_col:
        # BED-like format: coerce coordinates to numeric, drop rows that
        # fail to parse, and derive per-peak widths.
        peaks = df[[chr_col, start_col, end_col]].copy()
        peaks.columns = ["chr", "start", "end"]
        peaks["start"] = pd.to_numeric(peaks["start"], errors="coerce")
        peaks["end"] = pd.to_numeric(peaks["end"], errors="coerce")
        peaks = peaks.dropna()
        peaks["width"] = peaks["end"] - peaks["start"]

        n_peaks = len(peaks)
        chr_counts = peaks["chr"].value_counts().head(24).to_dict()
        width_stats = {
            "mean": round(float(peaks["width"].mean()), 0),
            "median": round(float(peaks["width"].median()), 0),
            "min": int(peaks["width"].min()),
            "max": int(peaks["width"].max()),
        }

        # Estimate genomic feature distribution by peak width heuristic
        # (width bins only — no genome annotation is consulted).
        promoter_like = int((peaks["width"] < 500).sum())
        enhancer_like = int(((peaks["width"] >= 500) & (peaks["width"] < 2000)).sum())
        broad_peaks = int((peaks["width"] >= 2000).sum())

        return {
            "n_peaks": n_peaks,
            "chromosome_distribution": chr_counts,
            "peak_width_stats": width_stats,
            "promoter_like_peaks": promoter_like,
            "enhancer_like_peaks": enhancer_like,
            "broad_peaks": broad_peaks,
            "summary": (
                f"ATAC-seq: {n_peaks:,} peaks. Median width: {width_stats['median']:.0f} bp. "
                f"Estimated: {promoter_like:,} promoter-like (<500bp), "
                f"{enhancer_like:,} enhancer-like (500-2000bp), {broad_peaks:,} broad (>2000bp). "
                f"Top chromosomes: {', '.join(f'{k}:{v}' for k, v in list(chr_counts.items())[:5])}"
            ),
        }
    else:
        # Peak count matrix (peaks x samples)
        n_peaks, n_samples = df.shape
        return {
            "n_peaks": n_peaks,
            "n_samples": n_samples,
            "columns": list(df.columns[:20]),
            "summary": (
                f"ATAC-seq count matrix: {n_peaks:,} peaks x {n_samples} samples. "
                f"Use omics.chromatin_accessibility for differential analysis."
            ),
        }
1662
+
1663
+
1664
+ # ---------------------------------------------------------------------------
1665
+ # 13. omics.chromatin_accessibility
1666
+ # ---------------------------------------------------------------------------
1667
+
1668
+
1669
@registry.register(
    name="omics.chromatin_accessibility",
    description="Differential chromatin accessibility analysis between two sample groups",
    category="omics",
    parameters={
        "path": "Path to peak count matrix (rows=peaks/genes, cols=samples)",
        "group1": "Comma-separated sample names for group 1",
        "group2": "Comma-separated sample names for group 2",
        "auto_grouping": "If true, splits samples by column order for exploratory use (default false)",
        "fdr_cutoff": "FDR threshold (default 0.05)",
    },
    usage_guide=(
        "Compare chromatin accessibility between groups from ATAC-seq count matrices. "
        "Works on peak-level or gene-level accessibility scores. "
        "Provide explicit group1/group2 sample lists for robust comparisons."
    ),
)
def chromatin_accessibility(
    path: str,
    group1: str = "",
    group2: str = "",
    auto_grouping: bool = False,
    fdr_cutoff: float = 0.05,
    **kwargs,
) -> dict:
    """Differential chromatin accessibility analysis.

    Compares per-region accessibility between two sample groups with a
    Mann-Whitney U test per region, log2 fold-changes with a +1 pseudocount,
    and FDR correction via the module-level ``_fdr_correct`` helper.

    Returns a dict with counts of significant regions, the top 20 hits
    (sorted by FDR), the resolved sample groups, and a human-readable summary.
    """
    import numpy as np
    import pandas as pd
    from scipy import stats

    df, error = _load_tabular(path)
    if error:
        return {"error": error, "summary": f"Could not load: {error}"}

    g1_samples, g2_samples, group_error = _parse_sample_groups(
        df,
        group1=group1,
        group2=group2,
        auto_grouping=auto_grouping,
        min_group_size=2,
    )
    if group_error:
        return group_error

    g1 = df[g1_samples]
    g2 = df[g2_samples]

    mean1 = g1.mean(axis=1)
    mean2 = g2.mean(axis=1)
    # Log2 fold-change with a +1 pseudocount so zero-count regions stay finite.
    log2fc = np.log2((mean2 + 1) / (mean1 + 1))

    # Per-region nonparametric test. Iterate positionally over numpy rows
    # (cheaper than a label-based .loc lookup per region, and well-defined
    # even with duplicate index labels). Guard against degenerate regions:
    # scipy can raise ValueError or yield a non-finite p-value when both
    # groups are constant — report those as non-significant (p = 1.0)
    # instead of aborting the whole analysis.
    g1_rows = g1.to_numpy()
    g2_rows = g2.to_numpy()
    pvals = []
    for row1, row2 in zip(g1_rows, g2_rows):
        v1 = row1[~pd.isna(row1)]
        v2 = row2[~pd.isna(row2)]
        p = 1.0
        if len(v1) >= 2 and len(v2) >= 2:
            try:
                _, p = stats.mannwhitneyu(v1, v2, alternative="two-sided")
            except ValueError:  # e.g. all values identical in both groups
                p = 1.0
            if not np.isfinite(p):
                p = 1.0
        pvals.append(p)

    pvals = np.array(pvals)
    qvals = _fdr_correct(pvals)

    sig_mask = qvals < fdr_cutoff
    n_sig = int(sig_mask.sum())
    n_more_open = int((sig_mask & (log2fc.values > 0)).sum())
    n_more_closed = int((sig_mask & (log2fc.values < 0)).sum())

    results_df = pd.DataFrame({
        "mean_group1": mean1, "mean_group2": mean2,
        "log2fc": log2fc, "pvalue": pvals, "fdr": qvals,
    }, index=df.index).sort_values("fdr")

    return {
        "n_regions_tested": len(df.index),
        "n_significant": n_sig,
        "n_more_accessible": n_more_open,
        "n_less_accessible": n_more_closed,
        "group1_samples": g1_samples,
        "group2_samples": g2_samples,
        "auto_grouping_used": bool(auto_grouping and not group1 and not group2),
        "top_hits": results_df.head(20).to_dict("index"),
        "summary": (
            f"Tested {len(df.index):,} regions: {n_sig} differentially accessible "
            f"(FDR<{fdr_cutoff}). {n_more_open} gained, {n_more_closed} lost accessibility."
        ),
    }
1759
+
1760
+
1761
+ # ---------------------------------------------------------------------------
1762
+ # 14. omics.chipseq_enrich
1763
+ # ---------------------------------------------------------------------------
1764
+
1765
+
1766
@registry.register(
    name="omics.chipseq_enrich",
    description="Enrichment analysis of ChIP-seq target genes",
    category="omics",
    parameters={
        "path": "Path to peak file with gene annotations (CSV/TSV with a gene column)",
        "gene_column": "Column name containing gene symbols (default auto-detect)",
    },
    usage_guide=(
        "Extract target genes from ChIP-seq peak annotations and run pathway "
        "enrichment. Works on peak files that include nearest-gene annotations."
    ),
)
def chipseq_enrich(path: str, gene_column: str = "", **kwargs) -> dict:
    """Enrichment analysis of ChIP-seq target genes.

    Reads an annotated peak table, locates the gene-symbol column (explicit
    ``gene_column`` first, then a list of common names, then a
    case-insensitive fallback), cleans the unique gene list, and delegates
    the actual enrichment to ``proteomics_enrich``.
    """
    import pandas as pd

    filepath = Path(path).expanduser()
    if not filepath.exists():
        return {"error": f"File not found: {path}", "summary": f"File not found: {path}"}

    delimiter = "," if filepath.suffix.lower() == ".csv" else "\t"
    try:
        df = pd.read_csv(filepath, sep=delimiter, comment="#")
    except Exception as exc:
        return {"error": f"Failed to read: {str(exc)[:200]}", "summary": f"Parse error: {str(exc)[:100]}"}

    # Resolve the gene column: user-specified, then common names, then
    # a case-insensitive sweep over a smaller candidate set.
    gcol = gene_column if gene_column and gene_column in df.columns else None
    if gcol is None:
        for candidate in ("gene", "gene_name", "symbol", "gene_symbol", "nearest_gene",
                          "GENE", "Gene", "SYMBOL", "geneName"):
            if candidate in df.columns:
                gcol = candidate
                break
    if gcol is None:
        lowered = {c.lower(): c for c in df.columns}
        for candidate in ("gene", "gene_name", "symbol"):
            if candidate in lowered:
                gcol = lowered[candidate]
                break

    if gcol is None:
        return {
            "error": "No gene column found. Provide gene_column parameter.",
            "available_columns": list(df.columns[:20]),
            "summary": f"Could not auto-detect gene column. Columns: {', '.join(df.columns[:10])}",
        }

    # Deduplicate, stringify, and drop blanks / literal "nan" entries.
    cleaned = []
    for raw in df[gcol].dropna().unique().tolist():
        name = str(raw).strip()
        if name and name.upper() != "NAN":
            cleaned.append(name)

    if not cleaned:
        return {"error": "No genes found in column", "summary": "Empty gene list after filtering"}

    # Delegate to Enrichr
    return proteomics_enrich(proteins=",".join(cleaned), **kwargs)
1828
+
1829
+
1830
+ # ---------------------------------------------------------------------------
1831
+ # 15. omics.spatial_cluster
1832
+ # ---------------------------------------------------------------------------
1833
+
1834
+
1835
@registry.register(
    name="omics.spatial_cluster",
    description="Spatial-aware clustering of spatial transcriptomics data",
    category="omics",
    parameters={
        "path": "Path to h5ad file with spatial coordinates in .obsm['spatial']",
        "resolution": "Leiden clustering resolution (default 1.0)",
        "n_neighbors": "Number of spatial neighbors (default 15)",
    },
    usage_guide=(
        "Cluster spatial transcriptomics data (Visium, MERFISH, etc.) using "
        "both expression similarity and spatial proximity. Requires scanpy; "
        "squidpy is optional for enhanced spatial analysis."
    ),
)
def spatial_cluster(
    path: str,
    resolution: float = 1.0,
    n_neighbors: int = 15,
    **kwargs,
) -> dict:
    """Spatial-aware clustering of spatial transcriptomics data.

    Runs a standard scanpy pipeline (normalize → HVG → PCA → neighbors →
    Leiden). When both spatial coordinates and squidpy are available, the
    expression and spatial connectivity graphs are blended 50/50 before
    Leiden clustering. Marker genes per cluster are added best-effort.
    """
    filepath = Path(path).expanduser()
    if not filepath.exists():
        return {"error": f"File not found: {path}", "summary": f"File not found: {path}"}

    sc = _check_scanpy()
    if sc is None:
        return {
            "error": "scanpy required. Install with: pip install scanpy",
            "summary": "Install scanpy for spatial clustering: pip install scanpy",
        }

    try:
        adata = sc.read_h5ad(filepath)
    except Exception as exc:
        return {"error": f"Failed to load h5ad: {str(exc)[:200]}", "summary": f"Could not read file: {str(exc)[:100]}"}

    has_spatial = "spatial" in (adata.obsm or {})

    # Heuristic: values above 50 look like raw counts — normalize + log1p.
    if adata.X.max() > 50:
        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)

    sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
    sc.tl.pca(adata, n_comps=min(50, adata.n_vars - 1, adata.n_obs - 1))
    sc.pp.neighbors(adata, n_neighbors=n_neighbors)

    # squidpy is optional — fall back to expression-only neighbors without it.
    try:
        import squidpy as sq
    except ImportError:
        sq = None

    if has_spatial and sq:
        try:
            sq.gr.spatial_neighbors(adata, n_neighs=n_neighbors)
            from scipy.sparse import csr_matrix

            expr_graph = adata.obsp.get("connectivities")
            spatial_graph = adata.obsp.get("spatial_connectivities")
            if expr_graph is not None and spatial_graph is not None:
                # Equal-weight blend of expression and spatial connectivity.
                adata.obsp["connectivities"] = csr_matrix(0.5 * expr_graph + 0.5 * spatial_graph)
        except Exception:
            pass  # Fall back to expression-only neighbors

    sc.tl.leiden(adata, resolution=resolution, key_added="spatial_cluster")

    cluster_sizes = adata.obs["spatial_cluster"].value_counts().to_dict()
    n_found = len(cluster_sizes)

    result = {
        "n_cells": adata.n_obs,
        "n_genes": adata.n_vars,
        "n_clusters": n_found,
        "cluster_sizes": cluster_sizes,
        "has_spatial_coords": has_spatial,
        "used_squidpy": sq is not None and has_spatial,
        "resolution": resolution,
        "summary": (
            f"Spatial clustering: {adata.n_obs:,} cells → {n_found} clusters "
            f"(resolution={resolution}). "
            f"{'Used spatial+expression neighbors (squidpy).' if sq and has_spatial else 'Expression-based neighbors only.'} "
            f"Largest cluster: {max(cluster_sizes.values()):,} cells."
        ),
    }

    # Best-effort: attach top-5 marker genes per cluster.
    try:
        sc.tl.rank_genes_groups(adata, "spatial_cluster", method="wilcoxon", n_genes=5)
        result["cluster_markers"] = {
            str(cl): list(adata.uns["rank_genes_groups"]["names"][cl][:5])
            for cl in adata.obs["spatial_cluster"].unique()
        }
    except Exception:
        pass

    return result
1938
+
1939
+
1940
+ # ---------------------------------------------------------------------------
1941
+ # 16. omics.spatial_autocorrelation
1942
+ # ---------------------------------------------------------------------------
1943
+
1944
+
1945
@registry.register(
    name="omics.spatial_autocorrelation",
    description="Compute spatial autocorrelation (Moran's I) for gene expression patterns",
    category="omics",
    parameters={
        "path": "Path to h5ad file with spatial coordinates",
        "genes": "Comma-separated gene names to test (default: top variable genes)",
        "n_genes": "Number of top variable genes to test if genes not specified (default 50)",
    },
    usage_guide=(
        "Test whether gene expression shows spatial patterning using Moran's I. "
        "High Moran's I = spatially clustered expression. Requires scanpy."
    ),
)
def spatial_autocorrelation(
    path: str,
    genes: str = "",
    n_genes: int = 50,
    **kwargs,
) -> dict:
    """Compute Moran's I spatial autocorrelation for genes.

    Requires scanpy (preprocessing) and squidpy (spatial graph + Moran's I).
    When no gene list is given, tests the top ``n_genes`` highly variable
    genes. Returns counts of spatially patterned genes and the top 20 hits.
    """
    filepath = Path(path).expanduser()
    if not filepath.exists():
        return {"error": f"File not found: {path}", "summary": f"File not found: {path}"}

    sc = _check_scanpy()
    if sc is None:
        return {"error": "scanpy required", "summary": "Install scanpy: pip install scanpy"}

    try:
        adata = sc.read_h5ad(filepath)
    except Exception as exc:
        return {"error": f"Failed to load: {str(exc)[:200]}", "summary": f"Read error: {str(exc)[:100]}"}

    if "spatial" not in (adata.obsm or {}):
        return {"error": "No spatial coordinates found in .obsm['spatial']", "summary": "Not spatial data: no coordinates found"}

    # squidpy is a hard requirement here (spatial graph + Moran's I).
    try:
        import squidpy as sq
    except ImportError:
        return {"error": "squidpy required for Moran's I. Install: pip install squidpy", "summary": "Install squidpy: pip install squidpy"}

    # Heuristic: values above 50 look like raw counts — normalize + log1p.
    if adata.X.max() > 50:
        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)

    sq.gr.spatial_neighbors(adata)

    # Gene selection: explicit list if given, otherwise top variable genes.
    requested = [g.strip() for g in genes.split(",") if g.strip()] if genes else []
    if not requested:
        sc.pp.highly_variable_genes(adata, n_top_genes=min(n_genes, adata.n_vars))
        requested = list(adata.var_names[adata.var["highly_variable"]])[:n_genes]

    valid_genes = [g for g in requested if g in adata.var_names]
    if not valid_genes:
        return {"error": "None of the specified genes found in dataset", "summary": "No matching genes in data"}

    sq.gr.spatial_autocorr(adata, mode="moran", genes=valid_genes)

    moran_table = adata.uns.get("moranI")
    if moran_table is None:
        return {"error": "Moran's I computation failed", "summary": "Spatial autocorrelation computation failed"}

    ranked = moran_table.sort_values("I", ascending=False)
    n_spatial = int((ranked["pval_norm"] < 0.05).sum())

    return {
        "n_genes_tested": len(valid_genes),
        "n_spatially_patterned": n_spatial,
        "top_spatial_genes": ranked.head(20).to_dict("index"),
        "summary": (
            f"Moran's I on {len(valid_genes)} genes: {n_spatial} show significant spatial "
            f"patterning (p<0.05). Top: "
            + ", ".join(f"{g} (I={row['I']:.3f})" for g, row in ranked.head(5).iterrows())
        ),
    }
2031
+
2032
+
2033
+ # ---------------------------------------------------------------------------
2034
+ # 17. omics.cytof_cluster
2035
+ # ---------------------------------------------------------------------------
2036
+
2037
+
2038
@registry.register(
    name="omics.cytof_cluster",
    description="Cluster CyTOF or flow cytometry data and characterize marker expression per cluster",
    category="omics",
    parameters={
        "path": "Path to CyTOF/flow data (CSV with markers as columns, cells as rows)",
        "n_clusters": "Number of clusters for KMeans (default 10). Use 0 for auto (Leiden).",
        "markers": "Comma-separated marker columns to use (default: all numeric columns)",
    },
    usage_guide=(
        "Cluster mass/flow cytometry data. Input is a cells x markers matrix. "
        "Identifies cell populations and characterizes each by marker expression."
    ),
)
def cytof_cluster(
    path: str,
    n_clusters: int = 10,
    markers: str = "",
    **kwargs,
) -> dict:
    """Cluster CyTOF/flow cytometry data.

    Standardizes the cells x markers matrix and clusters with MiniBatchKMeans
    (``n_clusters > 0``) or Leiden via scanpy (``n_clusters == 0``, falling
    back to KMeans-10 when scanpy is missing). Reports cluster sizes, median
    marker profiles, and the top defining markers per cluster (z-score of the
    cluster median against the global mean).
    """
    import numpy as np
    import pandas as pd

    df, error = _load_tabular(path)
    if error:
        # Try without index_col since CyTOF data often has no row names
        filepath = Path(path).expanduser()
        if not filepath.exists():
            return {"error": error, "summary": f"Could not load: {error}"}
        suffix = filepath.suffix.lower()
        sep = "," if suffix == ".csv" else "\t"
        try:
            df = pd.read_csv(filepath, sep=sep)
        except Exception as exc2:
            return {"error": str(exc2), "summary": f"Could not load: {str(exc2)[:100]}"}

    marker_cols = [m.strip() for m in markers.split(",") if m.strip()] if markers else []
    if marker_cols:
        # Fail with a structured error (instead of an uncaught KeyError)
        # when requested marker columns are absent from the file.
        missing = [m for m in marker_cols if m not in df.columns]
        if missing:
            return {
                "error": f"Marker column(s) not found: {', '.join(missing)}",
                "available_columns": list(df.columns[:30]),
                "summary": f"Missing marker columns: {', '.join(missing)}",
            }
    else:
        marker_cols = list(df.select_dtypes(include=[np.number]).columns)

    if not marker_cols:
        return {"error": "No numeric marker columns found", "summary": "No numeric columns in data"}

    data = df[marker_cols].dropna()
    n_cells = len(data)
    if n_cells < 10:
        return {"error": f"Too few cells ({n_cells}) for clustering", "summary": f"Only {n_cells} cells — need at least 10"}

    # Standardize markers so clustering is not dominated by high-range channels.
    from sklearn.preprocessing import StandardScaler

    scaled = StandardScaler().fit_transform(data.values)

    if n_clusters > 0:
        from sklearn.cluster import MiniBatchKMeans

        model = MiniBatchKMeans(n_clusters=min(n_clusters, n_cells), random_state=42, n_init=3)
        labels = model.fit_predict(scaled)
    else:
        # Auto mode: Leiden on a kNN graph via scanpy; fall back to KMeans-10.
        sc = _check_scanpy()
        if sc is not None:
            import anndata

            adata = anndata.AnnData(X=scaled)
            sc.pp.neighbors(adata, n_neighbors=15)
            sc.tl.leiden(adata, resolution=1.0)
            labels = adata.obs["leiden"].astype(int).values
        else:
            from sklearn.cluster import MiniBatchKMeans

            model = MiniBatchKMeans(n_clusters=10, random_state=42, n_init=3)
            labels = model.fit_predict(scaled)

    data = data.copy()
    data["cluster"] = labels

    cluster_sizes = data["cluster"].value_counts().sort_index().to_dict()
    n_clusters_found = len(cluster_sizes)

    # Per-cluster marker expression (median)
    cluster_medians = data.groupby("cluster")[marker_cols].median()
    cluster_profiles = cluster_medians.to_dict("index")

    # Defining markers per cluster: highest z-score of the cluster median
    # relative to the global mean/std (std of 0 replaced by 1 to avoid /0).
    defining_markers = {}
    global_means = data[marker_cols].mean()
    global_stds = data[marker_cols].std().replace(0, 1)
    for cl in sorted(cluster_sizes.keys()):
        cl_means = cluster_medians.loc[cl]
        z_scores = ((cl_means - global_means) / global_stds).sort_values(ascending=False)
        defining_markers[str(cl)] = list(z_scores.head(5).index)

    return {
        "n_cells": n_cells,
        "n_markers": len(marker_cols),
        "n_clusters": n_clusters_found,
        "cluster_sizes": {str(k): v for k, v in cluster_sizes.items()},
        "defining_markers": defining_markers,
        "cluster_profiles": {str(k): {mk: round(v, 3) for mk, v in prof.items()} for k, prof in cluster_profiles.items()},
        "summary": (
            f"CyTOF clustering: {n_cells:,} cells x {len(marker_cols)} markers → "
            f"{n_clusters_found} clusters. Largest: {max(cluster_sizes.values()):,} cells. "
            f"Top defining markers per cluster identified."
        ),
    }
2150
+
2151
+
2152
+ # ---------------------------------------------------------------------------
2153
+ # 18. omics.hic_compartments
2154
+ # ---------------------------------------------------------------------------
2155
+
2156
+
2157
@registry.register(
    name="omics.hic_compartments",
    description="Identify A/B compartments from Hi-C contact matrices",
    category="omics",
    parameters={
        "path": "Path to Hi-C contact matrix (CSV/TSV, symmetric matrix with genomic bins)",
        "resolution": "Bin resolution description (for reporting, e.g. '50kb')",
    },
    usage_guide=(
        "Identify chromatin A/B compartments from Hi-C contact frequency matrices. "
        "A compartments are gene-rich/active, B compartments are gene-poor/repressed. "
        "Input should be a symmetric bin x bin contact matrix."
    ),
)
def hic_compartments(path: str, resolution: str = "unknown", **kwargs) -> dict:
    """Identify A/B compartments from Hi-C contact matrix via PCA.

    Pipeline: observed/expected normalization from marginal coverage →
    bin-bin correlation matrix → leading eigenvector (PC1), whose sign
    assigns each bin to the A (positive) or B (negative) compartment.
    """
    import numpy as np

    df, error = _load_tabular(path)
    if error:
        return {"error": error, "summary": f"Could not load: {error}"}

    n_bins = df.shape[0]
    if df.shape[0] != df.shape[1]:
        return {
            "error": f"Expected symmetric matrix, got {df.shape[0]}x{df.shape[1]}",
            "summary": "Hi-C contact matrix must be square (symmetric)",
        }
    if n_bins < 3:
        return {"error": f"Too few bins ({n_bins})", "summary": "Need at least 3 bins for compartment analysis"}

    contacts = np.nan_to_num(df.values.astype(float), nan=0.0)

    # Observed/expected normalization using marginal coverage.
    coverage = contacts.sum(axis=1)
    coverage[coverage == 0] = 1  # avoid division by zero
    expected = np.outer(coverage, coverage) / coverage.sum()
    expected[expected == 0] = 1
    oe = contacts / expected

    with np.errstate(divide="ignore", invalid="ignore"):
        corr = np.nan_to_num(np.corrcoef(oe), nan=0.0)

    # Eigenvector with the largest eigenvalue separates A/B compartments.
    eigenvalues, eigenvectors = np.linalg.eigh(corr)
    pc1 = eigenvectors[:, -1]

    # Convention: positive PC1 = A compartment (gene-rich/active).
    compartments = np.where(pc1 > 0, "A", "B")
    n_A = int((compartments == "A").sum())
    n_B = int((compartments == "B").sum())
    frac_A = n_A / n_bins

    # Contiguous A<->B switches along the bin order (vectorized).
    transitions = int((compartments[1:] != compartments[:-1]).sum())

    return {
        "n_bins": n_bins,
        "resolution": resolution,
        "n_compartment_A": n_A,
        "n_compartment_B": n_B,
        "fraction_A": round(frac_A, 3),
        "n_transitions": transitions,
        "pc1_values": pc1.tolist()[:50],
        "compartment_assignments": compartments.tolist()[:50],
        "explained_variance": round(float(eigenvalues[-1] / eigenvalues.sum()), 4),
        "summary": (
            f"Hi-C compartments ({resolution} resolution): {n_bins} bins → "
            f"{n_A} A-compartment ({frac_A:.0%}), {n_B} B-compartment ({1-frac_A:.0%}). "
            f"{transitions} A/B transitions. PC1 explains {eigenvalues[-1]/eigenvalues.sum():.1%} of variance."
        ),
    }
2238
+
2239
+
2240
+ # ===========================================================================
2241
+ # Specialized library integrations (optional deps)
2242
+ # ===========================================================================
2243
+
2244
+
2245
def _check_pydeseq2():
    """Return True when pyDESeq2 can be imported, else False (logged at debug)."""
    try:
        from pydeseq2.dds import DeseqDataSet  # noqa: F401
        from pydeseq2.ds import DeseqStats  # noqa: F401
    except Exception as exc:
        logger.debug("pyDESeq2 unavailable or failed to import: %s", exc)
        return False
    return True
2255
+
2256
+
2257
def _check_muon():
    """Return the imported ``muon`` module, or None when muon/mudata are missing."""
    try:
        import muon
        import mudata  # noqa: F401
    except Exception as exc:
        logger.debug("muon unavailable or failed to import: %s", exc)
        return None
    return muon
2267
+
2268
+
2269
def _check_episcanpy():
    """Return the imported ``episcanpy.api`` module, or None when unavailable."""
    try:
        import episcanpy.api as epi
    except Exception as exc:
        logger.debug("episcanpy unavailable or failed to import: %s", exc)
        return None
    return epi
2278
+
2279
+
2280
+ # ---------------------------------------------------------------------------
2281
+ # 19. omics.deseq2
2282
+ # ---------------------------------------------------------------------------
2283
+
2284
+
2285
+ @registry.register(
2286
+ name="omics.deseq2",
2287
+ description="Differential expression with DESeq2 (negative binomial model for count data)",
2288
+ category="omics",
2289
+ parameters={
2290
+ "counts_path": "Path to raw count matrix (genes as rows, samples as columns)",
2291
+ "metadata_path": "Path to sample metadata table (CSV/TSV/TXT/XLSX; must have a condition column; required unless infer_metadata=true)",
2292
+ "condition_col": "Column in metadata for the contrast (default 'condition')",
2293
+ "ref_level": "Reference level for contrast (default: alphabetically first)",
2294
+ "test_level": "Test level for contrast (default: alphabetically second)",
2295
+ "covariates": "Optional comma-separated covariates to include in design (e.g., 'sex,batch')",
2296
+ "infer_metadata": "If true, infer two groups from sample column order for exploratory use only (default false)",
2297
+ "alpha": "Significance threshold for adjusted p-values (default 0.05)",
2298
+ "use_r_deseq2": "If true, prefer R DESeq2 backend via rpy2 when available (default true)",
2299
+ "prefilter_min_count": "Optional prefilter threshold: minimum count per sample (default 0 disables prefilter)",
2300
+ "prefilter_min_samples": "Optional prefilter threshold: minimum number of samples meeting prefilter_min_count (default 1)",
2301
+ "lfc_shrink": "If true, apply apeglm LFC shrinkage when possible (default false)",
2302
+ "enrichment_library": "Optional gseapy/Enrichr library name (e.g., Reactome_2022) for post-DE enrichment",
2303
+ "pathway_term": "Optional pathway term to match and extract odds ratio from enrichment results",
2304
+ "gene_map_path": "Optional gene ID -> symbol mapping table used before enrichment",
2305
+ "gene_id_col": "Optional gene ID column name in mapping table",
2306
+ "gene_symbol_col": "Optional symbol column name in mapping table",
2307
+ "min_abs_lfc": "Optional absolute log2FC threshold for enrichment gene list",
2308
+ "min_base_mean": "Optional baseMean threshold for enrichment gene list",
2309
+ "target_gene": "Optional target gene symbol/ID to report explicitly (returns log2FoldChange/baseMean/padj even if not in top hits)",
2310
+ },
2311
+ usage_guide=(
2312
+ "Proper count-based differential expression using the DESeq2 negative binomial model. "
2313
+ "Preferred over Mann-Whitney for bulk RNA-seq count data. Requires pydeseq2: "
2314
+ "pip install pydeseq2. Falls back to scipy Mann-Whitney if not installed. "
2315
+ "Supports optional covariate-adjusted design, LFC shrinkage, and optional gseapy enrichment "
2316
+ "from DE genes (including pathway-specific odds ratio extraction). "
2317
+ "Use explicit sample metadata in production; inferred metadata is exploratory only."
2318
+ ),
2319
+ )
2320
+ def deseq2(
2321
+ counts_path: str,
2322
+ metadata_path: str = "",
2323
+ condition_col: str = "condition",
2324
+ ref_level: str = "",
2325
+ test_level: str = "",
2326
+ covariates: str = "",
2327
+ infer_metadata: bool = False,
2328
+ alpha: float = 0.05,
2329
+ use_r_deseq2: bool = True,
2330
+ prefilter_min_count: int = 0,
2331
+ prefilter_min_samples: int = 1,
2332
+ lfc_shrink: bool = False,
2333
+ enrichment_library: str = "",
2334
+ pathway_term: str = "",
2335
+ gene_map_path: str = "",
2336
+ gene_id_col: str = "",
2337
+ gene_symbol_col: str = "",
2338
+ min_abs_lfc: float = 0.0,
2339
+ min_base_mean: float = 0.0,
2340
+ target_gene: str = "",
2341
+ **kwargs,
2342
+ ) -> dict:
2343
+ """Run DESeq2 differential expression on count data."""
2344
+ import pandas as pd
2345
+
2346
+ # Load counts
2347
+ df, error = _load_tabular(counts_path)
2348
+ if error:
2349
+ return {"error": error, "summary": f"Could not load counts: {error}"}
2350
+
2351
+ # Load or infer metadata
2352
+ if metadata_path:
2353
+ metadata, meta_error = _load_tabular(metadata_path)
2354
+ if meta_error:
2355
+ return {"error": meta_error, "summary": f"Metadata load failed: {meta_error}"}
2356
+ else:
2357
+ if not infer_metadata:
2358
+ samples = list(df.columns)
2359
+ return {
2360
+ "error": (
2361
+ "metadata_path is required for reliable DESeq2 analysis. "
2362
+ "Set infer_metadata=True only for quick exploratory analysis."
2363
+ ),
2364
+ "available_samples": samples[:30],
2365
+ "summary": f"No metadata provided for {len(samples)} samples; cannot define conditions.",
2366
+ }
2367
+
2368
+ # Exploratory-only mode: split samples into two halves.
2369
+ samples = list(df.columns)
2370
+ mid = len(samples) // 2
2371
+ if mid < 2:
2372
+ return {"error": "Need at least 4 samples (2 per group) without metadata", "summary": "Too few samples"}
2373
+ metadata = pd.DataFrame(
2374
+ {"condition": ["control"] * mid + ["treatment"] * (len(samples) - mid)},
2375
+ index=samples,
2376
+ )
2377
+
2378
+ if condition_col not in metadata.columns:
2379
+ return {
2380
+ "error": f"Column '{condition_col}' not in metadata. Available: {list(metadata.columns)}",
2381
+ "summary": f"Missing condition column: {condition_col}",
2382
+ }
2383
+
2384
+ # Align samples (drop metadata-only and counts-only samples deterministically)
2385
+ common = df.columns.intersection(metadata.index)
2386
+ if len(common) < 4:
2387
+ return {"error": f"Need ≥4 shared samples, found {len(common)}", "summary": "Too few matching samples"}
2388
+ counts = df[common]
2389
+ metadata = metadata.loc[common]
2390
+
2391
+ # Optional pre-filtering to stabilize dispersion fitting and reduce noise.
2392
+ if prefilter_min_count > 0:
2393
+ required = max(int(prefilter_min_samples), 1)
2394
+ keep_mask = (counts >= int(prefilter_min_count)).sum(axis=1) >= required
2395
+ counts = counts.loc[keep_mask]
2396
+ if counts.empty:
2397
+ return {
2398
+ "error": "All genes were removed by prefilter.",
2399
+ "summary": "No genes left after prefilter; relax prefilter thresholds.",
2400
+ }
2401
+
2402
+ levels = sorted(metadata[condition_col].unique())
2403
+ if len(levels) < 2:
2404
+ return {"error": "Need at least 2 condition levels", "summary": "Only one condition level found"}
2405
+ ref = ref_level if ref_level else levels[0]
2406
+ test = test_level if test_level else levels[1]
2407
+
2408
+ def _resolve_level_name(requested: str, available_levels: list[str]) -> str:
2409
+ """Best-effort map of user/planner shorthand labels to metadata factor levels."""
2410
+ import re
2411
+
2412
+ req = str(requested or "").strip()
2413
+ if not req:
2414
+ return req
2415
+ if req in available_levels:
2416
+ return req
2417
+
2418
+ def _norm(s: str) -> str:
2419
+ return re.sub(r"[^a-z0-9]+", "", s.lower())
2420
+
2421
+ req_norm = _norm(req)
2422
+ if not req_norm:
2423
+ return req
2424
+
2425
+ # 1) Exact normalized match.
2426
+ exact = [lvl for lvl in available_levels if _norm(str(lvl)) == req_norm]
2427
+ if len(exact) == 1:
2428
+ return exact[0]
2429
+
2430
+ # 2) Prefix/token containment match (e.g., "CBD" -> "CBD_IC50").
2431
+ token_like = [
2432
+ lvl
2433
+ for lvl in available_levels
2434
+ if _norm(str(lvl)).startswith(req_norm) or req_norm in _norm(str(lvl))
2435
+ ]
2436
+ if len(token_like) == 1:
2437
+ return token_like[0]
2438
+
2439
+ # 3) Prefer non-combined condition when shorthand maps to several levels.
2440
+ if len(token_like) > 1:
2441
+ non_combo = [
2442
+ lvl
2443
+ for lvl in token_like
2444
+ if "serum_starvation" not in str(lvl).lower()
2445
+ and "cisplatin" not in str(lvl).lower()
2446
+ and "comb" not in str(lvl).lower()
2447
+ and "plus" not in str(lvl).lower()
2448
+ ]
2449
+ if len(non_combo) == 1:
2450
+ return non_combo[0]
2451
+ return req
2452
+
2453
+ ref = _resolve_level_name(ref, levels)
2454
+ test = _resolve_level_name(test, levels)
2455
+ if ref not in levels or test not in levels:
2456
+ return {
2457
+ "error": f"Requested contrast levels not found. Levels available: {levels}",
2458
+ "summary": f"Invalid contrast levels: ref={ref}, test={test}",
2459
+ }
2460
+ if ref == test:
2461
+ return {
2462
+ "error": "ref_level and test_level must be different",
2463
+ "summary": "Invalid contrast: identical levels",
2464
+ }
2465
+
2466
+ n_ref = int((metadata[condition_col] == ref).sum())
2467
+ n_test = int((metadata[condition_col] == test).sum())
2468
+ if n_ref < 2 or n_test < 2:
2469
+ return {
2470
+ "error": f"Need at least 2 replicates per condition for {ref} vs {test} (found {n_ref} and {n_test})",
2471
+ "summary": "Insufficient biological replicates per condition",
2472
+ }
2473
+
2474
+ # Build design formula with optional covariates.
2475
+ if isinstance(covariates, (list, tuple)):
2476
+ covars = [str(c).strip() for c in covariates if str(c).strip()]
2477
+ else:
2478
+ raw = str(covariates or "").strip()
2479
+ if raw.startswith("[") and raw.endswith("]"):
2480
+ raw = raw[1:-1]
2481
+ covars = [c.strip().strip("'\"") for c in raw.split(",") if c.strip().strip("'\"")]
2482
+ missing_covars = [c for c in covars if c not in metadata.columns]
2483
+ if missing_covars:
2484
+ return {
2485
+ "error": f"Covariate column(s) not in metadata: {missing_covars}",
2486
+ "summary": f"Missing covariates: {', '.join(missing_covars)}",
2487
+ }
2488
+ design_terms = covars + [condition_col]
2489
+ design_formula = "~ " + " + ".join(design_terms)
2490
+
2491
+ target_gene = str(target_gene or "").strip()
2492
+
2493
+ def _resolve_target_gene(results_df: "pd.DataFrame") -> "dict | None":
2494
+ """Resolve a user-requested target gene against DE results."""
2495
+ if not target_gene:
2496
+ return None
2497
+
2498
+ idx_series = pd.Series(results_df.index.astype(str), index=results_df.index)
2499
+ idx_no_ver = idx_series.str.split(".").str[0]
2500
+ tgt = target_gene
2501
+ tgt_no_ver = tgt.split(".")[0]
2502
+ tgt_lower = tgt.lower()
2503
+ tgt_no_ver_lower = tgt_no_ver.lower()
2504
+
2505
+ mask = (idx_series.str.lower() == tgt_lower) | (idx_no_ver.str.lower() == tgt_no_ver_lower)
2506
+
2507
+ # If the target appears to be a symbol and IDs are in results, use mapping if provided/discoverable.
2508
+ if not mask.any():
2509
+ mapper = None
2510
+
2511
+ def _build_mapper(gm_df: "pd.DataFrame") -> "dict[str, str]":
2512
+ nonlocal gene_id_col, gene_symbol_col
2513
+ id_col = gene_id_col or ("ENSG_ID" if "ENSG_ID" in gm_df.columns else gm_df.columns[0])
2514
+ sym_col = gene_symbol_col or (
2515
+ "gene_name"
2516
+ if "gene_name" in gm_df.columns
2517
+ else ("symbol" if "symbol" in gm_df.columns else gm_df.columns[-1])
2518
+ )
2519
+ gm2 = gm_df[[id_col, sym_col]].dropna().copy()
2520
+ gm2[id_col] = gm2[id_col].astype(str).str.split(".").str[0]
2521
+ gm2[sym_col] = gm2[sym_col].astype(str)
2522
+ return {
2523
+ symbol.lower(): gid
2524
+ for gid, symbol in zip(gm2[id_col], gm2[sym_col])
2525
+ if symbol
2526
+ }
2527
+
2528
+ if gene_map_path:
2529
+ gm, gm_err = _load_tabular(gene_map_path, index_col=None)
2530
+ if not gm_err and gm is not None and not gm.empty:
2531
+ mapper = _build_mapper(gm)
2532
+ else:
2533
+ # Best-effort auto-discovery for common capsule naming patterns.
2534
+ try:
2535
+ base_dir = Path(counts_path).expanduser().resolve().parent
2536
+ candidates = sorted(
2537
+ [
2538
+ p
2539
+ for p in base_dir.iterdir()
2540
+ if p.is_file()
2541
+ and p.suffix.lower() in {".csv", ".tsv", ".txt", ".xlsx", ".xls"}
2542
+ and ("gene" in p.name.lower() and ("meta" in p.name.lower() or "annot" in p.name.lower()))
2543
+ ]
2544
+ )
2545
+ for cand in candidates:
2546
+ gm, gm_err = _load_tabular(str(cand), index_col=None)
2547
+ if gm_err or gm is None or gm.empty:
2548
+ continue
2549
+ mapper = _build_mapper(gm)
2550
+ if mapper:
2551
+ break
2552
+ except Exception:
2553
+ mapper = None
2554
+
2555
+ if mapper:
2556
+ mapped_id = mapper.get(tgt_lower)
2557
+ if mapped_id:
2558
+ mask = idx_no_ver.str.lower() == mapped_id.lower()
2559
+
2560
+ if not mask.any():
2561
+ return {
2562
+ "target_gene": target_gene,
2563
+ "found": False,
2564
+ }
2565
+
2566
+ row = results_df.loc[mask].iloc[0]
2567
+ return {
2568
+ "target_gene": target_gene,
2569
+ "found": True,
2570
+ "matched_gene_id": str(results_df.loc[mask].index[0]),
2571
+ "log2FoldChange": float(row.get("log2FoldChange")) if pd.notna(row.get("log2FoldChange")) else None,
2572
+ "baseMean": float(row.get("baseMean")) if pd.notna(row.get("baseMean")) else None,
2573
+ "padj": float(row.get("padj")) if pd.notna(row.get("padj")) else None,
2574
+ "pvalue": float(row.get("pvalue")) if "pvalue" in row and pd.notna(row.get("pvalue")) else None,
2575
+ }
2576
+
2577
+ # Ensure categorical encoding for design variables.
2578
+ # Coerce to string first so mixed numeric/string covariates (e.g., batch IDs)
2579
+ # convert cleanly through pandas2ri into R factors.
2580
+ metadata = metadata.copy()
2581
+ for col in design_terms:
2582
+ metadata[col] = metadata[col].astype(str).astype("category")
2583
+ if ref in metadata[condition_col].cat.categories:
2584
+ ordered_levels = [ref] + [x for x in metadata[condition_col].cat.categories if x != ref]
2585
+ metadata[condition_col] = metadata[condition_col].cat.reorder_categories(ordered_levels)
2586
+
2587
+ # Try native R DESeq2 first (when requested and available), then fall back
2588
+ # to pyDESeq2 for environments without DESeq2.
2589
+ if use_r_deseq2:
2590
+ try:
2591
+ import rpy2.robjects as ro
2592
+ from rpy2.robjects import pandas2ri
2593
+ from rpy2.robjects.conversion import localconverter
2594
+ from rpy2.robjects.packages import importr
2595
+
2596
+ # Ensure DESeq2 is available in either user or system R library.
2597
+ ro.r(".libPaths(c('~/R/library', .libPaths()))")
2598
+ importr("DESeq2")
2599
+ if lfc_shrink:
2600
+ try:
2601
+ importr("apeglm")
2602
+ except Exception:
2603
+ # Shrinkage is optional; proceed without if apeglm unavailable.
2604
+ lfc_shrink = False
2605
+
2606
+ counts_r = counts.astype(int)
2607
+ meta_r = metadata.copy()
2608
+ # R expects colData rownames to match countData colnames.
2609
+ meta_r.index = counts_r.columns
2610
+
2611
+ with localconverter(ro.default_converter + pandas2ri.converter):
2612
+ ro.globalenv["counts_df"] = counts_r
2613
+ ro.globalenv["meta_df"] = meta_r
2614
+
2615
+ ro.globalenv["design_formula_str"] = design_formula
2616
+ ro.globalenv["condition_col_str"] = condition_col
2617
+ ro.globalenv["test_level_str"] = test
2618
+ ro.globalenv["ref_level_str"] = ref
2619
+ ro.globalenv["alpha_val"] = float(alpha)
2620
+ ro.globalenv["do_shrink"] = bool(lfc_shrink)
2621
+
2622
+ r_script = """
2623
+ suppressPackageStartupMessages(library(DESeq2))
2624
+ if (isTRUE(do_shrink)) {
2625
+ suppressPackageStartupMessages(library(apeglm))
2626
+ }
2627
+ counts_mat <- as.matrix(counts_df)
2628
+ mode(counts_mat) <- "integer"
2629
+ meta <- as.data.frame(meta_df)
2630
+ cond_vals <- as.character(meta[[condition_col_str]])
2631
+ meta[[condition_col_str]] <- factor(cond_vals)
2632
+ meta[[condition_col_str]] <- relevel(meta[[condition_col_str]], ref = ref_level_str)
2633
+ dds <- DESeqDataSetFromMatrix(
2634
+ countData = counts_mat,
2635
+ colData = meta,
2636
+ design = as.formula(design_formula_str)
2637
+ )
2638
+ dds <- DESeq(dds, quiet = TRUE)
2639
+ res <- results(
2640
+ dds,
2641
+ contrast = c(condition_col_str, test_level_str, ref_level_str),
2642
+ alpha = alpha_val
2643
+ )
2644
+ shrink_coeff <- NA_character_
2645
+ if (isTRUE(do_shrink)) {
2646
+ rn <- resultsNames(dds)
2647
+ cand <- rn[grepl(paste0("^", condition_col_str, "_"), rn)]
2648
+ if (length(cand) > 0) {
2649
+ shrink_coeff <- cand[1]
2650
+ res <- lfcShrink(dds, coef = shrink_coeff, type = "apeglm")
2651
+ }
2652
+ }
2653
+ res_df <- as.data.frame(res)
2654
+ res_df$gene_id <- rownames(res_df)
2655
+ """
2656
+ ro.r(r_script)
2657
+ with localconverter(ro.default_converter + pandas2ri.converter):
2658
+ res_df = ro.conversion.rpy2py(ro.globalenv["res_df"])
2659
+ shrink_coeff = str(ro.globalenv["shrink_coeff"][0]) if "shrink_coeff" in ro.globalenv else None
2660
+ if shrink_coeff in {"NA", "NA_character_", "None"}:
2661
+ shrink_coeff = None
2662
+
2663
+ # Normalize column names to match pyDESeq2-style payload.
2664
+ if "log2FoldChange" not in res_df.columns and "log2FoldChange" in [str(c) for c in res_df.columns]:
2665
+ pass
2666
+ if "baseMean" not in res_df.columns or "padj" not in res_df.columns:
2667
+ raise ValueError("R DESeq2 results missing required columns")
2668
+ res_df = res_df.set_index("gene_id")
2669
+ results = res_df.sort_values("padj")
2670
+
2671
+ n_sig = int((results["padj"] < alpha).sum())
2672
+ n_up = int(((results["padj"] < alpha) & (results["log2FoldChange"] > 0)).sum())
2673
+ n_down = int(((results["padj"] < alpha) & (results["log2FoldChange"] < 0)).sum())
2674
+ target_gene_result = _resolve_target_gene(results)
2675
+ target_gene_summary = ""
2676
+ if target_gene_result:
2677
+ if target_gene_result.get("found"):
2678
+ lfc_val = target_gene_result.get("log2FoldChange")
2679
+ lfc_txt = f"{lfc_val:.6g}" if lfc_val is not None else "NA"
2680
+ target_gene_summary = f" {target_gene} log2FoldChange={lfc_txt}."
2681
+ else:
2682
+ target_gene_summary = f" {target_gene} was not found in result gene IDs."
2683
+ return {
2684
+ "method": "DESeq2 (R via rpy2)",
2685
+ "n_genes_tested": len(results),
2686
+ "n_significant": n_sig,
2687
+ "n_upregulated": n_up,
2688
+ "n_downregulated": n_down,
2689
+ "contrast": f"{test} vs {ref}",
2690
+ "design": design_formula,
2691
+ "covariates": covars,
2692
+ "n_samples_ref": n_ref,
2693
+ "n_samples_test": n_test,
2694
+ "n_shared_samples": int(len(common)),
2695
+ "prefilter": {
2696
+ "min_count": int(prefilter_min_count),
2697
+ "min_samples": int(prefilter_min_samples),
2698
+ "n_genes_after": int(len(results)),
2699
+ },
2700
+ "metadata_inferred": bool(infer_metadata and not metadata_path),
2701
+ "alpha": alpha,
2702
+ "lfc_shrink": bool(lfc_shrink),
2703
+ "lfc_shrink_coeff": shrink_coeff,
2704
+ "top_hits": results.head(20).to_dict("index"),
2705
+ "target_gene_result": target_gene_result,
2706
+ "summary": (
2707
+ f"DESeq2 (R): {len(results):,} genes tested ({test} vs {ref}) with design {design_formula}. "
2708
+ f"{n_sig} significant (padj<{alpha}): {n_up} up, {n_down} down."
2709
+ + (" Metadata was inferred from sample order (exploratory)." if infer_metadata and not metadata_path else "")
2710
+ + target_gene_summary
2711
+ ),
2712
+ }
2713
+ except Exception as exc:
2714
+ logger.warning("R DESeq2 backend failed, falling back to pyDESeq2: %s", exc)
2715
+
2716
+ # Try pyDESeq2
2717
+ if _check_pydeseq2():
2718
+ try:
2719
+ from pydeseq2.dds import DeseqDataSet
2720
+ from pydeseq2.ds import DeseqStats
2721
+ from pydeseq2.default_inference import DefaultInference
2722
+
2723
+ inference = DefaultInference(n_cpus=1)
2724
+ # pyDESeq2 wants samples as rows, genes as columns
2725
+ dds = DeseqDataSet(
2726
+ counts=counts.T,
2727
+ metadata=metadata,
2728
+ design=design_formula,
2729
+ refit_cooks=True,
2730
+ inference=inference,
2731
+ quiet=True,
2732
+ )
2733
+ dds.deseq2()
2734
+
2735
+ stat = DeseqStats(
2736
+ dds,
2737
+ contrast=[condition_col, test, ref],
2738
+ alpha=alpha,
2739
+ inference=inference,
2740
+ quiet=True,
2741
+ )
2742
+ stat.summary()
2743
+
2744
+ # Optional apeglm shrinkage on the requested condition coefficient.
2745
+ shrink_coeff = None
2746
+ if lfc_shrink and hasattr(dds, "varm") and "LFC" in dds.varm:
2747
+ coeffs = list(dds.varm["LFC"].columns)
2748
+ preferred = [
2749
+ c for c in coeffs
2750
+ if condition_col in c and (test in c or test.replace("-", "_") in c)
2751
+ ]
2752
+ if preferred:
2753
+ shrink_coeff = preferred[0]
2754
+ try:
2755
+ stat.lfc_shrink(coeff=shrink_coeff)
2756
+ except Exception as exc:
2757
+ logger.warning("LFC shrinkage failed for coeff %s: %s", shrink_coeff, exc)
2758
+
2759
+ results = stat.results_df.sort_values("padj")
2760
+ n_sig = int((results["padj"] < alpha).sum())
2761
+ n_up = int(((results["padj"] < alpha) & (results["log2FoldChange"] > 0)).sum())
2762
+ n_down = int(((results["padj"] < alpha) & (results["log2FoldChange"] < 0)).sum())
2763
+ target_gene_result = _resolve_target_gene(results)
2764
+ target_gene_summary = ""
2765
+ if target_gene_result:
2766
+ if target_gene_result.get("found"):
2767
+ lfc_val = target_gene_result.get("log2FoldChange")
2768
+ lfc_txt = f"{lfc_val:.6g}" if lfc_val is not None else "NA"
2769
+ target_gene_summary = f" {target_gene} log2FoldChange={lfc_txt}."
2770
+ else:
2771
+ target_gene_summary = f" {target_gene} was not found in result gene IDs."
2772
+
2773
+ result_payload = {
2774
+ "method": "DESeq2 (pydeseq2)",
2775
+ "n_genes_tested": len(results),
2776
+ "n_significant": n_sig,
2777
+ "n_upregulated": n_up,
2778
+ "n_downregulated": n_down,
2779
+ "contrast": f"{test} vs {ref}",
2780
+ "design": design_formula,
2781
+ "covariates": covars,
2782
+ "n_samples_ref": n_ref,
2783
+ "n_samples_test": n_test,
2784
+ "n_shared_samples": int(len(common)),
2785
+ "prefilter": {
2786
+ "min_count": int(prefilter_min_count),
2787
+ "min_samples": int(prefilter_min_samples),
2788
+ "n_genes_after": int(len(results)),
2789
+ },
2790
+ "metadata_inferred": bool(infer_metadata and not metadata_path),
2791
+ "alpha": alpha,
2792
+ "lfc_shrink": bool(lfc_shrink),
2793
+ "lfc_shrink_coeff": shrink_coeff,
2794
+ "top_hits": results.head(20).to_dict("index"),
2795
+ "target_gene_result": target_gene_result,
2796
+ "summary": (
2797
+ f"DESeq2: {len(results):,} genes tested ({test} vs {ref}) with design {design_formula}. "
2798
+ f"{n_sig} significant (padj<{alpha}): {n_up} up, {n_down} down."
2799
+ + (" Metadata was inferred from sample order (exploratory)." if infer_metadata and not metadata_path else "")
2800
+ + target_gene_summary
2801
+ ),
2802
+ }
2803
+
2804
+ # Optional enrichment over significant DEGs with effect filters.
2805
+ if enrichment_library:
2806
+ sig = results[results["padj"] < alpha].copy()
2807
+ if min_abs_lfc > 0:
2808
+ sig = sig[sig["log2FoldChange"].abs() >= float(min_abs_lfc)]
2809
+ if min_base_mean > 0 and "baseMean" in sig.columns:
2810
+ sig = sig[sig["baseMean"] >= float(min_base_mean)]
2811
+
2812
+ genes_for_enrichment = list(sig.index.astype(str))
2813
+ mapped_gene_count = None
2814
+ if gene_map_path:
2815
+ gm, gm_err = _load_tabular(gene_map_path, index_col=None)
2816
+ if gm_err:
2817
+ result_payload["enrichment_error"] = f"Gene map load failed: {gm_err}"
2818
+ else:
2819
+ id_col = gene_id_col or ("ENSG_ID" if "ENSG_ID" in gm.columns else gm.columns[0])
2820
+ sym_col = gene_symbol_col or (
2821
+ "gene_name" if "gene_name" in gm.columns else ("symbol" if "symbol" in gm.columns else gm.columns[-1])
2822
+ )
2823
+ gm2 = gm[[id_col, sym_col]].dropna().copy()
2824
+ gm2[id_col] = gm2[id_col].astype(str)
2825
+ gm2[sym_col] = gm2[sym_col].astype(str)
2826
+ mapper = dict(zip(gm2[id_col], gm2[sym_col]))
2827
+ mapped = []
2828
+ for gid in genes_for_enrichment:
2829
+ mapped_sym = mapper.get(gid, mapper.get(gid.split(".")[0]))
2830
+ if mapped_sym:
2831
+ mapped.append(mapped_sym)
2832
+ genes_for_enrichment = sorted(set(mapped))
2833
+ mapped_gene_count = len(genes_for_enrichment)
2834
+
2835
+ if genes_for_enrichment:
2836
+ try:
2837
+ import gseapy
2838
+
2839
+ enr = gseapy.enrichr(
2840
+ gene_list=genes_for_enrichment,
2841
+ gene_sets=enrichment_library,
2842
+ outdir=None,
2843
+ no_plot=True,
2844
+ )
2845
+ enr_df = enr.results.copy()
2846
+ result_payload["enrichment"] = {
2847
+ "library": enrichment_library,
2848
+ "n_input_genes": len(genes_for_enrichment),
2849
+ "mapped_gene_count": mapped_gene_count,
2850
+ "n_terms": int(len(enr_df)),
2851
+ "top_terms": enr_df.head(20).to_dict("records"),
2852
+ }
2853
+
2854
+ if pathway_term:
2855
+ terms = enr_df["Term"].astype(str)
2856
+ exact = enr_df[terms.str.lower() == pathway_term.lower()]
2857
+ target_df = exact if not exact.empty else enr_df[terms.str.contains(pathway_term, case=False, na=False)]
2858
+ if not target_df.empty:
2859
+ target = target_df.iloc[0].to_dict()
2860
+ result_payload["pathway_match"] = target
2861
+ result_payload["pathway_odds_ratio"] = target.get("Odds Ratio")
2862
+ result_payload["summary"] += (
2863
+ f" Enrichment: '{target.get('Term', pathway_term)}' odds ratio "
2864
+ f"{target.get('Odds Ratio')}."
2865
+ )
2866
+ else:
2867
+ result_payload["pathway_match"] = None
2868
+ result_payload["summary"] += f" Enrichment ran but pathway '{pathway_term}' was not found."
2869
+ except Exception as exc:
2870
+ result_payload["enrichment_error"] = str(exc)
2871
+ result_payload["summary"] += " Enrichment step failed."
2872
+
2873
+ return result_payload
2874
+
2875
+ except Exception as exc:
2876
+ logger.warning("pyDESeq2 failed, falling back to Mann-Whitney: %s", exc)
2877
+
2878
+ # Fallback: Mann-Whitney U
2879
+ import numpy as np
2880
+ from scipy import stats
2881
+
2882
+ g1_samples = metadata.index[metadata[condition_col] == ref].tolist()
2883
+ g2_samples = metadata.index[metadata[condition_col] == test].tolist()
2884
+ g1 = counts[g1_samples]
2885
+ g2 = counts[g2_samples]
2886
+
2887
+ log2fc = np.log2((g2.mean(axis=1) + 1) / (g1.mean(axis=1) + 1))
2888
+ pvals = []
2889
+ for gene in counts.index:
2890
+ v1 = g1.loc[gene].dropna().values
2891
+ v2 = g2.loc[gene].dropna().values
2892
+ if len(v1) >= 2 and len(v2) >= 2:
2893
+ _, p = stats.mannwhitneyu(v1, v2, alternative="two-sided")
2894
+ pvals.append(p)
2895
+ else:
2896
+ pvals.append(1.0)
2897
+
2898
+ pvals = np.array(pvals)
2899
+ qvals = _fdr_correct(pvals)
2900
+
2901
+ n_sig = int((qvals < alpha).sum())
2902
+ n_up = int(((qvals < alpha) & (log2fc.values > 0)).sum())
2903
+ n_down = int(((qvals < alpha) & (log2fc.values < 0)).sum())
2904
+
2905
+ results = pd.DataFrame({
2906
+ "log2FoldChange": log2fc, "pvalue": pvals, "padj": qvals,
2907
+ }, index=counts.index).sort_values("padj")
2908
+ target_gene_result = _resolve_target_gene(results)
2909
+ target_gene_summary = ""
2910
+ if target_gene_result:
2911
+ if target_gene_result.get("found"):
2912
+ lfc_val = target_gene_result.get("log2FoldChange")
2913
+ lfc_txt = f"{lfc_val:.6g}" if lfc_val is not None else "NA"
2914
+ target_gene_summary = f" {target_gene} log2FoldChange={lfc_txt}."
2915
+ else:
2916
+ target_gene_summary = f" {target_gene} was not found in result gene IDs."
2917
+
2918
+ return {
2919
+ "method": "Mann-Whitney U (fallback — install pydeseq2 for proper DESeq2)",
2920
+ "n_genes_tested": len(results),
2921
+ "n_significant": n_sig,
2922
+ "n_upregulated": n_up,
2923
+ "n_downregulated": n_down,
2924
+ "contrast": f"{test} vs {ref}",
2925
+ "n_samples_ref": n_ref,
2926
+ "n_samples_test": n_test,
2927
+ "metadata_inferred": bool(infer_metadata and not metadata_path),
2928
+ "alpha": alpha,
2929
+ "top_hits": results.head(20).to_dict("index"),
2930
+ "target_gene_result": target_gene_result,
2931
+ "summary": (
2932
+ f"Differential expression (Mann-Whitney fallback): {len(results):,} genes ({test} vs {ref}). "
2933
+ f"{n_sig} significant (FDR<{alpha}): {n_up} up, {n_down} down. "
2934
+ f"Install pydeseq2 for proper negative binomial modeling."
2935
+ + (" Metadata was inferred from sample order (exploratory)." if infer_metadata and not metadata_path else "")
2936
+ + target_gene_summary
2937
+ ),
2938
+ }
2939
+
2940
+
2941
+ # ---------------------------------------------------------------------------
2942
+ # 20. omics.multiomics_integrate
2943
+ # ---------------------------------------------------------------------------
2944
+
2945
+
2946
+ @registry.register(
2947
+ name="omics.multiomics_integrate",
2948
+ description="Integrate multiple omics modalities using MOFA+ (Multi-Omics Factor Analysis)",
2949
+ category="omics",
2950
+ parameters={
2951
+ "paths": "Comma-separated paths to h5ad files for each modality",
2952
+ "modality_names": "Comma-separated names for each modality (e.g., 'rna,atac,protein')",
2953
+ "n_factors": "Number of latent factors to learn (default 10)",
2954
+ },
2955
+ usage_guide=(
2956
+ "Integrate multiple omics datasets (RNA + ATAC, RNA + protein, etc.) into a shared "
2957
+ "latent space using MOFA+. Requires muon: pip install muon. Each modality should be "
2958
+ "an h5ad file with overlapping cell barcodes."
2959
+ ),
2960
+ )
2961
+ def multiomics_integrate(
2962
+ paths: str = "",
2963
+ modality_names: str = "",
2964
+ n_factors: int = 10,
2965
+ **kwargs,
2966
+ ) -> dict:
2967
+ """Integrate multiple omics modalities using MOFA+."""
2968
+ mu = _check_muon()
2969
+ if mu is None:
2970
+ return {
2971
+ "error": "muon required. Install with: pip install muon mudata",
2972
+ "summary": "Install muon for multi-omics integration: pip install muon mudata",
2973
+ }
2974
+
2975
+ sc = _check_scanpy()
2976
+ if sc is None:
2977
+ return {"error": "scanpy required. Install with: pip install scanpy", "summary": "Install scanpy: pip install scanpy"}
2978
+
2979
+ from mudata import MuData
2980
+
2981
+ path_list = [p.strip() for p in paths.split(",") if p.strip()]
2982
+ name_list = [n.strip() for n in modality_names.split(",") if n.strip()]
2983
+
2984
+ if len(path_list) < 2:
2985
+ return {"error": "Need at least 2 modality paths", "summary": "Provide ≥2 h5ad paths for integration"}
2986
+
2987
+ if not name_list:
2988
+ name_list = [f"modality_{i}" for i in range(len(path_list))]
2989
+ if len(name_list) != len(path_list):
2990
+ return {"error": "Number of names must match number of paths", "summary": "Mismatched path/name count"}
2991
+
2992
+ # Load modalities
2993
+ modalities = {}
2994
+ for name, fpath in zip(name_list, path_list):
2995
+ fp = Path(fpath).expanduser()
2996
+ if not fp.exists():
2997
+ return {"error": f"File not found: {fpath}", "summary": f"Missing file: {fpath}"}
2998
+ try:
2999
+ adata = sc.read_h5ad(fp)
3000
+ modalities[name] = adata
3001
+ except Exception as exc:
3002
+ return {"error": f"Failed to load {fpath}: {str(exc)[:200]}", "summary": f"Load error: {str(exc)[:100]}"}
3003
+
3004
+ # Create MuData
3005
+ try:
3006
+ mdata = MuData(modalities)
3007
+ except Exception as exc:
3008
+ return {"error": f"MuData creation failed: {str(exc)[:200]}", "summary": f"Integration setup error: {str(exc)[:100]}"}
3009
+
3010
+ n_shared = mdata.n_obs
3011
+ mod_shapes = {name: (ad.n_obs, ad.n_vars) for name, ad in modalities.items()}
3012
+
3013
+ # Preprocess each modality
3014
+ for name in name_list:
3015
+ ad = mdata.mod[name]
3016
+ if ad.X.max() > 50: # likely raw counts
3017
+ sc.pp.normalize_total(ad, target_sum=1e4)
3018
+ sc.pp.log1p(ad)
3019
+ sc.pp.highly_variable_genes(ad, min_mean=0.0125, max_mean=3, min_disp=0.5)
3020
+
3021
+ # Run MOFA+
3022
+ try:
3023
+ mu.tl.mofa(mdata, n_factors=n_factors, quiet=True)
3024
+ except Exception as exc:
3025
+ return {
3026
+ "error": f"MOFA+ failed: {str(exc)[:200]}",
3027
+ "summary": f"MOFA+ integration failed: {str(exc)[:100]}",
3028
+ "n_shared_cells": n_shared,
3029
+ "modality_shapes": mod_shapes,
3030
+ }
3031
+
3032
+ # Extract results
3033
+ has_mofa = "X_mofa" in mdata.obsm
3034
+ if not has_mofa:
3035
+ return {
3036
+ "error": "MOFA+ did not produce embeddings",
3037
+ "summary": "Integration ran but produced no factors",
3038
+ }
3039
+
3040
+ # Downstream: neighbors + leiden on MOFA space
3041
+ sc.pp.neighbors(mdata, use_rep="X_mofa")
3042
+ sc.tl.leiden(mdata, resolution=1.0, key_added="joint_cluster")
3043
+
3044
+ clusters = mdata.obs["joint_cluster"].value_counts().to_dict()
3045
+ n_clusters = len(clusters)
3046
+
3047
+ return {
3048
+ "n_shared_cells": n_shared,
3049
+ "n_factors": n_factors,
3050
+ "modalities": name_list,
3051
+ "modality_shapes": mod_shapes,
3052
+ "n_joint_clusters": n_clusters,
3053
+ "joint_cluster_sizes": clusters,
3054
+ "summary": (
3055
+ f"MOFA+ integration of {len(name_list)} modalities "
3056
+ f"({', '.join(f'{n}: {s[0]}cells x {s[1]}features' for n, s in mod_shapes.items())}). "
3057
+ f"{n_shared:,} shared cells → {n_factors} factors → {n_clusters} joint clusters."
3058
+ ),
3059
+ }
3060
+
3061
+
3062
+ # ---------------------------------------------------------------------------
3063
+ # 21. omics.methylation_cluster
3064
+ # ---------------------------------------------------------------------------
3065
+
3066
+
3067
+ @registry.register(
3068
+ name="omics.methylation_cluster",
3069
+ description="Cluster samples by methylation patterns using episcanpy",
3070
+ category="omics",
3071
+ parameters={
3072
+ "path": "Path to methylation matrix (h5ad or CSV, CpG sites as rows, samples as columns)",
3073
+ "n_top_features": "Number of most variable CpGs to use (default 5000)",
3074
+ "resolution": "Leiden clustering resolution (default 1.0)",
3075
+ },
3076
+ usage_guide=(
3077
+ "Cluster cells/samples by DNA methylation profiles. Uses episcanpy for "
3078
+ "methylation-aware preprocessing if available, falls back to scanpy/sklearn. "
3079
+ "Works on Illumina 450K/EPIC beta-value matrices or single-cell methylation h5ad."
3080
+ ),
3081
+ )
3082
+ def methylation_cluster(
3083
+ path: str,
3084
+ n_top_features: int = 5000,
3085
+ resolution: float = 1.0,
3086
+ **kwargs,
3087
+ ) -> dict:
3088
+ """Cluster samples by methylation patterns."""
3089
+ import numpy as np
3090
+
3091
+ filepath = Path(path).expanduser()
3092
+ if not filepath.exists():
3093
+ return {"error": f"File not found: {path}", "summary": f"File not found: {path}"}
3094
+
3095
+ epi = _check_episcanpy()
3096
+ sc = _check_scanpy()
3097
+
3098
+ # Load data
3099
+ adata = None
3100
+ if filepath.suffix.lower() == ".h5ad":
3101
+ if sc is None and epi is None:
3102
+ return {"error": "scanpy or episcanpy required for h5ad", "summary": "Install scanpy or episcanpy"}
3103
+ reader = epi if epi else sc
3104
+ try:
3105
+ adata = reader.read_h5ad(filepath)
3106
+ except Exception as exc:
3107
+ return {"error": f"Failed to load h5ad: {str(exc)[:200]}", "summary": f"Load error: {str(exc)[:100]}"}
3108
+ else:
3109
+ # Tabular: load as AnnData
3110
+ import pandas as pd
3111
+
3112
+ df, error = _load_tabular(str(filepath))
3113
+ if error:
3114
+ return {"error": error, "summary": f"Could not load: {error}"}
3115
+ try:
3116
+ import anndata
3117
+
3118
+ # Transpose so samples are obs and CpGs are var
3119
+ adata = anndata.AnnData(X=df.T.values, obs=pd.DataFrame(index=df.columns), var=pd.DataFrame(index=df.index))
3120
+ except ImportError:
3121
+ return {"error": "anndata required: pip install anndata", "summary": "Install anndata"}
3122
+
3123
+ n_obs, n_vars = adata.n_obs, adata.n_vars
3124
+
3125
+ # Use episcanpy pipeline if available
3126
+ if epi is not None:
3127
+ try:
3128
+ # episcanpy variable feature selection
3129
+ epi.pp.filter_features(adata, min_cells=max(1, int(n_obs * 0.05)))
3130
+ epi.pp.select_var_feature(adata, nb_features=min(n_top_features, adata.n_vars))
3131
+ adata_use = adata[:, adata.var["highly_variable"]] if "highly_variable" in adata.var else adata
3132
+ epi.pp.pca(adata_use, n_comps=min(50, adata_use.n_vars - 1, adata_use.n_obs - 1))
3133
+ epi.pp.neighbors(adata_use, n_neighbors=15)
3134
+ epi.tl.leiden(adata_use, resolution=resolution)
3135
+
3136
+ clusters = adata_use.obs["leiden"].value_counts().to_dict()
3137
+
3138
+ result = {
3139
+ "method": "episcanpy",
3140
+ "n_samples": n_obs,
3141
+ "n_features_input": n_vars,
3142
+ "n_features_used": adata_use.n_vars,
3143
+ "n_clusters": len(clusters),
3144
+ "cluster_sizes": clusters,
3145
+ "summary": (
3146
+ f"Methylation clustering (episcanpy): {n_obs} samples, {adata_use.n_vars} variable CpGs → "
3147
+ f"{len(clusters)} clusters."
3148
+ ),
3149
+ }
3150
+
3151
+ # Try to find marker CpGs
3152
+ try:
3153
+ epi.tl.rank_features(adata_use, groupby="leiden")
3154
+ markers = {}
3155
+ for cl in adata_use.obs["leiden"].unique():
3156
+ markers[str(cl)] = list(adata_use.uns["rank_features_groups"]["names"][cl][:5])
3157
+ result["cluster_markers"] = markers
3158
+ except Exception:
3159
+ pass
3160
+
3161
+ return result
3162
+
3163
+ except Exception as exc:
3164
+ logger.warning("episcanpy pipeline failed, falling back to scanpy: %s", exc)
3165
+
3166
+ # Fallback: scanpy or sklearn
3167
+ if sc is not None:
3168
+ try:
3169
+ sc.pp.highly_variable_genes(adata, n_top_genes=min(n_top_features, adata.n_vars))
3170
+ adata_use = adata[:, adata.var["highly_variable"]]
3171
+ sc.tl.pca(adata_use, n_comps=min(50, adata_use.n_vars - 1, adata_use.n_obs - 1))
3172
+ sc.pp.neighbors(adata_use, n_neighbors=15)
3173
+ sc.tl.leiden(adata_use, resolution=resolution)
3174
+
3175
+ clusters = adata_use.obs["leiden"].value_counts().to_dict()
3176
+ return {
3177
+ "method": "scanpy (episcanpy not installed)",
3178
+ "n_samples": n_obs,
3179
+ "n_features_input": n_vars,
3180
+ "n_features_used": adata_use.n_vars,
3181
+ "n_clusters": len(clusters),
3182
+ "cluster_sizes": clusters,
3183
+ "summary": (
3184
+ f"Methylation clustering (scanpy fallback): {n_obs} samples → "
3185
+ f"{len(clusters)} clusters. Install episcanpy for methylation-specific analysis."
3186
+ ),
3187
+ }
3188
+ except Exception as exc:
3189
+ logger.warning("scanpy fallback failed: %s", exc)
3190
+
3191
+ # Last resort: sklearn KMeans
3192
+ from sklearn.decomposition import PCA
3193
+ from sklearn.cluster import KMeans
3194
+ from sklearn.preprocessing import StandardScaler
3195
+
3196
+ X = adata.X
3197
+ X = np.nan_to_num(X, nan=0.0)
3198
+ X = StandardScaler().fit_transform(X)
3199
+ n_comps = min(50, X.shape[0] - 1, X.shape[1] - 1)
3200
+ X_pca = PCA(n_components=n_comps).fit_transform(X)
3201
+ n_k = min(10, X.shape[0] // 2)
3202
+ labels = KMeans(n_clusters=max(n_k, 2), random_state=42, n_init=3).fit_predict(X_pca)
3203
+
3204
+ import pandas as pd
3205
+
3206
+ cluster_counts = pd.Series(labels).value_counts().to_dict()
3207
+ return {
3208
+ "method": "sklearn (install episcanpy or scanpy for better results)",
3209
+ "n_samples": n_obs,
3210
+ "n_features_input": n_vars,
3211
+ "n_clusters": len(cluster_counts),
3212
+ "cluster_sizes": {str(k): v for k, v in cluster_counts.items()},
3213
+ "summary": (
3214
+ f"Methylation clustering (sklearn fallback): {n_obs} samples → "
3215
+ f"{len(cluster_counts)} clusters. Install episcanpy for methylation-specific analysis."
3216
+ ),
3217
+ }
3218
+
3219
+
3220
# ---------------------------------------------------------------------------
# KEGG over-representation analysis (code-gen tool)
# ---------------------------------------------------------------------------

# System prompt handed to the code-generation LLM by `kegg_ora` (via
# ct.tools.code._generate_and_execute_code). It is a str.format template:
# {namespace_description} and {data_files_description} are filled in by the
# executor, so every literal brace in the embedded example code is doubled
# ({{...}}). NOTE(review): the exact text below is runtime behavior — the
# model's instructions — so treat any wording change as a behavior change.
KEGG_ORA_SYSTEM_PROMPT = """You are an expert bioinformatics data analyst performing KEGG pathway over-representation analysis.

{namespace_description}

## Available Data
{data_files_description}

## DATA EXPLORATION (DO THIS FIRST)
```python
print("Columns:", df.columns.tolist())
print("Shape:", df.shape)
print("Head:\\n", df.head(3))
if 'Unnamed: 0' in df.columns:
    df = df.set_index('Unnamed: 0')
```

## KEGG ORA METHOD
### Step 1: Determine organism code
Common codes: 'hsa' (human), 'mmu' (mouse), 'eco' (E. coli), 'sce' (yeast).
Check https://rest.kegg.jp/list/organism for others.

### Step 2: Fetch gene-pathway mappings
- `/link/pathway/{{org}}` returns gene-to-pathway mapping (strip `path:` prefix from pathway IDs)
- `/list/pathway/{{org}}` returns pathway names (already without `path:` prefix)
- `/list/{{org}}` returns ALL genes (use as background universe — not just pathway-annotated genes)
- Pathway names include organism suffix; use substring matching when searching.

### ORA parameters
- **Background**: all genes from `/list/{{org}}` (typically much larger than the pathway-annotated subset)
- **Size filters**: skip pathways with < 5 or > 500 genes
- **Significance**: p < 0.05 and BH-adjusted p < 0.05

### Step 3: Fisher's exact test
```python
import urllib.request
from scipy.stats import fisher_exact
from statsmodels.stats.multitest import multipletests

def run_kegg_ora(gene_ids, all_kegg_genes, path2genes, path_names, min_size=5, max_size=500):
    deg_kegg = set(gene_ids) & all_kegg_genes
    N = len(all_kegg_genes)
    n = len(deg_kegg)
    if n == 0:
        return pd.DataFrame()
    results = []
    for pid, pgenes in path2genes.items():
        K = len(pgenes)
        if K < min_size or K > max_size:
            continue
        k = len(deg_kegg & pgenes)
        if k == 0:
            continue
        _, pval = fisher_exact([[k, n-k], [K-k, N-K-n+k]], alternative='greater')
        results.append({{'pathway': pid, 'name': path_names.get(pid, ''),
                        'overlap': k, 'pathway_size': K, 'pvalue': pval}})
    if not results:
        return pd.DataFrame()
    res_df = pd.DataFrame(results)
    _, res_df['padj'], _, _ = multipletests(res_df['pvalue'], method='fdr_bh')
    return res_df
```

### Step 4: Gene ID matching
KEGG uses its own gene IDs. Always print examples from both your DEG list and KEGG to
verify overlap. If overlap is low (< 10%), try stripping prefixes or case normalization.

### Directional analysis
When working with DEG results (log2FoldChange), run ORA separately on upregulated
(log2FC > threshold) and downregulated (log2FC < -threshold) genes. Combined analysis
mixes opposing signals and can produce different pathway results.

## Rules
1. Do NOT import libraries already in the namespace (pd, np, plt, sns, scipy_stats, etc.)
2. Save plots to OUTPUT_DIR: `plt.savefig(OUTPUT_DIR / "filename.png", dpi=150, bbox_inches="tight")`; `plt.close()`
3. Assign result: `result = {{"summary": "...", "answer": "PRECISE_ANSWER"}}`
4. Use print() for intermediate output to verify correctness.
5. If 0 results from a filter: print the column values and debug — do not return "N/A".

Write ONLY the Python code. No explanation, no markdown fences.
"""
3304
+
3305
+
3306
+ @registry.register(
3307
+ name="omics.kegg_ora",
3308
+ description=(
3309
+ "KEGG pathway over-representation analysis (ORA) on differentially expressed genes "
3310
+ "using KEGG REST API + Fisher's exact test + BH correction"
3311
+ ),
3312
+ category="omics",
3313
+ parameters={"goal": "ORA analysis to perform (include organism code if known, e.g. 'hsa' for human)"},
3314
+ usage_guide=(
3315
+ "Use when the question asks about KEGG pathway enrichment via ORA (not GSEA). "
3316
+ "Handles non-human organisms via KEGG REST API. Uses Fisher's exact test with "
3317
+ "Benjamini-Hochberg FDR correction. "
3318
+ "For human gene set enrichment with gseapy, use code.execute instead."
3319
+ ),
3320
+ )
3321
+ def kegg_ora(goal: str, _session=None, _prior_results=None, **kwargs) -> dict:
3322
+ """Perform KEGG pathway over-representation analysis using generated code."""
3323
+ from ct.tools.code import _generate_and_execute_code
3324
+
3325
+ return _generate_and_execute_code(
3326
+ goal=goal,
3327
+ system_prompt_template=KEGG_ORA_SYSTEM_PROMPT,
3328
+ session=_session,
3329
+ prior_results=_prior_results,
3330
+ )