celltype-cli 0.1.0 (celltype_cli-0.1.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- celltype_cli-0.1.0.dist-info/METADATA +267 -0
- celltype_cli-0.1.0.dist-info/RECORD +89 -0
- celltype_cli-0.1.0.dist-info/WHEEL +4 -0
- celltype_cli-0.1.0.dist-info/entry_points.txt +2 -0
- celltype_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
- ct/__init__.py +3 -0
- ct/agent/__init__.py +0 -0
- ct/agent/case_studies.py +426 -0
- ct/agent/config.py +523 -0
- ct/agent/doctor.py +544 -0
- ct/agent/knowledge.py +523 -0
- ct/agent/loop.py +99 -0
- ct/agent/mcp_server.py +478 -0
- ct/agent/orchestrator.py +733 -0
- ct/agent/runner.py +656 -0
- ct/agent/sandbox.py +481 -0
- ct/agent/session.py +145 -0
- ct/agent/system_prompt.py +186 -0
- ct/agent/trace_store.py +228 -0
- ct/agent/trajectory.py +169 -0
- ct/agent/types.py +182 -0
- ct/agent/workflows.py +462 -0
- ct/api/__init__.py +1 -0
- ct/api/app.py +211 -0
- ct/api/config.py +120 -0
- ct/api/engine.py +124 -0
- ct/cli.py +1448 -0
- ct/data/__init__.py +0 -0
- ct/data/compute_providers.json +59 -0
- ct/data/cro_database.json +395 -0
- ct/data/downloader.py +238 -0
- ct/data/loaders.py +252 -0
- ct/kb/__init__.py +5 -0
- ct/kb/benchmarks.py +147 -0
- ct/kb/governance.py +106 -0
- ct/kb/ingest.py +415 -0
- ct/kb/reasoning.py +129 -0
- ct/kb/schema_monitor.py +162 -0
- ct/kb/substrate.py +387 -0
- ct/models/__init__.py +0 -0
- ct/models/llm.py +370 -0
- ct/tools/__init__.py +195 -0
- ct/tools/_compound_resolver.py +297 -0
- ct/tools/biomarker.py +368 -0
- ct/tools/cellxgene.py +282 -0
- ct/tools/chemistry.py +1371 -0
- ct/tools/claude.py +390 -0
- ct/tools/clinical.py +1153 -0
- ct/tools/clue.py +249 -0
- ct/tools/code.py +1069 -0
- ct/tools/combination.py +397 -0
- ct/tools/compute.py +402 -0
- ct/tools/cro.py +413 -0
- ct/tools/data_api.py +2114 -0
- ct/tools/design.py +295 -0
- ct/tools/dna.py +575 -0
- ct/tools/experiment.py +604 -0
- ct/tools/expression.py +655 -0
- ct/tools/files.py +957 -0
- ct/tools/genomics.py +1387 -0
- ct/tools/http_client.py +146 -0
- ct/tools/imaging.py +319 -0
- ct/tools/intel.py +223 -0
- ct/tools/literature.py +743 -0
- ct/tools/network.py +422 -0
- ct/tools/notification.py +111 -0
- ct/tools/omics.py +3330 -0
- ct/tools/ops.py +1230 -0
- ct/tools/parity.py +649 -0
- ct/tools/pk.py +245 -0
- ct/tools/protein.py +678 -0
- ct/tools/regulatory.py +643 -0
- ct/tools/remote_data.py +179 -0
- ct/tools/report.py +181 -0
- ct/tools/repurposing.py +376 -0
- ct/tools/safety.py +1280 -0
- ct/tools/shell.py +178 -0
- ct/tools/singlecell.py +533 -0
- ct/tools/statistics.py +552 -0
- ct/tools/structure.py +882 -0
- ct/tools/target.py +901 -0
- ct/tools/translational.py +123 -0
- ct/tools/viability.py +218 -0
- ct/ui/__init__.py +0 -0
- ct/ui/markdown.py +31 -0
- ct/ui/status.py +258 -0
- ct/ui/suggestions.py +567 -0
- ct/ui/terminal.py +1456 -0
- ct/ui/traces.py +112 -0
ct/data/downloader.py
ADDED
@@ -0,0 +1,238 @@
+"""
+Dataset downloader for ct.
+
+Manages downloading and caching of common drug discovery datasets.
+Supports automatic downloads for open-access datasets and guided
+instructions for datasets requiring portal authentication.
+"""
+
+import gzip
+import hashlib
+import shutil
+from pathlib import Path
+
+import httpx
+from rich.console import Console
+from rich.progress import Progress, SpinnerColumn, BarColumn, DownloadColumn, TransferSpeedColumn
+from rich.table import Table
+
+from ct.agent.config import Config
+
+console = Console()
+
+# Download timeout in seconds (large files like CRISPRGeneEffect ~700MB need more time)
+DOWNLOAD_TIMEOUT = 600
+
+DATASETS = {
+    "depmap": {
+        "description": "DepMap CRISPR gene dependencies, mutations, cell line metadata (24Q4)",
+        "files": {
+            "CRISPRGeneEffect.csv": "https://ndownloader.figshare.com/files/51064667",
+            "Model.csv": "https://ndownloader.figshare.com/files/51065297",
+            "OmicsSomaticMutationsMatrixDamaging.csv": "https://ndownloader.figshare.com/files/51065747",
+        },
+        "source": "https://plus.figshare.com/articles/dataset/DepMap_24Q4_Public/27993248",
+        "auto_download": True,
+        "size_hint": "~580MB",
+    },
+    "prism": {
+        "description": "PRISM cell viability screening data",
+        "files": {
+            "prism_LFC_COLLAPSED.csv": None,
+        },
+        "source": "https://depmap.org/repurposing/",
+        "auto_download": False,
+        "note": "PRISM data requires manual download from https://depmap.org/repurposing/ or symlink from existing data.",
+        "size_hint": "~600MB",
+    },
+    "l1000": {
+        "description": "L1000 landmark gene expression signatures (978 genes)",
+        "files": {
+            "l1000_landmark_only.parquet": None,
+        },
+        "source": "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE92742",
+        "auto_download": False,
+        "note": "Run: python scripts/prepare_l1000.py to download from GEO and extract landmark subset.",
+        "size_hint": "~200MB",
+    },
+    "msigdb": {
+        "description": "MSigDB gene set collections (Hallmark, KEGG, Reactome, GO)",
+        "files": {
+            "h.all.v2024.1.Hs.json": "https://data.broadinstitute.org/gsea-msigdb/msigdb/release/2024.1.Hs/h.all.v2024.1.Hs.json",
+            "c2.cp.kegg_legacy.v2024.1.Hs.json": "https://data.broadinstitute.org/gsea-msigdb/msigdb/release/2024.1.Hs/c2.cp.kegg_legacy.v2024.1.Hs.json",
+            "c2.cp.reactome.v2024.1.Hs.json": "https://data.broadinstitute.org/gsea-msigdb/msigdb/release/2024.1.Hs/c2.cp.reactome.v2024.1.Hs.json",
+            "c5.go.bp.v2024.1.Hs.json": "https://data.broadinstitute.org/gsea-msigdb/msigdb/release/2024.1.Hs/c5.go.bp.v2024.1.Hs.json",
+        },
+        "source": "https://www.gsea-msigdb.org/gsea/msigdb/",
+        "auto_download": True,
+    },
+    "string": {
+        "description": "STRING protein-protein interaction network (human)",
+        "files": {
+            "9606.protein.links.v12.0.txt.gz": "https://stringdb-downloads.org/download/protein.links.v12.0/9606.protein.links.v12.0.txt.gz",
+        },
+        "source": "https://string-db.org/",
+        "auto_download": True,
+    },
+    "alphafold": {
+        "description": "AlphaFold predicted protein structures (downloaded on demand per-protein)",
+        "files": {},
+        "source": "https://alphafold.ebi.ac.uk/",
+        "auto_download": False,
+        "note": "Structures are fetched on-demand by structure.alphafold_fetch tool.",
+    },
+}
+
+
+def _download_file(url: str, dest: Path, desc: str = None) -> bool:
+    """Download a file with progress bar. Returns True on success."""
+    desc = desc or dest.name
+    try:
+        with httpx.stream("GET", url, timeout=DOWNLOAD_TIMEOUT, follow_redirects=True) as resp:
+            if resp.status_code != 200:
+                console.print(f" [red]HTTP {resp.status_code} for {url}[/red]")
+                return False
+
+            total = int(resp.headers.get("content-length", 0))
+
+            with Progress(
+                SpinnerColumn(),
+                "[progress.description]{task.description}",
+                BarColumn(),
+                DownloadColumn(),
+                TransferSpeedColumn(),
+                console=console,
+            ) as progress:
+                task = progress.add_task(f" {desc}", total=total or None)
+                with open(dest, "wb") as f:
+                    for chunk in resp.iter_bytes(chunk_size=8192):
+                        f.write(chunk)
+                        progress.advance(task, len(chunk))
+
+        return True
+    except httpx.HTTPError as e:
+        console.print(f" [red]Download failed: {e}[/red]")
+        if dest.exists():
+            dest.unlink()
+        return False
+
+
+def download_dataset(name: str, output: Path = None):
+    """Download a dataset."""
+    if name == "--all" or name == "all":
+        download_all(output)
+        return
+
+    if name not in DATASETS:
+        console.print(f"[red]Unknown dataset: {name}[/red]")
+        console.print(f"Available: {', '.join(DATASETS.keys())}")
+        return
+
+    ds = DATASETS[name]
+    cfg = Config.load()
+    dest = output or Path(cfg.get("data.base")) / name
+    dest.mkdir(parents=True, exist_ok=True)
+
+    console.print(f"\n[cyan]{name}:[/cyan] {ds['description']}")
+    if ds.get("size_hint"):
+        console.print(f" Size: {ds['size_hint']}")
+    console.print(f" Destination: {dest}")
+
+    if not ds.get("auto_download"):
+        # Manual download required
+        if "note" in ds:
+            console.print(f" [yellow]{ds['note']}[/yellow]")
+        console.print(f" Source: {ds['source']}")
+        console.print(f" Files needed:")
+        for fname in ds["files"]:
+            fpath = dest / fname
+            status = "[green]found[/green]" if fpath.exists() else "[red]missing[/red]"
+            console.print(f" {fname} — {status}")
+        console.print(f"\n Download from {ds['source']} and place in {dest}/")
+        console.print(f" Then run: [cyan]ct config set data.{name} {dest}[/cyan]")
+        return

+    # Automatic download
+    downloaded = 0
+    skipped = 0
+    failed = 0
+
+    for fname, url in ds["files"].items():
+        fpath = dest / fname
+        if fpath.exists():
+            console.print(f" [dim]{fname} — already exists, skipping[/dim]")
+            skipped += 1
+            continue
+
+        if url is None:
+            console.print(f" [yellow]{fname} — no download URL, skip[/yellow]")
+            failed += 1
+            continue
+
+        if _download_file(url, fpath, fname):
+            downloaded += 1
+        else:
+            failed += 1
+
+    # Summary
+    total = len(ds["files"])
+    console.print(f"\n [green]{downloaded} downloaded[/green], {skipped} skipped, ", end="")
+    if failed:
+        console.print(f"[red]{failed} failed[/red]")
+    else:
+        console.print(f"0 failed")
+
+    # Auto-configure data path after successful download
+    if downloaded > 0 or skipped > 0:
+        cfg.set(f"data.{name}", str(dest))
+        cfg.save()
+        console.print(f" [green]Auto-configured data.{name} = {dest}[/green]")
+
+
+def download_all(output: Path = None):
+    """Download all auto-downloadable datasets."""
+    auto_datasets = [name for name, ds in DATASETS.items() if ds.get("auto_download")]
+    console.print(f"[cyan]Downloading {len(auto_datasets)} datasets: {', '.join(auto_datasets)}[/cyan]")
+    for name in auto_datasets:
+        download_dataset(name, output=output)
+
+
+def dataset_status() -> Table:
+    """Check which datasets are available locally."""
+    cfg = Config.load()
+    base = Path(cfg.get("data.base"))
+
+    table = Table(title="Dataset Status")
+    table.add_column("Dataset", style="cyan")
+    table.add_column("Status")
+    table.add_column("Files", style="dim")
+    table.add_column("Auto-DL")
+
+    for name, ds in DATASETS.items():
+        # Check custom config path first, then default location
+        custom_path = cfg.get(f"data.{name}")
+        path = Path(custom_path) if custom_path else base / name
+
+        expected = set(ds["files"].keys())
+        found = set()
+        if path.exists():
+            existing = {f.name for f in path.iterdir() if f.is_file()}
+            found = expected & existing
+
+        if not expected:
+            status = "[dim]on-demand[/dim]"
+            files_str = "-"
+        elif found == expected:
+            status = "[green]complete[/green]"
+            files_str = f"{len(found)}/{len(expected)}"
+        elif found:
+            status = "[yellow]partial[/yellow]"
+            files_str = f"{len(found)}/{len(expected)}"
+        else:
+            status = "[red]missing[/red]"
+            files_str = f"0/{len(expected)}"
+
+        auto = "[green]yes[/green]" if ds.get("auto_download") else "[dim]manual[/dim]"
+        table.add_row(name, status, files_str, auto)
+
+    return table
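For orientation, the module's public surface is download_dataset, download_all, and dataset_status; the "ct data pull" command in ct/cli.py (not included in this diff) presumably wires them to the CLI. A minimal driver sketch, with the invocation pattern assumed rather than taken from the package:

    # Illustration only: the real entry point appears to be "ct data pull",
    # wired up in ct/cli.py, which this diff does not show.
    from rich.console import Console

    from ct.data.downloader import dataset_status, download_all, download_dataset

    console = Console()

    download_dataset("msigdb")       # auto-downloaded into <data.base>/msigdb
    download_dataset("prism")        # manual dataset: prints guided instructions instead
    download_all()                   # every entry with auto_download=True

    console.print(dataset_status())  # rich table: complete / partial / missing / on-demand

Note that a successful (or fully skipped) pull writes data.<name> back into the config, so later loaders can resolve the directory without extra flags.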
ct/data/loaders.py
ADDED
@@ -0,0 +1,252 @@
+"""
+Data loaders for common drug discovery datasets.
+
+Each loader checks configured paths, then the ct-data sister project,
+and supports both CSV and Parquet formats.
+Data paths are configured via ct config.
+"""
+
+import pandas as pd
+from pathlib import Path
+from functools import lru_cache
+
+from ct.agent.config import Config
+
+
+# Search order for data files (first match wins)
+_DATA_SEARCH_PATHS = [
+    Path.home() / "Projects" / "CellType" / "ct-data",  # Primary: ct-data repo
+    Path.home() / "Projects" / "CellType" / "crews-glue-discovery",  # Legacy fallback
+]
+
+
+def _data_path(key: str) -> Path:
+    """Get configured data path."""
+    cfg = Config.load()
+    path = cfg.get(f"data.{key}")
+    if path:
+        return Path(path)
+    base = cfg.get("data.base")
+    if base:
+        return Path(base) / key
+    return Path.home() / ".ct" / "data" / key
+
+
+def _find_file(name: str, subdirs: list[str] = None) -> Path | None:
+    """Search for a file across configured paths and common locations."""
+    subdirs = subdirs or [""]
+    stem = Path(name).stem
+
+    # 1. Check configured data.base
+    cfg = Config.load()
+    base = cfg.get("data.base")
+    search_dirs = []
+    if base:
+        search_dirs.append(Path(base))
+
+    # 2. Check known data locations
+    search_dirs.extend(_DATA_SEARCH_PATHS)
+
+    # 3. Check ~/.ct/data
+    search_dirs.append(Path.home() / ".ct" / "data")
+
+    for base_dir in search_dirs:
+        for sub in subdirs:
+            d = base_dir / sub if sub else base_dir
+            if not d.exists():
+                continue
+            # Exact match
+            candidate = d / name
+            if candidate.exists():
+                return candidate
+            # Try parquet variant
+            parquet = d / f"{stem}.parquet"
+            if parquet.exists():
+                return parquet
+
+    return None
+
+
+def _resolve_path(p: Path, filenames: list[str]) -> Path | None:
+    """If p is a directory, search for one of filenames inside it. If p is a file, return it."""
+    if p.is_file():
+        return p
+    if p.is_dir():
+        for name in filenames:
+            candidate = p / name
+            if candidate.exists():
+                return candidate
+    return None
+
+
+def _read_tabular(path: Path, **kwargs) -> pd.DataFrame:
+    """Read a CSV or Parquet file based on extension."""
+    if path.suffix == ".parquet":
+        return pd.read_parquet(path, **{k: v for k, v in kwargs.items() if k != "index_col"})
+    return pd.read_csv(path, **kwargs)
+
+
+@lru_cache(maxsize=1)
+def load_crispr() -> pd.DataFrame:
+    """Load DepMap CRISPR gene effect data."""
+    # Try configured path first
+    path = _find_file("CRISPRGeneEffect.csv", subdirs=["", "depmap"])
+    if path is None:
+        raise FileNotFoundError(
+            "DepMap CRISPR data not found. "
+            "Run: ct data pull depmap\n"
+            "Or set: ct config set data.base /path/to/data"
+        )
+    df = _read_tabular(path, index_col=0)
+    # Clean column names: "TP53 (7157)" → "TP53"
+    df.columns = [c.split(' (')[0] for c in df.columns]
+    return df
+
+
+@lru_cache(maxsize=1)
+def load_model_metadata() -> pd.DataFrame:
+    """Load DepMap cell line metadata."""
+    path = _find_file("Model.csv", subdirs=["", "depmap"])
+    if path is None:
+        raise FileNotFoundError(
+            "Model metadata not found. "
+            "Run: ct data pull depmap"
+        )
+    return _read_tabular(path)
+
+
+@lru_cache(maxsize=1)
+def load_proteomics() -> pd.DataFrame:
+    """Load proteomics LFC matrix."""
+    _prot_files = ["proteomics_log2fc_matrix.parquet", "proteomics_log2fc_matrix.csv", "merged_proteomics.csv"]
+    # Check configured path
+    cfg = Config.load()
+    explicit = cfg.get("data.proteomics")
+    if explicit:
+        p = _resolve_path(Path(explicit), _prot_files)
+        if p:
+            return _read_tabular(p, index_col=0)
+
+    # Search common locations
+    for name in ["merged_proteomics.csv", "proteomics_log2fc_matrix.parquet",
+                 "proteomics_log2fc_matrix.csv"]:
+        path = _find_file(name)
+        if path:
+            return _read_tabular(path, index_col=0)
+
+    raise FileNotFoundError(
+        "Proteomics data not found. "
+        "Set: ct config set data.proteomics /path/to/file"
+    )
+
+
+@lru_cache(maxsize=1)
+def load_l1000() -> pd.DataFrame:
+    """Load L1000 landmark gene expression data.
+
+    Prefers the compound-named profiles parquet (19,811 compounds × 978 genes)
+    built from the Broad LINCS GSE92742 Level 5 GCTX. Falls back to legacy
+    formats (YU-indexed parquet/CSV).
+    """
+    cfg = Config.load()
+
+    # 1. Prefer compound-named profiles (from LINCS GCTX, 19,811 compounds)
+    lincs_path = Path("/mnt2/bronze/lincs/l1000_compound_profiles.parquet")
+    if lincs_path.exists():
+        return _read_tabular(lincs_path, index_col=0)
+
+    # 2. Check data.base for compound profiles
+    base = cfg.get("data.base")
+    if base:
+        candidate = Path(base) / "lincs" / "l1000_compound_profiles.parquet"
+        if candidate.exists():
+            return _read_tabular(candidate, index_col=0)
+
+    # 3. Check explicit config (may point to legacy YU-indexed data)
+    explicit = cfg.get("data.l1000")
+    if explicit:
+        _l1000_files = [
+            "l1000_compound_profiles.parquet",
+            "l1000_landmark_only.parquet",
+            "L1000_landmark_LFC.csv",
+            "l1000_expression_matrix.parquet",
+            "l1000_landmark_only.csv",
+        ]
+        p = _resolve_path(Path(explicit), _l1000_files)
+        if p:
+            return _read_tabular(p, index_col=0)
+
+    # 4. Fall back to legacy formats
+    for name in ["l1000_compound_profiles.parquet", "L1000_landmark_LFC.csv",
+                 "l1000_landmark_only.parquet", "l1000_expression_matrix.parquet",
+                 "l1000_landmark_only.csv"]:
+        path = _find_file(name, subdirs=["", "lincs", "l1000"])
+        if path:
+            return _read_tabular(path, index_col=0)
+
+    raise FileNotFoundError(
+        "L1000 data not found. "
+        "Set: ct config set data.l1000 /path/to/file"
+    )
+
+
+@lru_cache(maxsize=1)
+def load_prism() -> pd.DataFrame:
+    """Load PRISM cell viability data."""
+    _prism_files = ["prism_LFC_COLLAPSED.csv", "prism_LFC_COLLAPSED.parquet"]
+    cfg = Config.load()
+    explicit = cfg.get("data.prism")
+    if explicit:
+        p = _resolve_path(Path(explicit), _prism_files)
+        if p:
+            return _read_tabular(p)
+
+    for name in ["prism_LFC_COLLAPSED.csv", "prism_LFC_COLLAPSED.parquet"]:
+        path = _find_file(name)
+        if path:
+            return _read_tabular(path)
+
+    raise FileNotFoundError(
+        "PRISM data not found. "
+        "Run: ct data pull prism\n"
+        "Or set: ct config set data.prism /path/to/file"
+    )
+
+
+def load_mutations() -> pd.DataFrame:
+    """Load DepMap somatic mutation data."""
+    path = _find_file("OmicsSomaticMutationsMatrixDamaging.csv", subdirs=["", "depmap"])
+    if path is None:
+        raise FileNotFoundError(
+            "Mutation data not found. "
+            "Run: ct data pull depmap"
+        )
+
+    df = _read_tabular(path)
+    meta_cols = ['Unnamed: 0', 'SequencingID', 'ModelConditionID',
+                 'IsDefaultEntryForModel', 'IsDefaultEntryForMC']
+
+    if 'IsDefaultEntryForModel' in df.columns:
+        df = df[df['IsDefaultEntryForModel'] == 'Yes']
+
+    if 'ModelID' in df.columns:
+        df = df.set_index('ModelID')
+    elif 'Unnamed: 0' in df.columns and df['Unnamed: 0'].astype(str).str.startswith('ACH-').any():
+        df = df.set_index('Unnamed: 0')
+        df.index.name = 'ModelID'
+    df = df.drop(columns=[c for c in meta_cols if c in df.columns], errors='ignore')
+    df.columns = [c.split(' (')[0] for c in df.columns]
+    return df
+
+
+def load_msigdb(collection: str = "h") -> dict:
+    """Load MSigDB gene sets."""
+    import json
+    # Try both naming patterns: "h.all.v2024.1.Hs.json" and "c2.cp.kegg_legacy.v2024.1.Hs.json"
+    for pattern in [f"{collection}.all.v2024.1.Hs.json", f"{collection}.v2024.1.Hs.json"]:
+        path = _find_file(pattern, subdirs=["", "msigdb"])
+        if path and path.exists():
+            with open(path) as f:
+                return json.load(f)
+
+    raise FileNotFoundError("MSigDB data not found. Run: ct data pull msigdb")
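These loaders are the consumption side of downloader.py: each resolves files through an explicit data.<key> config entry and/or _find_file's search order (data.base, then _DATA_SEARCH_PATHS, then ~/.ct/data), and most are memoized with @lru_cache(maxsize=1) so repeated calls reuse one in-memory DataFrame. A usage sketch, assuming the DepMap files have already been pulled; the OncotreeLineage column name follows recent DepMap Model.csv releases and is an assumption here:

    # Assumes "ct data pull depmap" has populated the data directory.
    from ct.data.loaders import load_crispr, load_model_metadata

    crispr = load_crispr()           # cell lines x genes; "(entrez)" suffixes stripped
    models = load_model_metadata()   # per-cell-line metadata with a ModelID column

    # Join one gene's dependency scores onto lineage annotations.
    tp53 = crispr["TP53"]
    merged = models.set_index("ModelID").join(tp53, how="inner")
    print(merged.groupby("OncotreeLineage")["TP53"].mean().sort_values().head())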
ct/kb/__init__.py
ADDED
ct/kb/benchmarks.py
ADDED
@@ -0,0 +1,147 @@
+"""
+Domain benchmark harness and release gating.
+"""
+
+from __future__ import annotations
+
+from dataclasses import asdict, dataclass
+import json
+from pathlib import Path
+from typing import Any
+
+try:
+    from ct.agent.quality import evaluate_synthesis_quality
+except ImportError:
+    evaluate_synthesis_quality = None
+
+
+@dataclass
+class BenchmarkCase:
+    name: str
+    domain: str
+    synthesis: str
+    completed_step_ids: list[int]
+    expect_pass: bool = True
+
+
+@dataclass
+class BenchmarkResult:
+    name: str
+    domain: str
+    passed: bool
+    expected_pass: bool
+    issues: list[str]
+
+
+class BenchmarkSuite:
+    """Runs deterministic benchmark cases for synthesis quality gates."""
+
+    def __init__(self, cases: list[BenchmarkCase]):
+        self.cases = cases
+
+    @classmethod
+    def load(cls, path: Path | None = None) -> "BenchmarkSuite":
+        source = path or (Path.cwd() / "configs" / "pharma_benchmarks.json")
+        if source.exists():
+            data = json.loads(source.read_text(encoding="utf-8"))
+            raw_cases = data.get("cases", [])
+            cases = [BenchmarkCase(**item) for item in raw_cases]
+            if cases:
+                return cls(cases)
+        return cls(default_cases())
+
+    def run(self) -> dict[str, Any]:
+        results: list[BenchmarkResult] = []
+        for case in self.cases:
+            quality = evaluate_synthesis_quality(
+                case.synthesis,
+                completed_step_ids=set(case.completed_step_ids),
+                require_key_evidence=True,
+                min_next_steps=2,
+                max_next_steps=3,
+            )
+            passed = quality.ok
+            results.append(
+                BenchmarkResult(
+                    name=case.name,
+                    domain=case.domain,
+                    passed=passed,
+                    expected_pass=case.expect_pass,
+                    issues=quality.issues,
+                )
+            )
+
+        expected_correct = 0
+        for result in results:
+            if result.passed == result.expected_pass:
+                expected_correct += 1
+        total = len(results)
+        pass_rate = expected_correct / max(total, 1)
+        domain_scores = self._domain_scores(results)
+        return {
+            "total_cases": total,
+            "expected_behavior_matches": expected_correct,
+            "pass_rate": round(pass_rate, 4),
+            "domain_scores": domain_scores,
+            "results": [asdict(r) for r in results],
+        }
+
+    @staticmethod
+    def _domain_scores(results: list[BenchmarkResult]) -> dict[str, float]:
+        buckets: dict[str, list[bool]] = {}
+        for result in results:
+            buckets.setdefault(result.domain, []).append(result.passed == result.expected_pass)
+        scores = {}
+        for domain, vals in buckets.items():
+            scores[domain] = round(sum(1 for v in vals if v) / max(len(vals), 1), 4)
+        return scores
+
+    @staticmethod
+    def gate(summary: dict[str, Any], *, min_pass_rate: float = 0.9) -> dict[str, Any]:
+        actual = float(summary.get("pass_rate", 0.0))
+        ok = actual >= min_pass_rate
+        return {
+            "ok": ok,
+            "min_pass_rate": min_pass_rate,
+            "actual_pass_rate": actual,
+            "message": (
+                f"Benchmark gate passed ({actual:.2%} >= {min_pass_rate:.2%})"
+                if ok
+                else f"Benchmark gate failed ({actual:.2%} < {min_pass_rate:.2%})"
+            ),
+        }
+
+
+def default_cases() -> list[BenchmarkCase]:
+    """Fallback deterministic cases when benchmark file is absent."""
+    return [
+        BenchmarkCase(
+            name="target_validation_grounded",
+            domain="target_validation",
+            synthesis=(
+                "## Answer\nSignal supports target progression.\n\n"
+                "## Key Evidence\n- Genetic support observed [step:1]\n"
+                "- Expression concordance supports mechanism [step:2]\n\n"
+                "## Confidence & Caveats\n- Moderate confidence.\n\n"
+                "## Suggested Next Steps\n"
+                "1. Run genomics.coloc for the top locus.\n"
+                "2. Run target.disease_association for indication prioritization.\n"
+            ),
+            completed_step_ids=[1, 2],
+            expect_pass=True,
+        ),
+        BenchmarkCase(
+            name="moa_missing_citation",
+            domain="moa_inference",
+            synthesis=(
+                "## Answer\nMechanism inferred.\n\n"
+                "## Key Evidence\n- Strong MOA pattern without citation\n\n"
+                "## Confidence & Caveats\n- Preliminary.\n\n"
+                "## Suggested Next Steps\n"
+                "1. Run expression.pathway_enrichment for the top signature.\n"
+                "2. Run repurposing.cmap_query on the differential profile.\n"
+            ),
+            completed_step_ids=[1, 2],
+            expect_pass=False,
+        ),
+    ]
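A sketch of how the suite appears intended to gate a release, assuming ct.agent.quality is importable in the environment (run() calls evaluate_synthesis_quality without checking the ImportError fallback, so it would raise a TypeError when that import failed):

    # Hypothetical release-gate script built on the public BenchmarkSuite API.
    import sys

    from ct.kb.benchmarks import BenchmarkSuite

    suite = BenchmarkSuite.load()    # configs/pharma_benchmarks.json, else default_cases()
    summary = suite.run()            # pass_rate = fraction of cases matching expect_pass
    verdict = BenchmarkSuite.gate(summary, min_pass_rate=0.9)

    print(verdict["message"])
    for result in summary["results"]:
        flag = "ok" if result["passed"] == result["expected_pass"] else "MISMATCH"
        print(f"  {result['domain']}/{result['name']}: {flag}")

    sys.exit(0 if verdict["ok"] else 1)  # non-zero exit blocks the release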