celltype_cli-0.1.0-py3-none-any.whl

Files changed (89)
  1. celltype_cli-0.1.0.dist-info/METADATA +267 -0
  2. celltype_cli-0.1.0.dist-info/RECORD +89 -0
  3. celltype_cli-0.1.0.dist-info/WHEEL +4 -0
  4. celltype_cli-0.1.0.dist-info/entry_points.txt +2 -0
  5. celltype_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
  6. ct/__init__.py +3 -0
  7. ct/agent/__init__.py +0 -0
  8. ct/agent/case_studies.py +426 -0
  9. ct/agent/config.py +523 -0
  10. ct/agent/doctor.py +544 -0
  11. ct/agent/knowledge.py +523 -0
  12. ct/agent/loop.py +99 -0
  13. ct/agent/mcp_server.py +478 -0
  14. ct/agent/orchestrator.py +733 -0
  15. ct/agent/runner.py +656 -0
  16. ct/agent/sandbox.py +481 -0
  17. ct/agent/session.py +145 -0
  18. ct/agent/system_prompt.py +186 -0
  19. ct/agent/trace_store.py +228 -0
  20. ct/agent/trajectory.py +169 -0
  21. ct/agent/types.py +182 -0
  22. ct/agent/workflows.py +462 -0
  23. ct/api/__init__.py +1 -0
  24. ct/api/app.py +211 -0
  25. ct/api/config.py +120 -0
  26. ct/api/engine.py +124 -0
  27. ct/cli.py +1448 -0
  28. ct/data/__init__.py +0 -0
  29. ct/data/compute_providers.json +59 -0
  30. ct/data/cro_database.json +395 -0
  31. ct/data/downloader.py +238 -0
  32. ct/data/loaders.py +252 -0
  33. ct/kb/__init__.py +5 -0
  34. ct/kb/benchmarks.py +147 -0
  35. ct/kb/governance.py +106 -0
  36. ct/kb/ingest.py +415 -0
  37. ct/kb/reasoning.py +129 -0
  38. ct/kb/schema_monitor.py +162 -0
  39. ct/kb/substrate.py +387 -0
  40. ct/models/__init__.py +0 -0
  41. ct/models/llm.py +370 -0
  42. ct/tools/__init__.py +195 -0
  43. ct/tools/_compound_resolver.py +297 -0
  44. ct/tools/biomarker.py +368 -0
  45. ct/tools/cellxgene.py +282 -0
  46. ct/tools/chemistry.py +1371 -0
  47. ct/tools/claude.py +390 -0
  48. ct/tools/clinical.py +1153 -0
  49. ct/tools/clue.py +249 -0
  50. ct/tools/code.py +1069 -0
  51. ct/tools/combination.py +397 -0
  52. ct/tools/compute.py +402 -0
  53. ct/tools/cro.py +413 -0
  54. ct/tools/data_api.py +2114 -0
  55. ct/tools/design.py +295 -0
  56. ct/tools/dna.py +575 -0
  57. ct/tools/experiment.py +604 -0
  58. ct/tools/expression.py +655 -0
  59. ct/tools/files.py +957 -0
  60. ct/tools/genomics.py +1387 -0
  61. ct/tools/http_client.py +146 -0
  62. ct/tools/imaging.py +319 -0
  63. ct/tools/intel.py +223 -0
  64. ct/tools/literature.py +743 -0
  65. ct/tools/network.py +422 -0
  66. ct/tools/notification.py +111 -0
  67. ct/tools/omics.py +3330 -0
  68. ct/tools/ops.py +1230 -0
  69. ct/tools/parity.py +649 -0
  70. ct/tools/pk.py +245 -0
  71. ct/tools/protein.py +678 -0
  72. ct/tools/regulatory.py +643 -0
  73. ct/tools/remote_data.py +179 -0
  74. ct/tools/report.py +181 -0
  75. ct/tools/repurposing.py +376 -0
  76. ct/tools/safety.py +1280 -0
  77. ct/tools/shell.py +178 -0
  78. ct/tools/singlecell.py +533 -0
  79. ct/tools/statistics.py +552 -0
  80. ct/tools/structure.py +882 -0
  81. ct/tools/target.py +901 -0
  82. ct/tools/translational.py +123 -0
  83. ct/tools/viability.py +218 -0
  84. ct/ui/__init__.py +0 -0
  85. ct/ui/markdown.py +31 -0
  86. ct/ui/status.py +258 -0
  87. ct/ui/suggestions.py +567 -0
  88. ct/ui/terminal.py +1456 -0
  89. ct/ui/traces.py +112 -0
ct/data/downloader.py ADDED
@@ -0,0 +1,238 @@
+ """
+ Dataset downloader for ct.
+
+ Manages downloading and caching of common drug discovery datasets.
+ Supports automatic downloads for open-access datasets and guided
+ instructions for datasets requiring portal authentication.
+ """
+
+ from pathlib import Path
+
+ import httpx
+ from rich.console import Console
+ from rich.progress import Progress, SpinnerColumn, BarColumn, DownloadColumn, TransferSpeedColumn
+ from rich.table import Table
+
+ from ct.agent.config import Config
+
+ console = Console()
+
+ # Download timeout in seconds (large files like CRISPRGeneEffect ~700MB need more time)
+ DOWNLOAD_TIMEOUT = 600
+
+ DATASETS = {
+     "depmap": {
+         "description": "DepMap CRISPR gene dependencies, mutations, cell line metadata (24Q4)",
+         "files": {
+             "CRISPRGeneEffect.csv": "https://ndownloader.figshare.com/files/51064667",
+             "Model.csv": "https://ndownloader.figshare.com/files/51065297",
+             "OmicsSomaticMutationsMatrixDamaging.csv": "https://ndownloader.figshare.com/files/51065747",
+         },
+         "source": "https://plus.figshare.com/articles/dataset/DepMap_24Q4_Public/27993248",
+         "auto_download": True,
+         "size_hint": "~580MB",
+     },
+     "prism": {
+         "description": "PRISM cell viability screening data",
+         "files": {
+             "prism_LFC_COLLAPSED.csv": None,
+         },
+         "source": "https://depmap.org/repurposing/",
+         "auto_download": False,
+         "note": "PRISM data requires manual download from https://depmap.org/repurposing/ or a symlink from existing data.",
+         "size_hint": "~600MB",
+     },
+     "l1000": {
+         "description": "L1000 landmark gene expression signatures (978 genes)",
+         "files": {
+             "l1000_landmark_only.parquet": None,
+         },
+         "source": "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE92742",
+         "auto_download": False,
+         "note": "Run: python scripts/prepare_l1000.py to download from GEO and extract the landmark subset.",
+         "size_hint": "~200MB",
+     },
+     "msigdb": {
+         "description": "MSigDB gene set collections (Hallmark, KEGG, Reactome, GO)",
+         "files": {
+             "h.all.v2024.1.Hs.json": "https://data.broadinstitute.org/gsea-msigdb/msigdb/release/2024.1.Hs/h.all.v2024.1.Hs.json",
+             "c2.cp.kegg_legacy.v2024.1.Hs.json": "https://data.broadinstitute.org/gsea-msigdb/msigdb/release/2024.1.Hs/c2.cp.kegg_legacy.v2024.1.Hs.json",
+             "c2.cp.reactome.v2024.1.Hs.json": "https://data.broadinstitute.org/gsea-msigdb/msigdb/release/2024.1.Hs/c2.cp.reactome.v2024.1.Hs.json",
+             "c5.go.bp.v2024.1.Hs.json": "https://data.broadinstitute.org/gsea-msigdb/msigdb/release/2024.1.Hs/c5.go.bp.v2024.1.Hs.json",
+         },
+         "source": "https://www.gsea-msigdb.org/gsea/msigdb/",
+         "auto_download": True,
+     },
+     "string": {
+         "description": "STRING protein-protein interaction network (human)",
+         "files": {
+             "9606.protein.links.v12.0.txt.gz": "https://stringdb-downloads.org/download/protein.links.v12.0/9606.protein.links.v12.0.txt.gz",
+         },
+         "source": "https://string-db.org/",
+         "auto_download": True,
+     },
+     "alphafold": {
+         "description": "AlphaFold predicted protein structures (downloaded on demand per protein)",
+         "files": {},
+         "source": "https://alphafold.ebi.ac.uk/",
+         "auto_download": False,
+         "note": "Structures are fetched on demand by the structure.alphafold_fetch tool.",
+     },
+ }
+
+
+ def _download_file(url: str, dest: Path, desc: str | None = None) -> bool:
+     """Download a file with a progress bar. Returns True on success."""
+     desc = desc or dest.name
+     try:
+         with httpx.stream("GET", url, timeout=DOWNLOAD_TIMEOUT, follow_redirects=True) as resp:
+             if resp.status_code != 200:
+                 console.print(f" [red]HTTP {resp.status_code} for {url}[/red]")
+                 return False
+
+             total = int(resp.headers.get("content-length", 0))
+
+             with Progress(
+                 SpinnerColumn(),
+                 "[progress.description]{task.description}",
+                 BarColumn(),
+                 DownloadColumn(),
+                 TransferSpeedColumn(),
+                 console=console,
+             ) as progress:
+                 task = progress.add_task(f" {desc}", total=total or None)
+                 with open(dest, "wb") as f:
+                     for chunk in resp.iter_bytes(chunk_size=8192):
+                         f.write(chunk)
+                         progress.advance(task, len(chunk))
+
+         return True
+     except httpx.HTTPError as e:
+         console.print(f" [red]Download failed: {e}[/red]")
+         if dest.exists():
+             dest.unlink()
+         return False
+
+
+ def download_dataset(name: str, output: Path | None = None):
+     """Download a dataset by name, or all auto-downloadable datasets."""
+     if name in ("--all", "all"):
+         download_all(output)
+         return
+
+     if name not in DATASETS:
+         console.print(f"[red]Unknown dataset: {name}[/red]")
+         console.print(f"Available: {', '.join(DATASETS.keys())}")
+         return
+
+     ds = DATASETS[name]
+     cfg = Config.load()
+     dest = output or Path(cfg.get("data.base")) / name
+     dest.mkdir(parents=True, exist_ok=True)
+
+     console.print(f"\n[cyan]{name}:[/cyan] {ds['description']}")
+     if ds.get("size_hint"):
+         console.print(f" Size: {ds['size_hint']}")
+     console.print(f" Destination: {dest}")
+
+     if not ds.get("auto_download"):
+         # Manual download required
+         if "note" in ds:
+             console.print(f" [yellow]{ds['note']}[/yellow]")
+         console.print(f" Source: {ds['source']}")
+         console.print(" Files needed:")
+         for fname in ds["files"]:
+             fpath = dest / fname
+             status = "[green]found[/green]" if fpath.exists() else "[red]missing[/red]"
+             console.print(f" {fname} — {status}")
+         console.print(f"\n Download from {ds['source']} and place in {dest}/")
+         console.print(f" Then run: [cyan]ct config set data.{name} {dest}[/cyan]")
+         return
+
+     # Automatic download
+     downloaded = 0
+     skipped = 0
+     failed = 0
+
+     for fname, url in ds["files"].items():
+         fpath = dest / fname
+         if fpath.exists():
+             console.print(f" [dim]{fname} — already exists, skipping[/dim]")
+             skipped += 1
+             continue
+
+         if url is None:
+             console.print(f" [yellow]{fname} — no download URL, skipping[/yellow]")
+             failed += 1
+             continue
+
+         if _download_file(url, fpath, fname):
+             downloaded += 1
+         else:
+             failed += 1
+
+     # Summary
+     console.print(f"\n [green]{downloaded} downloaded[/green], {skipped} skipped, ", end="")
+     if failed:
+         console.print(f"[red]{failed} failed[/red]")
+     else:
+         console.print("0 failed")
+
+     # Auto-configure the data path after a successful download
+     if downloaded > 0 or skipped > 0:
+         cfg.set(f"data.{name}", str(dest))
+         cfg.save()
+         console.print(f" [green]Auto-configured data.{name} = {dest}[/green]")
+
+
+ def download_all(output: Path | None = None):
+     """Download all auto-downloadable datasets."""
+     auto_datasets = [name for name, ds in DATASETS.items() if ds.get("auto_download")]
+     console.print(f"[cyan]Downloading {len(auto_datasets)} datasets: {', '.join(auto_datasets)}[/cyan]")
+     for name in auto_datasets:
+         download_dataset(name, output=output)
+
+
+ def dataset_status() -> Table:
+     """Check which datasets are available locally."""
+     cfg = Config.load()
+     base = Path(cfg.get("data.base"))
+
+     table = Table(title="Dataset Status")
+     table.add_column("Dataset", style="cyan")
+     table.add_column("Status")
+     table.add_column("Files", style="dim")
+     table.add_column("Auto-DL")
+
+     for name, ds in DATASETS.items():
+         # Check the custom config path first, then the default location
+         custom_path = cfg.get(f"data.{name}")
+         path = Path(custom_path) if custom_path else base / name
+
+         expected = set(ds["files"].keys())
+         found = set()
+         if path.exists():
+             existing = {f.name for f in path.iterdir() if f.is_file()}
+             found = expected & existing
+
+         if not expected:
+             status = "[dim]on-demand[/dim]"
+             files_str = "-"
+         elif found == expected:
+             status = "[green]complete[/green]"
+             files_str = f"{len(found)}/{len(expected)}"
+         elif found:
+             status = "[yellow]partial[/yellow]"
+             files_str = f"{len(found)}/{len(expected)}"
+         else:
+             status = "[red]missing[/red]"
+             files_str = f"0/{len(expected)}"
+
+         auto = "[green]yes[/green]" if ds.get("auto_download") else "[dim]manual[/dim]"
+         table.add_row(name, status, files_str, auto)
+
+     return table
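
A minimal usage sketch for this module, assuming the package is installed and a writable data directory; the output path below is an illustrative example, not a project default:

    from pathlib import Path
    from ct.data.downloader import download_dataset, dataset_status, console

    # Pull one auto-downloadable dataset into an explicit directory
    # (illustrative path; omit output to use the configured data.base).
    download_dataset("msigdb", output=Path.home() / "ct-data" / "msigdb")

    # Render the local availability table for all registered datasets.
    console.print(dataset_status())

Judging from the error messages in loaders.py, the `ct data pull <name>` CLI command appears to wrap this same flow.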
ct/data/loaders.py ADDED
@@ -0,0 +1,252 @@
+ """
+ Data loaders for common drug discovery datasets.
+
+ Each loader checks configured paths, then the ct-data sister project,
+ and supports both CSV and Parquet formats.
+ Data paths are configured via ct config.
+ """
+
+ import json
+ from functools import lru_cache
+ from pathlib import Path
+
+ import pandas as pd
+
+ from ct.agent.config import Config
+
+
+ # Search order for data files (first match wins)
+ _DATA_SEARCH_PATHS = [
+     Path.home() / "Projects" / "CellType" / "ct-data",  # Primary: ct-data repo
+     Path.home() / "Projects" / "CellType" / "crews-glue-discovery",  # Legacy fallback
+ ]
+
+
+ def _data_path(key: str) -> Path:
+     """Get the configured data path for a dataset key."""
+     cfg = Config.load()
+     path = cfg.get(f"data.{key}")
+     if path:
+         return Path(path)
+     base = cfg.get("data.base")
+     if base:
+         return Path(base) / key
+     return Path.home() / ".ct" / "data" / key
+
+
+ def _find_file(name: str, subdirs: list[str] | None = None) -> Path | None:
+     """Search for a file across configured paths and common locations."""
+     subdirs = subdirs or [""]
+     stem = Path(name).stem
+
+     # 1. Check configured data.base
+     cfg = Config.load()
+     base = cfg.get("data.base")
+     search_dirs = []
+     if base:
+         search_dirs.append(Path(base))
+
+     # 2. Check known data locations
+     search_dirs.extend(_DATA_SEARCH_PATHS)
+
+     # 3. Check ~/.ct/data
+     search_dirs.append(Path.home() / ".ct" / "data")
+
+     for base_dir in search_dirs:
+         for sub in subdirs:
+             d = base_dir / sub if sub else base_dir
+             if not d.exists():
+                 continue
+             # Exact match
+             candidate = d / name
+             if candidate.exists():
+                 return candidate
+             # Try the parquet variant
+             parquet = d / f"{stem}.parquet"
+             if parquet.exists():
+                 return parquet
+
+     return None
+
+
+ def _resolve_path(p: Path, filenames: list[str]) -> Path | None:
+     """If p is a directory, search for one of filenames inside it. If p is a file, return it."""
+     if p.is_file():
+         return p
+     if p.is_dir():
+         for name in filenames:
+             candidate = p / name
+             if candidate.exists():
+                 return candidate
+     return None
+
+
+ def _read_tabular(path: Path, **kwargs) -> pd.DataFrame:
+     """Read a CSV or Parquet file based on extension."""
+     if path.suffix == ".parquet":
+         return pd.read_parquet(path, **{k: v for k, v in kwargs.items() if k != "index_col"})
+     return pd.read_csv(path, **kwargs)
+
+
+ @lru_cache(maxsize=1)
+ def load_crispr() -> pd.DataFrame:
+     """Load DepMap CRISPR gene effect data."""
+     # Try configured paths first
+     path = _find_file("CRISPRGeneEffect.csv", subdirs=["", "depmap"])
+     if path is None:
+         raise FileNotFoundError(
+             "DepMap CRISPR data not found. "
+             "Run: ct data pull depmap\n"
+             "Or set: ct config set data.base /path/to/data"
+         )
+     df = _read_tabular(path, index_col=0)
+     # Clean column names: "TP53 (7157)" → "TP53"
+     df.columns = [c.split(' (')[0] for c in df.columns]
+     return df
+
+
+ @lru_cache(maxsize=1)
+ def load_model_metadata() -> pd.DataFrame:
+     """Load DepMap cell line metadata."""
+     path = _find_file("Model.csv", subdirs=["", "depmap"])
+     if path is None:
+         raise FileNotFoundError(
+             "Model metadata not found. "
+             "Run: ct data pull depmap"
+         )
+     return _read_tabular(path)
+
+
+ @lru_cache(maxsize=1)
+ def load_proteomics() -> pd.DataFrame:
+     """Load the proteomics LFC matrix."""
+     _prot_files = ["proteomics_log2fc_matrix.parquet", "proteomics_log2fc_matrix.csv", "merged_proteomics.csv"]
+     # Check the configured path
+     cfg = Config.load()
+     explicit = cfg.get("data.proteomics")
+     if explicit:
+         p = _resolve_path(Path(explicit), _prot_files)
+         if p:
+             return _read_tabular(p, index_col=0)
+
+     # Search common locations
+     for name in ["merged_proteomics.csv", "proteomics_log2fc_matrix.parquet",
+                  "proteomics_log2fc_matrix.csv"]:
+         path = _find_file(name)
+         if path:
+             return _read_tabular(path, index_col=0)
+
+     raise FileNotFoundError(
+         "Proteomics data not found. "
+         "Set: ct config set data.proteomics /path/to/file"
+     )
+
+
+ @lru_cache(maxsize=1)
+ def load_l1000() -> pd.DataFrame:
+     """Load L1000 landmark gene expression data.
+
+     Prefers the compound-named profiles parquet (19,811 compounds × 978 genes)
+     built from the Broad LINCS GSE92742 Level 5 GCTX. Falls back to legacy
+     formats (YU-indexed parquet/CSV).
+     """
+     cfg = Config.load()
+
+     # 1. Prefer compound-named profiles (from LINCS GCTX, 19,811 compounds)
+     lincs_path = Path("/mnt2/bronze/lincs/l1000_compound_profiles.parquet")
+     if lincs_path.exists():
+         return _read_tabular(lincs_path, index_col=0)
+
+     # 2. Check data.base for compound profiles
+     base = cfg.get("data.base")
+     if base:
+         candidate = Path(base) / "lincs" / "l1000_compound_profiles.parquet"
+         if candidate.exists():
+             return _read_tabular(candidate, index_col=0)
+
+     # 3. Check the explicit config (may point to legacy YU-indexed data)
+     explicit = cfg.get("data.l1000")
+     if explicit:
+         _l1000_files = [
+             "l1000_compound_profiles.parquet",
+             "l1000_landmark_only.parquet",
+             "L1000_landmark_LFC.csv",
+             "l1000_expression_matrix.parquet",
+             "l1000_landmark_only.csv",
+         ]
+         p = _resolve_path(Path(explicit), _l1000_files)
+         if p:
+             return _read_tabular(p, index_col=0)
+
+     # 4. Fall back to legacy formats
+     for name in ["l1000_compound_profiles.parquet", "L1000_landmark_LFC.csv",
+                  "l1000_landmark_only.parquet", "l1000_expression_matrix.parquet",
+                  "l1000_landmark_only.csv"]:
+         path = _find_file(name, subdirs=["", "lincs", "l1000"])
+         if path:
+             return _read_tabular(path, index_col=0)
+
+     raise FileNotFoundError(
+         "L1000 data not found. "
+         "Set: ct config set data.l1000 /path/to/file"
+     )
+
+
+ @lru_cache(maxsize=1)
+ def load_prism() -> pd.DataFrame:
+     """Load PRISM cell viability data."""
+     _prism_files = ["prism_LFC_COLLAPSED.csv", "prism_LFC_COLLAPSED.parquet"]
+     cfg = Config.load()
+     explicit = cfg.get("data.prism")
+     if explicit:
+         p = _resolve_path(Path(explicit), _prism_files)
+         if p:
+             return _read_tabular(p)
+
+     for name in _prism_files:
+         path = _find_file(name)
+         if path:
+             return _read_tabular(path)
+
+     raise FileNotFoundError(
+         "PRISM data not found. "
+         "Run: ct data pull prism\n"
+         "Or set: ct config set data.prism /path/to/file"
+     )
+
+
+ def load_mutations() -> pd.DataFrame:
+     """Load DepMap somatic mutation data."""
+     path = _find_file("OmicsSomaticMutationsMatrixDamaging.csv", subdirs=["", "depmap"])
+     if path is None:
+         raise FileNotFoundError(
+             "Mutation data not found. "
+             "Run: ct data pull depmap"
+         )
+
+     df = _read_tabular(path)
+     meta_cols = ['Unnamed: 0', 'SequencingID', 'ModelConditionID',
+                  'IsDefaultEntryForModel', 'IsDefaultEntryForMC']
+
+     if 'IsDefaultEntryForModel' in df.columns:
+         df = df[df['IsDefaultEntryForModel'] == 'Yes']
+
+     if 'ModelID' in df.columns:
+         df = df.set_index('ModelID')
+     elif 'Unnamed: 0' in df.columns and df['Unnamed: 0'].astype(str).str.startswith('ACH-').any():
+         df = df.set_index('Unnamed: 0')
+         df.index.name = 'ModelID'
+     df = df.drop(columns=[c for c in meta_cols if c in df.columns], errors='ignore')
+     df.columns = [c.split(' (')[0] for c in df.columns]
+     return df
+
+
+ def load_msigdb(collection: str = "h") -> dict:
+     """Load MSigDB gene sets."""
+     # Try both naming patterns: "h.all.v2024.1.Hs.json" and "c2.cp.kegg_legacy.v2024.1.Hs.json"
+     for pattern in [f"{collection}.all.v2024.1.Hs.json", f"{collection}.v2024.1.Hs.json"]:
+         path = _find_file(pattern, subdirs=["", "msigdb"])
+         if path and path.exists():
+             with open(path) as f:
+                 return json.load(f)
+
+     raise FileNotFoundError("MSigDB data not found. Run: ct data pull msigdb")
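
A minimal sketch of combining two loaders, assuming the DepMap files are in place (`ct data pull depmap`); `ModelID` is the row index produced by `load_crispr`, while `OncotreeLineage` is an assumed Model.csv column name, so adjust to the columns actually shipped in your DepMap release:

    import pandas as pd
    from ct.data.loaders import load_crispr, load_model_metadata

    effects = load_crispr()                            # models x genes, symbols cleaned
    models = load_model_metadata().set_index("ModelID")

    # Ten most TP53-dependent models, labeled with lineage (assumed column).
    top = effects["TP53"].sort_values().head(10)
    lineage = models["OncotreeLineage"].reindex(top.index)
    print(pd.concat({"gene_effect": top, "lineage": lineage}, axis=1))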
ct/kb/__init__.py ADDED
@@ -0,0 +1,5 @@
+ """Knowledge substrate package for ct."""
+
+ from ct.kb.substrate import KnowledgeSubstrate
+
+ __all__ = ["KnowledgeSubstrate"]
ct/kb/benchmarks.py ADDED
@@ -0,0 +1,147 @@
+ """
+ Domain benchmark harness and release gating.
+ """
+
+ from __future__ import annotations
+
+ from dataclasses import asdict, dataclass
+ import json
+ from pathlib import Path
+ from typing import Any
+
+ try:
+     from ct.agent.quality import evaluate_synthesis_quality
+ except ImportError:
+     evaluate_synthesis_quality = None
+
+
+ @dataclass
+ class BenchmarkCase:
+     name: str
+     domain: str
+     synthesis: str
+     completed_step_ids: list[int]
+     expect_pass: bool = True
+
+
+ @dataclass
+ class BenchmarkResult:
+     name: str
+     domain: str
+     passed: bool
+     expected_pass: bool
+     issues: list[str]
+
+
+ class BenchmarkSuite:
+     """Runs deterministic benchmark cases for synthesis quality gates."""
+
+     def __init__(self, cases: list[BenchmarkCase]):
+         self.cases = cases
+
+     @classmethod
+     def load(cls, path: Path | None = None) -> "BenchmarkSuite":
+         source = path or (Path.cwd() / "configs" / "pharma_benchmarks.json")
+         if source.exists():
+             data = json.loads(source.read_text(encoding="utf-8"))
+             raw_cases = data.get("cases", [])
+             cases = [BenchmarkCase(**item) for item in raw_cases]
+             if cases:
+                 return cls(cases)
+         return cls(default_cases())
+
+     def run(self) -> dict[str, Any]:
+         if evaluate_synthesis_quality is None:
+             raise RuntimeError("ct.agent.quality is not importable; cannot run benchmarks")
+         results: list[BenchmarkResult] = []
+         for case in self.cases:
+             quality = evaluate_synthesis_quality(
+                 case.synthesis,
+                 completed_step_ids=set(case.completed_step_ids),
+                 require_key_evidence=True,
+                 min_next_steps=2,
+                 max_next_steps=3,
+             )
+             results.append(
+                 BenchmarkResult(
+                     name=case.name,
+                     domain=case.domain,
+                     passed=quality.ok,
+                     expected_pass=case.expect_pass,
+                     issues=quality.issues,
+                 )
+             )
+
+         expected_correct = sum(1 for r in results if r.passed == r.expected_pass)
+         total = len(results)
+         pass_rate = expected_correct / max(total, 1)
+         return {
+             "total_cases": total,
+             "expected_behavior_matches": expected_correct,
+             "pass_rate": round(pass_rate, 4),
+             "domain_scores": self._domain_scores(results),
+             "results": [asdict(r) for r in results],
+         }
+
+     @staticmethod
+     def _domain_scores(results: list[BenchmarkResult]) -> dict[str, float]:
+         buckets: dict[str, list[bool]] = {}
+         for result in results:
+             buckets.setdefault(result.domain, []).append(result.passed == result.expected_pass)
+         return {
+             domain: round(sum(1 for v in vals if v) / max(len(vals), 1), 4)
+             for domain, vals in buckets.items()
+         }
+
+     @staticmethod
+     def gate(summary: dict[str, Any], *, min_pass_rate: float = 0.9) -> dict[str, Any]:
+         actual = float(summary.get("pass_rate", 0.0))
+         ok = actual >= min_pass_rate
+         return {
+             "ok": ok,
+             "min_pass_rate": min_pass_rate,
+             "actual_pass_rate": actual,
+             "message": (
+                 f"Benchmark gate passed ({actual:.2%} >= {min_pass_rate:.2%})"
+                 if ok
+                 else f"Benchmark gate failed ({actual:.2%} < {min_pass_rate:.2%})"
+             ),
+         }
+
+
+ def default_cases() -> list[BenchmarkCase]:
+     """Fallback deterministic cases when the benchmark file is absent."""
+     return [
+         BenchmarkCase(
+             name="target_validation_grounded",
+             domain="target_validation",
+             synthesis=(
+                 "## Answer\nSignal supports target progression.\n\n"
+                 "## Key Evidence\n- Genetic support observed [step:1]\n"
+                 "- Expression concordance supports mechanism [step:2]\n\n"
+                 "## Confidence & Caveats\n- Moderate confidence.\n\n"
+                 "## Suggested Next Steps\n"
+                 "1. Run genomics.coloc for the top locus.\n"
+                 "2. Run target.disease_association for indication prioritization.\n"
+             ),
+             completed_step_ids=[1, 2],
+             expect_pass=True,
+         ),
+         BenchmarkCase(
+             name="moa_missing_citation",
+             domain="moa_inference",
+             synthesis=(
+                 "## Answer\nMechanism inferred.\n\n"
+                 "## Key Evidence\n- Strong MOA pattern without citation\n\n"
+                 "## Confidence & Caveats\n- Preliminary.\n\n"
+                 "## Suggested Next Steps\n"
+                 "1. Run expression.pathway_enrichment for the top signature.\n"
+                 "2. Run repurposing.cmap_query on the differential profile.\n"
+             ),
+             completed_step_ids=[1, 2],
+             expect_pass=False,
+         ),
+     ]
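
A minimal release-gate sketch wiring the suite into a CI step, assuming `ct.agent.quality` is importable in the environment:

    from ct.kb.benchmarks import BenchmarkSuite

    suite = BenchmarkSuite.load()      # configs/pharma_benchmarks.json or built-in defaults
    summary = suite.run()
    gate = BenchmarkSuite.gate(summary, min_pass_rate=0.9)
    print(gate["message"])
    if not gate["ok"]:
        raise SystemExit(1)            # fail the build on a gate miss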