celltype-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. celltype_cli-0.1.0.dist-info/METADATA +267 -0
  2. celltype_cli-0.1.0.dist-info/RECORD +89 -0
  3. celltype_cli-0.1.0.dist-info/WHEEL +4 -0
  4. celltype_cli-0.1.0.dist-info/entry_points.txt +2 -0
  5. celltype_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
  6. ct/__init__.py +3 -0
  7. ct/agent/__init__.py +0 -0
  8. ct/agent/case_studies.py +426 -0
  9. ct/agent/config.py +523 -0
  10. ct/agent/doctor.py +544 -0
  11. ct/agent/knowledge.py +523 -0
  12. ct/agent/loop.py +99 -0
  13. ct/agent/mcp_server.py +478 -0
  14. ct/agent/orchestrator.py +733 -0
  15. ct/agent/runner.py +656 -0
  16. ct/agent/sandbox.py +481 -0
  17. ct/agent/session.py +145 -0
  18. ct/agent/system_prompt.py +186 -0
  19. ct/agent/trace_store.py +228 -0
  20. ct/agent/trajectory.py +169 -0
  21. ct/agent/types.py +182 -0
  22. ct/agent/workflows.py +462 -0
  23. ct/api/__init__.py +1 -0
  24. ct/api/app.py +211 -0
  25. ct/api/config.py +120 -0
  26. ct/api/engine.py +124 -0
  27. ct/cli.py +1448 -0
  28. ct/data/__init__.py +0 -0
  29. ct/data/compute_providers.json +59 -0
  30. ct/data/cro_database.json +395 -0
  31. ct/data/downloader.py +238 -0
  32. ct/data/loaders.py +252 -0
  33. ct/kb/__init__.py +5 -0
  34. ct/kb/benchmarks.py +147 -0
  35. ct/kb/governance.py +106 -0
  36. ct/kb/ingest.py +415 -0
  37. ct/kb/reasoning.py +129 -0
  38. ct/kb/schema_monitor.py +162 -0
  39. ct/kb/substrate.py +387 -0
  40. ct/models/__init__.py +0 -0
  41. ct/models/llm.py +370 -0
  42. ct/tools/__init__.py +195 -0
  43. ct/tools/_compound_resolver.py +297 -0
  44. ct/tools/biomarker.py +368 -0
  45. ct/tools/cellxgene.py +282 -0
  46. ct/tools/chemistry.py +1371 -0
  47. ct/tools/claude.py +390 -0
  48. ct/tools/clinical.py +1153 -0
  49. ct/tools/clue.py +249 -0
  50. ct/tools/code.py +1069 -0
  51. ct/tools/combination.py +397 -0
  52. ct/tools/compute.py +402 -0
  53. ct/tools/cro.py +413 -0
  54. ct/tools/data_api.py +2114 -0
  55. ct/tools/design.py +295 -0
  56. ct/tools/dna.py +575 -0
  57. ct/tools/experiment.py +604 -0
  58. ct/tools/expression.py +655 -0
  59. ct/tools/files.py +957 -0
  60. ct/tools/genomics.py +1387 -0
  61. ct/tools/http_client.py +146 -0
  62. ct/tools/imaging.py +319 -0
  63. ct/tools/intel.py +223 -0
  64. ct/tools/literature.py +743 -0
  65. ct/tools/network.py +422 -0
  66. ct/tools/notification.py +111 -0
  67. ct/tools/omics.py +3330 -0
  68. ct/tools/ops.py +1230 -0
  69. ct/tools/parity.py +649 -0
  70. ct/tools/pk.py +245 -0
  71. ct/tools/protein.py +678 -0
  72. ct/tools/regulatory.py +643 -0
  73. ct/tools/remote_data.py +179 -0
  74. ct/tools/report.py +181 -0
  75. ct/tools/repurposing.py +376 -0
  76. ct/tools/safety.py +1280 -0
  77. ct/tools/shell.py +178 -0
  78. ct/tools/singlecell.py +533 -0
  79. ct/tools/statistics.py +552 -0
  80. ct/tools/structure.py +882 -0
  81. ct/tools/target.py +901 -0
  82. ct/tools/translational.py +123 -0
  83. ct/tools/viability.py +218 -0
  84. ct/ui/__init__.py +0 -0
  85. ct/ui/markdown.py +31 -0
  86. ct/ui/status.py +258 -0
  87. ct/ui/suggestions.py +567 -0
  88. ct/ui/terminal.py +1456 -0
  89. ct/ui/traces.py +112 -0
ct/agent/types.py ADDED
@@ -0,0 +1,182 @@
1
+ """
2
+ Core data types for the agent pipeline.
3
+
4
+ Defines Plan, Step, Clarification, and ExecutionResult — the data structures
5
+ shared across the planner, executor, runner, and UI layers.
6
+ """
7
+
8
+ from dataclasses import dataclass, field
9
+ from typing import Optional
10
+
11
+
12
+ # ---------------------------------------------------------------------------
13
+ # Plan & Step
14
+ # ---------------------------------------------------------------------------
15
+
16
@dataclass
class Step:
    """One unit of work inside a research plan.

    Attributes:
        id: Identifier of the step; other steps refer to it in ``depends_on``.
        description: Free-text description of the step.
        tool: Name of the tool associated with the step.
        tool_args: Arguments stored for that tool (empty dict by default).
        depends_on: Ids of the steps this step depends on.
        status: One of "pending", "running", "completed" or "failed".
        result: Result payload of the step; ``None`` until one is stored.
    """

    id: int
    description: str = ""
    tool: str = ""
    # Mutable defaults go through default_factory so instances never share them.
    tool_args: dict = field(default_factory=dict)
    depends_on: list[int] = field(default_factory=list)
    status: str = "pending"  # pending | running | completed | failed
    result: Optional[dict] = None
26
+
27
+
28
@dataclass
class Clarification:
    """Request for more information, raised by the planner before it can plan.

    Attributes:
        question: The question to put to the user.
        missing: Names of the pieces of information that are missing.
        suggestions: Suggested answers or follow-ups to offer the user.
    """

    question: str
    # default_factory gives every instance its own list.
    missing: list[str] = field(default_factory=list)
    suggestions: list[str] = field(default_factory=list)
34
+
35
+
36
@dataclass
class Plan:
    """A structured research plan: the originating query plus its steps.

    Attributes:
        query: The query the plan was built for.
        steps: The steps of the plan, in listing order.
        context: Extra context stored alongside the plan.
    """

    query: str
    steps: list[Step] = field(default_factory=list)
    context: dict = field(default_factory=dict)

    def pending_steps(self) -> list[Step]:
        """Return every step whose status is "pending", in plan order."""
        return [s for s in self.steps if s.status == "pending"]

    def ready_steps(self) -> list[Step]:
        """Return the pending steps whose dependencies are all completed.

        Only steps with status "completed" satisfy a dependency, so a pending
        step that depends on a failed (or unknown) step id is never returned.
        """
        completed_ids = {s.id for s in self.steps if s.status == "completed"}
        return [
            s for s in self.steps
            if s.status == "pending" and all(d in completed_ids for d in s.depends_on)
        ]

    def is_complete(self) -> bool:
        """Return True when every step is either "completed" or "failed".

        A plan with no steps counts as complete. A pending step that can never
        become ready (see ``ready_steps``) keeps this False.
        """
        return all(s.status in ("completed", "failed") for s in self.steps)

    def summary(self) -> str:
        """Return a plain-text listing of the plan, one line per step.

        Each line shows a status icon, the step id, its description, its tool
        and, when present, the ids it depends on.
        """
        # Built once per call instead of once per step; unknown statuses map to "?".
        status_icon = {"pending": " ", "running": ">", "completed": "+", "failed": "!"}
        lines = [f"Plan: {self.query}", ""]
        for s in self.steps:
            icon = status_icon.get(s.status, "?")
            deps = f" (after {s.depends_on})" if s.depends_on else ""
            lines.append(f" [{icon}] {s.id}. {s.description} [{s.tool}]{deps}")
        return "\n".join(lines)
65
+
66
+
67
+ # ---------------------------------------------------------------------------
68
+ # ExecutionResult
69
+ # ---------------------------------------------------------------------------
70
+
71
@dataclass
class ExecutionResult:
    """Result of executing a complete research plan.

    Attributes:
        plan: The executed plan; its steps carry status and result.
        summary: Summary text placed at the top of the markdown report.
        raw_results: Raw result data kept alongside the plan.
        duration_s: Duration in seconds, shown in the report subtitle.
        iterations: Iteration count (defaults to 1).
        metadata: Optional provenance/quality values. When non-empty, the
            report gains a metadata header and a quality scorecard.
    """

    plan: Plan
    summary: str = ""
    raw_results: dict = field(default_factory=dict)
    duration_s: float = 0.0
    iterations: int = 1
    metadata: dict = field(default_factory=dict)

    @staticmethod
    def _format_seconds(value: object) -> str:
        """Format *value* as seconds with one decimal, e.g. ``"3.2s"``.

        Values that do not support the ``.1f`` format (strings, ``None``, ...)
        are returned as their plain ``str()`` form instead of raising.
        """
        try:
            return f"{value:.1f}s"
        except (TypeError, ValueError):
            return str(value)

    def _metadata_header(self) -> list[str]:
        """Build metadata header lines from self.metadata.

        Returns an empty list when there is no metadata. Otherwise the result
        is an HTML comment holding the known keys, followed by a markdown table
        with one row per known key that is present.
        """
        md = self.metadata
        if not md:
            return []

        comment_keys = ("query", "timestamp", "model", "execution_time_s",
                        "tool_success_rate", "profile", "ct_version")
        table_rows = (
            ("timestamp", "Generated"),
            ("model", "Model"),
            ("execution_time_s", "Execution Time"),
            ("tool_success_rate", "Tool Success Rate"),
            ("profile", "Profile"),
            ("ct_version", "ct Version"),
        )

        lines = [
            "<!--",
            " Report Metadata (machine-readable provenance)",
        ]
        lines.extend(f" {key}: {md[key]}" for key in comment_keys if key in md)
        lines.append("-->")
        lines.append("")
        lines.append("| Metadata | Value |")
        lines.append("|----------|-------|")
        for key, label in table_rows:
            if key not in md:
                continue
            value = md[key]
            if key == "execution_time_s":
                # Tolerate a non-numeric value rather than failing the report.
                value = self._format_seconds(value)
            lines.append(f"| {label} | {value} |")
        lines.append("")
        return lines

    def _quality_scorecard(self) -> list[str]:
        """Build quality scorecard footer from plan steps and metadata.

        Lists each step's tool as PASS (status "completed") or FAIL (any other
        status), then the optional confidence tier and grounding result from
        the metadata, then the data sources gathered from completed steps.
        """
        lines = ["## Quality Scorecard", ""]
        lines.append("### Tools Executed")
        lines.append("")
        for step in self.plan.steps:
            status_icon = "PASS" if step.status == "completed" else "FAIL"
            lines.append(f"- `{step.tool}`: {status_icon}")
        lines.append("")

        md = self.metadata
        if md.get("confidence_tier"):
            lines.append(f"**Confidence Tier:** {md['confidence_tier']}")
            lines.append("")
        if md.get("grounding_result"):
            lines.append(f"**Grounding Validation:** {md['grounding_result']}")
            lines.append("")

        data_sources = set()
        for step in self.plan.steps:
            if step.status != "completed" or not step.result:
                continue
            if isinstance(step.result, dict):
                # "data_sources" may be absent or None; a bare string is one
                # source, not a sequence of characters.
                declared = step.result.get("data_sources") or []
                if isinstance(declared, str):
                    declared = [declared]
                # Stringify so unhashable or mixed-type entries cannot break
                # the set or the sort below.
                data_sources.update(str(src) for src in declared)
            tool_name = step.tool
            if "." in tool_name:
                # The prefix of a dotted tool name counts as a source too.
                data_sources.add(tool_name.split(".")[0])
        if data_sources:
            lines.append("### Data Sources Referenced")
            lines.append("")
            for src in sorted(data_sources):
                lines.append(f"- {src}")
            lines.append("")

        return lines

    def to_markdown(self) -> str:
        """Generate a markdown report from the execution results.

        Layout: optional metadata header, title and timing line, the summary,
        one section per step, and (only when metadata is present) the quality
        scorecard. Steps whose status is not "completed" are labelled FAILED.
        """
        lines = []
        lines.extend(self._metadata_header())
        lines.extend([
            f"# Research Report: {self.plan.query}",
            "",
            f"*Generated by celltype-cli in {self.duration_s:.1f}s*",
            "",
            "---",
            "",
            self.summary,
            "",
            "---",
            "",
            "## Detailed Step Results",
            "",
        ])
        for step in self.plan.steps:
            status = "completed" if step.status == "completed" else "FAILED"
            lines.append(f"### Step {step.id}: {step.description} [{status}]")
            lines.append(f"Tool: `{step.tool}`")
            lines.append("")
            if step.result:
                if isinstance(step.result, dict) and "summary" in step.result:
                    # str() keeps the final join from failing on a non-string
                    # summary.
                    lines.append(str(step.result["summary"]))
                else:
                    lines.append(f"```\n{step.result}\n```")
                lines.append("")

        if self.metadata:
            lines.extend(self._quality_scorecard())

        return "\n".join(lines)
ct/agent/workflows.py ADDED
@@ -0,0 +1,462 @@
1
+ """
2
+ Workflow templates for common drug discovery research patterns.
3
+
4
+ These are injected into the planner prompt to guide tool selection and sequencing.
5
+ The planner can follow, adapt, or combine workflows as needed.
6
+ """
7
+
8
+
9
# Registry of workflow templates, keyed by workflow id.
# Each entry is a dict with three keys:
#   "description"     - one-line summary, used in the rendered workflow heading
#   "trigger_phrases" - example phrases; format_workflows_for_llm() shows the first three
#   "steps"           - ordered list of {"tool": <tool name>, "why": <rationale>} dicts
WORKFLOWS = {
    "target_validation": {
        "description": "Validate a potential drug target",
        "trigger_phrases": [
            "validate target", "is this a good target", "target assessment",
            "druggable target", "target validation",
        ],
        "steps": [
            {"tool": "target.coessentiality", "why": "Find functionally related genes and synthetic lethal partners"},
            {"tool": "literature.pubmed_search", "why": "Check published validation data and known biology"},
            {"tool": "clinical.indication_map", "why": "Map to cancer indications with PRISM sensitivity data"},
            {"tool": "biomarker.mutation_sensitivity", "why": "Check if mutations in the target affect drug response"},
            {"tool": "clinical.tcga_stratify", "why": "Assess target expression across cancer types via TCGA"},
        ],
    },
    "compound_safety": {
        "description": "Full safety assessment of a compound",
        "trigger_phrases": [
            "safety assessment", "is this compound safe", "toxicity profile",
            "safety check", "off-target",
        ],
        "steps": [
            {"tool": "safety.antitarget_profile", "why": "Screen for off-target degradation of tumor suppressors and essential proteins"},
            {"tool": "safety.sall4_risk", "why": "Assess teratogenicity risk via SALL4 degradation (IMiD-type liability)"},
            {"tool": "safety.classify", "why": "Get overall SAFE/CAUTION/DANGEROUS verdict combining all signals"},
            {"tool": "viability.tissue_selectivity", "why": "Check for broad vs selective killing across tissue types"},
        ],
    },
    "hit_characterization": {
        "description": "Characterize a hit compound from a screen",
        "trigger_phrases": [
            "characterize compound", "hit characterization", "what does this compound do",
            "compound profile", "hit profiling",
        ],
        "steps": [
            {"tool": "chemistry.descriptors", "why": "Get molecular properties, drug-likeness, and Lipinski profile"},
            {"tool": "viability.dose_response", "why": "Understand potency across cell lines with IC50 estimates"},
            {"tool": "expression.pathway_enrichment", "why": "Identify affected pathways from L1000 transcriptomic signature"},
            {"tool": "safety.classify", "why": "Quick safety classification before advancing the compound"},
            {"tool": "literature.chembl_query", "why": "Find related compounds and known bioactivity in ChEMBL"},
        ],
    },
    "combination_therapy": {
        "description": "Design a combination therapy strategy",
        "trigger_phrases": [
            "combination therapy", "synergy", "combine with", "drug combination",
            "synthetic lethality", "combination strategy",
        ],
        "steps": [
            {"tool": "combination.synergy_predict", "why": "Find synergistic partners via anti-correlated transcriptomic signatures"},
            {"tool": "combination.synthetic_lethality", "why": "Mine DepMap for synthetic lethal gene pairs"},
            {"tool": "combination.metabolic_vulnerability", "why": "Identify exploitable metabolic dependencies for combination"},
            {"tool": "expression.immune_score", "why": "Check IO potential for immuno-oncology combinations"},
        ],
    },
    "clinical_positioning": {
        "description": "Position a compound for clinical development",
        "trigger_phrases": [
            "clinical positioning", "which indication", "patient population",
            "go-to-market", "clinical strategy", "indication selection",
        ],
        "steps": [
            {"tool": "clinical.indication_map", "why": "Map compound sensitivity to cancer indications"},
            {"tool": "clinical.population_size", "why": "Estimate addressable patient populations per indication"},
            {"tool": "biomarker.mutation_sensitivity", "why": "Identify predictive biomarkers for patient selection"},
            {"tool": "clinical.tcga_stratify", "why": "Validate target expression in patient tumors via TCGA"},
            {"tool": "clinical.trial_design_benchmark", "why": "Benchmark endpoint and protocol design patterns in the current trial landscape"},
            {"tool": "literature.pubmed_search", "why": "Review clinical landscape and competitor data"},
        ],
    },
    "cro_engagement": {
        "description": "Design experiment and engage a CRO for outsourced work",
        "trigger_phrases": [
            "find a CRO", "outsource experiment", "CRO inquiry",
            "contract research", "send to CRO",
        ],
        "steps": [
            {"tool": "experiment.design_assay", "why": "Generate a detailed assay protocol"},
            {"tool": "experiment.estimate_timeline", "why": "Get time and cost estimates for the experiment"},
            {"tool": "cro.match_experiment", "why": "Find best-fit CROs for the assay type"},
            {"tool": "cro.draft_inquiry", "why": "Generate a professional inquiry email to the top CRO"},
        ],
    },
    "structure_prediction": {
        "description": "Predict ternary complex structure for a molecular glue",
        "trigger_phrases": [
            "predict structure", "ternary complex", "structural prediction",
            "dock compound", "binding mode",
        ],
        "steps": [
            {"tool": "structure.alphafold_fetch", "why": "Download AlphaFold structure for the target protein"},
            {"tool": "structure.compound_3d", "why": "Generate 3D conformer for the compound"},
            {"tool": "structure.ternary_predict", "why": "Predict ternary complex (E3 + compound + target)"},
            {"tool": "chemistry.descriptors", "why": "Get molecular properties for structure-activity context"},
        ],
    },
    "gpu_computation": {
        "description": "Submit and manage GPU compute jobs",
        "trigger_phrases": [
            "GPU computation", "run Boltz", "run AlphaFold", "submit job",
            "cloud compute", "estimate cost",
        ],
        "steps": [
            {"tool": "compute.estimate_cost", "why": "Get cost and time estimate before committing resources"},
            {"tool": "compute.list_providers", "why": "Review available GPU providers and pricing"},
            {"tool": "compute.submit_job", "why": "Submit the computation job (dry_run by default)"},
        ],
    },
    "custom_analysis": {
        "description": "Custom data exploration or visualization",
        "trigger_phrases": [
            "create a plot", "make a visualization", "custom analysis",
            "heatmap", "volcano plot", "statistical test", "scatter plot",
        ],
        "steps": [
            {"tool": "code.execute", "why": "Generate custom analysis code"},
        ],
    },
    "script_authoring": {
        "description": "Write or update a standalone script/code file in the workspace",
        "trigger_phrases": [
            "write a python script", "save as .py", "create script file",
            "generate a script",
        ],
        "steps": [
            {"tool": "files.create_file", "why": "Create the requested script file with full code content"},
            {"tool": "files.read_file", "why": "Verify file contents after writing"},
        ],
    },
    "report_generation": {
        "description": "Generate and save a research report",
        "trigger_phrases": [
            "write a report", "save report", "export findings",
            "generate report", "create a report",
        ],
        "steps": [
            {"tool": "code.execute", "why": "Run analysis and gather data"},
            {"tool": "files.write_report", "why": "Save formatted report to output directory"},
        ],
    },
    "genetic_evidence": {
        "description": "Build a comprehensive genetic evidence case for a target-disease link",
        "trigger_phrases": [
            "genetic evidence", "causal evidence", "Mendelian randomization",
            "GWAS evidence", "genetic validation", "causal link",
        ],
        "steps": [
            {"tool": "genomics.gwas_lookup", "why": "Find genome-wide significant associations"},
            {"tool": "genomics.eqtl_lookup", "why": "Check expression QTLs across tissues"},
            {"tool": "genomics.mendelian_randomization_lookup", "why": "Assess causal evidence via MR"},
            {"tool": "genomics.coloc", "why": "Test GWAS-eQTL colocalization (shared causal variant)"},
            {"tool": "target.expression_profile", "why": "Understand tissue expression pattern"},
        ],
    },
    "lead_optimization": {
        "description": "Optimize a hit compound into a lead",
        "trigger_phrases": [
            "optimize compound", "lead optimization", "improve potency",
            "SAR", "improve ADMET", "make analogs",
        ],
        "steps": [
            {"tool": "chemistry.sar_analyze", "why": "Understand current SAR landscape"},
            {"tool": "chemistry.mmp_analysis", "why": "Find matched molecular pair transformations that improve properties"},
            {"tool": "chemistry.scaffold_hop", "why": "Generate scaffold-hopped analogs for IP space"},
            {"tool": "design.suggest_modifications", "why": "Get medicinal chemistry modification suggestions"},
            {"tool": "safety.admet_predict", "why": "Predict ADMET for top candidates"},
            {"tool": "chemistry.retrosynthesis", "why": "Check synthetic accessibility of top analogs"},
        ],
    },
    "protein_deep_dive": {
        "description": "Comprehensive protein characterization",
        "trigger_phrases": [
            "tell me about this protein", "protein function", "protein structure",
            "domain architecture", "protein characterization",
        ],
        "steps": [
            {"tool": "protein.function_predict", "why": "Get full UniProt annotation: function, location, GO terms"},
            {"tool": "protein.domain_annotate", "why": "Map domain architecture from InterPro"},
            {"tool": "data_api.pdb_search", "why": "Find experimental structures"},
            {"tool": "target.expression_profile", "why": "Tissue expression from GTEx and HPA"},
            {"tool": "network.ppi_analysis", "why": "Map protein interaction partners"},
        ],
    },
    "drug_repurposing": {
        "description": "Find repurposing opportunities for existing drugs",
        "trigger_phrases": [
            "repurpose", "drug repurposing", "new indication",
            "repositioning", "off-label", "existing drug for",
        ],
        "steps": [
            {"tool": "repurposing.cmap_query", "why": "Match drug expression signature to disease signatures"},
            {"tool": "data_api.drug_info", "why": "Get comprehensive drug profile and known indications"},
            {"tool": "clinical.trial_search", "why": "Check ongoing trials in the new indication"},
            {"tool": "literature.patent_search", "why": "Assess IP landscape for new indication"},
            {"tool": "clinical.competitive_landscape", "why": "Map competitors in the target indication"},
        ],
    },
    "molecular_docking": {
        "description": "Dock compounds into a protein target and analyze binding",
        "trigger_phrases": [
            "dock", "docking", "binding mode", "binding site",
            "virtual screening", "binding affinity",
        ],
        "steps": [
            {"tool": "structure.alphafold_fetch", "why": "Get protein structure (AlphaFold if no experimental)"},
            {"tool": "structure.binding_site", "why": "Identify druggable binding pockets"},
            {"tool": "structure.compound_3d", "why": "Generate 3D ligand conformer"},
            {"tool": "structure.dock", "why": "Dock ligand into binding site"},
            {"tool": "design.suggest_modifications", "why": "Suggest modifications to improve binding"},
        ],
    },
    "resistance_analysis": {
        "description": "Analyze drug resistance mechanisms and predict resistance-associated biomarkers",
        "trigger_phrases": [
            "resistance mechanism", "resistance profile", "drug resistance",
            "resistance mutation", "acquired resistance", "resistance biomarker",
        ],
        "steps": [
            {"tool": "biomarker.mutation_sensitivity", "why": "Identify mutations that alter drug sensitivity — potential resistance drivers"},
            {"tool": "expression.l1000_similarity", "why": "Find compounds with similar transcriptomic signatures to identify resistance-associated expression patterns"},
            {"tool": "expression.pathway_enrichment", "why": "Map resistance-associated expression changes to pathways (e.g., efflux, bypass signaling)"},
            {"tool": "literature.pubmed_search", "why": "Search published literature for known resistance mechanisms to this drug/target class"},
            {"tool": "biomarker.resistance_profile", "why": "Build comprehensive resistance profile combining mutation, expression, and literature data"},
        ],
    },
    "therapeutic_window": {
        "description": "Assess therapeutic window by comparing on-target vs off-target toxicity",
        "trigger_phrases": [
            "therapeutic window", "therapeutic index", "selectivity index",
            "on-target toxicity", "off-target toxicity", "safety margin",
        ],
        "steps": [
            {"tool": "viability.dose_response", "why": "Get dose-response in target cancer cell lines to establish efficacy range"},
            # NOTE(review): viability.tissue_selectivity is listed twice in a row
            # (with different rationales) — confirm the repetition is intentional.
            {"tool": "viability.tissue_selectivity", "why": "Compare sensitivity across tissue types to identify selective vs broadly toxic profiles"},
            {"tool": "viability.tissue_selectivity", "why": "Compare sensitivity across lineages to calculate therapeutic window between sensitive and resistant tissue types"},
            {"tool": "safety.antitarget_profile", "why": "Screen for off-target degradation of tumor suppressors and essential proteins"},
            {"tool": "safety.classify", "why": "Get overall safety classification combining all toxicity signals"},
        ],
    },
    "competitive_landscape": {
        "description": "Map the competitive landscape for a drug target or indication",
        "trigger_phrases": [
            "competitive landscape", "competitor analysis", "market landscape",
            "clinical pipeline", "what drugs target", "who else is developing",
        ],
        "steps": [
            {"tool": "clinical.competitive_landscape", "why": "Aggregate competitive intelligence from Open Targets, ChEMBL, and ClinicalTrials.gov"},
            {"tool": "clinical.trial_search", "why": "Search ClinicalTrials.gov for active and recruiting trials in the indication"},
            {"tool": "literature.pubmed_search", "why": "Find recent publications on clinical results and competitor compounds"},
            {"tool": "clinical.indication_map", "why": "Map compound sensitivity to cancer indications to identify positioning opportunities"},
        ],
    },
    "treatment_landscape": {
        "description": "Describe standard of care treatment and where a drug class fits",
        "trigger_phrases": [
            "standard of care", "treatment sequencing", "treatment regimen",
            "approved therapies", "treatment landscape", "where do",
        ],
        "steps": [
            {"tool": "literature.pubmed_search", "why": "Search for current treatment guidelines and landmark trials"},
            {"tool": "clinical.trial_search", "why": "Search ClinicalTrials.gov for current and recent trials establishing standard of care"},
            {"tool": "clinical.competitive_landscape", "why": "Map all approved and investigational drugs for the indication"},
            {"tool": "data_api.opentargets_search", "why": "Get Open Targets disease-level drug and target landscape"},
        ],
    },
    "mutation_resistance": {
        "description": "Identify clinically observed resistance mutations for a drug or drug class",
        "trigger_phrases": [
            "resistance mutation", "clinically observed mutation", "mutation frequency",
            "IMiD resistance", "drug resistance mutation", "acquired resistance",
        ],
        "steps": [
            {"tool": "literature.pubmed_search", "why": "Search for publications on clinically observed resistance mutations"},
            {"tool": "data_api.opentargets_search", "why": "Get known genetic associations and somatic mutations from Open Targets"},
            {"tool": "biomarker.mutation_sensitivity", "why": "Check if mutations correlate with drug sensitivity in preclinical data"},
            {"tool": "biomarker.resistance_profile", "why": "Build comprehensive resistance profile"},
            {"tool": "literature.openalex_search", "why": "Search for additional clinical mutation data in recent literature"},
        ],
    },
    "protac_design": {
        "description": "Analyze PROTAC linker and component properties",
        "trigger_phrases": [
            "PROTAC", "linker length", "linker composition", "bifunctional degrader",
            "dBET", "MZ1", "ARV", "PROTAC design",
        ],
        "steps": [
            {"tool": "chemistry.pubchem_lookup", "why": "Look up PROTAC structures and molecular properties"},
            {"tool": "chemistry.descriptors", "why": "Calculate molecular descriptors including MW, logP, TPSA for PROTACs"},
            {"tool": "literature.chembl_query", "why": "Find ChEMBL bioactivity data for PROTACs"},
            {"tool": "literature.pubmed_search", "why": "Search for PROTAC SAR and linkerology publications"},
        ],
    },
    "compound_comparison": {
        "description": "Compare two or more compounds on activity and selectivity",
        "trigger_phrases": [
            "compare compounds", "versus", "differential sensitivity",
            "compare selectivity", "compare potency", "which is more potent",
        ],
        "steps": [
            {"tool": "chemistry.pubchem_lookup", "why": "Look up each compound separately to get structures and properties"},
            {"tool": "viability.dose_response", "why": "Get dose-response for first compound (run separately for each)"},
            {"tool": "viability.tissue_selectivity", "why": "Get tissue selectivity for first compound"},
            {"tool": "literature.pubmed_search", "why": "Search published head-to-head comparisons"},
            {"tool": "literature.chembl_query", "why": "Get bioactivity data from ChEMBL for comparison"},
        ],
    },
    "patient_population": {
        "description": "Estimate addressable patient population for a drug concept",
        "trigger_phrases": [
            "patient population", "addressable population", "market sizing",
            "how many patients", "incidence", "prevalence",
        ],
        "steps": [
            {"tool": "clinical.population_size", "why": "Get SEER incidence data for the indication"},
            {"tool": "clinical.trial_search", "why": "Check current trials for treatment rates and eligible populations"},
            {"tool": "clinical.competitive_landscape", "why": "Understand competitive landscape and unmet need"},
            {"tool": "literature.pubmed_search", "why": "Find epidemiology data and treatment utilization rates"},
        ],
    },
    "omics_scrnaseq_analysis": {
        "description": "Analyze single-cell RNA-seq data for a gene or disease",
        "trigger_phrases": [
            "single-cell analysis", "scRNA-seq", "analyze single-cell",
            "single cell RNA", "cell type composition",
        ],
        "steps": [
            {"tool": "omics.geo_search", "why": "Search GEO for relevant scRNA-seq datasets"},
            {"tool": "omics.cellxgene_search", "why": "Search CELLxGENE for curated single-cell datasets"},
            {"tool": "omics.geo_fetch", "why": "Download the most relevant dataset"},
            {"tool": "omics.dataset_info", "why": "Inspect dataset structure and metadata"},
            {"tool": "singlecell.cluster", "why": "Cluster cells and identify populations"},
            {"tool": "singlecell.cell_type_annotate", "why": "Annotate cell types using marker genes"},
            {"tool": "expression.pathway_enrichment", "why": "Identify enriched pathways in cell populations"},
        ],
    },
    "omics_bulk_analysis": {
        "description": "Analyze bulk RNA-seq or expression data",
        "trigger_phrases": [
            "bulk RNA-seq", "bulk expression", "differential expression from GEO",
            "analyze expression data", "gene expression dataset",
        ],
        "steps": [
            {"tool": "omics.geo_search", "why": "Search GEO for relevant expression datasets"},
            {"tool": "omics.geo_fetch", "why": "Download the expression matrix"},
            {"tool": "omics.dataset_info", "why": "Inspect dataset structure and dimensions"},
            {"tool": "omics.deseq2", "why": "Run DESeq2 with explicit sample metadata (condition labels) for robust count-based differential expression"},
            {"tool": "expression.pathway_enrichment", "why": "Identify enriched pathways from DEGs"},
        ],
    },
    "omics_data_discovery": {
        "description": "Find and evaluate public datasets for a research question",
        "trigger_phrases": [
            "find dataset", "find public data", "search for datasets",
            "available data", "download data", "data discovery",
        ],
        "steps": [
            {"tool": "omics.geo_search", "why": "Search NCBI GEO for relevant datasets"},
            {"tool": "omics.cellxgene_search", "why": "Search CELLxGENE for curated single-cell data"},
            {"tool": "omics.tcga_search", "why": "Search TCGA/GDC for cancer genomics data"},
            {"tool": "omics.dataset_info", "why": "Inspect and summarize the top dataset"},
        ],
    },
    "omics_methylation_analysis": {
        "description": "Analyze DNA methylation data for differential methylation",
        "trigger_phrases": [
            "methylation analysis", "differential methylation", "DNA methylation",
            "CpG methylation", "epigenetic analysis",
        ],
        "steps": [
            {"tool": "omics.geo_search", "why": "Search GEO for methylation datasets"},
            {"tool": "omics.geo_fetch", "why": "Download methylation beta-value matrix"},
            {"tool": "omics.dataset_info", "why": "Inspect dataset structure"},
            {"tool": "omics.methylation_profile", "why": "Summarize global methylation landscape"},
            {"tool": "omics.methylation_diff", "why": "Identify differentially methylated sites using explicit case/control sample groups"},
            {"tool": "omics.methylation_cluster", "why": "Cluster samples by methylation patterns"},
        ],
    },
    "omics_proteomics_analysis": {
        "description": "Analyze proteomics data for differential protein abundance",
        "trigger_phrases": [
            "proteomics analysis", "differential protein", "protein abundance",
            "mass spectrometry", "TMT proteomics",
        ],
        "steps": [
            {"tool": "omics.dataset_info", "why": "Inspect proteomics data structure"},
            {"tool": "omics.proteomics_diff", "why": "Differential protein abundance analysis with explicit sample grouping"},
            {"tool": "omics.proteomics_enrich", "why": "Pathway enrichment of DE proteins"},
        ],
    },
    "omics_epigenomics_analysis": {
        "description": "Analyze ATAC-seq or ChIP-seq epigenomic data",
        "trigger_phrases": [
            "ATAC-seq analysis", "ChIP-seq analysis", "chromatin accessibility",
            "epigenomic profiling", "open chromatin",
        ],
        "steps": [
            {"tool": "omics.geo_search", "why": "Search GEO for ATAC-seq/ChIP-seq datasets"},
            {"tool": "omics.geo_fetch", "why": "Download peak or count data"},
            {"tool": "omics.atac_peak_annotate", "why": "Annotate peaks by genomic features"},
            {"tool": "omics.chromatin_accessibility", "why": "Differential accessibility between explicit biological groups"},
            {"tool": "omics.chipseq_enrich", "why": "Enrichment analysis of target genes"},
        ],
    },
    "omics_multiomics_integration": {
        "description": "Integrate multiple omics modalities into shared latent space",
        "trigger_phrases": [
            "multi-omics integration", "integrate RNA and ATAC", "MOFA",
            "multiomics", "combine omics modalities",
        ],
        "steps": [
            {"tool": "omics.dataset_info", "why": "Inspect each modality file"},
            {"tool": "omics.multiomics_integrate", "why": "MOFA+ integration into shared latent factors"},
        ],
    },
    "omics_spatial_analysis": {
        "description": "Analyze spatial transcriptomics data",
        "trigger_phrases": [
            "spatial transcriptomics", "Visium", "MERFISH", "spatial gene expression",
            "spatial clustering", "tissue architecture",
        ],
        "steps": [
            {"tool": "omics.cellxgene_search", "why": "Search for spatial datasets"},
            {"tool": "omics.dataset_info", "why": "Inspect spatial data structure"},
            {"tool": "omics.spatial_cluster", "why": "Spatial-aware cell clustering"},
            {"tool": "omics.spatial_autocorrelation", "why": "Identify spatially patterned genes"},
        ],
    },
}
437
+
438
+
439
def format_workflows_for_llm(allowed_tools: set[str] | None = None) -> str:
    """Render the workflow templates as a markdown section for the planner prompt.

    Args:
        allowed_tools: When given, only steps whose tool name is in this set
            are listed, and a workflow left without any step is omitted.
            ``None`` (the default) lists every step of every workflow.

    Returns:
        The rendered section as one newline-joined string. Each workflow gets
        a heading, its first three trigger phrases, and its numbered steps.
    """
    intro = (
        "These are expert-recommended tool sequences for common drug discovery tasks. "
        "You may follow, adapt, or combine them as appropriate for the query."
    )
    out = ["\n# Recommended Workflows", "", intro, ""]

    for name, spec in WORKFLOWS.items():
        steps = spec["steps"]
        if allowed_tools is not None:
            steps = [entry for entry in steps if entry["tool"] in allowed_tools]
            if not steps:
                # Nothing usable remains in this workflow; leave it out.
                continue

        out.append(f"## {name}: {spec['description']}")
        quoted = [f'"{phrase}"' for phrase in spec["trigger_phrases"][:3]]
        triggers = ", ".join(quoted)
        out.append(f" Trigger phrases: {triggers}")
        for number, entry in enumerate(steps, start=1):
            out.append(f" {number}. **{entry['tool']}** — {entry['why']}")
        out.append("")

    return "\n".join(out)
ct/api/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """ct Data Query API — serves filtered queries against large datasets via DuckDB."""