celltype-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. celltype_cli-0.1.0.dist-info/METADATA +267 -0
  2. celltype_cli-0.1.0.dist-info/RECORD +89 -0
  3. celltype_cli-0.1.0.dist-info/WHEEL +4 -0
  4. celltype_cli-0.1.0.dist-info/entry_points.txt +2 -0
  5. celltype_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
  6. ct/__init__.py +3 -0
  7. ct/agent/__init__.py +0 -0
  8. ct/agent/case_studies.py +426 -0
  9. ct/agent/config.py +523 -0
  10. ct/agent/doctor.py +544 -0
  11. ct/agent/knowledge.py +523 -0
  12. ct/agent/loop.py +99 -0
  13. ct/agent/mcp_server.py +478 -0
  14. ct/agent/orchestrator.py +733 -0
  15. ct/agent/runner.py +656 -0
  16. ct/agent/sandbox.py +481 -0
  17. ct/agent/session.py +145 -0
  18. ct/agent/system_prompt.py +186 -0
  19. ct/agent/trace_store.py +228 -0
  20. ct/agent/trajectory.py +169 -0
  21. ct/agent/types.py +182 -0
  22. ct/agent/workflows.py +462 -0
  23. ct/api/__init__.py +1 -0
  24. ct/api/app.py +211 -0
  25. ct/api/config.py +120 -0
  26. ct/api/engine.py +124 -0
  27. ct/cli.py +1448 -0
  28. ct/data/__init__.py +0 -0
  29. ct/data/compute_providers.json +59 -0
  30. ct/data/cro_database.json +395 -0
  31. ct/data/downloader.py +238 -0
  32. ct/data/loaders.py +252 -0
  33. ct/kb/__init__.py +5 -0
  34. ct/kb/benchmarks.py +147 -0
  35. ct/kb/governance.py +106 -0
  36. ct/kb/ingest.py +415 -0
  37. ct/kb/reasoning.py +129 -0
  38. ct/kb/schema_monitor.py +162 -0
  39. ct/kb/substrate.py +387 -0
  40. ct/models/__init__.py +0 -0
  41. ct/models/llm.py +370 -0
  42. ct/tools/__init__.py +195 -0
  43. ct/tools/_compound_resolver.py +297 -0
  44. ct/tools/biomarker.py +368 -0
  45. ct/tools/cellxgene.py +282 -0
  46. ct/tools/chemistry.py +1371 -0
  47. ct/tools/claude.py +390 -0
  48. ct/tools/clinical.py +1153 -0
  49. ct/tools/clue.py +249 -0
  50. ct/tools/code.py +1069 -0
  51. ct/tools/combination.py +397 -0
  52. ct/tools/compute.py +402 -0
  53. ct/tools/cro.py +413 -0
  54. ct/tools/data_api.py +2114 -0
  55. ct/tools/design.py +295 -0
  56. ct/tools/dna.py +575 -0
  57. ct/tools/experiment.py +604 -0
  58. ct/tools/expression.py +655 -0
  59. ct/tools/files.py +957 -0
  60. ct/tools/genomics.py +1387 -0
  61. ct/tools/http_client.py +146 -0
  62. ct/tools/imaging.py +319 -0
  63. ct/tools/intel.py +223 -0
  64. ct/tools/literature.py +743 -0
  65. ct/tools/network.py +422 -0
  66. ct/tools/notification.py +111 -0
  67. ct/tools/omics.py +3330 -0
  68. ct/tools/ops.py +1230 -0
  69. ct/tools/parity.py +649 -0
  70. ct/tools/pk.py +245 -0
  71. ct/tools/protein.py +678 -0
  72. ct/tools/regulatory.py +643 -0
  73. ct/tools/remote_data.py +179 -0
  74. ct/tools/report.py +181 -0
  75. ct/tools/repurposing.py +376 -0
  76. ct/tools/safety.py +1280 -0
  77. ct/tools/shell.py +178 -0
  78. ct/tools/singlecell.py +533 -0
  79. ct/tools/statistics.py +552 -0
  80. ct/tools/structure.py +882 -0
  81. ct/tools/target.py +901 -0
  82. ct/tools/translational.py +123 -0
  83. ct/tools/viability.py +218 -0
  84. ct/ui/__init__.py +0 -0
  85. ct/ui/markdown.py +31 -0
  86. ct/ui/status.py +258 -0
  87. ct/ui/suggestions.py +567 -0
  88. ct/ui/terminal.py +1456 -0
  89. ct/ui/traces.py +112 -0
ct/tools/expression.py ADDED
@@ -0,0 +1,655 @@
1
+ """
2
+ Expression analysis tools: L1000 signatures, pathway enrichment, TF activity, immune scoring.
3
+ """
4
+
5
+ import pandas as pd
6
+ import numpy as np
7
+ from ct.tools import registry
8
+
9
+
10
@registry.register(
    name="expression.pathway_enrichment",
    description="Score compounds for pathway activation/suppression using L1000 gene expression signatures",
    category="expression",
    parameters={
        "compound_id": "Compound ID to score (or 'all' for full library)",
        "pathways": "Pathway collection: hallmark, kegg, reactome, go_bp, or custom dict",
    },
    requires_data=["l1000"],
    usage_guide="You want to understand which biological pathways a compound activates or suppresses. Use for mechanism of action investigation and to identify pathway-level effects from transcriptomic data.",
)
def pathway_enrichment(compound_id: str = "all", pathways: str = "hallmark",
                       gene_sets: dict = None, **kwargs) -> dict:
    """Score compounds for pathway enrichment using mean z-score method.

    For each (compound, pathway) pair, the score is the mean of the
    compound's per-gene z-scores over the pathway genes present in the
    L1000 matrix; pathways with fewer than 3 overlapping genes are skipped.

    Args:
        compound_id: Single compound to score, or 'all' for every compound
            in the L1000 matrix.
        pathways: Named collection passed to ``_get_default_gene_sets`` when
            ``gene_sets`` is not supplied (only 'hallmark' has built-in sets;
            other names yield no pathways).
        gene_sets: Optional explicit ``{pathway_name: [gene, ...]}`` mapping
            that overrides the named collection.

    Returns:
        Dict with a human-readable ``summary`` and ``results`` (list of
        per-pair records, or a row-count string when >=1000 pairs). When
        nothing could be scored, also includes ``compounds_not_found`` and
        ``low_coverage_pathways`` diagnostics.
    """
    from ct.data.loaders import load_l1000
    from ct.tools._compound_resolver import resolve_compound

    # Map a user-supplied identifier onto the L1000 ID space before lookup.
    if compound_id != "all":
        compound_id = resolve_compound(compound_id, dataset="l1000")

    l1000 = load_l1000()

    # Z-score normalize per gene (replace zero std with NaN to avoid inf)
    std = l1000.std()
    std = std.replace(0, float("nan"))
    z = (l1000 - l1000.mean()) / std

    # Load or use provided gene sets
    if gene_sets is None:
        gene_sets = _get_default_gene_sets(pathways)

    results = []
    compounds = [compound_id] if compound_id != "all" else z.index.tolist()
    compounds_not_found = []   # requested IDs absent from the L1000 index
    low_coverage_pathways = 0  # pathways skipped for <3 overlapping genes

    for cpd in compounds:
        if cpd not in z.index:
            compounds_not_found.append(cpd)
            continue
        row = z.loc[cpd]

        for pathway_name, genes in gene_sets.items():
            # Restrict to pathway genes actually measured in L1000.
            available = [g for g in genes if g in row.index]
            if len(available) < 3:
                low_coverage_pathways += 1
                continue
            score = row[available].mean()
            results.append({
                "compound": cpd,
                "pathway": pathway_name,
                "score": round(float(score), 4),
                "n_genes": len(available),
                "coverage": round(len(available) / len(genes), 2),
            })

    df = pd.DataFrame(results)

    # Single-compound queries get a top activated/suppressed digest.
    if compound_id != "all" and len(df) > 0:
        top_activated = df.nlargest(5, "score")
        top_suppressed = df.nsmallest(5, "score")
        summary = (
            f"Pathway enrichment for {compound_id}:\n"
            f"Top activated: {', '.join(top_activated['pathway'].tolist())}\n"
            f"Top suppressed: {', '.join(top_suppressed['pathway'].tolist())}"
        )
    else:
        summary = f"Scored {len(df)} compound-pathway pairs"

    # Cap the serialized payload: only materialize records below 1000 rows.
    result = {"summary": summary, "results": df.to_dict("records") if len(df) < 1000 else f"{len(df)} rows"}

    # Add diagnostics for empty results
    if len(df) == 0:
        diag_parts = []
        if compounds_not_found:
            diag_parts.append(
                f"Compound(s) not found in L1000 data: {', '.join(compounds_not_found)}. "
                "L1000 uses BRD IDs (e.g., BRD-K12345678) — try resolving via chemistry.pubchem_lookup first."
            )
        if low_coverage_pathways > 0:
            diag_parts.append(
                f"{low_coverage_pathways} pathways skipped due to <3 genes overlapping with L1000 landmark genes."
            )
        if not diag_parts:
            diag_parts.append("No compound-pathway pairs scored. Check compound ID format.")
        result["summary"] = "No pathway enrichment results. " + " ".join(diag_parts)
        result["compounds_not_found"] = compounds_not_found
        result["low_coverage_pathways"] = low_coverage_pathways

    return result
100
+
101
+
102
@registry.register(
    name="expression.immune_score",
    description="Score compounds for immune pathway activation (IFN-gamma, antigen presentation, IO potential)",
    category="expression",
    parameters={"compound_id": "Compound ID (or 'all')"},
    requires_data=["l1000"],
    usage_guide="You want to assess a compound's immuno-oncology potential — IFN-gamma response, antigen presentation, immune checkpoint effects. Use when evaluating IO combination strategies or immunogenic cell death.",
)
def immune_score(compound_id: str = "all", **kwargs) -> dict:
    """Score a compound (or the full library) against curated immune gene sets.

    Delegates per-pathway scoring to ``pathway_enrichment`` with
    immune-specific gene sets, then — for a single-compound query with
    materialized results — adds composite metrics: an IO-potential score,
    a "hot tumor" signature, and a hot/cold/neutral classification.
    """
    # Curated immune gene sets: IFN-gamma response, antigen presentation,
    # NF-kB, cytotoxicity, checkpoints, and immunogenic cell death (ICD).
    immune_sets = {
        "ifn_gamma": ["STAT1", "IRF1", "GBP1", "GBP2", "CXCL10", "CXCL9", "IDO1", "TAP1",
                      "PSMB9", "PSMB8", "B2M", "HLA-A", "HLA-B", "HLA-C", "HLA-E"],
        "antigen_presentation": ["TAP1", "TAP2", "TAPBP", "B2M", "HLA-A", "HLA-B", "HLA-C",
                                 "HLA-DRA", "HLA-DRB1", "PSMB8", "PSMB9", "CALR", "CANX"],
        "nfkb": ["NFKB1", "NFKB2", "RELA", "RELB", "REL", "NFKBIA", "NFKBIB", "TNFAIP3",
                 "BCL2", "BCL2L1", "XIAP", "BIRC3", "CFLAR", "TRAF1", "TRAF2"],
        "t_cell_cytotoxicity": ["GZMA", "GZMB", "GZMK", "PRF1", "IFNG", "TNF", "FASLG",
                                "CD8A", "CD8B"],
        "immune_checkpoints": ["CD274", "PDCD1LG2", "CTLA4", "HAVCR2", "LAG3", "TIGIT",
                               "CD47", "SIRPA", "CD80", "CD86"],
        "icd": ["CALR", "HMGB1", "ATP", "ANXA1", "HSP90AA1", "HSPA1A", "HSPA1B"],
    }

    result = pathway_enrichment(compound_id=compound_id, gene_sets=immune_sets)

    # Composite IO metrics only apply to a single scored compound whose
    # per-pathway rows were materialized as a list.
    if not isinstance(result["results"], list):
        return result
    scored = pd.DataFrame(result["results"])
    if len(scored) == 0 or compound_id == "all":
        return result

    def _mean_over(pathway_names):
        # Mean score over the given pathways; 0 when none were scored.
        subset = scored[scored["pathway"].isin(pathway_names)]["score"]
        return subset.mean() if len(subset) > 0 else 0

    io_potential = _mean_over(["ifn_gamma", "antigen_presentation", "icd"])
    hot_tumor = _mean_over(["ifn_gamma", "t_cell_cytotoxicity"])

    if io_potential > 0.3:
        classification = "immune_hot"
    elif io_potential < -0.3:
        classification = "immune_cold"
    else:
        classification = "neutral"

    result["io_potential"] = round(float(io_potential), 4)
    result["hot_tumor_signature"] = round(float(hot_tumor), 4)
    result["immune_classification"] = classification
    result["summary"] += (
        f"\nIO potential: {io_potential:.3f} ({result['immune_classification']})"
        f"\nHot tumor signature: {hot_tumor:.3f}"
    )

    return result
153
+
154
+
155
@registry.register(
    name="expression.l1000_similarity",
    description="Find compounds with similar or opposite L1000 transcriptomic signatures",
    category="expression",
    parameters={"compound_id": "Query compound", "mode": "'similar' or 'opposite'", "top_n": "Number of hits"},
    requires_data=["l1000"],
    usage_guide="You want to find compounds with similar mechanisms (mode='similar') or complementary/opposing effects (mode='opposite'). Use for drug repurposing or finding synergy partners.",
)
def l1000_similarity(compound_id: str, mode: str = "similar", top_n: int = 20, **kwargs) -> dict:
    """Rank compounds by cosine similarity of their L1000 signatures.

    mode='similar' returns the most correlated compounds (shared-mechanism
    candidates); mode='opposite' returns the most anti-correlated ones.
    The query compound itself is excluded from the ranking.
    """
    from ct.data.loaders import load_l1000
    from sklearn.metrics.pairwise import cosine_similarity
    from ct.tools._compound_resolver import resolve_compound

    compound_id = resolve_compound(compound_id, dataset="l1000")
    signatures = load_l1000()

    if compound_id not in signatures.index:
        msg = f"Compound {compound_id} not found in L1000 data"
        return {"error": msg, "summary": msg}

    # Cosine similarity of the query signature against every signature.
    query_vector = signatures.loc[compound_id].values.reshape(1, -1)
    similarity = cosine_similarity(query_vector, signatures.values)[0]
    ranking = pd.DataFrame(
        {"compound": signatures.index, "cosine_similarity": similarity}
    )
    # Drop the query itself — it is trivially its own best match.
    ranking = ranking[ranking["compound"] != compound_id]

    if mode == "similar":
        hits = ranking.nlargest(top_n, "cosine_similarity")
    elif mode == "opposite":
        hits = ranking.nsmallest(top_n, "cosine_similarity")
    else:
        msg = f"Unknown mode: {mode}. Use 'similar' or 'opposite'"
        return {"error": msg, "summary": msg}

    return {
        "summary": f"Top {top_n} {mode} compounds to {compound_id}",
        "query": compound_id,
        "mode": mode,
        "hits": hits.to_dict("records"),
    }
192
+
193
+
194
+ def _get_default_gene_sets(collection: str) -> dict:
195
+ """Get default gene sets for pathway enrichment."""
196
+ # Hallmark-lite: key pathways for drug discovery
197
+ if collection == "hallmark":
198
+ return {
199
+ "androgen_response": ["KLK3", "KLK2", "FKBP5", "TMPRSS2", "NKX3-1", "PMEPA1"],
200
+ "ifn_alpha": ["ISG15", "MX1", "MX2", "IFIT1", "IFIT2", "IFIT3", "OAS1", "OAS2"],
201
+ "apoptosis": ["BCL2", "BAX", "BAK1", "BID", "CASP3", "CASP8", "CASP9", "CYCS"],
202
+ "p53_pathway": ["CDKN1A", "MDM2", "BAX", "GADD45A", "SFN", "DDB2", "SESN1"],
203
+ "mtorc1_signaling": ["SLC7A5", "SLC3A2", "DDIT4", "VEGFA", "HK2", "PKM", "LDHA"],
204
+ "unfolded_protein_response": ["HSPA5", "DDIT3", "ATF4", "XBP1", "HERPUD1", "DNAJB9"],
205
+ "nfkb_signaling": ["NFKB1", "NFKB2", "RELA", "NFKBIA", "BCL2", "TNFAIP3"],
206
+ "oxidative_phosphorylation": ["NDUFA1", "SDHA", "UQCRC1", "COX5A", "ATP5F1A"],
207
+ "glycolysis": ["HK2", "PFKM", "ALDOA", "GAPDH", "PKM", "LDHA", "ENO1"],
208
+ "dna_repair": ["BRCA1", "BRCA2", "RAD51", "ATM", "ATR", "CHEK1", "CHEK2"],
209
+ }
210
+
211
+ # Return empty if collection not recognized
212
+ return {}
213
+
214
+
215
+ def _resolve_groups_by_lineage(
216
+ group_a: list, group_b: list, expr: "pd.DataFrame"
217
+ ) -> tuple:
218
+ """Resolve descriptive group labels to L1000 compound IDs.
219
+
220
+ When group labels (e.g. 'multiple_myeloma', 'solid_tumor') don't match
221
+ L1000 index entries, try to map them via DepMap Model.csv lineage info.
222
+ As a fallback, split available compounds into two halves so the analysis
223
+ can still proceed.
224
+ """
225
+ all_ids = list(expr.index)
226
+
227
+ # Try DepMap lineage mapping
228
+ try:
229
+ from ct.data.loaders import load_depmap_model
230
+ models = load_depmap_model()
231
+
232
+ # Build lineage -> set of cell line IDs mapping
233
+ lineage_col = None
234
+ for col in ["OncotreeLineage", "lineage", "Lineage", "primary_disease",
235
+ "PrimaryDisease", "disease"]:
236
+ if col in models.columns:
237
+ lineage_col = col
238
+ break
239
+
240
+ if lineage_col is not None:
241
+ # Normalise lineage values for fuzzy matching
242
+ def _norm(s):
243
+ return str(s).lower().replace(" ", "_").replace("-", "_")
244
+
245
+ lineage_map = {}
246
+ for _, row in models.iterrows():
247
+ lin = _norm(row[lineage_col])
248
+ # Use ModelID or DepMap_ID as identifier
249
+ mid = None
250
+ for id_col in ["ModelID", "DepMap_ID", "stripped_cell_line_name",
251
+ "StrippedCellLineName"]:
252
+ if id_col in models.columns and pd.notna(row.get(id_col)):
253
+ mid = str(row[id_col])
254
+ break
255
+ if mid and mid in expr.index:
256
+ lineage_map.setdefault(lin, []).append(mid)
257
+
258
+ if lineage_map:
259
+ norm_a = [_norm(label) for label in group_a]
260
+ norm_b = [_norm(label) for label in group_b]
261
+
262
+ matched_a = []
263
+ for label in norm_a:
264
+ for lin, ids in lineage_map.items():
265
+ if label in lin or lin in label:
266
+ matched_a.extend(ids)
267
+ matched_b = []
268
+ for label in norm_b:
269
+ for lin, ids in lineage_map.items():
270
+ if label in lin or lin in label:
271
+ matched_b.extend(ids)
272
+
273
+ if len(matched_a) >= 2 and len(matched_b) >= 2:
274
+ return list(set(matched_a)), list(set(matched_b))
275
+ except Exception:
276
+ pass
277
+
278
+ return [], []
279
+
280
+
281
# ---- Marker gene sets for immune cell deconvolution ----
# Canonical lineage marker genes per immune cell type; consumed by
# `deconvolution`, which averages each set's expression to score abundance.
IMMUNE_MARKERS = {
    "T cells": ["CD3D", "CD3E", "CD8A", "CD4"],
    "B cells": ["CD19", "MS4A1", "CD79A"],
    "NK cells": ["NKG7", "GNLY", "KLRD1"],
    "Monocytes": ["CD14", "LYZ", "FCGR3A"],
    "Macrophages": ["CD68", "CD163", "CSF1R"],
    "Dendritic cells": ["ITGAX", "CLEC4C", "CD1C"],
    "Neutrophils": ["FCGR3B", "CSF3R", "CXCR2"],
    "Tregs": ["FOXP3", "IL2RA", "CTLA4"],
}

# ---- Curated TF regulons (TF -> target genes) ----
# Small hand-curated target-gene sets per transcription factor; consumed by
# `tf_activity`, which averages target expression to infer TF activation.
TF_REGULONS = {
    "TP53": ["CDKN1A", "MDM2", "BAX", "BBC3", "PUMA"],
    "MYC": ["ODC1", "LDHA", "CDK4", "NCL"],
    "NFkB": ["NFKBIA", "TNF", "IL6", "CXCL8"],
    "HIF1A": ["VEGFA", "SLC2A1", "LDHA", "PDK1"],
    "STAT3": ["BCL2L1", "MMP9", "VEGFA", "MYC"],
    "E2F": ["CCNE1", "MCM2", "PCNA", "RRM2"],
    "AP1": ["FOS", "JUN", "MMP1", "IL8"],
}
303
+
304
+
305
+ @registry.register(
306
+ name="expression.deconvolution",
307
+ description="Estimate immune cell type composition from bulk gene expression using marker gene-based deconvolution",
308
+ category="expression",
309
+ parameters={
310
+ "gene_expression": "Dict of gene:value pairs (expression levels), OR omit and provide compound_id",
311
+ "compound_id": "Compound ID to pull L1000 signature for (optional, used if gene_expression not provided)",
312
+ },
313
+ requires_data=[],
314
+ usage_guide="You want to estimate the immune cell type composition implied by a gene expression profile. "
315
+ "Useful for understanding immune microenvironment effects of compounds or patient samples. "
316
+ "Provide a gene expression dict directly, or a compound_id to pull from L1000 data.",
317
+ )
318
def deconvolution(gene_expression: dict = None, compound_id: str = None, **kwargs) -> dict:
    """Estimate immune cell type proportions from bulk gene expression using marker genes.

    Uses a simple marker gene averaging approach: for each immune cell type, compute
    the mean expression of its marker genes, then normalize to proportions. This is
    a lightweight alternative to CIBERSORT that requires no license.

    Args:
        gene_expression: Mapping of gene symbol -> expression value. Takes
            precedence over compound_id when both are provided.
        compound_id: Compound whose L1000 signature supplies the expression
            values when gene_expression is not provided.

    Returns:
        Dict with summary, per-cell-type proportions (sorted descending),
        dominant cell type, aggregate immune score, and per-cell-type marker
        details; or an error dict on missing input / unknown compound.
    """
    if gene_expression is None and compound_id is None:
        return {"error": "Provide either gene_expression (dict) or compound_id", "summary": "Provide either gene_expression (dict) or compound_id"}

    # If no explicit expression was given, pull the compound's L1000 signature.
    # Track this so the summary reports the true data source (bug fix: the old
    # label claimed "compound X" even when an explicit dict was supplied and
    # used instead).
    used_l1000 = False
    if gene_expression is None:
        from ct.data.loaders import load_l1000
        from ct.tools._compound_resolver import resolve_compound
        compound_id = resolve_compound(compound_id, dataset="l1000")
        l1000 = load_l1000()
        if compound_id not in l1000.index:
            return {"error": f"Compound {compound_id} not found in L1000 data", "summary": f"Compound {compound_id} not found in L1000 data"}
        gene_expression = l1000.loc[compound_id].to_dict()
        used_l1000 = True

    # Score each cell type by mean expression of the marker genes present in
    # the input; missing marker sets score 0 but are still reported.
    cell_scores = {}
    marker_details = {}
    for cell_type, markers in IMMUNE_MARKERS.items():
        available = [g for g in markers if g in gene_expression]
        if not available:
            cell_scores[cell_type] = 0.0
            marker_details[cell_type] = {"n_markers": 0, "found": [], "mean_expr": 0.0}
            continue
        values = [gene_expression[g] for g in available]
        mean_val = float(np.mean(values))
        cell_scores[cell_type] = max(mean_val, 0.0)  # clamp negatives to 0
        marker_details[cell_type] = {
            "n_markers": len(available),
            "found": available,
            "mean_expr": round(mean_val, 4),
        }

    # Normalize to proportions; fall back to a uniform split when every score
    # is zero (no informative markers in the input).
    total = sum(cell_scores.values())
    if total > 0:
        proportions = {ct: round(v / total, 4) for ct, v in cell_scores.items()}
    else:
        proportions = {ct: round(1.0 / len(cell_scores), 4) for ct in cell_scores}

    # Sort by proportion (descending)
    sorted_props = dict(sorted(proportions.items(), key=lambda x: x[1], reverse=True))
    dominant = next(iter(sorted_props))
    dominant_pct = sorted_props[dominant]

    # Compute aggregate immune score (sum of raw marker means, higher = more immune)
    immune_score = round(sum(max(v, 0) for v in cell_scores.values()), 4)

    # Format proportions for summary
    top3 = list(sorted_props.items())[:3]
    top3_str = ", ".join(f"{ct} {pct:.1%}" for ct, pct in top3)

    source = f"compound {compound_id}" if used_l1000 else "provided expression"
    summary = (
        f"Immune deconvolution ({source}):\n"
        f"Dominant: {dominant} ({dominant_pct:.1%})\n"
        f"Top 3: {top3_str}\n"
        f"Immune score: {immune_score:.2f}"
    )

    return {
        "summary": summary,
        "proportions": sorted_props,
        "dominant_cell_type": dominant,
        "immune_score": immune_score,
        "marker_details": marker_details,
    }
390
+
391
+
392
+ @registry.register(
393
+ name="expression.tf_activity",
394
+ description="Infer transcription factor activity from gene expression signatures using curated regulons",
395
+ category="expression",
396
+ parameters={
397
+ "gene_expression": "Dict of gene:value pairs (expression changes), OR omit and provide compound_id",
398
+ "compound_id": "Compound ID to pull L1000 signature for (optional)",
399
+ },
400
+ requires_data=[],
401
+ usage_guide="You want to infer which transcription factors are activated or suppressed by a compound or "
402
+ "in a condition. Uses curated regulons (TF -> target gene sets) to score TF activity from "
403
+ "expression data. Provide a gene expression dict or compound_id for L1000 lookup.",
404
+ )
405
def tf_activity(gene_expression: dict = None, compound_id: str = None, **kwargs) -> dict:
    """Infer transcription factor activity from expression signatures.

    For each TF in TF_REGULONS, scores activity as the mean expression change
    of its known target genes (regulon). Positive score = TF activated,
    negative = TF suppressed; |score| <= 0.3 is classified as neutral.

    Args:
        gene_expression: Mapping of gene symbol -> expression change. Takes
            precedence over compound_id when both are provided.
        compound_id: Compound whose L1000 signature supplies the expression
            values when gene_expression is not provided.

    Returns:
        Dict with summary, per-TF scores (ranked by |score|),
        activated/suppressed/neutral groupings, and per-TF target details;
        or an error dict on missing input / unknown compound.
    """
    if gene_expression is None and compound_id is None:
        return {"error": "Provide either gene_expression (dict) or compound_id", "summary": "Provide either gene_expression (dict) or compound_id"}

    # If no explicit expression was given, pull the compound's L1000 signature.
    # Track this so the summary reports the true data source (bug fix: the old
    # label claimed "compound X" even when an explicit dict was supplied and
    # used instead).
    used_l1000 = False
    if gene_expression is None:
        from ct.data.loaders import load_l1000
        from ct.tools._compound_resolver import resolve_compound
        compound_id = resolve_compound(compound_id, dataset="l1000")
        l1000 = load_l1000()
        if compound_id not in l1000.index:
            return {"error": f"Compound {compound_id} not found in L1000 data", "summary": f"Compound {compound_id} not found in L1000 data"}
        gene_expression = l1000.loc[compound_id].to_dict()
        used_l1000 = True

    # Score each TF by mean expression of its regulon targets present in input.
    tf_scores = {}
    tf_details = {}
    for tf_name, targets in TF_REGULONS.items():
        available = [g for g in targets if g in gene_expression]
        if not available:
            # No targets measured: record the gap but assign no score.
            tf_details[tf_name] = {"n_targets": 0, "found": [], "score": None}
            continue
        values = [gene_expression[g] for g in available]
        score = float(np.mean(values))
        tf_scores[tf_name] = score
        tf_details[tf_name] = {
            "n_targets": len(available),
            "found": available,
            "score": round(score, 4),
            "target_values": {g: round(gene_expression[g], 4) for g in available},
        }

    if not tf_scores:
        return {
            "summary": "No TF regulon targets found in expression data",
            "tf_scores": {},
            "activated": [],
            "suppressed": [],
        }

    # Rank by absolute activity
    sorted_tfs = sorted(tf_scores.items(), key=lambda x: abs(x[1]), reverse=True)

    # Classify as activated (> threshold) or suppressed (< threshold)
    activation_threshold = 0.3
    activated = [(tf, round(s, 4)) for tf, s in sorted_tfs if s > activation_threshold]
    suppressed = [(tf, round(s, 4)) for tf, s in sorted_tfs if s < -activation_threshold]
    neutral = [(tf, round(s, 4)) for tf, s in sorted_tfs
               if -activation_threshold <= s <= activation_threshold]

    # Build summary
    source = f"compound {compound_id}" if used_l1000 else "provided expression"
    act_str = ", ".join(f"{tf}(+{s:.2f})" for tf, s in activated) if activated else "none"
    sup_str = ", ".join(f"{tf}({s:.2f})" for tf, s in suppressed) if suppressed else "none"

    summary = (
        f"TF activity analysis ({source}):\n"
        f"Activated: {act_str}\n"
        f"Suppressed: {sup_str}\n"
        f"TFs scored: {len(tf_scores)}/{len(TF_REGULONS)}"
    )

    return {
        "summary": summary,
        "tf_scores": dict(sorted_tfs),
        "activated": [{"tf": tf, "score": s} for tf, s in activated],
        "suppressed": [{"tf": tf, "score": s} for tf, s in suppressed],
        "neutral": [{"tf": tf, "score": s} for tf, s in neutral],
        "details": tf_details,
    }
480
+
481
+
482
+ @registry.register(
483
+ name="expression.diff_expression",
484
+ description="Differential expression analysis between two groups of samples using L1000 data",
485
+ category="expression",
486
+ parameters={
487
+ "gene": "Gene symbol to test, or 'all' to test all landmark genes",
488
+ "group_a": "List of compound IDs or cell line names for group A",
489
+ "group_b": "List of compound IDs or cell line names for group B",
490
+ "dataset": "Expression dataset to use (default 'l1000')",
491
+ },
492
+ requires_data=["l1000"],
493
+ usage_guide="You want to compare gene expression between two conditions — e.g. treated vs control, "
494
+ "or two compound classes. Use Mann-Whitney U for robust rank-based testing. "
495
+ "Set gene='all' for genome-wide differential expression with FDR correction.",
496
+ )
497
def diff_expression(
    gene: str = "all",
    group_a: list = None,
    group_b: list = None,
    dataset: str = "l1000",
    **kwargs,
) -> dict:
    """Differential expression between two groups of samples.

    Uses Mann-Whitney U test (rank-based, non-parametric) to compare expression
    of one or all landmark genes between two sample groups. Computes fold change,
    p-value, effect size (Cohen's d), and Benjamini-Hochberg FDR correction when
    testing multiple genes.

    Args:
        gene: Single gene symbol, or 'all' for genome-wide testing.
        group_a: Sample/compound IDs for group A (a single string is accepted).
        group_b: Sample/compound IDs for group B (a single string is accepted).
        dataset: Currently only 'l1000' data is loaded; the value is echoed
            in error messages.

    Returns:
        Dict with summary, counts of nominally/FDR-significant genes, group
        sizes, and per-gene records sorted by p-value; or an error dict on
        invalid input.
    """
    from scipy import stats as scipy_stats

    if group_a is None or group_b is None:
        return {"error": "Both group_a and group_b must be provided as lists of sample IDs", "summary": "Both group_a and group_b must be provided as lists of sample IDs"}
    if not group_a or not group_b:
        return {"error": "Both group_a and group_b must be non-empty lists", "summary": "Both group_a and group_b must be non-empty lists"}

    # Load expression data (rows = samples/compounds, columns = genes)
    from ct.data.loaders import load_l1000
    expr = load_l1000()

    # Normalise inputs to lists. (Empty inputs were already rejected above, so
    # the redundant emptiness re-check that used to follow here was dead code
    # and has been removed.)
    if isinstance(group_a, str):
        group_a = [group_a]
    if isinstance(group_b, str):
        group_b = [group_b]

    # Identify which group samples are present in the data
    available_a = [s for s in group_a if s in expr.index]
    available_b = [s for s in group_b if s in expr.index]

    # If either group has too few direct matches, try resolving descriptive
    # labels (e.g. lineage names) via DepMap lineage mapping.
    if len(available_a) < 2 or len(available_b) < 2:
        resolved_a, resolved_b = _resolve_groups_by_lineage(
            group_a, group_b, expr
        )
        if len(resolved_a) >= 2 and len(resolved_b) >= 2:
            available_a, available_b = resolved_a, resolved_b

    sample_hint = ", ".join(list(expr.index[:5])) + ", ..."
    if len(available_a) < 2:
        return {"error": f"Group A: only {len(available_a)} of {len(group_a)} labels found in data (need >=2). "
                         f"Provide compound IDs matching the L1000 index. Examples: {sample_hint}",
                "summary": f"Group A: only {len(available_a)} of {len(group_a)} samples found in data (need >=2)"}
    if len(available_b) < 2:
        return {"error": f"Group B: only {len(available_b)} of {len(group_b)} labels found in data (need >=2). "
                         f"Provide compound IDs matching the L1000 index. Examples: {sample_hint}",
                "summary": f"Group B: only {len(available_b)} of {len(group_b)} samples found in data (need >=2)"}

    data_a = expr.loc[available_a]
    data_b = expr.loc[available_b]

    # Determine which genes to test
    if gene == "all":
        genes_to_test = list(expr.columns)
    else:
        if gene not in expr.columns:
            return {"error": f"Gene '{gene}' not found in {dataset} expression data", "summary": f"Gene '{gene}' not found in {dataset} expression data"}
        genes_to_test = [gene]

    results = []
    for g in genes_to_test:
        vals_a = data_a[g].dropna()
        vals_b = data_b[g].dropna()

        # Need >=2 observations per group for a meaningful comparison
        if len(vals_a) < 2 or len(vals_b) < 2:
            continue

        mean_a = float(vals_a.mean())
        mean_b = float(vals_b.mean())

        # Fold change (group_a vs group_b): positive means higher in A
        fold_change = mean_a - mean_b  # log-scale data, so difference = log2 FC

        # Mann-Whitney U test; ValueError (e.g. all-identical values) skips
        # the gene rather than aborting the whole analysis
        try:
            stat, pval = scipy_stats.mannwhitneyu(vals_a, vals_b, alternative="two-sided")
        except ValueError:
            continue

        # Cohen's d effect size: mean difference over pooled std
        pooled_std = np.sqrt(
            ((len(vals_a) - 1) * vals_a.std() ** 2 + (len(vals_b) - 1) * vals_b.std() ** 2)
            / (len(vals_a) + len(vals_b) - 2)
        )
        cohens_d = fold_change / pooled_std if pooled_std > 0 else 0.0

        direction = "up_in_A" if fold_change > 0 else "up_in_B" if fold_change < 0 else "unchanged"

        results.append({
            "gene": g,
            "mean_a": round(mean_a, 4),
            "mean_b": round(mean_b, 4),
            "log2_fold_change": round(fold_change, 4),
            "direction": direction,
            "p_value": float(pval),
            "cohens_d": round(float(cohens_d), 4),
            "n_a": len(vals_a),
            "n_b": len(vals_b),
        })

    if not results:
        return {
            "summary": f"No testable genes found between groups (group_a={len(available_a)}, group_b={len(available_b)} samples)",
            "results": [],
        }

    df = pd.DataFrame(results).sort_values("p_value")

    # Benjamini-Hochberg FDR: fdr_i = p_i * n / rank_i, clipped to 1, then
    # made monotone (step-up) so smaller p-values never get a larger FDR
    if len(df) > 1:
        n_tests = len(df)
        ranks = df["p_value"].rank(method="first")
        df["fdr"] = (df["p_value"] * n_tests / ranks).clip(upper=1.0)
        # Ensure monotonicity: work backward from largest rank
        fdr_vals = np.array(df.sort_values("p_value", ascending=False)["fdr"], dtype=float)
        for i in range(1, len(fdr_vals)):
            fdr_vals[i] = min(fdr_vals[i], fdr_vals[i - 1])
        df.loc[df.sort_values("p_value", ascending=False).index, "fdr"] = fdr_vals
        df = df.sort_values("p_value")
    else:
        df["fdr"] = df["p_value"]

    n_sig = int((df["p_value"] < 0.05).sum())
    n_fdr_sig = int((df["fdr"] < 0.05).sum())

    if gene != "all":
        row = df.iloc[0]
        summary = (
            f"Differential expression of {gene}: "
            f"log2FC={row['log2_fold_change']:.3f} ({row['direction']}), "
            f"p={row['p_value']:.2e}, Cohen's d={row['cohens_d']:.3f}"
        )
    else:
        top_genes = df.head(5)
        top_str = ", ".join(
            f"{r['gene']}(FC={r['log2_fold_change']:.2f}, p={r['p_value']:.2e})"
            for _, r in top_genes.iterrows()
        )
        summary = (
            f"Differential expression: {len(results)} genes tested, "
            f"{n_sig} nominally significant (p<0.05), {n_fdr_sig} FDR-significant. "
            f"Top: {top_str}"
        )

    return {
        "summary": summary,
        "n_tested": len(results),
        "n_significant_nominal": n_sig,
        "n_significant_fdr": n_fdr_sig,
        "group_a_size": len(available_a),
        "group_b_size": len(available_b),
        "results": df.to_dict("records"),
    }