celltype-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. celltype_cli-0.1.0.dist-info/METADATA +267 -0
  2. celltype_cli-0.1.0.dist-info/RECORD +89 -0
  3. celltype_cli-0.1.0.dist-info/WHEEL +4 -0
  4. celltype_cli-0.1.0.dist-info/entry_points.txt +2 -0
  5. celltype_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
  6. ct/__init__.py +3 -0
  7. ct/agent/__init__.py +0 -0
  8. ct/agent/case_studies.py +426 -0
  9. ct/agent/config.py +523 -0
  10. ct/agent/doctor.py +544 -0
  11. ct/agent/knowledge.py +523 -0
  12. ct/agent/loop.py +99 -0
  13. ct/agent/mcp_server.py +478 -0
  14. ct/agent/orchestrator.py +733 -0
  15. ct/agent/runner.py +656 -0
  16. ct/agent/sandbox.py +481 -0
  17. ct/agent/session.py +145 -0
  18. ct/agent/system_prompt.py +186 -0
  19. ct/agent/trace_store.py +228 -0
  20. ct/agent/trajectory.py +169 -0
  21. ct/agent/types.py +182 -0
  22. ct/agent/workflows.py +462 -0
  23. ct/api/__init__.py +1 -0
  24. ct/api/app.py +211 -0
  25. ct/api/config.py +120 -0
  26. ct/api/engine.py +124 -0
  27. ct/cli.py +1448 -0
  28. ct/data/__init__.py +0 -0
  29. ct/data/compute_providers.json +59 -0
  30. ct/data/cro_database.json +395 -0
  31. ct/data/downloader.py +238 -0
  32. ct/data/loaders.py +252 -0
  33. ct/kb/__init__.py +5 -0
  34. ct/kb/benchmarks.py +147 -0
  35. ct/kb/governance.py +106 -0
  36. ct/kb/ingest.py +415 -0
  37. ct/kb/reasoning.py +129 -0
  38. ct/kb/schema_monitor.py +162 -0
  39. ct/kb/substrate.py +387 -0
  40. ct/models/__init__.py +0 -0
  41. ct/models/llm.py +370 -0
  42. ct/tools/__init__.py +195 -0
  43. ct/tools/_compound_resolver.py +297 -0
  44. ct/tools/biomarker.py +368 -0
  45. ct/tools/cellxgene.py +282 -0
  46. ct/tools/chemistry.py +1371 -0
  47. ct/tools/claude.py +390 -0
  48. ct/tools/clinical.py +1153 -0
  49. ct/tools/clue.py +249 -0
  50. ct/tools/code.py +1069 -0
  51. ct/tools/combination.py +397 -0
  52. ct/tools/compute.py +402 -0
  53. ct/tools/cro.py +413 -0
  54. ct/tools/data_api.py +2114 -0
  55. ct/tools/design.py +295 -0
  56. ct/tools/dna.py +575 -0
  57. ct/tools/experiment.py +604 -0
  58. ct/tools/expression.py +655 -0
  59. ct/tools/files.py +957 -0
  60. ct/tools/genomics.py +1387 -0
  61. ct/tools/http_client.py +146 -0
  62. ct/tools/imaging.py +319 -0
  63. ct/tools/intel.py +223 -0
  64. ct/tools/literature.py +743 -0
  65. ct/tools/network.py +422 -0
  66. ct/tools/notification.py +111 -0
  67. ct/tools/omics.py +3330 -0
  68. ct/tools/ops.py +1230 -0
  69. ct/tools/parity.py +649 -0
  70. ct/tools/pk.py +245 -0
  71. ct/tools/protein.py +678 -0
  72. ct/tools/regulatory.py +643 -0
  73. ct/tools/remote_data.py +179 -0
  74. ct/tools/report.py +181 -0
  75. ct/tools/repurposing.py +376 -0
  76. ct/tools/safety.py +1280 -0
  77. ct/tools/shell.py +178 -0
  78. ct/tools/singlecell.py +533 -0
  79. ct/tools/statistics.py +552 -0
  80. ct/tools/structure.py +882 -0
  81. ct/tools/target.py +901 -0
  82. ct/tools/translational.py +123 -0
  83. ct/tools/viability.py +218 -0
  84. ct/ui/__init__.py +0 -0
  85. ct/ui/markdown.py +31 -0
  86. ct/ui/status.py +258 -0
  87. ct/ui/suggestions.py +567 -0
  88. ct/ui/terminal.py +1456 -0
  89. ct/ui/traces.py +112 -0
ct/tools/statistics.py ADDED
@@ -0,0 +1,552 @@
1
+ """Statistical analysis tools: survival, dose-response, biomarker panels."""
2
+
3
+ import numpy as np
4
+ from ct.tools import registry
5
+
6
+
7
@registry.register(
    name="statistics.dose_response_fit",
    description="Fit a 4-parameter logistic (Hill equation) dose-response curve and compute IC50",
    category="statistics",
    parameters={
        "doses": "List of dose concentrations (floats)",
        "responses": "List of response values (floats, e.g. viability or inhibition %)",
        "compound_name": "Optional compound name for labeling",
    },
    requires_data=[],
    usage_guide="You have dose-response data and want to fit a curve to compute IC50, Hill slope, "
    "and assess curve quality. Provide matched lists of doses and responses. "
    "Works with any dose-response data (viability, inhibition, binding, etc.).",
)
def dose_response_fit(doses: list = None, responses: list = None,
                      compound_name: str = "unknown", **kwargs) -> dict:
    """Fit a 4-parameter logistic (Hill equation) dose-response curve.

    Hill equation: f(x) = bottom + (top - bottom) / (1 + (IC50/x)^slope)

    Parameters
    ----------
    doses : list of float
        Concentration values (must be positive; non-positive doses are dropped
        because the model divides by x).
    responses : list of float
        Response values (e.g. % viability, % inhibition).
    compound_name : str
        Label for the compound, echoed into the result.

    Returns
    -------
    dict with fitted parameters, IC50, R-squared, and a quality assessment,
    or a dict with "error"/"summary" keys on invalid input or fit failure.
    """
    # Lazy import keeps scipy off the module import path until the tool runs.
    from scipy.optimize import curve_fit  # (removed unused `pearsonr` import)

    # --- Input validation: always return an error dict, never raise, so the
    # tool registry gets a uniform {"error", "summary"} payload. ---
    if doses is None or responses is None:
        return {"error": "Both 'doses' and 'responses' lists are required", "summary": "Both 'doses' and 'responses' lists are required"}
    doses = [float(d) for d in doses]
    responses = [float(r) for r in responses]

    if len(doses) != len(responses):
        return {"error": f"Length mismatch: {len(doses)} doses vs {len(responses)} responses", "summary": f"Length mismatch: {len(doses)} doses vs {len(responses)} responses"}
    if len(doses) < 4:
        # A 4PL model has 4 free parameters; fewer points cannot constrain it.
        return {"error": f"Need at least 4 data points for 4PL fit, got {len(doses)}", "summary": f"Need at least 4 data points for 4PL fit, got {len(doses)}"}
    # Filter out non-positive doses: (ic50/x)**slope is undefined at x <= 0.
    valid = [(d, r) for d, r in zip(doses, responses) if d > 0]
    if len(valid) < 4:
        return {"error": "Need at least 4 positive dose values", "summary": "Need at least 4 positive dose values"}
    doses_arr = np.array([v[0] for v in valid])
    resp_arr = np.array([v[1] for v in valid])

    # 4-parameter logistic (Hill equation).
    def hill(x, bottom, top, ic50, slope):
        return bottom + (top - bottom) / (1.0 + (ic50 / x) ** slope)

    # Initial parameter guesses: asymptotes from the data extremes, IC50 from
    # the median dose, unit Hill slope.
    bottom_guess = float(np.min(resp_arr))
    top_guess = float(np.max(resp_arr))
    ic50_guess = float(np.median(doses_arr))
    slope_guess = 1.0

    try:
        popt, pcov = curve_fit(
            hill, doses_arr, resp_arr,
            p0=[bottom_guess, top_guess, ic50_guess, slope_guess],
            bounds=(
                [-np.inf, -np.inf, 1e-15, 0.01],  # lower bounds (IC50 > 0, slope > 0)
                [np.inf, np.inf, np.inf, 100.0]   # upper bounds
            ),
            maxfev=10000,
        )
        bottom, top, ic50, slope = popt
        # Standard errors from the covariance diagonal; curve_fit returns an
        # inf-filled pcov when the covariance cannot be estimated.
        perr = np.sqrt(np.diag(pcov))

        # Coefficient of determination against the fitted curve.
        predicted = hill(doses_arr, *popt)
        ss_res = np.sum((resp_arr - predicted) ** 2)
        ss_tot = np.sum((resp_arr - np.mean(resp_arr)) ** 2)
        r_squared = 1.0 - ss_res / ss_tot if ss_tot > 0 else 0.0

        # Quality assessment: goodness-of-fit plus IC50 precision.
        # BUGFIX-hardening: perr can be inf/nan for ill-conditioned fits, so
        # HIGH explicitly requires a *finite* relative IC50 standard error
        # (previously relied on inf/nan comparisons evaluating False).
        ic50_rel_se = perr[2] / abs(ic50)
        if r_squared > 0.95 and np.isfinite(ic50_rel_se) and ic50_rel_se < 0.5:
            quality = "HIGH"
            quality_detail = "Excellent fit with tight IC50 confidence"
        elif r_squared > 0.8:
            quality = "MEDIUM"
            quality_detail = "Good fit, IC50 estimate reliable"
        elif r_squared > 0.5:
            quality = "LOW"
            quality_detail = "Marginal fit, IC50 estimate approximate"
        else:
            quality = "POOR"
            quality_detail = "Poor fit, IC50 unreliable"

        # Dynamic range between the fitted asymptotes.
        max_effect = abs(top - bottom)

        summary = (
            f"Dose-response fit for {compound_name}:\n"
            f"IC50 = {ic50:.4g}, Hill slope = {slope:.2f}\n"
            f"R² = {r_squared:.4f}, Quality: {quality}\n"
            f"Bottom = {bottom:.2f}, Top = {top:.2f}, Max effect = {max_effect:.2f}"
        )

        return {
            "summary": summary,
            "compound": compound_name,
            "ic50": round(float(ic50), 6),
            "hill_slope": round(float(slope), 4),
            "bottom": round(float(bottom), 4),
            "top": round(float(top), 4),
            "r_squared": round(float(r_squared), 4),
            "max_effect": round(float(max_effect), 4),
            "parameter_errors": {
                "bottom_se": round(float(perr[0]), 4),
                "top_se": round(float(perr[1]), 4),
                "ic50_se": round(float(perr[2]), 6),
                "slope_se": round(float(perr[3]), 4),
            },
            "quality": quality,
            "quality_detail": quality_detail,
            "n_points": len(doses_arr),
        }

    except RuntimeError as e:
        # curve_fit raises RuntimeError on non-convergence; report the data
        # ranges to help the caller diagnose why.
        return {
            "summary": f"Dose-response fit FAILED for {compound_name}: curve fitting did not converge",
            "error": f"Convergence failure: {str(e)}",
            "compound": compound_name,
            "n_points": len(doses_arr),
            "dose_range": [float(np.min(doses_arr)), float(np.max(doses_arr))],
            "response_range": [float(np.min(resp_arr)), float(np.max(resp_arr))],
        }
    except Exception as e:
        # Catch-all boundary: tool calls must not propagate exceptions.
        return {
            "summary": f"Dose-response fit FAILED for {compound_name}: {str(e)}",
            "error": str(e),
            "compound": compound_name,
        }
147
+
148
+
149
@registry.register(
    name="statistics.survival_analysis",
    description="Perform Kaplan-Meier survival analysis with optional log-rank test for group comparison",
    category="statistics",
    parameters={
        "times": "List of survival/follow-up times",
        "events": "List of event indicators (1=event occurred, 0=censored)",
        "groups": "Optional list of group labels for comparing survival between groups",
    },
    requires_data=[],
    usage_guide="You have time-to-event data and want to estimate survival curves, median survival, "
    "and compare groups. Provide times, event indicators, and optionally group labels. "
    "Use for clinical trial analysis, patient stratification, or biomarker validation.",
)
def survival_analysis(times: list = None, events: list = None,
                      groups: list = None, **kwargs) -> dict:
    """Perform Kaplan-Meier survival analysis with log-rank test.

    Implements the Kaplan-Meier estimator from scratch:
    S(t) = product of (1 - d_i/n_i) for all event times t_i <= t

    If groups are provided, computes separate KM curves and a log-rank test.

    Parameters
    ----------
    times : list of float
        Survival or follow-up times.
    events : list of int
        Event indicators: 1 = event (death/progression), 0 = censored.
    groups : list, optional
        Group labels for stratified analysis (e.g., ["high", "low", "high", ...]).

    Returns
    -------
    dict
        Single-group: {"summary", "kaplan_meier"}. Multi-group: per-group KM
        curves, a "log_rank" dict (chi2, p_value, significant), and a
        simplified "hazard_ratio" (event rate ratio, not a Cox HR). Invalid
        input returns {"error", "summary"} instead of raising.
    """
    # Lazy import: scipy only needed when the tool actually runs.
    from scipy import stats as sp_stats

    # --- Input validation: uniform {"error", "summary"} payloads, no raises. ---
    if times is None or events is None:
        return {"error": "Both 'times' and 'events' lists are required", "summary": "Both 'times' and 'events' lists are required"}
    times = [float(t) for t in times]
    events = [int(e) for e in events]

    if len(times) != len(events):
        return {"error": f"Length mismatch: {len(times)} times vs {len(events)} events", "summary": f"Length mismatch: {len(times)} times vs {len(events)} events"}
    if len(times) < 3:
        return {"error": f"Need at least 3 observations, got {len(times)}", "summary": f"Need at least 3 observations, got {len(times)}"}
    def _kaplan_meier(t_arr, e_arr):
        """Compute KM survival curve from times and events arrays."""
        # Sort observations by time so risk sets shrink monotonically.
        order = np.argsort(t_arr)
        t_sorted = t_arr[order]
        e_sorted = e_arr[order]

        # The KM step function only changes at times where an event occurred
        # (censoring times do not drop the curve, they only shrink the risk set).
        event_times = np.unique(t_sorted[e_sorted == 1])

        # Curve starts at S(0) = 1 with everyone at risk.
        km_times = [0.0]
        km_survival = [1.0]
        n_at_risk = len(t_sorted)
        current_s = 1.0

        for et in event_times:
            # Risk set at time et: subjects whose observed time is >= et.
            n_at_risk = int(np.sum(t_sorted >= et))
            # Number of events exactly at this time (ties handled together).
            d = int(np.sum((t_sorted == et) & (e_sorted == 1)))

            # KM product-limit update: S *= (1 - d/n).
            if n_at_risk > 0:
                current_s *= (1.0 - d / n_at_risk)

            km_times.append(float(et))
            km_survival.append(round(current_s, 6))

        # Median survival: first time the survival curve drops to <= 0.5;
        # stays None ("not reached") if the curve never crosses 0.5.
        median_surv = None
        for t_val, s_val in zip(km_times, km_survival):
            if s_val <= 0.5:
                median_surv = t_val
                break

        return {
            "times": km_times,
            "survival": km_survival,
            "median_survival": median_surv,
            "n_events": int(np.sum(e_sorted)),
            "n_censored": int(np.sum(e_sorted == 0)),
            "n_total": len(t_sorted),
        }

    t_arr = np.array(times)
    e_arr = np.array(events)

    # --- Single-group analysis: just the pooled KM curve. ---
    if groups is None:
        km = _kaplan_meier(t_arr, e_arr)
        median_str = f"{km['median_survival']:.1f}" if km['median_survival'] is not None else "not reached"
        summary = (
            f"Kaplan-Meier survival analysis (n={km['n_total']}):\n"
            f"Events: {km['n_events']}, Censored: {km['n_censored']}\n"
            f"Median survival: {median_str}"
        )
        return {
            "summary": summary,
            "kaplan_meier": km,
        }

    # --- Multi-group analysis: per-group KM curves + log-rank test. ---
    groups = list(groups)
    if len(groups) != len(times):
        return {"error": f"Length mismatch: {len(times)} times vs {len(groups)} groups", "summary": f"Length mismatch: {len(times)} times vs {len(groups)} groups"}
    unique_groups = sorted(set(groups))
    if len(unique_groups) < 2:
        return {"error": f"Need at least 2 groups for comparison, got {len(unique_groups)}", "summary": f"Need at least 2 groups for comparison, got {len(unique_groups)}"}
    group_arr = np.array(groups)
    group_results = {}

    for g in unique_groups:
        mask = group_arr == g
        group_results[str(g)] = _kaplan_meier(t_arr[mask], e_arr[mask])

    # Log-rank test (for 2 groups, generalizable)
    # Implementation: compare observed vs expected events in each group
    # at each unique event time across all groups.
    all_event_times = np.unique(t_arr[e_arr == 1])

    # Accumulate O - E per group and the hypergeometric variance.
    observed_minus_expected = {str(g): 0.0 for g in unique_groups}
    variance_sum = 0.0

    for et in all_event_times:
        # Pooled risk set and pooled event count at this time.
        at_risk_total = int(np.sum(t_arr >= et))
        events_total = int(np.sum((t_arr == et) & (e_arr == 1)))

        if at_risk_total == 0:
            continue

        for g in unique_groups:
            mask = group_arr == g
            at_risk_g = int(np.sum(t_arr[mask] >= et))
            events_g = int(np.sum((t_arr[mask] == et) & (e_arr[mask] == 1)))

            # Expected events in group g under the null hypothesis that the
            # hazard is the same in all groups: E = n_g * d / n.
            expected_g = at_risk_g * events_total / at_risk_total if at_risk_total > 0 else 0
            observed_minus_expected[str(g)] += (events_g - expected_g)

        # Variance contribution (hypergeometric variance of the event count
        # allocated to a group given the pooled margins at this time).
        if at_risk_total > 1:
            for g in unique_groups:
                mask = group_arr == g
                n_g = int(np.sum(t_arr[mask] >= et))
                frac = n_g / at_risk_total
                censored_total = at_risk_total - events_total
                var_contrib = (events_total * censored_total * frac * (1 - frac)) / (at_risk_total - 1)
                # Only accumulate for the first group (2-group test)
                if g == unique_groups[0]:
                    variance_sum += var_contrib

    # Chi-squared statistic (1 df for 2 groups).
    # NOTE(review): for >2 groups this uses only the first group's (O-E)^2/V
    # but tests it against chi2 with k-1 df; the proper k-group log-rank needs
    # the full (O-E) vector and covariance matrix — treat k>2 p-values as
    # approximate. TODO confirm intended scope with the author.
    if variance_sum > 0:
        chi2 = (observed_minus_expected[str(unique_groups[0])] ** 2) / variance_sum
        p_value = float(1.0 - sp_stats.chi2.cdf(chi2, df=len(unique_groups) - 1))
    else:
        # No variance accumulated (e.g. no events): report a null result.
        chi2 = 0.0
        p_value = 1.0

    # Event rate ratio (simplified — not a proper Cox hazard ratio).
    # Computed as (events_1 / total_time_1) / (events_2 / total_time_2).
    hr = None
    hr_str = "N/A"
    if len(unique_groups) == 2:
        g1, g2 = str(unique_groups[0]), str(unique_groups[1])
        r1 = group_results[g1]
        r2 = group_results[g2]
        # max(..., 1e-10) guards division by zero when total follow-up is 0.
        rate1 = r1["n_events"] / max(np.sum(t_arr[group_arr == unique_groups[0]]), 1e-10)
        rate2 = r2["n_events"] / max(np.sum(t_arr[group_arr == unique_groups[1]]), 1e-10)
        if rate2 > 0:
            hr = round(float(rate1 / rate2), 4)
            hr_str = f"{hr:.3f}"

    # Build summary: per-group median survival ("NR" = not reached).
    median_parts = []
    for g in unique_groups:
        med = group_results[str(g)]["median_survival"]
        med_str = f"{med:.1f}" if med is not None else "NR"
        n = group_results[str(g)]["n_total"]
        median_parts.append(f"{g}(n={n}): {med_str}")

    significance = "significant" if p_value < 0.05 else "not significant"

    summary = (
        f"Kaplan-Meier survival analysis ({len(unique_groups)} groups, n={len(times)}):\n"
        f"Median survival: {', '.join(median_parts)}\n"
        f"Log-rank p = {p_value:.4g} ({significance})\n"
        f"Event rate ratio (simplified HR): {hr_str}"
    )

    return {
        "summary": summary,
        "groups": group_results,
        "log_rank": {
            "chi2": round(float(chi2), 4),
            "p_value": round(float(p_value), 6),
            "significant": p_value < 0.05,
        },
        "hazard_ratio": hr,  # Note: simplified event rate ratio, not Cox regression HR
    }
354
+
355
+
356
@registry.register(
    name="statistics.enrichment_test",
    description="Gene set over-representation analysis using hypergeometric test with FDR correction",
    category="statistics",
    parameters={
        "gene_list": "List of gene symbols (your query genes)",
        "gene_set": "Dict of set_name:gene_list, or 'hallmark' for built-in MSigDB hallmark sets",
        "background_size": "Total background gene count (default 20000)",
    },
    requires_data=[],
    usage_guide="You have a list of genes (e.g. differentially expressed, mutated, degraded) and want "
    "to know which pathways or gene sets they are enriched in. Provide your gene list and "
    "optionally a custom gene set dict, or use built-in hallmark sets. Returns FDR-corrected p-values.",
)
def enrichment_test(gene_list: list = None, gene_set: dict | str = "hallmark",
                    background_size: int = 20000, **kwargs) -> dict:
    """Gene set over-representation analysis (ORA) with hypergeometric test.

    For each gene set, computes:
        - Hypergeometric p-value (Fisher's exact one-tailed)
        - Fold enrichment = (overlap/query) / (set_size/background)
        - Benjamini-Hochberg FDR correction

    Parameters
    ----------
    gene_list : list of str
        Query genes (e.g. upregulated genes, hit list). Case-insensitive;
        duplicates are collapsed.
    gene_set : dict or str
        Dict mapping set names to gene lists, or "hallmark" for built-in.
    background_size : int
        Total number of genes in the background (default 20000).

    Returns
    -------
    dict with "enriched" (per-set stats sorted by p-value, with "fdr"),
    counts, and a human-readable "summary"; or {"error", "summary"} on
    invalid input.
    """
    from scipy.stats import hypergeom

    if gene_list is None or len(gene_list) == 0:
        return {"error": "Provide a non-empty gene_list", "summary": "Provide a non-empty gene_list"}
    # Uppercase + set: gene symbols are matched case-insensitively, duplicates collapse.
    gene_list = [str(g).upper() for g in gene_list]
    query_set = set(gene_list)
    n_query = len(query_set)

    # Load or use provided gene sets.
    if isinstance(gene_set, str):
        if gene_set == "hallmark":
            gene_set = _get_hallmark_sets()
        else:
            return {"error": f"Unknown gene set collection: {gene_set}. Provide a dict or 'hallmark'", "summary": f"Unknown gene set collection: {gene_set}. Provide a dict or 'hallmark'"}
    if not isinstance(gene_set, dict) or len(gene_set) == 0:
        return {"error": "gene_set must be a non-empty dict of set_name: gene_list", "summary": "gene_set must be a non-empty dict of set_name: gene_list"}
    N = int(background_size)  # total background genes
    results = []

    for set_name, set_genes in gene_set.items():
        set_genes_upper = [str(g).upper() for g in set_genes]
        set_size = len(set_genes_upper)
        gene_set_set = set(set_genes_upper)

        # Overlap between query genes and this set.
        overlap = query_set & gene_set_set
        k = len(overlap)

        if k == 0:
            # p-value would be 1; omitted from output but still counted in the
            # BH denominator below, since this hypothesis WAS tested.
            continue

        # Hypergeometric test: P(X >= k) where X ~ Hypergeometric(N, K, n)
        # with N = background, K = set_size, n = query_size.
        p_value = float(hypergeom.sf(k - 1, N, set_size, n_query))

        # Fold enrichment over the expected overlap under random draws.
        expected = (set_size / N) * n_query if N > 0 else 0
        fold_enrichment = k / expected if expected > 0 else float('inf')

        results.append({
            "gene_set": set_name,
            "overlap_count": k,
            "overlap_genes": sorted(overlap),
            "set_size": set_size,
            "p_value": p_value,
            "fold_enrichment": round(float(fold_enrichment), 2),
        })

    if not results:
        return {
            "summary": f"No enrichment found: {n_query} query genes had no overlap with {len(gene_set)} gene sets",
            "n_query_genes": n_query,
            "n_gene_sets_tested": len(gene_set),
            "enriched": [],
        }

    # Sort by p-value (ascending) for ranking and reporting.
    results.sort(key=lambda x: x["p_value"])

    # Benjamini-Hochberg FDR correction.
    # BUGFIX: the BH multiplier must be the total number of hypotheses tested
    # (all gene sets, including zero-overlap sets whose p-value is 1), not
    # just the sets with non-zero overlap — the previous version used
    # len(results), which understates m and makes the FDR anti-conservative.
    n_tests = len(results)
    m_hypotheses = len(gene_set)
    for i, r in enumerate(results):
        rank = i + 1
        r["fdr"] = round(min(r["p_value"] * m_hypotheses / rank, 1.0), 6)

    # Enforce monotonicity (FDR should be non-decreasing from bottom).
    for i in range(n_tests - 2, -1, -1):
        results[i]["fdr"] = min(results[i]["fdr"], results[i + 1]["fdr"])

    # Round p-values for output.
    for r in results:
        r["p_value"] = round(r["p_value"], 8)

    significant = [r for r in results if r["fdr"] < 0.05]

    # Top 10 for summary.
    top = results[:10]
    top_str = "\n".join(
        f"  {r['gene_set']}: {r['overlap_count']}/{r['set_size']} genes, "
        f"FE={r['fold_enrichment']:.1f}x, FDR={r['fdr']:.2g}"
        for r in top
    )

    summary = (
        f"Gene set enrichment ({n_query} query genes, {len(gene_set)} sets tested):\n"
        f"Significant (FDR<0.05): {len(significant)}/{n_tests}\n"
        f"Top enriched:\n{top_str}"
    )

    return {
        "summary": summary,
        "n_query_genes": n_query,
        "n_gene_sets_tested": len(gene_set),
        "n_significant": len(significant),
        "enriched": results,
    }
485
+
486
+
487
+ def _get_hallmark_sets() -> dict:
488
+ """Get built-in hallmark-lite gene sets for enrichment analysis."""
489
+ # Try to load full MSigDB hallmark sets first
490
+ try:
491
+ from ct.data.loaders import load_msigdb
492
+ msigdb = load_msigdb("h")
493
+ # MSigDB JSON: {set_name: {geneSymbols: [...]}}
494
+ if isinstance(msigdb, dict):
495
+ parsed = {}
496
+ for name, data in msigdb.items():
497
+ if isinstance(data, dict) and "geneSymbols" in data:
498
+ parsed[name] = data["geneSymbols"]
499
+ elif isinstance(data, list):
500
+ parsed[name] = data
501
+ if parsed:
502
+ return parsed
503
+ except (FileNotFoundError, ImportError):
504
+ pass
505
+
506
+ # Fallback: curated hallmark-lite sets
507
+ return {
508
+ "HALLMARK_P53_PATHWAY": ["CDKN1A", "MDM2", "BAX", "GADD45A", "SFN", "DDB2",
509
+ "SESN1", "TP53I3", "PMAIP1", "BBC3"],
510
+ "HALLMARK_APOPTOSIS": ["BCL2", "BAX", "BAK1", "BID", "CASP3", "CASP8",
511
+ "CASP9", "CYCS", "APAF1", "FADD"],
512
+ "HALLMARK_MTORC1_SIGNALING": ["SLC7A5", "SLC3A2", "DDIT4", "VEGFA", "HK2",
513
+ "PKM", "LDHA", "SLC2A1", "RPS6", "EIF4E"],
514
+ "HALLMARK_MYC_TARGETS_V1": ["ODC1", "LDHA", "CDK4", "NCL", "NPM1", "NOP56",
515
+ "BOP1", "MRTO4", "RRP12", "WDR12"],
516
+ "HALLMARK_E2F_TARGETS": ["CCNE1", "MCM2", "PCNA", "RRM2", "MCM3", "MCM4",
517
+ "MCM5", "MCM6", "CDC6", "ORC1"],
518
+ "HALLMARK_G2M_CHECKPOINT": ["CDK1", "CCNB1", "CCNB2", "BUB1", "BUB1B",
519
+ "AURKA", "AURKB", "PLK1", "TOP2A", "BIRC5"],
520
+ "HALLMARK_DNA_REPAIR": ["BRCA1", "BRCA2", "RAD51", "ATM", "ATR", "CHEK1",
521
+ "CHEK2", "MSH2", "MSH6", "MLH1"],
522
+ "HALLMARK_INFLAMMATORY_RESPONSE": ["TNF", "IL6", "IL1B", "CXCL8", "CCL2",
523
+ "ICAM1", "VCAM1", "SELE", "PTGS2", "MMP9"],
524
+ "HALLMARK_TNFA_SIGNALING_VIA_NFKB": ["NFKBIA", "TNF", "IL6", "CXCL8", "CCL2",
525
+ "TNFAIP3", "BIRC3", "TRAF1", "RELB", "BCL3"],
526
+ "HALLMARK_INTERFERON_GAMMA_RESPONSE": ["STAT1", "IRF1", "GBP1", "GBP2", "CXCL10",
527
+ "CXCL9", "IDO1", "TAP1", "PSMB9", "B2M"],
528
+ "HALLMARK_INTERFERON_ALPHA_RESPONSE": ["ISG15", "MX1", "MX2", "IFIT1", "IFIT2",
529
+ "IFIT3", "OAS1", "OAS2", "RSAD2", "IFI44L"],
530
+ "HALLMARK_HYPOXIA": ["VEGFA", "SLC2A1", "LDHA", "PDK1", "BNIP3", "DDIT4",
531
+ "ENO1", "PGK1", "ALDOA", "HK2"],
532
+ "HALLMARK_GLYCOLYSIS": ["HK2", "PFKM", "ALDOA", "GAPDH", "PKM", "LDHA",
533
+ "ENO1", "TPI1", "PGK1", "GPI"],
534
+ "HALLMARK_OXIDATIVE_PHOSPHORYLATION": ["NDUFA1", "SDHA", "UQCRC1", "COX5A",
535
+ "ATP5F1A", "NDUFS1", "SDHB", "COX7A2",
536
+ "UQCRB", "ATP5F1B"],
537
+ "HALLMARK_UNFOLDED_PROTEIN_RESPONSE": ["HSPA5", "DDIT3", "ATF4", "XBP1",
538
+ "HERPUD1", "DNAJB9", "PDIA4", "ERN1",
539
+ "ATF6", "EDEM1"],
540
+ "HALLMARK_WNT_BETA_CATENIN_SIGNALING": ["CTNNB1", "LEF1", "TCF7", "MYC",
541
+ "CCND1", "AXIN2", "DKK1", "WNT3A",
542
+ "FZD1", "LRP6"],
543
+ "HALLMARK_NOTCH_SIGNALING": ["NOTCH1", "HES1", "HEY1", "JAG1", "DLL1",
544
+ "RBPJ", "MAML1", "NRARP", "DTX1", "HEYL"],
545
+ "HALLMARK_HEDGEHOG_SIGNALING": ["SHH", "SMO", "PTCH1", "GLI1", "GLI2",
546
+ "GLI3", "SUFU", "HHIP", "GAS1", "BOC"],
547
+ "HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION": ["VIM", "CDH2", "FN1", "SNAI1",
548
+ "SNAI2", "TWIST1", "ZEB1", "ZEB2",
549
+ "MMP2", "MMP9"],
550
+ "HALLMARK_ANGIOGENESIS": ["VEGFA", "KDR", "FLT1", "PECAM1", "ANGPT1",
551
+ "ANGPT2", "TEK", "NRP1", "ENG", "HIF1A"],
552
+ }