celltype-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- celltype_cli-0.1.0.dist-info/METADATA +267 -0
- celltype_cli-0.1.0.dist-info/RECORD +89 -0
- celltype_cli-0.1.0.dist-info/WHEEL +4 -0
- celltype_cli-0.1.0.dist-info/entry_points.txt +2 -0
- celltype_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
- ct/__init__.py +3 -0
- ct/agent/__init__.py +0 -0
- ct/agent/case_studies.py +426 -0
- ct/agent/config.py +523 -0
- ct/agent/doctor.py +544 -0
- ct/agent/knowledge.py +523 -0
- ct/agent/loop.py +99 -0
- ct/agent/mcp_server.py +478 -0
- ct/agent/orchestrator.py +733 -0
- ct/agent/runner.py +656 -0
- ct/agent/sandbox.py +481 -0
- ct/agent/session.py +145 -0
- ct/agent/system_prompt.py +186 -0
- ct/agent/trace_store.py +228 -0
- ct/agent/trajectory.py +169 -0
- ct/agent/types.py +182 -0
- ct/agent/workflows.py +462 -0
- ct/api/__init__.py +1 -0
- ct/api/app.py +211 -0
- ct/api/config.py +120 -0
- ct/api/engine.py +124 -0
- ct/cli.py +1448 -0
- ct/data/__init__.py +0 -0
- ct/data/compute_providers.json +59 -0
- ct/data/cro_database.json +395 -0
- ct/data/downloader.py +238 -0
- ct/data/loaders.py +252 -0
- ct/kb/__init__.py +5 -0
- ct/kb/benchmarks.py +147 -0
- ct/kb/governance.py +106 -0
- ct/kb/ingest.py +415 -0
- ct/kb/reasoning.py +129 -0
- ct/kb/schema_monitor.py +162 -0
- ct/kb/substrate.py +387 -0
- ct/models/__init__.py +0 -0
- ct/models/llm.py +370 -0
- ct/tools/__init__.py +195 -0
- ct/tools/_compound_resolver.py +297 -0
- ct/tools/biomarker.py +368 -0
- ct/tools/cellxgene.py +282 -0
- ct/tools/chemistry.py +1371 -0
- ct/tools/claude.py +390 -0
- ct/tools/clinical.py +1153 -0
- ct/tools/clue.py +249 -0
- ct/tools/code.py +1069 -0
- ct/tools/combination.py +397 -0
- ct/tools/compute.py +402 -0
- ct/tools/cro.py +413 -0
- ct/tools/data_api.py +2114 -0
- ct/tools/design.py +295 -0
- ct/tools/dna.py +575 -0
- ct/tools/experiment.py +604 -0
- ct/tools/expression.py +655 -0
- ct/tools/files.py +957 -0
- ct/tools/genomics.py +1387 -0
- ct/tools/http_client.py +146 -0
- ct/tools/imaging.py +319 -0
- ct/tools/intel.py +223 -0
- ct/tools/literature.py +743 -0
- ct/tools/network.py +422 -0
- ct/tools/notification.py +111 -0
- ct/tools/omics.py +3330 -0
- ct/tools/ops.py +1230 -0
- ct/tools/parity.py +649 -0
- ct/tools/pk.py +245 -0
- ct/tools/protein.py +678 -0
- ct/tools/regulatory.py +643 -0
- ct/tools/remote_data.py +179 -0
- ct/tools/report.py +181 -0
- ct/tools/repurposing.py +376 -0
- ct/tools/safety.py +1280 -0
- ct/tools/shell.py +178 -0
- ct/tools/singlecell.py +533 -0
- ct/tools/statistics.py +552 -0
- ct/tools/structure.py +882 -0
- ct/tools/target.py +901 -0
- ct/tools/translational.py +123 -0
- ct/tools/viability.py +218 -0
- ct/ui/__init__.py +0 -0
- ct/ui/markdown.py +31 -0
- ct/ui/status.py +258 -0
- ct/ui/suggestions.py +567 -0
- ct/ui/terminal.py +1456 -0
- ct/ui/traces.py +112 -0
ct/tools/statistics.py
ADDED
|
@@ -0,0 +1,552 @@
|
|
|
1
|
+
"""Statistical analysis tools: survival, dose-response, biomarker panels."""
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
from ct.tools import registry
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@registry.register(
    name="statistics.dose_response_fit",
    description="Fit a 4-parameter logistic (Hill equation) dose-response curve and compute IC50",
    category="statistics",
    parameters={
        "doses": "List of dose concentrations (floats)",
        "responses": "List of response values (floats, e.g. viability or inhibition %)",
        "compound_name": "Optional compound name for labeling",
    },
    requires_data=[],
    usage_guide="You have dose-response data and want to fit a curve to compute IC50, Hill slope, "
    "and assess curve quality. Provide matched lists of doses and responses. "
    "Works with any dose-response data (viability, inhibition, binding, etc.).",
)
def dose_response_fit(doses: list = None, responses: list = None,
                      compound_name: str = "unknown", **kwargs) -> dict:
    """Fit a 4-parameter logistic (Hill equation) dose-response curve.

    Hill equation: f(x) = bottom + (top - bottom) / (1 + (IC50/x)^slope)

    Parameters
    ----------
    doses : list of float
        Concentration values (must be positive).
    responses : list of float
        Response values (e.g. % viability, % inhibition).
    compound_name : str
        Label for the compound.

    Returns
    -------
    dict with fitted parameters, IC50, R-squared, quality assessment.
    Validation or convergence failures return a dict carrying an "error" key
    (always alongside "summary") instead of fit results.
    """
    # Imported lazily so the module imports even if scipy is unavailable.
    # (Removed an unused `from scipy.stats import pearsonr` — R² is computed
    # directly from residuals below, pearsonr was never referenced.)
    from scipy.optimize import curve_fit

    if doses is None or responses is None:
        return {"error": "Both 'doses' and 'responses' lists are required", "summary": "Both 'doses' and 'responses' lists are required"}
    doses = [float(d) for d in doses]
    responses = [float(r) for r in responses]

    if len(doses) != len(responses):
        return {"error": f"Length mismatch: {len(doses)} doses vs {len(responses)} responses", "summary": f"Length mismatch: {len(doses)} doses vs {len(responses)} responses"}
    if len(doses) < 4:
        # 4 free parameters => at least 4 points for the fit to be determined.
        return {"error": f"Need at least 4 data points for 4PL fit, got {len(doses)}", "summary": f"Need at least 4 data points for 4PL fit, got {len(doses)}"}
    # Filter out non-positive doses (log-space fitting)
    valid = [(d, r) for d, r in zip(doses, responses) if d > 0]
    if len(valid) < 4:
        return {"error": "Need at least 4 positive dose values", "summary": "Need at least 4 positive dose values"}
    doses_arr = np.array([v[0] for v in valid])
    resp_arr = np.array([v[1] for v in valid])

    # 4-parameter logistic (Hill equation)
    def hill(x, bottom, top, ic50, slope):
        return bottom + (top - bottom) / (1.0 + (ic50 / x) ** slope)

    # Initial parameter guesses: span of the observed responses, midpoint dose.
    bottom_guess = float(np.min(resp_arr))
    top_guess = float(np.max(resp_arr))
    ic50_guess = float(np.median(doses_arr))
    slope_guess = 1.0

    try:
        popt, pcov = curve_fit(
            hill, doses_arr, resp_arr,
            p0=[bottom_guess, top_guess, ic50_guess, slope_guess],
            bounds=(
                [-np.inf, -np.inf, 1e-15, 0.01],  # lower bounds (IC50 > 0, slope > 0)
                [np.inf, np.inf, np.inf, 100.0]   # upper bounds
            ),
            maxfev=10000,
        )
        bottom, top, ic50, slope = popt
        # Standard errors from the diagonal of the parameter covariance matrix.
        perr = np.sqrt(np.diag(pcov))

        # Compute R-squared from residual vs total sum of squares.
        predicted = hill(doses_arr, *popt)
        ss_res = np.sum((resp_arr - predicted) ** 2)
        ss_tot = np.sum((resp_arr - np.mean(resp_arr)) ** 2)
        r_squared = 1.0 - ss_res / ss_tot if ss_tot > 0 else 0.0

        # Quality assessment: combine goodness of fit with the relative
        # uncertainty of the IC50 estimate.
        if r_squared > 0.95 and perr[2] / abs(ic50) < 0.5:
            quality = "HIGH"
            quality_detail = "Excellent fit with tight IC50 confidence"
        elif r_squared > 0.8:
            quality = "MEDIUM"
            quality_detail = "Good fit, IC50 estimate reliable"
        elif r_squared > 0.5:
            quality = "LOW"
            quality_detail = "Marginal fit, IC50 estimate approximate"
        else:
            quality = "POOR"
            quality_detail = "Poor fit, IC50 unreliable"

        # Max effect (dynamic range)
        max_effect = abs(top - bottom)

        summary = (
            f"Dose-response fit for {compound_name}:\n"
            f"IC50 = {ic50:.4g}, Hill slope = {slope:.2f}\n"
            f"R² = {r_squared:.4f}, Quality: {quality}\n"
            f"Bottom = {bottom:.2f}, Top = {top:.2f}, Max effect = {max_effect:.2f}"
        )

        return {
            "summary": summary,
            "compound": compound_name,
            "ic50": round(float(ic50), 6),
            "hill_slope": round(float(slope), 4),
            "bottom": round(float(bottom), 4),
            "top": round(float(top), 4),
            "r_squared": round(float(r_squared), 4),
            "max_effect": round(float(max_effect), 4),
            "parameter_errors": {
                "bottom_se": round(float(perr[0]), 4),
                "top_se": round(float(perr[1]), 4),
                "ic50_se": round(float(perr[2]), 6),
                "slope_se": round(float(perr[3]), 4),
            },
            "quality": quality,
            "quality_detail": quality_detail,
            "n_points": len(doses_arr),
        }

    except RuntimeError as e:
        # curve_fit raises RuntimeError when the optimizer fails to converge;
        # report the data ranges to help diagnose poorly conditioned input.
        return {
            "summary": f"Dose-response fit FAILED for {compound_name}: curve fitting did not converge",
            "error": f"Convergence failure: {str(e)}",
            "compound": compound_name,
            "n_points": len(doses_arr),
            "dose_range": [float(np.min(doses_arr)), float(np.max(doses_arr))],
            "response_range": [float(np.min(resp_arr)), float(np.max(resp_arr))],
        }
    except Exception as e:
        # Catch-all so a bad fit never propagates out of the tool boundary.
        return {
            "summary": f"Dose-response fit FAILED for {compound_name}: {str(e)}",
            "error": str(e),
            "compound": compound_name,
        }
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
@registry.register(
    name="statistics.survival_analysis",
    description="Perform Kaplan-Meier survival analysis with optional log-rank test for group comparison",
    category="statistics",
    parameters={
        "times": "List of survival/follow-up times",
        "events": "List of event indicators (1=event occurred, 0=censored)",
        "groups": "Optional list of group labels for comparing survival between groups",
    },
    requires_data=[],
    usage_guide="You have time-to-event data and want to estimate survival curves, median survival, "
    "and compare groups. Provide times, event indicators, and optionally group labels. "
    "Use for clinical trial analysis, patient stratification, or biomarker validation.",
)
def survival_analysis(times: list = None, events: list = None,
                      groups: list = None, **kwargs) -> dict:
    """Perform Kaplan-Meier survival analysis with log-rank test.

    Implements the Kaplan-Meier estimator from scratch:
    S(t) = product of (1 - d_i/n_i) for all event times t_i <= t

    If groups are provided, computes separate KM curves and a log-rank test.

    Parameters
    ----------
    times : list of float
        Survival or follow-up times.
    events : list of int
        Event indicators: 1 = event (death/progression), 0 = censored.
    groups : list, optional
        Group labels for stratified analysis (e.g., ["high", "low", "high", ...]).

    Returns
    -------
    dict
        Single-group: {"summary", "kaplan_meier"}. Multi-group: per-group KM
        curves plus "log_rank" (chi2, p_value, significant) and
        "hazard_ratio" (a simplified event-rate ratio, not a Cox HR).
        Validation failures return {"error", "summary"} instead.
    """
    from scipy import stats as sp_stats

    # --- Input validation: errors are returned (not raised) so the tool
    # boundary always yields a dict with a "summary" key. ----------------
    if times is None or events is None:
        return {"error": "Both 'times' and 'events' lists are required", "summary": "Both 'times' and 'events' lists are required"}
    times = [float(t) for t in times]
    events = [int(e) for e in events]

    if len(times) != len(events):
        return {"error": f"Length mismatch: {len(times)} times vs {len(events)} events", "summary": f"Length mismatch: {len(times)} times vs {len(events)} events"}
    if len(times) < 3:
        return {"error": f"Need at least 3 observations, got {len(times)}", "summary": f"Need at least 3 observations, got {len(times)}"}
    def _kaplan_meier(t_arr, e_arr):
        """Compute KM survival curve from times and events arrays."""
        # Sort by time so the product-limit estimate walks event times in order.
        order = np.argsort(t_arr)
        t_sorted = t_arr[order]
        e_sorted = e_arr[order]

        # Get unique event times (only where event=1); censoring times do not
        # create steps in the curve, they only shrink the risk set.
        event_times = np.unique(t_sorted[e_sorted == 1])

        km_times = [0.0]       # curve starts at t=0 ...
        km_survival = [1.0]    # ... with S(0) = 1
        n_at_risk = len(t_sorted)
        current_s = 1.0

        for et in event_times:
            # Number who have been censored or had event before this time
            # n_at_risk at time et: those with time >= et
            n_at_risk = int(np.sum(t_sorted >= et))
            # Deaths at this time
            d = int(np.sum((t_sorted == et) & (e_sorted == 1)))

            if n_at_risk > 0:
                # Product-limit update: S *= (1 - d/n)
                current_s *= (1.0 - d / n_at_risk)

            km_times.append(float(et))
            km_survival.append(round(current_s, 6))

        # Median survival: first time S(t) <= 0.5
        median_surv = None
        for t_val, s_val in zip(km_times, km_survival):
            if s_val <= 0.5:
                median_surv = t_val
                break

        return {
            "times": km_times,
            "survival": km_survival,
            "median_survival": median_surv,  # None => median not reached
            "n_events": int(np.sum(e_sorted)),
            "n_censored": int(np.sum(e_sorted == 0)),
            "n_total": len(t_sorted),
        }

    t_arr = np.array(times)
    e_arr = np.array(events)

    # Single-group analysis
    if groups is None:
        km = _kaplan_meier(t_arr, e_arr)
        median_str = f"{km['median_survival']:.1f}" if km['median_survival'] is not None else "not reached"
        summary = (
            f"Kaplan-Meier survival analysis (n={km['n_total']}):\n"
            f"Events: {km['n_events']}, Censored: {km['n_censored']}\n"
            f"Median survival: {median_str}"
        )
        return {
            "summary": summary,
            "kaplan_meier": km,
        }

    # Multi-group analysis
    groups = list(groups)
    if len(groups) != len(times):
        return {"error": f"Length mismatch: {len(times)} times vs {len(groups)} groups", "summary": f"Length mismatch: {len(times)} times vs {len(groups)} groups"}
    unique_groups = sorted(set(groups))
    if len(unique_groups) < 2:
        return {"error": f"Need at least 2 groups for comparison, got {len(unique_groups)}", "summary": f"Need at least 2 groups for comparison, got {len(unique_groups)}"}
    group_arr = np.array(groups)
    group_results = {}

    # Per-group KM curves, keyed by the stringified group label.
    for g in unique_groups:
        mask = group_arr == g
        group_results[str(g)] = _kaplan_meier(t_arr[mask], e_arr[mask])

    # Log-rank test (for 2 groups, generalizable)
    # Implementation: compare observed vs expected events in each group
    # at each unique event time across all groups
    all_event_times = np.unique(t_arr[e_arr == 1])

    # Compute chi-squared statistic for log-rank
    observed_minus_expected = {str(g): 0.0 for g in unique_groups}
    variance_sum = 0.0

    for et in all_event_times:
        # Total at risk and total events at this time
        at_risk_total = int(np.sum(t_arr >= et))
        events_total = int(np.sum((t_arr == et) & (e_arr == 1)))

        if at_risk_total == 0:
            continue

        for g in unique_groups:
            mask = group_arr == g
            at_risk_g = int(np.sum(t_arr[mask] >= et))
            events_g = int(np.sum((t_arr[mask] == et) & (e_arr[mask] == 1)))

            # Expected events under null
            expected_g = at_risk_g * events_total / at_risk_total if at_risk_total > 0 else 0
            observed_minus_expected[str(g)] += (events_g - expected_g)

        # Variance contribution (hypergeometric variance)
        if at_risk_total > 1:
            for g in unique_groups:
                mask = group_arr == g
                n_g = int(np.sum(t_arr[mask] >= et))
                frac = n_g / at_risk_total
                censored_total = at_risk_total - events_total
                var_contrib = (events_total * censored_total * frac * (1 - frac)) / (at_risk_total - 1)
                # Only accumulate for the first group (2-group test)
                if g == unique_groups[0]:
                    variance_sum += var_contrib

    # Chi-squared statistic (1 df for 2 groups)
    # NOTE(review): the statistic uses only the first group's O-E and variance,
    # which is the standard 2-group log-rank. With >2 groups the df=k-1 p-value
    # below is only an approximation — confirm whether >2 groups is intended.
    if variance_sum > 0:
        chi2 = (observed_minus_expected[str(unique_groups[0])] ** 2) / variance_sum
        p_value = float(1.0 - sp_stats.chi2.cdf(chi2, df=len(unique_groups) - 1))
    else:
        # No variance => no information to compare; report a null result.
        chi2 = 0.0
        p_value = 1.0

    # Event rate ratio (simplified — not a proper Cox hazard ratio).
    # Computed as (events_1 / total_time_1) / (events_2 / total_time_2).
    hr = None
    hr_str = "N/A"
    if len(unique_groups) == 2:
        g1, g2 = str(unique_groups[0]), str(unique_groups[1])
        r1 = group_results[g1]
        r2 = group_results[g2]
        # max(..., 1e-10) guards against division by zero total follow-up time.
        rate1 = r1["n_events"] / max(np.sum(t_arr[group_arr == unique_groups[0]]), 1e-10)
        rate2 = r2["n_events"] / max(np.sum(t_arr[group_arr == unique_groups[1]]), 1e-10)
        if rate2 > 0:
            hr = round(float(rate1 / rate2), 4)
            hr_str = f"{hr:.3f}"

    # Build summary ("NR" = median not reached)
    median_parts = []
    for g in unique_groups:
        med = group_results[str(g)]["median_survival"]
        med_str = f"{med:.1f}" if med is not None else "NR"
        n = group_results[str(g)]["n_total"]
        median_parts.append(f"{g}(n={n}): {med_str}")

    significance = "significant" if p_value < 0.05 else "not significant"

    summary = (
        f"Kaplan-Meier survival analysis ({len(unique_groups)} groups, n={len(times)}):\n"
        f"Median survival: {', '.join(median_parts)}\n"
        f"Log-rank p = {p_value:.4g} ({significance})\n"
        f"Event rate ratio (simplified HR): {hr_str}"
    )

    return {
        "summary": summary,
        "groups": group_results,
        "log_rank": {
            "chi2": round(float(chi2), 4),
            "p_value": round(float(p_value), 6),
            "significant": p_value < 0.05,
        },
        "hazard_ratio": hr,  # Note: simplified event rate ratio, not Cox regression HR
    }
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
@registry.register(
    name="statistics.enrichment_test",
    description="Gene set over-representation analysis using hypergeometric test with FDR correction",
    category="statistics",
    parameters={
        "gene_list": "List of gene symbols (your query genes)",
        "gene_set": "Dict of set_name:gene_list, or 'hallmark' for built-in MSigDB hallmark sets",
        "background_size": "Total background gene count (default 20000)",
    },
    requires_data=[],
    usage_guide="You have a list of genes (e.g. differentially expressed, mutated, degraded) and want "
    "to know which pathways or gene sets they are enriched in. Provide your gene list and "
    "optionally a custom gene set dict, or use built-in hallmark sets. Returns FDR-corrected p-values.",
)
def enrichment_test(gene_list: list = None, gene_set: dict | str = "hallmark",
                    background_size: int = 20000, **kwargs) -> dict:
    """Gene set over-representation analysis (ORA) with hypergeometric test.

    Each gene set is scored by:
    - a one-tailed hypergeometric p-value for the query/set overlap,
    - fold enrichment = (overlap/query) / (set_size/background),
    - a Benjamini-Hochberg FDR across all sets that had any overlap.

    Parameters
    ----------
    gene_list : list of str
        Query genes (e.g. upregulated genes, hit list). Case-insensitive.
    gene_set : dict or str
        Dict mapping set names to gene lists, or "hallmark" for built-in.
    background_size : int
        Total number of genes in the background (default 20000).
    """
    from scipy.stats import hypergeom

    if gene_list is None or not gene_list:
        return {"error": "Provide a non-empty gene_list", "summary": "Provide a non-empty gene_list"}
    # Symbols are compared case-insensitively; duplicates collapse here.
    query_genes = {str(g).upper() for g in gene_list}
    n_query = len(query_genes)

    # Resolve the gene-set collection argument.
    if isinstance(gene_set, str):
        if gene_set == "hallmark":
            gene_set = _get_hallmark_sets()
        else:
            msg = f"Unknown gene set collection: {gene_set}. Provide a dict or 'hallmark'"
            return {"error": msg, "summary": msg}
    if not isinstance(gene_set, dict) or len(gene_set) == 0:
        msg = "gene_set must be a non-empty dict of set_name: gene_list"
        return {"error": msg, "summary": msg}
    bg_total = int(background_size)  # total background genes
    hits = []

    for set_name, members in gene_set.items():
        members_upper = [str(g).upper() for g in members]
        set_size = len(members_upper)

        shared = query_genes.intersection(members_upper)
        n_shared = len(shared)

        # Sets with no overlap carry no signal and are not scored.
        if n_shared == 0:
            continue

        # One-tailed hypergeometric: P(X >= n_shared) with
        # X ~ Hypergeom(bg_total, set_size, n_query).
        pval = float(hypergeom.sf(n_shared - 1, bg_total, set_size, n_query))

        # Fold enrichment relative to the expected overlap under the null.
        expected = (set_size / bg_total) * n_query if bg_total > 0 else 0
        fe = n_shared / expected if expected > 0 else float('inf')

        hits.append({
            "gene_set": set_name,
            "overlap_count": n_shared,
            "overlap_genes": sorted(shared),
            "set_size": set_size,
            "p_value": pval,
            "fold_enrichment": round(float(fe), 2),
        })

    if not hits:
        return {
            "summary": f"No enrichment found: {n_query} query genes had no overlap with {len(gene_set)} gene sets",
            "n_query_genes": n_query,
            "n_gene_sets_tested": len(gene_set),
            "enriched": [],
        }

    # Benjamini-Hochberg: rank ascending by p, scale by n/rank, cap at 1.
    hits.sort(key=lambda h: h["p_value"])
    n_tests = len(hits)
    for rank, h in enumerate(hits, start=1):
        h["fdr"] = round(min(h["p_value"] * n_tests / rank, 1.0), 6)

    # Step-up pass: FDR must be non-decreasing going down the ranking.
    for idx in reversed(range(n_tests - 1)):
        hits[idx]["fdr"] = min(hits[idx]["fdr"], hits[idx + 1]["fdr"])

    # Round p-values for the output payload.
    for h in hits:
        h["p_value"] = round(h["p_value"], 8)

    significant = [h for h in hits if h["fdr"] < 0.05]

    # Summarize the top 10 sets.
    top_str = "\n".join(
        f" {h['gene_set']}: {h['overlap_count']}/{h['set_size']} genes, "
        f"FE={h['fold_enrichment']:.1f}x, FDR={h['fdr']:.2g}"
        for h in hits[:10]
    )

    summary = (
        f"Gene set enrichment ({n_query} query genes, {len(gene_set)} sets tested):\n"
        f"Significant (FDR<0.05): {len(significant)}/{n_tests}\n"
        f"Top enriched:\n{top_str}"
    )

    return {
        "summary": summary,
        "n_query_genes": n_query,
        "n_gene_sets_tested": len(gene_set),
        "n_significant": len(significant),
        "enriched": hits,
    }
|
|
485
|
+
|
|
486
|
+
|
|
487
|
+
def _get_hallmark_sets() -> dict:
|
|
488
|
+
"""Get built-in hallmark-lite gene sets for enrichment analysis."""
|
|
489
|
+
# Try to load full MSigDB hallmark sets first
|
|
490
|
+
try:
|
|
491
|
+
from ct.data.loaders import load_msigdb
|
|
492
|
+
msigdb = load_msigdb("h")
|
|
493
|
+
# MSigDB JSON: {set_name: {geneSymbols: [...]}}
|
|
494
|
+
if isinstance(msigdb, dict):
|
|
495
|
+
parsed = {}
|
|
496
|
+
for name, data in msigdb.items():
|
|
497
|
+
if isinstance(data, dict) and "geneSymbols" in data:
|
|
498
|
+
parsed[name] = data["geneSymbols"]
|
|
499
|
+
elif isinstance(data, list):
|
|
500
|
+
parsed[name] = data
|
|
501
|
+
if parsed:
|
|
502
|
+
return parsed
|
|
503
|
+
except (FileNotFoundError, ImportError):
|
|
504
|
+
pass
|
|
505
|
+
|
|
506
|
+
# Fallback: curated hallmark-lite sets
|
|
507
|
+
return {
|
|
508
|
+
"HALLMARK_P53_PATHWAY": ["CDKN1A", "MDM2", "BAX", "GADD45A", "SFN", "DDB2",
|
|
509
|
+
"SESN1", "TP53I3", "PMAIP1", "BBC3"],
|
|
510
|
+
"HALLMARK_APOPTOSIS": ["BCL2", "BAX", "BAK1", "BID", "CASP3", "CASP8",
|
|
511
|
+
"CASP9", "CYCS", "APAF1", "FADD"],
|
|
512
|
+
"HALLMARK_MTORC1_SIGNALING": ["SLC7A5", "SLC3A2", "DDIT4", "VEGFA", "HK2",
|
|
513
|
+
"PKM", "LDHA", "SLC2A1", "RPS6", "EIF4E"],
|
|
514
|
+
"HALLMARK_MYC_TARGETS_V1": ["ODC1", "LDHA", "CDK4", "NCL", "NPM1", "NOP56",
|
|
515
|
+
"BOP1", "MRTO4", "RRP12", "WDR12"],
|
|
516
|
+
"HALLMARK_E2F_TARGETS": ["CCNE1", "MCM2", "PCNA", "RRM2", "MCM3", "MCM4",
|
|
517
|
+
"MCM5", "MCM6", "CDC6", "ORC1"],
|
|
518
|
+
"HALLMARK_G2M_CHECKPOINT": ["CDK1", "CCNB1", "CCNB2", "BUB1", "BUB1B",
|
|
519
|
+
"AURKA", "AURKB", "PLK1", "TOP2A", "BIRC5"],
|
|
520
|
+
"HALLMARK_DNA_REPAIR": ["BRCA1", "BRCA2", "RAD51", "ATM", "ATR", "CHEK1",
|
|
521
|
+
"CHEK2", "MSH2", "MSH6", "MLH1"],
|
|
522
|
+
"HALLMARK_INFLAMMATORY_RESPONSE": ["TNF", "IL6", "IL1B", "CXCL8", "CCL2",
|
|
523
|
+
"ICAM1", "VCAM1", "SELE", "PTGS2", "MMP9"],
|
|
524
|
+
"HALLMARK_TNFA_SIGNALING_VIA_NFKB": ["NFKBIA", "TNF", "IL6", "CXCL8", "CCL2",
|
|
525
|
+
"TNFAIP3", "BIRC3", "TRAF1", "RELB", "BCL3"],
|
|
526
|
+
"HALLMARK_INTERFERON_GAMMA_RESPONSE": ["STAT1", "IRF1", "GBP1", "GBP2", "CXCL10",
|
|
527
|
+
"CXCL9", "IDO1", "TAP1", "PSMB9", "B2M"],
|
|
528
|
+
"HALLMARK_INTERFERON_ALPHA_RESPONSE": ["ISG15", "MX1", "MX2", "IFIT1", "IFIT2",
|
|
529
|
+
"IFIT3", "OAS1", "OAS2", "RSAD2", "IFI44L"],
|
|
530
|
+
"HALLMARK_HYPOXIA": ["VEGFA", "SLC2A1", "LDHA", "PDK1", "BNIP3", "DDIT4",
|
|
531
|
+
"ENO1", "PGK1", "ALDOA", "HK2"],
|
|
532
|
+
"HALLMARK_GLYCOLYSIS": ["HK2", "PFKM", "ALDOA", "GAPDH", "PKM", "LDHA",
|
|
533
|
+
"ENO1", "TPI1", "PGK1", "GPI"],
|
|
534
|
+
"HALLMARK_OXIDATIVE_PHOSPHORYLATION": ["NDUFA1", "SDHA", "UQCRC1", "COX5A",
|
|
535
|
+
"ATP5F1A", "NDUFS1", "SDHB", "COX7A2",
|
|
536
|
+
"UQCRB", "ATP5F1B"],
|
|
537
|
+
"HALLMARK_UNFOLDED_PROTEIN_RESPONSE": ["HSPA5", "DDIT3", "ATF4", "XBP1",
|
|
538
|
+
"HERPUD1", "DNAJB9", "PDIA4", "ERN1",
|
|
539
|
+
"ATF6", "EDEM1"],
|
|
540
|
+
"HALLMARK_WNT_BETA_CATENIN_SIGNALING": ["CTNNB1", "LEF1", "TCF7", "MYC",
|
|
541
|
+
"CCND1", "AXIN2", "DKK1", "WNT3A",
|
|
542
|
+
"FZD1", "LRP6"],
|
|
543
|
+
"HALLMARK_NOTCH_SIGNALING": ["NOTCH1", "HES1", "HEY1", "JAG1", "DLL1",
|
|
544
|
+
"RBPJ", "MAML1", "NRARP", "DTX1", "HEYL"],
|
|
545
|
+
"HALLMARK_HEDGEHOG_SIGNALING": ["SHH", "SMO", "PTCH1", "GLI1", "GLI2",
|
|
546
|
+
"GLI3", "SUFU", "HHIP", "GAS1", "BOC"],
|
|
547
|
+
"HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION": ["VIM", "CDH2", "FN1", "SNAI1",
|
|
548
|
+
"SNAI2", "TWIST1", "ZEB1", "ZEB2",
|
|
549
|
+
"MMP2", "MMP9"],
|
|
550
|
+
"HALLMARK_ANGIOGENESIS": ["VEGFA", "KDR", "FLT1", "PECAM1", "ANGPT1",
|
|
551
|
+
"ANGPT2", "TEK", "NRP1", "ENG", "HIF1A"],
|
|
552
|
+
}
|