DeConveil 0.1.4__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deconveil/__init__.py +1 -0
- deconveil/__version__.py +1 -1
- deconveil/dds.py +169 -62
- deconveil/default_inference.py +3 -3
- deconveil/ds.py +82 -170
- deconveil/grid_search.py +1 -0
- deconveil/inference.py +4 -4
- deconveil/nb_regression_fit.py +313 -0
- deconveil/simulate_gene_dosage.py +589 -0
- deconveil/utils_fit.py +173 -129
- {deconveil-0.1.4.dist-info → deconveil-0.2.0.dist-info}/METADATA +4 -1
- deconveil-0.2.0.dist-info/RECORD +18 -0
- {deconveil-0.1.4.dist-info → deconveil-0.2.0.dist-info}/WHEEL +1 -1
- deconveil-0.1.4.dist-info/RECORD +0 -16
- {deconveil-0.1.4.dist-info → deconveil-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {deconveil-0.1.4.dist-info → deconveil-0.2.0.dist-info}/top_level.txt +0 -0
deconveil/nb_regression_fit.py (new file)

@@ -0,0 +1,313 @@
from cmdstanpy import CmdStanModel
import numpy as np
import pandas as pd
import scipy.stats as st


def fit_one_gene(
    gene_df: pd.DataFrame,
    model: CmdStanModel,
    gene: str | None = None,        # optional convenience
    cna: str = "all",               # "amp" | "del" | "all"
    et: float = 0.15,
    min_aneup: int = 5,
    min_unique_counts: int = 5,
    min_cn_abs_sum: float = 1.0,    # identifiability filter for cna="all"
    chains: int = 4,
    iter_warmup: int = 1000,
    iter_sampling: int = 1000,
    seed: int = 1,
    show_progress: bool = False,
    adapt_delta: float = 0.99,
    max_treedepth: int = 15,
):
    """
    Fit Stan NB regression model for a single gene.
    Expects gene_df to be filtered to one gene OR pass gene and full df.

    Required columns in gene_df:
        gene, expr, copies, purity, stroma, sf, eup_dev_cancer, eup_equiv_cancer
    Optional columns:
        covar (if missing -> set to 'ALL')
    """
    # subset to one gene if gene provided and gene_df contains multiple genes
    df = gene_df.copy()
    if gene is not None and "gene" in df.columns and df["gene"].nunique() > 1:
        df = df.loc[df["gene"] == gene].copy()
    if gene is None and "gene" in df.columns and df["gene"].nunique() == 1:
        gene = str(df["gene"].iloc[0])

    if df.empty:
        return {"status": "skipped", "gene": gene, "reason": "no_rows_for_gene"}

    required = {"expr", "copies", "purity", "stroma", "sf", "eup_dev_cancer", "eup_equiv_cancer"}
    missing = required - set(df.columns)
    if missing:
        return {"status": "error", "gene": gene, "reason": f"missing_columns: {sorted(missing)}"}

    # CNA subset (optional)
    if cna == "amp":
        df = df[df["copies"] > (2 - et)]
    elif cna == "del":
        df = df[df["copies"] < (2 + et)]
    elif cna == "all":
        pass
    else:
        raise ValueError("cna must be 'amp', 'del', or 'all'")

    if df.empty:
        return {"status": "skipped", "gene": gene, "reason": "no_samples_after_cna_filter"}

    # basic data QC
    df = df.dropna(subset=["expr", "sf", "purity", "stroma", "eup_dev_cancer", "eup_equiv_cancer"])
    if df.empty:
        return {"status": "skipped", "gene": gene, "reason": "all_na_after_dropna"}

    if (df["expr"] < 0).any():
        return {"status": "error", "gene": gene, "reason": "negative_counts"}

    if not df["purity"].between(0, 1).all():
        return {"status": "error", "gene": gene, "reason": "purity_out_of_bounds"}

    if not (df["sf"] > 0).all():
        return {"status": "error", "gene": gene, "reason": "nonpositive_sf"}

    if not (df["eup_equiv_cancer"] > 0).all():
        return {"status": "error", "gene": gene, "reason": "nonpositive_eup_equiv_cancer"}

    # skip degenerate genes early (prevents phi->inf + treedepth explosions)
    if df["expr"].nunique() < min_unique_counts:
        return {"status": "skipped", "gene": gene, "reason": "too_few_unique_counts"}

    # aneuploid count check (use copies as cancer_copies analogue)
    n_aneup = int((np.abs(df["copies"].astype(float) - 2.0) > (1.0 - et)).sum())
    if n_aneup < min_aneup or (df["expr"] == 0).all():
        return {"status": "skipped", "gene": gene, "n_aneup": n_aneup, "reason": "low_aneup_or_all_zero"}

    # identifiability check for cna="all": need some CN deviation mass
    if cna == "all" and df["eup_dev_cancer"].abs().sum() < min_cn_abs_sum:
        return {"status": "skipped", "gene": gene, "n_aneup": n_aneup, "reason": "too_little_cn_variation"}

    # within one cohort: enforce K=1, covar='ALL'
    df["covar"] = "ALL"
    covar_levels = ["ALL"]
    covar_idx = np.ones(len(df), dtype=int)

    # per-gene sf scaling
    mean_expr = float(df["expr"].mean())
    df["sf_scaled"] = df["sf"].astype(float) * mean_expr

    stan_data = {
        "N": int(len(df)),
        "y": df["expr"].astype(int).to_numpy(),
        "K": 1,
        "covar": covar_idx,
        "sf": df["sf_scaled"].to_numpy(dtype=float),
        "purity": df["purity"].to_numpy(dtype=float),
        "stroma": df["stroma"].to_numpy(dtype=float),
        "eup_equiv_cancer": df["eup_equiv_cancer"].to_numpy(dtype=float),
        "eup_dev_cancer": df["eup_dev_cancer"].to_numpy(dtype=float),
    }

    rng = np.random.default_rng(seed)
    init = {
        "b_scaling": rng.uniform(0.5, 1.5, size=1).tolist(),
        "b_noncancer": rng.uniform(0.5, 1.5, size=1).tolist(),
        "b_deviation": 0.0,
        "phi": 1.0,
    }

    fit = model.sample(
        data=stan_data,
        chains=chains,
        iter_warmup=iter_warmup,
        iter_sampling=iter_sampling,
        seed=seed,
        inits=init,
        show_progress=show_progress,
        adapt_delta=adapt_delta,
        max_treedepth=max_treedepth,
    )

    draws = fit.draws_pd()

    # extract posterior summaries
    scaling_col = next((c for c in draws.columns if c.startswith("b_scaling")), None)
    if scaling_col is None:
        return {"status": "error", "gene": gene, "reason": "missing_b_scaling_draws"}

    if "b_deviation" not in draws.columns:
        return {"status": "error", "gene": gene, "reason": "missing_b_deviation_draws"}

    phi_col = "phi" if "phi" in draws.columns else None

    b_scaling = draws[scaling_col].to_numpy()
    b_dev = draws["b_deviation"].to_numpy()

    # z and p
    z_comp = float(b_dev.mean() / b_dev.std(ddof=1))
    p_value = float(2.0 * (1.0 - st.norm.cdf(abs(z_comp))))

    summ = fit.summary()

    # robust extraction of Rhat / ESS
    rhat_col = next((c for c in ["R_hat", "Rhat"] if c in summ.columns), None)
    ess_col = next((c for c in ["Ess_bulk", "ESS_bulk", "N_Eff", "Ess"] if c in summ.columns), None)

    rhat_dev = float(summ.loc["b_deviation", rhat_col]) if (rhat_col and "b_deviation" in summ.index) else np.nan
    ess_dev = float(summ.loc["b_deviation", ess_col]) if (ess_col and "b_deviation" in summ.index) else np.nan

    # mark borderline fits as warn (you can filter later)
    status = "ok"
    if (not np.isnan(rhat_dev) and rhat_dev > 1.05) or (not np.isnan(ess_dev) and ess_dev < 200):
        status = "warn"

    out = {
        "status": status,
        "gene": gene,
        "N": int(len(df)),
        "n_aneup": n_aneup,
        "cna": cna,
        # posterior summaries
        "mean_b_scaling": float(b_scaling.mean()),
        "sd_b_scaling": float(b_scaling.std(ddof=1)),
        "mean_b_deviation": float(b_dev.mean()),
        "sd_b_deviation": float(b_dev.std(ddof=1)),
        "z_comp": z_comp,
        "p_value": p_value,
        # optional
        "mean_phi": float(draws[phi_col].mean()) if phi_col else np.nan,
        "Rhat_b_deviation": rhat_dev,
        "ess_b_deviation": ess_dev,
        "covar_levels": covar_levels,
    }
    return out

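For orientation, a minimal usage sketch for the new fit_one_gene helper (not part of the wheel contents): it assumes a compiled CmdStan model whose data block matches the stan_data keys above, loaded here from a hypothetical nb_regression.stan path, and a synthetic one-gene table with the required columns. How eup_dev_cancer and eup_equiv_cancer are derived from copies below is an illustrative guess, not a definition taken from the package.

import numpy as np
import pandas as pd
from cmdstanpy import CmdStanModel

from deconveil.nb_regression_fit import fit_one_gene

# Hypothetical Stan model file; its data block must declare N, y, K, covar, sf,
# purity, stroma, eup_equiv_cancer and eup_dev_cancer as passed by fit_one_gene.
model = CmdStanModel(stan_file="nb_regression.stan")

# Synthetic per-sample table for one gene, carrying the required columns.
rng = np.random.default_rng(0)
n = 40
copies = rng.choice([1.0, 2.0, 3.0, 4.0], size=n)
gene_df = pd.DataFrame({
    "gene": "GENE1",
    "expr": rng.negative_binomial(5, 0.1, size=n),  # raw counts
    "copies": copies,
    "purity": rng.uniform(0.4, 0.9, size=n),
    "stroma": rng.uniform(0.1, 0.6, size=n),
    "sf": rng.uniform(0.8, 1.2, size=n),            # size factors, > 0
    "eup_dev_cancer": (copies - 2.0) / 2.0,         # illustrative CN deviation
    "eup_equiv_cancer": copies / 2.0,               # illustrative euploid-equivalent dosage, > 0
})

res = fit_one_gene(gene_df, model, gene="GENE1", cna="all",
                   chains=2, iter_warmup=500, iter_sampling=500)
print(res["status"], res.get("p_value"))
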
def postprocess_nb_results(
    res_df: pd.DataFrame,
    alpha: float = 0.05,
    comp_thr: float = 0.5,
    min_aneup_frac: float = 0.30,        # >= 30% aneuploid samples
    min_scaling: float = 1e-6,
    require_scaling_for_dsg: bool = True,
    warn_suffix: str | None = None,      # e.g. "_LOWCONF" to tag warn calls
) -> pd.DataFrame:
    """
    Post-process Stan NB regression results.

    Expected columns in `res_df`:
      - status, gene, N, n_aneup, cna
      - mean_b_scaling, sd_b_scaling
      - mean_b_deviation, sd_b_deviation
      - z_comp, p_value, mean_phi
      - Rhat_b_deviation, ess_b_deviation, covar_levels

    Adds:
      - aneup_frac  : n_aneup / N (CN informativeness proxy)
      - comp_score  : normalized deviation = mean_b_deviation / mean_b_scaling
      - signed_comp : sign-normalized comp_score (flip for 'del', keep for 'amp'/'all')
      - shrunk_comp : comp_score shrunk toward 0 by (1 - p_value) [orthogonal score]
      - label_nb    : {'DSG', 'DCG', 'HYPER', 'OTHER', 'SKIP', 'ERROR'}

    Notes:
      - Without a per-gene CN direction summary, signed_comp is only partially
        identifiable for cna='all'; here we flip the sign for 'del' fits and leave
        'amp'/'all' unchanged.
      - DCG/HYPER require sufficient magnitude: |signed_comp| >= comp_thr.
      - DSG requires small magnitude (|signed_comp| <= comp_thr) together with
        sufficient CN signal.
    """

    df = res_df.copy()

    # required columns check
    required = {
        "status", "gene", "N", "n_aneup", "cna", "p_value",
        "mean_b_scaling", "mean_b_deviation",
    }
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {sorted(missing)}")

    # usable rows: include ok + warn
    status_lower = df["status"].astype(str).str.lower()
    usable = status_lower.isin(["ok", "warn"]) & df["p_value"].notna()

    # default labels
    df["label_nb"] = "OTHER"
    df.loc[status_lower.isin(["skip", "skipped"]), "label_nb"] = "SKIP"
    df.loc[status_lower.isin(["error", "failed", "fail"]), "label_nb"] = "ERROR"

    # CN informativeness proxy (fraction aneuploid)
    df["aneup_frac"] = np.nan
    df.loc[usable, "aneup_frac"] = (
        df.loc[usable, "n_aneup"] / df.loc[usable, "N"]
    ).astype(float)

    # compute compensation scores
    df["comp_score"] = np.nan
    df["signed_comp"] = np.nan
    df["shrunk_comp"] = np.nan  # normalized + shrunk score (orthogonal summary)

    ok2 = (
        usable
        & df["mean_b_scaling"].notna()
        & df["mean_b_deviation"].notna()
        & (df["mean_b_scaling"].abs() > min_scaling)
        & df["aneup_frac"].notna()
        & (df["aneup_frac"] >= min_aneup_frac)
    )

    # Normalized deviation: beta2* = mean_b_deviation / mean_b_scaling
    df.loc[ok2, "comp_score"] = (
        df.loc[ok2, "mean_b_deviation"] / df.loc[ok2, "mean_b_scaling"]
    )

    # Sign-normalize by CNA type if available: del -> flip sign
    cna_vals = df.loc[ok2, "cna"].astype(str).str.lower()
    sign = np.ones(len(cna_vals), dtype=float)
    sign[cna_vals == "del"] = -1.0  # treat deletions as negative direction
    # amp/all remain +1
    df.loc[ok2, "signed_comp"] = (
        df.loc[ok2, "comp_score"].to_numpy(dtype=float) * sign
    )

    # Shrink normalized deviation by pseudo-p (here: raw p_value)
    # s_g ≈ beta2*_g * (1 - p_g)
    pseudo_p = df.loc[ok2, "p_value"].astype(float).clip(0.0, 1.0)
    df.loc[ok2, "shrunk_comp"] = df.loc[ok2, "comp_score"] * (1.0 - pseudo_p)

    # classification gates
    # significance flags (computed for reference; the active gates below use shrunk_comp only)
    is_sig = ok2 & (df["p_value"] <= alpha)
    is_nonsig = ok2 & (df["p_value"] > alpha)

    # Optional: require non-trivial scaling to call DSG
    scaling_ok = ok2
    if require_scaling_for_dsg:
        scaling_ok = ok2 & (df["mean_b_scaling"] > 0.3)

    # DCG: significant negative deviation (compensation)
    # df.loc[is_sig & (df["shrunk_comp"] <= -comp_thr), "label_nb"] = "DCG"
    df.loc[df["shrunk_comp"] <= -comp_thr, "label_nb"] = "DCG"

    # HYPER: significant positive deviation
    # df.loc[is_sig & (df["signed_comp"] >= comp_thr), "label_nb"] = "HYPER"
    df.loc[df["shrunk_comp"] >= comp_thr, "label_nb"] = "HYPER"

    # DSG: non-significant deviation AND near CN-proportional expectation
    df.loc[
        # is_nonsig & scaling_ok & (df["shrunk_comp"].abs() <= comp_thr),
        scaling_ok & (df["shrunk_comp"].abs() <= comp_thr),
        "label_nb"
    ] = "DSG"

    # optional: tag warn calls as low confidence
    if warn_suffix:
        is_warn = status_lower.eq("warn")
        mask = is_warn & df["label_nb"].isin(["DSG", "DCG", "HYPER"])
        df.loc[mask, "label_nb"] = df.loc[mask, "label_nb"] + warn_suffix

    return df
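
Similarly, a hedged sketch of the post-processing step on a hand-made results table (values are illustrative; in practice each row would come from a fit_one_gene call):

import numpy as np
import pandas as pd

from deconveil.nb_regression_fit import postprocess_nb_results

# Toy per-gene results with the columns postprocess_nb_results requires.
res_df = pd.DataFrame({
    "status":           ["ok",    "ok",   "warn",  "skipped"],
    "gene":             ["G1",    "G2",   "G3",    "G4"],
    "N":                [100,     100,    100,     100],
    "n_aneup":          [60,      55,     50,      2],
    "cna":              ["all",   "all",  "all",   "all"],
    "p_value":          [0.001,   0.80,   0.004,   np.nan],
    "mean_b_scaling":   [0.9,     1.0,    0.8,     np.nan],
    "mean_b_deviation": [-0.8,    0.05,   0.7,     np.nan],
})

labelled = postprocess_nb_results(res_df, alpha=0.05, comp_thr=0.5,
                                  warn_suffix="_LOWCONF")
print(labelled[["gene", "comp_score", "shrunk_comp", "label_nb"]])
# With these made-up numbers: G1 -> DCG, G2 -> DSG, G3 -> HYPER_LOWCONF, G4 -> SKIP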