DeConveil 0.1.4__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,313 @@
1
+ from cmdstanpy import CmdStanModel
2
+ import numpy as np
3
+ import pandas as pd
4
+ import scipy.stats as st
5
+
6
+
7
+ def fit_one_gene(
8
+ gene_df: pd.DataFrame,
9
+ model: CmdStanModel,
10
+ gene: str | None = None, # optional convenience
11
+ cna: str = "all", # "amp" | "del" | "all"
12
+ et: float = 0.15,
13
+ min_aneup: int = 5,
14
+ min_unique_counts: int = 5,
15
+ min_cn_abs_sum: float = 1.0, # identifiability filter for cna="all"
16
+ chains: int = 4,
17
+ iter_warmup: int = 1000,
18
+ iter_sampling: int = 1000,
19
+ seed: int = 1,
20
+ show_progress: bool = False,
21
+ adapt_delta: float = 0.99,
22
+ max_treedepth: int = 15,
23
+ ):
24
+ """
25
+ Fit Stan NB regression model for a single gene.
26
+ Expects gene_df to be filtered to one gene OR pass gene and full df.
27
+
28
+ Required columns in gene_df:
29
+ gene, expr, copies, purity, stroma, sf, eup_dev_cancer, eup_equiv_cancer
30
+ Optional columns:
31
+ covar (if missing -> set to 'ALL')
32
+ """
33
+ # subset to one gene if gene provided and gene_df contains multiple genes
34
+ df = gene_df.copy()
35
+ if gene is not None and "gene" in df.columns and df["gene"].nunique() > 1:
36
+ df = df.loc[df["gene"] == gene].copy()
37
+ if gene is None and "gene" in df.columns and df["gene"].nunique() == 1:
38
+ gene = str(df["gene"].iloc[0])
39
+
40
+ if df.empty:
41
+ return {"status": "skipped", "gene": gene, "reason": "no_rows_for_gene"}
42
+
43
+ required = {"expr","copies","purity","stroma","sf","eup_dev_cancer","eup_equiv_cancer"}
44
+ missing = required - set(df.columns)
45
+ if missing:
46
+ return {"status": "error", "gene": gene, "reason": f"missing_columns: {sorted(missing)}"}
47
+
48
+ # CNA subset (optional)
49
+ if cna == "amp":
50
+ df = df[df["copies"] > (2 - et)]
51
+ elif cna == "del":
52
+ df = df[df["copies"] < (2 + et)]
53
+ elif cna == "all":
54
+ pass
55
+ else:
56
+ raise ValueError("cna must be 'amp', 'del', or 'all'")
57
+
58
+ if df.empty:
59
+ return {"status": "skipped", "gene": gene, "reason": "no_samples_after_cna_filter"}
60
+
61
+ # basic data QC
62
+ df = df.dropna(subset=["expr","sf","purity","stroma","eup_dev_cancer","eup_equiv_cancer"])
63
+ if df.empty:
64
+ return {"status": "skipped", "gene": gene, "reason": "all_na_after_dropna"}
65
+
66
+ if (df["expr"] < 0).any():
67
+ return {"status": "error", "gene": gene, "reason": "negative_counts"}
68
+
69
+ if not df["purity"].between(0, 1).all():
70
+ return {"status": "error", "gene": gene, "reason": "purity_out_of_bounds"}
71
+
72
+ if not (df["sf"] > 0).all():
73
+ return {"status": "error", "gene": gene, "reason": "nonpositive_sf"}
74
+
75
+ if not (df["eup_equiv_cancer"] > 0).all():
76
+ return {"status": "error", "gene": gene, "reason": "nonpositive_eup_equiv_cancer"}
77
+
78
+ # skip degenerate genes early (prevents phi->inf + treedepth explosions)
79
+ if df["expr"].nunique() < min_unique_counts:
80
+ return {"status": "skipped", "gene": gene, "reason": "too_few_unique_counts"}
81
+
82
+ # aneuploid count check (use copies as cancer_copies analogue)
83
+ n_aneup = int((np.abs(df["copies"].astype(float) - 2.0) > (1.0 - et)).sum())
84
+ if n_aneup < min_aneup or (df["expr"] == 0).all():
85
+ return {"status": "skipped", "gene": gene, "n_aneup": n_aneup, "reason": "low_aneup_or_all_zero"}
86
+
87
+ # identifiability check for cna="all": need some CN deviation mass
88
+ if cna == "all" and df["eup_dev_cancer"].abs().sum() < min_cn_abs_sum:
89
+ return {"status": "skipped", "gene": gene, "n_aneup": n_aneup, "reason": "too_little_cn_variation"}
90
+
91
+ # within one cohort: enforce K=1, covar='ALL'
92
+ df["covar"] = "ALL"
93
+ covar_levels = ["ALL"]
94
+ covar_idx = np.ones(len(df), dtype=int)
95
+
96
+ # per-gene sf scaling
97
+ mean_expr = float(df["expr"].mean())
98
+ df["sf_scaled"] = df["sf"].astype(float) * mean_expr
99
+
100
+ stan_data = {
101
+ "N": int(len(df)),
102
+ "y": df["expr"].astype(int).to_numpy(),
103
+ "K": 1,
104
+ "covar": covar_idx,
105
+ "sf": df["sf_scaled"].to_numpy(dtype=float),
106
+ "purity": df["purity"].to_numpy(dtype=float),
107
+ "stroma": df["stroma"].to_numpy(dtype=float),
108
+ "eup_equiv_cancer": df["eup_equiv_cancer"].to_numpy(dtype=float),
109
+ "eup_dev_cancer": df["eup_dev_cancer"].to_numpy(dtype=float),
110
+ }
111
+
112
+ rng = np.random.default_rng(seed)
113
+ init = {
114
+ "b_scaling": rng.uniform(0.5, 1.5, size=1).tolist(),
115
+ "b_noncancer": rng.uniform(0.5, 1.5, size=1).tolist(),
116
+ "b_deviation": 0.0,
117
+ "phi": 1.0,
118
+ }
119
+
120
+ fit = model.sample(
121
+ data=stan_data,
122
+ chains=chains,
123
+ iter_warmup=iter_warmup,
124
+ iter_sampling=iter_sampling,
125
+ seed=seed,
126
+ inits=init,
127
+ show_progress=show_progress,
128
+ adapt_delta=adapt_delta,
129
+ max_treedepth=max_treedepth,
130
+ )
131
+
132
+ draws = fit.draws_pd()
133
+
134
+ # extract posterior summaries
135
+
136
+ scaling_col = next((c for c in draws.columns if c.startswith("b_scaling")), None)
137
+ if scaling_col is None:
138
+ return {"status": "error", "gene": gene, "reason": "missing_b_scaling_draws"}
139
+
140
+ if "b_deviation" not in draws.columns:
141
+ return {"status": "error", "gene": gene, "reason": "missing_b_deviation_draws"}
142
+
143
+ phi_col = "phi" if "phi" in draws.columns else None
144
+
145
+ b_scaling = draws[scaling_col].to_numpy()
146
+ b_dev = draws["b_deviation"].to_numpy()
147
+
148
+ # z and p
149
+ z_comp = float(b_dev.mean() / b_dev.std(ddof=1))
150
+ p_value = float(2.0 * (1.0 - st.norm.cdf(abs(z_comp))))
151
+
152
+ summ = fit.summary()
153
+
154
+ # robust extraction of Rhat / ESS
155
+ rhat_col = next((c for c in ["R_hat", "Rhat"] if c in summ.columns), None)
156
+ ess_col = next((c for c in ["Ess_bulk", "ESS_bulk", "N_Eff", "Ess"] if c in summ.columns), None)
157
+
158
+ rhat_dev = float(summ.loc["b_deviation", rhat_col]) if (rhat_col and "b_deviation" in summ.index) else np.nan
159
+ ess_dev = float(summ.loc["b_deviation", ess_col]) if (ess_col and "b_deviation" in summ.index) else np.nan
160
+
161
+ # mark borderline fits as warn (you can filter later)
162
+ status = "ok"
163
+ if (not np.isnan(rhat_dev) and rhat_dev > 1.05) or (not np.isnan(ess_dev) and ess_dev < 200):
164
+ status = "warn"
165
+
166
+ out = {
167
+ "status": status,
168
+ "gene": gene,
169
+ "N": int(len(df)),
170
+ "n_aneup": n_aneup,
171
+ "cna": cna,
172
+ # posterior summaries
173
+ "mean_b_scaling": float(b_scaling.mean()),
174
+ "sd_b_scaling": float(b_scaling.std(ddof=1)),
175
+ "mean_b_deviation": float(b_dev.mean()),
176
+ "sd_b_deviation": float(b_dev.std(ddof=1)),
177
+ "z_comp": z_comp,
178
+ "p_value": p_value,
179
+ # optional
180
+ "mean_phi": float(draws[phi_col].mean()) if phi_col else np.nan,
181
+ "Rhat_b_deviation": rhat_dev,
182
+ "ess_b_deviation": ess_dev,
183
+ "covar_levels": covar_levels,
184
+ }
185
+ return out
186
+
187
+
188
+ def postprocess_nb_results(
189
+ res_df: pd.DataFrame,
190
+ alpha: float = 0.05,
191
+ comp_thr: float = 0.5,
192
+ min_aneup_frac: float = 0.30, # >=30% aneuploid samples
193
+ min_scaling: float = 1e-6,
194
+ require_scaling_for_dsg: bool = True,
195
+ warn_suffix: str | None = None, # e.g. "_LOWCONF" to tag warn calls
196
+ ) -> pd.DataFrame:
197
+ """
198
+ Post-process Stan NB regression results.
199
+
200
+ Expected columns in `res_df`:
201
+ - status, gene, N, n_aneup, cna
202
+ - mean_b_scaling, sd_b_scaling
203
+ - mean_b_deviation, sd_b_deviation
204
+ - z_comp, p_value, mean_phi
205
+ - Rhat_b_deviation, ess_b_deviation, covar_levels
206
+
207
+ Adds:
208
+ - aneup_frac : n_aneup / N (CN informativeness proxy)
209
+ - comp_score : normalized deviation = mean_b_deviation / mean_b_scaling
210
+ - signed_comp : sign-normalized comp_score (flip for 'del', keep for 'amp'/'all')
211
+ - shrunk_comp : comp_score shrunk toward 0 by (1 - p_value) [orthogonal score]
212
+ - label_nb : {'DSG','DCG','HYPER','OTHER','SKIP','ERROR'}
213
+
214
+ Notes:
215
+ - Without a per-gene CN direction summary, signed_comp is only partially
216
+ identifiable for cna='all'; here we flip sign for 'del' fits and leave
217
+ 'amp'/'all' unchanged.
218
+ - DCG/HYPER require sufficient magnitude |signed_comp| >= comp_thr
219
+ - DSG small magnitude |signed_comp| <= comp_thr, with sufficient CN signal.
220
+ """
221
+
222
+ df = res_df.copy()
223
+
224
+ # required columns check
225
+ required = {
226
+ "status", "gene", "N", "n_aneup", "cna", "p_value",
227
+ "mean_b_scaling", "mean_b_deviation"
228
+ }
229
+ missing = required - set(df.columns)
230
+ if missing:
231
+ raise ValueError(f"Missing required columns: {sorted(missing)}")
232
+
233
+ # usable rows: include ok + warn
234
+ status_lower = df["status"].astype(str).str.lower()
235
+ usable = status_lower.isin(["ok", "warn"]) & df["p_value"].notna()
236
+
237
+ # default labels
238
+ df["label_nb"] = "OTHER"
239
+ df.loc[status_lower.isin(["skip", "skipped"]), "label_nb"] = "SKIP"
240
+ df.loc[status_lower.isin(["error", "failed", "fail"]), "label_nb"] = "ERROR"
241
+
242
+
243
+ # CN informativeness proxy (fraction aneuploid)
244
+ df["aneup_frac"] = np.nan
245
+ df.loc[usable, "aneup_frac"] = (
246
+ df.loc[usable, "n_aneup"] / df.loc[usable, "N"]
247
+ ).astype(float)
248
+
249
+ # compute compensation scores
250
+ df["comp_score"] = np.nan
251
+ df["signed_comp"] = np.nan
252
+ df["shrunk_comp"] = np.nan # normalized + shrunk score (orthogonal summary)
253
+
254
+ ok2 = (
255
+ usable
256
+ & df["mean_b_scaling"].notna()
257
+ & df["mean_b_deviation"].notna()
258
+ & (df["mean_b_scaling"].abs() > min_scaling)
259
+ & df["aneup_frac"].notna()
260
+ & (df["aneup_frac"] >= min_aneup_frac)
261
+ )
262
+
263
+ # Normalized deviation: beta2* = mean_b_deviation / mean_b_scaling
264
+ df.loc[ok2, "comp_score"] = (
265
+ df.loc[ok2, "mean_b_deviation"] / df.loc[ok2, "mean_b_scaling"]
266
+ )
267
+
268
+ # Sign-normalize by CNA type if available: del -> flip sign
269
+ cna_vals = df.loc[ok2, "cna"].astype(str).str.lower()
270
+ sign = np.ones(len(cna_vals), dtype=float)
271
+ sign[cna_vals == "del"] = -1.0 # treat deletions as negative direction
272
+ # amp/all remain +1
273
+ df.loc[ok2, "signed_comp"] = (
274
+ df.loc[ok2, "comp_score"].to_numpy(dtype=float) * sign
275
+ )
276
+
277
+ # Shrink normalized deviation by pseudo-p (here: raw p_value)
278
+ # s_g ≈ beta2*_g * (1 - p_g)
279
+ pseudo_p = df.loc[ok2, "p_value"].astype(float).clip(0.0, 1.0)
280
+ df.loc[ok2, "shrunk_comp"] = df.loc[ok2, "comp_score"] * (1.0 - pseudo_p)
281
+
282
+ # classification gates
283
+ # Significant deviation (
284
+ is_sig = ok2 & (df["p_value"] <= alpha)
285
+ is_nonsig = ok2 & (df["p_value"] > alpha)
286
+
287
+ # Optional: require non-trivial scaling to call DSG
288
+ scaling_ok = ok2
289
+ if require_scaling_for_dsg:
290
+ scaling_ok = ok2 & (df["mean_b_scaling"] > 0.3)
291
+
292
+ # DCG: significant negative deviation (compensation)
293
+ #df.loc[is_sig & (df["shrunk_comp"] <= -comp_thr), "label_nb"] = "DCG"
294
+ df.loc[df["shrunk_comp"] <= -comp_thr, "label_nb"] = "DCG"
295
+
296
+ # HYPER: significant positive deviation
297
+ #df.loc[is_sig & (df["signed_comp"] >= comp_thr), "label_nb"] = "HYPER"
298
+ df.loc[df["shrunk_comp"] >= comp_thr, "label_nb"] = "HYPER"
299
+
300
+ # DSG: non-significant deviation AND near CN-proportional expectation
301
+ df.loc[
302
+ #is_nonsig & scaling_ok & (df["shrunk_comp"].abs() <= comp_thr),
303
+ scaling_ok & (df["shrunk_comp"].abs() <= comp_thr),
304
+ "label_nb"
305
+ ] = "DSG"
306
+
307
+ # optional: tag warn calls as low confidence
308
+ if warn_suffix:
309
+ is_warn = status_lower.eq("warn")
310
+ mask = is_warn & df["label_nb"].isin(["DSG", "DCG", "HYPER"])
311
+ df.loc[mask, "label_nb"] = df.loc[mask, "label_nb"] + warn_suffix
312
+
313
+ return df