M3Drop 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,139 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ import statsmodels.api as sm
4
+ import scipy.sparse as sp
5
+ from scipy.stats import chi2
6
+
7
+ from .basics import SparseMat3Drop, compute_row_mean_and_var
8
+
9
def BrenneckeGetVariableGenes(expr_mat, spikes=None, suppress_plot=False, fdr=0.1, mt_method="fdr_bh", mt_threshold=0.01, minBiolDisp=0.5, fitMeanQuantile=0.8):
    """
    Implements the method of Brennecke et al. (2013) to identify highly
    variable genes.

    A Gamma GLM with identity link, ``cv2 ~ a0 + a1 / mean``, is fitted on the
    spike-in genes (or on all genes when no spike-ins are given). Every
    non-spike gene is then tested with a chi-squared test against the fitted
    technical noise plus ``minBiolDisp**2`` of biological dispersion, and the
    p-values are adjusted with the Benjamini-Hochberg procedure.

    Parameters
    ----------
    expr_mat : pd.DataFrame, np.ndarray, SparseMat3Drop or scipy sparse matrix
        Normalized or raw (not log-transformed) expression values.
        Columns = samples, rows = genes.
    spikes : list or np.ndarray, optional
        Gene names or row numbers of spike-in genes. ``None`` or an empty
        collection means "no spike-ins": all genes are used both for fitting
        and for testing.
    suppress_plot : bool, default=False
        Accepted for interface compatibility; this function draws nothing.
    fdr : float, default=0.1
        Significance threshold, used only when ``mt_threshold`` is left at
        its default and ``fdr`` is not (see the threshold note below).
    mt_method : str, default="fdr_bh"
        Accepted for interface compatibility; Benjamini-Hochberg is always
        applied.
    mt_threshold : float, default=0.01
        Significance threshold applied to the adjusted p-values.
    minBiolDisp : float, default=0.5
        Minimum biological dispersion; squared before use in the test.
    fitMeanQuantile : float, default=0.8
        Quantile of the mean (among genes with cv2 > 0.3) that a gene must
        reach to be used in fitting.

    Returns
    -------
    pd.DataFrame
        Columns ``Gene``, ``effect.size``, ``p.value`` and ``q.value`` for the
        significant genes, sorted by decreasing effect size.

    Raises
    ------
    TypeError
        If ``expr_mat`` has an unsupported type, or the spike identifiers are
        neither strings nor integers.
    """
    # Threshold note: with every argument at its default this evaluates to
    # mt_threshold (0.01); fdr only wins when it alone was changed.
    threshold = mt_threshold if mt_threshold != 0.01 or fdr == 0.1 else fdr

    if isinstance(expr_mat, np.ndarray):
        matrix_input = pd.DataFrame(expr_mat)
    elif isinstance(expr_mat, (pd.DataFrame, SparseMat3Drop, sp.spmatrix)):
        matrix_input = expr_mat
    else:
        raise TypeError("Unsupported input type for expr_mat.")

    means_all, vars_all = compute_row_mean_and_var(matrix_input, ddof=1)

    # An empty spike list used to crash on spikes[0]; treat it as "no spikes".
    if spikes is not None and len(spikes) > 0:
        if isinstance(spikes[0], str):
            spike_mask = means_all.index.isin(spikes)
        elif isinstance(spikes[0], (int, np.integer)):
            spike_mask = np.zeros(len(means_all), dtype=bool)
            spike_mask[np.asarray(spikes, dtype=int)] = True
        else:
            raise TypeError("Spike identifiers must be strings or integers.")

        meansSp = means_all[spike_mask]
        varsSp = vars_all[spike_mask]
        meansGenes = means_all[~spike_mask]
        varsGenes = vars_all[~spike_mask]
    else:
        meansSp = means_all
        varsSp = vars_all
        meansGenes = means_all
        varsGenes = vars_all

    def safe_cv2(vars_series, mean_series):
        # Squared coefficient of variation; zero-mean genes get cv2 = 0.
        cv2 = vars_series / (mean_series.replace(0, np.nan) ** 2)
        return cv2.replace([np.inf, -np.inf], np.nan).fillna(0)

    cv2Sp = safe_cv2(varsSp, meansSp)
    cv2Genes = safe_cv2(varsGenes, meansGenes)

    # Fit Model
    minMeanForFit = np.quantile(meansSp[cv2Sp > 0.3], fitMeanQuantile) if np.sum(cv2Sp > 0.3) > 0 else 0
    useForFit = meansSp >= minMeanForFit

    if np.sum(useForFit) < 20:
        print("Too few spike-ins exceed minMeanForFit, recomputing using all genes.")
        meansAll = pd.concat([meansGenes, meansSp])
        cv2All = pd.concat([cv2Genes, cv2Sp])
        minMeanForFit = np.quantile(meansAll[cv2All > 0.3], 0.80)
        useForFit = meansSp >= minMeanForFit

    if np.sum(useForFit) < 30:
        print(f"Only {np.sum(useForFit)} spike-ins to be used in fitting, may result in poor fit.")

    # GLM fit
    glm_data = pd.DataFrame({'cv2': cv2Sp[useForFit], 'mean': meansSp[useForFit]})
    glm_data['a1tilde'] = 1 / glm_data['mean']

    fit = sm.GLM(
        glm_data['cv2'],
        sm.add_constant(glm_data['a1tilde']),
        family=sm.families.Gamma(link=sm.families.links.identity())
    ).fit()

    a0 = fit.params['const']
    a1 = fit.params['a1tilde']

    res = cv2Genes - (a0 + a1 / meansGenes)

    # Test
    psia1theta = a1
    minBiolDisp_sq = minBiolDisp**2
    m = matrix_input.shape[1]
    cv2th = a0 + minBiolDisp_sq + a0 * minBiolDisp_sq
    testDenom = (meansGenes * psia1theta + meansGenes**2 * cv2th) / (1 + cv2th / m)

    # The survival function keeps precision for tiny p-values, where
    # 1 - cdf would round to exactly 0.
    p = pd.Series(chi2.sf(varsGenes * (m - 1) / testDenom, m - 1), index=varsGenes.index)

    # Benjamini-Hochberg adjustment, done positionally so duplicated gene
    # labels cannot break it. NaN p-values are left out of the test count,
    # and the running minimum from the largest p-value downwards enforces
    # monotone q-values capped at 1.
    p_values = p.to_numpy(dtype=float)
    tested = ~np.isnan(p_values)
    n_tests = int(tested.sum())
    adjusted = np.full(p_values.shape, np.nan)
    if n_tests > 0:
        tested_p = p_values[tested]
        descending = np.argsort(tested_p)[::-1]
        ranks = np.arange(n_tests, 0, -1)
        scaled = tested_p[descending] * n_tests / ranks
        monotone = np.minimum(1.0, np.minimum.accumulate(scaled))
        restored = np.empty(n_tests, dtype=float)
        restored[descending] = monotone
        adjusted[tested] = restored
    padj = pd.Series(adjusted, index=p.index)

    # Comparisons against NaN are False, so untestable genes are never called.
    sig = padj < threshold

    # Create result table
    table = pd.DataFrame({
        'Gene': meansGenes.index[sig],
        'effect.size': res[sig],
        'p.value': p[sig],
        'q.value': padj[sig]
    })
    table = table.sort_values(by='effect.size', ascending=False)

    return table
@@ -0,0 +1,443 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from scipy.stats import norm
4
+ from scipy.optimize import minimize
5
+ from sklearn.linear_model import LogisticRegression
6
+ import warnings
7
+
8
+
9
def bg__fit_MM(p, s):
    """
    Fits the modified Michaelis-Menten equation to the relationship between
    mean expression and dropout-rate.

    The model is ``p ~ Normal(K / (s + K), sd)``; ``K`` and ``sd`` are found
    by minimising the negative log-likelihood with L-BFGS-B, both bounded
    below by 1e-9.

    Parameters
    ----------
    p : pd.Series
        Dropout rate per gene.
    s : pd.Series
        Mean expression per gene.

    Returns
    -------
    dict
        ``K``, ``Kerr``, ``sd``, ``fitted_err`` (equal to ``sd``),
        ``predictions`` (Series indexed like ``s``), ``SSr`` (sum of squared
        residuals) and ``model`` (a text label).
    """
    # Only genes with both values present take part in the fit.
    usable = ~p.isna() & ~s.isna()
    s_clean = s[usable]
    p_clean = p[usable]

    def neg_log_likelihood(params):
        K, sd = params
        if K <= 0 or sd <= 0:
            return np.inf

        predictions = K / (s_clean + K)
        return -np.sum(norm.logpdf(p_clean, loc=predictions, scale=sd))

    initial_params = [np.median(s_clean), 0.1]

    result = minimize(
        neg_log_likelihood,
        initial_params,
        method='L-BFGS-B',
        bounds=[(1e-9, None), (1e-9, None)]
    )

    K, sd = result.x

    # Predictions and residuals use all data, including rows dropped above.
    predictions = K / (s + K)
    residuals = p - predictions
    ssr = np.sum(residuals**2)

    # Empirical fallback for the error of K, used when no usable curvature
    # estimate is available.
    Kerr = max(0.05 * K, 0.1)

    # L-BFGS-B returns hess_inv as a linear operator that cannot be indexed
    # directly; it has to be densified first. The value is the optimizer's
    # approximation of the inverse Hessian of the negative log-likelihood.
    hess_inv = getattr(result, 'hess_inv', None)
    if hess_inv is not None:
        try:
            if hasattr(hess_inv, 'todense'):
                hess_inv = hess_inv.todense()
            K_var = float(np.asarray(hess_inv)[0, 0])
        except (AttributeError, IndexError, TypeError, ValueError):
            K_var = np.nan
        if np.isfinite(K_var) and K_var > 0:
            Kerr = np.sqrt(K_var)

    # Fitted error is the residual standard deviation
    fitted_err = sd

    return {
        'K': K,
        'Kerr': Kerr,
        'sd': sd,
        'fitted_err': fitted_err,
        'predictions': pd.Series(predictions, index=s.index),
        'SSr': ssr,
        'model': f"Michaelis-Menten (K={K:.2f})"
    }
69
+
70
+
71
def hidden__fit_MM_lognormal(p, s):
    """
    Fit Michaelis-Menten using lognormal approach.
    This consistently underestimates K compared to the main method.

    Each gene with ``0 < p < 1`` implies its own constant
    ``K_obs = p / (1 - p) * s``. ``log(K_obs) - log(K)`` is modelled as
    ``Normal(0, sigma)`` after discarding points further than three
    inter-quartile ranges from the median, and ``K`` and ``sigma`` are found
    with L-BFGS-B.

    Parameters
    ----------
    p : array-like
        Dropout rate per gene.
    s : array-like
        Mean expression per gene; same length as ``p``.

    Returns
    -------
    dict
        ``K``, ``Kerr``, ``fitted_err``, ``predictions``
        (``1 - s / (K + s)``), ``model``, ``SSr`` and ``SAr``.

    Raises
    ------
    ValueError
        If ``p`` and ``s`` differ in length.
    """
    if len(p) != len(s):
        raise ValueError("Error: p and s not same length. Cannot fit Michaelis-Menten.")

    # Clean data - remove invalid values
    mask = (p < 1) & (p > 0) & (~np.isnan(p)) & (~np.isnan(s))
    p_c = p[mask]
    s_c = s[mask]

    if len(p_c) == 0:
        # Return default values if no valid data
        K = 1.0
        predicted = 1 - (s / (K + s))
        residuals = p - predicted
        return {
            'K': K,
            'Kerr': 1.0,
            'fitted_err': 0.25,
            'predictions': predicted,
            'model': f"MMenten K={K:.3f}",
            'SSr': np.sum(residuals**2),
            'SAr': np.sum(np.abs(residuals))
        }

    # Subtracting log(K) shifts every point equally, so the outlier filter
    # does not depend on the parameters; compute it once, not per evaluation.
    log_obs_K = np.log(np.asarray(p_c / (1 - p_c) * s_c, dtype=float))
    q75, q25 = np.percentile(log_obs_K, [75, 25])
    within_range = np.abs(log_obs_K - np.median(log_obs_K)) < 3 * (q75 - q25)
    kept_log_K = log_obs_K[within_range]

    def neg_log_likelihood(params):
        krt, sigma = params
        # A constant penalty for invalid parameters or no usable points
        # leaves the optimizer at its starting values.
        if krt <= 0 or sigma <= 0 or kept_log_K.size == 0:
            return 1e100
        return -np.sum(norm.logpdf(kept_log_K - np.log(krt), 0, sigma))

    # Initial parameters
    initial_params = [6.0, 0.25]

    try:
        result = minimize(
            neg_log_likelihood,
            initial_params,
            method='L-BFGS-B',
            bounds=[(1e-9, None), (1e-9, None)]
        )
    except Exception as err:
        # Best-effort fallback, but no longer silent and no longer swallowing
        # KeyboardInterrupt / SystemExit.
        warnings.warn(f"Lognormal Michaelis-Menten fit failed ({err}); using default parameters.")
        krt = 6.0
        res_err = 0.25
        Kerr = 0.25
    else:
        krt = result.x[0]
        res_err = result.x[1]
        Kerr = max(res_err, 0.1)  # Simplified error estimate

    predicted = 1 - (s / (krt + s))
    residuals = p - predicted

    return {
        'K': krt,
        'Kerr': Kerr,
        'fitted_err': res_err,
        'predictions': predicted,
        'model': f"MMenten K={krt:.3f}",
        'SSr': np.sum(residuals**2),
        'SAr': np.sum(np.abs(residuals))
    }
156
+
157
+
158
def hidden__fit_MM_logistic(p, s):
    """
    Fit Michaelis-Menten using logistic regression.

    An intercept-only logistic model with offset ``-log(s)`` is fitted to the
    genes with ``s > 0``; ``logit(p) = Kcoeff - log(s)`` is the same curve as
    ``p = K / (K + s)`` with ``K = exp(Kcoeff)``.

    Parameters
    ----------
    p : array-like
        Dropout rate per gene.
    s : array-like
        Mean expression per gene; same length as ``p``.

    Returns
    -------
    dict
        ``K``, ``Kerr`` (10% of ``K``, a simplified estimate), ``predictions``
        (left at 0 wherever ``s <= 0``), ``model``, ``SSr`` and ``SAr``.

    Raises
    ------
    ValueError
        If ``p`` and ``s`` differ in length.
    """
    if len(p) != len(s):
        raise ValueError("Error: p and s not same length. Cannot fit Michaelis-Menten.")

    # Remove zero values for log transformation
    mask = s > 0
    s_nozero = s[mask]
    p_nozero = p[mask]

    if len(s_nozero) == 0:
        # Return default values if no valid data; float dtype keeps this
        # branch consistent with the other return paths.
        predicted = np.zeros_like(s, dtype=float)
        residuals = p - predicted
        return {
            'K': 1.0,
            'Kerr': 1.0,
            'predictions': predicted,
            'model': "MMenten K=1.000",
            'SSr': np.sum(residuals**2),
            'SAr': np.sum(np.abs(residuals))
        }

    try:
        # R: glm(p_nozero ~ offset(-1*log(s_nozero)), family="binomial")
        X = np.ones((len(s_nozero), 1))  # Intercept only
        offset = -np.log(s_nozero)

        def logistic_with_offset(beta):
            linear_pred = X @ beta + offset
            # Binomial negative log-likelihood written as
            # log(1 + e^z) - y*z, which cannot overflow in exp().
            return np.sum(np.logaddexp(0, linear_pred) - p_nozero * linear_pred)

        result = minimize(logistic_with_offset, [0.0], method='BFGS')

        Kcoeff = result.x[0]
        krt = np.exp(Kcoeff)

        # Error estimate (simplified)
        Kerr = 0.1 * krt

        # Predictions
        predicted = np.zeros_like(s, dtype=float)
        linear_pred = Kcoeff - np.log(s_nozero)
        predicted[mask] = 1 / (1 + np.exp(-linear_pred))

    except Exception as err:
        # Best-effort fallback, but no longer silent and no longer swallowing
        # KeyboardInterrupt / SystemExit.
        warnings.warn(f"Logistic Michaelis-Menten fit failed ({err}); using default parameters.")
        krt = 1.0
        Kerr = 1.0
        predicted = np.zeros_like(s, dtype=float)

    residuals = p - predicted

    return {
        'K': krt,
        'Kerr': Kerr,
        'predictions': predicted,
        'model': f"MMenten K={krt:.3f}",
        'SSr': np.sum(residuals**2),
        'SAr': np.sum(np.abs(residuals))
    }
233
+
234
+
235
def bg__fit_logistic(p, s):
    """
    Fits logistic regression to the relationship between mean expression and dropout rate.

    The model ``logit(p) = B0 + B1 * log(s)`` is fitted by minimising the
    binomial negative log-likelihood with BFGS on the genes with ``s > 0``.

    Parameters
    ----------
    p : array-like
        Dropout rate per gene.
    s : array-like
        Mean expression per gene; same length as ``p``.

    Returns
    -------
    dict
        ``predictions`` (left at 0 wherever ``s <= 0``), ``B0``, ``B1``,
        ``model``, ``SSr`` and ``SAr``.

    Raises
    ------
    ValueError
        If ``p`` and ``s`` differ in length.
    """
    if len(p) != len(s):
        raise ValueError("Error: p and s not same length. Cannot fit Logistic Regression.")

    # Remove zero values for log transformation
    mask = s > 0
    s_nozero = s[mask]
    p_nozero = p[mask]

    if len(s_nozero) == 0:
        # Return default values if no valid data; float dtype keeps this
        # branch consistent with the other return paths.
        fullpredictions = np.zeros_like(s, dtype=float)
        res = fullpredictions - p
        return {
            'predictions': fullpredictions,
            'B0': 0.0,
            'B1': 0.0,
            'model': "Logistic Intercept=0.000 Coeff=0.000",
            'SSr': np.sum(res**2),
            'SAr': np.sum(np.abs(res))
        }

    try:
        # Fit logistic regression: p_nozero ~ log(s_nozero)
        X = np.column_stack([np.ones(len(s_nozero)), np.log(s_nozero)])

        def logistic_loss(beta):
            linear_pred = X @ beta
            # Binomial negative log-likelihood written as
            # log(1 + e^z) - y*z, which cannot overflow in exp().
            return np.sum(np.logaddexp(0, linear_pred) - p_nozero * linear_pred)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            result = minimize(logistic_loss, [0.0, 0.0], method='BFGS')

        B0, B1 = result.x

        # Generate predictions
        fullpredictions = np.zeros_like(s, dtype=float)
        linear_pred = B0 + B1 * np.log(s_nozero)
        fullpredictions[mask] = 1 / (1 + np.exp(-linear_pred))

    except Exception as err:
        # Best-effort fallback, but no longer silent and no longer swallowing
        # KeyboardInterrupt / SystemExit.
        warnings.warn(f"Logistic regression fit failed ({err}); using default parameters.")
        B0, B1 = 0.0, 0.0
        fullpredictions = np.zeros_like(s, dtype=float)

    res = fullpredictions - p

    return {
        'predictions': fullpredictions,
        'B0': B0,
        'B1': B1,
        'model': f"Logistic Intercept={B0:.3f} Coeff={B1:.3f}",
        'SSr': np.sum(res**2),
        'SAr': np.sum(np.abs(res))
    }
301
+
302
+
303
def bg__fit_ZIFA(p, s):
    """
    Fits double exponential (ZIFA-style) model to the relationship between
    mean expression and dropout rate.

    The model is ``p = exp(-lambda * s**2)``, fitted as a no-intercept least
    squares regression of ``log(p)`` on ``s**2``. Zero dropout rates are
    replaced by a tenth of the smallest positive rate (or 1e-10 when there is
    none) before taking the logarithm.

    Parameters
    ----------
    p : pd.Series or np.ndarray
        Dropout rate per gene.
    s : pd.Series or np.ndarray
        Mean expression per gene; same length as ``p``.

    Returns
    -------
    dict
        ``lambda``, ``Lerr`` (10% of ``|lambda|``, a simplified estimate),
        ``fitted_err``, ``predictions``, ``model``, ``SSr`` and ``SAr``.

    Raises
    ------
    ValueError
        If ``p`` and ``s`` differ in length.
    """
    if len(p) != len(s):
        raise ValueError("Error: p and s not same length. Cannot fit double exponential.")

    # Handle zero dropout rates
    p_nozero = p.copy()
    p_nozero[p == 0] = np.min(p[p > 0]) / 10 if np.any(p > 0) else 1e-10

    try:
        # Fit: log(p_nozero) ~ -1 + s^2 (no intercept, s-squared term only)
        # This is equivalent to: p = exp(-lambda * s^2)
        design = (np.asarray(s, dtype=float) ** 2).reshape(-1, 1)
        target = np.log(np.asarray(p_nozero, dtype=float))

        # Reject empty or non-finite data explicitly instead of letting NaN
        # propagate into the coefficient.
        if design.size == 0 or not (np.all(np.isfinite(design)) and np.all(np.isfinite(target))):
            raise ValueError("no finite data to fit")

        # Single-column least squares done directly with numpy, so the fit no
        # longer depends on an import hidden inside the try block.
        coef = np.linalg.lstsq(design, target, rcond=None)[0][0]

        # Extract lambda (negative of coefficient since we want exp(-lambda*s^2))
        lambda_param = -coef

        # Error estimates (simplified)
        Lerr = 0.1 * abs(lambda_param)
        res_err = 0.1

    except (TypeError, ValueError, np.linalg.LinAlgError) as err:
        # Best-effort fallback, now reported instead of silent.
        warnings.warn(f"Double exponential fit failed ({err}); using default parameters.")
        lambda_param = 1e-6
        Lerr = 1e-7
        res_err = 0.1

    # Generate predictions
    predicted = np.exp(-lambda_param * s**2)
    residuals = p - predicted

    return {
        'lambda': lambda_param,
        'Lerr': Lerr,
        'fitted_err': res_err,
        'predictions': predicted,
        'model': f"p ~ e^(-lambda*S^2) lambda={lambda_param:.2e}",
        'SSr': np.sum(residuals**2),
        'SAr': np.sum(np.abs(residuals))
    }
355
+
356
+
357
def bg__dropout_plot_base(expr_mat, xlim=None, suppress_plot=False):
    """
    Prepare the base of a dropout plot.

    No drawing happens yet: the per-gene variables are computed with
    ``bg__calc_variables`` and, unless ``suppress_plot`` is set, a notice
    about the missing plotting support is printed. ``xlim`` is accepted but
    not used.

    Returns a dict with the single key ``'gene_info'``.
    """
    from .basics import bg__calc_variables as _calc_variables

    plot_state = {'gene_info': _calc_variables(expr_mat)}

    if suppress_plot:
        return plot_state

    # Stand-in until real plotting exists.
    print("Plotting functionality not yet implemented.")
    return plot_state
371
+
372
+
373
def bg__add_model_to_plot(model_fit, base_plot, lty=1, lwd=2.5, col="black", legend_loc="topright"):
    """
    Stand-in for drawing a fitted model curve on a dropout plot.

    Nothing is drawn. When ``base_plot`` is ``None`` the function returns
    ``None``; otherwise it prints which model would be added (taken from
    ``model_fit['model']``, or 'Unknown') and returns a fixed dummy legend
    rectangle. The styling arguments are accepted but not used.
    """
    if base_plot is not None:
        label = model_fit.get('model', 'Unknown')
        print(f"Would add {label} model to plot")

        # Dummy legend geometry, identical on every call.
        legend_rect = dict(left=0.7, top=0.9, w=0.2, h=0.1)
        return {'rect': legend_rect}

    return None
393
+
394
+
395
def M3DropDropoutModels(expr_mat, xlim=None, suppress_plot=False):
    """
    Fit three dropout models to one expression matrix: Michaelis-Menten,
    logistic regression and the ZIFA double exponential.

    Parameters
    ----------
    expr_mat : pd.DataFrame
        Expression matrix with genes as rows and cells as columns.
    xlim : tuple, optional
        X-axis limits, forwarded to the base-plot helper.
    suppress_plot : bool, default=False
        When true, the model curves are not added to the plot.

    Returns
    -------
    dict
        The three fit results under the keys:
        - MMFit: Michaelis-Menten fit
        - LogiFit: Logistic regression fit
        - ExpoFit: ZIFA exponential fit
    """
    base_plot = bg__dropout_plot_base(expr_mat, xlim=xlim, suppress_plot=suppress_plot)

    # Dropout rate (p) and mean expression (s) per gene.
    p = base_plot['gene_info']['p']
    s = base_plot['gene_info']['s']

    fits = {
        'MMFit': bg__fit_MM(p, s),
        'LogiFit': bg__fit_logistic(p, s),  # Called SCDE in R (Single Cell Differential Expression)
        'ExpoFit': bg__fit_ZIFA(p, s),
    }

    if suppress_plot:
        return fits

    # (result key, line type, colour) for each curve, in drawing order.
    curve_styles = (
        ('MMFit', 1, "black"),
        ('LogiFit', 2, "magenta3"),
        ('ExpoFit', 3, "red"),
    )

    sizeloc = None
    for key, lty, col in curve_styles:
        if sizeloc is None:
            legend_loc = "topright"
        else:
            # Each later legend is positioned from the previous legend box.
            legend_loc = (sizeloc['rect']['left'] + sizeloc['rect']['w'],
                          sizeloc['rect']['top'] - sizeloc['rect']['h'] - 0.05)
        sizeloc = bg__add_model_to_plot(fits[key], base_plot, lty=lty, lwd=2.5, col=col,
                                        legend_loc=legend_loc)

    return fits
@@ -0,0 +1,99 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ import warnings
4
+
5
def NBumiCoexpression(counts, fit, gene_list=None, method="both"):
    """
    Ranks genes based on co-expression.

    Tests for co-expression using the normal approximation of a binomial test.
    For every gene the per-cell probability of a zero is computed from the
    fit as ``(1 + mu / size) ** -size`` with ``mu = tjs * tis / total``; the
    observed number of cells in which a pair of genes is jointly zero and/or
    jointly non-zero is then compared with its expectation.

    Parameters
    ----------
    counts : pd.DataFrame or np.ndarray
        Raw count matrix. Rows are looked up by gene name with ``.loc``.
    fit : dict
        Output from `NBumiFitModel`. Read here: ``fit['vals']['tjs']``,
        ``fit['vals']['tis']``, ``fit['vals']['total']`` and ``fit['sizes']``
        (taken positionally, in the same order as ``tjs``).
    gene_list : list of str, optional
        Set of gene names to test coexpression of. Defaults to every gene in
        ``fit['vals']['tjs']``.
    method : {"both", "on", "off"}, default="both"
        Type of co-expression to test. "on" for co-expression, "off" for
        co-absence, "both" for either.

    Returns
    -------
    pd.DataFrame
        A symmetric matrix of Z-scores for each pair of genes.

    Raises
    ------
    ValueError
        If ``method`` is not one of the three supported values.
    """
    if method not in ("both", "on", "off"):
        raise ValueError(f"method must be 'both', 'on' or 'off', got {method!r}.")

    # Set up
    tjs = fit['vals']['tjs']
    tis = fit['vals']['tis']
    total = fit['vals']['total']
    # Positional access that works for both arrays and label-indexed Series.
    sizes = np.asarray(fit['sizes'])

    if gene_list is None:
        gene_list = list(tjs.index)

    if isinstance(counts, np.ndarray):
        counts = pd.DataFrame(counts)

    # Keep only genes known to the fit
    name_gene = [gene_name for gene_name in gene_list if gene_name in tjs.index]
    missing_count = len(gene_list) - len(name_gene)
    if missing_count > 0:
        warnings.warn(f"Warning: {missing_count} genes not found, check your gene list is correct.")

    # Per-gene, per-cell probability of observing a zero
    p_zero = np.empty((len(name_gene), counts.shape[1]), dtype=float)
    for row, gene_name in enumerate(name_gene):
        gid = tjs.index.get_loc(gene_name)
        mu_is = tjs.iloc[gid] * tis / total
        p_zero[row, :] = (1 + mu_is / sizes[gid]) ** (-sizes[gid])

    # Zero / non-zero indicators, extracted once per gene rather than once
    # per pair.
    is_zero = [np.asarray(counts.loc[gene_name, :] == 0) for gene_name in name_gene]
    is_nonzero = [np.asarray(counts.loc[gene_name, :] != 0) for gene_name in name_gene]

    def z_score(observed, expected):
        # Normal approximation; the NaN-skipping sums match what pandas'
        # Series.sum did in the original per-pair code.
        variance = expected * (1 - expected)
        return (observed - np.nansum(expected)) / np.sqrt(np.nansum(variance))

    # Initialize Z-score matrix
    n_genes = len(name_gene)
    z_mat = np.full((n_genes, n_genes), -1.0)

    for i in range(n_genes):
        for j in range(i, n_genes):
            p_g1 = p_zero[i]
            p_g2 = p_zero[j]

            if method == "off":
                # Both zero
                z = z_score(np.sum(is_zero[i] & is_zero[j]), p_g1 * p_g2)
            elif method == "on":
                # Both nonzero
                z = z_score(np.sum(is_nonzero[i] & is_nonzero[j]), (1 - p_g1) * (1 - p_g2))
            else:
                # Either: jointly zero or jointly non-zero
                obs_either = np.sum(is_zero[i] & is_zero[j]) + np.sum(is_nonzero[i] & is_nonzero[j])
                z = z_score(obs_either, p_g1 * p_g2 + (1 - p_g1) * (1 - p_g2))

            z_mat[i, j] = z_mat[j, i] = z

    # Convert to DataFrame with proper row/column names
    gene_index = pd.Index(name_gene)
    return pd.DataFrame(z_mat, index=gene_index, columns=gene_index)