M3Drop 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- m3Drop/Brennecke_implementation.py +139 -0
- m3Drop/Curve_fitting.py +443 -0
- m3Drop/DANB_Coexpression.py +99 -0
- m3Drop/DANB_HVG.py +85 -0
- m3Drop/Extremes.py +423 -0
- m3Drop/M3D_Imputation.py +93 -0
- m3Drop/NB_UMI.py +1458 -0
- m3Drop/Normalization.py +257 -0
- m3Drop/NormalizationGPU.py +201 -0
- m3Drop/Other_FS_functions.py +431 -0
- m3Drop/Plotting_fxns.py +270 -0
- m3Drop/Simulations_Functions.py +335 -0
- m3Drop/Simulations_GPU.py +411 -0
- m3Drop/Threeway_ProportionalArea_VennDiagrams.py +305 -0
- m3Drop/Traditional_DE.py +421 -0
- m3Drop/__init__.py +219 -0
- m3Drop/basics.py +726 -0
- m3Drop/scanpy.py +461 -0
- m3drop-0.2.1.dist-info/METADATA +133 -0
- m3drop-0.2.1.dist-info/RECORD +23 -0
- m3drop-0.2.1.dist-info/WHEEL +5 -0
- m3drop-0.2.1.dist-info/licenses/LICENSE +21 -0
- m3drop-0.2.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import statsmodels.api as sm
|
|
4
|
+
import scipy.sparse as sp
|
|
5
|
+
from scipy.stats import chi2
|
|
6
|
+
|
|
7
|
+
from .basics import SparseMat3Drop, compute_row_mean_and_var
|
|
8
|
+
|
|
9
|
+
def BrenneckeGetVariableGenes(expr_mat, spikes=None, suppress_plot=False, fdr=0.1, mt_method="fdr_bh", mt_threshold=0.01, minBiolDisp=0.5, fitMeanQuantile=0.8):
    """
    Identify highly variable genes with the method of Brennecke et al. (2013).

    A technical-noise curve ``cv2 = a0 + a1 / mean`` is fitted with a Gamma GLM
    (identity link) to the spike-in genes, or to every gene when no spike-ins
    are given. Each non-spike gene is then tested with a chi-squared test for
    variance exceeding the fitted technical noise plus ``minBiolDisp``.

    Parameters
    ----------
    expr_mat : pd.DataFrame, np.ndarray, SparseMat3Drop or scipy.sparse matrix
        Normalized or raw (not log-transformed) expression values.
        Columns = samples, rows = genes.
    spikes : list or np.ndarray, optional
        Gene names or row numbers of spike-in genes. The type of the first
        element decides whether the entries are treated as names or positions.
    suppress_plot : bool, default=False
        Accepted for interface compatibility; this function draws no plot.
    fdr : float, default=0.1
        Significance threshold. Only used when it differs from 0.1 while
        ``mt_threshold`` is left at its default of 0.01.
    mt_method : str, default="fdr_bh"
        Multiple testing correction method, passed to
        ``statsmodels.stats.multitest.multipletests``.
    mt_threshold : float, default=0.01
        Threshold applied to the adjusted p-values (see ``fdr`` for the one
        case in which ``fdr`` is used instead).
    minBiolDisp : float, default=0.5
        Minimum biological dispersion; it is squared before entering the
        test threshold.
    fitMeanQuantile : float, default=0.8
        Quantile of the mean expression (among spike-ins with cv2 > 0.3) that
        a spike-in must reach to be used in the fit.

    Returns
    -------
    pd.DataFrame
        One row per significant gene with columns ``Gene``, ``effect.size``
        (residual above the fitted curve), ``p.value`` and ``q.value``,
        sorted by decreasing effect size.

    Raises
    ------
    TypeError
        If ``expr_mat`` has an unsupported type or the spike identifiers are
        neither strings nor integers.
    """

    # mt_threshold wins unless it was left at its default while fdr was changed.
    threshold = mt_threshold if mt_threshold != 0.01 or fdr == 0.1 else fdr

    if isinstance(expr_mat, np.ndarray):
        matrix_input = pd.DataFrame(expr_mat)
    elif isinstance(expr_mat, (pd.DataFrame, SparseMat3Drop, sp.spmatrix)):
        matrix_input = expr_mat
    else:
        raise TypeError("Unsupported input type for expr_mat.")

    # The code below relies on both results being pandas Series that share a
    # gene index (it uses .index, .replace and boolean-mask indexing on them).
    means_all, vars_all = compute_row_mean_and_var(matrix_input, ddof=1)

    if spikes is not None:
        if isinstance(spikes[0], str):
            spike_mask = means_all.index.isin(spikes)
        elif isinstance(spikes[0], (int, np.integer)):
            spike_mask = np.zeros(len(means_all), dtype=bool)
            spike_mask[np.asarray(spikes, dtype=int)] = True
        else:
            raise TypeError("Spike identifiers must be strings or integers.")

        meansSp = means_all[spike_mask]
        varsSp = vars_all[spike_mask]
        meansGenes = means_all[~spike_mask]
        varsGenes = vars_all[~spike_mask]
    else:
        # Without spike-ins every gene is used both for fitting and testing.
        meansSp = means_all
        varsSp = vars_all
        meansGenes = means_all
        varsGenes = vars_all

    def safe_cv2(vars_series, mean_series):
        # Squared coefficient of variation; zero-mean genes get 0 instead of inf/NaN.
        cv2 = vars_series / (mean_series.replace(0, np.nan) ** 2)
        return cv2.replace([np.inf, -np.inf], np.nan).fillna(0)

    cv2Sp = safe_cv2(varsSp, meansSp)
    cv2Genes = safe_cv2(varsGenes, meansGenes)

    # Fit Model
    minMeanForFit = np.quantile(meansSp[cv2Sp > 0.3], fitMeanQuantile) if np.sum(cv2Sp > 0.3) > 0 else 0
    useForFit = meansSp >= minMeanForFit

    if np.sum(useForFit) < 20:
        print("Too few spike-ins exceed minMeanForFit, recomputing using all genes.")
        meansAll = pd.concat([meansGenes, meansSp])
        cv2All = pd.concat([cv2Genes, cv2Sp])
        minMeanForFit = np.quantile(meansAll[cv2All > 0.3], 0.80)
        useForFit = meansSp >= minMeanForFit

    if np.sum(useForFit) < 30:
        print(f"Only {np.sum(useForFit)} spike-ins to be used in fitting, may result in poor fit.")

    # GLM fit of cv2 ~ a0 + a1 / mean
    glm_data = pd.DataFrame({'cv2': cv2Sp[useForFit], 'mean': meansSp[useForFit]})
    glm_data['a1tilde'] = 1 / glm_data['mean']

    fit = sm.GLM(
        glm_data['cv2'],
        sm.add_constant(glm_data['a1tilde']),
        family=sm.families.Gamma(link=sm.families.links.identity())
    ).fit()

    a0 = fit.params['const']
    a1 = fit.params['a1tilde']

    # Effect size: how far each gene lies above the fitted technical-noise curve.
    res = cv2Genes - (a0 + a1 / meansGenes)

    # Test
    psia1theta = a1
    minBiolDisp_sq = minBiolDisp**2
    m = matrix_input.shape[1]
    cv2th = a0 + minBiolDisp_sq + a0 * minBiolDisp_sq
    testDenom = (meansGenes * psia1theta + meansGenes**2 * cv2th) / (1 + cv2th / m)

    # chi2.sf is the upper tail computed directly; "1 - cdf" rounds every
    # p-value below ~1e-16 to exactly 0.
    p = pd.Series(chi2.sf(varsGenes * (m - 1) / testDenom, m - 1), index=varsGenes.index)

    # Multiple-testing adjustment. Genes with an undefined p-value (e.g. 0/0
    # for all-zero genes) are left out of the correction and keep a NaN
    # q-value, so they neither inflate the number of tests nor pass the filter.
    p_values = p.to_numpy(dtype=float)
    q_values = np.full(p_values.shape, np.nan)
    tested = ~np.isnan(p_values)
    if tested.any():
        q_values[tested] = sm.stats.multipletests(p_values[tested], method=mt_method)[1]
    padj = pd.Series(q_values, index=p.index)

    # Comparisons against NaN are False, so untested genes are never significant.
    sig = padj < threshold

    # Create result table
    table = pd.DataFrame({
        'Gene': meansGenes.index[sig],
        'effect.size': res[sig],
        'p.value': p[sig],
        'q.value': padj[sig]
    })
    table = table.sort_values(by='effect.size', ascending=False)

    return table
|
m3Drop/Curve_fitting.py
ADDED
|
@@ -0,0 +1,443 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from scipy.stats import norm
|
|
4
|
+
from scipy.optimize import minimize
|
|
5
|
+
from sklearn.linear_model import LogisticRegression
|
|
6
|
+
import warnings
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def bg__fit_MM(p, s):
    """
    Fits the modified Michaelis-Menten equation ``p = K / (s + K)`` to the
    relationship between mean expression and dropout-rate.

    ``K`` and the residual standard deviation ``sd`` are estimated jointly by
    maximising a Gaussian likelihood of ``p`` around the curve with L-BFGS-B.

    Parameters
    ----------
    p : pd.Series
        Dropout rate per gene.
    s : pd.Series
        Mean expression per gene, indexed like ``p``.

    Returns
    -------
    dict
        ``K`` and ``sd`` (fitted parameters), ``Kerr`` (standard error of K
        taken from the optimiser's inverse-Hessian estimate, or
        ``max(0.05 * K, 0.1)`` when that estimate is unusable),
        ``fitted_err`` (equal to ``sd``), ``predictions`` (pd.Series indexed
        like ``s``), ``SSr`` (sum of squared residuals) and ``model``
        (a text label).
    """
    # Only genes with both values present take part in the fit.
    keep = ~p.isna() & ~s.isna()
    s_clean = s[keep]
    p_clean = p[keep]

    def neg_log_likelihood(params):
        K, sd = params
        if K <= 0 or sd <= 0:
            return np.inf

        predictions = K / (s_clean + K)
        log_likelihood = np.sum(norm.logpdf(p_clean, loc=predictions, scale=sd))
        return -log_likelihood

    # Start K at the median expression, but keep it inside the optimiser
    # bounds: the median is 0 when most genes have zero mean expression.
    initial_params = [max(float(np.median(s_clean)), 1e-9), 0.1]

    result = minimize(
        neg_log_likelihood,
        initial_params,
        method='L-BFGS-B',
        bounds=[(1e-9, None), (1e-9, None)]
    )

    K, sd = result.x

    # Calculate predictions for all data
    predictions = K / (s + K)

    # Calculate residuals and error estimates
    residuals = p - predictions
    ssr = np.sum(residuals**2)

    # Standard error of K from the inverse Hessian of the negative
    # log-likelihood. L-BFGS-B reports it as a linear operator that cannot be
    # indexed directly, so it is converted to a dense array first.
    Kerr = max(0.05 * K, 0.1)  # empirical fallback
    hess_inv = getattr(result, 'hess_inv', None)
    if hess_inv is not None:
        try:
            dense = hess_inv.todense() if hasattr(hess_inv, 'todense') else hess_inv
            K_var = float(np.asarray(dense)[0, 0])
        except (TypeError, ValueError, IndexError, AttributeError):
            K_var = np.nan
        if np.isfinite(K_var) and K_var > 0:
            Kerr = float(np.sqrt(K_var))

    # Fitted error is the residual standard deviation
    fitted_err = sd

    return {
        'K': K,
        'Kerr': Kerr,
        'sd': sd,
        'fitted_err': fitted_err,
        'predictions': pd.Series(predictions, index=s.index),
        'SSr': ssr,
        'model': f"Michaelis-Menten (K={K:.2f})"
    }
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def hidden__fit_MM_lognormal(p, s):
    """
    Fit Michaelis-Menten using lognormal approach.

    Each gene yields an observed constant ``p / (1 - p) * s``; K is chosen so
    that the log-ratios of those constants to K look like zero-mean normal
    noise. Log-ratios further than 3 IQR from their median are ignored.

    Original author's note: this consistently underestimates K compared to
    the main method.

    Parameters
    ----------
    p : array-like
        Dropout rate per gene.
    s : array-like
        Mean expression per gene, same length as ``p``.

    Returns
    -------
    dict
        ``K``, ``Kerr``, ``fitted_err``, ``predictions`` (``1 - s / (K + s)``),
        ``model`` (text label), ``SSr`` and ``SAr`` (sums of squared and of
        absolute residuals). When no gene has ``0 < p < 1`` the defaults
        ``K = 1.0``, ``Kerr = 1.0``, ``fitted_err = 0.25`` are returned.

    Raises
    ------
    ValueError
        If ``p`` and ``s`` differ in length.
    """
    if len(p) != len(s):
        raise ValueError("Error: p and s not same length. Cannot fit Michaelis-Menten.")

    # Clean data - remove invalid values (p must be strictly inside (0, 1)
    # so that p / (1 - p) is finite and positive)
    mask = (p < 1) & (p > 0) & (~np.isnan(p)) & (~np.isnan(s))
    p_c = p[mask]
    s_c = s[mask]

    if len(p_c) == 0:
        # Return default values if no valid data
        K = 1.0
        predicted = 1 - (s / (K + s))
        residuals = p - predicted
        return {
            'K': K,
            'Kerr': 1.0,
            'fitted_err': 0.25,
            'predictions': predicted,
            'model': f"MMenten K={K:.3f}",
            'SSr': np.sum(residuals**2),
            'SAr': np.sum(np.abs(residuals))
        }

    def neg_log_likelihood(params):
        krt, sigma = params
        if krt <= 0 or sigma <= 0:
            return 1e100

        try:
            obs_Ks = p_c / (1 - p_c) * s_c
            R = np.log(obs_Ks) - np.log(krt)

            # Filter based on spread (simplified stand-in for the density
            # filter of the R implementation)
            Q75, Q25 = np.percentile(R, [75, 25])
            IQR = Q75 - Q25

            # Use all data points within reasonable range
            valid_mask = np.abs(R - np.median(R)) < 3 * IQR
            R_filtered = R[valid_mask]

            if len(R_filtered) == 0:
                return 1e100

            log_likelihood = np.sum(norm.logpdf(R_filtered, 0, sigma))
            return -log_likelihood
        except Exception:
            # A numerical failure marks this parameter pair as very unlikely;
            # KeyboardInterrupt/SystemExit are deliberately not trapped.
            return 1e100

    # Initial parameters
    initial_params = [6.0, 0.25]

    try:
        result = minimize(
            neg_log_likelihood,
            initial_params,
            method='L-BFGS-B',
            bounds=[(1e-9, None), (1e-9, None)]
        )

        krt = result.x[0]
        res_err = result.x[1]
        Kerr = max(res_err, 0.1)  # Simplified error estimate

    except Exception:
        # Best-effort fallback to the starting values if the optimiser fails.
        krt = 6.0
        res_err = 0.25
        Kerr = 0.25

    predicted = 1 - (s / (krt + s))
    residuals = p - predicted

    return {
        'K': krt,
        'Kerr': Kerr,
        'fitted_err': res_err,
        'predictions': predicted,
        'model': f"MMenten K={krt:.3f}",
        'SSr': np.sum(residuals**2),
        'SAr': np.sum(np.abs(residuals))
    }
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def hidden__fit_MM_logistic(p, s):
    """
    Fit Michaelis-Menten using logistic regression.

    Fits the intercept-only logistic model ``logit(p) = Kcoeff - log(s)`` on
    genes with ``s > 0`` and reports ``K = exp(Kcoeff)``.

    Parameters
    ----------
    p : array-like
        Dropout rate per gene.
    s : array-like
        Mean expression per gene, same length as ``p``.

    Returns
    -------
    dict
        ``K``, ``Kerr`` (set to ``0.1 * K``), ``predictions`` (left at 0 for
        genes with ``s <= 0``), ``model`` (text label), ``SSr`` and ``SAr``
        (sums of squared and of absolute residuals). ``K = Kerr = 1.0`` with
        all-zero predictions is returned when no gene has ``s > 0`` or when
        the fit fails.

    Raises
    ------
    ValueError
        If ``p`` and ``s`` differ in length.
    """
    if len(p) != len(s):
        raise ValueError("Error: p and s not same length. Cannot fit Michaelis-Menten.")

    # Remove zero values for log transformation
    mask = s > 0
    s_nozero = s[mask]
    p_nozero = p[mask]

    if len(s_nozero) == 0:
        # Return default values if no valid data
        predicted = np.zeros_like(s)
        residuals = p - predicted
        return {
            'K': 1.0,
            'Kerr': 1.0,
            'predictions': predicted,
            'model': "MMenten K=1.000",
            'SSr': np.sum(residuals**2),
            'SAr': np.sum(np.abs(residuals))
        }

    try:
        # Logistic regression with a fixed offset of -log(s):
        # logit(p) = K_coeff - log(s)
        X = np.ones((len(s_nozero), 1))  # Intercept only
        offset = -np.log(s_nozero)

        # Manual logistic regression with offset
        def logistic_with_offset(beta, X, offset, y):
            linear_pred = X @ beta + offset
            p_pred = 1 / (1 + np.exp(-linear_pred))
            p_pred = np.clip(p_pred, 1e-15, 1-1e-15)  # Avoid log(0)
            return -np.sum(y * np.log(p_pred) + (1-y) * np.log(1-p_pred))

        initial_beta = [0.0]
        result = minimize(
            lambda beta: logistic_with_offset(beta, X, offset, p_nozero),
            initial_beta,
            method='BFGS'
        )

        Kcoeff = result.x[0]
        krt = np.exp(Kcoeff)

        # Error estimate (simplified)
        Kerr = 0.1 * krt

        # Predictions
        predicted = np.zeros_like(s, dtype=float)
        linear_pred = Kcoeff - np.log(s_nozero)
        predicted[mask] = 1 / (1 + np.exp(-linear_pred))

    except Exception:
        # Best-effort fallback values; KeyboardInterrupt/SystemExit are
        # deliberately not trapped.
        krt = 1.0
        Kerr = 1.0
        predicted = np.zeros_like(s, dtype=float)

    residuals = p - predicted

    return {
        'K': krt,
        'Kerr': Kerr,
        'predictions': predicted,
        'model': f"MMenten K={krt:.3f}",
        'SSr': np.sum(residuals**2),
        'SAr': np.sum(np.abs(residuals))
    }
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def bg__fit_logistic(p, s):
    """
    Fits logistic regression to the relationship between mean expression and dropout rate.

    The model ``logit(p) = B0 + B1 * log(s)`` is fitted on genes with
    ``s > 0`` by minimising the binomial negative log-likelihood with BFGS.

    Parameters
    ----------
    p : array-like
        Dropout rate per gene.
    s : array-like
        Mean expression per gene, same length as ``p``.

    Returns
    -------
    dict
        ``predictions`` (left at 0 for genes with ``s <= 0``), ``B0``
        (intercept), ``B1`` (coefficient of ``log(s)``), ``model`` (text
        label), ``SSr`` and ``SAr`` (sums of squared and of absolute
        residuals). ``B0 = B1 = 0.0`` with all-zero predictions is returned
        when no gene has ``s > 0`` or when the fit fails.

    Raises
    ------
    ValueError
        If ``p`` and ``s`` differ in length.
    """
    if len(p) != len(s):
        raise ValueError("Error: p and s not same length. Cannot fit Logistic Regression.")

    # Remove zero values for log transformation
    mask = s > 0
    s_nozero = s[mask]
    p_nozero = p[mask]

    if len(s_nozero) == 0:
        # Return default values if no valid data
        fullpredictions = np.zeros_like(s)
        res = fullpredictions - p
        return {
            'predictions': fullpredictions,
            'B0': 0.0,
            'B1': 0.0,
            'model': "Logistic Intercept=0.000 Coeff=0.000",
            'SSr': np.sum(res**2),
            'SAr': np.sum(np.abs(res))
        }

    try:
        # Fit logistic regression: p_nozero ~ log(s_nozero)
        X = np.column_stack([np.ones(len(s_nozero)), np.log(s_nozero)])

        def logistic_loss(beta, X, y):
            linear_pred = X @ beta
            p_pred = 1 / (1 + np.exp(-linear_pred))
            p_pred = np.clip(p_pred, 1e-15, 1-1e-15)  # Avoid log(0)
            return -np.sum(y * np.log(p_pred) + (1-y) * np.log(1-p_pred))

        initial_beta = [0.0, 0.0]
        # Warnings raised while optimising are silenced for this call only.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            result = minimize(
                lambda beta: logistic_loss(beta, X, p_nozero),
                initial_beta,
                method='BFGS'
            )

        B0, B1 = result.x

        # Generate predictions
        fullpredictions = np.zeros_like(s, dtype=float)
        linear_pred = B0 + B1 * np.log(s_nozero)
        fullpredictions[mask] = 1 / (1 + np.exp(-linear_pred))

    except Exception:
        # Best-effort fallback values; KeyboardInterrupt/SystemExit are
        # deliberately not trapped.
        B0, B1 = 0.0, 0.0
        fullpredictions = np.zeros_like(s, dtype=float)

    res = fullpredictions - p

    return {
        'predictions': fullpredictions,
        'B0': B0,
        'B1': B1,
        'model': f"Logistic Intercept={B0:.3f} Coeff={B1:.3f}",
        'SSr': np.sum(res**2),
        'SAr': np.sum(np.abs(res))
    }
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def bg__fit_ZIFA(p, s):
    """
    Fits double exponential (ZIFA-style) model to the relationship between
    mean expression and dropout rate.

    The model is ``p = exp(-lambda * s^2)``. It is fitted as the
    least-squares regression of ``log(p)`` on ``s^2`` without intercept,
    whose solution is ``sum(s^2 * log p) / sum(s^4)``.

    Parameters
    ----------
    p : pd.Series or np.ndarray
        Dropout rate per gene. Exact zeros are replaced by a tenth of the
        smallest positive value (or 1e-10 if there is none) before taking
        the logarithm.
    s : pd.Series or np.ndarray
        Mean expression per gene, same length as ``p``.

    Returns
    -------
    dict
        ``lambda``, ``Lerr`` (set to ``0.1 * |lambda|``), ``fitted_err``
        (fixed at 0.1), ``predictions``, ``model`` (text label), ``SSr`` and
        ``SAr`` (sums of squared and of absolute residuals). When the input
        is empty or contains non-finite values the fallback
        ``lambda = 1e-6``, ``Lerr = 1e-7`` is used.

    Raises
    ------
    ValueError
        If ``p`` and ``s`` differ in length.
    """
    if len(p) != len(s):
        raise ValueError("Error: p and s not same length. Cannot fit double exponential.")

    p_values = np.asarray(p, dtype=float)
    s_values = np.asarray(s, dtype=float)

    # Handle zero dropout rates so that log() stays finite
    positive = p_values > 0
    p_nozero = p_values.copy()
    p_nozero[p_values == 0] = p_values[positive].min() / 10 if positive.any() else 1e-10

    # Fit: log(p_nozero) ~ -1 + s^2 (no intercept, s-squared term only)
    # This is equivalent to: p = exp(-lambda * s^2)
    x = s_values ** 2
    y = np.log(p_nozero)
    sum_x_sq = np.dot(x, x)

    fittable = (
        x.size > 0
        and bool(np.all(np.isfinite(x)))
        and bool(np.all(np.isfinite(y)))
        and bool(np.isfinite(sum_x_sq))
    )

    if fittable:
        # Closed-form one-parameter least squares. With every s equal to 0
        # the slope is not identified; 0 is the minimum-norm solution.
        slope = np.dot(x, y) / sum_x_sq if sum_x_sq > 0 else 0.0

        # Extract lambda (negative of coefficient since we want exp(-lambda*s^2))
        lambda_param = -slope

        # Error estimates (simplified)
        Lerr = 0.1 * abs(lambda_param)
        res_err = 0.1
    else:
        # Fallback values
        lambda_param = 1e-6
        Lerr = 1e-7
        res_err = 0.1

    # Generate predictions
    predicted = np.exp(-lambda_param * s**2)

    residuals = p - predicted

    return {
        'lambda': lambda_param,
        'Lerr': Lerr,
        'fitted_err': res_err,
        'predictions': predicted,
        'model': f"p ~ e^(-lambda*S^2) lambda={lambda_param:.2e}",
        'SSr': np.sum(residuals**2),
        'SAr': np.sum(np.abs(residuals))
    }
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
def bg__dropout_plot_base(expr_mat, xlim=None, suppress_plot=False):
    """
    Prepare the base of a dropout plot.

    Plotting itself is not implemented: the function computes the per-gene
    variables with ``bg__calc_variables`` and, unless ``suppress_plot`` is
    set, prints a notice saying so. ``xlim`` is accepted but not used.

    Returns
    -------
    dict
        ``{'gene_info': <result of bg__calc_variables(expr_mat)>}``.
    """
    from .basics import bg__calc_variables

    base = {'gene_info': bg__calc_variables(expr_mat)}

    if suppress_plot:
        return base

    # Stand-in for the real drawing code.
    print("Plotting functionality not yet implemented.")
    return base
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
def bg__add_model_to_plot(model_fit, base_plot, lty=1, lwd=2.5, col="black", legend_loc="topright"):
    """
    Placeholder for drawing a fitted model curve onto a dropout plot.

    Nothing is drawn. When ``base_plot`` is ``None`` the function returns
    ``None`` without output; otherwise it prints which model would be added
    (taken from ``model_fit['model']``, "Unknown" if absent) and returns a
    fixed, dummy legend rectangle. The styling arguments are accepted but
    not used.

    Returns
    -------
    dict or None
        ``{'rect': {'left': 0.7, 'top': 0.9, 'w': 0.2, 'h': 0.1}}`` or
        ``None`` when there is no base plot.
    """
    if base_plot is not None:
        model_label = model_fit.get('model', 'Unknown')
        print(f"Would add {model_label} model to plot")

        # Dummy legend geometry; no real legend exists yet.
        legend_rect = dict(left=0.7, top=0.9, w=0.2, h=0.1)
        return {'rect': legend_rect}

    return None
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
def M3DropDropoutModels(expr_mat, xlim=None, suppress_plot=False):
    """
    Fit three dropout models to one expression matrix and return all fits.

    The dropout rate ``p`` and mean expression ``s`` are taken from the
    ``gene_info`` produced by ``bg__dropout_plot_base``; the
    Michaelis-Menten, logistic-regression and ZIFA double-exponential models
    are then fitted to them in that order.

    Parameters
    ----------
    expr_mat : pd.DataFrame
        Expression matrix with genes as rows and cells as columns.
    xlim : tuple, optional
        X-axis limits, forwarded to the base-plot helper.
    suppress_plot : bool, default=False
        When False, each fit is also handed to ``bg__add_model_to_plot``,
        with every legend placed relative to the previous one.

    Returns
    -------
    dict
        ``MMFit`` (Michaelis-Menten), ``LogiFit`` (logistic regression) and
        ``ExpoFit`` (ZIFA exponential) fit results.
    """
    base_plot = bg__dropout_plot_base(expr_mat, xlim=xlim, suppress_plot=suppress_plot)

    gene_info = base_plot['gene_info']
    p = gene_info['p']
    s = gene_info['s']

    fits = {
        'MMFit': bg__fit_MM(p, s),
        'LogiFit': bg__fit_logistic(p, s),  # named SCDE in the R code
        'ExpoFit': bg__fit_ZIFA(p, s),
    }

    if suppress_plot:
        return fits

    # (fit key, line type, colour) in drawing order
    curve_styles = (
        ('MMFit', 1, "black"),
        ('LogiFit', 2, "magenta3"),
        ('ExpoFit', 3, "red"),
    )
    sizeloc = None
    for fit_key, line_type, colour in curve_styles:
        if sizeloc is None:
            where = "topright"
        else:
            # Position the next legend from the rectangle of the previous one.
            rect = sizeloc['rect']
            where = (rect['left'] + rect['w'], rect['top'] - rect['h'] - 0.05)
        sizeloc = bg__add_model_to_plot(fits[fit_key], base_plot, lty=line_type, lwd=2.5,
                                        col=colour, legend_loc=where)

    return fits
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import warnings
|
|
4
|
+
|
|
5
|
+
def NBumiCoexpression(counts, fit, gene_list=None, method="both"):
    """
    Ranks genes based on co-expression.

    Tests for co-expression using the normal approximation of a binomial test:
    for each pair of genes the observed number of cells in which both genes
    are zero and/or both are non-zero is compared with the number expected
    from the per-cell dropout probabilities of the fitted model.

    Parameters
    ----------
    counts : pd.DataFrame or np.ndarray
        Raw count matrix, genes as rows and cells as columns. Rows are looked
        up by label, so the row labels must contain every tested gene.
    fit : dict
        Output from `NBumiFitModel`. This function reads
        ``fit['vals']['tjs']`` (indexed by gene), ``fit['vals']['tis']``,
        ``fit['vals']['total']`` and ``fit['sizes']``.
    gene_list : list of str, optional
        Set of gene names to test coexpression of. Defaults to every gene in
        ``fit['vals']['tjs']``. Names missing from the fit are dropped with a
        warning.
    method : {"both", "on", "off"}, default="both"
        Type of co-expression to test. "on" for co-expression, "off" for
        co-absence, "both" for either.

    Returns
    -------
    pd.DataFrame
        A symmetric matrix of Z-scores for each pair of genes, labelled by
        gene on both axes.

    Raises
    ------
    ValueError
        If ``method`` is not one of "both", "on" or "off".
    """
    # Validate up front: an unknown method would otherwise leave the z-score
    # undefined and only fail deep inside the pair loop.
    if method not in ("both", "on", "off"):
        raise ValueError(f"Unrecognized method '{method}': must be one of 'both', 'on' or 'off'.")

    tjs = fit['vals']['tjs']
    tis = fit['vals']['tis']
    total = fit['vals']['total']
    sizes = fit['sizes']

    if gene_list is None:
        gene_list = list(tjs.index)

    if isinstance(counts, np.ndarray):
        counts = pd.DataFrame(counts)

    n_cells = counts.shape[1]

    # Per-cell probability of a zero for every requested gene found in the fit.
    name_gene = []
    dropout_rows = []
    for gene_name in gene_list:
        if gene_name not in tjs.index:
            continue
        gid = tjs.index.get_loc(gene_name)
        mu_is = tjs.iloc[gid] * tis / total
        p_is = (1 + mu_is / sizes[gid])**(-sizes[gid])
        dropout_rows.append(np.broadcast_to(np.asarray(p_is, dtype=float), (n_cells,)))
        name_gene.append(gene_name)

    missing_count = len(gene_list) - len(name_gene)
    if missing_count > 0:
        warnings.warn(f"Warning: {missing_count} genes not found, check your gene list is correct.")

    n_genes = len(name_gene)
    pd_gene = np.vstack(dropout_rows) if n_genes else np.empty((0, n_cells))

    # Look every expression row up once instead of once per gene pair.
    is_zero = np.empty((n_genes, n_cells), dtype=bool)
    for row, gene_name in enumerate(name_gene):
        is_zero[row, :] = np.asarray(counts.loc[gene_name, :] == 0)
    is_nonzero = ~is_zero

    def z_score(observed, expected):
        # Normal approximation: per-cell Bernoulli variances are summed.
        # NaN probabilities are skipped in both sums.
        variance = expected * (1 - expected)
        return (observed - np.nansum(expected)) / np.sqrt(np.nansum(variance))

    z_mat = np.full((n_genes, n_genes), -1.0)

    for i in range(n_genes):
        for j in range(i, n_genes):
            if method == "off":
                # Both zero
                observed = np.sum(is_zero[i] & is_zero[j])
                expected = pd_gene[i] * pd_gene[j]
            elif method == "on":
                # Both nonzero
                observed = np.sum(is_nonzero[i] & is_nonzero[j])
                expected = (1 - pd_gene[i]) * (1 - pd_gene[j])
            else:
                # Either: cells where the two genes agree (both zero or both nonzero)
                observed = np.sum(is_zero[i] & is_zero[j]) + np.sum(is_nonzero[i] & is_nonzero[j])
                expected = pd_gene[i] * pd_gene[j] + (1 - pd_gene[i]) * (1 - pd_gene[j])

            z_mat[i, j] = z_mat[j, i] = z_score(observed, expected)

    # Convert to DataFrame with proper row/column names
    return pd.DataFrame(z_mat, index=name_gene, columns=name_gene)
|