edgepython 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,474 @@
1
+ # This code was written by Claude (Anthropic). The project was directed by Lior Pachter.
2
+ """
3
+ Smoothing functions for edgePython.
4
+
5
+ Port of edgeR's locfitByCol and loessByCol.
6
+ """
7
+
8
+ import numpy as np
9
+ from numba import njit, prange
10
+
11
+
12
@njit(cache=True)
def _locfit_degree0_point(i, x_sorted, y_sorted, w_sorted, n, ncols, nn, result_sorted):
    """Local-constant (degree 0) fit at the sorted data point ``i``.

    Averages the rows of ``y_sorted`` over a window of up to ``nn``
    neighbours of ``x_sorted[i]``, using tricube kernel weights multiplied
    by ``w_sorted``, and writes the result to ``result_sorted[i, :]``.  The
    weighted sums are accumulated in place, so that row must be zero on
    entry.
    """
    centre = x_sorted[i]

    # Grow the half-open window [left, right) one neighbour at a time,
    # always taking the closer side; ties go to the left.
    left = i
    right = i + 1
    while right - left < nn:
        room_left = left > 0
        room_right = right < n
        if not (room_left or room_right):
            break
        if room_left and room_right:
            take_left = centre - x_sorted[left - 1] <= x_sorted[right] - centre
        else:
            take_left = room_left
        if take_left:
            left -= 1
        else:
            right += 1

    # Bandwidth: the largest distance inside the window, nudged up so the
    # division below is never by zero.
    bandwidth = 0.0
    for k in range(left, right):
        dist = abs(x_sorted[k] - centre)
        if dist > bandwidth:
            bandwidth = dist
    bandwidth += 1e-10

    total = 0.0
    for k in range(left, right):
        ratio = abs(x_sorted[k] - centre) / bandwidth
        if ratio >= 1.0:
            kern = 0.0
        else:
            base = 1.0 - ratio * ratio * ratio
            kern = base * base * base
        kern *= w_sorted[k]
        total += kern
        if kern > 0.0:
            for j in range(ncols):
                result_sorted[i, j] += kern * y_sorted[k, j]

    if total > 0.0:
        for j in range(ncols):
            result_sorted[i, j] /= total
        return

    # No usable weight in the window: return the observation unchanged.
    for j in range(ncols):
        result_sorted[i, j] = y_sorted[i, j]
59
+
60
+
61
@njit(cache=True, parallel=True)
def _locfit_degree0_kernel(x_sorted, y_sorted, w_sorted, n, ncols, nn, result_sorted):
    """Apply the degree-0 (Nadaraya-Watson) point fit to each of the ``n`` sorted rows.

    Rows are visited in a ``prange`` loop; each iteration only writes its own
    row of ``result_sorted``.
    """
    for row in prange(n):
        # The loop index is cast to int64 before being handed to the point routine.
        _locfit_degree0_point(
            np.int64(row), x_sorted, y_sorted, w_sorted, n, ncols, nn, result_sorted
        )
67
+
68
+
69
@njit(cache=True)
def _locfit_degree0_grid_point(i, x_eval, x_sorted, y_sorted, w_sorted, n, ncols, nn, result_grid):
    """Local-constant (degree 0) fit at the evaluation point ``x_eval[i]``.

    Uses a window of up to ``nn`` sorted data points nearest to the
    evaluation point, weights them with a tricube kernel multiplied by
    ``w_sorted``, and writes the weighted mean of ``y_sorted`` to
    ``result_grid[i, :]``.  The sums are accumulated in place, so that row
    must be zero on entry.

    When the window carries no positive total weight, the row of the nearest
    data point in the window is used instead.  This matches the fallback of
    the degree-1 grid routine; without it the grid value would silently stay
    at zero.
    """
    xv = x_eval[i]

    # Lower-bound binary search: first index whose x is not below xv.
    first = 0
    last = n
    while first < last:
        mid = (first + last) // 2
        if x_sorted[mid] < xv:
            first = mid + 1
        else:
            last = mid

    # Grow the half-open window [lo, hi) outwards from the insertion point,
    # always taking the closer side; ties go to the left.
    lo = first
    hi = first
    while hi - lo < nn:
        can_left = lo > 0
        can_right = hi < n
        if can_left and can_right:
            if xv - x_sorted[lo - 1] <= x_sorted[hi] - xv:
                lo -= 1
            else:
                hi += 1
        elif can_left:
            lo -= 1
        elif can_right:
            hi += 1
        else:
            break

    # Bandwidth: largest distance in the window, nudged up to avoid 0/0.
    h = 0.0
    for k in range(lo, hi):
        d = abs(x_sorted[k] - xv)
        if d > h:
            h = d
    h += 1e-10

    sw = 0.0
    for k in range(lo, hi):
        u = abs(x_sorted[k] - xv) / h
        if u >= 1.0:
            wk = 0.0
        else:
            t = 1.0 - u * u * u
            wk = t * t * t
        wk *= w_sorted[k]
        sw += wk
        if wk > 0.0:
            for j in range(ncols):
                result_grid[i, j] += wk * y_sorted[k, j]

    if sw > 0.0:
        for j in range(ncols):
            result_grid[i, j] /= sw
    elif hi > lo:
        # Degenerate window (e.g. all prior weights zero): fall back to the
        # nearest data point, overwriting any partial accumulation.
        best_k = lo
        best_d = abs(x_sorted[lo] - xv)
        for k in range(lo + 1, hi):
            d = abs(x_sorted[k] - xv)
            if d < best_d:
                best_d = d
                best_k = k
        for j in range(ncols):
            result_grid[i, j] = y_sorted[best_k, j]
124
+
125
+
126
@njit(cache=True, parallel=True)
def _locfit_degree0_grid_kernel(x_eval, x_sorted, y_sorted, w_sorted, n, ncols, nn, result_grid):
    """Apply the degree-0 grid-point fit to every entry of ``x_eval``.

    Evaluation points are visited in a ``prange`` loop; each iteration only
    writes its own row of ``result_grid``.
    """
    n_eval = len(x_eval)
    for g in prange(n_eval):
        # The loop index is cast to int64 before being handed to the point routine.
        _locfit_degree0_grid_point(
            np.int64(g), x_eval, x_sorted, y_sorted, w_sorted, n, ncols, nn, result_grid
        )
133
+
134
+
135
@njit(cache=True)
def _locfit_degree1_point(i, x_sorted, y_sorted, w_sorted, n, ncols, nn, result_sorted):
    """Local-linear (degree 1) fit at the sorted data point ``i``.

    Solves a weighted least-squares line through a window of up to ``nn``
    neighbours of ``x_sorted[i]`` (tricube kernel times ``w_sorted``) and
    stores the fitted intercept at that point in ``result_sorted[i, :]``.
    If the 2x2 normal-equation determinant is numerically zero, the observed
    row is returned unchanged.
    """
    centre = x_sorted[i]

    # Grow the half-open window [left, right), always taking the closer
    # side; ties go to the left.
    left = i
    right = i + 1
    while right - left < nn:
        room_left = left > 0
        room_right = right < n
        if not (room_left or room_right):
            break
        if room_left and room_right:
            take_left = centre - x_sorted[left - 1] <= x_sorted[right] - centre
        else:
            take_left = room_left
        if take_left:
            left -= 1
        else:
            right += 1

    # Bandwidth: largest distance in the window, nudged up to avoid 0/0.
    bandwidth = 0.0
    for k in range(left, right):
        dist = abs(x_sorted[k] - centre)
        if dist > bandwidth:
            bandwidth = dist
    bandwidth += 1e-10

    # Weighted moments of the centred covariate (independent of the column).
    s0 = 0.0
    s1 = 0.0
    s2 = 0.0
    for k in range(left, right):
        ratio = abs(x_sorted[k] - centre) / bandwidth
        if ratio >= 1.0:
            kern = 0.0
        else:
            base = 1.0 - ratio * ratio * ratio
            kern = base * base * base
        kern *= w_sorted[k]
        offset = x_sorted[k] - centre
        s0 += kern
        s1 += kern * offset
        s2 += kern * offset * offset

    det = s0 * s2 - s1 * s1
    if abs(det) < 1e-300:
        # Singular system: keep the raw observation.
        for j in range(ncols):
            result_sorted[i, j] = y_sorted[i, j]
        return

    # Per column: build the right-hand side and apply Cramer's rule for the
    # intercept.  Kernel weights are recomputed rather than stored.
    for j in range(ncols):
        t0 = 0.0
        t1 = 0.0
        for k in range(left, right):
            ratio = abs(x_sorted[k] - centre) / bandwidth
            if ratio >= 1.0:
                kern = 0.0
            else:
                base = 1.0 - ratio * ratio * ratio
                kern = base * base * base
            kern *= w_sorted[k]
            offset = x_sorted[k] - centre
            t0 += kern * y_sorted[k, j]
            t1 += kern * offset * y_sorted[k, j]
        result_sorted[i, j] = (s2 * t0 - s1 * t1) / det
199
+
200
+
201
@njit(cache=True, parallel=True)
def _locfit_degree1_kernel(x_sorted, y_sorted, w_sorted, n, ncols, nn, result_sorted):
    """Apply the degree-1 (local linear) point fit to each of the ``n`` sorted rows.

    Rows are visited in a ``prange`` loop; each iteration only writes its own
    row of ``result_sorted``.
    """
    for row in prange(n):
        # The loop index is cast to int64 before being handed to the point routine.
        _locfit_degree1_point(
            np.int64(row), x_sorted, y_sorted, w_sorted, n, ncols, nn, result_sorted
        )
207
+
208
+
209
@njit(cache=True)
def _locfit_degree1_grid_point(i, x_eval, x_sorted, y_sorted, w_sorted, n, ncols, nn, result_grid):
    """Local-linear (degree 1) fit at the evaluation point ``x_eval[i]``.

    Solves a weighted least-squares line through a window of up to ``nn``
    sorted data points nearest to the evaluation point (tricube kernel times
    ``w_sorted``) and stores the fitted intercept in ``result_grid[i, :]``.
    If the 2x2 normal-equation determinant is numerically zero, the row of
    the nearest data point in the window is used instead.
    """
    xv = x_eval[i]

    # Lower-bound binary search: first index whose x is not below xv.
    first = 0
    last = n
    while first < last:
        mid = (first + last) // 2
        if x_sorted[mid] < xv:
            first = mid + 1
        else:
            last = mid

    # Grow the half-open window [left, right) outwards from the insertion
    # point, always taking the closer side; ties go to the left.
    left = first
    right = first
    while right - left < nn:
        room_left = left > 0
        room_right = right < n
        if not (room_left or room_right):
            break
        if room_left and room_right:
            take_left = xv - x_sorted[left - 1] <= x_sorted[right] - xv
        else:
            take_left = room_left
        if take_left:
            left -= 1
        else:
            right += 1

    # Bandwidth: largest distance in the window, nudged up to avoid 0/0.
    bandwidth = 0.0
    for k in range(left, right):
        dist = abs(x_sorted[k] - xv)
        if dist > bandwidth:
            bandwidth = dist
    bandwidth += 1e-10

    # Weighted moments of the centred covariate (independent of the column).
    s0 = 0.0
    s1 = 0.0
    s2 = 0.0
    for k in range(left, right):
        ratio = abs(x_sorted[k] - xv) / bandwidth
        if ratio >= 1.0:
            kern = 0.0
        else:
            base = 1.0 - ratio * ratio * ratio
            kern = base * base * base
        kern *= w_sorted[k]
        offset = x_sorted[k] - xv
        s0 += kern
        s1 += kern * offset
        s2 += kern * offset * offset

    det = s0 * s2 - s1 * s1
    if abs(det) < 1e-300:
        # Singular system: copy the row of the closest data point.
        nearest = left
        nearest_dist = abs(x_sorted[left] - xv)
        for k in range(left + 1, right):
            dist = abs(x_sorted[k] - xv)
            if dist < nearest_dist:
                nearest_dist = dist
                nearest = k
        for j in range(ncols):
            result_grid[i, j] = y_sorted[nearest, j]
        return

    # Per column: build the right-hand side and apply Cramer's rule for the
    # intercept.  Kernel weights are recomputed rather than stored.
    for j in range(ncols):
        t0 = 0.0
        t1 = 0.0
        for k in range(left, right):
            ratio = abs(x_sorted[k] - xv) / bandwidth
            if ratio >= 1.0:
                kern = 0.0
            else:
                base = 1.0 - ratio * ratio * ratio
                kern = base * base * base
            kern *= w_sorted[k]
            offset = x_sorted[k] - xv
            t0 += kern * y_sorted[k, j]
            t1 += kern * offset * y_sorted[k, j]
        result_grid[i, j] = (s2 * t0 - s1 * t1) / det
291
+
292
+
293
@njit(cache=True, parallel=True)
def _locfit_degree1_grid_kernel(x_eval, x_sorted, y_sorted, w_sorted, n, ncols, nn, result_grid):
    """Apply the degree-1 grid-point fit to every entry of ``x_eval``.

    Evaluation points are visited in a ``prange`` loop; each iteration only
    writes its own row of ``result_grid``.
    """
    n_eval = len(x_eval)
    for g in prange(n_eval):
        # The loop index is cast to int64 before being handed to the point routine.
        _locfit_degree1_grid_point(
            np.int64(g), x_eval, x_sorted, y_sorted, w_sorted, n, ncols, nn, result_grid
        )
300
+
301
+
302
def locfit_by_col(y, x=None, weights=1, span=0.5, degree=0):
    """Local regression smoother for columns of a matrix.

    Port of edgeR's locfitByCol. Uses a simple local weighted regression
    since Python doesn't have a direct locfit equivalent.

    Parameters
    ----------
    y : array-like
        Values to smooth. A 1-D input is treated as a single column.
    x : array-like, optional
        Covariate with exactly one value per row of ``y``
        (defaults to ``1..n``).
    weights : float or array-like
        Prior weights, broadcast to one value per row.
    span : float
        Fraction of the rows used in each local neighbourhood.
    degree : int
        ``0`` selects the local-constant fit; any other value selects the
        local-linear fit.

    Returns
    -------
    numpy.ndarray
        ``(n, ncols)`` array of smoothed values in the original row order.
        When ``span * n < 2`` or ``n <= 1`` a copy of ``y`` is returned
        without smoothing.

    Raises
    ------
    ValueError
        If ``weights`` cannot be broadcast to the number of rows, or if
        smoothing is performed and ``x`` is not 1-D with one value per row.
    """
    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(-1, 1)
    n, ncols = y.shape

    weights = np.broadcast_to(np.asarray(weights, dtype=np.float64), n).copy()
    if x is None:
        x = np.arange(1, n + 1, dtype=np.float64)
    x = np.asarray(x, dtype=np.float64)

    # Too few points for a local fit: hand the data back untouched.
    if span * n < 2 or n <= 1:
        return y.copy()

    # The compiled kernels index x and weights with n taken from y and do
    # not bounds-check, so a mismatched covariate must be rejected here.
    if x.ndim != 1 or x.shape[0] != n:
        raise ValueError(
            f"x must be 1-D with one value per row of y "
            f"(expected length {n}, got shape {x.shape})"
        )

    # Sort by x for efficient windowing
    order = np.argsort(x)
    x_sorted = x[order].copy()
    y_sorted = y[order].copy()
    w_sorted = weights[order].copy()

    # Neighbourhood size: at least two points.
    nn = max(2, int(round(span * n)))

    # Adaptive grid path for large n: evaluate at a fixed number of grid
    # points, then interpolate linearly back onto the data.
    if n > 1000:
        grid_size = 200
        x_eval = np.linspace(x_sorted[0], x_sorted[-1], grid_size)
        result_grid = np.zeros((grid_size, ncols), dtype=np.float64)
        if degree == 0:
            _locfit_degree0_grid_kernel(x_eval, x_sorted, y_sorted, w_sorted, n, ncols, nn, result_grid)
        else:
            _locfit_degree1_grid_kernel(x_eval, x_sorted, y_sorted, w_sorted, n, ncols, nn, result_grid)
        result_sorted = np.empty((n, ncols), dtype=np.float64)
        for j in range(ncols):
            result_sorted[:, j] = np.interp(x_sorted, x_eval, result_grid[:, j])
    else:
        # The point kernels accumulate in place, hence the zero start.
        result_sorted = np.zeros((n, ncols), dtype=np.float64)
        if degree == 0:
            _locfit_degree0_kernel(x_sorted, y_sorted, w_sorted, n, ncols, nn, result_sorted)
        else:
            _locfit_degree1_kernel(x_sorted, y_sorted, w_sorted, n, ncols, nn, result_sorted)

    # Undo the sort so rows line up with the caller's y.
    result = np.empty_like(result_sorted)
    result[order] = result_sorted
    return result
353
+
354
+
355
@njit(cache=True)
def _loess_point(i, x, y, n, ncols, nspan, order, rank, fitted, leverages):
    """Degree-0 loess fit and leverage for the (unsorted) data point ``i``.

    ``order`` lists row indices by increasing ``x`` and ``rank`` is its
    inverse.  A window of up to ``nspan`` neighbours is grown around the
    position of ``i`` in that ordering, and tricube weights normalised to
    sum to one give the fitted row ``fitted[i, :]`` (accumulated in place,
    so it must be zero on entry).  ``leverages[i]`` receives the normalised
    weight the point places on itself.
    """
    xi = x[i]

    # Grow the half-open window [left, right) in sorted order, always taking
    # the closer side; ties go to the left.
    left = rank[i]
    right = left + 1
    while right - left < nspan:
        room_left = left > 0
        room_right = right < n
        if not (room_left or room_right):
            break
        if room_left and room_right:
            gap_left = abs(x[order[left - 1]] - xi)
            gap_right = abs(x[order[right]] - xi)
            take_left = gap_left <= gap_right
        else:
            take_left = room_left
        if take_left:
            left -= 1
        else:
            right += 1

    # Bandwidth: largest distance in the window, nudged up to avoid 0/0.
    bandwidth = 0.0
    for k in range(left, right):
        dist = abs(x[order[k]] - xi)
        if dist > bandwidth:
            bandwidth = dist
    bandwidth += 1e-10

    # First pass: total kernel weight, needed for normalisation.
    total = 0.0
    for k in range(left, right):
        ratio = abs(x[order[k]] - xi) / bandwidth
        if ratio >= 1.0:
            kern = 0.0
        else:
            base = 1.0 - ratio * ratio * ratio
            kern = base * base * base
        total += kern

    if not (total > 0.0):
        # No usable weight: keep the observation and give it full leverage.
        for j in range(ncols):
            fitted[i, j] = y[i, j]
        leverages[i] = 1.0
        return

    # Second pass: accumulate the normalised weighted mean.
    own_share = 0.0
    for k in range(left, right):
        row = order[k]
        ratio = abs(x[row] - xi) / bandwidth
        if ratio >= 1.0:
            kern = 0.0
        else:
            base = 1.0 - ratio * ratio * ratio
            kern = base * base * base
        share = kern / total
        for j in range(ncols):
            fitted[i, j] += share * y[row, j]
        if row == i:
            own_share = share
    leverages[i] = own_share
416
+
417
+
418
@njit(cache=True, parallel=True)
def _loess_kernel(x, y, n, ncols, nspan, fitted, leverages):
    """Run the degree-0 loess point fit, with leverages, for all ``n`` rows.

    The sort order of ``x`` and its inverse permutation are built once, then
    every row is processed in a ``prange`` loop; each iteration only writes
    its own row of ``fitted`` and its own entry of ``leverages``.
    """
    order = np.argsort(x)

    # rank[row] is the position of ``row`` within the sorted order.
    rank = np.empty(n, dtype=np.int64)
    for pos in range(n):
        rank[order[pos]] = pos

    for row in prange(n):
        # The loop index is cast to int64 before being handed to the point routine.
        _loess_point(np.int64(row), x, y, n, ncols, nspan, order, rank, fitted, leverages)
429
+
430
+
431
def loess_by_col(y, x=None, span=0.5):
    """Fit a lowess curve of degree 0 to each column of a matrix.

    Port of edgeR's loessByCol. Returns fitted values and leverages.

    Parameters
    ----------
    y : array-like
        Matrix of values. A 1-D input is treated as a single column.
    x : array-like, optional
        Covariate with exactly one value per row of ``y``
        (defaults to 1:nrow).
    span : float
        Span for smoothing; each window holds ``min(int(span * n), n)``
        points.

    Returns
    -------
    dict with 'fitted_values' and 'leverages'.
        When the window would hold at most one point, ``y`` is returned
        unsmoothed with all leverages equal to one.

    Raises
    ------
    ValueError
        If smoothing is performed and ``x`` is not 1-D with one value per
        row of ``y``.
    """
    y = np.asarray(y, dtype=np.float64)
    if y.ndim == 1:
        y = y.reshape(-1, 1)
    n = y.shape[0]
    ncols = y.shape[1]

    if x is None:
        x = np.arange(1, n + 1, dtype=np.float64)
    x = np.asarray(x, dtype=np.float64).copy()

    nspan = min(int(span * n), n)
    if nspan <= 1:
        return {
            'fitted_values': y.copy(),
            'leverages': np.ones(n)
        }

    # The compiled kernel indexes x and its sort order with n taken from y
    # and does not bounds-check, so a mismatched covariate is rejected here.
    if x.ndim != 1 or x.shape[0] != n:
        raise ValueError(
            f"x must be 1-D with one value per row of y "
            f"(expected length {n}, got shape {x.shape})"
        )

    # The kernel accumulates into these arrays, hence the zero start.
    fitted = np.zeros((n, ncols), dtype=np.float64)
    leverages = np.zeros(n, dtype=np.float64)

    _loess_kernel(x, y, n, ncols, nspan, fitted, leverages)

    return {
        'fitted_values': fitted,
        'leverages': leverages
    }