consenrich 0.7.11b2__cp314-cp314-macosx_15_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of consenrich might be problematic.

Files changed (38)
  1. consenrich/.dylibs/libomp.dylib +0 -0
  2. consenrich/__init__.py +11 -0
  3. consenrich/cconsenrich.c +50610 -0
  4. consenrich/cconsenrich.cpython-314-darwin.so +0 -0
  5. consenrich/cconsenrich.pyx +1065 -0
  6. consenrich/consenrich.py +1802 -0
  7. consenrich/constants.py +172 -0
  8. consenrich/core.py +2068 -0
  9. consenrich/data/ce10.sizes +6 -0
  10. consenrich/data/ce10_blacklist.bed +100 -0
  11. consenrich/data/ce10_sparse.bed +11828 -0
  12. consenrich/data/ce11.sizes +6 -0
  13. consenrich/data/ce11_blacklist.bed +97 -0
  14. consenrich/data/ce11_sparse.bed +11828 -0
  15. consenrich/data/dm6.sizes +7 -0
  16. consenrich/data/dm6_blacklist.bed +182 -0
  17. consenrich/data/dm6_sparse.bed +20000 -0
  18. consenrich/data/hg19.sizes +24 -0
  19. consenrich/data/hg19_blacklist.bed +834 -0
  20. consenrich/data/hg19_sparse.bed +288358 -0
  21. consenrich/data/hg38.sizes +24 -0
  22. consenrich/data/hg38_blacklist.bed +636 -0
  23. consenrich/data/hg38_sparse.bed +288699 -0
  24. consenrich/data/mm10.sizes +21 -0
  25. consenrich/data/mm10_blacklist.bed +3435 -0
  26. consenrich/data/mm10_sparse.bed +100400 -0
  27. consenrich/data/mm39.sizes +21 -0
  28. consenrich/data/mm39_blacklist.bed +3360 -0
  29. consenrich/data/mm39_sparse.bed +100381 -0
  30. consenrich/detrorm.py +297 -0
  31. consenrich/matching.py +929 -0
  32. consenrich/misc_util.py +122 -0
  33. consenrich-0.7.11b2.dist-info/METADATA +66 -0
  34. consenrich-0.7.11b2.dist-info/RECORD +38 -0
  35. consenrich-0.7.11b2.dist-info/WHEEL +6 -0
  36. consenrich-0.7.11b2.dist-info/entry_points.txt +2 -0
  37. consenrich-0.7.11b2.dist-info/licenses/LICENSE +21 -0
  38. consenrich-0.7.11b2.dist-info/top_level.txt +1 -0
consenrich/matching.py ADDED
@@ -0,0 +1,929 @@
# -*- coding: utf-8 -*-
r"""Module implementing (experimental) 'structured peak detection' features using wavelet-based templates."""

import logging
import os
import math
from pybedtools import BedTool
from typing import List, Optional

import pandas as pd
import pywt as pw
import numpy as np
import numpy.typing as npt

from scipy import signal, stats

from . import cconsenrich
from . import core as core

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(module)s.%(funcName)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)


def _FDR(pVals: np.ndarray, method: str | None = "bh") -> np.ndarray:
    # can use BH (Benjamini-Hochberg) or the more conservative Benjamini-Yekutieli
    # ... to control FDR under arbitrary dependence between tests
    if method is None:
        return pVals
    return stats.false_discovery_control(pVals, method=method.lower())


def autoMinLengthIntervals(
    values: np.ndarray,
    initLen: int = 3,
    cutoffQuantile: float = 0.90,
    isLogScale: bool = False,
) -> int:
    r"""Determines a minimum matching length (in interval units) based on the input signal values.

    Returns the average length of contiguous above-threshold segments in a log-scaled,
    median-detrended version of `values`.

    :param values: A 1D array of signal-like values.
    :type values: np.ndarray
    :param initLen: Initial minimum length (in intervals). Defaults to 3.
    :type initLen: int
    :param cutoffQuantile: Quantile of the positive detrended values used to threshold segments. Defaults to 0.90.
    :type cutoffQuantile: float
    :param isLogScale: Whether `values` are already log-scaled (skips the arsinh transform).
    :type isLogScale: bool
    :return: Estimated minimum matching length (in intervals)
    :rtype: int

    """
    values_ = values.astype(np.float64).copy()
    if not isLogScale:
        np.asinh(values_, out=values_)

    trValues = values_ - signal.medfilt(
        values_,
        kernel_size=max(
            (2 * initLen) + 1,
            2 * (int(len(values_) * 0.05)) + 1,
        ),
    )

    # just consider stretches of positive signal
    nz = trValues[trValues > 0]
    if len(nz) == 0:
        return initLen
    # ... mask out values below the cutoff quantile
    thr = np.quantile(
        nz, cutoffQuantile, method="interpolated_inverted_cdf"
    )
    mask = nz >= thr
    if not np.any(mask):
        return initLen

    idx = np.flatnonzero(np.diff(np.r_[False, mask, False]))
    runs = idx.reshape(-1, 2)
    widths = runs[:, 1] - runs[:, 0]
    widths = widths[widths >= initLen]

    if len(widths) == 0:
        return initLen

    return int(np.mean(widths))


def scalarClip(value: float, low: float, high: float) -> float:
    return low if value < low else high if value > high else value


def castableToFloat(value) -> bool:
    if value is None:
        return False
    if isinstance(value, bool):
        return False
    if isinstance(value, str):
        if value.lower().replace(" ", "") in [
            "nan",
            "inf",
            "-inf",
            "infinity",
            "-infinity",
            "",
            " ",
        ]:
            return False

    try:
        float(value)
        if np.isfinite(float(value)):
            return True
    except Exception:
        return False
    return False


def matchWavelet(
    chromosome: str,
    intervals: npt.NDArray[int],
    values: npt.NDArray[np.float64],
    templateNames: List[str],
    cascadeLevels: List[int],
    iters: int,
    alpha: float = 0.05,
    minMatchLengthBP: Optional[int] = 250,
    maxNumMatches: Optional[int] = 100_000,
    minSignalAtMaxima: Optional[float | str] = "q:0.75",
    randSeed: int = 42,
    recenterAtPointSource: bool = True,
    useScalingFunction: bool = True,
    excludeRegionsBedFile: Optional[str] = None,
    weights: Optional[npt.NDArray[np.float64]] = None,
    eps: float = 1.0e-2,
    isLogScale: bool = False,
    autoLengthQuantile: float = 0.90,
) -> pd.DataFrame:
    r"""Detect structured peaks in Consenrich tracks by matching wavelet- or scaling-function-based templates.

    :param chromosome: Chromosome name for the input intervals and values.
    :type chromosome: str
    :param intervals: A 1D array of evenly spaced genomic start coordinates, matched to `values`.
    :type intervals: npt.NDArray[int]
    :param values: A 1D array of signal-like values. In this documentation, we refer to values derived from Consenrich,
        but other continuous-valued tracks at evenly spaced genomic intervals may be suitable, too.
    :type values: npt.NDArray[np.float64]
    :param templateNames: A list of str values -- each entry references a mother wavelet (or its corresponding scaling function), e.g., `[haar, db2]`.
    :type templateNames: List[str]
    :param cascadeLevels: Number of cascade iterations used to approximate each template (wavelet or scaling function).
        Must have the same length as `templateNames`, with each entry aligned to the
        corresponding template, e.g., given templateNames `[haar, db2]`, `[2, 2]` would use 2 cascade levels for both templates.
    :type cascadeLevels: List[int]
    :param iters: Number of random blocks to sample in the response sequence while building
        an empirical null to test significance within chromosomes. See :func:`cconsenrich.csampleBlockStats`.
    :type iters: int
    :param alpha: Primary significance threshold on detected matches. Specifically, the
        minimum corrected empirical p-value approximated from randomly sampled blocks in the
        response sequence.
    :type alpha: float
    :param minMatchLengthBP: Within a window of `minMatchLengthBP` length (bp), relative maxima in
        the signal-template convolution must be greater in value than others to qualify as matches.
        If set to a value less than 1, the minimum length is determined via :func:`consenrich.matching.autoMinLengthIntervals`.
        If set to `None`, defaults to 147 bp (approximate nucleosome size).
    :type minMatchLengthBP: Optional[int]
    :param maxNumMatches: Maximum number of candidate matches retained per chromosome half; if exceeded,
        the candidates with the largest signal values are kept.
    :type maxNumMatches: Optional[int]
    :param minSignalAtMaxima: Secondary significance threshold coupled with :math:`\alpha`. Requires the *signal value*
        at relative maxima in the response sequence to be greater than a threshold :math:`\pm \epsilon`. Comparisons are
        made in log-scale (arsinh). If a `float` value is provided, the signal value at each maximum must be greater
        than this value.
        If a `str` value is provided, it is parsed as 'q:quantileValue', e.g., 'q:0.90'. The
        threshold is then set to the corresponding quantile of the non-zero signal estimates.
        Defaults to str value 'q:0.75' --- the 75th percentile of signal values.
    :type minSignalAtMaxima: Optional[str | float]
    :param randSeed: Seed for the random number generator used when sampling null blocks.
    :type randSeed: int
    :param useScalingFunction: If True, use (only) the scaling function to build the matching template.
        If False, use (only) the wavelet function.
    :type useScalingFunction: bool
    :param excludeRegionsBedFile: A BED file with regions to exclude from matching.
    :type excludeRegionsBedFile: Optional[str]
    :param recenterAtPointSource: If True, recenter detected matches at the point source (max value).
    :type recenterAtPointSource: bool
    :param weights: Optional weights to apply to `values` prior to matching. Must have the same length as `values`.
    :type weights: Optional[npt.NDArray[np.float64]]
    :param eps: Tolerance parameter for relative maxima detection in the response sequence. Set to zero to enforce strict
        inequalities when identifying discrete relative maxima.
    :type eps: float
    :param isLogScale: Whether the input values have already been transformed. Used to avoid double/redundant transformations.
    :type isLogScale: bool
    :param autoLengthQuantile: Quantile passed to :func:`consenrich.matching.autoMinLengthIntervals` when `minMatchLengthBP` is less than 1.
    :type autoLengthQuantile: float
    :seealso: :class:`consenrich.core.matchingParams`, :func:`cconsenrich.csampleBlockStats`, :ref:`matching`
    :return: A pandas DataFrame with detected matches
    :rtype: pd.DataFrame
    """

    rng = np.random.default_rng(int(randSeed))
    if len(intervals) < 5:
        raise ValueError("`intervals` must be at least length 5")

    if len(values) != len(intervals):
        raise ValueError(
            "`values` must have the same length as `intervals`"
        )

    if len(templateNames) != len(cascadeLevels):
        raise ValueError(
            "\n\t`templateNames` and `cascadeLevels` must have the same length."
            "\n\tSet products are not supported, i.e., each template needs an explicitly defined cascade level."
            "\t\ne.g., for `templateNames = [haar, db2]`, use `cascadeLevels = [2, 2]`, not `[2]`.\n"
        )

    intervalLengthBp = intervals[1] - intervals[0]

    if minMatchLengthBP is not None and minMatchLengthBP < 1:
        minMatchLengthBP = autoMinLengthIntervals(
            values,
            cutoffQuantile=autoLengthQuantile,
            isLogScale=isLogScale,
        ) * int(intervalLengthBp)
    elif minMatchLengthBP is None:
        minMatchLengthBP = 147  # default to nucleosome size

    logger.info(f"\n\tUsing minMatchLengthBP: {minMatchLengthBP}")

    if not np.all(np.abs(np.diff(intervals)) == intervalLengthBp):
        raise ValueError("`intervals` must be evenly spaced.")

    if weights is not None:
        if len(weights) != len(values):
            logger.warning(
                f"`weights` length {len(weights)} does not match `values` length {len(values)}. Ignoring..."
            )
        else:
            values = values * weights

    if not isLogScale:
        asinhValues = np.asinh(values, dtype=np.float32)
    else:
        asinhValues = values.astype(np.float32)
    asinhNonZeroValues = asinhValues[asinhValues > 0]

    iters = max(int(iters), 1000)
    defQuantile = 0.75
    chromMin = int(intervals[0])
    chromMax = int(intervals[-1])
    chromMid = chromMin + (chromMax - chromMin) // 2  # for split
    halfLeftMask = intervals < chromMid
    halfRightMask = ~halfLeftMask
    excludeMaskGlobal = np.zeros(len(intervals), dtype=np.uint8)
    if excludeRegionsBedFile is not None:
        excludeMaskGlobal = core.getBedMask(
            chromosome, excludeRegionsBedFile, intervals
        ).astype(np.uint8)
    allRows = []

    def parseMinSignalThreshold(val):
        if val is None:
            return -1e6
        if isinstance(val, str):
            if val.startswith("q:"):
                qVal = float(val.split("q:")[-1])
                if not (0 <= qVal <= 1):
                    raise ValueError(
                        f"Quantile {qVal} is out of range"
                    )
                return float(
                    np.quantile(
                        asinhNonZeroValues,
                        qVal,
                        method="interpolated_inverted_cdf",
                    )
                )
            elif castableToFloat(val):
                v = float(val)
                return -1e6 if v < 0 else float(np.asinh(v))
            else:
                return float(
                    np.quantile(
                        asinhNonZeroValues,
                        defQuantile,
                        method="interpolated_inverted_cdf",
                    )
                )
        if isinstance(val, (float, int)):
            v = float(val)
            return -1e6 if v < 0 else float(np.asinh(v))
        return float(
            np.quantile(
                asinhNonZeroValues,
                defQuantile,
                method="interpolated_inverted_cdf",
            )
        )

    def relativeMaxima(
        resp: np.ndarray, orderBins: int, eps: Optional[float] = None
    ) -> np.ndarray:
        order_: int = max(int(orderBins), 1)
        if eps is None:
            eps = np.finfo(resp.dtype).eps * 10

        def ge_with_tol(a, b):
            return a > (b - eps)

        # get initial set using loosened criterion
        idx = signal.argrelextrema(
            resp, comparator=ge_with_tol, order=order_
        )[0]
        if idx.size == 0:
            return idx

        if eps > 0.0:
            groups = []
            start, prev = idx[0], idx[0]
            for x in idx[1:]:
                # case: still contiguous
                if x == prev + 1:
                    prev = x
                else:
                    # case: a gap --> break off from previous group
                    groups.append((start, prev))
                    start = x
                    prev = x
            groups.append((start, prev))

            centers: list[int] = []
            for s, e in groups:
                if s == e:
                    centers.append(s)
                else:
                    # for each `group` of tied indices, pick the center
                    centers.append((s + e) // 2)

            return np.asarray(centers, dtype=np.intp)

        return idx

    def sampleBlockMaxima(
        resp: np.ndarray,
        halfMask: np.ndarray,
        relWindowBins: int,
        nsamp: int,
        seed: int,
        eps: float,
    ):
        exMask = excludeMaskGlobal.astype(np.uint8).copy()
        exMask |= (~halfMask).astype(np.uint8)
        vals = np.array(
            cconsenrich.csampleBlockStats(
                intervals.astype(np.uint32),
                resp,
                int(relWindowBins),
                int(nsamp),
                int(seed),
                exMask.astype(np.uint8),
                np.float64(eps if eps is not None else 0.0),
            ),
            dtype=float,
        )
        if len(vals) == 0:
            return vals
        low = np.quantile(vals, 0.001)
        high = np.quantile(vals, 0.999)
        return vals[(vals > low) & (vals < high)]

    for templateName, cascadeLevel in zip(
        templateNames, cascadeLevels
    ):
        if templateName not in pw.wavelist(kind="discrete"):
            logger.warning(
                f"Skipping unknown wavelet template: {templateName}"
            )
            continue

        wav = pw.Wavelet(str(templateName))
        scalingFunc, waveletFunc, _ = wav.wavefun(
            level=int(cascadeLevel)
        )
        template = np.array(
            scalingFunc if useScalingFunction else waveletFunc,
            dtype=np.float64,
        )
        template /= np.linalg.norm(template)

        logger.info(
            f"\n\tMatching template: {templateName}"
            f"\n\tcascade level: {cascadeLevel}"
            f"\n\ttemplate length: {len(template)}"
        )

        # efficient FFT-based cross-correlation
        # (OA may be better for smaller templates, TODO add a check)
        response = signal.fftconvolve(
            values, template[::-1], mode="same"
        )
        thisMinMatchBp = minMatchLengthBP
        if thisMinMatchBp is None or thisMinMatchBp < 1:
            thisMinMatchBp = len(template) * intervalLengthBp
        if thisMinMatchBp % intervalLengthBp != 0:
            thisMinMatchBp += intervalLengthBp - (
                thisMinMatchBp % intervalLengthBp
            )
        relWindowBins = int(
            ((thisMinMatchBp / intervalLengthBp) / 2) + 1
        )
        relWindowBins = max(relWindowBins, 1)
        asinhThreshold = parseMinSignalThreshold(minSignalAtMaxima)
        for nullMask, testMask, tag in [
            (halfLeftMask, halfRightMask, "R"),
            (halfRightMask, halfLeftMask, "L"),
        ]:
            blockMaxima = sampleBlockMaxima(
                response,
                nullMask,
                relWindowBins,
                nsamp=max(iters, 1000),
                seed=rng.integers(1, 10_000),
                eps=eps,
            )
            if len(blockMaxima) < 25:
                pooledMask = ~excludeMaskGlobal.astype(bool)
                blockMaxima = sampleBlockMaxima(
                    response,
                    pooledMask,
                    relWindowBins,
                    nsamp=max(iters, 1000),
                    seed=rng.integers(1, 10_000),
                    eps=eps,
                )
            ecdfSf = stats.ecdf(blockMaxima).sf
            candidateIdx = relativeMaxima(
                response, relWindowBins, eps=eps
            )

            candidateMask = (
                (candidateIdx >= relWindowBins)
                & (candidateIdx < len(response) - relWindowBins)
                & (testMask[candidateIdx])
                & (excludeMaskGlobal[candidateIdx] == 0)
                & (asinhValues[candidateIdx] > asinhThreshold)
            )

            candidateIdx = candidateIdx[candidateMask]
            if len(candidateIdx) == 0:
                continue
            if (
                maxNumMatches is not None
                and len(candidateIdx) > maxNumMatches
            ):
                candidateIdx = candidateIdx[
                    np.argsort(asinhValues[candidateIdx])[
                        -maxNumMatches:
                    ]
                ]
            pEmp = np.clip(
                ecdfSf.evaluate(response[candidateIdx]),
                np.finfo(np.float32).tiny,
                1.0,
            )
            startsIdx = np.maximum(candidateIdx - relWindowBins, 0)
            endsIdx = np.minimum(
                len(values) - 1, candidateIdx + relWindowBins
            )
            pointSourcesIdx = []
            for s, e in zip(startsIdx, endsIdx):
                pointSourcesIdx.append(
                    np.argmax(values[s : e + 1]) + s
                )
            pointSourcesIdx = np.array(pointSourcesIdx)
            starts = intervals[startsIdx]
            ends = intervals[endsIdx]
            pointSourcesAbs = (intervals[pointSourcesIdx]) + max(
                1, intervalLengthBp // 2
            )
            if recenterAtPointSource:
                starts = pointSourcesAbs - (
                    relWindowBins * intervalLengthBp
                )
                ends = pointSourcesAbs + (
                    relWindowBins * intervalLengthBp
                )
            pointSourcesRel = (
                intervals[pointSourcesIdx] - starts
            ) + max(1, intervalLengthBp // 2)
            sqScores = (1 + response[candidateIdx]) ** 2
            minR, maxR = (
                float(np.min(sqScores)),
                float(np.max(sqScores)),
            )
            rangeR = max(maxR - minR, 1.0)
            scores = (250 + 750 * (sqScores - minR) / rangeR).astype(
                int
            )
            for i, idxVal in enumerate(candidateIdx):
                allRows.append(
                    {
                        "chromosome": chromosome,
                        "start": int(starts[i]),
                        "end": int(ends[i]),
                        "name": f"{templateName}_{cascadeLevel}_{idxVal}_{tag}",
                        "score": int(scores[i]),
                        "strand": ".",
                        "signal": float(response[idxVal]),
                        "p_raw": float(pEmp[i]),
                        "pointSource": int(pointSourcesRel[i]),
                    }
                )

    if not allRows:
        logger.warning(
            "No matches detected, returning empty DataFrame."
        )

        return pd.DataFrame(
            columns=[
                "chromosome",
                "start",
                "end",
                "name",
                "score",
                "strand",
                "signal",
                "pValue",
                "qValue",
                "pointSource",
            ]
        )

    df = pd.DataFrame(allRows)
    qVals = _FDR(df["p_raw"].values.astype(float))
    df["pValue"] = -np.log10(
        np.clip(df["p_raw"].values, np.finfo(np.float32).tiny, 1.0)
    )
    df["qValue"] = -np.log10(
        np.clip(qVals, np.finfo(np.float32).tiny, 1.0)
    )
    df.drop(columns=["p_raw"], inplace=True)
    df = df[qVals <= alpha].copy()
    df["chromosome"] = df["chromosome"].astype(str)
    df.sort_values(by=["chromosome", "start", "end"], inplace=True)
    df.reset_index(drop=True, inplace=True)
    df = df[
        [
            "chromosome",
            "start",
            "end",
            "name",
            "score",
            "strand",
            "signal",
            "pValue",
            "qValue",
            "pointSource",
        ]
    ]
    return df


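# Illustrative call (hypothetical toy inputs, not shipped data): matchWavelet
# expects evenly spaced interval starts and one value per interval, e.g.,
#
#   intervals = np.arange(0, 50_000, 25, dtype=np.uint32)
#   values = np.random.default_rng(0).gamma(2.0, 1.0, size=intervals.size)
#   df = matchWavelet(
#       "chr1", intervals, values,
#       templateNames=["haar", "db2"], cascadeLevels=[2, 2],
#       iters=2000,
#   )
#
# The returned frame holds narrowPeak-style columns: chromosome, start, end,
# name, score, strand, signal, pValue, qValue, pointSource.

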
def mergeMatches(
    filePath: str,
    mergeGapBP: Optional[int] = -1,
) -> Optional[str]:
    r"""Merge overlapping or nearby structured peaks ('matches') in a narrowPeak file.

    The harmonic mean of p-values and q-values is computed for each merged region within `mergeGapBP` base pairs.
    The fourth column (name) of each merged peak contains information about the number of features that were merged
    and the range of q-values among them.

    Expects a `narrowPeak <https://genome.ucsc.edu/FAQ/FAQformat.html#format12>`_ file as input (all numeric columns, '.' for strand if unknown).

    :param filePath: narrowPeak file containing matches detected with :func:`consenrich.matching.matchWavelet`
    :type filePath: str
    :param mergeGapBP: Maximum gap size (in base pairs) to consider for merging.
    :type mergeGapBP: Optional[int]

    :seealso: :ref:`matching`, :class:`consenrich.core.matchingParams`
    """

    if mergeGapBP is None or mergeGapBP < 1:
        mergeGapBP = 147
        logger.info(f"Setting mergeGapBP = {mergeGapBP} bp")

    MAX_NEGLOGP = 10.0
    MIN_NEGLOGP = 1.0e-10

    if not os.path.isfile(filePath):
        logger.warning(f"Couldn't access {filePath}...skipping merge")
        return None
    bed = None
    try:
        bed = BedTool(filePath)
    except Exception as ex:
        logger.warning(
            f"Couldn't create BedTool for {filePath}:\n{ex}\n\nskipping merge..."
        )
        return None
    if bed is None:
        logger.warning(
            f"Couldn't create BedTool for {filePath}...skipping merge"
        )
        return None

    bed = bed.sort()
    clustered = bed.cluster(d=mergeGapBP)
    groups = {}
    for f in clustered:
        fields = f.fields
        chrom = fields[0]
        start = int(fields[1])
        end = int(fields[2])
        score = float(fields[4])
        signal = float(fields[6])
        pLog10 = float(fields[7])
        qLog10 = float(fields[8])
        peak = int(fields[9])
        clusterID = fields[-1]
        if clusterID not in groups:
            groups[clusterID] = {
                "chrom": chrom,
                "sMin": start,
                "eMax": end,
                "scSum": 0.0,
                "sigSum": 0.0,
                "n": 0,
                "maxS": float("-inf"),
                "peakAbs": -1,
                "pMax": float("-inf"),
                "pTail": 0.0,
                "pHasInf": False,
                "qMax": float("-inf"),
                "qMin": float("inf"),
                "qTail": 0.0,
                "qHasInf": False,
            }
        g = groups[clusterID]
        if start < g["sMin"]:
            g["sMin"] = start
        if end > g["eMax"]:
            g["eMax"] = end
        g["scSum"] += score
        g["sigSum"] += signal
        g["n"] += 1

        if math.isinf(pLog10) or pLog10 >= MAX_NEGLOGP:
            g["pHasInf"] = True
        else:
            if pLog10 > g["pMax"]:
                if g["pMax"] == float("-inf"):
                    g["pTail"] = 1.0
                else:
                    g["pTail"] = (
                        g["pTail"] * (10 ** (g["pMax"] - pLog10))
                        + 1.0
                    )
                g["pMax"] = pLog10
            else:
                g["pTail"] += 10 ** (pLog10 - g["pMax"])

        if (
            math.isinf(qLog10)
            or qLog10 >= MAX_NEGLOGP
            or qLog10 <= MIN_NEGLOGP
        ):
            g["qHasInf"] = True
        else:
            if qLog10 < g["qMin"]:
                if qLog10 < MIN_NEGLOGP:
                    g["qMin"] = MIN_NEGLOGP
                else:
                    g["qMin"] = qLog10

            if qLog10 > g["qMax"]:
                if g["qMax"] == float("-inf"):
                    g["qTail"] = 1.0
                else:
                    g["qTail"] = (
                        g["qTail"] * (10 ** (g["qMax"] - qLog10))
                        + 1.0
                    )
                g["qMax"] = qLog10
            else:
                g["qTail"] += 10 ** (qLog10 - g["qMax"])

        if signal > g["maxS"]:
            g["maxS"] = signal
            g["peakAbs"] = start + peak if peak >= 0 else -1

    items = []
    for clusterID, g in groups.items():
        items.append((g["chrom"], g["sMin"], g["eMax"], g))
    items.sort(key=lambda x: (str(x[0]), x[1], x[2]))

    outPath = f"{filePath.replace('.narrowPeak', '')}.mergedMatches.narrowPeak"
    lines = []
    i = 0
    for chrom, sMin, eMax, g in items:
        i += 1
        avgScore = g["scSum"] / g["n"]
        if avgScore < 0:
            avgScore = 0
        if avgScore > 1000:
            avgScore = 1000
        scoreInt = int(round(avgScore))
        sigAvg = g["sigSum"] / g["n"]

        if g["pHasInf"]:
            pHMLog10 = MAX_NEGLOGP
        else:
            if (
                g["pMax"] == float("-inf")
                or not (g["pTail"] > 0.0)
                or math.isnan(g["pTail"])
            ):
                pHMLog10 = MIN_NEGLOGP
            else:
                pHMLog10 = -math.log10(g["n"]) + (
                    g["pMax"] + math.log10(g["pTail"])
                )
                pHMLog10 = max(
                    MIN_NEGLOGP, min(pHMLog10, MAX_NEGLOGP)
                )

        if g["qHasInf"]:
            qHMLog10 = MAX_NEGLOGP
        else:
            if (
                g["qMax"] == float("-inf")
                or not (g["qTail"] > 0.0)
                or math.isnan(g["qTail"])
            ):
                qHMLog10 = MIN_NEGLOGP
            else:
                qHMLog10 = -math.log10(g["n"]) + (
                    g["qMax"] + math.log10(g["qTail"])
                )
                qHMLog10 = max(
                    MIN_NEGLOGP, min(qHMLog10, MAX_NEGLOGP)
                )

        pointSource = (
            g["peakAbs"] - sMin
            if g["peakAbs"] >= 0
            else (eMax - sMin) // 2
        )

        qMinLog10 = g["qMin"]
        qMaxLog10 = g["qMax"]
        if math.isfinite(qMinLog10) and qMinLog10 < MIN_NEGLOGP:
            qMinLog10 = MIN_NEGLOGP
        if math.isfinite(qMaxLog10) and qMaxLog10 > MAX_NEGLOGP:
            qMaxLog10 = MAX_NEGLOGP
        elif (
            not math.isfinite(qMaxLog10)
            or not math.isfinite(qMinLog10)
        ) or (qMaxLog10 < MIN_NEGLOGP):
            qMinLog10 = 0.0
            qMaxLog10 = 0.0

        # informative + parsable name, e.g., regex:
        # ^consenrichPeak\|i=(?P<i>\d+)\|gap=(?P<gap>\d+)bp\|ct=(?P<ct>\d+)\|qRange=(?P<qmin>\d+\.\d{3})_(?P<qmax>\d+\.\d{3})$
        name = f"consenrichPeak|i={i}|gap={mergeGapBP}bp|ct={g['n']}|qRange={qMinLog10:.3f}_{qMaxLog10:.3f}"
        lines.append(
            f"{chrom}\t{int(sMin)}\t{int(eMax)}\t{name}\t{scoreInt}\t.\t{sigAvg:.3f}\t{pHMLog10:.3f}\t{qHMLog10:.3f}\t{int(pointSource)}"
        )

    with open(outPath, "w") as outF:
        outF.write("\n".join(lines) + ("\n" if lines else ""))
    logger.info(f"Merged matches written to {outPath}")
    return outPath


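# Worked check of the log-space harmonic-mean combination above (hypothetical
# numbers): for two clustered peaks with p-values [1e-3, 1e-1], i.e.,
# pLog10 = [3.0, 1.0] and n = 2, the harmonic mean is
#   HM = n / sum(1 / p) = 2 / (1000 + 10) ~ 1.98e-3,
# so -log10(HM) ~ 2.703. The streaming form reaches the same value without
# overflow: pMax = 3.0, pTail = 1.0 + 10 ** (1.0 - 3.0) = 1.01, and
#   -log10(n) + pMax + log10(pTail) = -0.301 + 3.0 + 0.004 ~ 2.703.

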
def runMatchingAlgorithm(
    bedGraphFile: str,
    templateNames: List[str],
    cascadeLevels: List[int],
    iters: int,
    alpha: float = 0.05,
    minMatchLengthBP: Optional[int] = 250,
    maxNumMatches: Optional[int] = 100_000,
    minSignalAtMaxima: Optional[float | str] = "q:0.75",
    randSeed: int = 42,
    recenterAtPointSource: bool = True,
    useScalingFunction: bool = True,
    excludeRegionsBedFile: Optional[str] = None,
    weightsBedGraph: str | None = None,
    eps: float = 1.0e-2,
    isLogScale: bool = False,
    autoLengthQuantile: float = 0.90,
    mergeGapBP: int | None = -1,
    methodFDR: str | None = None,
    merge: bool = True,
):
    r"""Wraps :func:`matchWavelet` for genome-wide matching given a bedGraph file."""
    gwideDF = pd.DataFrame()
    chromosomes = (
        pd.read_csv(
            bedGraphFile,
            sep="\t",
            header=None,
            names=["chromosome", "start", "end", "value"],
            dtype={
                "chromosome": str,
                "start": np.uint32,
                "end": np.uint32,
                "value": np.float64,
            },
        )["chromosome"]
        .unique()
        .tolist()
    )

    avgMinMatchLengths = []

    for c_, chromosome_ in enumerate(chromosomes):
        cols = ["chromosome", "start", "end", "value"]
        chromBedGraphDF = pd.read_csv(
            bedGraphFile,
            sep="\t",
            header=None,
            names=cols,
            dtype={
                "chromosome": str,
                "start": np.uint32,
                "end": np.uint32,
                "value": np.float64,
            },
        )
        chromBedGraphDF = chromBedGraphDF[
            chromBedGraphDF["chromosome"] == chromosome_
        ]
        chromIntervals = chromBedGraphDF["start"].to_numpy()
        chromValues = chromBedGraphDF["value"].to_numpy()
        del chromBedGraphDF

        weightsDF = pd.DataFrame()
        weights = np.ones_like(chromValues, dtype=np.float64)
        if weightsBedGraph is not None and os.path.exists(
            weightsBedGraph
        ):
            try:
                weightsDF = pd.read_csv(
                    weightsBedGraph,
                    sep="\t",
                    header=None,
                    names=cols,
                    dtype={
                        "chromosome": str,
                        "start": np.uint32,
                        "end": np.uint32,
                        "value": np.float64,
                    },
                )
                weights = weightsDF[
                    weightsDF["chromosome"] == chromosome_
                ]
                weights = 1 / np.sqrt(
                    weights["value"].to_numpy() + 1.0
                )
            except Exception as ex:
                logger.warning(
                    f"Failed to parse weights from {weightsBedGraph}:\n{ex}\nIgnoring weights..."
                )
        del weightsDF

        if minMatchLengthBP is not None and minMatchLengthBP < 1:
            minMatchLengthBP_ = autoMinLengthIntervals(
                chromValues,
                cutoffQuantile=autoLengthQuantile,
                isLogScale=isLogScale,
            ) * int(chromIntervals[1] - chromIntervals[0])
        else:
            minMatchLengthBP_ = minMatchLengthBP

        avgMinMatchLengths.append(minMatchLengthBP_)

        df__ = matchWavelet(
            chromosome_,
            chromIntervals,
            chromValues,
            templateNames,
            cascadeLevels,
            iters,
            1.0,  # keep all for later genome-wide correction
            minMatchLengthBP_,
            maxNumMatches,
            minSignalAtMaxima,
            randSeed,
            recenterAtPointSource,
            useScalingFunction,
            excludeRegionsBedFile,
            weights,
            eps,
            isLogScale,
        )
        if df__.empty:
            logger.info(f"No matches detected on {chromosome_}.")
            continue
        gwideDF = pd.concat(
            [gwideDF, df__], axis=0, ignore_index=True
        )

    if gwideDF.empty:
        logger.warning("Empty matching results over `chromosomes`.")
        return gwideDF
    naturalScalePValues = 10 ** (
        -gwideDF["pValue"].values.astype(float)
    )
    qVals = _FDR(naturalScalePValues, method=methodFDR)
    gwideDF["qValue"] = -np.log10(
        np.clip(qVals, np.finfo(np.float32).tiny, 1.0)
    )
    gwideDF = gwideDF[qVals <= alpha].copy()
    gwideDF.sort_values(
        by=["chromosome", "start", "end"], inplace=True
    )
    tempNarrowPeak = f"{bedGraphFile}_matches.narrowPeak".replace(
        ".bedGraph", ""
    )
    gwideDF.to_csv(
        tempNarrowPeak,
        sep="\t",
        index=False,
        header=False,
    )

    if mergeGapBP is None or mergeGapBP < 1:
        mergeGapBP = max((np.median(avgMinMatchLengths).astype(int) // 2), 147)

    mergedPath = None
    if merge:
        mergedPath = mergeMatches(tempNarrowPeak, mergeGapBP=mergeGapBP)
        if mergedPath is not None and os.path.isfile(mergedPath):
            logger.info(f"Merged matches written to {mergedPath}")

    return mergedPath
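
A minimal end-to-end sketch of the module's entry point (hypothetical paths; `consenrich.bedGraph` is an assumed input file, not one shipped in the wheel):

    from consenrich.matching import runMatchingAlgorithm

    # Scans each chromosome in the bedGraph, tests template matches against
    # a per-chromosome empirical null, applies genome-wide FDR control, and
    # (with merge=True) writes a *.mergedMatches.narrowPeak file.
    mergedPath = runMatchingAlgorithm(
        "consenrich.bedGraph",
        templateNames=["haar", "db2"],
        cascadeLevels=[2, 2],
        iters=5000,
        alpha=0.05,
        methodFDR="bh",
    )
    print(mergedPath)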