consenrich 0.7.5b1__cp314-cp314-macosx_10_15_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of consenrich might be problematic. Click here for more details.

consenrich/matching.py ADDED
@@ -0,0 +1,907 @@
1
+ # -*- coding: utf-8 -*-
2
+ r"""Module implementing (experimental) 'structured peak detection' features using wavelet-based templates."""
3
+
4
+ import logging
5
+ import os
6
+ import math
7
+ from pybedtools import BedTool
8
+ from typing import List, Optional
9
+
10
+ import pandas as pd
11
+ import pywt as pw
12
+ import numpy as np
13
+ import numpy.typing as npt
14
+
15
+ from scipy import signal, stats
16
+
17
+ from . import cconsenrich
18
+ from . import core as core
19
+
20
# Module-level logging configuration.
# NOTE(review): calling logging.basicConfig at import time configures the root
# logger as a side effect of importing this module; libraries usually leave
# that to the application — confirm this is intended.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(module)s.%(funcName)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
25
+
26
+
27
+ def autoMinLengthIntervals(
28
+ values: np.ndarray, initLen: int = 3
29
+ ) -> int:
30
+ r"""Determines a minimum matching length (in interval units) based on the input signal values.
31
+
32
+ Returns the mean length of non-zero contiguous segments in a log-scaled/centered version of `values`
33
+
34
+ :param values: A 1D array of signal-like values.
35
+ :type values: np.ndarray
36
+ :param initLen: Initial minimum length (in intervals). Defaults to 3.
37
+ :type initLen: int
38
+ :return: Estimated minimum matching length (in intervals)
39
+ :rtype: int
40
+
41
+ """
42
+ trValues = np.asinh(values) - signal.medfilt(
43
+ np.asinh(values),
44
+ kernel_size=max(
45
+ (2 * initLen) + 1,
46
+ 2 * (int(len(values) * 0.005)) + 1,
47
+ ),
48
+ )
49
+ nz = trValues[trValues > 0]
50
+ if len(nz) == 0:
51
+ return initLen
52
+ thr = np.quantile(nz, 0.90, method="interpolated_inverted_cdf")
53
+ mask = nz >= thr
54
+ if not np.any(mask):
55
+ return initLen
56
+ idx = np.flatnonzero(np.diff(np.r_[False, mask, False]))
57
+ runs = idx.reshape(-1, 2)
58
+ widths = runs[:, 1] - runs[:, 0]
59
+ widths = widths[widths >= initLen]
60
+ if len(widths) == 0:
61
+ return initLen
62
+ return int(np.mean(widths))
63
+
64
+
65
def scalarClip(value: float, low: float, high: float) -> float:
    """Clamp `value` into the closed interval [low, high]."""
    if value < low:
        return low
    if value > high:
        return high
    return value
67
+
68
+
69
def castableToFloat(value) -> bool:
    """Return True if `value` can be cast to a *finite* float.

    Rejects ``None`` and booleans outright, rejects string spellings of
    NaN/infinity (case-insensitive, space-insensitive) and empty strings,
    and returns False for anything ``float()`` cannot parse or that parses
    to a non-finite value.

    :param value: Arbitrary candidate value.
    :return: True only for values convertible to a finite float.
    :rtype: bool
    """
    if value is None or isinstance(value, bool):
        return False
    if isinstance(value, str):
        # Normalize before matching so e.g. " InF " is caught.
        if value.lower().replace(" ", "") in {
            "nan",
            "inf",
            "-inf",
            "infinity",
            "-infinity",
            "",
        }:
            return False
    try:
        # Single conversion (the original called float(value) twice) and a
        # single exit point: non-finite results fall out as False here too.
        return bool(np.isfinite(float(value)))
    except Exception:
        return False
93
+
94
+
95
def matchExistingBedGraph(
    bedGraphFile: str,
    templateName: str,
    cascadeLevel: int,
    alpha: float = 0.05,
    minMatchLengthBP: Optional[int] = 250,
    iters: int = 25_000,
    minSignalAtMaxima: Optional[float | str] = "q:0.75",
    maxNumMatches: Optional[int] = 100_000,
    recenterAtPointSource: bool = True,
    useScalingFunction: bool = True,
    excludeRegionsBedFile: Optional[str] = None,
    mergeGapBP: Optional[int] = None,
    merge: bool = True,
    weights: Optional[npt.NDArray[np.float64]] = None,
    randSeed: int = 42,
) -> Optional[str]:
    r"""Match discrete templates in a bedGraph file of Consenrich estimates

    This function is a simple wrapper. See :func:`consenrich.matching.matchWavelet` for details on parameters.

    Runs :func:`matchWavelet` independently per chromosome, writes per-chromosome
    narrowPeak files, concatenates them into genome-wide outputs (and, if `merge`
    is True, merged outputs via :func:`mergeMatches`), deletes the per-chromosome
    intermediates, and returns the path of the final file (merged preferred).

    :param bedGraphFile: A bedGraph file with 'consensus' signal estimates derived from multiple samples, e.g., from Consenrich. The suffix '.bedGraph' is required.
    :type bedGraphFile: str
    :return: Path to the combined (merged if requested) narrowPeak file, or None.
    :rtype: Optional[str]

    :seealso: :func:`consenrich.matching.matchWavelet`, :class:`consenrich.core.matchingParams`, :ref:`matching`
    """
    if not os.path.isfile(bedGraphFile):
        raise FileNotFoundError(f"Couldn't access {bedGraphFile}")
    if not bedGraphFile.endswith(".bedGraph"):
        raise ValueError(
            f"Please use a suffix '.bedGraph' for `bedGraphFile`, got: {bedGraphFile}"
        )

    # Default merge gap: half the minimum match length (+1), else 75 bp.
    if mergeGapBP is None:
        mergeGapBP = (
            (minMatchLengthBP // 2) + 1
            if minMatchLengthBP is not None
            else 75
        )

    # Biorthogonal families are excluded from the allowed template set.
    allowedTemplates = [
        x for x in pw.wavelist(kind="discrete") if "bio" not in x
    ]
    if templateName not in allowedTemplates:
        raise ValueError(
            f"Unknown wavelet template: {templateName}\nAvailable templates: {allowedTemplates}"
        )

    cols = ["chromosome", "start", "end", "value"]
    bedGraphDF = pd.read_csv(
        bedGraphFile,
        sep="\t",
        header=None,
        names=cols,
        dtype={
            "chromosome": str,
            "start": np.uint32,
            "end": np.uint32,
            "value": np.float64,
        },
    )

    outPaths: List[str] = []           # per-chromosome unmerged narrowPeak files
    outPathsMerged: List[str] = []     # per-chromosome merged narrowPeak files
    outPathAll: Optional[str] = None
    outPathMergedAll: Optional[str] = None

    # Per-chromosome matching; failures on one chromosome are logged and skipped.
    for chrom_ in sorted(bedGraphDF["chromosome"].unique()):
        df_ = bedGraphDF[bedGraphDF["chromosome"] == chrom_]
        if len(df_) < 5:
            logger.info(f"Skipping {chrom_}: less than 5 intervals.")
            continue

        try:
            df__ = matchWavelet(
                chrom_,
                df_["start"].to_numpy(),
                df_["value"].to_numpy(),
                [templateName],
                [cascadeLevel],
                iters,
                alpha,
                minMatchLengthBP,
                maxNumMatches,
                recenterAtPointSource=recenterAtPointSource,
                useScalingFunction=useScalingFunction,
                excludeRegionsBedFile=excludeRegionsBedFile,
                weights=weights,
                minSignalAtMaxima=minSignalAtMaxima,
                randSeed=randSeed,
            )
        except Exception as ex:
            logger.info(
                f"Skipping {chrom_} due to error in matchWavelet: {ex}"
            )
            continue

        if df__.empty:
            logger.info(f"No matches detected on {chrom_}.")
            continue

        perChromOut = bedGraphFile.replace(
            ".bedGraph",
            f".{chrom_}.matched.{templateName}_lvl{cascadeLevel}.narrowPeak",
        )
        df__.to_csv(perChromOut, sep="\t", index=False, header=False)
        logger.info(f"Matches written to {perChromOut}")
        outPaths.append(perChromOut)

        if merge:
            mergedPath = mergeMatches(
                perChromOut, mergeGapBP=mergeGapBP
            )
            if mergedPath is not None:
                logger.info(f"Merged matches written to {mergedPath}")
                outPathsMerged.append(mergedPath)

    if len(outPaths) == 0 and len(outPathsMerged) == 0:
        raise ValueError("No matches were detected.")

    # Concatenate per-chromosome unmerged outputs into one genome-wide file.
    if len(outPaths) > 0:
        outPathAll = (
            f"{bedGraphFile.replace('.bedGraph', '')}"
            f".allChroms.matched.{templateName}_lvl{cascadeLevel}.narrowPeak"
        )
        with open(outPathAll, "w") as outF:
            for path_ in outPaths:
                if os.path.isfile(path_):
                    with open(path_, "r") as inF:
                        for line in inF:
                            outF.write(line)
        logger.info(f"All unmerged matches written to {outPathAll}")

    # Same for the merged per-chromosome outputs.
    if merge and len(outPathsMerged) > 0:
        outPathMergedAll = (
            f"{bedGraphFile.replace('.bedGraph', '')}"
            f".allChroms.matched.{templateName}_lvl{cascadeLevel}.mergedMatches.narrowPeak"
        )
        with open(outPathMergedAll, "w") as outF:
            for path in outPathsMerged:
                if os.path.isfile(path):
                    with open(path, "r") as inF:
                        for line in inF:
                            outF.write(line)
        logger.info(
            f"All merged matches written to {outPathMergedAll}"
        )

    # Best-effort cleanup of per-chromosome intermediates.
    for path_ in outPaths + outPathsMerged:
        try:
            if os.path.isfile(path_):
                os.remove(path_)
        except Exception:
            pass

    # Prefer the merged genome-wide file when merging was requested.
    if merge and outPathMergedAll:
        return outPathMergedAll
    if outPathAll:
        return outPathAll
    logger.warning("No matches were detected...returning `None`")
    return None
256
+
257
+
258
def matchWavelet(
    chromosome: str,
    intervals: npt.NDArray[int],
    values: npt.NDArray[np.float64],
    templateNames: List[str],
    cascadeLevels: List[int],
    iters: int,
    alpha: float = 0.05,
    minMatchLengthBP: Optional[int] = 250,
    maxNumMatches: Optional[int] = 100_000,
    minSignalAtMaxima: Optional[float | str] = "q:0.75",
    randSeed: int = 42,
    recenterAtPointSource: bool = True,
    useScalingFunction: bool = True,
    excludeRegionsBedFile: Optional[str] = None,
    weights: Optional[npt.NDArray[np.float64]] = None,
    eps: float = 1.0e-2,
    isLogScale: bool = False,
) -> pd.DataFrame:
    r"""Detect structured peaks in Consenrich tracks by matching wavelet- or scaling-function–based templates.

    :param chromosome: Chromosome name for the input intervals and values.
    :type chromosome: str
    :param values: A 1D array of signal-like values. In this documentation, we refer to values derived from Consenrich,
        but other continuous-valued tracks at evenly spaced genomic intervals may be suitable, too.
    :type values: npt.NDArray[np.float64]
    :param templateNames: A list of str values -- each entry references a mother wavelet (or its corresponding scaling function). e.g., `[haar, db2]`
    :type templateNames: List[str]
    :param cascadeLevels: Number of cascade iterations used to approximate each template (wavelet or scaling function).
        Must have the same length as `templateNames`, with each entry aligned to the
        corresponding template. e.g., given templateNames `[haar, db2]`, then `[2,2]` would use 2 cascade levels for both templates.
    :type cascadeLevels: List[int]
    :param iters: Number of random blocks to sample in the response sequence while building
        an empirical null to test significance. See :func:`cconsenrich.csampleBlockStats`.
    :type iters: int
    :param alpha: Primary significance threshold on detected matches. Specifically, the
        minimum corr. empirical p-value approximated from randomly sampled blocks in the
        response sequence.
    :type alpha: float
    :param minMatchLengthBP: Within a window of `minMatchLengthBP` length (bp), relative maxima in
        the signal-template convolution must be greater in value than others to qualify as matches.
        If set to a value less than 1, the minimum length is determined via :func:`consenrich.matching.autoMinLengthIntervals`.
        If set to `None`, defaults to 250 bp.
    :type minMatchLengthBP: Optional[int]
    :param minSignalAtMaxima: Secondary significance threshold coupled with `alpha`. Requires the *signal value*
        at relative maxima in the response sequence to be greater than this threshold. Comparisons are made in log-scale
        to temper genome-wide dynamic range. If a `float` value is provided, the minimum signal value must be greater
        than this (absolute) value. *Set to a negative value to disable the threshold*.
        If a `str` value is provided, looks for 'q:quantileValue', e.g., 'q:0.90'. The
        threshold is then set to the corresponding quantile of the non-zero signal estimates.
        Defaults to str value 'q:0.75' --- the 75th percentile of signal values.
    :type minSignalAtMaxima: Optional[str | float]
    :param useScalingFunction: If True, use (only) the scaling function to build the matching template.
        If False, use (only) the wavelet function.
    :type useScalingFunction: bool
    :param excludeRegionsBedFile: A BED file with regions to exclude from matching
    :type excludeRegionsBedFile: Optional[str]
    :param recenterAtPointSource: If True, recenter detected matches at the point source (max value)
    :type recenterAtPointSource: bool
    :param weights: Optional weights to apply to `values` prior to matching. Must have the same length as `values`.
    :type weights: Optional[npt.NDArray[np.float64]]
    :param eps: Tolerance parameter for relative maxima detection in the response sequence. Set to zero to enforce strict
        inequalities when identifying discrete relative maxima.
    :type eps: float
    :param isLogScale: Whether the input values have already been log- or asinh-transformed. Used to avoid redundant transformations.
    :type isLogScale: bool
    :seealso: :class:`consenrich.core.matchingParams`, :func:`cconsenrich.csampleBlockStats`, :ref:`matching`
    :return: A pandas DataFrame with detected matches
    :rtype: pd.DataFrame
    """

    rng = np.random.default_rng(int(randSeed))
    if len(intervals) < 5:
        raise ValueError("`intervals` must be at least length 5")

    if len(values) != len(intervals):
        raise ValueError(
            "`values` must have the same length as `intervals`"
        )

    if len(templateNames) != len(cascadeLevels):
        raise ValueError(
            "\n\t`templateNames` and `cascadeLevels` must have the same length."
            "\n\tSet products are not supported, i.e., each template needs an explicitly defined cascade level."
            "\t\ne.g., for `templateNames = [haar, db2]`, use `cascadeLevels = [2, 2]`, not `[2]`.\n"
        )

    # Step size (bp) between consecutive intervals; assumed uniform (checked below).
    intervalLengthBp = intervals[1] - intervals[0]

    if minMatchLengthBP is not None and minMatchLengthBP < 1:
        minMatchLengthBP = autoMinLengthIntervals(values) * int(
            intervalLengthBp
        )
    elif minMatchLengthBP is None:
        minMatchLengthBP = 250

    logger.info(f"\n\tUsing minMatchLengthBP: {minMatchLengthBP}")

    if not np.all(np.abs(np.diff(intervals)) == intervalLengthBp):
        raise ValueError("`intervals` must be evenly spaced.")

    # Optional per-interval weighting; silently ignored on length mismatch.
    if weights is not None:
        if len(weights) != len(values):
            logger.warning(
                f"`weights` length {len(weights)} does not match `values` length {len(values)}. Ignoring..."
            )
        else:
            values = values * weights

    # asinh-scale signal used for secondary thresholding (not for matching itself).
    if not isLogScale:
        asinhValues = np.asinh(values, dtype=np.float32)
    else:
        asinhValues = values.astype(np.float32)
    asinhNonZeroValues = asinhValues[asinhValues > 0]

    iters = max(int(iters), 1000)
    defQuantile = 0.75
    # Split the chromosome into two halves: the empirical null is sampled on
    # one half and candidates are tested on the other, then roles swap.
    chromMin = int(intervals[0])
    chromMax = int(intervals[-1])
    chromMid = chromMin + (chromMax - chromMin) // 2  # for split
    halfLeftMask = intervals < chromMid
    halfRightMask = ~halfLeftMask
    excludeMaskGlobal = np.zeros(len(intervals), dtype=np.uint8)
    if excludeRegionsBedFile is not None:
        excludeMaskGlobal = core.getBedMask(
            chromosome, excludeRegionsBedFile, intervals
        ).astype(np.uint8)
    allRows = []

    def bhFdr(p: np.ndarray) -> np.ndarray:
        """Benjamini–Hochberg step-up q-values for raw p-values `p`."""
        m = len(p)
        order = np.argsort(p, kind="mergesort")  # stable sort for ties
        ranked = np.arange(1, m + 1, dtype=float)
        q = (p[order] * m) / ranked
        # Enforce monotonicity from the largest p downward.
        q = np.minimum.accumulate(q[::-1])[::-1]
        out = np.empty_like(q)
        out[order] = q
        return np.clip(out, 0.0, 1.0)

    def parseMinSignalThreshold(val):
        """Resolve `minSignalAtMaxima` into an asinh-scale scalar threshold.

        None or negative numerics disable the threshold (-1e6); 'q:<x>'
        strings map to the x-quantile of nonzero asinh values; unparseable
        inputs fall back to the default quantile.
        """
        if val is None:
            return -1e6
        if isinstance(val, str):
            if val.startswith("q:"):
                qVal = float(val.split("q:")[-1])
                if not (0 <= qVal <= 1):
                    raise ValueError(
                        f"Quantile {qVal} is out of range"
                    )
                return float(
                    np.quantile(
                        asinhNonZeroValues,
                        qVal,
                        method="interpolated_inverted_cdf",
                    )
                )
            elif castableToFloat(val):
                v = float(val)
                return -1e6 if v < 0 else float(np.asinh(v))
            else:
                return float(
                    np.quantile(
                        asinhNonZeroValues,
                        defQuantile,
                        method="interpolated_inverted_cdf",
                    )
                )
        if isinstance(val, (float, int)):
            v = float(val)
            return -1e6 if v < 0 else float(np.asinh(v))
        # Unknown type: fall back to the default quantile threshold.
        return float(
            np.quantile(
                asinhNonZeroValues,
                defQuantile,
                method="interpolated_inverted_cdf",
            )
        )

    def relativeMaxima(
        resp: np.ndarray, orderBins: int, eps: Optional[float] = None
    ) -> np.ndarray:
        """Indices of relative maxima of `resp` within +/- `orderBins` bins,
        using an eps-loosened >= comparison; runs of tied maxima collapse to
        their center index."""
        order_: int = max(int(orderBins), 1)
        if eps is None:
            eps = np.finfo(resp.dtype).eps * 10

        def ge_with_tol(a, b):
            return a > (b - eps)

        # get initial set using loosened criterion
        idx = signal.argrelextrema(
            resp, comparator=ge_with_tol, order=order_
        )[0]
        if idx.size == 0:
            return idx

        if eps > 0.0:
            # Group contiguous indices (ties under the tolerance) into runs.
            groups = []
            start, prev = idx[0], idx[0]
            for x in idx[1:]:
                # case: still contiguous
                if x == prev + 1:
                    prev = x
                else:
                    # case: a gap --> break off from previous group
                    groups.append((start, prev))
                    start = x
                    prev = x
            groups.append((start, prev))

            centers: list[int] = []
            for s, e in groups:
                if s == e:
                    centers.append(s)
                else:
                    # for each `group` of tied indices, picks the center
                    centers.append((s + e) // 2)

            return np.asarray(centers, dtype=np.intp)

        return idx

    def sampleBlockMaxima(
        resp: np.ndarray,
        halfMask: np.ndarray,
        relWindowBins: np.ndarray,
        nsamp: int,
        seed: int,
        eps: float,
    ):
        """Sample block maxima of `resp` from positions allowed by `halfMask`
        (minus globally excluded regions) via cconsenrich.csampleBlockStats;
        trims the extreme 0.1% tails from the returned sample."""
        exMask = excludeMaskGlobal.astype(np.uint8).copy()
        # Positions outside the chosen half also count as excluded.
        exMask |= (~halfMask).astype(np.uint8)
        vals = np.array(
            cconsenrich.csampleBlockStats(
                intervals.astype(np.uint32),
                resp,
                int(relWindowBins),
                int(nsamp),
                int(seed),
                exMask.astype(np.uint8),
                np.float64(eps if eps is not None else 0.0),
            ),
            dtype=float,
        )
        if len(vals) == 0:
            return vals
        low = np.quantile(vals, 0.001)
        high = np.quantile(vals, 0.999)
        return vals[(vals > low) & (vals < high)]

    for templateName, cascadeLevel in zip(
        templateNames, cascadeLevels
    ):
        if templateName not in pw.wavelist(kind="discrete"):
            logger.warning(
                f"Skipping unknown wavelet template: {templateName}"
            )
            continue

        # Approximate the template by cascade iteration; pick scaling or
        # wavelet function, then L2-normalize.
        wav = pw.Wavelet(str(templateName))
        scalingFunc, waveletFunc, _ = wav.wavefun(
            level=int(cascadeLevel)
        )
        template = np.array(
            scalingFunc if useScalingFunction else waveletFunc,
            dtype=np.float64,
        )
        template /= np.linalg.norm(template)

        logger.info(
            f"\n\tMatching template: {templateName}"
            f"\n\tcascade level: {cascadeLevel}"
            f"\n\ttemplate length: {len(template)}"
        )

        # efficient FFT-based cross-correlation
        # (OA may be better for smaller templates, TODO add a check)
        response = signal.fftconvolve(
            values, template[::-1], mode="same"
        )
        thisMinMatchBp = minMatchLengthBP
        if thisMinMatchBp is None or thisMinMatchBp < 1:
            thisMinMatchBp = len(template) * intervalLengthBp
        # Round up to a whole number of intervals.
        if thisMinMatchBp % intervalLengthBp != 0:
            thisMinMatchBp += intervalLengthBp - (
                thisMinMatchBp % intervalLengthBp
            )
        relWindowBins = int(
            ((thisMinMatchBp / intervalLengthBp) / 2) + 1
        )
        relWindowBins = max(relWindowBins, 1)
        asinhThreshold = parseMinSignalThreshold(minSignalAtMaxima)
        # Null sampled on one half, candidates tested on the other; then swap.
        for nullMask, testMask, tag in [
            (halfLeftMask, halfRightMask, "R"),
            (halfRightMask, halfLeftMask, "L"),
        ]:
            blockMaxima = sampleBlockMaxima(
                response,
                nullMask,
                relWindowBins,
                nsamp=max(iters, 1000),
                seed=rng.integers(1, 10_000),
                eps=eps,
            )
            # Too few null samples: fall back to pooling both halves.
            if len(blockMaxima) < 25:
                pooledMask = ~excludeMaskGlobal.astype(bool)
                blockMaxima = sampleBlockMaxima(
                    response,
                    pooledMask,
                    relWindowBins,
                    nsamp=max(iters, 1000),
                    seed=rng.integers(1, 10_000),
                    eps=eps,
                )
            # Empirical survival function of the null block maxima.
            ecdfSf = stats.ecdf(blockMaxima).sf
            candidateIdx = relativeMaxima(
                response, relWindowBins, eps=eps
            )

            # Keep candidates that: fit a full window, lie in the test half,
            # are not excluded, and clear the asinh signal threshold.
            candidateMask = (
                (candidateIdx >= relWindowBins)
                & (candidateIdx < len(response) - relWindowBins)
                & (testMask[candidateIdx])
                & (excludeMaskGlobal[candidateIdx] == 0)
                & (asinhValues[candidateIdx] > asinhThreshold)
            )

            candidateIdx = candidateIdx[candidateMask]
            if len(candidateIdx) == 0:
                continue
            # Cap candidate count, keeping the strongest (by asinh signal).
            if (
                maxNumMatches is not None
                and len(candidateIdx) > maxNumMatches
            ):
                candidateIdx = candidateIdx[
                    np.argsort(asinhValues[candidateIdx])[
                        -maxNumMatches:
                    ]
                ]
            # Empirical p-value from the null SF, floored at 1e-10.
            pEmp = np.clip(
                ecdfSf.evaluate(response[candidateIdx]),
                1.0e-10,
                1.0,
            )
            startsIdx = np.maximum(candidateIdx - relWindowBins, 0)
            endsIdx = np.minimum(
                len(values) - 1, candidateIdx + relWindowBins
            )
            # Point source = argmax of the raw signal within each window.
            pointSourcesIdx = []
            for s, e in zip(startsIdx, endsIdx):
                pointSourcesIdx.append(
                    np.argmax(values[s : e + 1]) + s
                )
            pointSourcesIdx = np.array(pointSourcesIdx)
            starts = intervals[startsIdx]
            ends = intervals[endsIdx]
            pointSourcesAbs = (intervals[pointSourcesIdx]) + max(
                1, intervalLengthBp // 2
            )
            if recenterAtPointSource:
                # Re-center the reported window on the point source.
                starts = pointSourcesAbs - (
                    relWindowBins * intervalLengthBp
                )
                ends = pointSourcesAbs + (
                    relWindowBins * intervalLengthBp
                )
            pointSourcesRel = (
                intervals[pointSourcesIdx] - starts
            ) + max(1, intervalLengthBp // 2)
            # Map response magnitudes to narrowPeak scores in [250, 1000].
            sqScores = (1 + response[candidateIdx]) ** 2
            minR, maxR = (
                float(np.min(sqScores)),
                float(np.max(sqScores)),
            )
            rangeR = max(maxR - minR, 1.0)
            scores = (250 + 750 * (sqScores - minR) / rangeR).astype(
                int
            )
            for i, idxVal in enumerate(candidateIdx):
                allRows.append(
                    {
                        "chromosome": chromosome,
                        "start": int(starts[i]),
                        "end": int(ends[i]),
                        "name": f"{templateName}_{cascadeLevel}_{idxVal}_{tag}",
                        "score": int(scores[i]),
                        "strand": ".",
                        "signal": float(response[idxVal]),
                        "p_raw": float(pEmp[i]),
                        "pointSource": int(pointSourcesRel[i]),
                    }
                )

    if not allRows:
        logger.warning(
            "No matches detected, returning empty DataFrame."
        )

        return pd.DataFrame(
            columns=[
                "chromosome",
                "start",
                "end",
                "name",
                "score",
                "strand",
                "signal",
                "pValue",
                "qValue",
                "pointSource",
            ]
        )

    # BH-FDR correction, -log10 transforms, then filter at `alpha` on q-values.
    df = pd.DataFrame(allRows)
    qVals = bhFdr(df["p_raw"].values.astype(float))
    df["pValue"] = -np.log10(
        np.clip(df["p_raw"].values, 1.0e-10, 1.0)
    )
    df["qValue"] = -np.log10(np.clip(qVals, 1.0e-10, 1.0))
    df.drop(columns=["p_raw"], inplace=True)
    df = df[qVals <= alpha].copy()
    df["chromosome"] = df["chromosome"].astype(str)
    df.sort_values(by=["chromosome", "start", "end"], inplace=True)
    df.reset_index(drop=True, inplace=True)
    # narrowPeak column order.
    df = df[
        [
            "chromosome",
            "start",
            "end",
            "name",
            "score",
            "strand",
            "signal",
            "pValue",
            "qValue",
            "pointSource",
        ]
    ]
    return df
696
+
697
+
698
def mergeMatches(
    filePath: str,
    mergeGapBP: Optional[int],
) -> Optional[str]:
    r"""Merge overlapping or nearby structured peaks ('matches') in a narrowPeak file.

    The harmonic mean of p-values and q-values is computed for each merged region within `mergeGapBP` base pairs.
    The fourth column (name) of each merged peak contains information about the number of features that were merged
    and the range of q-values among them.

    Expects a `narrowPeak <https://genome.ucsc.edu/FAQ/FAQformat.html#format12>`_ file as input (all numeric columns, '.' for strand if unknown).

    :param filePath: narrowPeak file containing matches detected with :func:`consenrich.matching.matchWavelet`
    :type filePath: str
    :param mergeGapBP: Maximum gap size (in base pairs) to consider for merging. Defaults to 75 bp if `None` or less than 1.
    :type mergeGapBP: Optional[int]
    :return: Path of the merged narrowPeak file, or None if input/BedTool setup failed.
    :rtype: Optional[str]

    :seealso: :ref:`matching`, :class:`consenrich.core.matchingParams`
    """

    if mergeGapBP is None or mergeGapBP < 1:
        mergeGapBP = 75

    # Caps for -log10(p)/-log10(q): values are clamped to [1e-10, 10].
    MAX_NEGLOGP = 10.0
    MIN_NEGLOGP = 1.0e-10

    if not os.path.isfile(filePath):
        logger.warning(f"Couldn't access {filePath}...skipping merge")
        return None
    bed = None
    try:
        bed = BedTool(filePath)
    except Exception as ex:
        logger.warning(
            f"Couldn't create BedTool for {filePath}:\n{ex}\n\nskipping merge..."
        )
        return None
    if bed is None:
        logger.warning(
            f"Couldn't create BedTool for {filePath}...skipping merge"
        )
        return None

    # Cluster nearby features; bedtools appends a cluster ID as the last field.
    bed = bed.sort()
    clustered = bed.cluster(d=mergeGapBP)
    groups = {}
    for f in clustered:
        fields = f.fields
        chrom = fields[0]
        start = int(fields[1])
        end = int(fields[2])
        score = float(fields[4])
        signal = float(fields[6])
        pLog10 = float(fields[7])   # -log10(p) from matchWavelet
        qLog10 = float(fields[8])   # -log10(q) from matchWavelet
        peak = int(fields[9])       # point source, relative to `start`
        clusterID = fields[-1]
        if clusterID not in groups:
            # Per-cluster accumulator. pMax/pTail (and qMax/qTail) implement a
            # streaming log-sum-exp in -log10 space: pTail holds
            # sum(10**(pLog10_i - pMax)), so that
            # log10(sum(10**pLog10_i)) == pMax + log10(pTail).
            groups[clusterID] = {
                "chrom": chrom,
                "sMin": start,
                "eMax": end,
                "scSum": 0.0,
                "sigSum": 0.0,
                "n": 0,
                "maxS": float("-inf"),
                "peakAbs": -1,
                "pMax": float("-inf"),
                "pTail": 0.0,
                "pHasInf": False,
                "qMax": float("-inf"),
                "qMin": float("inf"),
                "qTail": 0.0,
                "qHasInf": False,
            }
        g = groups[clusterID]
        if start < g["sMin"]:
            g["sMin"] = start
        if end > g["eMax"]:
            g["eMax"] = end
        g["scSum"] += score
        g["sigSum"] += signal
        g["n"] += 1

        # Streaming accumulation of sum(10**pLog10) in log space.
        if math.isinf(pLog10) or pLog10 >= MAX_NEGLOGP:
            g["pHasInf"] = True
        else:
            if pLog10 > g["pMax"]:
                if g["pMax"] == float("-inf"):
                    g["pTail"] = 1.0
                else:
                    # Rescale the running tail to the new maximum.
                    g["pTail"] = (
                        g["pTail"] * (10 ** (g["pMax"] - pLog10))
                        + 1.0
                    )
                g["pMax"] = pLog10
            else:
                g["pTail"] += 10 ** (pLog10 - g["pMax"])

        # Same streaming scheme for q, also tracking min/max for the name field.
        if (
            math.isinf(qLog10)
            or qLog10 >= MAX_NEGLOGP
            or qLog10 <= MIN_NEGLOGP
        ):
            g["qHasInf"] = True
        else:
            if qLog10 < g["qMin"]:
                if qLog10 < MIN_NEGLOGP:
                    g["qMin"] = MIN_NEGLOGP
                else:
                    g["qMin"] = qLog10

            if qLog10 > g["qMax"]:
                if g["qMax"] == float("-inf"):
                    g["qTail"] = 1.0
                else:
                    g["qTail"] = (
                        g["qTail"] * (10 ** (g["qMax"] - qLog10))
                        + 1.0
                    )
                g["qMax"] = qLog10
            else:
                g["qTail"] += 10 ** (qLog10 - g["qMax"])

        # Point source of the merged peak = that of the strongest-signal member.
        if signal > g["maxS"]:
            g["maxS"] = signal
            g["peakAbs"] = start + peak if peak >= 0 else -1

    items = []
    for clusterID, g in groups.items():
        items.append((g["chrom"], g["sMin"], g["eMax"], g))
    items.sort(key=lambda x: (str(x[0]), x[1], x[2]))

    outPath = f"{filePath.replace('.narrowPeak', '')}.mergedMatches.narrowPeak"
    lines = []
    i = 0
    for chrom, sMin, eMax, g in items:
        i += 1
        # Mean member score, clamped to the narrowPeak 0-1000 range.
        avgScore = g["scSum"] / g["n"]
        if avgScore < 0:
            avgScore = 0
        if avgScore > 1000:
            avgScore = 1000
        scoreInt = int(round(avgScore))
        sigAvg = g["sigSum"] / g["n"]

        # Harmonic-mean p in -log10 space:
        # -log10(HM) = -log10(n) + log10(sum(10**pLog10_i))
        #            = -log10(n) + pMax + log10(pTail).
        if g["pHasInf"]:
            pHMLog10 = MAX_NEGLOGP
        else:
            if (
                g["pMax"] == float("-inf")
                or not (g["pTail"] > 0.0)
                or math.isnan(g["pTail"])
            ):
                pHMLog10 = MIN_NEGLOGP
            else:
                pHMLog10 = -math.log10(g["n"]) + (
                    g["pMax"] + math.log10(g["pTail"])
                )
                pHMLog10 = max(
                    MIN_NEGLOGP, min(pHMLog10, MAX_NEGLOGP)
                )

        # Same harmonic-mean computation for q.
        if g["qHasInf"]:
            qHMLog10 = MAX_NEGLOGP
        else:
            if (
                g["qMax"] == float("-inf")
                or not (g["qTail"] > 0.0)
                or math.isnan(g["qTail"])
            ):
                qHMLog10 = MIN_NEGLOGP
            else:
                qHMLog10 = -math.log10(g["n"]) + (
                    g["qMax"] + math.log10(g["qTail"])
                )
                qHMLog10 = max(
                    MIN_NEGLOGP, min(qHMLog10, MAX_NEGLOGP)
                )

        # Fall back to the window midpoint when no member had a valid peak.
        pointSource = (
            g["peakAbs"] - sMin
            if g["peakAbs"] >= 0
            else (eMax - sMin) // 2
        )

        qMinLog10 = g["qMin"]
        qMaxLog10 = g["qMax"]
        if math.isfinite(qMinLog10) and qMinLog10 < MIN_NEGLOGP:
            qMinLog10 = MIN_NEGLOGP
        if math.isfinite(qMaxLog10) and qMaxLog10 > MAX_NEGLOGP:
            qMaxLog10 = MAX_NEGLOGP
        elif (
            not math.isfinite(qMaxLog10)
            or not math.isfinite(qMinLog10)
        ) or (qMaxLog10 < MIN_NEGLOGP):
            qMinLog10 = 0.0
            qMaxLog10 = 0.0

        # informative+parsable name
        # e.g., regex: ^consenrichPeak\|i=(?P<i>\d+)\|gap=(?P<gap>\d+)bp\|ct=(?P<ct>\d+)\|qRange=(?P<qmin>\d+\.\d{3})_(?P<qmax>\d+\_\d{3})$
        name = f"consenrichPeak|i={i}|gap={mergeGapBP}bp|ct={g['n']}|qRange={qMinLog10:.3f}_{qMaxLog10:.3f}"
        lines.append(
            f"{chrom}\t{int(sMin)}\t{int(eMax)}\t{name}\t{scoreInt}\t.\t{sigAvg:.3f}\t{pHMLog10:.3f}\t{qHMLog10:.3f}\t{int(pointSource)}"
        )

    with open(outPath, "w") as outF:
        outF.write("\n".join(lines) + ("\n" if lines else ""))
    logger.info(f"Merged matches written to {outPath}")
    return outPath