consenrich 0.7.0b1__cp311-cp311-macosx_11_0_arm64.whl → 0.7.1b1__cp311-cp311-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


consenrich/matching.py CHANGED
@@ -3,6 +3,7 @@ r"""Module implementing (experimental) 'structured peak detection' features usin
 
 import logging
 import os
+import math
 from pybedtools import BedTool
 from typing import List, Optional
 
@@ -23,13 +24,25 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)
 
 
+def scalarClip(value: float, low: float, high: float) -> float:
+    return low if value < low else high if value > high else value
+
+
 def castableToFloat(value) -> bool:
     if value is None:
         return False
     if isinstance(value, bool):
         return False
     if isinstance(value, str):
-        if value.lower().replace(' ', '') in ["nan", "inf", "-inf", "infinity", "-infinity", "", " "]:
+        if value.lower().replace(" ", "") in [
+            "nan",
+            "inf",
+            "-inf",
+            "infinity",
+            "-infinity",
+            "",
+            " ",
+        ]:
             return False
 
     try:
@@ -75,7 +88,11 @@ def matchExistingBedGraph(
     )
 
     if mergeGapBP is None:
-        mergeGapBP = (minMatchLengthBP // 2) + 1 if minMatchLengthBP is not None else 75
+        mergeGapBP = (
+            (minMatchLengthBP // 2) + 1
+            if minMatchLengthBP is not None
+            else 75
+        )
 
     allowedTemplates = [
         x for x in pw.wavelist(kind="discrete") if "bio" not in x
@@ -129,7 +146,9 @@ def matchExistingBedGraph(
                 randSeed=randSeed,
             )
         except Exception as ex:
-            logger.info(f"Skipping {chrom_} due to error in matchWavelet: {ex}")
+            logger.info(
+                f"Skipping {chrom_} due to error in matchWavelet: {ex}"
+            )
             continue
 
         if df__.empty:
@@ -145,7 +164,9 @@ def matchExistingBedGraph(
             outPaths.append(perChromOut)
 
         if merge:
-            mergedPath = mergeMatches(perChromOut, mergeGapBP=mergeGapBP)
+            mergedPath = mergeMatches(
+                perChromOut, mergeGapBP=mergeGapBP
+            )
             if mergedPath is not None:
                 logger.info(f"Merged matches written to {mergedPath}")
                 outPathsMerged.append(mergedPath)
@@ -177,7 +198,9 @@ def matchExistingBedGraph(
         with open(path, "r") as inF:
             for line in inF:
                 outF.write(line)
-    logger.info(f"All merged matches written to {outPathMergedAll}")
+    logger.info(
+        f"All merged matches written to {outPathMergedAll}"
+    )
 
     for path_ in outPaths + outPathsMerged:
         try:
@@ -215,7 +238,8 @@ def matchWavelet(
 
     :param chromosome: Chromosome name for the input intervals and values.
     :type chromosome: str
-    :param values: 'Consensus' signal estimates derived from multiple samples, e.g., from Consenrich.
+    :param values: A 1D array of signal-like values. In this documentation, we refer to values derived from Consenrich,
+        but other continuous-valued tracks at evenly spaced genomic intervals may be suitable, too.
     :type values: npt.NDArray[np.float64]
     :param templateNames: A list of str values -- wavelet bases used for matching, e.g., `[haar, db2, sym4]`
     :type templateNames: List[str]
@@ -226,19 +250,19 @@ def matchWavelet(
         an empirical null to test significance. See :func:`cconsenrich.csampleBlockStats`.
     :type iters: int
     :param alpha: Primary significance threshold on detected matches. Specifically, the
-        :math:`1 - \alpha` quantile of an empirical null distribution. The empirical null
-        distribution is built from cross-correlation values over randomly sampled blocks.
+        minimum corr. empirical p-value approximated from randomly sampled blocks in the
+        response sequence.
     :type alpha: float
     :param minMatchLengthBP: Within a window of `minMatchLengthBP` length (bp), relative maxima in
         the signal-template convolution must be greater in value than others to qualify as matches.
     :type minMatchLengthBP: int
-    :param minSignalAtMaxima: Secondary significance threshold coupled with `alpha`. Require the *signal value*
-        at relative maxima in the response sequence to be greater than this threshold. Comparisons are made in log-scale.
-        If a `float` value is provided, the minimum signal value must be greater than this (absolute) value. *Set to a
-        negative value to disable the threshold*.
-        If a `str` value is provided, looks for 'q:quantileValue', e.g., 'q:0.75'. The
+    :param minSignalAtMaxima: Secondary significance threshold coupled with `alpha`. Requires the *signal value*
+        at relative maxima in the response sequence to be greater than this threshold. Comparisons are made in log-scale
+        to temper genome-wide dynamic range. If a `float` value is provided, the minimum signal value must be greater
+        than this (absolute) value. *Set to a negative value to disable the threshold*.
+        If a `str` value is provided, looks for 'q:quantileValue', e.g., 'q:0.90'. The
         threshold is then set to the corresponding quantile of the non-zero signal estimates.
-        Defaults to str value 'q:0.75' --- the 90th percentile of signal values.
+        Defaults to str value 'q:0.75' --- the 75th percentile of signal values.
     :type minSignalAtMaxima: Optional[str | float]
     :param useScalingFunction: If True, use (only) the scaling function to build the matching template.
         If False, use (only) the wavelet function.
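
Note on the docstring change above: the 'q:quantileValue' convention resolves to a quantile of the non-zero asinh-transformed signal, and a negative numeric value disables the secondary check entirely. A minimal standalone sketch of that resolution logic (toy data; thresholdFromSpec is a hypothetical helper mirroring the parseMinSignalThreshold closure added in the next hunk, not part of the package API):

    import numpy as np

    rng = np.random.default_rng(0)
    # toy stand-in for the non-zero asinh-transformed signal track
    asinhNonZeroValues = np.asinh(rng.exponential(scale=5.0, size=10_000))

    def thresholdFromSpec(spec) -> float:
        """Resolve a minSignalAtMaxima-style spec to an asinh-scale cutoff."""
        if isinstance(spec, str) and spec.startswith("q:"):
            q = float(spec.split("q:")[-1])
            return float(
                np.quantile(
                    asinhNonZeroValues, q, method="interpolated_inverted_cdf"
                )
            )
        v = float(spec)
        # negative values effectively disable the secondary threshold
        return -1e6 if v < 0 else float(np.asinh(v))

    print(thresholdFromSpec("q:0.75"))  # 75th percentile, asinh scale
    print(thresholdFromSpec(-1))        # -1e6, i.e., threshold off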
@@ -247,342 +271,349 @@ def matchWavelet(
     :type excludeRegionsBedFile: Optional[str]
 
     :seealso: :class:`consenrich.core.matchingParams`, :func:`cconsenrich.csampleBlockStats`, :ref:`matching`
+    :return: A pandas DataFrame with detected matches
+    :rtype: pd.DataFrame
     """
-
     if len(intervals) < 5:
         raise ValueError("`intervals` must be at least length 5")
     if len(values) != len(intervals):
-        raise ValueError("`values` must have the same length as `intervals`")
-    intervalLengthBP = intervals[1] - intervals[0]
-    if not np.all(np.abs(np.diff(intervals)) == intervalLengthBP):
-        # FFR: don't change this exception message without updating tests
-        # --'spaced' is matched in tests
+        raise ValueError(
+            "`values` must have the same length as `intervals`"
+        )
+    intervalLengthBp = intervals[1] - intervals[0]
+    if not np.all(np.abs(np.diff(intervals)) == intervalLengthBp):
         raise ValueError("`intervals` must be evenly spaced.")
-
-    randSeed_: int = int(randSeed)
-    cols = [
-        "chromosome",
-        "start",
-        "end",
-        "name",
-        "score",
-        "strand",
-        "signal",
-        "pValue",
-        "qValue",
-        "pointSource",
-    ]
-    matchDF = pd.DataFrame(columns=cols)
-    minMatchLengthBPCopy: Optional[int] = minMatchLengthBP
+    rng = np.random.default_rng(int(randSeed))
     cascadeLevels = sorted(list(set(cascadeLevels)))
     if weights is not None and len(weights) == len(values):
         values = values * weights
     asinhValues = np.asinh(values, dtype=np.float32)
     asinhNonZeroValues = asinhValues[asinhValues > 0]
-    iters = max(iters, 1000)
-    defQuantile: float = 0.75
-    for l_, cascadeLevel in enumerate(cascadeLevels):
-        for t_, templateName in enumerate(templateNames):
-            try:
-                templateName = str(templateName)
-                cascadeLevel = int(cascadeLevel)
-            except ValueError:
-                logger.info(
-                    f"Skipping invalid templateName or cascadeLevel: {templateName}, {cascadeLevel}"
+    iters = max(int(iters), 1000)
+    defQuantile = 0.75
+    chromMin = int(intervals[0])
+    chromMax = int(intervals[-1])
+    chromMid = chromMin + (chromMax - chromMin) // 2  # for split
+    halfLeftMask = intervals < chromMid
+    halfRightMask = ~halfLeftMask
+    excludeMaskGlobal = np.zeros(len(intervals), dtype=np.uint8)
+    if excludeRegionsBedFile is not None:
+        excludeMaskGlobal = core.getBedMask(
+            chromosome, excludeRegionsBedFile, intervals
+        ).astype(np.uint8)
+    allRows = []
+
+    def bhFdr(p: np.ndarray) -> np.ndarray:
+        m = len(p)
+        order = np.argsort(p, kind="mergesort")
+        ranked = np.arange(1, m + 1, dtype=float)
+        q = (p[order] * m) / ranked
+        q = np.minimum.accumulate(q[::-1])[::-1]
+        out = np.empty_like(q)
+        out[order] = q
+        return np.clip(out, 0.0, 1.0)
+
+    def parseMinSignalThreshold(val):
+        if val is None:
+            return -1e6
+        if isinstance(val, str):
+            if val.startswith("q:"):
+                qVal = float(val.split("q:")[-1])
+                if not (0 <= qVal <= 1):
+                    raise ValueError(
+                        f"Quantile {qVal} is out of range"
+                    )
+                return float(
+                    np.quantile(
+                        asinhNonZeroValues,
+                        qVal,
+                        method="interpolated_inverted_cdf",
+                    )
                 )
-                continue
+            elif castableToFloat(val):
+                v = float(val)
+                return -1e6 if v < 0 else float(np.asinh(v))
+            else:
+                return float(
+                    np.quantile(
+                        asinhNonZeroValues,
+                        defQuantile,
+                        method="interpolated_inverted_cdf",
+                    )
+                )
+        if isinstance(val, (float, int)):
+            v = float(val)
+            return -1e6 if v < 0 else float(np.asinh(v))
+        return float(
+            np.quantile(
+                asinhNonZeroValues,
+                defQuantile,
+                method="interpolated_inverted_cdf",
+            )
+        )
+
+    def relativeMaxima(
+        resp: np.ndarray, orderBins: int
+    ) -> np.ndarray:
+        return signal.argrelmax(resp, order=max(int(orderBins), 1))[0]
+
+    def sampleBlockMaxima(
+        resp: np.ndarray,
+        halfMask: np.ndarray,
+        relWindowBins: int,
+        nsamp: int,
+        seed: int,
+    ):
+        exMask = excludeMaskGlobal.astype(np.uint8).copy()
+        exMask |= (~halfMask).astype(np.uint8)
+        vals = np.array(
+            cconsenrich.csampleBlockStats(
+                intervals.astype(np.uint32),
+                resp,
+                int(relWindowBins),
+                int(nsamp),
+                int(seed),
+                exMask.astype(np.uint8),
+            ),
+            dtype=float,
+        )
+        if len(vals) == 0:
+            return vals
+        low = np.quantile(vals, 0.001)
+        high = np.quantile(vals, 0.999)
+        return vals[(vals > low) & (vals < high)]
+
+    for cascadeLevel in cascadeLevels:
+        for templateName in templateNames:
             if templateName not in pw.wavelist(kind="discrete"):
-                logger.info(
-                    f"\nSkipping unknown wavelet template: {templateName}\nAvailable templates: {pw.wavelist(kind='discrete')}"
+                logger.warning(
                    f"Skipping unknown wavelet template: {templateName}"
                 )
                 continue
 
-            wav = pw.Wavelet(templateName)
-            scalingFunc, waveletFunc, x = wav.wavefun(level=cascadeLevel)
-            template = np.array(waveletFunc, dtype=np.float64) / np.linalg.norm(
-                waveletFunc
+            wav = pw.Wavelet(str(templateName))
+            scalingFunc, waveletFunc, _ = wav.wavefun(
+                level=int(cascadeLevel)
             )
-
-            if useScalingFunction:
-                template = np.array(
-                    scalingFunc, dtype=np.float64
-                ) / np.linalg.norm(scalingFunc)
+            template = np.array(
+                scalingFunc if useScalingFunction else waveletFunc,
+                dtype=np.float64,
+            )
+            template /= np.linalg.norm(template)
 
             logger.info(
-                f"Matching: template: {templateName}, cascade level: {cascadeLevel}, template length: {len(template)}, scaling: {useScalingFunction}, wavelet: {not useScalingFunction}"
+                f"\n\tMatching template: {templateName}"
+                f"\n\tcascade level: {cascadeLevel}"
+                f"\n\ttemplate length: {len(template)}"
             )
 
-            responseSequence: npt.NDArray[np.float64] = signal.fftconvolve(
+            # efficient FFT-based cross-correlation
+            # (OA may be better for smaller templates, TODO add a check)
+            response = signal.fftconvolve(
                 values, template[::-1], mode="same"
             )
-
-            minMatchLengthBP = minMatchLengthBPCopy
-            if minMatchLengthBP is None or minMatchLengthBP < 1:
-                minMatchLengthBP = len(template) * intervalLengthBP
-            if minMatchLengthBP % intervalLengthBP != 0:
-                minMatchLengthBP += intervalLengthBP - (
-                    minMatchLengthBP % intervalLengthBP
+            thisMinMatchBp = minMatchLengthBP
+            if thisMinMatchBp is None or thisMinMatchBp < 1:
+                thisMinMatchBp = len(template) * intervalLengthBp
+            if thisMinMatchBp % intervalLengthBp != 0:
+                thisMinMatchBp += intervalLengthBp - (
+                    thisMinMatchBp % intervalLengthBp
                 )
-
-            relativeMaximaWindow = int(
-                ((minMatchLengthBP / intervalLengthBP) / 2) + 1
+            relWindowBins = int(
+                ((thisMinMatchBp / intervalLengthBp) / 2) + 1
             )
-            relativeMaximaWindow = max(relativeMaximaWindow, 1)
-
-            excludeMask = np.zeros(len(intervals), dtype=np.uint8)
-            if excludeRegionsBedFile is not None:
-                excludeMask = core.getBedMask(
-                    chromosome,
-                    excludeRegionsBedFile,
-                    intervals,
-                )
-
-            logger.info(
-                f"\nSampling {iters} block maxima for template {templateName} at cascade level {cascadeLevel} with (expected) relative maxima window size {relativeMaximaWindow}.\n"
+            relWindowBins = max(relWindowBins, 1)
+            asinhThreshold = parseMinSignalThreshold(
+                minSignalAtMaxima
             )
-            blockMaxima = np.array(
-                cconsenrich.csampleBlockStats(
-                    intervals.astype(np.uint32),
-                    responseSequence,
-                    relativeMaximaWindow,
-                    iters * 2,
-                    randSeed_,
-                    excludeMask.astype(np.uint8),
-                ),
-                dtype=float,
-            )
-            blockMaximaCheck = blockMaxima.copy()[iters:]
-            blockMaxima = blockMaxima[:iters]
-            blockMaxima = blockMaxima[
-                (blockMaxima > np.quantile(blockMaxima, 0.005))
-                & (blockMaxima < np.quantile(blockMaxima, 0.995))
-            ]
-
-            ecdfBlockMaximaSF = stats.ecdf(blockMaxima).sf
-
-            responseThreshold = float(1e6)
-            arsinhSignalThreshold = float(1e6)
-            try:
-                # we use 'interpolated_inverted_cdf' in a few spots
-                # --- making sure it's supported here, at its first use
-                responseThreshold = np.quantile(
-                    blockMaxima, 1 - alpha, method="interpolated_inverted_cdf"
+            for nullMask, testMask, tag in [
+                (halfLeftMask, halfRightMask, "R"),
+                (halfRightMask, halfLeftMask, "L"),
+            ]:
+                blockMaxima = sampleBlockMaxima(
+                    response,
+                    nullMask,
+                    relWindowBins,
+                    nsamp=max(iters, 1000),
+                    seed=rng.integers(1, 10_000),
                 )
-            except (TypeError, ValueError, KeyError) as err_:
-                logger.warning(
-                    f"\nError computing response threshold with alpha={alpha}:\n{err_}\n"
-                    f"\nIs `blockMaxima` empty?"
-                    f"\nIs NumPy older than 1.22.0 (~May 2022~)?"
-                    f"\nIs `alpha` in (0,1)?\n"
-                )
-                raise
-
-            # parse minSignalAtMaxima, set arsinhSignalThreshold
-            if minSignalAtMaxima is None:
-                # -----we got a `None`-----
-                arsinhSignalThreshold = -float(1e6)
-            elif isinstance(minSignalAtMaxima, str):
-                # -----we got a str-----
-                if minSignalAtMaxima.startswith("q:"):
-                    # case: expected 'q:quantileValue' format
-                    qVal = float(minSignalAtMaxima.split("q:")[-1])
-                    if qVal < 0 or qVal > 1:
-                        raise ValueError(f"Quantile {qVal} is out of range")
-                    arsinhSignalThreshold = float(
-                        np.quantile(
-                            asinhNonZeroValues,
-                            qVal,
-                            method="interpolated_inverted_cdf",
-                        )
-                    )
-
-                elif castableToFloat(minSignalAtMaxima):
-                    # case: numeric in str form (possible due to CLI)
-                    if float(minSignalAtMaxima) < 0.0:
-                        # effectively disables threshold
-                        arsinhSignalThreshold = -float(1e6)
-                    else:
-                        # use supplied value
-                        arsinhSignalThreshold = np.asinh(
-                            float(minSignalAtMaxima)
-                        )
-                else:
-                    # case: not in known format, not castable to a float, use defaults
-                    logger.info(
-                        f"Couldn't parse `minSignalAtMaxima` value: {minSignalAtMaxima}, using default"
-                    )
-                    arsinhSignalThreshold = float(
-                        np.quantile(
-                            asinhNonZeroValues,
-                            defQuantile,
-                            method="interpolated_inverted_cdf",
-                        )
+                if len(blockMaxima) < 25:
+                    pooledMask = ~excludeMaskGlobal.astype(bool)
+                    blockMaxima = sampleBlockMaxima(
+                        response,
+                        pooledMask,
+                        relWindowBins,
+                        nsamp=max(iters, 1000),
+                        seed=rng.integers(1, 10_000),
                     )
-                # -----
-
-            elif isinstance(minSignalAtMaxima, (float, int)):
-                # -----we got an int or float-----
-                if float(minSignalAtMaxima) < 0.0:
-                    # effectively disables threshold
-                    arsinhSignalThreshold = -float(1e6)
-                else:
-                    # use supplied value
-                    arsinhSignalThreshold = np.asinh(float(minSignalAtMaxima))
-            # -----
-
-
-            relativeMaximaIndices = signal.argrelmax(
-                responseSequence, order=relativeMaximaWindow
-            )[0]
-
-            relativeMaximaIndices = relativeMaximaIndices[
-                (responseSequence[relativeMaximaIndices] > responseThreshold)
-                & (asinhValues[relativeMaximaIndices] > arsinhSignalThreshold)
-            ]
-
-            if len(relativeMaximaIndices) == 0:
-                logger.info(
-                    f"no matches were detected using for template {templateName} at cascade level {cascadeLevel}...skipping matching"
+                ecdfSf = stats.ecdf(blockMaxima).sf
+                candidateIdx = relativeMaxima(response, relWindowBins)
+
+                candidateMask = (
+                    (candidateIdx >= relWindowBins)
+                    & (candidateIdx < len(response) - relWindowBins)
+                    & (testMask[candidateIdx])
+                    & (excludeMaskGlobal[candidateIdx] == 0)
+                    & (asinhValues[candidateIdx] > asinhThreshold)
                 )
-                continue
 
-            if maxNumMatches is not None:
-                if len(relativeMaximaIndices) > maxNumMatches:
-                    # take the greatest maxNumMatches (by 'signal')
-                    relativeMaximaIndices = relativeMaximaIndices[
-                        np.argsort(asinhValues[relativeMaximaIndices])[
+                candidateIdx = candidateIdx[candidateMask]
+                if len(candidateIdx) == 0:
+                    continue
+                if (
+                    maxNumMatches is not None
+                    and len(candidateIdx) > maxNumMatches
+                ):
+                    candidateIdx = candidateIdx[
+                        np.argsort(asinhValues[candidateIdx])[
                             -maxNumMatches:
                         ]
                     ]
-
-            ecdfSFCheckVals: npt.NDArray[np.float64] = (
-                ecdfBlockMaximaSF.evaluate(blockMaximaCheck)
-            )
-            testKS, _ = stats.kstest(
-                ecdfSFCheckVals,
-                stats.uniform.cdf,
-                alternative="two-sided",
-            )
-
-            logger.info(
-                f"\n\tDetected {len(relativeMaximaIndices)} matches (alpha={alpha}, useScalingFunction={useScalingFunction}): {templateName}: level={cascadeLevel}.\n"
-                f"\tResponse threshold: {responseThreshold:.3f}, arsinh(Signal Threshold): {arsinhSignalThreshold:.3f}\n"
-                f"\t~KS_Statistic~ [ePVals, uniformCDF]: {testKS:.4f}\n"
-                f"\n\n{textNullCDF(ecdfSFCheckVals)}\n\n"  # lil text-plot histogram of approx. null CDF
-            )
-
-            # starts
-            startsIdx = np.maximum(
-                relativeMaximaIndices - relativeMaximaWindow, 0
-            )
-            # ends
-            endsIdx = np.minimum(
-                len(values) - 1, relativeMaximaIndices + relativeMaximaWindow
-            )
-            # point source
-            pointSourcesIdx = []
-            for start_, end_ in zip(startsIdx, endsIdx):
-                pointSourcesIdx.append(
-                    np.argmax(values[start_ : end_ + 1]) + start_
+                pEmp = np.clip(
+                    ecdfSf.evaluate(response[candidateIdx]),
+                    1.0e-10,
+                    1.0,
                 )
-            pointSourcesIdx = np.array(pointSourcesIdx)
-            starts = intervals[startsIdx]
-            ends = intervals[endsIdx]
-            pointSources = (intervals[pointSourcesIdx]) + max(
-                1, intervalLengthBP // 2
-            )
-            if (
-                recenterAtPointSource
-            ):  # recenter at point source (signal maximum)
-                starts = pointSources - (
-                    relativeMaximaWindow * intervalLengthBP
+                startsIdx = np.maximum(
+                    candidateIdx - relWindowBins, 0
                 )
-                ends = pointSources + (relativeMaximaWindow * intervalLengthBP)
-                pointSources = (intervals[pointSourcesIdx] - starts) + max(
-                    1, intervalLengthBP // 2
-                )
-            # (ucsc browser) score [0,1000]
-            sqScores = (1 + responseSequence[relativeMaximaIndices]) ** 2
-            minResponse = np.min(sqScores)
-            maxResponse = np.max(sqScores)
-            rangeResponse = max(maxResponse - minResponse, 1.0)
-            scores = (
-                250 + 750 * (sqScores - minResponse) / rangeResponse
-            ).astype(int)
-            # feature name
-            names = [
-                f"{templateName}_{cascadeLevel}_{i}"
-                for i in relativeMaximaIndices
-            ]
-            # strand
-            strands = ["." for _ in range(len(scores))]
-            # p-values in -log10 scale per convention
-            pValues = -np.log10(
-                np.clip(
-                    ecdfBlockMaximaSF.evaluate(
-                        responseSequence[relativeMaximaIndices]
-                    ),
-                    1e-10,
-                    1.0,
+                endsIdx = np.minimum(
+                    len(values) - 1, candidateIdx + relWindowBins
                 )
-            )
-            # q-values (ignored)
-            qValues = np.array(np.ones_like(pValues) * -1.0)
-
-            tempDF = pd.DataFrame(
-                {
-                    "chromosome": [chromosome] * len(relativeMaximaIndices),
-                    "start": starts.astype(int),
-                    "end": ends.astype(int),
-                    "name": names,
-                    "score": scores,
-                    "strand": strands,
-                    "signal": responseSequence[relativeMaximaIndices],
-                    "pValue": pValues,
-                    "qValue": qValues,
-                    "pointSource": pointSources.astype(int),
-                }
-            )
+                pointSourcesIdx = []
+                for s, e in zip(startsIdx, endsIdx):
+                    pointSourcesIdx.append(
+                        np.argmax(values[s : e + 1]) + s
+                    )
+                pointSourcesIdx = np.array(pointSourcesIdx)
+                starts = intervals[startsIdx]
+                ends = intervals[endsIdx]
+                pointSourcesAbs = (intervals[pointSourcesIdx]) + max(
+                    1, intervalLengthBp // 2
+                )
+                if recenterAtPointSource:
+                    starts = pointSourcesAbs - (
+                        relWindowBins * intervalLengthBp
+                    )
+                    ends = pointSourcesAbs + (
+                        relWindowBins * intervalLengthBp
+                    )
+                pointSourcesRel = (
+                    intervals[pointSourcesIdx] - starts
+                ) + max(1, intervalLengthBp // 2)
+                sqScores = (1 + response[candidateIdx]) ** 2
+                minR, maxR = (
+                    float(np.min(sqScores)),
+                    float(np.max(sqScores)),
+                )
+                rangeR = max(maxR - minR, 1.0)
+                scores = (
+                    250 + 750 * (sqScores - minR) / rangeR
+                ).astype(int)
+                for i, idxVal in enumerate(candidateIdx):
+                    allRows.append(
+                        {
+                            "chromosome": chromosome,
+                            "start": int(starts[i]),
+                            "end": int(ends[i]),
+                            "name": f"{templateName}_{cascadeLevel}_{idxVal}_{tag}",
+                            "score": int(scores[i]),
+                            "strand": ".",
+                            "signal": float(response[idxVal]),
+                            "p_raw": float(pEmp[i]),
+                            "pointSource": int(pointSourcesRel[i]),
+                        }
+                    )
 
-            if matchDF.empty:
-                matchDF = tempDF
-            else:
-                matchDF = pd.concat([matchDF, tempDF], ignore_index=True)
-            randSeed_ += 1
+    if not allRows:
+        logger.warning(
+            "No matches detected, returning empty DataFrame."
+        )
 
-    if matchDF.empty:
-        logger.info("No matches detected, returning empty DataFrame.")
-        return matchDF
-    matchDF.sort_values(by=["chromosome", "start", "end"], inplace=True)
-    matchDF.reset_index(drop=True, inplace=True)
-    return matchDF
+        return pd.DataFrame(
+            columns=[
+                "chromosome",
+                "start",
+                "end",
+                "name",
+                "score",
+                "strand",
+                "signal",
+                "pValue",
+                "qValue",
+                "pointSource",
+            ]
+        )
 
+    df = pd.DataFrame(allRows)
+    qVals = bhFdr(df["p_raw"].values.astype(float))
+    df["pValue"] = -np.log10(
+        np.clip(df["p_raw"].values, 1.0e-10, 1.0)
+    )
+    df["qValue"] = -np.log10(np.clip(qVals, 1.0e-10, 1.0))
+    df.drop(columns=["p_raw"], inplace=True)
+    df = df[qVals <= alpha].copy()
+    df["chromosome"] = df["chromosome"].astype(str)
+    df.sort_values(by=["chromosome", "start", "end"], inplace=True)
+    df.reset_index(drop=True, inplace=True)
+    df = df[
+        [
+            "chromosome",
+            "start",
+            "end",
+            "name",
+            "score",
+            "strand",
+            "signal",
+            "pValue",
+            "qValue",
+            "pointSource",
+        ]
+    ]
+    return df
 
-def mergeMatches(filePath: str, mergeGapBP: int = 50):
-    r"""Merge overlapping or nearby structured peaks (matches) in a narrowPeak file.
 
-    Where an overlap occurs within `mergeGapBP` base pairs, the feature with the greatest signal defines the new summit/pointSource
+def mergeMatches(
+    filePath: str,
+    mergeGapBP: Optional[int],
+) -> Optional[str]:
+    r"""Merge overlapping or nearby structured peaks ('matches') in a narrowPeak file.
+
+    The harmonic mean of p-values and q-values is computed for each merged region within `mergeGapBP` base pairs.
+    The fourth column (name) of each merged peak contains information about the number of features that were merged
+    and the range of q-values among them.
+
+    Expects a `narrowPeak <https://genome.ucsc.edu/FAQ/FAQformat.html#format12>`_ file as input (all numeric columns, '.' for strand if unknown).
 
     :param filePath: narrowPeak file containing matches detected with :func:`consenrich.matching.matchWavelet`
     :type filePath: str
-    :param mergeGapBP: Maximum gap size (in base pairs) to consider for merging
-    :type mergeGapBP: int
+    :param mergeGapBP: Maximum gap size (in base pairs) to consider for merging. Defaults to 75 bp if `None` or less than 1.
+    :type mergeGapBP: Optional[int]
 
-    :seealso: :class:`consenrich.core.matchingParams`
+    :seealso: :ref:`matching`, :class:`consenrich.core.matchingParams`
     """
+
+    if mergeGapBP is None or mergeGapBP < 1:
+        mergeGapBP = 75
+
+    MAX_NEGLOGP = 10.0
+    MIN_NEGLOGP = 1.0e-10
+
     if not os.path.isfile(filePath):
-        logger.info(f"Couldn't access {filePath}...skipping merge")
+        logger.warning(f"Couldn't access {filePath}...skipping merge")
         return None
     bed = None
     try:
         bed = BedTool(filePath)
     except Exception as ex:
-        logger.info(
+        logger.warning(
             f"Couldn't create BedTool for {filePath}:\n{ex}\n\nskipping merge..."
         )
         return None
     if bed is None:
-        logger.info(f"Couldn't create BedTool for {filePath}...skipping merge")
+        logger.warning(
            f"Couldn't create BedTool for {filePath}...skipping merge"
        )
         return None
 
     bed = bed.sort()
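
The substantive change in the hunk above: candidate maxima are no longer gated by a fixed (1 - alpha) quantile of the null response. Each relative maximum instead receives an empirical p-value from block maxima sampled on the opposite chromosome half (the "L"/"R" split), and alpha is applied to Benjamini-Hochberg q-values pooled across templates and cascade levels. A self-contained sketch of the BH step, restating the nested bhFdr helper with toy inputs (expected output in the comment):

    import numpy as np

    def bhFdr(p: np.ndarray) -> np.ndarray:
        # Benjamini-Hochberg adjusted p-values: sort, scale p_(k) by m/k,
        # enforce monotonicity from the largest p downward, restore input order.
        m = len(p)
        order = np.argsort(p, kind="mergesort")  # stable sort
        q = (p[order] * m) / np.arange(1, m + 1, dtype=float)
        q = np.minimum.accumulate(q[::-1])[::-1]
        out = np.empty_like(q)
        out[order] = q
        return np.clip(out, 0.0, 1.0)

    pEmp = np.array([0.001, 0.009, 0.04, 0.2, 0.9])
    print(bhFdr(pEmp))  # approx. [0.005, 0.0225, 0.0667, 0.25, 0.9]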
@@ -595,41 +626,86 @@ def mergeMatches(filePath: str, mergeGapBP: int = 50):
         end = int(fields[2])
         score = float(fields[4])
         signal = float(fields[6])
-        pval = float(fields[7])
-        qval = float(fields[8])
+        pLog10 = float(fields[7])
+        qLog10 = float(fields[8])
         peak = int(fields[9])
-        clId = fields[-1]
-        if clId not in groups:
-            groups[clId] = {
+        clusterID = fields[-1]
+        if clusterID not in groups:
+            groups[clusterID] = {
                 "chrom": chrom,
                 "sMin": start,
                 "eMax": end,
                 "scSum": 0.0,
                 "sigSum": 0.0,
-                "pSum": 0.0,
-                "qSum": 0.0,
                 "n": 0,
                 "maxS": float("-inf"),
                 "peakAbs": -1,
+                "pMax": float("-inf"),
+                "pTail": 0.0,
+                "pHasInf": False,
+                "qMax": float("-inf"),
+                "qMin": float("inf"),
+                "qTail": 0.0,
+                "qHasInf": False,
             }
-        g = groups[clId]
+        g = groups[clusterID]
         if start < g["sMin"]:
             g["sMin"] = start
         if end > g["eMax"]:
             g["eMax"] = end
         g["scSum"] += score
         g["sigSum"] += signal
-        g["pSum"] += pval
-        g["qSum"] += qval
         g["n"] += 1
-        # scan for largest signal, FFR: consider using the p-val in the future
+
+        if math.isinf(pLog10) or pLog10 >= MAX_NEGLOGP:
+            g["pHasInf"] = True
+        else:
+            if pLog10 > g["pMax"]:
+                if g["pMax"] == float("-inf"):
+                    g["pTail"] = 1.0
+                else:
+                    g["pTail"] = (
+                        g["pTail"] * (10 ** (g["pMax"] - pLog10))
+                        + 1.0
+                    )
+                g["pMax"] = pLog10
+            else:
+                g["pTail"] += 10 ** (pLog10 - g["pMax"])
+
+        if (
+            math.isinf(qLog10)
+            or qLog10 >= MAX_NEGLOGP
+            or qLog10 <= MIN_NEGLOGP
+        ):
+            g["qHasInf"] = True
+        else:
+            if qLog10 < g["qMin"]:
+                if qLog10 < MIN_NEGLOGP:
+                    g["qMin"] = MIN_NEGLOGP
+                else:
+                    g["qMin"] = qLog10
+
+            if qLog10 > g["qMax"]:
+                if g["qMax"] == float("-inf"):
+                    g["qTail"] = 1.0
+                else:
+                    g["qTail"] = (
+                        g["qTail"] * (10 ** (g["qMax"] - qLog10))
+                        + 1.0
+                    )
+                g["qMax"] = qLog10
+            else:
+                g["qTail"] += 10 ** (qLog10 - g["qMax"])
+
         if signal > g["maxS"]:
             g["maxS"] = signal
             g["peakAbs"] = start + peak if peak >= 0 else -1
+
     items = []
-    for clId, g in groups.items():
+    for clusterID, g in groups.items():
         items.append((g["chrom"], g["sMin"], g["eMax"], g))
     items.sort(key=lambda x: (str(x[0]), x[1], x[2]))
+
     outPath = f"{filePath.replace('.narrowPeak', '')}.mergedMatches.narrowPeak"
     lines = []
     i = 0
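
The p/q accumulation added above is a base-10 logsumexp: for -log10 values l_i, the running pair (pMax, pTail) maintains sum_i 10**l_i = 10**pMax * pTail without forming the potentially huge powers directly, since the exponents handled are always differences relative to the running maximum. A small standalone check of that invariant using the same update rule on toy values (not package code):

    import math

    pMax, pTail = float("-inf"), 0.0
    ls = [2.3, 5.1, 4.0, 5.1]  # -log10 p-values within one merged cluster
    for l in ls:
        if l > pMax:
            # rescale the old tail to the new maximum, then count this term
            pTail = 1.0 if pMax == float("-inf") else pTail * 10 ** (pMax - l) + 1.0
            pMax = l
        else:
            pTail += 10 ** (l - pMax)

    direct = sum(10**l for l in ls)
    assert math.isclose(10**pMax * pTail, direct)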
@@ -642,69 +718,68 @@ def mergeMatches(filePath: str, mergeGapBP: int = 50):
         avgScore = 1000
         scoreInt = int(round(avgScore))
         sigAvg = g["sigSum"] / g["n"]
-        pAvg = g["pSum"] / g["n"]
-        qAvg = g["qSum"] / g["n"]
-        pointSource = g["peakAbs"] - sMin if g["peakAbs"] >= 0 else -1
-        name = f"mergedPeak{i}"
-        lines.append(
-            f"{chrom}\t{int(sMin)}\t{int(eMax)}\t{name}\t{scoreInt}\t.\t{sigAvg:.3f}\t{pAvg:.3f}\t{qAvg:.3f}\t{int(pointSource)}"
-        )
-    with open(outPath, "w") as outF:
-        outF.write("\n".join(lines) + ("\n" if lines else ""))
-    logger.info(f"Merged matches written to {outPath}")
-    return outPath
 
+        if g["pHasInf"]:
+            pHMLog10 = MAX_NEGLOGP
+        else:
+            if (
+                g["pMax"] == float("-inf")
+                or not (g["pTail"] > 0.0)
+                or math.isnan(g["pTail"])
+            ):
+                pHMLog10 = MIN_NEGLOGP
+            else:
+                pHMLog10 = -math.log10(g["n"]) + (
+                    g["pMax"] + math.log10(g["pTail"])
+                )
+                pHMLog10 = max(
+                    MIN_NEGLOGP, min(pHMLog10, MAX_NEGLOGP)
+                )
 
-def textNullCDF(
-    nullBlockMaximaSFVals: npt.NDArray[np.float64],
-    binCount: int = 20,
-    barWidth: int = 50,
-    barChar="\u25a2",
-    normalize: bool = False,
-) -> str:
-    r"""Plot a histogram of the distribution 1 - ECDF(nullBlockMaxima)
+        if g["qHasInf"]:
+            qHMLog10 = MAX_NEGLOGP
+        else:
+            if (
+                g["qMax"] == float("-inf")
+                or not (g["qTail"] > 0.0)
+                or math.isnan(g["qTail"])
+            ):
+                qHMLog10 = MIN_NEGLOGP
+            else:
+                qHMLog10 = -math.log10(g["n"]) + (
+                    g["qMax"] + math.log10(g["qTail"])
+                )
+                qHMLog10 = max(
+                    MIN_NEGLOGP, min(qHMLog10, MAX_NEGLOGP)
+                )
 
-    Called by :func:`consenrich.matching.matchWavelet`. Ideally resembles
-    a uniform(0,1) distribution.
+        pointSource = (
+            g["peakAbs"] - sMin
+            if g["peakAbs"] >= 0
+            else (eMax - sMin) // 2
+        )
 
-    :seealso: :func:`consenrich.matching.matchWavelet`, :ref:`cconsenrich.csampleBlockStats`
-    """
-    valueLower, valueUpper = (
-        min(nullBlockMaximaSFVals),
-        max(nullBlockMaximaSFVals),
-    )
-    binCount = max(1, int(binCount))
-    binStep = (valueUpper - valueLower) / binCount
-    binEdges = [
-        valueLower + indexValue * binStep for indexValue in range(binCount)
-    ]
-    binEdges.append(valueUpper)
-    binCounts = [0] * binCount
-    for numericValue in nullBlockMaximaSFVals:
-        binIndex = int((numericValue - valueLower) / binStep)
-        if binIndex == binCount:
-            binIndex -= 1
-        binCounts[binIndex] += 1
-    valueSeries = (
-        [countValue / len(nullBlockMaximaSFVals) for countValue in binCounts]
-        if normalize
-        else binCounts[:]
-    )
-    valueMaximum = max(valueSeries) if valueSeries else 0
-    widthScale = (barWidth / valueMaximum) if valueMaximum > 0 else 0
-    edgeFormat = f"{{:.{2}f}}"
-    rangeLabels = [
-        f"[{edgeFormat.format(binEdges[indexValue])},{edgeFormat.format(binEdges[indexValue + 1])})"
-        for indexValue in range(binCount)
-    ]
-    labelWidth = max(len(textValue) for textValue in rangeLabels)
-    lines = ['Histogram: "1 - ECDF(nullBlockMaxima)"']
-    for rangeLabel, seriesValue, countValue in zip(
-        rangeLabels, valueSeries, binCounts
-    ):
-        barString = barChar * int(round(seriesValue * widthScale))
-        trailingText = f"({countValue}/{len(nullBlockMaximaSFVals)})\t\t"
+        qMinLog10 = g["qMin"]
+        qMaxLog10 = g["qMax"]
+        if math.isfinite(qMinLog10) and qMinLog10 < MIN_NEGLOGP:
+            qMinLog10 = MIN_NEGLOGP
+        if math.isfinite(qMaxLog10) and qMaxLog10 > MAX_NEGLOGP:
+            qMaxLog10 = MAX_NEGLOGP
+        elif (
+            not math.isfinite(qMaxLog10)
+            or not math.isfinite(qMinLog10)
+        ) or (qMaxLog10 < MIN_NEGLOGP):
+            qMinLog10 = 0.0
+            qMaxLog10 = 0.0
+
+        # informative+parsable name
+        # e.g., regex: ^consenrichPeak\|i=(?P<i>\d+)\|gap=(?P<gap>\d+)bp\|ct=(?P<ct>\d+)\|qRange=(?P<qmin>\d+\.\d{3})_(?P<qmax>\d+\_\d{3})$
+        name = f"consenrichPeak|i={i}|gap={mergeGapBP}bp|ct={g['n']}|qRange={qMinLog10:.3f}_{qMaxLog10:.3f}"
         lines.append(
-            f"{rangeLabel.rjust(labelWidth)} | {barString}{trailingText.ljust(10)}"
+            f"{chrom}\t{int(sMin)}\t{int(eMax)}\t{name}\t{scoreInt}\t.\t{sigAvg:.3f}\t{pHMLog10:.3f}\t{qHMLog10:.3f}\t{int(pointSource)}"
         )
-    return "\n".join(lines)
+
+    with open(outPath, "w") as outF:
+        outF.write("\n".join(lines) + ("\n" if lines else ""))
+    logger.info(f"Merged matches written to {outPath}")
+    return outPath
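
Closing the loop on the merged statistics: for p_i = 10**(-l_i), the harmonic mean is n / sum_i(1/p_i), and sum_i(1/p_i) = sum_i 10**l_i = 10**pMax * pTail, so in -log10 units the reported value is -log10(n) + (pMax + log10(pTail)) -- exactly the pHMLog10/qHMLog10 expressions above, before clamping to [MIN_NEGLOGP, MAX_NEGLOGP]. A toy verification against the direct computation (standalone sketch, not package code):

    import math

    ls = [2.3, 5.1, 4.0, 5.1]  # -log10 p-values in one merged region
    n = len(ls)
    pMax = max(ls)
    pTail = sum(10 ** (l - pMax) for l in ls)
    hmLog10 = -math.log10(n) + (pMax + math.log10(pTail))

    hmDirect = n / sum(10**l for l in ls)  # harmonic mean of the p_i
    assert math.isclose(hmLog10, -math.log10(hmDirect))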