consenrich 0.7.0b1__cp313-cp313-macosx_11_0_arm64.whl → 0.7.1b2__cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

consenrich/matching.py CHANGED
@@ -3,6 +3,7 @@ r"""Module implementing (experimental) 'structured peak detection' features usin
 
 import logging
 import os
+import math
 from pybedtools import BedTool
 from typing import List, Optional
 
@@ -23,13 +24,64 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)
 
 
+def autoMinLengthIntervals(
+    values: np.ndarray, initLen: int = 3
+) -> int:
+    r"""Determines a minimum matching length (in interval units) based on the input signal values.
+
+    Returns the mean length of non-zero contiguous segments in a log-scaled/centered version of `values`.
+
+    :param values: A 1D array of signal-like values.
+    :type values: np.ndarray
+    :param initLen: Initial minimum length (in intervals). Defaults to 3.
+    :type initLen: int
+    :return: Estimated minimum matching length (in intervals)
+    :rtype: int
+    """
+    trValues = np.asinh(values) - signal.medfilt(
+        np.asinh(values),
+        kernel_size=max(
+            (2 * initLen) + 1,
+            2 * (int(len(values) * 0.005)) + 1,
+        ),
+    )
+    nz = trValues[trValues > 0]
+    if len(nz) == 0:
+        return initLen
+    thr = np.quantile(nz, 0.90, method="interpolated_inverted_cdf")
+    mask = nz >= thr
+    if not np.any(mask):
+        return initLen
+    idx = np.flatnonzero(np.diff(np.r_[False, mask, False]))
+    runs = idx.reshape(-1, 2)
+    widths = runs[:, 1] - runs[:, 0]
+    widths = widths[widths >= initLen]
+    if len(widths) == 0:
+        return initLen
+    return int(np.mean(widths))
+
+
+def scalarClip(value: float, low: float, high: float) -> float:
+    return low if value < low else high if value > high else value
+
+
 def castableToFloat(value) -> bool:
     if value is None:
         return False
     if isinstance(value, bool):
         return False
     if isinstance(value, str):
-        if value.lower().replace(' ', '') in ["nan", "inf", "-inf", "infinity", "-infinity", "", " "]:
+        if value.lower().replace(" ", "") in [
+            "nan",
+            "inf",
+            "-inf",
+            "infinity",
+            "-infinity",
+            "",
+            " ",
+        ]:
             return False
 
     try:
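
Note on the new `autoMinLengthIntervals` helper above: it detrends asinh-scaled values with a median filter, keeps the top decile of the positive residuals, and returns the mean run length of the surviving segments (in interval units). A self-contained sketch of the run-length step it relies on, with made-up data:

import numpy as np

# diff over a False-padded boolean mask flags run boundaries; reshaping the
# flat indices into pairs gives one half-open [start, end) row per run
mask = np.array([0, 1, 1, 0, 1, 1, 1, 0], dtype=bool)
idx = np.flatnonzero(np.diff(np.r_[False, mask, False]))
runs = idx.reshape(-1, 2)          # [[1, 3], [4, 7]]
widths = runs[:, 1] - runs[:, 0]   # [2, 3]
print(runs.tolist(), widths.tolist())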
@@ -75,7 +127,11 @@ def matchExistingBedGraph(
     )
 
     if mergeGapBP is None:
-        mergeGapBP = (minMatchLengthBP // 2) + 1 if minMatchLengthBP is not None else 75
+        mergeGapBP = (
+            (minMatchLengthBP // 2) + 1
+            if minMatchLengthBP is not None
+            else 75
+        )
 
     allowedTemplates = [
         x for x in pw.wavelist(kind="discrete") if "bio" not in x
@@ -107,7 +163,7 @@ def matchExistingBedGraph(
     for chrom_ in sorted(bedGraphDF["chromosome"].unique()):
         df_ = bedGraphDF[bedGraphDF["chromosome"] == chrom_]
         if len(df_) < 5:
-            logger.info(f"Skipping {chrom_}: fewer than 5 rows.")
+            logger.info(f"Skipping {chrom_}: fewer than 5 intervals.")
             continue
 
         try:
@@ -129,7 +185,9 @@ def matchExistingBedGraph(
                 randSeed=randSeed,
             )
         except Exception as ex:
-            logger.info(f"Skipping {chrom_} due to error in matchWavelet: {ex}")
+            logger.info(
+                f"Skipping {chrom_} due to error in matchWavelet: {ex}"
+            )
             continue
 
         if df__.empty:
@@ -145,7 +203,9 @@ def matchExistingBedGraph(
             outPaths.append(perChromOut)
 
             if merge:
-                mergedPath = mergeMatches(perChromOut, mergeGapBP=mergeGapBP)
+                mergedPath = mergeMatches(
+                    perChromOut, mergeGapBP=mergeGapBP
+                )
                 if mergedPath is not None:
                     logger.info(f"Merged matches written to {mergedPath}")
                     outPathsMerged.append(mergedPath)
@@ -177,7 +237,9 @@ def matchExistingBedGraph(
             with open(path, "r") as inF:
                 for line in inF:
                     outF.write(line)
-        logger.info(f"All merged matches written to {outPathMergedAll}")
+        logger.info(
+            f"All merged matches written to {outPathMergedAll}"
+        )
 
     for path_ in outPaths + outPathsMerged:
         try:
@@ -211,34 +273,38 @@ def matchWavelet(
     excludeRegionsBedFile: Optional[str] = None,
     weights: Optional[npt.NDArray[np.float64]] = None,
 ) -> pd.DataFrame:
-    r"""Detect structured peaks by cross-correlating Consenrich tracks with wavelet- or scaling-function templates.
+    r"""Detect structured peaks in Consenrich tracks by matching wavelet- or scaling-function-based templates.
 
     :param chromosome: Chromosome name for the input intervals and values.
     :type chromosome: str
-    :param values: 'Consensus' signal estimates derived from multiple samples, e.g., from Consenrich.
+    :param values: A 1D array of signal-like values. In this documentation, we refer to values derived from Consenrich,
+        but other continuous-valued tracks at evenly spaced genomic intervals may be suitable, too.
     :type values: npt.NDArray[np.float64]
-    :param templateNames: A list of str values -- wavelet bases used for matching, e.g., `[haar, db2, sym4]`
+    :param templateNames: A list of str values -- each entry references a mother wavelet (or its corresponding scaling function), e.g., `[haar, db2]`.
     :type templateNames: List[str]
-    :param cascadeLevels: A list of int values -- the number of cascade iterations used for approximating
-        the scaling/wavelet functions.
+    :param cascadeLevels: Number of cascade iterations used to approximate each template (wavelet or scaling function).
+        Must have the same length as `templateNames`, with each entry aligned to the corresponding template,
+        e.g., given templateNames `[haar, db2]`, `[2, 2]` would use 2 cascade levels for both templates.
     :type cascadeLevels: List[int]
     :param iters: Number of random blocks to sample in the response sequence while building
         an empirical null to test significance. See :func:`cconsenrich.csampleBlockStats`.
     :type iters: int
-    :param alpha: Primary significance threshold on detected matches. Specifically, the
-        :math:`1 - \alpha` quantile of an empirical null distribution. The empirical null
-        distribution is built from cross-correlation values over randomly sampled blocks.
+    :param alpha: Primary significance threshold on detected matches: a match is retained if its
+        corrected empirical p-value, approximated from randomly sampled blocks in the
+        response sequence, is at most `alpha`.
     :type alpha: float
     :param minMatchLengthBP: Within a window of `minMatchLengthBP` length (bp), relative maxima in
         the signal-template convolution must be greater in value than others to qualify as matches.
-    :type minMatchLengthBP: int
-    :param minSignalAtMaxima: Secondary significance threshold coupled with `alpha`. Require the *signal value*
-        at relative maxima in the response sequence to be greater than this threshold. Comparisons are made in log-scale.
-        If a `float` value is provided, the minimum signal value must be greater than this (absolute) value. *Set to a
-        negative value to disable the threshold*.
-        If a `str` value is provided, looks for 'q:quantileValue', e.g., 'q:0.75'. The
+        If set to a value less than 1, the minimum length is determined via :func:`consenrich.matching.autoMinLengthIntervals`.
+        If set to `None`, defaults to 250 bp.
+    :type minMatchLengthBP: Optional[int]
+    :param minSignalAtMaxima: Secondary significance threshold coupled with `alpha`. Requires the *signal value*
+        at relative maxima in the response sequence to be greater than this threshold. Comparisons are made in log-scale
+        to temper genome-wide dynamic range. If a `float` value is provided, the minimum signal value must be greater
+        than this (absolute) value. *Set to a negative value to disable the threshold*.
+        If a `str` value is provided, looks for 'q:quantileValue', e.g., 'q:0.90'. The
         threshold is then set to the corresponding quantile of the non-zero signal estimates.
-        Defaults to str value 'q:0.75' --- the 90th percentile of signal values.
+        Defaults to str value 'q:0.75' --- the 75th percentile of signal values.
     :type minSignalAtMaxima: Optional[str | float]
     :param useScalingFunction: If True, use (only) the scaling function to build the matching template.
         If False, use (only) the wavelet function.
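
The body of `matchWavelet` (next hunk) scores each template by FFT-based cross-correlation against the track. A standalone sketch of that primitive, using PyWavelets' cascade approximation and synthetic stand-in data:

import numpy as np
import pywt
from scipy import signal

rng = np.random.default_rng(0)
values = rng.gamma(2.0, 1.0, size=4096)  # stand-in for a Consenrich track

# approximate the db2 scaling function with 2 cascade iterations,
# then L2-normalize it to form the matching template
scalingFunc, waveletFunc, _ = pywt.Wavelet("db2").wavefun(level=2)
template = np.asarray(scalingFunc, dtype=np.float64)
template /= np.linalg.norm(template)

# reversing the template turns convolution into cross-correlation
response = signal.fftconvolve(values, template[::-1], mode="same")
print(response.shape == values.shape)  # True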
@@ -247,342 +313,372 @@ def matchWavelet(
     :type excludeRegionsBedFile: Optional[str]
 
     :seealso: :class:`consenrich.core.matchingParams`, :func:`cconsenrich.csampleBlockStats`, :ref:`matching`
+    :return: A pandas DataFrame with detected matches
+    :rtype: pd.DataFrame
     """
 
+    rng = np.random.default_rng(int(randSeed))
     if len(intervals) < 5:
         raise ValueError("`intervals` must be at least length 5")
-    if len(values) != len(intervals):
-        raise ValueError("`values` must have the same length as `intervals`")
-    intervalLengthBP = intervals[1] - intervals[0]
-    if not np.all(np.abs(np.diff(intervals)) == intervalLengthBP):
-        # FFR: don't change this exception message without updating tests
-        # --'spaced' is matched in tests
-        raise ValueError("`intervals` must be evenly spaced.")
 
-    randSeed_: int = int(randSeed)
-    cols = [
-        "chromosome",
-        "start",
-        "end",
-        "name",
-        "score",
-        "strand",
-        "signal",
-        "pValue",
-        "qValue",
-        "pointSource",
-    ]
-    matchDF = pd.DataFrame(columns=cols)
-    minMatchLengthBPCopy: Optional[int] = minMatchLengthBP
-    cascadeLevels = sorted(list(set(cascadeLevels)))
-    if weights is not None and len(weights) == len(values):
-        values = values * weights
-    asinhValues = np.asinh(values, dtype=np.float32)
-    asinhNonZeroValues = asinhValues[asinhValues > 0]
-    iters = max(iters, 1000)
-    defQuantile: float = 0.75
-    for l_, cascadeLevel in enumerate(cascadeLevels):
-        for t_, templateName in enumerate(templateNames):
-            try:
-                templateName = str(templateName)
-                cascadeLevel = int(cascadeLevel)
-            except ValueError:
-                logger.info(
-                    f"Skipping invalid templateName or cascadeLevel: {templateName}, {cascadeLevel}"
-                )
-                continue
-            if templateName not in pw.wavelist(kind="discrete"):
-                logger.info(
-                    f"\nSkipping unknown wavelet template: {templateName}\nAvailable templates: {pw.wavelist(kind='discrete')}"
-                )
-                continue
-
-            wav = pw.Wavelet(templateName)
-            scalingFunc, waveletFunc, x = wav.wavefun(level=cascadeLevel)
-            template = np.array(waveletFunc, dtype=np.float64) / np.linalg.norm(
-                waveletFunc
-            )
-
-            if useScalingFunction:
-                template = np.array(
-                    scalingFunc, dtype=np.float64
-                ) / np.linalg.norm(scalingFunc)
+    if len(values) != len(intervals):
+        raise ValueError(
+            "`values` must have the same length as `intervals`"
+        )
 
-            logger.info(
-                f"Matching: template: {templateName}, cascade level: {cascadeLevel}, template length: {len(template)}, scaling: {useScalingFunction}, wavelet: {not useScalingFunction}"
-            )
+    if len(templateNames) != len(cascadeLevels):
+        raise ValueError(
+            "\n\t`templateNames` and `cascadeLevels` must have the same length."
+            "\n\tSet products are not supported, i.e., each template needs an explicitly defined cascade level."
+            "\n\te.g., for `templateNames = [haar, db2]`, use `cascadeLevels = [2, 2]`, not `[2]`.\n"
+        )
 
-            responseSequence: npt.NDArray[np.float64] = signal.fftconvolve(
-                values, template[::-1], mode="same"
-            )
+    intervalLengthBp = intervals[1] - intervals[0]
 
-            minMatchLengthBP = minMatchLengthBPCopy
-            if minMatchLengthBP is None or minMatchLengthBP < 1:
-                minMatchLengthBP = len(template) * intervalLengthBP
-                if minMatchLengthBP % intervalLengthBP != 0:
-                    minMatchLengthBP += intervalLengthBP - (
-                        minMatchLengthBP % intervalLengthBP
-                    )
+    if minMatchLengthBP is not None and minMatchLengthBP < 1:
+        minMatchLengthBP = (
+            autoMinLengthIntervals(values) * int(intervalLengthBp)
+        )
+    elif minMatchLengthBP is None:
+        minMatchLengthBP = 250
 
-            relativeMaximaWindow = int(
-                ((minMatchLengthBP / intervalLengthBP) / 2) + 1
-            )
-            relativeMaximaWindow = max(relativeMaximaWindow, 1)
-
-            excludeMask = np.zeros(len(intervals), dtype=np.uint8)
-            if excludeRegionsBedFile is not None:
-                excludeMask = core.getBedMask(
-                    chromosome,
-                    excludeRegionsBedFile,
-                    intervals,
-                )
+    logger.info(
+        f"\n\tUsing minMatchLengthBP: {minMatchLengthBP}"
+    )
 
-            logger.info(
-                f"\nSampling {iters} block maxima for template {templateName} at cascade level {cascadeLevel} with (expected) relative maxima window size {relativeMaximaWindow}.\n"
-            )
-            blockMaxima = np.array(
-                cconsenrich.csampleBlockStats(
-                    intervals.astype(np.uint32),
-                    responseSequence,
-                    relativeMaximaWindow,
-                    iters * 2,
-                    randSeed_,
-                    excludeMask.astype(np.uint8),
-                ),
-                dtype=float,
+    if not np.all(np.abs(np.diff(intervals)) == intervalLengthBp):
+        raise ValueError("`intervals` must be evenly spaced.")
+
+    if weights is not None:
+        if len(weights) != len(values):
+            logger.warning(
+                f"`weights` length {len(weights)} does not match `values` length {len(values)}. Ignoring..."
             )
-            blockMaximaCheck = blockMaxima.copy()[iters:]
-            blockMaxima = blockMaxima[:iters]
-            blockMaxima = blockMaxima[
-                (blockMaxima > np.quantile(blockMaxima, 0.005))
-                & (blockMaxima < np.quantile(blockMaxima, 0.995))
-            ]
-
-            ecdfBlockMaximaSF = stats.ecdf(blockMaxima).sf
-
-            responseThreshold = float(1e6)
-            arsinhSignalThreshold = float(1e6)
-            try:
-                # we use 'interpolated_inverted_cdf' in a few spots
-                # --- making sure it's supported here, at its first use
-                responseThreshold = np.quantile(
-                    blockMaxima, 1 - alpha, method="interpolated_inverted_cdf"
-                )
-            except (TypeError, ValueError, KeyError) as err_:
-                logger.warning(
-                    f"\nError computing response threshold with alpha={alpha}:\n{err_}\n"
-                    f"\nIs `blockMaxima` empty?"
-                    f"\nIs NumPy older than 1.22.0 (~May 2022~)?"
-                    f"\nIs `alpha` in (0,1)?\n"
-                )
-                raise
-
-            # parse minSignalAtMaxima, set arsinhSignalThreshold
-            if minSignalAtMaxima is None:
-                # -----we got a `None`-----
-                arsinhSignalThreshold = -float(1e6)
-            elif isinstance(minSignalAtMaxima, str):
-                # -----we got a str-----
-                if minSignalAtMaxima.startswith("q:"):
-                    # case: expected 'q:quantileValue' format
-                    qVal = float(minSignalAtMaxima.split("q:")[-1])
-                    if qVal < 0 or qVal > 1:
-                        raise ValueError(f"Quantile {qVal} is out of range")
-                    arsinhSignalThreshold = float(
-                        np.quantile(
-                            asinhNonZeroValues,
-                            qVal,
-                            method="interpolated_inverted_cdf",
-                        )
+        else:
+            values = values * weights
+
+    asinhValues = np.asinh(values, dtype=np.float32)
+    asinhNonZeroValues = asinhValues[asinhValues > 0]
+    iters = max(int(iters), 1000)
+    defQuantile = 0.75
+    chromMin = int(intervals[0])
+    chromMax = int(intervals[-1])
+    chromMid = chromMin + (chromMax - chromMin) // 2  # for split
+    halfLeftMask = intervals < chromMid
+    halfRightMask = ~halfLeftMask
+    excludeMaskGlobal = np.zeros(len(intervals), dtype=np.uint8)
+    if excludeRegionsBedFile is not None:
+        excludeMaskGlobal = core.getBedMask(
+            chromosome, excludeRegionsBedFile, intervals
+        ).astype(np.uint8)
+    allRows = []
+
+    def bhFdr(p: np.ndarray) -> np.ndarray:
+        m = len(p)
+        order = np.argsort(p, kind="mergesort")
+        ranked = np.arange(1, m + 1, dtype=float)
+        q = (p[order] * m) / ranked
+        q = np.minimum.accumulate(q[::-1])[::-1]
+        out = np.empty_like(q)
+        out[order] = q
+        return np.clip(out, 0.0, 1.0)
+
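
The nested `bhFdr` above is a standard Benjamini-Hochberg step-up adjustment; the resulting q-values later gate matches via `qVals <= alpha`. A quick self-contained check of the same logic:

import numpy as np

def bhFdr(p: np.ndarray) -> np.ndarray:
    m = len(p)
    order = np.argsort(p, kind="mergesort")
    # q_(i) = p_(i) * m / rank, then enforce monotonicity from the largest rank down
    q = (p[order] * m) / np.arange(1, m + 1, dtype=float)
    q = np.minimum.accumulate(q[::-1])[::-1]
    out = np.empty_like(q)
    out[order] = q
    return np.clip(out, 0.0, 1.0)

p = np.array([0.001, 0.008, 0.039, 0.041, 0.57])
print(bhFdr(p))  # [0.005  0.02  0.05125  0.05125  0.57]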
+    def parseMinSignalThreshold(val):
+        if val is None:
+            return -1e6
+        if isinstance(val, str):
+            if val.startswith("q:"):
+                qVal = float(val.split("q:")[-1])
+                if not (0 <= qVal <= 1):
+                    raise ValueError(
+                        f"Quantile {qVal} is out of range"
                     )
-
-                elif castableToFloat(minSignalAtMaxima):
-                    # case: numeric in str form (possible due to CLI)
-                    if float(minSignalAtMaxima) < 0.0:
-                        # effectively disables threshold
-                        arsinhSignalThreshold = -float(1e6)
-                    else:
-                        # use supplied value
-                        arsinhSignalThreshold = np.asinh(
-                            float(minSignalAtMaxima)
-                        )
-                else:
-                    # case: not in known format, not castable to a float, use defaults
-                    logger.info(
-                        f"Couldn't parse `minSignalAtMaxima` value: {minSignalAtMaxima}, using default"
+                return float(
+                    np.quantile(
+                        asinhNonZeroValues,
+                        qVal,
+                        method="interpolated_inverted_cdf",
                     )
-                    arsinhSignalThreshold = float(
-                        np.quantile(
-                            asinhNonZeroValues,
-                            defQuantile,
-                            method="interpolated_inverted_cdf",
-                        )
+                )
+            elif castableToFloat(val):
+                v = float(val)
+                return -1e6 if v < 0 else float(np.asinh(v))
+            else:
+                return float(
+                    np.quantile(
+                        asinhNonZeroValues,
+                        defQuantile,
+                        method="interpolated_inverted_cdf",
                     )
-            # -----
-
-            elif isinstance(minSignalAtMaxima, (float, int)):
-                # -----we got an int or float-----
-                if float(minSignalAtMaxima) < 0.0:
-                    # effectively disables threshold
-                    arsinhSignalThreshold = -float(1e6)
-                else:
-                    # use supplied value
-                    arsinhSignalThreshold = np.asinh(float(minSignalAtMaxima))
-            # -----
-
-
-            relativeMaximaIndices = signal.argrelmax(
-                responseSequence, order=relativeMaximaWindow
-            )[0]
+                )
+        if isinstance(val, (float, int)):
+            v = float(val)
+            return -1e6 if v < 0 else float(np.asinh(v))
+        return float(
+            np.quantile(
+                asinhNonZeroValues,
+                defQuantile,
+                method="interpolated_inverted_cdf",
+            )
+        )
 
-            relativeMaximaIndices = relativeMaximaIndices[
-                (responseSequence[relativeMaximaIndices] > responseThreshold)
-                & (asinhValues[relativeMaximaIndices] > arsinhSignalThreshold)
-            ]
+    def relativeMaxima(
+        resp: np.ndarray, orderBins: int
+    ) -> np.ndarray:
+        return signal.argrelmax(resp, order=max(int(orderBins), 1))[0]
+
+    def sampleBlockMaxima(
+        resp: np.ndarray,
+        halfMask: np.ndarray,
+        relWindowBins: int,
+        nsamp: int,
+        seed: int,
+    ):
+        exMask = excludeMaskGlobal.astype(np.uint8).copy()
+        exMask |= (~halfMask).astype(np.uint8)
+        vals = np.array(
+            cconsenrich.csampleBlockStats(
+                intervals.astype(np.uint32),
+                resp,
+                int(relWindowBins),
+                int(nsamp),
+                int(seed),
+                exMask.astype(np.uint8),
+            ),
+            dtype=float,
+        )
+        if len(vals) == 0:
+            return vals
+        low = np.quantile(vals, 0.001)
+        high = np.quantile(vals, 0.999)
+        return vals[(vals > low) & (vals < high)]
+
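
In the template loop that follows, each chromosome half serves as the empirical null for candidates on the other half (the "R"/"L" name tags), with a pooled fallback when fewer than 25 trimmed block maxima survive. A sketch of the scoring step, using a synthetic Gumbel null since block maxima are extreme values (assumes SciPy >= 1.11 for `stats.ecdf`):

import numpy as np
from scipy import stats

rng = np.random.default_rng(1)
blockMaxima = rng.gumbel(loc=2.0, scale=0.5, size=2000)  # null block maxima
candidates = np.array([2.5, 3.5, 4.5])  # response values at relative maxima

# survival function of the empirical null gives one-sided p-values
sf = stats.ecdf(blockMaxima).sf
pEmp = np.clip(sf.evaluate(candidates), 1e-10, 1.0)
print(pEmp)  # decreasing p-values for stronger responses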
+    for templateName, cascadeLevel in zip(
+        templateNames, cascadeLevels
+    ):
+        if templateName not in pw.wavelist(kind="discrete"):
+            logger.warning(
+                f"Skipping unknown wavelet template: {templateName}"
+            )
+            continue
 
-            if len(relativeMaximaIndices) == 0:
-                logger.info(
-                    f"no matches were detected using for template {templateName} at cascade level {cascadeLevel}...skipping matching"
-                )
-                continue
+        wav = pw.Wavelet(str(templateName))
+        scalingFunc, waveletFunc, _ = wav.wavefun(
+            level=int(cascadeLevel)
+        )
+        template = np.array(
+            scalingFunc if useScalingFunction else waveletFunc,
+            dtype=np.float64,
+        )
+        template /= np.linalg.norm(template)
 
-            if maxNumMatches is not None:
-                if len(relativeMaximaIndices) > maxNumMatches:
-                    # take the greatest maxNumMatches (by 'signal')
-                    relativeMaximaIndices = relativeMaximaIndices[
-                        np.argsort(asinhValues[relativeMaximaIndices])[
-                            -maxNumMatches:
-                        ]
-                    ]
+        logger.info(
+            f"\n\tMatching template: {templateName}"
+            f"\n\tcascade level: {cascadeLevel}"
+            f"\n\ttemplate length: {len(template)}"
+        )
 
-            ecdfSFCheckVals: npt.NDArray[np.float64] = (
-                ecdfBlockMaximaSF.evaluate(blockMaximaCheck)
+        # efficient FFT-based cross-correlation
+        # (OA may be better for smaller templates, TODO add a check)
+        response = signal.fftconvolve(
+            values, template[::-1], mode="same"
+        )
+        thisMinMatchBp = minMatchLengthBP
+        if thisMinMatchBp is None or thisMinMatchBp < 1:
+            thisMinMatchBp = len(template) * intervalLengthBp
+            if thisMinMatchBp % intervalLengthBp != 0:
+                thisMinMatchBp += intervalLengthBp - (
+                    thisMinMatchBp % intervalLengthBp
                 )
-            testKS, _ = stats.kstest(
-                ecdfSFCheckVals,
-                stats.uniform.cdf,
-                alternative="two-sided",
+        relWindowBins = int(
+            ((thisMinMatchBp / intervalLengthBp) / 2) + 1
+        )
+        relWindowBins = max(relWindowBins, 1)
+        asinhThreshold = parseMinSignalThreshold(minSignalAtMaxima)
+        for nullMask, testMask, tag in [
+            (halfLeftMask, halfRightMask, "R"),
+            (halfRightMask, halfLeftMask, "L"),
+        ]:
+            blockMaxima = sampleBlockMaxima(
+                response,
+                nullMask,
+                relWindowBins,
+                nsamp=max(iters, 1000),
+                seed=rng.integers(1, 10_000),
             )
-
-            logger.info(
-                f"\n\tDetected {len(relativeMaximaIndices)} matches (alpha={alpha}, useScalingFunction={useScalingFunction}): {templateName}: level={cascadeLevel}.\n"
-                f"\tResponse threshold: {responseThreshold:.3f}, arsinh(Signal Threshold): {arsinhSignalThreshold:.3f}\n"
-                f"\t~KS_Statistic~ [ePVals, uniformCDF]: {testKS:.4f}\n"
-                f"\n\n{textNullCDF(ecdfSFCheckVals)}\n\n"  # lil text-plot histogram of approx. null CDF
+            if len(blockMaxima) < 25:
+                pooledMask = ~excludeMaskGlobal.astype(bool)
+                blockMaxima = sampleBlockMaxima(
+                    response,
+                    pooledMask,
+                    relWindowBins,
+                    nsamp=max(iters, 1000),
+                    seed=rng.integers(1, 10_000),
+                )
+            ecdfSf = stats.ecdf(blockMaxima).sf
+            candidateIdx = relativeMaxima(response, relWindowBins)
+
+            candidateMask = (
+                (candidateIdx >= relWindowBins)
+                & (candidateIdx < len(response) - relWindowBins)
+                & (testMask[candidateIdx])
+                & (excludeMaskGlobal[candidateIdx] == 0)
+                & (asinhValues[candidateIdx] > asinhThreshold)
             )
 
-            # starts
-            startsIdx = np.maximum(
-                relativeMaximaIndices - relativeMaximaWindow, 0
+            candidateIdx = candidateIdx[candidateMask]
+            if len(candidateIdx) == 0:
+                continue
+            if (
+                maxNumMatches is not None
+                and len(candidateIdx) > maxNumMatches
+            ):
+                candidateIdx = candidateIdx[
+                    np.argsort(asinhValues[candidateIdx])[
+                        -maxNumMatches:
+                    ]
+                ]
+            pEmp = np.clip(
+                ecdfSf.evaluate(response[candidateIdx]),
+                1.0e-10,
+                1.0,
             )
-            # ends
+            startsIdx = np.maximum(candidateIdx - relWindowBins, 0)
             endsIdx = np.minimum(
-                len(values) - 1, relativeMaximaIndices + relativeMaximaWindow
+                len(values) - 1, candidateIdx + relWindowBins
             )
-            # point source
             pointSourcesIdx = []
-            for start_, end_ in zip(startsIdx, endsIdx):
+            for s, e in zip(startsIdx, endsIdx):
                 pointSourcesIdx.append(
-                    np.argmax(values[start_ : end_ + 1]) + start_
+                    np.argmax(values[s : e + 1]) + s
                 )
             pointSourcesIdx = np.array(pointSourcesIdx)
             starts = intervals[startsIdx]
             ends = intervals[endsIdx]
-            pointSources = (intervals[pointSourcesIdx]) + max(
-                1, intervalLengthBP // 2
+            pointSourcesAbs = (intervals[pointSourcesIdx]) + max(
+                1, intervalLengthBp // 2
             )
-            if (
-                recenterAtPointSource
-            ):  # recenter at point source (signal maximum)
-                starts = pointSources - (
-                    relativeMaximaWindow * intervalLengthBP
+            if recenterAtPointSource:
+                starts = pointSourcesAbs - (
+                    relWindowBins * intervalLengthBp
                 )
-                ends = pointSources + (relativeMaximaWindow * intervalLengthBP)
-                pointSources = (intervals[pointSourcesIdx] - starts) + max(
-                    1, intervalLengthBP // 2
-                )
-            # (ucsc browser) score [0,1000]
-            sqScores = (1 + responseSequence[relativeMaximaIndices]) ** 2
-            minResponse = np.min(sqScores)
-            maxResponse = np.max(sqScores)
-            rangeResponse = max(maxResponse - minResponse, 1.0)
-            scores = (
-                250 + 750 * (sqScores - minResponse) / rangeResponse
-            ).astype(int)
-            # feature name
-            names = [
-                f"{templateName}_{cascadeLevel}_{i}"
-                for i in relativeMaximaIndices
-            ]
-            # strand
-            strands = ["." for _ in range(len(scores))]
-            # p-values in -log10 scale per convention
-            pValues = -np.log10(
-                np.clip(
-                    ecdfBlockMaximaSF.evaluate(
-                        responseSequence[relativeMaximaIndices]
-                    ),
-                    1e-10,
-                    1.0,
+                ends = pointSourcesAbs + (
+                    relWindowBins * intervalLengthBp
                 )
+            pointSourcesRel = (
+                intervals[pointSourcesIdx] - starts
+            ) + max(1, intervalLengthBp // 2)
+            sqScores = (1 + response[candidateIdx]) ** 2
+            minR, maxR = (
+                float(np.min(sqScores)),
+                float(np.max(sqScores)),
             )
-            # q-values (ignored)
-            qValues = np.array(np.ones_like(pValues) * -1.0)
-
-            tempDF = pd.DataFrame(
-                {
-                    "chromosome": [chromosome] * len(relativeMaximaIndices),
-                    "start": starts.astype(int),
-                    "end": ends.astype(int),
-                    "name": names,
-                    "score": scores,
-                    "strand": strands,
-                    "signal": responseSequence[relativeMaximaIndices],
-                    "pValue": pValues,
-                    "qValue": qValues,
-                    "pointSource": pointSources.astype(int),
-                }
-            )
+            rangeR = max(maxR - minR, 1.0)
+            scores = (250 + 750 * (sqScores - minR) / rangeR).astype(int)
+            for i, idxVal in enumerate(candidateIdx):
+                allRows.append(
+                    {
+                        "chromosome": chromosome,
+                        "start": int(starts[i]),
+                        "end": int(ends[i]),
+                        "name": f"{templateName}_{cascadeLevel}_{idxVal}_{tag}",
+                        "score": int(scores[i]),
+                        "strand": ".",
+                        "signal": float(response[idxVal]),
+                        "p_raw": float(pEmp[i]),
+                        "pointSource": int(pointSourcesRel[i]),
+                    }
+                )
 
-            if matchDF.empty:
-                matchDF = tempDF
-            else:
-                matchDF = pd.concat([matchDF, tempDF], ignore_index=True)
-            randSeed_ += 1
+    if not allRows:
+        logger.warning(
+            "No matches detected, returning empty DataFrame."
+        )
 
-    if matchDF.empty:
-        logger.info("No matches detected, returning empty DataFrame.")
-        return matchDF
-    matchDF.sort_values(by=["chromosome", "start", "end"], inplace=True)
-    matchDF.reset_index(drop=True, inplace=True)
-    return matchDF
+        return pd.DataFrame(
+            columns=[
+                "chromosome",
+                "start",
+                "end",
+                "name",
+                "score",
+                "strand",
+                "signal",
+                "pValue",
+                "qValue",
+                "pointSource",
+            ]
+        )
 
+    df = pd.DataFrame(allRows)
+    qVals = bhFdr(df["p_raw"].values.astype(float))
+    df["pValue"] = -np.log10(
+        np.clip(df["p_raw"].values, 1.0e-10, 1.0)
+    )
+    df["qValue"] = -np.log10(np.clip(qVals, 1.0e-10, 1.0))
+    df.drop(columns=["p_raw"], inplace=True)
+    df = df[qVals <= alpha].copy()
+    df["chromosome"] = df["chromosome"].astype(str)
+    df.sort_values(by=["chromosome", "start", "end"], inplace=True)
+    df.reset_index(drop=True, inplace=True)
+    df = df[
+        [
+            "chromosome",
+            "start",
+            "end",
+            "name",
+            "score",
+            "strand",
+            "signal",
+            "pValue",
+            "qValue",
+            "pointSource",
+        ]
+    ]
+    return df
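
The returned DataFrame follows narrowPeak conventions: `pValue` and `qValue` hold -log10-scaled values. A sketch of reading a written file back and filtering on raw q-values; the file name here is only a placeholder:

import pandas as pd

cols = ["chromosome", "start", "end", "name", "score",
        "strand", "signal", "pValue", "qValue", "pointSource"]
df = pd.read_csv("example_matches.narrowPeak", sep="\t", header=None, names=cols)
qRaw = 10.0 ** (-df["qValue"])  # undo the -log10 scaling
print(df.loc[qRaw <= 0.05, ["chromosome", "start", "end", "name"]].head())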
 
-def mergeMatches(filePath: str, mergeGapBP: int = 50):
-    r"""Merge overlapping or nearby structured peaks (matches) in a narrowPeak file.
 
-    Where an overlap occurs within `mergeGapBP` base pairs, the feature with the greatest signal defines the new summit/pointSource
+def mergeMatches(
+    filePath: str,
+    mergeGapBP: Optional[int],
+) -> Optional[str]:
+    r"""Merge overlapping or nearby structured peaks ('matches') in a narrowPeak file.
+
+    The harmonic mean of p-values and q-values is computed for each merged region within `mergeGapBP` base pairs.
+    The fourth column (name) of each merged peak contains information about the number of features that were merged
+    and the range of q-values among them.
+
+    Expects a `narrowPeak <https://genome.ucsc.edu/FAQ/FAQformat.html#format12>`_ file as input (all numeric columns, '.' for strand if unknown).
 
     :param filePath: narrowPeak file containing matches detected with :func:`consenrich.matching.matchWavelet`
     :type filePath: str
-    :param mergeGapBP: Maximum gap size (in base pairs) to consider for merging
-    :type mergeGapBP: int
+    :param mergeGapBP: Maximum gap size (in base pairs) to consider for merging. Defaults to 75 bp if `None` or less than 1.
+    :type mergeGapBP: Optional[int]
 
-    :seealso: :class:`consenrich.core.matchingParams`
+    :seealso: :ref:`matching`, :class:`consenrich.core.matchingParams`
     """
+
+    if mergeGapBP is None or mergeGapBP < 1:
+        mergeGapBP = 75
+
+    MAX_NEGLOGP = 10.0
+    MIN_NEGLOGP = 1.0e-10
+
     if not os.path.isfile(filePath):
-        logger.info(f"Couldn't access {filePath}...skipping merge")
+        logger.warning(f"Couldn't access {filePath}...skipping merge")
         return None
     bed = None
     try:
         bed = BedTool(filePath)
     except Exception as ex:
-        logger.info(
+        logger.warning(
             f"Couldn't create BedTool for {filePath}:\n{ex}\n\nskipping merge..."
         )
         return None
     if bed is None:
-        logger.info(f"Couldn't create BedTool for {filePath}...skipping merge")
+        logger.warning(
+            f"Couldn't create BedTool for {filePath}...skipping merge"
+        )
         return None
 
     bed = bed.sort()
@@ -595,41 +691,86 @@ def mergeMatches(filePath: str, mergeGapBP: int = 50):
         end = int(fields[2])
         score = float(fields[4])
         signal = float(fields[6])
-        pval = float(fields[7])
-        qval = float(fields[8])
+        pLog10 = float(fields[7])
+        qLog10 = float(fields[8])
         peak = int(fields[9])
-        clId = fields[-1]
-        if clId not in groups:
-            groups[clId] = {
+        clusterID = fields[-1]
+        if clusterID not in groups:
+            groups[clusterID] = {
                 "chrom": chrom,
                 "sMin": start,
                 "eMax": end,
                 "scSum": 0.0,
                 "sigSum": 0.0,
-                "pSum": 0.0,
-                "qSum": 0.0,
                 "n": 0,
                 "maxS": float("-inf"),
                 "peakAbs": -1,
+                "pMax": float("-inf"),
+                "pTail": 0.0,
+                "pHasInf": False,
+                "qMax": float("-inf"),
+                "qMin": float("inf"),
+                "qTail": 0.0,
+                "qHasInf": False,
             }
-        g = groups[clId]
+        g = groups[clusterID]
         if start < g["sMin"]:
             g["sMin"] = start
         if end > g["eMax"]:
             g["eMax"] = end
         g["scSum"] += score
         g["sigSum"] += signal
-        g["pSum"] += pval
-        g["qSum"] += qval
         g["n"] += 1
-        # scan for largest signal, FFR: consider using the p-val in the future
+
+        if math.isinf(pLog10) or pLog10 >= MAX_NEGLOGP:
+            g["pHasInf"] = True
+        else:
+            if pLog10 > g["pMax"]:
+                if g["pMax"] == float("-inf"):
+                    g["pTail"] = 1.0
+                else:
+                    g["pTail"] = (
+                        g["pTail"] * (10 ** (g["pMax"] - pLog10))
+                        + 1.0
+                    )
+                g["pMax"] = pLog10
+            else:
+                g["pTail"] += 10 ** (pLog10 - g["pMax"])
+
+        if (
+            math.isinf(qLog10)
+            or qLog10 >= MAX_NEGLOGP
+            or qLog10 <= MIN_NEGLOGP
+        ):
+            g["qHasInf"] = True
+        else:
+            if qLog10 < g["qMin"]:
+                if qLog10 < MIN_NEGLOGP:
+                    g["qMin"] = MIN_NEGLOGP
+                else:
+                    g["qMin"] = qLog10
+
+            if qLog10 > g["qMax"]:
+                if g["qMax"] == float("-inf"):
+                    g["qTail"] = 1.0
+                else:
+                    g["qTail"] = (
+                        g["qTail"] * (10 ** (g["qMax"] - qLog10))
+                        + 1.0
+                    )
+                g["qMax"] = qLog10
+            else:
+                g["qTail"] += 10 ** (qLog10 - g["qMax"])
+
         if signal > g["maxS"]:
             g["maxS"] = signal
             g["peakAbs"] = start + peak if peak >= 0 else -1
+
     items = []
-    for clId, g in groups.items():
+    for clusterID, g in groups.items():
         items.append((g["chrom"], g["sMin"], g["eMax"], g))
     items.sort(key=lambda x: (str(x[0]), x[1], x[2]))
+
     outPath = f"{filePath.replace('.narrowPeak', '')}.mergedMatches.narrowPeak"
     lines = []
     i = 0
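
The `pMax`/`pTail` (and `qMax`/`qTail`) accumulators above track the harmonic-mean p-value entirely in -log10 space: with L_i = -log10(p_i), the merged value is L_max + log10(sum_i 10^(L_i - L_max)) - log10(n), so no 10^L term is ever materialized at full magnitude. A standalone check of the identity against the direct harmonic mean:

import math

neglogs = [3.2, 1.1, 2.7]  # -log10 p-values of features being merged
n = len(neglogs)

# streaming form, mirroring the pMax/pTail updates in the loop above
pMax, pTail = float("-inf"), 0.0
for L in neglogs:
    if L > pMax:
        pTail = 1.0 if pMax == float("-inf") else pTail * 10 ** (pMax - L) + 1.0
        pMax = L
    else:
        pTail += 10 ** (L - pMax)
streamed = -math.log10(n) + pMax + math.log10(pTail)

# direct harmonic mean of the raw p-values, for comparison
ps = [10.0 ** (-L) for L in neglogs]
direct = -math.log10(n / sum(1.0 / p for p in ps))
print(abs(streamed - direct) < 1e-12)  # True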
@@ -642,69 +783,68 @@ def mergeMatches(filePath: str, mergeGapBP: int = 50):
         avgScore = 1000
         scoreInt = int(round(avgScore))
         sigAvg = g["sigSum"] / g["n"]
-        pAvg = g["pSum"] / g["n"]
-        qAvg = g["qSum"] / g["n"]
-        pointSource = g["peakAbs"] - sMin if g["peakAbs"] >= 0 else -1
-        name = f"mergedPeak{i}"
-        lines.append(
-            f"{chrom}\t{int(sMin)}\t{int(eMax)}\t{name}\t{scoreInt}\t.\t{sigAvg:.3f}\t{pAvg:.3f}\t{qAvg:.3f}\t{int(pointSource)}"
-        )
-    with open(outPath, "w") as outF:
-        outF.write("\n".join(lines) + ("\n" if lines else ""))
-    logger.info(f"Merged matches written to {outPath}")
-    return outPath
 
+        if g["pHasInf"]:
+            pHMLog10 = MAX_NEGLOGP
+        else:
+            if (
+                g["pMax"] == float("-inf")
+                or not (g["pTail"] > 0.0)
+                or math.isnan(g["pTail"])
+            ):
+                pHMLog10 = MIN_NEGLOGP
+            else:
+                pHMLog10 = -math.log10(g["n"]) + (
+                    g["pMax"] + math.log10(g["pTail"])
+                )
+                pHMLog10 = max(
+                    MIN_NEGLOGP, min(pHMLog10, MAX_NEGLOGP)
+                )
 
-def textNullCDF(
-    nullBlockMaximaSFVals: npt.NDArray[np.float64],
-    binCount: int = 20,
-    barWidth: int = 50,
-    barChar="\u25a2",
-    normalize: bool = False,
-) -> str:
-    r"""Plot a histogram of the distribution 1 - ECDF(nullBlockMaxima)
+        if g["qHasInf"]:
+            qHMLog10 = MAX_NEGLOGP
+        else:
+            if (
+                g["qMax"] == float("-inf")
+                or not (g["qTail"] > 0.0)
+                or math.isnan(g["qTail"])
+            ):
+                qHMLog10 = MIN_NEGLOGP
+            else:
+                qHMLog10 = -math.log10(g["n"]) + (
+                    g["qMax"] + math.log10(g["qTail"])
+                )
+                qHMLog10 = max(
+                    MIN_NEGLOGP, min(qHMLog10, MAX_NEGLOGP)
+                )
 
-    Called by :func:`consenrich.matching.matchWavelet`. Ideally resembles
-    a uniform(0,1) distribution.
+        pointSource = (
+            g["peakAbs"] - sMin
+            if g["peakAbs"] >= 0
+            else (eMax - sMin) // 2
+        )
 
-    :seealso: :func:`consenrich.matching.matchWavelet`, :ref:`cconsenrich.csampleBlockStats`
-    """
-    valueLower, valueUpper = (
-        min(nullBlockMaximaSFVals),
-        max(nullBlockMaximaSFVals),
-    )
-    binCount = max(1, int(binCount))
-    binStep = (valueUpper - valueLower) / binCount
-    binEdges = [
-        valueLower + indexValue * binStep for indexValue in range(binCount)
-    ]
-    binEdges.append(valueUpper)
-    binCounts = [0] * binCount
-    for numericValue in nullBlockMaximaSFVals:
-        binIndex = int((numericValue - valueLower) / binStep)
-        if binIndex == binCount:
-            binIndex -= 1
-        binCounts[binIndex] += 1
-    valueSeries = (
-        [countValue / len(nullBlockMaximaSFVals) for countValue in binCounts]
-        if normalize
-        else binCounts[:]
-    )
-    valueMaximum = max(valueSeries) if valueSeries else 0
-    widthScale = (barWidth / valueMaximum) if valueMaximum > 0 else 0
-    edgeFormat = f"{{:.{2}f}}"
-    rangeLabels = [
-        f"[{edgeFormat.format(binEdges[indexValue])},{edgeFormat.format(binEdges[indexValue + 1])})"
-        for indexValue in range(binCount)
-    ]
-    labelWidth = max(len(textValue) for textValue in rangeLabels)
-    lines = ['Histogram: "1 - ECDF(nullBlockMaxima)"']
-    for rangeLabel, seriesValue, countValue in zip(
-        rangeLabels, valueSeries, binCounts
-    ):
-        barString = barChar * int(round(seriesValue * widthScale))
-        trailingText = f"({countValue}/{len(nullBlockMaximaSFVals)})\t\t"
+        qMinLog10 = g["qMin"]
+        qMaxLog10 = g["qMax"]
+        if math.isfinite(qMinLog10) and qMinLog10 < MIN_NEGLOGP:
+            qMinLog10 = MIN_NEGLOGP
+        if math.isfinite(qMaxLog10) and qMaxLog10 > MAX_NEGLOGP:
+            qMaxLog10 = MAX_NEGLOGP
+        elif (
+            not math.isfinite(qMaxLog10)
+            or not math.isfinite(qMinLog10)
+        ) or (qMaxLog10 < MIN_NEGLOGP):
+            qMinLog10 = 0.0
+            qMaxLog10 = 0.0
+
+        # informative+parsable name
+        # e.g., regex: ^consenrichPeak\|i=(?P<i>\d+)\|gap=(?P<gap>\d+)bp\|ct=(?P<ct>\d+)\|qRange=(?P<qmin>\d+\.\d{3})_(?P<qmax>\d+\.\d{3})$
+        name = f"consenrichPeak|i={i}|gap={mergeGapBP}bp|ct={g['n']}|qRange={qMinLog10:.3f}_{qMaxLog10:.3f}"
         lines.append(
-            f"{rangeLabel.rjust(labelWidth)} | {barString}{trailingText.ljust(10)}"
+            f"{chrom}\t{int(sMin)}\t{int(eMax)}\t{name}\t{scoreInt}\t.\t{sigAvg:.3f}\t{pHMLog10:.3f}\t{qHMLog10:.3f}\t{int(pointSource)}"
         )
-    return "\n".join(lines)
+
+    with open(outPath, "w") as outF:
+        outF.write("\n".join(lines) + ("\n" if lines else ""))
+    logger.info(f"Merged matches written to {outPath}")
+    return outPath
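
The merged-peak name column is intended to be machine-parsable. A sketch matching the format emitted above with the regex from the inline comment:

import re

pattern = re.compile(
    r"^consenrichPeak\|i=(?P<i>\d+)\|gap=(?P<gap>\d+)bp"
    r"\|ct=(?P<ct>\d+)\|qRange=(?P<qmin>\d+\.\d{3})_(?P<qmax>\d+\.\d{3})$"
)
m = pattern.match("consenrichPeak|i=4|gap=75bp|ct=3|qRange=1.302_2.845")
if m:
    print(m.group("ct"), m.group("qmin"), m.group("qmax"))  # 3 1.302 2.845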