consenrich 0.7.0b1__cp311-cp311-macosx_11_0_arm64.whl → 0.7.1b1__cp311-cp311-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


consenrich/matching.py CHANGED
@@ -3,6 +3,7 @@ r"""Module implementing (experimental) 'structured peak detection' features usin
 
 import logging
 import os
+import math
 from pybedtools import BedTool
 from typing import List, Optional
 
@@ -23,13 +24,25 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)
 
 
+def scalarClip(value: float, low: float, high: float) -> float:
+    return low if value < low else high if value > high else value
+
+
 def castableToFloat(value) -> bool:
     if value is None:
         return False
     if isinstance(value, bool):
         return False
     if isinstance(value, str):
-        if value.lower().replace(' ', '') in ["nan", "inf", "-inf", "infinity", "-infinity", "", " "]:
+        if value.lower().replace(" ", "") in [
+            "nan",
+            "inf",
+            "-inf",
+            "infinity",
+            "-infinity",
+            "",
+            " ",
+        ]:
             return False
 
     try:
@@ -75,7 +88,11 @@ def matchExistingBedGraph(
     )
 
     if mergeGapBP is None:
-        mergeGapBP = (minMatchLengthBP // 2) + 1 if minMatchLengthBP is not None else 75
+        mergeGapBP = (
+            (minMatchLengthBP // 2) + 1
+            if minMatchLengthBP is not None
+            else 75
+        )
 
     allowedTemplates = [
         x for x in pw.wavelist(kind="discrete") if "bio" not in x
@@ -129,7 +146,9 @@ def matchExistingBedGraph(
                 randSeed=randSeed,
             )
         except Exception as ex:
-            logger.info(f"Skipping {chrom_} due to error in matchWavelet: {ex}")
+            logger.info(
+                f"Skipping {chrom_} due to error in matchWavelet: {ex}"
+            )
             continue
 
         if df__.empty:
@@ -145,7 +164,9 @@ def matchExistingBedGraph(
             outPaths.append(perChromOut)
 
         if merge:
-            mergedPath = mergeMatches(perChromOut, mergeGapBP=mergeGapBP)
+            mergedPath = mergeMatches(
+                perChromOut, mergeGapBP=mergeGapBP
+            )
             if mergedPath is not None:
                 logger.info(f"Merged matches written to {mergedPath}")
                 outPathsMerged.append(mergedPath)
@@ -177,7 +198,9 @@ def matchExistingBedGraph(
         with open(path, "r") as inF:
             for line in inF:
                 outF.write(line)
-    logger.info(f"All merged matches written to {outPathMergedAll}")
+    logger.info(
+        f"All merged matches written to {outPathMergedAll}"
+    )
 
     for path_ in outPaths + outPathsMerged:
         try:
@@ -215,7 +238,8 @@ def matchWavelet(
 
     :param chromosome: Chromosome name for the input intervals and values.
     :type chromosome: str
-    :param values: 'Consensus' signal estimates derived from multiple samples, e.g., from Consenrich.
+    :param values: A 1D array of signal-like values. In this documentation, we refer to values derived from Consenrich,
+        but other continuous-valued tracks at evenly spaced genomic intervals may be suitable, too.
     :type values: npt.NDArray[np.float64]
     :param templateNames: A list of str values -- wavelet bases used for matching, e.g., `[haar, db2, sym4]`
     :type templateNames: List[str]
@@ -226,19 +250,19 @@ def matchWavelet(
         an empirical null to test significance. See :func:`cconsenrich.csampleBlockStats`.
     :type iters: int
     :param alpha: Primary significance threshold on detected matches. Specifically, the
-        :math:`1 - \alpha` quantile of an empirical null distribution. The empirical null
-        distribution is built from cross-correlation values over randomly sampled blocks.
+        minimum corr. empirical p-value approximated from randomly sampled blocks in the
+        response sequence.
     :type alpha: float
     :param minMatchLengthBP: Within a window of `minMatchLengthBP` length (bp), relative maxima in
         the signal-template convolution must be greater in value than others to qualify as matches.
     :type minMatchLengthBP: int
-    :param minSignalAtMaxima: Secondary significance threshold coupled with `alpha`. Require the *signal value*
-        at relative maxima in the response sequence to be greater than this threshold. Comparisons are made in log-scale.
-        If a `float` value is provided, the minimum signal value must be greater than this (absolute) value. *Set to a
-        negative value to disable the threshold*.
-        If a `str` value is provided, looks for 'q:quantileValue', e.g., 'q:0.75'. The
+    :param minSignalAtMaxima: Secondary significance threshold coupled with `alpha`. Requires the *signal value*
+        at relative maxima in the response sequence to be greater than this threshold. Comparisons are made in log-scale
+        to temper genome-wide dynamic range. If a `float` value is provided, the minimum signal value must be greater
+        than this (absolute) value. *Set to a negative value to disable the threshold*.
+        If a `str` value is provided, looks for 'q:quantileValue', e.g., 'q:0.90'. The
         threshold is then set to the corresponding quantile of the non-zero signal estimates.
-        Defaults to str value 'q:0.75' --- the 90th percentile of signal values.
+        Defaults to str value 'q:0.75' --- the 75th percentile of signal values.
     :type minSignalAtMaxima: Optional[str | float]
     :param useScalingFunction: If True, use (only) the scaling function to build the matching template.
         If False, use (only) the wavelet function.
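
Note on the docstring change above: the 'q:quantileValue' convention resolves to a quantile of the non-zero asinh-transformed signal, and a negative numeric value disables the secondary check entirely. A minimal standalone sketch of that resolution logic (toy data; thresholdFromSpec is a hypothetical helper mirroring the parseMinSignalThreshold closure added in the next hunk, not part of the package API):

    import numpy as np

    rng = np.random.default_rng(0)
    # toy stand-in for the non-zero asinh-transformed signal track
    asinhNonZeroValues = np.asinh(rng.exponential(scale=5.0, size=10_000))

    def thresholdFromSpec(spec) -> float:
        """Resolve a minSignalAtMaxima-style spec to an asinh-scale cutoff."""
        if isinstance(spec, str) and spec.startswith("q:"):
            q = float(spec.split("q:")[-1])
            return float(
                np.quantile(
                    asinhNonZeroValues, q, method="interpolated_inverted_cdf"
                )
            )
        v = float(spec)
        # negative values effectively disable the secondary threshold
        return -1e6 if v < 0 else float(np.asinh(v))

    print(thresholdFromSpec("q:0.75"))  # 75th percentile, asinh scale
    print(thresholdFromSpec(-1))        # -1e6, i.e., threshold off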
@@ -247,342 +271,349 @@ def matchWavelet(
     :type excludeRegionsBedFile: Optional[str]
 
     :seealso: :class:`consenrich.core.matchingParams`, :func:`cconsenrich.csampleBlockStats`, :ref:`matching`
+    :return: A pandas DataFrame with detected matches
+    :rtype: pd.DataFrame
     """
-
     if len(intervals) < 5:
         raise ValueError("`intervals` must be at least length 5")
     if len(values) != len(intervals):
-        raise ValueError("`values` must have the same length as `intervals`")
-    intervalLengthBP = intervals[1] - intervals[0]
-    if not np.all(np.abs(np.diff(intervals)) == intervalLengthBP):
-        # FFR: don't change this exception message without updating tests
-        # --'spaced' is matched in tests
+        raise ValueError(
+            "`values` must have the same length as `intervals`"
+        )
+    intervalLengthBp = intervals[1] - intervals[0]
+    if not np.all(np.abs(np.diff(intervals)) == intervalLengthBp):
         raise ValueError("`intervals` must be evenly spaced.")
-
-    randSeed_: int = int(randSeed)
-    cols = [
-        "chromosome",
-        "start",
-        "end",
-        "name",
-        "score",
-        "strand",
-        "signal",
-        "pValue",
-        "qValue",
-        "pointSource",
-    ]
-    matchDF = pd.DataFrame(columns=cols)
-    minMatchLengthBPCopy: Optional[int] = minMatchLengthBP
+    rng = np.random.default_rng(int(randSeed))
     cascadeLevels = sorted(list(set(cascadeLevels)))
     if weights is not None and len(weights) == len(values):
         values = values * weights
     asinhValues = np.asinh(values, dtype=np.float32)
     asinhNonZeroValues = asinhValues[asinhValues > 0]
-    iters = max(iters, 1000)
-    defQuantile: float = 0.75
-    for l_, cascadeLevel in enumerate(cascadeLevels):
-        for t_, templateName in enumerate(templateNames):
-            try:
-                templateName = str(templateName)
-                cascadeLevel = int(cascadeLevel)
-            except ValueError:
-                logger.info(
-                    f"Skipping invalid templateName or cascadeLevel: {templateName}, {cascadeLevel}"
+    iters = max(int(iters), 1000)
+    defQuantile = 0.75
+    chromMin = int(intervals[0])
+    chromMax = int(intervals[-1])
+    chromMid = chromMin + (chromMax - chromMin) // 2  # for split
+    halfLeftMask = intervals < chromMid
+    halfRightMask = ~halfLeftMask
+    excludeMaskGlobal = np.zeros(len(intervals), dtype=np.uint8)
+    if excludeRegionsBedFile is not None:
+        excludeMaskGlobal = core.getBedMask(
+            chromosome, excludeRegionsBedFile, intervals
+        ).astype(np.uint8)
+    allRows = []
+
+    def bhFdr(p: np.ndarray) -> np.ndarray:
+        m = len(p)
+        order = np.argsort(p, kind="mergesort")
+        ranked = np.arange(1, m + 1, dtype=float)
+        q = (p[order] * m) / ranked
+        q = np.minimum.accumulate(q[::-1])[::-1]
+        out = np.empty_like(q)
+        out[order] = q
+        return np.clip(out, 0.0, 1.0)
+
+    def parseMinSignalThreshold(val):
+        if val is None:
+            return -1e6
+        if isinstance(val, str):
+            if val.startswith("q:"):
+                qVal = float(val.split("q:")[-1])
+                if not (0 <= qVal <= 1):
+                    raise ValueError(
+                        f"Quantile {qVal} is out of range"
+                    )
+                return float(
+                    np.quantile(
+                        asinhNonZeroValues,
+                        qVal,
+                        method="interpolated_inverted_cdf",
+                    )
                 )
-                continue
+            elif castableToFloat(val):
+                v = float(val)
+                return -1e6 if v < 0 else float(np.asinh(v))
+            else:
+                return float(
+                    np.quantile(
+                        asinhNonZeroValues,
+                        defQuantile,
+                        method="interpolated_inverted_cdf",
+                    )
+                )
+        if isinstance(val, (float, int)):
+            v = float(val)
+            return -1e6 if v < 0 else float(np.asinh(v))
+        return float(
+            np.quantile(
+                asinhNonZeroValues,
+                defQuantile,
+                method="interpolated_inverted_cdf",
+            )
+        )
+
+    def relativeMaxima(
+        resp: np.ndarray, orderBins: int
+    ) -> np.ndarray:
+        return signal.argrelmax(resp, order=max(int(orderBins), 1))[0]
+
+    def sampleBlockMaxima(
+        resp: np.ndarray,
+        halfMask: np.ndarray,
+        relWindowBins: int,
+        nsamp: int,
+        seed: int,
+    ):
+        exMask = excludeMaskGlobal.astype(np.uint8).copy()
+        exMask |= (~halfMask).astype(np.uint8)
+        vals = np.array(
+            cconsenrich.csampleBlockStats(
+                intervals.astype(np.uint32),
+                resp,
+                int(relWindowBins),
+                int(nsamp),
+                int(seed),
+                exMask.astype(np.uint8),
+            ),
+            dtype=float,
+        )
+        if len(vals) == 0:
+            return vals
+        low = np.quantile(vals, 0.001)
+        high = np.quantile(vals, 0.999)
+        return vals[(vals > low) & (vals < high)]
+
+    for cascadeLevel in cascadeLevels:
+        for templateName in templateNames:
             if templateName not in pw.wavelist(kind="discrete"):
-                logger.info(
-                    f"\nSkipping unknown wavelet template: {templateName}\nAvailable templates: {pw.wavelist(kind='discrete')}"
+                logger.warning(
                    f"Skipping unknown wavelet template: {templateName}"
                 )
                 continue
 
-            wav = pw.Wavelet(templateName)
-            scalingFunc, waveletFunc, x = wav.wavefun(level=cascadeLevel)
-            template = np.array(waveletFunc, dtype=np.float64) / np.linalg.norm(
-                waveletFunc
+            wav = pw.Wavelet(str(templateName))
+            scalingFunc, waveletFunc, _ = wav.wavefun(
+                level=int(cascadeLevel)
             )
-
-            if useScalingFunction:
-                template = np.array(
-                    scalingFunc, dtype=np.float64
-                ) / np.linalg.norm(scalingFunc)
+            template = np.array(
+                scalingFunc if useScalingFunction else waveletFunc,
+                dtype=np.float64,
+            )
+            template /= np.linalg.norm(template)
 
             logger.info(
-                f"Matching: template: {templateName}, cascade level: {cascadeLevel}, template length: {len(template)}, scaling: {useScalingFunction}, wavelet: {not useScalingFunction}"
+                f"\n\tMatching template: {templateName}"
+                f"\n\tcascade level: {cascadeLevel}"
+                f"\n\ttemplate length: {len(template)}"
             )
 
-            responseSequence: npt.NDArray[np.float64] = signal.fftconvolve(
+            # efficient FFT-based cross-correlation
+            # (OA may be better for smaller templates, TODO add a check)
+            response = signal.fftconvolve(
                 values, template[::-1], mode="same"
             )
-
-            minMatchLengthBP = minMatchLengthBPCopy
-            if minMatchLengthBP is None or minMatchLengthBP < 1:
-                minMatchLengthBP = len(template) * intervalLengthBP
-            if minMatchLengthBP % intervalLengthBP != 0:
-                minMatchLengthBP += intervalLengthBP - (
-                    minMatchLengthBP % intervalLengthBP
+            thisMinMatchBp = minMatchLengthBP
+            if thisMinMatchBp is None or thisMinMatchBp < 1:
+                thisMinMatchBp = len(template) * intervalLengthBp
+            if thisMinMatchBp % intervalLengthBp != 0:
+                thisMinMatchBp += intervalLengthBp - (
+                    thisMinMatchBp % intervalLengthBp
                 )
-
-            relativeMaximaWindow = int(
-                ((minMatchLengthBP / intervalLengthBP) / 2) + 1
+            relWindowBins = int(
+                ((thisMinMatchBp / intervalLengthBp) / 2) + 1
             )
-            relativeMaximaWindow = max(relativeMaximaWindow, 1)
-
-            excludeMask = np.zeros(len(intervals), dtype=np.uint8)
-            if excludeRegionsBedFile is not None:
-                excludeMask = core.getBedMask(
-                    chromosome,
-                    excludeRegionsBedFile,
-                    intervals,
-                )
-
-            logger.info(
-                f"\nSampling {iters} block maxima for template {templateName} at cascade level {cascadeLevel} with (expected) relative maxima window size {relativeMaximaWindow}.\n"
+            relWindowBins = max(relWindowBins, 1)
+            asinhThreshold = parseMinSignalThreshold(
+                minSignalAtMaxima
             )
-            blockMaxima = np.array(
-                cconsenrich.csampleBlockStats(
-                    intervals.astype(np.uint32),
-                    responseSequence,
-                    relativeMaximaWindow,
-                    iters * 2,
-                    randSeed_,
-                    excludeMask.astype(np.uint8),
-                ),
-                dtype=float,
-            )
-            blockMaximaCheck = blockMaxima.copy()[iters:]
-            blockMaxima = blockMaxima[:iters]
-            blockMaxima = blockMaxima[
-                (blockMaxima > np.quantile(blockMaxima, 0.005))
-                & (blockMaxima < np.quantile(blockMaxima, 0.995))
-            ]
-
-            ecdfBlockMaximaSF = stats.ecdf(blockMaxima).sf
-
-            responseThreshold = float(1e6)
-            arsinhSignalThreshold = float(1e6)
-            try:
-                # we use 'interpolated_inverted_cdf' in a few spots
-                # --- making sure it's supported here, at its first use
-                responseThreshold = np.quantile(
-                    blockMaxima, 1 - alpha, method="interpolated_inverted_cdf"
+            for nullMask, testMask, tag in [
+                (halfLeftMask, halfRightMask, "R"),
+                (halfRightMask, halfLeftMask, "L"),
+            ]:
+                blockMaxima = sampleBlockMaxima(
+                    response,
+                    nullMask,
+                    relWindowBins,
+                    nsamp=max(iters, 1000),
+                    seed=rng.integers(1, 10_000),
                 )
-            except (TypeError, ValueError, KeyError) as err_:
-                logger.warning(
-                    f"\nError computing response threshold with alpha={alpha}:\n{err_}\n"
-                    f"\nIs `blockMaxima` empty?"
-                    f"\nIs NumPy older than 1.22.0 (~May 2022~)?"
-                    f"\nIs `alpha` in (0,1)?\n"
-                )
-                raise
-
-            # parse minSignalAtMaxima, set arsinhSignalThreshold
-            if minSignalAtMaxima is None:
-                # -----we got a `None`-----
-                arsinhSignalThreshold = -float(1e6)
-            elif isinstance(minSignalAtMaxima, str):
-                # -----we got a str-----
-                if minSignalAtMaxima.startswith("q:"):
-                    # case: expected 'q:quantileValue' format
-                    qVal = float(minSignalAtMaxima.split("q:")[-1])
-                    if qVal < 0 or qVal > 1:
-                        raise ValueError(f"Quantile {qVal} is out of range")
-                    arsinhSignalThreshold = float(
-                        np.quantile(
-                            asinhNonZeroValues,
-                            qVal,
-                            method="interpolated_inverted_cdf",
-                        )
-                    )
-
-                elif castableToFloat(minSignalAtMaxima):
-                    # case: numeric in str form (possible due to CLI)
-                    if float(minSignalAtMaxima) < 0.0:
-                        # effectively disables threshold
-                        arsinhSignalThreshold = -float(1e6)
-                    else:
-                        # use supplied value
-                        arsinhSignalThreshold = np.asinh(
-                            float(minSignalAtMaxima)
-                        )
-                else:
-                    # case: not in known format, not castable to a float, use defaults
-                    logger.info(
-                        f"Couldn't parse `minSignalAtMaxima` value: {minSignalAtMaxima}, using default"
-                    )
-                    arsinhSignalThreshold = float(
-                        np.quantile(
-                            asinhNonZeroValues,
-                            defQuantile,
-                            method="interpolated_inverted_cdf",
-                        )
+                if len(blockMaxima) < 25:
+                    pooledMask = ~excludeMaskGlobal.astype(bool)
+                    blockMaxima = sampleBlockMaxima(
+                        response,
+                        pooledMask,
+                        relWindowBins,
+                        nsamp=max(iters, 1000),
+                        seed=rng.integers(1, 10_000),
                     )
-                # -----
-
-            elif isinstance(minSignalAtMaxima, (float, int)):
-                # -----we got an int or float-----
-                if float(minSignalAtMaxima) < 0.0:
-                    # effectively disables threshold
-                    arsinhSignalThreshold = -float(1e6)
-                else:
-                    # use supplied value
-                    arsinhSignalThreshold = np.asinh(float(minSignalAtMaxima))
-            # -----
-
-
-            relativeMaximaIndices = signal.argrelmax(
-                responseSequence, order=relativeMaximaWindow
-            )[0]
-
-            relativeMaximaIndices = relativeMaximaIndices[
-                (responseSequence[relativeMaximaIndices] > responseThreshold)
-                & (asinhValues[relativeMaximaIndices] > arsinhSignalThreshold)
-            ]
-
-            if len(relativeMaximaIndices) == 0:
-                logger.info(
-                    f"no matches were detected using for template {templateName} at cascade level {cascadeLevel}...skipping matching"
+                ecdfSf = stats.ecdf(blockMaxima).sf
+                candidateIdx = relativeMaxima(response, relWindowBins)
+
+                candidateMask = (
+                    (candidateIdx >= relWindowBins)
+                    & (candidateIdx < len(response) - relWindowBins)
+                    & (testMask[candidateIdx])
+                    & (excludeMaskGlobal[candidateIdx] == 0)
+                    & (asinhValues[candidateIdx] > asinhThreshold)
                 )
-                continue
 
-            if maxNumMatches is not None:
-                if len(relativeMaximaIndices) > maxNumMatches:
-                    # take the greatest maxNumMatches (by 'signal')
-                    relativeMaximaIndices = relativeMaximaIndices[
-                        np.argsort(asinhValues[relativeMaximaIndices])[
+                candidateIdx = candidateIdx[candidateMask]
+                if len(candidateIdx) == 0:
+                    continue
+                if (
+                    maxNumMatches is not None
+                    and len(candidateIdx) > maxNumMatches
+                ):
+                    candidateIdx = candidateIdx[
+                        np.argsort(asinhValues[candidateIdx])[
                             -maxNumMatches:
                         ]
                     ]
-
-            ecdfSFCheckVals: npt.NDArray[np.float64] = (
-                ecdfBlockMaximaSF.evaluate(blockMaximaCheck)
-            )
-            testKS, _ = stats.kstest(
-                ecdfSFCheckVals,
-                stats.uniform.cdf,
-                alternative="two-sided",
-            )
-
-            logger.info(
-                f"\n\tDetected {len(relativeMaximaIndices)} matches (alpha={alpha}, useScalingFunction={useScalingFunction}): {templateName}: level={cascadeLevel}.\n"
-                f"\tResponse threshold: {responseThreshold:.3f}, arsinh(Signal Threshold): {arsinhSignalThreshold:.3f}\n"
-                f"\t~KS_Statistic~ [ePVals, uniformCDF]: {testKS:.4f}\n"
-                f"\n\n{textNullCDF(ecdfSFCheckVals)}\n\n"  # lil text-plot histogram of approx. null CDF
-            )
-
-            # starts
-            startsIdx = np.maximum(
-                relativeMaximaIndices - relativeMaximaWindow, 0
-            )
-            # ends
-            endsIdx = np.minimum(
-                len(values) - 1, relativeMaximaIndices + relativeMaximaWindow
-            )
-            # point source
-            pointSourcesIdx = []
-            for start_, end_ in zip(startsIdx, endsIdx):
-                pointSourcesIdx.append(
-                    np.argmax(values[start_ : end_ + 1]) + start_
+                pEmp = np.clip(
+                    ecdfSf.evaluate(response[candidateIdx]),
+                    1.0e-10,
+                    1.0,
                 )
-            pointSourcesIdx = np.array(pointSourcesIdx)
-            starts = intervals[startsIdx]
-            ends = intervals[endsIdx]
-            pointSources = (intervals[pointSourcesIdx]) + max(
-                1, intervalLengthBP // 2
-            )
-            if (
-                recenterAtPointSource
-            ):  # recenter at point source (signal maximum)
-                starts = pointSources - (
-                    relativeMaximaWindow * intervalLengthBP
+                startsIdx = np.maximum(
+                    candidateIdx - relWindowBins, 0
                 )
-                ends = pointSources + (relativeMaximaWindow * intervalLengthBP)
-                pointSources = (intervals[pointSourcesIdx] - starts) + max(
-                    1, intervalLengthBP // 2
-                )
-            # (ucsc browser) score [0,1000]
-            sqScores = (1 + responseSequence[relativeMaximaIndices]) ** 2
-            minResponse = np.min(sqScores)
-            maxResponse = np.max(sqScores)
-            rangeResponse = max(maxResponse - minResponse, 1.0)
-            scores = (
-                250 + 750 * (sqScores - minResponse) / rangeResponse
-            ).astype(int)
-            # feature name
-            names = [
-                f"{templateName}_{cascadeLevel}_{i}"
-                for i in relativeMaximaIndices
-            ]
-            # strand
-            strands = ["." for _ in range(len(scores))]
-            # p-values in -log10 scale per convention
-            pValues = -np.log10(
-                np.clip(
-                    ecdfBlockMaximaSF.evaluate(
-                        responseSequence[relativeMaximaIndices]
-                    ),
-                    1e-10,
-                    1.0,
+                endsIdx = np.minimum(
+                    len(values) - 1, candidateIdx + relWindowBins
                 )
-            )
-            # q-values (ignored)
-            qValues = np.array(np.ones_like(pValues) * -1.0)
-
-            tempDF = pd.DataFrame(
-                {
-                    "chromosome": [chromosome] * len(relativeMaximaIndices),
-                    "start": starts.astype(int),
-                    "end": ends.astype(int),
-                    "name": names,
-                    "score": scores,
-                    "strand": strands,
-                    "signal": responseSequence[relativeMaximaIndices],
-                    "pValue": pValues,
-                    "qValue": qValues,
-                    "pointSource": pointSources.astype(int),
-                }
-            )
+                pointSourcesIdx = []
+                for s, e in zip(startsIdx, endsIdx):
+                    pointSourcesIdx.append(
+                        np.argmax(values[s : e + 1]) + s
+                    )
+                pointSourcesIdx = np.array(pointSourcesIdx)
+                starts = intervals[startsIdx]
+                ends = intervals[endsIdx]
+                pointSourcesAbs = (intervals[pointSourcesIdx]) + max(
+                    1, intervalLengthBp // 2
+                )
+                if recenterAtPointSource:
+                    starts = pointSourcesAbs - (
+                        relWindowBins * intervalLengthBp
+                    )
+                    ends = pointSourcesAbs + (
+                        relWindowBins * intervalLengthBp
+                    )
+                pointSourcesRel = (
+                    intervals[pointSourcesIdx] - starts
+                ) + max(1, intervalLengthBp // 2)
+                sqScores = (1 + response[candidateIdx]) ** 2
+                minR, maxR = (
+                    float(np.min(sqScores)),
+                    float(np.max(sqScores)),
+                )
+                rangeR = max(maxR - minR, 1.0)
+                scores = (
+                    250 + 750 * (sqScores - minR) / rangeR
+                ).astype(int)
+                for i, idxVal in enumerate(candidateIdx):
+                    allRows.append(
+                        {
+                            "chromosome": chromosome,
+                            "start": int(starts[i]),
+                            "end": int(ends[i]),
+                            "name": f"{templateName}_{cascadeLevel}_{idxVal}_{tag}",
+                            "score": int(scores[i]),
+                            "strand": ".",
+                            "signal": float(response[idxVal]),
+                            "p_raw": float(pEmp[i]),
+                            "pointSource": int(pointSourcesRel[i]),
+                        }
+                    )
 
-            if matchDF.empty:
-                matchDF = tempDF
-            else:
-                matchDF = pd.concat([matchDF, tempDF], ignore_index=True)
-            randSeed_ += 1
+    if not allRows:
+        logger.warning(
+            "No matches detected, returning empty DataFrame."
+        )
 
-    if matchDF.empty:
-        logger.info("No matches detected, returning empty DataFrame.")
-        return matchDF
-    matchDF.sort_values(by=["chromosome", "start", "end"], inplace=True)
-    matchDF.reset_index(drop=True, inplace=True)
-    return matchDF
+        return pd.DataFrame(
+            columns=[
+                "chromosome",
+                "start",
+                "end",
+                "name",
+                "score",
+                "strand",
+                "signal",
+                "pValue",
+                "qValue",
+                "pointSource",
+            ]
+        )
 
+    df = pd.DataFrame(allRows)
+    qVals = bhFdr(df["p_raw"].values.astype(float))
+    df["pValue"] = -np.log10(
+        np.clip(df["p_raw"].values, 1.0e-10, 1.0)
+    )
+    df["qValue"] = -np.log10(np.clip(qVals, 1.0e-10, 1.0))
+    df.drop(columns=["p_raw"], inplace=True)
+    df = df[qVals <= alpha].copy()
+    df["chromosome"] = df["chromosome"].astype(str)
+    df.sort_values(by=["chromosome", "start", "end"], inplace=True)
+    df.reset_index(drop=True, inplace=True)
+    df = df[
+        [
+            "chromosome",
+            "start",
+            "end",
+            "name",
+            "score",
+            "strand",
+            "signal",
+            "pValue",
+            "qValue",
+            "pointSource",
+        ]
+    ]
+    return df
 
-def mergeMatches(filePath: str, mergeGapBP: int = 50):
-    r"""Merge overlapping or nearby structured peaks (matches) in a narrowPeak file.
 
-    Where an overlap occurs within `mergeGapBP` base pairs, the feature with the greatest signal defines the new summit/pointSource
+def mergeMatches(
+    filePath: str,
+    mergeGapBP: Optional[int],
+) -> Optional[str]:
+    r"""Merge overlapping or nearby structured peaks ('matches') in a narrowPeak file.
+
+    The harmonic mean of p-values and q-values is computed for each merged region within `mergeGapBP` base pairs.
+    The fourth column (name) of each merged peak contains information about the number of features that were merged
+    and the range of q-values among them.
+
+    Expects a `narrowPeak <https://genome.ucsc.edu/FAQ/FAQformat.html#format12>`_ file as input (all numeric columns, '.' for strand if unknown).
 
     :param filePath: narrowPeak file containing matches detected with :func:`consenrich.matching.matchWavelet`
     :type filePath: str
-    :param mergeGapBP: Maximum gap size (in base pairs) to consider for merging
-    :type mergeGapBP: int
+    :param mergeGapBP: Maximum gap size (in base pairs) to consider for merging. Defaults to 75 bp if `None` or less than 1.
+    :type mergeGapBP: Optional[int]
 
-    :seealso: :class:`consenrich.core.matchingParams`
+    :seealso: :ref:`matching`, :class:`consenrich.core.matchingParams`
     """
+
+    if mergeGapBP is None or mergeGapBP < 1:
+        mergeGapBP = 75
+
+    MAX_NEGLOGP = 10.0
+    MIN_NEGLOGP = 1.0e-10
+
     if not os.path.isfile(filePath):
-        logger.info(f"Couldn't access {filePath}...skipping merge")
+        logger.warning(f"Couldn't access {filePath}...skipping merge")
         return None
     bed = None
     try:
         bed = BedTool(filePath)
     except Exception as ex:
-        logger.info(
+        logger.warning(
             f"Couldn't create BedTool for {filePath}:\n{ex}\n\nskipping merge..."
         )
         return None
     if bed is None:
-        logger.info(f"Couldn't create BedTool for {filePath}...skipping merge")
+        logger.warning(
            f"Couldn't create BedTool for {filePath}...skipping merge"
        )
         return None
 
     bed = bed.sort()
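
The substantive change in the hunk above: candidate maxima are no longer gated by a fixed (1 - alpha) quantile of the null response. Each relative maximum instead receives an empirical p-value from block maxima sampled on the opposite chromosome half (the "L"/"R" split), and alpha is applied to Benjamini-Hochberg q-values pooled across templates and cascade levels. A self-contained sketch of the BH step, restating the nested bhFdr helper with toy inputs (expected output in the comment):

    import numpy as np

    def bhFdr(p: np.ndarray) -> np.ndarray:
        # Benjamini-Hochberg adjusted p-values: sort, scale p_(k) by m/k,
        # enforce monotonicity from the largest p downward, restore input order.
        m = len(p)
        order = np.argsort(p, kind="mergesort")  # stable sort
        q = (p[order] * m) / np.arange(1, m + 1, dtype=float)
        q = np.minimum.accumulate(q[::-1])[::-1]
        out = np.empty_like(q)
        out[order] = q
        return np.clip(out, 0.0, 1.0)

    pEmp = np.array([0.001, 0.009, 0.04, 0.2, 0.9])
    print(bhFdr(pEmp))  # approx. [0.005, 0.0225, 0.0667, 0.25, 0.9]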
@@ -595,41 +626,86 @@ def mergeMatches(filePath: str, mergeGapBP: int = 50):
         end = int(fields[2])
         score = float(fields[4])
         signal = float(fields[6])
-        pval = float(fields[7])
-        qval = float(fields[8])
+        pLog10 = float(fields[7])
+        qLog10 = float(fields[8])
         peak = int(fields[9])
-        clId = fields[-1]
-        if clId not in groups:
-            groups[clId] = {
+        clusterID = fields[-1]
+        if clusterID not in groups:
+            groups[clusterID] = {
                 "chrom": chrom,
                 "sMin": start,
                 "eMax": end,
                 "scSum": 0.0,
                 "sigSum": 0.0,
-                "pSum": 0.0,
-                "qSum": 0.0,
                 "n": 0,
                 "maxS": float("-inf"),
                 "peakAbs": -1,
+                "pMax": float("-inf"),
+                "pTail": 0.0,
+                "pHasInf": False,
+                "qMax": float("-inf"),
+                "qMin": float("inf"),
+                "qTail": 0.0,
+                "qHasInf": False,
             }
-        g = groups[clId]
+        g = groups[clusterID]
         if start < g["sMin"]:
             g["sMin"] = start
         if end > g["eMax"]:
             g["eMax"] = end
         g["scSum"] += score
         g["sigSum"] += signal
-        g["pSum"] += pval
-        g["qSum"] += qval
         g["n"] += 1
-        # scan for largest signal, FFR: consider using the p-val in the future
+
+        if math.isinf(pLog10) or pLog10 >= MAX_NEGLOGP:
+            g["pHasInf"] = True
+        else:
+            if pLog10 > g["pMax"]:
+                if g["pMax"] == float("-inf"):
+                    g["pTail"] = 1.0
+                else:
+                    g["pTail"] = (
+                        g["pTail"] * (10 ** (g["pMax"] - pLog10))
+                        + 1.0
+                    )
+                g["pMax"] = pLog10
+            else:
+                g["pTail"] += 10 ** (pLog10 - g["pMax"])
+
+        if (
+            math.isinf(qLog10)
+            or qLog10 >= MAX_NEGLOGP
+            or qLog10 <= MIN_NEGLOGP
+        ):
+            g["qHasInf"] = True
+        else:
+            if qLog10 < g["qMin"]:
+                if qLog10 < MIN_NEGLOGP:
+                    g["qMin"] = MIN_NEGLOGP
+                else:
+                    g["qMin"] = qLog10
+
+            if qLog10 > g["qMax"]:
+                if g["qMax"] == float("-inf"):
+                    g["qTail"] = 1.0
+                else:
+                    g["qTail"] = (
+                        g["qTail"] * (10 ** (g["qMax"] - qLog10))
+                        + 1.0
+                    )
+                g["qMax"] = qLog10
+            else:
+                g["qTail"] += 10 ** (qLog10 - g["qMax"])
+
         if signal > g["maxS"]:
             g["maxS"] = signal
             g["peakAbs"] = start + peak if peak >= 0 else -1
+
     items = []
-    for clId, g in groups.items():
+    for clusterID, g in groups.items():
         items.append((g["chrom"], g["sMin"], g["eMax"], g))
     items.sort(key=lambda x: (str(x[0]), x[1], x[2]))
+
     outPath = f"{filePath.replace('.narrowPeak', '')}.mergedMatches.narrowPeak"
     lines = []
     i = 0
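
The p/q accumulation added above is a base-10 logsumexp: for -log10 values l_i, the running pair (pMax, pTail) maintains sum_i 10**l_i = 10**pMax * pTail without forming the potentially huge powers directly, since the exponents handled are always differences relative to the running maximum. A small standalone check of that invariant using the same update rule on toy values (not package code):

    import math

    pMax, pTail = float("-inf"), 0.0
    ls = [2.3, 5.1, 4.0, 5.1]  # -log10 p-values within one merged cluster
    for l in ls:
        if l > pMax:
            # rescale the old tail to the new maximum, then count this term
            pTail = 1.0 if pMax == float("-inf") else pTail * 10 ** (pMax - l) + 1.0
            pMax = l
        else:
            pTail += 10 ** (l - pMax)

    direct = sum(10**l for l in ls)
    assert math.isclose(10**pMax * pTail, direct)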
@@ -642,69 +718,68 @@ def mergeMatches(filePath: str, mergeGapBP: int = 50):
         avgScore = 1000
         scoreInt = int(round(avgScore))
         sigAvg = g["sigSum"] / g["n"]
-        pAvg = g["pSum"] / g["n"]
-        qAvg = g["qSum"] / g["n"]
-        pointSource = g["peakAbs"] - sMin if g["peakAbs"] >= 0 else -1
-        name = f"mergedPeak{i}"
-        lines.append(
-            f"{chrom}\t{int(sMin)}\t{int(eMax)}\t{name}\t{scoreInt}\t.\t{sigAvg:.3f}\t{pAvg:.3f}\t{qAvg:.3f}\t{int(pointSource)}"
-        )
-    with open(outPath, "w") as outF:
-        outF.write("\n".join(lines) + ("\n" if lines else ""))
-    logger.info(f"Merged matches written to {outPath}")
-    return outPath
 
+        if g["pHasInf"]:
+            pHMLog10 = MAX_NEGLOGP
+        else:
+            if (
+                g["pMax"] == float("-inf")
+                or not (g["pTail"] > 0.0)
+                or math.isnan(g["pTail"])
+            ):
+                pHMLog10 = MIN_NEGLOGP
+            else:
+                pHMLog10 = -math.log10(g["n"]) + (
+                    g["pMax"] + math.log10(g["pTail"])
+                )
+                pHMLog10 = max(
+                    MIN_NEGLOGP, min(pHMLog10, MAX_NEGLOGP)
+                )
 
-def textNullCDF(
-    nullBlockMaximaSFVals: npt.NDArray[np.float64],
-    binCount: int = 20,
-    barWidth: int = 50,
-    barChar="\u25a2",
-    normalize: bool = False,
-) -> str:
-    r"""Plot a histogram of the distribution 1 - ECDF(nullBlockMaxima)
+        if g["qHasInf"]:
+            qHMLog10 = MAX_NEGLOGP
+        else:
+            if (
+                g["qMax"] == float("-inf")
+                or not (g["qTail"] > 0.0)
+                or math.isnan(g["qTail"])
+            ):
+                qHMLog10 = MIN_NEGLOGP
+            else:
+                qHMLog10 = -math.log10(g["n"]) + (
+                    g["qMax"] + math.log10(g["qTail"])
+                )
+                qHMLog10 = max(
+                    MIN_NEGLOGP, min(qHMLog10, MAX_NEGLOGP)
+                )
 
-    Called by :func:`consenrich.matching.matchWavelet`. Ideally resembles
-    a uniform(0,1) distribution.
+        pointSource = (
+            g["peakAbs"] - sMin
+            if g["peakAbs"] >= 0
+            else (eMax - sMin) // 2
+        )
 
-    :seealso: :func:`consenrich.matching.matchWavelet`, :ref:`cconsenrich.csampleBlockStats`
-    """
-    valueLower, valueUpper = (
-        min(nullBlockMaximaSFVals),
-        max(nullBlockMaximaSFVals),
-    )
-    binCount = max(1, int(binCount))
-    binStep = (valueUpper - valueLower) / binCount
-    binEdges = [
-        valueLower + indexValue * binStep for indexValue in range(binCount)
-    ]
-    binEdges.append(valueUpper)
-    binCounts = [0] * binCount
-    for numericValue in nullBlockMaximaSFVals:
-        binIndex = int((numericValue - valueLower) / binStep)
-        if binIndex == binCount:
-            binIndex -= 1
-        binCounts[binIndex] += 1
-    valueSeries = (
-        [countValue / len(nullBlockMaximaSFVals) for countValue in binCounts]
-        if normalize
-        else binCounts[:]
-    )
-    valueMaximum = max(valueSeries) if valueSeries else 0
-    widthScale = (barWidth / valueMaximum) if valueMaximum > 0 else 0
-    edgeFormat = f"{{:.{2}f}}"
-    rangeLabels = [
-        f"[{edgeFormat.format(binEdges[indexValue])},{edgeFormat.format(binEdges[indexValue + 1])})"
-        for indexValue in range(binCount)
-    ]
-    labelWidth = max(len(textValue) for textValue in rangeLabels)
-    lines = ['Histogram: "1 - ECDF(nullBlockMaxima)"']
-    for rangeLabel, seriesValue, countValue in zip(
-        rangeLabels, valueSeries, binCounts
-    ):
-        barString = barChar * int(round(seriesValue * widthScale))
-        trailingText = f"({countValue}/{len(nullBlockMaximaSFVals)})\t\t"
+        qMinLog10 = g["qMin"]
+        qMaxLog10 = g["qMax"]
+        if math.isfinite(qMinLog10) and qMinLog10 < MIN_NEGLOGP:
+            qMinLog10 = MIN_NEGLOGP
+        if math.isfinite(qMaxLog10) and qMaxLog10 > MAX_NEGLOGP:
+            qMaxLog10 = MAX_NEGLOGP
+        elif (
+            not math.isfinite(qMaxLog10)
+            or not math.isfinite(qMinLog10)
+        ) or (qMaxLog10 < MIN_NEGLOGP):
+            qMinLog10 = 0.0
+            qMaxLog10 = 0.0
+
+        # informative+parsable name
+        # e.g., regex: ^consenrichPeak\|i=(?P<i>\d+)\|gap=(?P<gap>\d+)bp\|ct=(?P<ct>\d+)\|qRange=(?P<qmin>\d+\.\d{3})_(?P<qmax>\d+\_\d{3})$
+        name = f"consenrichPeak|i={i}|gap={mergeGapBP}bp|ct={g['n']}|qRange={qMinLog10:.3f}_{qMaxLog10:.3f}"
         lines.append(
-            f"{rangeLabel.rjust(labelWidth)} | {barString}{trailingText.ljust(10)}"
+            f"{chrom}\t{int(sMin)}\t{int(eMax)}\t{name}\t{scoreInt}\t.\t{sigAvg:.3f}\t{pHMLog10:.3f}\t{qHMLog10:.3f}\t{int(pointSource)}"
         )
-    return "\n".join(lines)
+
+    with open(outPath, "w") as outF:
+        outF.write("\n".join(lines) + ("\n" if lines else ""))
+    logger.info(f"Merged matches written to {outPath}")
+    return outPath
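
Closing the loop on the merged statistics: for p_i = 10**(-l_i), the harmonic mean is n / sum_i(1/p_i), and sum_i(1/p_i) = sum_i 10**l_i = 10**pMax * pTail, so in -log10 units the reported value is -log10(n) + (pMax + log10(pTail)) -- exactly the pHMLog10/qHMLog10 expressions above, before clamping to [MIN_NEGLOGP, MAX_NEGLOGP]. A toy verification against the direct computation (standalone sketch, not package code):

    import math

    ls = [2.3, 5.1, 4.0, 5.1]  # -log10 p-values in one merged region
    n = len(ls)
    pMax = max(ls)
    pTail = sum(10 ** (l - pMax) for l in ls)
    hmLog10 = -math.log10(n) + (pMax + math.log10(pTail))

    hmDirect = n / sum(10**l for l in ls)  # harmonic mean of the p_i
    assert math.isclose(hmLog10, -math.log10(hmDirect))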