consenrich 0.6.3b1__cp314-cp314-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of consenrich might be problematic. Click here for more details.

consenrich/matching.py ADDED
@@ -0,0 +1,710 @@
1
+ # -*- coding: utf-8 -*-
2
+ r"""Module implementing (experimental) 'structured peak detection' features using wavelet-based templates."""
3
+
4
+ import logging
5
+ import os
6
+ from pybedtools import BedTool
7
+ from typing import List, Optional
8
+
9
+ import pandas as pd
10
+ import pywt as pw
11
+ import numpy as np
12
+ import numpy.typing as npt
13
+
14
+ from scipy import signal, stats
15
+
16
+ from . import cconsenrich
17
+ from . import core as core
18
+
19
+ logging.basicConfig(
20
+ level=logging.INFO,
21
+ format="%(asctime)s - %(module)s.%(funcName)s - %(levelname)s - %(message)s",
22
+ )
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
def castableToFloat(value) -> bool:
    r"""Return True if `value` can be interpreted as a *finite* float.

    Rejects ``None``, booleans, and string spellings of NaN/infinity
    (case-insensitive, spaces ignored), as well as anything that fails
    ``float()`` conversion or converts to a non-finite value.

    :param value: Candidate value (commonly a str coming from CLI/config).
    :return: True only for values castable to a finite float.
    :rtype: bool
    """
    if value is None:
        return False
    # bools are ints in Python; treat them as non-numeric here
    if isinstance(value, bool):
        return False
    if isinstance(value, str):
        # reject textual non-finite / empty spellings up front
        # (spaces are stripped first, so "" covers all-blank input)
        if value.lower().replace(" ", "") in {"nan", "inf", "-inf", "infinity", "-infinity", ""}:
            return False
    try:
        parsed = float(value)
    except Exception:
        return False
    # non-finite floats (inf/nan passed as actual float objects) are rejected
    return bool(np.isfinite(parsed))
42
+
43
+
44
def matchExistingBedGraph(
    bedGraphFile: str,
    templateName: str,
    cascadeLevel: int,
    alpha: float = 0.05,
    minMatchLengthBP: Optional[int] = 250,
    iters: int = 25_000,
    minSignalAtMaxima: Optional[float | str] = "q:0.75",
    maxNumMatches: Optional[int] = 100_000,
    recenterAtPointSource: bool = True,
    useScalingFunction: bool = True,
    excludeRegionsBedFile: Optional[str] = None,
    mergeGapBP: int = 50,
    merge: bool = True,
    weights: Optional[npt.NDArray[np.float64]] = None,
    randSeed: int = 42,
) -> Optional[str]:
    r"""Match discrete templates in a bedGraph file of Consenrich estimates

    This function is a simple wrapper. See :func:`consenrich.matching.matchWavelet` for details on parameters.

    Reads the bedGraph, runs :func:`matchWavelet` per chromosome, writes
    per-chromosome narrowPeak files, concatenates them into genome-wide
    outputs, optionally merges nearby matches, and removes the
    per-chromosome intermediates.

    :param bedGraphFile: A bedGraph file with 'consensus' signal estimates derived from multiple samples, e.g., from Consenrich. The suffix '.bedGraph' is required.
    :type bedGraphFile: str
    :param merge: If True, also produce a merged narrowPeak via :func:`mergeMatches`.
    :type merge: bool
    :param mergeGapBP: Gap (bp) passed through to :func:`mergeMatches`.
    :type mergeGapBP: int
    :return: Path to the merged genome-wide narrowPeak if merging produced one,
        else the unmerged genome-wide narrowPeak, else ``None``.
    :rtype: Optional[str]
    :raises FileNotFoundError: If `bedGraphFile` does not exist.
    :raises ValueError: If the suffix or template name is invalid, or no matches were detected on any chromosome.

    :seealso: :func:`consenrich.matching.matchWavelet`, :class:`consenrich.core.matchingParams`, :ref:`matching`
    """
    if not os.path.isfile(bedGraphFile):
        raise FileNotFoundError(f"Couldn't access {bedGraphFile}")
    if not bedGraphFile.endswith(".bedGraph"):
        raise ValueError(
            f"Please use a suffix '.bedGraph' for `bedGraphFile`, got: {bedGraphFile}"
        )

    # 'bio*' wavelets are excluded from the supported template set
    allowedTemplates = [
        x for x in pw.wavelist(kind="discrete") if "bio" not in x
    ]
    if templateName not in allowedTemplates:
        raise ValueError(
            f"Unknown wavelet template: {templateName}\nAvailable templates: {allowedTemplates}"
        )

    cols = ["chromosome", "start", "end", "value"]
    bedGraphDF = pd.read_csv(
        bedGraphFile,
        sep="\t",
        header=None,
        names=cols,
        dtype={
            "chromosome": str,
            "start": np.uint32,
            "end": np.uint32,
            "value": np.float64,
        },
    )

    outPaths: List[str] = []            # per-chromosome unmerged narrowPeak files
    outPathsMerged: List[str] = []      # per-chromosome merged narrowPeak files
    outPathAll: Optional[str] = None    # genome-wide unmerged output
    outPathMergedAll: Optional[str] = None  # genome-wide merged output

    for chrom_ in sorted(bedGraphDF["chromosome"].unique()):
        df_ = bedGraphDF[bedGraphDF["chromosome"] == chrom_]
        # matchWavelet requires at least 5 intervals
        if len(df_) < 5:
            logger.info(f"Skipping {chrom_}: fewer than 5 rows.")
            continue

        try:
            df__ = matchWavelet(
                chrom_,
                df_["start"].to_numpy(),
                df_["value"].to_numpy(),
                [templateName],
                [cascadeLevel],
                iters,
                alpha,
                minMatchLengthBP,
                maxNumMatches,
                recenterAtPointSource=recenterAtPointSource,
                useScalingFunction=useScalingFunction,
                excludeRegionsBedFile=excludeRegionsBedFile,
                weights=weights,
                minSignalAtMaxima=minSignalAtMaxima,
                randSeed=randSeed,
            )
        except Exception as ex:
            # best-effort per chromosome: a failure on one chrom does not abort the run
            logger.info(f"Skipping {chrom_} due to error in matchWavelet: {ex}")
            continue

        if df__.empty:
            logger.info(f"No matches detected on {chrom_}.")
            continue

        perChromOut = bedGraphFile.replace(
            ".bedGraph",
            f".{chrom_}.matched.{templateName}_lvl{cascadeLevel}.narrowPeak",
        )
        df__.to_csv(perChromOut, sep="\t", index=False, header=False)
        logger.info(f"Matches written to {perChromOut}")
        outPaths.append(perChromOut)

        if merge:
            mergedPath = mergeMatches(perChromOut, mergeGapBP=mergeGapBP)
            # mergeMatches returns None on failure; only track real files
            if mergedPath is not None:
                logger.info(f"Merged matches written to {mergedPath}")
                outPathsMerged.append(mergedPath)

    if len(outPaths) == 0 and len(outPathsMerged) == 0:
        raise ValueError("No matches were detected.")

    # concatenate per-chromosome unmerged outputs into one genome-wide file
    if len(outPaths) > 0:
        outPathAll = (
            f"{bedGraphFile.replace('.bedGraph', '')}"
            f".allChroms.matched.{templateName}_lvl{cascadeLevel}.narrowPeak"
        )
        with open(outPathAll, "w") as outF:
            for path_ in outPaths:
                if os.path.isfile(path_):
                    with open(path_, "r") as inF:
                        for line in inF:
                            outF.write(line)
        logger.info(f"All unmerged matches written to {outPathAll}")

    # concatenate per-chromosome merged outputs into one genome-wide file
    if merge and len(outPathsMerged) > 0:
        outPathMergedAll = (
            f"{bedGraphFile.replace('.bedGraph', '')}"
            f".allChroms.matched.{templateName}_lvl{cascadeLevel}.mergedMatches.narrowPeak"
        )
        with open(outPathMergedAll, "w") as outF:
            for path in outPathsMerged:
                if os.path.isfile(path):
                    with open(path, "r") as inF:
                        for line in inF:
                            outF.write(line)
        logger.info(f"All merged matches written to {outPathMergedAll}")

    # remove per-chromosome intermediates; best effort, failures ignored
    for path_ in outPaths + outPathsMerged:
        try:
            if os.path.isfile(path_):
                os.remove(path_)
        except Exception:
            pass

    # prefer the merged genome-wide output when available
    if merge and outPathMergedAll:
        return outPathMergedAll
    if outPathAll:
        return outPathAll
    logger.warning("No matches were detected...returning `None`")
    return None
192
+
193
+
194
def matchWavelet(
    chromosome: str,
    intervals: npt.NDArray[int],
    values: npt.NDArray[np.float64],
    templateNames: List[str],
    cascadeLevels: List[int],
    iters: int,
    alpha: float = 0.05,
    minMatchLengthBP: Optional[int] = 250,
    maxNumMatches: Optional[int] = 100_000,
    minSignalAtMaxima: Optional[float | str] = "q:0.75",
    randSeed: int = 42,
    recenterAtPointSource: bool = True,
    useScalingFunction: bool = True,
    excludeRegionsBedFile: Optional[str] = None,
    weights: Optional[npt.NDArray[np.float64]] = None,
) -> pd.DataFrame:
    r"""Detect structured peaks by cross-correlating Consenrich tracks with wavelet- or scaling-function templates.

    See :ref:`matching` for an overview of the approach.

    :param chromosome: Chromosome name for the input intervals and values.
    :type chromosome: str
    :param intervals: Evenly spaced genomic start coordinates (one per value).
    :type intervals: npt.NDArray[int]
    :param values: 'Consensus' signal estimates derived from multiple samples, e.g., from Consenrich.
    :type values: npt.NDArray[np.float64]
    :param templateNames: A list of str values -- wavelet bases used for matching, e.g., `[haar, db2, sym4]`
    :type templateNames: List[str]
    :param cascadeLevels: A list of int values -- the number of cascade iterations used for approximating
      the scaling/wavelet functions.
    :type cascadeLevels: List[int]
    :param iters: Number of random blocks to sample in the response sequence while building
      an empirical null to test significance. See :func:`cconsenrich.csampleBlockStats`.
    :type iters: int
    :param alpha: Primary significance threshold on detected matches. Specifically, the
      :math:`1 - \alpha` quantile of an empirical null distribution. The empirical null
      distribution is built from cross-correlation values over randomly sampled blocks.
    :type alpha: float
    :param minMatchLengthBP: Within a window of `minMatchLengthBP` length (bp), relative maxima in
      the signal-template convolution must be greater in value than others to qualify as matches.
      *Set to a negative value to disable this filter*.
    :type minMatchLengthBP: int
    :param maxNumMatches: If set, keep at most this many matches (the greatest by transformed signal value).
    :type maxNumMatches: Optional[int]
    :param minSignalAtMaxima: Secondary significance threshold coupled with `alpha`. Require the *signal value*
      at relative maxima in the response sequence to be greater than this threshold. Comparisons are made in log-scale.
      If a `float` value is provided, the minimum signal value must be greater than this (absolute) value. *Set to a
      negative value to disable the threshold*.
      If a `str` value is provided, looks for 'q:quantileValue', e.g., 'q:0.75'. The
      threshold is then set to the corresponding quantile of the non-zero signal estimates.
      Defaults to str value 'q:0.75' --- the 75th percentile of signal values.
    :type minSignalAtMaxima: Optional[str | float]
    :param randSeed: Seed for the null block sampler; incremented per template/level pass.
    :type randSeed: int
    :param recenterAtPointSource: If True, recenter each reported match window on the signal maximum.
    :type recenterAtPointSource: bool
    :param useScalingFunction: If True, use (only) the scaling function to build the matching template.
      If False, use (only) the wavelet function.
    :type useScalingFunction: bool
    :param excludeRegionsBedFile: A BED file with regions to exclude from matching
    :type excludeRegionsBedFile: Optional[str]
    :param weights: Optional per-interval weights multiplied into `values` before matching
      (applied only if the length matches `values`).
    :type weights: Optional[npt.NDArray[np.float64]]
    :return: narrowPeak-like DataFrame (chromosome, start, end, name, score, strand,
      signal, pValue, qValue, pointSource), sorted by position; empty if no matches.
    :rtype: pd.DataFrame

    :seealso: :class:`consenrich.core.matchingParams`, :func:`cconsenrich.csampleBlockStats`, :ref:`matching`
    """

    if len(intervals) < 5:
        raise ValueError("`intervals` must be at least length 5")
    if len(values) != len(intervals):
        raise ValueError("`values` must have the same length as `intervals`")
    # step size inferred from the first pair; all gaps must equal it
    intervalLengthBP = intervals[1] - intervals[0]
    if not np.all(np.abs(np.diff(intervals)) == intervalLengthBP):
        # FFR: don't change this exception message without updating tests
        # --'spaced' is matched in tests
        raise ValueError("`intervals` must be evenly spaced.")

    randSeed_: int = int(randSeed)
    # narrowPeak-style column layout for the returned DataFrame
    cols = [
        "chromosome",
        "start",
        "end",
        "name",
        "score",
        "strand",
        "signal",
        "pValue",
        "qValue",
        "pointSource",
    ]
    matchDF = pd.DataFrame(columns=cols)
    # minMatchLengthBP is rederived per-template, so keep the caller's value
    minMatchLengthBPCopy: Optional[int] = minMatchLengthBP
    cascadeLevels = sorted(list(set(cascadeLevels)))
    if weights is not None and len(weights) == len(values):
        values = values * weights
    # asinh-transform for thresholding (log-like scale, defined at 0)
    # NOTE(review): `np.asinh` is the array-API alias of `np.arcsinh`
    # (NumPy >= 2.0) — confirm against the project's NumPy floor
    asinhValues = np.asinh(values, dtype=np.float32)
    asinhNonZeroValues = asinhValues[asinhValues > 0]
    # enforce a minimum null-sample count for a usable empirical null
    iters = max(iters, 1000)
    defQuantile: float = 0.75
    for l_, cascadeLevel in enumerate(cascadeLevels):
        for t_, templateName in enumerate(templateNames):
            try:
                templateName = str(templateName)
                cascadeLevel = int(cascadeLevel)
            except ValueError:
                logger.info(
                    f"Skipping invalid templateName or cascadeLevel: {templateName}, {cascadeLevel}"
                )
                continue
            if templateName not in pw.wavelist(kind="discrete"):
                logger.info(
                    f"\nSkipping unknown wavelet template: {templateName}\nAvailable templates: {pw.wavelist(kind='discrete')}"
                )
                continue

            # approximate the scaling/wavelet functions via the cascade algorithm
            wav = pw.Wavelet(templateName)
            scalingFunc, waveletFunc, x = wav.wavefun(level=cascadeLevel)
            # L2-normalize so response magnitudes are comparable across templates
            template = np.array(waveletFunc, dtype=np.float64) / np.linalg.norm(
                waveletFunc
            )

            if useScalingFunction:
                template = np.array(
                    scalingFunc, dtype=np.float64
                ) / np.linalg.norm(scalingFunc)

            logger.info(
                f"Matching: template: {templateName}, cascade level: {cascadeLevel}, template length: {len(template)}, scaling: {useScalingFunction}, wavelet: {not useScalingFunction}"
            )

            # convolving with the reversed template == cross-correlation with the template
            responseSequence: npt.NDArray[np.float64] = signal.fftconvolve(
                values, template[::-1], mode="same"
            )

            # reset per template: the default window depends on template length
            minMatchLengthBP = minMatchLengthBPCopy
            if minMatchLengthBP is None or minMatchLengthBP < 1:
                minMatchLengthBP = len(template) * intervalLengthBP
            # round up to a whole number of intervals
            if minMatchLengthBP % intervalLengthBP != 0:
                minMatchLengthBP += intervalLengthBP - (
                    minMatchLengthBP % intervalLengthBP
                )

            # half-window (in intervals) used for relative-maxima detection
            relativeMaximaWindow = int(
                ((minMatchLengthBP / intervalLengthBP) / 2) + 1
            )
            relativeMaximaWindow = max(relativeMaximaWindow, 1)

            excludeMask = np.zeros(len(intervals), dtype=np.uint8)
            if excludeRegionsBedFile is not None:
                excludeMask = core.getBedMask(
                    chromosome,
                    excludeRegionsBedFile,
                    intervals,
                )

            logger.info(
                f"\nSampling {iters} block maxima for template {templateName} at cascade level {cascadeLevel} with (expected) relative maxima window size {relativeMaximaWindow}.\n"
            )
            # sample 2*iters block maxima: first half builds the null,
            # second half is held out to sanity-check the null's ECDF
            blockMaxima = np.array(
                cconsenrich.csampleBlockStats(
                    intervals.astype(np.uint32),
                    responseSequence,
                    relativeMaximaWindow,
                    iters * 2,
                    randSeed_,
                    excludeMask.astype(np.uint8),
                ),
                dtype=float,
            )
            blockMaximaCheck = blockMaxima.copy()[iters:]
            blockMaxima = blockMaxima[:iters]
            # trim extreme tails (0.5% each side) before fitting the ECDF
            blockMaxima = blockMaxima[
                (blockMaxima > np.quantile(blockMaxima, 0.005))
                & (blockMaxima < np.quantile(blockMaxima, 0.995))
            ]

            # survival function of the empirical null: P(X > x)
            ecdfBlockMaximaSF = stats.ecdf(blockMaxima).sf

            # sentinel defaults; overwritten below
            responseThreshold = float(1e6)
            arsinhSignalThreshold = float(1e6)
            try:
                # we use 'interpolated_inverted_cdf' in a few spots
                # --- making sure it's supported here, at its first use
                responseThreshold = np.quantile(
                    blockMaxima, 1 - alpha, method="interpolated_inverted_cdf"
                )
            except (TypeError, ValueError, KeyError) as err_:
                logger.warning(
                    f"\nError computing response threshold with alpha={alpha}:\n{err_}\n"
                    f"\nIs `blockMaxima` empty?"
                    f"\nIs NumPy older than 1.22.0 (~May 2022~)?"
                    f"\nIs `alpha` in (0,1)?\n"
                )
                raise

            # parse minSignalAtMaxima, set arsinhSignalThreshold
            if minSignalAtMaxima is None:
                # -----we got a `None`-----
                # -1e6 effectively disables the signal threshold
                arsinhSignalThreshold = -float(1e6)
            elif isinstance(minSignalAtMaxima, str):
                # -----we got a str-----
                if minSignalAtMaxima.startswith("q:"):
                    # case: expected 'q:quantileValue' format
                    qVal = float(minSignalAtMaxima.split("q:")[-1])
                    if qVal < 0 or qVal > 1:
                        raise ValueError(f"Quantile {qVal} is out of range")
                    # quantile is taken over *non-zero* transformed values
                    arsinhSignalThreshold = float(
                        np.quantile(
                            asinhNonZeroValues,
                            qVal,
                            method="interpolated_inverted_cdf",
                        )
                    )

                elif castableToFloat(minSignalAtMaxima):
                    # case: numeric in str form (possible due to CLI)
                    if float(minSignalAtMaxima) < 0.0:
                        # effectively disables threshold
                        arsinhSignalThreshold = -float(1e6)
                    else:
                        # use supplied value (compared in asinh scale)
                        arsinhSignalThreshold = np.asinh(
                            float(minSignalAtMaxima)
                        )
                else:
                    # case: not in known format, not castable to a float, use defaults
                    logger.info(
                        f"Couldn't parse `minSignalAtMaxima` value: {minSignalAtMaxima}, using default"
                    )
                    arsinhSignalThreshold = float(
                        np.quantile(
                            asinhNonZeroValues,
                            defQuantile,
                            method="interpolated_inverted_cdf",
                        )
                    )
                # -----

            elif isinstance(minSignalAtMaxima, (float, int)):
                # -----we got an int or float-----
                if float(minSignalAtMaxima) < 0.0:
                    # effectively disables threshold
                    arsinhSignalThreshold = -float(1e6)
                else:
                    # use supplied value (compared in asinh scale)
                    arsinhSignalThreshold = np.asinh(float(minSignalAtMaxima))
                # -----

            # candidate matches: local maxima of the response sequence
            relativeMaximaIndices = signal.argrelmax(
                responseSequence, order=relativeMaximaWindow
            )[0]

            # keep maxima passing both the response (null-derived) and
            # signal (asinh-scale) thresholds
            relativeMaximaIndices = relativeMaximaIndices[
                (responseSequence[relativeMaximaIndices] > responseThreshold)
                & (asinhValues[relativeMaximaIndices] > arsinhSignalThreshold)
            ]

            if len(relativeMaximaIndices) == 0:
                logger.info(
                    f"no matches were detected using for template {templateName} at cascade level {cascadeLevel}...skipping matching"
                )
                continue

            if maxNumMatches is not None:
                if len(relativeMaximaIndices) > maxNumMatches:
                    # take the greatest maxNumMatches (by 'signal')
                    relativeMaximaIndices = relativeMaximaIndices[
                        np.argsort(asinhValues[relativeMaximaIndices])[
                            -maxNumMatches:
                        ]
                    ]

            # evaluate the held-out block maxima under the fitted null;
            # should be ~uniform(0,1) if the null is well-calibrated
            ecdfSFCheckVals: npt.NDArray[np.float64] = (
                ecdfBlockMaximaSF.evaluate(blockMaximaCheck)
            )
            testKS, _ = stats.kstest(
                ecdfSFCheckVals,
                stats.uniform.cdf,
                alternative="two-sided",
            )

            logger.info(
                f"\n\tDetected {len(relativeMaximaIndices)} matches (alpha={alpha}, useScalingFunction={useScalingFunction}): {templateName}: level={cascadeLevel}.\n"
                f"\tResponse threshold: {responseThreshold:.3f}, arsinh(Signal Threshold): {arsinhSignalThreshold:.3f}\n"
                f"\t~KS_Statistic~ [ePVals, uniformCDF]: {testKS:.4f}\n"
                f"\n\n{textNullCDF(ecdfSFCheckVals)}\n\n"  # lil text-plot histogram of approx. null CDF
            )

            # starts
            startsIdx = np.maximum(
                relativeMaximaIndices - relativeMaximaWindow, 0
            )
            # ends
            endsIdx = np.minimum(
                len(values) - 1, relativeMaximaIndices + relativeMaximaWindow
            )
            # point source: index of the raw-signal maximum within each match window
            pointSourcesIdx = []
            for start_, end_ in zip(startsIdx, endsIdx):
                pointSourcesIdx.append(
                    np.argmax(values[start_ : end_ + 1]) + start_
                )
            pointSourcesIdx = np.array(pointSourcesIdx)
            starts = intervals[startsIdx]
            ends = intervals[endsIdx]
            # NOTE(review): when recenterAtPointSource is False, this leaves
            # `pointSources` as an *absolute* coordinate (+half-interval),
            # not the start-relative offset narrowPeak expects — confirm intended
            pointSources = (intervals[pointSourcesIdx]) + max(
                1, intervalLengthBP // 2
            )
            if (
                recenterAtPointSource
            ):  # recenter at point source (signal maximum)
                starts = pointSources - (
                    relativeMaximaWindow * intervalLengthBP
                )
                ends = pointSources + (relativeMaximaWindow * intervalLengthBP)
                # here pointSource becomes the offset relative to the new start
                pointSources = (intervals[pointSourcesIdx] - starts) + max(
                    1, intervalLengthBP // 2
                )
            # (ucsc browser) score [0,1000]
            # squared shifted responses min-max scaled into [250, 1000]
            sqScores = (1 + responseSequence[relativeMaximaIndices]) ** 2
            minResponse = np.min(sqScores)
            maxResponse = np.max(sqScores)
            rangeResponse = max(maxResponse - minResponse, 1.0)
            scores = (
                250 + 750 * (sqScores - minResponse) / rangeResponse
            ).astype(int)
            # feature name
            names = [
                f"{templateName}_{cascadeLevel}_{i}"
                for i in relativeMaximaIndices
            ]
            # strand
            strands = ["." for _ in range(len(scores))]
            # p-values in -log10 scale per convention
            pValues = -np.log10(
                np.clip(
                    ecdfBlockMaximaSF.evaluate(
                        responseSequence[relativeMaximaIndices]
                    ),
                    1e-10,
                    1.0,
                )
            )
            # q-values (ignored)
            qValues = np.array(np.ones_like(pValues) * -1.0)

            tempDF = pd.DataFrame(
                {
                    "chromosome": [chromosome] * len(relativeMaximaIndices),
                    "start": starts.astype(int),
                    "end": ends.astype(int),
                    "name": names,
                    "score": scores,
                    "strand": strands,
                    "signal": responseSequence[relativeMaximaIndices],
                    "pValue": pValues,
                    "qValue": qValues,
                    "pointSource": pointSources.astype(int),
                }
            )

            if matchDF.empty:
                matchDF = tempDF
            else:
                matchDF = pd.concat([matchDF, tempDF], ignore_index=True)
            # vary the seed so each template/level pass draws different blocks
            randSeed_ += 1

    if matchDF.empty:
        logger.info("No matches detected, returning empty DataFrame.")
        return matchDF
    matchDF.sort_values(by=["chromosome", "start", "end"], inplace=True)
    matchDF.reset_index(drop=True, inplace=True)
    return matchDF
559
+
560
+
561
def mergeMatches(filePath: str, mergeGapBP: int = 50):
    r"""Merge overlapping or nearby structured peaks (matches) in a narrowPeak file.

    Where an overlap occurs within `mergeGapBP` base pairs, the feature with the greatest signal defines the new summit/pointSource

    Features are clustered with ``bedtools cluster`` (via pybedtools); within
    each cluster, score/signal/p/q are averaged, the span is the union of
    member spans, and the pointSource comes from the highest-signal member.
    Returns the merged output path, or ``None`` if the input cannot be read
    (best-effort: failures are logged, not raised).

    :param filePath: narrowPeak file containing matches detected with :func:`consenrich.matching.matchWavelet`
    :type filePath: str
    :param mergeGapBP: Maximum gap size (in base pairs) to consider for merging
    :type mergeGapBP: int

    :seealso: :class:`consenrich.core.matchingParams`
    """
    if not os.path.isfile(filePath):
        logger.info(f"Couldn't access {filePath}...skipping merge")
        return None
    bed = None
    try:
        bed = BedTool(filePath)
    except Exception as ex:
        logger.info(
            f"Couldn't create BedTool for {filePath}:\n{ex}\n\nskipping merge..."
        )
        return None
    if bed is None:
        logger.info(f"Couldn't create BedTool for {filePath}...skipping merge")
        return None

    # cluster features within mergeGapBP; cluster id is appended as the last field
    bed = bed.sort()
    clustered = bed.cluster(d=mergeGapBP)
    groups = {}
    for f in clustered:
        # narrowPeak field layout: chrom, start, end, name, score, strand,
        # signal, pValue, qValue, peak(offset); cluster id appended last
        fields = f.fields
        chrom = fields[0]
        start = int(fields[1])
        end = int(fields[2])
        score = float(fields[4])
        signal = float(fields[6])
        pval = float(fields[7])
        qval = float(fields[8])
        peak = int(fields[9])
        clId = fields[-1]
        if clId not in groups:
            # running aggregates for this cluster
            groups[clId] = {
                "chrom": chrom,
                "sMin": start,       # union start
                "eMax": end,         # union end
                "scSum": 0.0,        # score sum (averaged later)
                "sigSum": 0.0,       # signal sum
                "pSum": 0.0,         # pValue sum
                "qSum": 0.0,         # qValue sum
                "n": 0,              # member count
                "maxS": float("-inf"),  # best member signal so far
                "peakAbs": -1,       # absolute summit of best member
            }
        g = groups[clId]
        if start < g["sMin"]:
            g["sMin"] = start
        if end > g["eMax"]:
            g["eMax"] = end
        g["scSum"] += score
        g["sigSum"] += signal
        g["pSum"] += pval
        g["qSum"] += qval
        g["n"] += 1
        # scan for largest signal, FFR: consider using the p-val in the future
        if signal > g["maxS"]:
            g["maxS"] = signal
            # peak is start-relative; -1 marks "no summit"
            g["peakAbs"] = start + peak if peak >= 0 else -1
    # emit one merged feature per cluster, sorted by position
    items = []
    for clId, g in groups.items():
        items.append((g["chrom"], g["sMin"], g["eMax"], g))
    items.sort(key=lambda x: (str(x[0]), x[1], x[2]))
    outPath = f"{filePath.replace('.narrowPeak', '')}.mergedMatches.narrowPeak"
    lines = []
    i = 0
    for chrom, sMin, eMax, g in items:
        i += 1
        # average score, clamped to the UCSC [0, 1000] range
        avgScore = g["scSum"] / g["n"]
        if avgScore < 0:
            avgScore = 0
        if avgScore > 1000:
            avgScore = 1000
        scoreInt = int(round(avgScore))
        sigAvg = g["sigSum"] / g["n"]
        pAvg = g["pSum"] / g["n"]
        qAvg = g["qSum"] / g["n"]
        # convert best-member absolute summit back to a start-relative offset
        pointSource = g["peakAbs"] - sMin if g["peakAbs"] >= 0 else -1
        name = f"mergedPeak{i}"
        lines.append(
            f"{chrom}\t{int(sMin)}\t{int(eMax)}\t{name}\t{scoreInt}\t.\t{sigAvg:.3f}\t{pAvg:.3f}\t{qAvg:.3f}\t{int(pointSource)}"
        )
    with open(outPath, "w") as outF:
        outF.write("\n".join(lines) + ("\n" if lines else ""))
    logger.info(f"Merged matches written to {outPath}")
    return outPath
656
+
657
+
658
def textNullCDF(
    nullBlockMaximaSFVals: npt.NDArray[np.float64],
    binCount: int = 20,
    barWidth: int = 50,
    barChar="\u25a2",
    normalize: bool = False,
) -> str:
    r"""Plot a histogram of the distribution 1 - ECDF(nullBlockMaxima)

    Called by :func:`consenrich.matching.matchWavelet`. Ideally resembles
    a uniform(0,1) distribution.

    :param nullBlockMaximaSFVals: Survival-function values evaluated on held-out null block maxima.
    :type nullBlockMaximaSFVals: npt.NDArray[np.float64]
    :param binCount: Number of histogram bins (coerced to at least 1).
    :type binCount: int
    :param barWidth: Maximum bar length in characters.
    :type barWidth: int
    :param barChar: Character used to draw the bars.
    :param normalize: If True, scale bars by relative frequency instead of raw counts.
    :type normalize: bool
    :return: Multi-line text histogram.
    :rtype: str

    :seealso: :func:`consenrich.matching.matchWavelet`, :ref:`cconsenrich.csampleBlockStats`
    """
    valueLower, valueUpper = (
        min(nullBlockMaximaSFVals),
        max(nullBlockMaximaSFVals),
    )
    binCount = max(1, int(binCount))
    binStep = (valueUpper - valueLower) / binCount
    binEdges = [
        valueLower + indexValue * binStep for indexValue in range(binCount)
    ]
    binEdges.append(valueUpper)
    binCounts = [0] * binCount
    for numericValue in nullBlockMaximaSFVals:
        if binStep > 0:
            binIndex = int((numericValue - valueLower) / binStep)
        else:
            # degenerate input (all values identical): previously this
            # raised ZeroDivisionError; put everything in the first bin
            binIndex = 0
        # clamp: the maximum value (and float round-off) can land past the last bin
        if binIndex >= binCount:
            binIndex = binCount - 1
        binCounts[binIndex] += 1
    valueSeries = (
        [countValue / len(nullBlockMaximaSFVals) for countValue in binCounts]
        if normalize
        else binCounts[:]
    )
    valueMaximum = max(valueSeries) if valueSeries else 0
    # scale the tallest bar to barWidth characters
    widthScale = (barWidth / valueMaximum) if valueMaximum > 0 else 0
    rangeLabels = [
        f"[{binEdges[indexValue]:.2f},{binEdges[indexValue + 1]:.2f})"
        for indexValue in range(binCount)
    ]
    labelWidth = max(len(textValue) for textValue in rangeLabels)
    lines = ['Histogram: "1 - ECDF(nullBlockMaxima)"']
    for rangeLabel, seriesValue, countValue in zip(
        rangeLabels, valueSeries, binCounts
    ):
        barString = barChar * int(round(seriesValue * widthScale))
        trailingText = f"({countValue}/{len(nullBlockMaximaSFVals)})\t\t"
        lines.append(
            f"{rangeLabel.rjust(labelWidth)} | {barString}{trailingText.ljust(10)}"
        )
    return "\n".join(lines)