consenrich 0.6.3b1__cp311-cp311-macosx_11_0_arm64.whl → 0.7.1b1__cp311-cp311-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of consenrich might be problematic. Click here for more details.
- consenrich/cconsenrich.c +404 -404
- consenrich/cconsenrich.cpython-311-darwin.so +0 -0
- consenrich/consenrich.py +216 -62
- consenrich/core.py +30 -17
- consenrich/detrorm.py +12 -3
- consenrich/matching.py +444 -369
- consenrich/misc_util.py +29 -0
- {consenrich-0.6.3b1.dist-info → consenrich-0.7.1b1.dist-info}/METADATA +3 -3
- {consenrich-0.6.3b1.dist-info → consenrich-0.7.1b1.dist-info}/RECORD +13 -13
- {consenrich-0.6.3b1.dist-info → consenrich-0.7.1b1.dist-info}/WHEEL +0 -0
- {consenrich-0.6.3b1.dist-info → consenrich-0.7.1b1.dist-info}/entry_points.txt +0 -0
- {consenrich-0.6.3b1.dist-info → consenrich-0.7.1b1.dist-info}/licenses/LICENSE +0 -0
- {consenrich-0.6.3b1.dist-info → consenrich-0.7.1b1.dist-info}/top_level.txt +0 -0
consenrich/matching.py
CHANGED
|
@@ -3,6 +3,7 @@ r"""Module implementing (experimental) 'structured peak detection' features usin
|
|
|
3
3
|
|
|
4
4
|
import logging
|
|
5
5
|
import os
|
|
6
|
+
import math
|
|
6
7
|
from pybedtools import BedTool
|
|
7
8
|
from typing import List, Optional
|
|
8
9
|
|
|
@@ -23,13 +24,25 @@ logging.basicConfig(
|
|
|
23
24
|
logger = logging.getLogger(__name__)
|
|
24
25
|
|
|
25
26
|
|
|
27
|
+
def scalarClip(value: float, low: float, high: float) -> float:
    """Clamp a scalar to the closed interval ``[low, high]``.

    :param value: Scalar to clamp.
    :param low: Lower bound; returned when ``value < low``.
    :param high: Upper bound; returned when ``value > high``.
    :return: ``low``, ``high``, or ``value`` itself when neither comparison holds.
    """
    # The lower bound is tested first, so it wins if the bounds are inverted.
    if value < low:
        return low
    if value > high:
        return high
    # Reached for in-range values and for values where both comparisons
    # are False (e.g. NaN), which are passed through unchanged.
    return value
|
|
29
|
+
|
|
30
|
+
|
|
26
31
|
def castableToFloat(value) -> bool:
|
|
27
32
|
if value is None:
|
|
28
33
|
return False
|
|
29
34
|
if isinstance(value, bool):
|
|
30
35
|
return False
|
|
31
36
|
if isinstance(value, str):
|
|
32
|
-
if value.lower().replace(
|
|
37
|
+
if value.lower().replace(" ", "") in [
|
|
38
|
+
"nan",
|
|
39
|
+
"inf",
|
|
40
|
+
"-inf",
|
|
41
|
+
"infinity",
|
|
42
|
+
"-infinity",
|
|
43
|
+
"",
|
|
44
|
+
" ",
|
|
45
|
+
]:
|
|
33
46
|
return False
|
|
34
47
|
|
|
35
48
|
try:
|
|
@@ -53,7 +66,7 @@ def matchExistingBedGraph(
|
|
|
53
66
|
recenterAtPointSource: bool = True,
|
|
54
67
|
useScalingFunction: bool = True,
|
|
55
68
|
excludeRegionsBedFile: Optional[str] = None,
|
|
56
|
-
mergeGapBP: int =
|
|
69
|
+
mergeGapBP: Optional[int] = None,
|
|
57
70
|
merge: bool = True,
|
|
58
71
|
weights: Optional[npt.NDArray[np.float64]] = None,
|
|
59
72
|
randSeed: int = 42,
|
|
@@ -74,6 +87,13 @@ def matchExistingBedGraph(
|
|
|
74
87
|
f"Please use a suffix '.bedGraph' for `bedGraphFile`, got: {bedGraphFile}"
|
|
75
88
|
)
|
|
76
89
|
|
|
90
|
+
if mergeGapBP is None:
|
|
91
|
+
mergeGapBP = (
|
|
92
|
+
(minMatchLengthBP // 2) + 1
|
|
93
|
+
if minMatchLengthBP is not None
|
|
94
|
+
else 75
|
|
95
|
+
)
|
|
96
|
+
|
|
77
97
|
allowedTemplates = [
|
|
78
98
|
x for x in pw.wavelist(kind="discrete") if "bio" not in x
|
|
79
99
|
]
|
|
@@ -126,7 +146,9 @@ def matchExistingBedGraph(
|
|
|
126
146
|
randSeed=randSeed,
|
|
127
147
|
)
|
|
128
148
|
except Exception as ex:
|
|
129
|
-
logger.info(
|
|
149
|
+
logger.info(
|
|
150
|
+
f"Skipping {chrom_} due to error in matchWavelet: {ex}"
|
|
151
|
+
)
|
|
130
152
|
continue
|
|
131
153
|
|
|
132
154
|
if df__.empty:
|
|
@@ -142,7 +164,9 @@ def matchExistingBedGraph(
|
|
|
142
164
|
outPaths.append(perChromOut)
|
|
143
165
|
|
|
144
166
|
if merge:
|
|
145
|
-
mergedPath = mergeMatches(
|
|
167
|
+
mergedPath = mergeMatches(
|
|
168
|
+
perChromOut, mergeGapBP=mergeGapBP
|
|
169
|
+
)
|
|
146
170
|
if mergedPath is not None:
|
|
147
171
|
logger.info(f"Merged matches written to {mergedPath}")
|
|
148
172
|
outPathsMerged.append(mergedPath)
|
|
@@ -174,7 +198,9 @@ def matchExistingBedGraph(
|
|
|
174
198
|
with open(path, "r") as inF:
|
|
175
199
|
for line in inF:
|
|
176
200
|
outF.write(line)
|
|
177
|
-
logger.info(
|
|
201
|
+
logger.info(
|
|
202
|
+
f"All merged matches written to {outPathMergedAll}"
|
|
203
|
+
)
|
|
178
204
|
|
|
179
205
|
for path_ in outPaths + outPathsMerged:
|
|
180
206
|
try:
|
|
@@ -210,11 +236,10 @@ def matchWavelet(
|
|
|
210
236
|
) -> pd.DataFrame:
|
|
211
237
|
r"""Detect structured peaks by cross-correlating Consenrich tracks with wavelet- or scaling-function templates.
|
|
212
238
|
|
|
213
|
-
See :ref:`matching` for an overview of the approach.
|
|
214
|
-
|
|
215
239
|
:param chromosome: Chromosome name for the input intervals and values.
|
|
216
240
|
:type chromosome: str
|
|
217
|
-
:param values:
|
|
241
|
+
:param values: A 1D array of signal-like values. In this documentation, we refer to values derived from Consenrich,
|
|
242
|
+
but other continuous-valued tracks at evenly spaced genomic intervals may be suitable, too.
|
|
218
243
|
:type values: npt.NDArray[np.float64]
|
|
219
244
|
:param templateNames: A list of str values -- wavelet bases used for matching, e.g., `[haar, db2, sym4]`
|
|
220
245
|
:type templateNames: List[str]
|
|
@@ -225,18 +250,17 @@ def matchWavelet(
|
|
|
225
250
|
an empirical null to test significance. See :func:`cconsenrich.csampleBlockStats`.
|
|
226
251
|
:type iters: int
|
|
227
252
|
:param alpha: Primary significance threshold on detected matches. Specifically, the
|
|
228
|
-
|
|
229
|
-
|
|
253
|
+
minimum corr. empirical p-value approximated from randomly sampled blocks in the
|
|
254
|
+
response sequence.
|
|
230
255
|
:type alpha: float
|
|
231
256
|
:param minMatchLengthBP: Within a window of `minMatchLengthBP` length (bp), relative maxima in
|
|
232
257
|
the signal-template convolution must be greater in value than others to qualify as matches.
|
|
233
|
-
*Set to a negative value to disable this filter*.
|
|
234
258
|
:type minMatchLengthBP: int
|
|
235
|
-
:param minSignalAtMaxima: Secondary significance threshold coupled with `alpha`.
|
|
236
|
-
at relative maxima in the response sequence to be greater than this threshold. Comparisons are made in log-scale
|
|
237
|
-
If a `float` value is provided, the minimum signal value must be greater
|
|
238
|
-
negative value to disable the threshold*.
|
|
239
|
-
If a `str` value is provided, looks for 'q:quantileValue', e.g., 'q:0.
|
|
259
|
+
:param minSignalAtMaxima: Secondary significance threshold coupled with `alpha`. Requires the *signal value*
|
|
260
|
+
at relative maxima in the response sequence to be greater than this threshold. Comparisons are made in log-scale
|
|
261
|
+
to temper genome-wide dynamic range. If a `float` value is provided, the minimum signal value must be greater
|
|
262
|
+
than this (absolute) value. *Set to a negative value to disable the threshold*.
|
|
263
|
+
If a `str` value is provided, looks for 'q:quantileValue', e.g., 'q:0.90'. The
|
|
240
264
|
threshold is then set to the corresponding quantile of the non-zero signal estimates.
|
|
241
265
|
Defaults to str value 'q:0.75' --- the 75th percentile of signal values.
|
|
242
266
|
:type minSignalAtMaxima: Optional[str | float]
|
|
@@ -247,342 +271,349 @@ def matchWavelet(
|
|
|
247
271
|
:type excludeRegionsBedFile: Optional[str]
|
|
248
272
|
|
|
249
273
|
:seealso: :class:`consenrich.core.matchingParams`, :func:`cconsenrich.csampleBlockStats`, :ref:`matching`
|
|
274
|
+
:return: A pandas DataFrame with detected matches
|
|
275
|
+
:rtype: pd.DataFrame
|
|
250
276
|
"""
|
|
251
|
-
|
|
252
277
|
if len(intervals) < 5:
|
|
253
278
|
raise ValueError("`intervals` must be at least length 5")
|
|
254
279
|
if len(values) != len(intervals):
|
|
255
|
-
raise ValueError(
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
280
|
+
raise ValueError(
|
|
281
|
+
"`values` must have the same length as `intervals`"
|
|
282
|
+
)
|
|
283
|
+
intervalLengthBp = intervals[1] - intervals[0]
|
|
284
|
+
if not np.all(np.abs(np.diff(intervals)) == intervalLengthBp):
|
|
260
285
|
raise ValueError("`intervals` must be evenly spaced.")
|
|
261
|
-
|
|
262
|
-
randSeed_: int = int(randSeed)
|
|
263
|
-
cols = [
|
|
264
|
-
"chromosome",
|
|
265
|
-
"start",
|
|
266
|
-
"end",
|
|
267
|
-
"name",
|
|
268
|
-
"score",
|
|
269
|
-
"strand",
|
|
270
|
-
"signal",
|
|
271
|
-
"pValue",
|
|
272
|
-
"qValue",
|
|
273
|
-
"pointSource",
|
|
274
|
-
]
|
|
275
|
-
matchDF = pd.DataFrame(columns=cols)
|
|
276
|
-
minMatchLengthBPCopy: Optional[int] = minMatchLengthBP
|
|
286
|
+
rng = np.random.default_rng(int(randSeed))
|
|
277
287
|
cascadeLevels = sorted(list(set(cascadeLevels)))
|
|
278
288
|
if weights is not None and len(weights) == len(values):
|
|
279
289
|
values = values * weights
|
|
280
290
|
asinhValues = np.asinh(values, dtype=np.float32)
|
|
281
291
|
asinhNonZeroValues = asinhValues[asinhValues > 0]
|
|
282
|
-
iters = max(iters, 1000)
|
|
283
|
-
defQuantile
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
+
iters = max(int(iters), 1000)
|
|
293
|
+
defQuantile = 0.75
|
|
294
|
+
chromMin = int(intervals[0])
|
|
295
|
+
chromMax = int(intervals[-1])
|
|
296
|
+
chromMid = chromMin + (chromMax - chromMin) // 2 # for split
|
|
297
|
+
halfLeftMask = intervals < chromMid
|
|
298
|
+
halfRightMask = ~halfLeftMask
|
|
299
|
+
excludeMaskGlobal = np.zeros(len(intervals), dtype=np.uint8)
|
|
300
|
+
if excludeRegionsBedFile is not None:
|
|
301
|
+
excludeMaskGlobal = core.getBedMask(
|
|
302
|
+
chromosome, excludeRegionsBedFile, intervals
|
|
303
|
+
).astype(np.uint8)
|
|
304
|
+
allRows = []
|
|
305
|
+
|
|
306
|
+
def bhFdr(p: npt.ArrayLike) -> np.ndarray:
    r"""Benjamini-Hochberg adjusted p-values (q-values).

    :param p: 1D collection of raw p-values. Any array-like is accepted
        (ndarray, list, pandas Series); it is coerced to a float ndarray so
        that indexing below is always positional.
    :return: Array of adjusted values in the same order as `p`, clipped
        to ``[0, 1]``. Empty input yields an empty array.
    """
    pArr = np.atleast_1d(np.asarray(p, dtype=float))
    m = len(pArr)
    if m == 0:
        return pArr.copy()
    # stable sort so tied p-values keep their input order
    order = np.argsort(pArr, kind="mergesort")
    ranked = np.arange(1, m + 1, dtype=float)
    q = (pArr[order] * m) / ranked
    # running minimum from the largest rank downward enforces monotonicity
    q = np.minimum.accumulate(q[::-1])[::-1]
    out = np.empty(m, dtype=float)
    out[order] = q  # scatter back to the caller's ordering
    return np.clip(out, 0.0, 1.0)
|
|
315
|
+
|
|
316
|
+
def parseMinSignalThreshold(val):
|
|
317
|
+
if val is None:
|
|
318
|
+
return -1e6
|
|
319
|
+
if isinstance(val, str):
|
|
320
|
+
if val.startswith("q:"):
|
|
321
|
+
qVal = float(val.split("q:")[-1])
|
|
322
|
+
if not (0 <= qVal <= 1):
|
|
323
|
+
raise ValueError(
|
|
324
|
+
f"Quantile {qVal} is out of range"
|
|
325
|
+
)
|
|
326
|
+
return float(
|
|
327
|
+
np.quantile(
|
|
328
|
+
asinhNonZeroValues,
|
|
329
|
+
qVal,
|
|
330
|
+
method="interpolated_inverted_cdf",
|
|
331
|
+
)
|
|
292
332
|
)
|
|
293
|
-
|
|
333
|
+
elif castableToFloat(val):
|
|
334
|
+
v = float(val)
|
|
335
|
+
return -1e6 if v < 0 else float(np.asinh(v))
|
|
336
|
+
else:
|
|
337
|
+
return float(
|
|
338
|
+
np.quantile(
|
|
339
|
+
asinhNonZeroValues,
|
|
340
|
+
defQuantile,
|
|
341
|
+
method="interpolated_inverted_cdf",
|
|
342
|
+
)
|
|
343
|
+
)
|
|
344
|
+
if isinstance(val, (float, int)):
|
|
345
|
+
v = float(val)
|
|
346
|
+
return -1e6 if v < 0 else float(np.asinh(v))
|
|
347
|
+
return float(
|
|
348
|
+
np.quantile(
|
|
349
|
+
asinhNonZeroValues,
|
|
350
|
+
defQuantile,
|
|
351
|
+
method="interpolated_inverted_cdf",
|
|
352
|
+
)
|
|
353
|
+
)
|
|
354
|
+
|
|
355
|
+
def relativeMaxima(
    resp: np.ndarray, orderBins: int
) -> np.ndarray:
    r"""Indices of relative maxima in `resp` via ``signal.argrelmax``.

    :param resp: Response sequence to search.
    :param orderBins: Neighborhood size (bins on each side) passed as
        ``order``; values below 1 are raised to 1.
    :return: First element of the tuple returned by ``signal.argrelmax``.
    """
    order = int(orderBins)
    if order < 1:
        order = 1
    peakIdx = signal.argrelmax(resp, order=order)
    return peakIdx[0]
|
|
359
|
+
|
|
360
|
+
def sampleBlockMaxima(
    resp: np.ndarray,
    halfMask: np.ndarray,
    relWindowBins: int,
    nsamp: int,
    seed: int,
):
    r"""Sample block statistics of `resp` and trim the extreme tails.

    :param resp: Response sequence handed to ``cconsenrich.csampleBlockStats``.
    :param halfMask: Boolean mask; positions where it is False are added to
        the exclusion mask (presumably so they are not sampled -- confirm
        against ``csampleBlockStats``).
    :param relWindowBins: Window size in bins, cast to ``int``.
    :param nsamp: Number of samples requested, cast to ``int``.
    :param seed: Seed forwarded to the sampler, cast to ``int``.
    :return: Sampled values strictly between their 0.001 and 0.999
        quantiles; the empty array is returned as-is when nothing was sampled.
    """
    # combine the global exclusions with everything outside `halfMask`
    blocked = excludeMaskGlobal.astype(np.uint8).copy()
    np.bitwise_or(
        blocked, (~halfMask).astype(np.uint8), out=blocked
    )
    sampled = cconsenrich.csampleBlockStats(
        intervals.astype(np.uint32),
        resp,
        int(relWindowBins),
        int(nsamp),
        int(seed),
        blocked.astype(np.uint8),
    )
    vals = np.array(sampled, dtype=float)
    if len(vals) == 0:
        return vals
    lowerCut = np.quantile(vals, 0.001)
    upperCut = np.quantile(vals, 0.999)
    # strict inequalities: values equal to either cut are dropped
    inside = (vals > lowerCut) & (vals < upperCut)
    return vals[inside]
|
|
385
|
+
|
|
386
|
+
for cascadeLevel in cascadeLevels:
|
|
387
|
+
for templateName in templateNames:
|
|
294
388
|
if templateName not in pw.wavelist(kind="discrete"):
|
|
295
|
-
logger.
|
|
296
|
-
f"
|
|
389
|
+
logger.warning(
|
|
390
|
+
f"Skipping unknown wavelet template: {templateName}"
|
|
297
391
|
)
|
|
298
392
|
continue
|
|
299
393
|
|
|
300
|
-
wav = pw.Wavelet(templateName)
|
|
301
|
-
scalingFunc, waveletFunc,
|
|
302
|
-
|
|
303
|
-
waveletFunc
|
|
394
|
+
wav = pw.Wavelet(str(templateName))
|
|
395
|
+
scalingFunc, waveletFunc, _ = wav.wavefun(
|
|
396
|
+
level=int(cascadeLevel)
|
|
304
397
|
)
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
398
|
+
template = np.array(
|
|
399
|
+
scalingFunc if useScalingFunction else waveletFunc,
|
|
400
|
+
dtype=np.float64,
|
|
401
|
+
)
|
|
402
|
+
template /= np.linalg.norm(template)
|
|
310
403
|
|
|
311
404
|
logger.info(
|
|
312
|
-
f"
|
|
405
|
+
f"\n\tMatching template: {templateName}"
|
|
406
|
+
f"\n\tcascade level: {cascadeLevel}"
|
|
407
|
+
f"\n\ttemplate length: {len(template)}"
|
|
313
408
|
)
|
|
314
409
|
|
|
315
|
-
|
|
410
|
+
# efficient FFT-based cross-correlation
|
|
411
|
+
# (overlap-add convolution may be better for smaller templates, TODO add a check)
|
|
412
|
+
response = signal.fftconvolve(
|
|
316
413
|
values, template[::-1], mode="same"
|
|
317
414
|
)
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
minMatchLengthBP % intervalLengthBP
|
|
325
|
-
)
|
|
326
|
-
|
|
327
|
-
relativeMaximaWindow = int(
|
|
328
|
-
((minMatchLengthBP / intervalLengthBP) / 2) + 1
|
|
329
|
-
)
|
|
330
|
-
relativeMaximaWindow = max(relativeMaximaWindow, 1)
|
|
331
|
-
|
|
332
|
-
excludeMask = np.zeros(len(intervals), dtype=np.uint8)
|
|
333
|
-
if excludeRegionsBedFile is not None:
|
|
334
|
-
excludeMask = core.getBedMask(
|
|
335
|
-
chromosome,
|
|
336
|
-
excludeRegionsBedFile,
|
|
337
|
-
intervals,
|
|
415
|
+
thisMinMatchBp = minMatchLengthBP
|
|
416
|
+
if thisMinMatchBp is None or thisMinMatchBp < 1:
|
|
417
|
+
thisMinMatchBp = len(template) * intervalLengthBp
|
|
418
|
+
if thisMinMatchBp % intervalLengthBp != 0:
|
|
419
|
+
thisMinMatchBp += intervalLengthBp - (
|
|
420
|
+
thisMinMatchBp % intervalLengthBp
|
|
338
421
|
)
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
f"\nSampling {iters} block maxima for template {templateName} at cascade level {cascadeLevel} with (expected) relative maxima window size {relativeMaximaWindow}.\n"
|
|
422
|
+
relWindowBins = int(
|
|
423
|
+
((thisMinMatchBp / intervalLengthBp) / 2) + 1
|
|
342
424
|
)
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
responseSequence,
|
|
347
|
-
relativeMaximaWindow,
|
|
348
|
-
iters * 2,
|
|
349
|
-
randSeed_,
|
|
350
|
-
excludeMask.astype(np.uint8),
|
|
351
|
-
),
|
|
352
|
-
dtype=float,
|
|
425
|
+
relWindowBins = max(relWindowBins, 1)
|
|
426
|
+
asinhThreshold = parseMinSignalThreshold(
|
|
427
|
+
minSignalAtMaxima
|
|
353
428
|
)
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
arsinhSignalThreshold = float(1e6)
|
|
365
|
-
try:
|
|
366
|
-
# we use 'interpolated_inverted_cdf' in a few spots
|
|
367
|
-
# --- making sure it's supported here, at its first use
|
|
368
|
-
responseThreshold = np.quantile(
|
|
369
|
-
blockMaxima, 1 - alpha, method="interpolated_inverted_cdf"
|
|
370
|
-
)
|
|
371
|
-
except (TypeError, ValueError, KeyError) as err_:
|
|
372
|
-
logger.warning(
|
|
373
|
-
f"\nError computing response threshold with alpha={alpha}:\n{err_}\n"
|
|
374
|
-
f"\nIs `blockMaxima` empty?"
|
|
375
|
-
f"\nIs NumPy older than 1.22.0 (~May 2022~)?"
|
|
376
|
-
f"\nIs `alpha` in (0,1)?\n"
|
|
429
|
+
for nullMask, testMask, tag in [
|
|
430
|
+
(halfLeftMask, halfRightMask, "R"),
|
|
431
|
+
(halfRightMask, halfLeftMask, "L"),
|
|
432
|
+
]:
|
|
433
|
+
blockMaxima = sampleBlockMaxima(
|
|
434
|
+
response,
|
|
435
|
+
nullMask,
|
|
436
|
+
relWindowBins,
|
|
437
|
+
nsamp=max(iters, 1000),
|
|
438
|
+
seed=rng.integers(1, 10_000),
|
|
377
439
|
)
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
if minSignalAtMaxima.startswith("q:"):
|
|
387
|
-
# case: expected 'q:quantileValue' format
|
|
388
|
-
qVal = float(minSignalAtMaxima.split("q:")[-1])
|
|
389
|
-
if qVal < 0 or qVal > 1:
|
|
390
|
-
raise ValueError(f"Quantile {qVal} is out of range")
|
|
391
|
-
arsinhSignalThreshold = float(
|
|
392
|
-
np.quantile(
|
|
393
|
-
asinhNonZeroValues,
|
|
394
|
-
qVal,
|
|
395
|
-
method="interpolated_inverted_cdf",
|
|
396
|
-
)
|
|
440
|
+
if len(blockMaxima) < 25:
|
|
441
|
+
pooledMask = ~excludeMaskGlobal.astype(bool)
|
|
442
|
+
blockMaxima = sampleBlockMaxima(
|
|
443
|
+
response,
|
|
444
|
+
pooledMask,
|
|
445
|
+
relWindowBins,
|
|
446
|
+
nsamp=max(iters, 1000),
|
|
447
|
+
seed=rng.integers(1, 10_000),
|
|
397
448
|
)
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
float(minSignalAtMaxima)
|
|
408
|
-
)
|
|
409
|
-
else:
|
|
410
|
-
# case: not in known format, not castable to a float, use defaults
|
|
411
|
-
logger.info(
|
|
412
|
-
f"Couldn't parse `minSignalAtMaxima` value: {minSignalAtMaxima}, using default"
|
|
413
|
-
)
|
|
414
|
-
arsinhSignalThreshold = float(
|
|
415
|
-
np.quantile(
|
|
416
|
-
asinhNonZeroValues,
|
|
417
|
-
defQuantile,
|
|
418
|
-
method="interpolated_inverted_cdf",
|
|
419
|
-
)
|
|
420
|
-
)
|
|
421
|
-
# -----
|
|
422
|
-
|
|
423
|
-
elif isinstance(minSignalAtMaxima, (float, int)):
|
|
424
|
-
# -----we got an int or float-----
|
|
425
|
-
if float(minSignalAtMaxima) < 0.0:
|
|
426
|
-
# effectively disables threshold
|
|
427
|
-
arsinhSignalThreshold = -float(1e6)
|
|
428
|
-
else:
|
|
429
|
-
# use supplied value
|
|
430
|
-
arsinhSignalThreshold = np.asinh(float(minSignalAtMaxima))
|
|
431
|
-
# -----
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
relativeMaximaIndices = signal.argrelmax(
|
|
435
|
-
responseSequence, order=relativeMaximaWindow
|
|
436
|
-
)[0]
|
|
437
|
-
|
|
438
|
-
relativeMaximaIndices = relativeMaximaIndices[
|
|
439
|
-
(responseSequence[relativeMaximaIndices] > responseThreshold)
|
|
440
|
-
& (asinhValues[relativeMaximaIndices] > arsinhSignalThreshold)
|
|
441
|
-
]
|
|
442
|
-
|
|
443
|
-
if len(relativeMaximaIndices) == 0:
|
|
444
|
-
logger.info(
|
|
445
|
-
f"no matches were detected using for template {templateName} at cascade level {cascadeLevel}...skipping matching"
|
|
449
|
+
ecdfSf = stats.ecdf(blockMaxima).sf
|
|
450
|
+
candidateIdx = relativeMaxima(response, relWindowBins)
|
|
451
|
+
|
|
452
|
+
candidateMask = (
|
|
453
|
+
(candidateIdx >= relWindowBins)
|
|
454
|
+
& (candidateIdx < len(response) - relWindowBins)
|
|
455
|
+
& (testMask[candidateIdx])
|
|
456
|
+
& (excludeMaskGlobal[candidateIdx] == 0)
|
|
457
|
+
& (asinhValues[candidateIdx] > asinhThreshold)
|
|
446
458
|
)
|
|
447
|
-
continue
|
|
448
459
|
|
|
449
|
-
|
|
450
|
-
if len(
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
460
|
+
candidateIdx = candidateIdx[candidateMask]
|
|
461
|
+
if len(candidateIdx) == 0:
|
|
462
|
+
continue
|
|
463
|
+
if (
|
|
464
|
+
maxNumMatches is not None
|
|
465
|
+
and len(candidateIdx) > maxNumMatches
|
|
466
|
+
):
|
|
467
|
+
candidateIdx = candidateIdx[
|
|
468
|
+
np.argsort(asinhValues[candidateIdx])[
|
|
454
469
|
-maxNumMatches:
|
|
455
470
|
]
|
|
456
471
|
]
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
testKS, _ = stats.kstest(
|
|
462
|
-
ecdfSFCheckVals,
|
|
463
|
-
stats.uniform.cdf,
|
|
464
|
-
alternative="two-sided",
|
|
465
|
-
)
|
|
466
|
-
|
|
467
|
-
logger.info(
|
|
468
|
-
f"\n\tDetected {len(relativeMaximaIndices)} matches (alpha={alpha}, useScalingFunction={useScalingFunction}): {templateName}: level={cascadeLevel}.\n"
|
|
469
|
-
f"\tResponse threshold: {responseThreshold:.3f}, arsinh(Signal Threshold): {arsinhSignalThreshold:.3f}\n"
|
|
470
|
-
f"\t~KS_Statistic~ [ePVals, uniformCDF]: {testKS:.4f}\n"
|
|
471
|
-
f"\n\n{textNullCDF(ecdfSFCheckVals)}\n\n" # lil text-plot histogram of approx. null CDF
|
|
472
|
-
)
|
|
473
|
-
|
|
474
|
-
# starts
|
|
475
|
-
startsIdx = np.maximum(
|
|
476
|
-
relativeMaximaIndices - relativeMaximaWindow, 0
|
|
477
|
-
)
|
|
478
|
-
# ends
|
|
479
|
-
endsIdx = np.minimum(
|
|
480
|
-
len(values) - 1, relativeMaximaIndices + relativeMaximaWindow
|
|
481
|
-
)
|
|
482
|
-
# point source
|
|
483
|
-
pointSourcesIdx = []
|
|
484
|
-
for start_, end_ in zip(startsIdx, endsIdx):
|
|
485
|
-
pointSourcesIdx.append(
|
|
486
|
-
np.argmax(values[start_ : end_ + 1]) + start_
|
|
472
|
+
pEmp = np.clip(
|
|
473
|
+
ecdfSf.evaluate(response[candidateIdx]),
|
|
474
|
+
1.0e-10,
|
|
475
|
+
1.0,
|
|
487
476
|
)
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
ends = intervals[endsIdx]
|
|
491
|
-
pointSources = (intervals[pointSourcesIdx]) + max(
|
|
492
|
-
1, intervalLengthBP // 2
|
|
493
|
-
)
|
|
494
|
-
if (
|
|
495
|
-
recenterAtPointSource
|
|
496
|
-
): # recenter at point source (signal maximum)
|
|
497
|
-
starts = pointSources - (
|
|
498
|
-
relativeMaximaWindow * intervalLengthBP
|
|
477
|
+
startsIdx = np.maximum(
|
|
478
|
+
candidateIdx - relWindowBins, 0
|
|
499
479
|
)
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
1, intervalLengthBP // 2
|
|
503
|
-
)
|
|
504
|
-
# (ucsc browser) score [0,1000]
|
|
505
|
-
sqScores = (1 + responseSequence[relativeMaximaIndices]) ** 2
|
|
506
|
-
minResponse = np.min(sqScores)
|
|
507
|
-
maxResponse = np.max(sqScores)
|
|
508
|
-
rangeResponse = max(maxResponse - minResponse, 1.0)
|
|
509
|
-
scores = (
|
|
510
|
-
250 + 750 * (sqScores - minResponse) / rangeResponse
|
|
511
|
-
).astype(int)
|
|
512
|
-
# feature name
|
|
513
|
-
names = [
|
|
514
|
-
f"{templateName}_{cascadeLevel}_{i}"
|
|
515
|
-
for i in relativeMaximaIndices
|
|
516
|
-
]
|
|
517
|
-
# strand
|
|
518
|
-
strands = ["." for _ in range(len(scores))]
|
|
519
|
-
# p-values in -log10 scale per convention
|
|
520
|
-
pValues = -np.log10(
|
|
521
|
-
np.clip(
|
|
522
|
-
ecdfBlockMaximaSF.evaluate(
|
|
523
|
-
responseSequence[relativeMaximaIndices]
|
|
524
|
-
),
|
|
525
|
-
1e-10,
|
|
526
|
-
1.0,
|
|
480
|
+
endsIdx = np.minimum(
|
|
481
|
+
len(values) - 1, candidateIdx + relWindowBins
|
|
527
482
|
)
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
483
|
+
pointSourcesIdx = []
|
|
484
|
+
for s, e in zip(startsIdx, endsIdx):
|
|
485
|
+
pointSourcesIdx.append(
|
|
486
|
+
np.argmax(values[s : e + 1]) + s
|
|
487
|
+
)
|
|
488
|
+
pointSourcesIdx = np.array(pointSourcesIdx)
|
|
489
|
+
starts = intervals[startsIdx]
|
|
490
|
+
ends = intervals[endsIdx]
|
|
491
|
+
pointSourcesAbs = (intervals[pointSourcesIdx]) + max(
|
|
492
|
+
1, intervalLengthBp // 2
|
|
493
|
+
)
|
|
494
|
+
if recenterAtPointSource:
|
|
495
|
+
starts = pointSourcesAbs - (
|
|
496
|
+
relWindowBins * intervalLengthBp
|
|
497
|
+
)
|
|
498
|
+
ends = pointSourcesAbs + (
|
|
499
|
+
relWindowBins * intervalLengthBp
|
|
500
|
+
)
|
|
501
|
+
pointSourcesRel = (
|
|
502
|
+
intervals[pointSourcesIdx] - starts
|
|
503
|
+
) + max(1, intervalLengthBp // 2)
|
|
504
|
+
sqScores = (1 + response[candidateIdx]) ** 2
|
|
505
|
+
minR, maxR = (
|
|
506
|
+
float(np.min(sqScores)),
|
|
507
|
+
float(np.max(sqScores)),
|
|
508
|
+
)
|
|
509
|
+
rangeR = max(maxR - minR, 1.0)
|
|
510
|
+
scores = (
|
|
511
|
+
250 + 750 * (sqScores - minR) / rangeR
|
|
512
|
+
).astype(int)
|
|
513
|
+
for i, idxVal in enumerate(candidateIdx):
|
|
514
|
+
allRows.append(
|
|
515
|
+
{
|
|
516
|
+
"chromosome": chromosome,
|
|
517
|
+
"start": int(starts[i]),
|
|
518
|
+
"end": int(ends[i]),
|
|
519
|
+
"name": f"{templateName}_{cascadeLevel}_{idxVal}_{tag}",
|
|
520
|
+
"score": int(scores[i]),
|
|
521
|
+
"strand": ".",
|
|
522
|
+
"signal": float(response[idxVal]),
|
|
523
|
+
"p_raw": float(pEmp[i]),
|
|
524
|
+
"pointSource": int(pointSourcesRel[i]),
|
|
525
|
+
}
|
|
526
|
+
)
|
|
546
527
|
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
528
|
+
if not allRows:
|
|
529
|
+
logger.warning(
|
|
530
|
+
"No matches detected, returning empty DataFrame."
|
|
531
|
+
)
|
|
532
|
+
|
|
533
|
+
return pd.DataFrame(
|
|
534
|
+
columns=[
|
|
535
|
+
"chromosome",
|
|
536
|
+
"start",
|
|
537
|
+
"end",
|
|
538
|
+
"name",
|
|
539
|
+
"score",
|
|
540
|
+
"strand",
|
|
541
|
+
"signal",
|
|
542
|
+
"pValue",
|
|
543
|
+
"qValue",
|
|
544
|
+
"pointSource",
|
|
545
|
+
]
|
|
546
|
+
)
|
|
552
547
|
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
548
|
+
df = pd.DataFrame(allRows)
|
|
549
|
+
qVals = bhFdr(df["p_raw"].values.astype(float))
|
|
550
|
+
df["pValue"] = -np.log10(
|
|
551
|
+
np.clip(df["p_raw"].values, 1.0e-10, 1.0)
|
|
552
|
+
)
|
|
553
|
+
df["qValue"] = -np.log10(np.clip(qVals, 1.0e-10, 1.0))
|
|
554
|
+
df.drop(columns=["p_raw"], inplace=True)
|
|
555
|
+
df = df[qVals <= alpha].copy()
|
|
556
|
+
df["chromosome"] = df["chromosome"].astype(str)
|
|
557
|
+
df.sort_values(by=["chromosome", "start", "end"], inplace=True)
|
|
558
|
+
df.reset_index(drop=True, inplace=True)
|
|
559
|
+
df = df[
|
|
560
|
+
[
|
|
561
|
+
"chromosome",
|
|
562
|
+
"start",
|
|
563
|
+
"end",
|
|
564
|
+
"name",
|
|
565
|
+
"score",
|
|
566
|
+
"strand",
|
|
567
|
+
"signal",
|
|
568
|
+
"pValue",
|
|
569
|
+
"qValue",
|
|
570
|
+
"pointSource",
|
|
571
|
+
]
|
|
572
|
+
]
|
|
573
|
+
return df
|
|
559
574
|
|
|
560
575
|
|
|
561
|
-
def mergeMatches(
|
|
562
|
-
|
|
576
|
+
def mergeMatches(
|
|
577
|
+
filePath: str,
|
|
578
|
+
mergeGapBP: Optional[int],
|
|
579
|
+
) -> Optional[str]:
|
|
580
|
+
r"""Merge overlapping or nearby structured peaks ('matches') in a narrowPeak file.
|
|
581
|
+
|
|
582
|
+
The harmonic mean of p-values and q-values is computed for each merged region within `mergeGapBP` base pairs.
|
|
583
|
+
The fourth column (name) of each merged peak contains information about the number of features that were merged
|
|
584
|
+
and the range of q-values among them.
|
|
563
585
|
|
|
564
|
-
|
|
586
|
+
Expects a `narrowPeak <https://genome.ucsc.edu/FAQ/FAQformat.html#format12>`_ file as input (all numeric columns, '.' for strand if unknown).
|
|
565
587
|
|
|
566
588
|
:param filePath: narrowPeak file containing matches detected with :func:`consenrich.matching.matchWavelet`
|
|
567
589
|
:type filePath: str
|
|
568
|
-
:param mergeGapBP: Maximum gap size (in base pairs) to consider for merging
|
|
569
|
-
:type mergeGapBP: int
|
|
590
|
+
:param mergeGapBP: Maximum gap size (in base pairs) to consider for merging. Defaults to 75 bp if `None` or less than 1.
|
|
591
|
+
:type mergeGapBP: Optional[int]
|
|
570
592
|
|
|
571
|
-
:seealso: :class:`consenrich.core.matchingParams`
|
|
593
|
+
:seealso: :ref:`matching`, :class:`consenrich.core.matchingParams`
|
|
572
594
|
"""
|
|
595
|
+
|
|
596
|
+
if mergeGapBP is None or mergeGapBP < 1:
|
|
597
|
+
mergeGapBP = 75
|
|
598
|
+
|
|
599
|
+
MAX_NEGLOGP = 10.0
|
|
600
|
+
MIN_NEGLOGP = 1.0e-10
|
|
601
|
+
|
|
573
602
|
if not os.path.isfile(filePath):
|
|
574
|
-
logger.
|
|
603
|
+
logger.warning(f"Couldn't access {filePath}...skipping merge")
|
|
575
604
|
return None
|
|
576
605
|
bed = None
|
|
577
606
|
try:
|
|
578
607
|
bed = BedTool(filePath)
|
|
579
608
|
except Exception as ex:
|
|
580
|
-
logger.
|
|
609
|
+
logger.warning(
|
|
581
610
|
f"Couldn't create BedTool for {filePath}:\n{ex}\n\nskipping merge..."
|
|
582
611
|
)
|
|
583
612
|
return None
|
|
584
613
|
if bed is None:
|
|
585
|
-
logger.
|
|
614
|
+
logger.warning(
|
|
615
|
+
f"Couldn't create BedTool for {filePath}...skipping merge"
|
|
616
|
+
)
|
|
586
617
|
return None
|
|
587
618
|
|
|
588
619
|
bed = bed.sort()
|
|
@@ -595,41 +626,86 @@ def mergeMatches(filePath: str, mergeGapBP: int = 50):
|
|
|
595
626
|
end = int(fields[2])
|
|
596
627
|
score = float(fields[4])
|
|
597
628
|
signal = float(fields[6])
|
|
598
|
-
|
|
599
|
-
|
|
629
|
+
pLog10 = float(fields[7])
|
|
630
|
+
qLog10 = float(fields[8])
|
|
600
631
|
peak = int(fields[9])
|
|
601
|
-
|
|
602
|
-
if
|
|
603
|
-
groups[
|
|
632
|
+
clusterID = fields[-1]
|
|
633
|
+
if clusterID not in groups:
|
|
634
|
+
groups[clusterID] = {
|
|
604
635
|
"chrom": chrom,
|
|
605
636
|
"sMin": start,
|
|
606
637
|
"eMax": end,
|
|
607
638
|
"scSum": 0.0,
|
|
608
639
|
"sigSum": 0.0,
|
|
609
|
-
"pSum": 0.0,
|
|
610
|
-
"qSum": 0.0,
|
|
611
640
|
"n": 0,
|
|
612
641
|
"maxS": float("-inf"),
|
|
613
642
|
"peakAbs": -1,
|
|
643
|
+
"pMax": float("-inf"),
|
|
644
|
+
"pTail": 0.0,
|
|
645
|
+
"pHasInf": False,
|
|
646
|
+
"qMax": float("-inf"),
|
|
647
|
+
"qMin": float("inf"),
|
|
648
|
+
"qTail": 0.0,
|
|
649
|
+
"qHasInf": False,
|
|
614
650
|
}
|
|
615
|
-
g = groups[
|
|
651
|
+
g = groups[clusterID]
|
|
616
652
|
if start < g["sMin"]:
|
|
617
653
|
g["sMin"] = start
|
|
618
654
|
if end > g["eMax"]:
|
|
619
655
|
g["eMax"] = end
|
|
620
656
|
g["scSum"] += score
|
|
621
657
|
g["sigSum"] += signal
|
|
622
|
-
g["pSum"] += pval
|
|
623
|
-
g["qSum"] += qval
|
|
624
658
|
g["n"] += 1
|
|
625
|
-
|
|
659
|
+
|
|
660
|
+
if math.isinf(pLog10) or pLog10 >= MAX_NEGLOGP:
|
|
661
|
+
g["pHasInf"] = True
|
|
662
|
+
else:
|
|
663
|
+
if pLog10 > g["pMax"]:
|
|
664
|
+
if g["pMax"] == float("-inf"):
|
|
665
|
+
g["pTail"] = 1.0
|
|
666
|
+
else:
|
|
667
|
+
g["pTail"] = (
|
|
668
|
+
g["pTail"] * (10 ** (g["pMax"] - pLog10))
|
|
669
|
+
+ 1.0
|
|
670
|
+
)
|
|
671
|
+
g["pMax"] = pLog10
|
|
672
|
+
else:
|
|
673
|
+
g["pTail"] += 10 ** (pLog10 - g["pMax"])
|
|
674
|
+
|
|
675
|
+
if (
|
|
676
|
+
math.isinf(qLog10)
|
|
677
|
+
or qLog10 >= MAX_NEGLOGP
|
|
678
|
+
or qLog10 <= MIN_NEGLOGP
|
|
679
|
+
):
|
|
680
|
+
g["qHasInf"] = True
|
|
681
|
+
else:
|
|
682
|
+
if qLog10 < g["qMin"]:
|
|
683
|
+
if qLog10 < MIN_NEGLOGP:
|
|
684
|
+
g["qMin"] = MIN_NEGLOGP
|
|
685
|
+
else:
|
|
686
|
+
g["qMin"] = qLog10
|
|
687
|
+
|
|
688
|
+
if qLog10 > g["qMax"]:
|
|
689
|
+
if g["qMax"] == float("-inf"):
|
|
690
|
+
g["qTail"] = 1.0
|
|
691
|
+
else:
|
|
692
|
+
g["qTail"] = (
|
|
693
|
+
g["qTail"] * (10 ** (g["qMax"] - qLog10))
|
|
694
|
+
+ 1.0
|
|
695
|
+
)
|
|
696
|
+
g["qMax"] = qLog10
|
|
697
|
+
else:
|
|
698
|
+
g["qTail"] += 10 ** (qLog10 - g["qMax"])
|
|
699
|
+
|
|
626
700
|
if signal > g["maxS"]:
|
|
627
701
|
g["maxS"] = signal
|
|
628
702
|
g["peakAbs"] = start + peak if peak >= 0 else -1
|
|
703
|
+
|
|
629
704
|
items = []
|
|
630
|
-
for
|
|
705
|
+
for clusterID, g in groups.items():
|
|
631
706
|
items.append((g["chrom"], g["sMin"], g["eMax"], g))
|
|
632
707
|
items.sort(key=lambda x: (str(x[0]), x[1], x[2]))
|
|
708
|
+
|
|
633
709
|
outPath = f"{filePath.replace('.narrowPeak', '')}.mergedMatches.narrowPeak"
|
|
634
710
|
lines = []
|
|
635
711
|
i = 0
|
|
@@ -642,69 +718,68 @@ def mergeMatches(filePath: str, mergeGapBP: int = 50):
|
|
|
642
718
|
avgScore = 1000
|
|
643
719
|
scoreInt = int(round(avgScore))
|
|
644
720
|
sigAvg = g["sigSum"] / g["n"]
|
|
645
|
-
pAvg = g["pSum"] / g["n"]
|
|
646
|
-
qAvg = g["qSum"] / g["n"]
|
|
647
|
-
pointSource = g["peakAbs"] - sMin if g["peakAbs"] >= 0 else -1
|
|
648
|
-
name = f"mergedPeak{i}"
|
|
649
|
-
lines.append(
|
|
650
|
-
f"{chrom}\t{int(sMin)}\t{int(eMax)}\t{name}\t{scoreInt}\t.\t{sigAvg:.3f}\t{pAvg:.3f}\t{qAvg:.3f}\t{int(pointSource)}"
|
|
651
|
-
)
|
|
652
|
-
with open(outPath, "w") as outF:
|
|
653
|
-
outF.write("\n".join(lines) + ("\n" if lines else ""))
|
|
654
|
-
logger.info(f"Merged matches written to {outPath}")
|
|
655
|
-
return outPath
|
|
656
721
|
|
|
722
|
+
if g["pHasInf"]:
|
|
723
|
+
pHMLog10 = MAX_NEGLOGP
|
|
724
|
+
else:
|
|
725
|
+
if (
|
|
726
|
+
g["pMax"] == float("-inf")
|
|
727
|
+
or not (g["pTail"] > 0.0)
|
|
728
|
+
or math.isnan(g["pTail"])
|
|
729
|
+
):
|
|
730
|
+
pHMLog10 = MIN_NEGLOGP
|
|
731
|
+
else:
|
|
732
|
+
pHMLog10 = -math.log10(g["n"]) + (
|
|
733
|
+
g["pMax"] + math.log10(g["pTail"])
|
|
734
|
+
)
|
|
735
|
+
pHMLog10 = max(
|
|
736
|
+
MIN_NEGLOGP, min(pHMLog10, MAX_NEGLOGP)
|
|
737
|
+
)
|
|
657
738
|
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
)
|
|
665
|
-
|
|
739
|
+
if g["qHasInf"]:
|
|
740
|
+
qHMLog10 = MAX_NEGLOGP
|
|
741
|
+
else:
|
|
742
|
+
if (
|
|
743
|
+
g["qMax"] == float("-inf")
|
|
744
|
+
or not (g["qTail"] > 0.0)
|
|
745
|
+
or math.isnan(g["qTail"])
|
|
746
|
+
):
|
|
747
|
+
qHMLog10 = MIN_NEGLOGP
|
|
748
|
+
else:
|
|
749
|
+
qHMLog10 = -math.log10(g["n"]) + (
|
|
750
|
+
g["qMax"] + math.log10(g["qTail"])
|
|
751
|
+
)
|
|
752
|
+
qHMLog10 = max(
|
|
753
|
+
MIN_NEGLOGP, min(qHMLog10, MAX_NEGLOGP)
|
|
754
|
+
)
|
|
666
755
|
|
|
667
|
-
|
|
668
|
-
|
|
756
|
+
pointSource = (
|
|
757
|
+
g["peakAbs"] - sMin
|
|
758
|
+
if g["peakAbs"] >= 0
|
|
759
|
+
else (eMax - sMin) // 2
|
|
760
|
+
)
|
|
669
761
|
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
binIndex -= 1
|
|
687
|
-
binCounts[binIndex] += 1
|
|
688
|
-
valueSeries = (
|
|
689
|
-
[countValue / len(nullBlockMaximaSFVals) for countValue in binCounts]
|
|
690
|
-
if normalize
|
|
691
|
-
else binCounts[:]
|
|
692
|
-
)
|
|
693
|
-
valueMaximum = max(valueSeries) if valueSeries else 0
|
|
694
|
-
widthScale = (barWidth / valueMaximum) if valueMaximum > 0 else 0
|
|
695
|
-
edgeFormat = f"{{:.{2}f}}"
|
|
696
|
-
rangeLabels = [
|
|
697
|
-
f"[{edgeFormat.format(binEdges[indexValue])},{edgeFormat.format(binEdges[indexValue + 1])})"
|
|
698
|
-
for indexValue in range(binCount)
|
|
699
|
-
]
|
|
700
|
-
labelWidth = max(len(textValue) for textValue in rangeLabels)
|
|
701
|
-
lines = ['Histogram: "1 - ECDF(nullBlockMaxima)"']
|
|
702
|
-
for rangeLabel, seriesValue, countValue in zip(
|
|
703
|
-
rangeLabels, valueSeries, binCounts
|
|
704
|
-
):
|
|
705
|
-
barString = barChar * int(round(seriesValue * widthScale))
|
|
706
|
-
trailingText = f"({countValue}/{len(nullBlockMaximaSFVals)})\t\t"
|
|
762
|
+
qMinLog10 = g["qMin"]
|
|
763
|
+
qMaxLog10 = g["qMax"]
|
|
764
|
+
if math.isfinite(qMinLog10) and qMinLog10 < MIN_NEGLOGP:
|
|
765
|
+
qMinLog10 = MIN_NEGLOGP
|
|
766
|
+
if math.isfinite(qMaxLog10) and qMaxLog10 > MAX_NEGLOGP:
|
|
767
|
+
qMaxLog10 = MAX_NEGLOGP
|
|
768
|
+
elif (
|
|
769
|
+
not math.isfinite(qMaxLog10)
|
|
770
|
+
or not math.isfinite(qMinLog10)
|
|
771
|
+
) or (qMaxLog10 < MIN_NEGLOGP):
|
|
772
|
+
qMinLog10 = 0.0
|
|
773
|
+
qMaxLog10 = 0.0
|
|
774
|
+
|
|
775
|
+
# informative+parsable name
|
|
776
|
+
# e.g., regex: ^consenrichPeak\|i=(?P<i>\d+)\|gap=(?P<gap>\d+)bp\|ct=(?P<ct>\d+)\|qRange=(?P<qmin>\d+\.\d{3})_(?P<qmax>\d+\.\d{3})$
|
|
777
|
+
name = f"consenrichPeak|i={i}|gap={mergeGapBP}bp|ct={g['n']}|qRange={qMinLog10:.3f}_{qMaxLog10:.3f}"
|
|
707
778
|
lines.append(
|
|
708
|
-
f"{
|
|
779
|
+
f"{chrom}\t{int(sMin)}\t{int(eMax)}\t{name}\t{scoreInt}\t.\t{sigAvg:.3f}\t{pHMLog10:.3f}\t{qHMLog10:.3f}\t{int(pointSource)}"
|
|
709
780
|
)
|
|
710
|
-
|
|
781
|
+
|
|
782
|
+
with open(outPath, "w") as outF:
|
|
783
|
+
outF.write("\n".join(lines) + ("\n" if lines else ""))
|
|
784
|
+
logger.info(f"Merged matches written to {outPath}")
|
|
785
|
+
return outPath
|