consenrich 0.7.0b1__cp311-cp311-macosx_11_0_arm64.whl → 0.7.1b1__cp311-cp311-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of consenrich might be problematic. Click here for more details.
- consenrich/cconsenrich.c +174 -174
- consenrich/cconsenrich.cpython-311-darwin.so +0 -0
- consenrich/consenrich.py +203 -67
- consenrich/core.py +5 -5
- consenrich/matching.py +442 -367
- {consenrich-0.7.0b1.dist-info → consenrich-0.7.1b1.dist-info}/METADATA +1 -1
- {consenrich-0.7.0b1.dist-info → consenrich-0.7.1b1.dist-info}/RECORD +11 -11
- {consenrich-0.7.0b1.dist-info → consenrich-0.7.1b1.dist-info}/WHEEL +0 -0
- {consenrich-0.7.0b1.dist-info → consenrich-0.7.1b1.dist-info}/entry_points.txt +0 -0
- {consenrich-0.7.0b1.dist-info → consenrich-0.7.1b1.dist-info}/licenses/LICENSE +0 -0
- {consenrich-0.7.0b1.dist-info → consenrich-0.7.1b1.dist-info}/top_level.txt +0 -0
consenrich/matching.py
CHANGED
|
@@ -3,6 +3,7 @@ r"""Module implementing (experimental) 'structured peak detection' features usin
|
|
|
3
3
|
|
|
4
4
|
import logging
|
|
5
5
|
import os
|
|
6
|
+
import math
|
|
6
7
|
from pybedtools import BedTool
|
|
7
8
|
from typing import List, Optional
|
|
8
9
|
|
|
@@ -23,13 +24,25 @@ logging.basicConfig(
|
|
|
23
24
|
logger = logging.getLogger(__name__)
|
|
24
25
|
|
|
25
26
|
|
|
27
|
+
def scalarClip(value: float, low: float, high: float) -> float:
|
|
28
|
+
return low if value < low else high if value > high else value
|
|
29
|
+
|
|
30
|
+
|
|
26
31
|
def castableToFloat(value) -> bool:
|
|
27
32
|
if value is None:
|
|
28
33
|
return False
|
|
29
34
|
if isinstance(value, bool):
|
|
30
35
|
return False
|
|
31
36
|
if isinstance(value, str):
|
|
32
|
-
if value.lower().replace(
|
|
37
|
+
if value.lower().replace(" ", "") in [
|
|
38
|
+
"nan",
|
|
39
|
+
"inf",
|
|
40
|
+
"-inf",
|
|
41
|
+
"infinity",
|
|
42
|
+
"-infinity",
|
|
43
|
+
"",
|
|
44
|
+
" ",
|
|
45
|
+
]:
|
|
33
46
|
return False
|
|
34
47
|
|
|
35
48
|
try:
|
|
@@ -75,7 +88,11 @@ def matchExistingBedGraph(
|
|
|
75
88
|
)
|
|
76
89
|
|
|
77
90
|
if mergeGapBP is None:
|
|
78
|
-
mergeGapBP = (
|
|
91
|
+
mergeGapBP = (
|
|
92
|
+
(minMatchLengthBP // 2) + 1
|
|
93
|
+
if minMatchLengthBP is not None
|
|
94
|
+
else 75
|
|
95
|
+
)
|
|
79
96
|
|
|
80
97
|
allowedTemplates = [
|
|
81
98
|
x for x in pw.wavelist(kind="discrete") if "bio" not in x
|
|
@@ -129,7 +146,9 @@ def matchExistingBedGraph(
|
|
|
129
146
|
randSeed=randSeed,
|
|
130
147
|
)
|
|
131
148
|
except Exception as ex:
|
|
132
|
-
logger.info(
|
|
149
|
+
logger.info(
|
|
150
|
+
f"Skipping {chrom_} due to error in matchWavelet: {ex}"
|
|
151
|
+
)
|
|
133
152
|
continue
|
|
134
153
|
|
|
135
154
|
if df__.empty:
|
|
@@ -145,7 +164,9 @@ def matchExistingBedGraph(
|
|
|
145
164
|
outPaths.append(perChromOut)
|
|
146
165
|
|
|
147
166
|
if merge:
|
|
148
|
-
mergedPath = mergeMatches(
|
|
167
|
+
mergedPath = mergeMatches(
|
|
168
|
+
perChromOut, mergeGapBP=mergeGapBP
|
|
169
|
+
)
|
|
149
170
|
if mergedPath is not None:
|
|
150
171
|
logger.info(f"Merged matches written to {mergedPath}")
|
|
151
172
|
outPathsMerged.append(mergedPath)
|
|
@@ -177,7 +198,9 @@ def matchExistingBedGraph(
|
|
|
177
198
|
with open(path, "r") as inF:
|
|
178
199
|
for line in inF:
|
|
179
200
|
outF.write(line)
|
|
180
|
-
logger.info(
|
|
201
|
+
logger.info(
|
|
202
|
+
f"All merged matches written to {outPathMergedAll}"
|
|
203
|
+
)
|
|
181
204
|
|
|
182
205
|
for path_ in outPaths + outPathsMerged:
|
|
183
206
|
try:
|
|
@@ -215,7 +238,8 @@ def matchWavelet(
|
|
|
215
238
|
|
|
216
239
|
:param chromosome: Chromosome name for the input intervals and values.
|
|
217
240
|
:type chromosome: str
|
|
218
|
-
:param values:
|
|
241
|
+
:param values: A 1D array of signal-like values. In this documentation, we refer to values derived from Consenrich,
|
|
242
|
+
but other continuous-valued tracks at evenly spaced genomic intervals may be suitable, too.
|
|
219
243
|
:type values: npt.NDArray[np.float64]
|
|
220
244
|
:param templateNames: A list of str values -- wavelet bases used for matching, e.g., `[haar, db2, sym4]`
|
|
221
245
|
:type templateNames: List[str]
|
|
@@ -226,19 +250,19 @@ def matchWavelet(
|
|
|
226
250
|
an empirical null to test significance. See :func:`cconsenrich.csampleBlockStats`.
|
|
227
251
|
:type iters: int
|
|
228
252
|
:param alpha: Primary significance threshold on detected matches. Specifically, the
|
|
229
|
-
|
|
230
|
-
|
|
253
|
+
largest Benjamini-Hochberg-corrected empirical p-value (q-value) a match may have and still be retained; p-values are approximated from randomly sampled blocks in the
|
|
254
|
+
response sequence.
|
|
231
255
|
:type alpha: float
|
|
232
256
|
:param minMatchLengthBP: Within a window of `minMatchLengthBP` length (bp), relative maxima in
|
|
233
257
|
the signal-template convolution must be greater in value than others to qualify as matches.
|
|
234
258
|
:type minMatchLengthBP: int
|
|
235
|
-
:param minSignalAtMaxima: Secondary significance threshold coupled with `alpha`.
|
|
236
|
-
at relative maxima in the response sequence to be greater than this threshold. Comparisons are made in log-scale
|
|
237
|
-
If a `float` value is provided, the minimum signal value must be greater
|
|
238
|
-
negative value to disable the threshold*.
|
|
239
|
-
If a `str` value is provided, looks for 'q:quantileValue', e.g., 'q:0.
|
|
259
|
+
:param minSignalAtMaxima: Secondary significance threshold coupled with `alpha`. Requires the *signal value*
|
|
260
|
+
at relative maxima in the response sequence to be greater than this threshold. Comparisons are made in log-scale
|
|
261
|
+
to temper genome-wide dynamic range. If a `float` value is provided, the minimum signal value must be greater
|
|
262
|
+
than this (absolute) value. *Set to a negative value to disable the threshold*.
|
|
263
|
+
If a `str` value is provided, looks for 'q:quantileValue', e.g., 'q:0.90'. The
|
|
240
264
|
threshold is then set to the corresponding quantile of the non-zero signal estimates.
|
|
241
|
-
Defaults to str value 'q:0.75' --- the
|
|
265
|
+
Defaults to str value 'q:0.75' --- the 75th percentile of signal values.
|
|
242
266
|
:type minSignalAtMaxima: Optional[str | float]
|
|
243
267
|
:param useScalingFunction: If True, use (only) the scaling function to build the matching template.
|
|
244
268
|
If False, use (only) the wavelet function.
|
|
@@ -247,342 +271,349 @@ def matchWavelet(
|
|
|
247
271
|
:type excludeRegionsBedFile: Optional[str]
|
|
248
272
|
|
|
249
273
|
:seealso: :class:`consenrich.core.matchingParams`, :func:`cconsenrich.csampleBlockStats`, :ref:`matching`
|
|
274
|
+
:return: A pandas DataFrame with detected matches
|
|
275
|
+
:rtype: pd.DataFrame
|
|
250
276
|
"""
|
|
251
|
-
|
|
252
277
|
if len(intervals) < 5:
|
|
253
278
|
raise ValueError("`intervals` must be at least length 5")
|
|
254
279
|
if len(values) != len(intervals):
|
|
255
|
-
raise ValueError(
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
280
|
+
raise ValueError(
|
|
281
|
+
"`values` must have the same length as `intervals`"
|
|
282
|
+
)
|
|
283
|
+
intervalLengthBp = intervals[1] - intervals[0]
|
|
284
|
+
if not np.all(np.abs(np.diff(intervals)) == intervalLengthBp):
|
|
260
285
|
raise ValueError("`intervals` must be evenly spaced.")
|
|
261
|
-
|
|
262
|
-
randSeed_: int = int(randSeed)
|
|
263
|
-
cols = [
|
|
264
|
-
"chromosome",
|
|
265
|
-
"start",
|
|
266
|
-
"end",
|
|
267
|
-
"name",
|
|
268
|
-
"score",
|
|
269
|
-
"strand",
|
|
270
|
-
"signal",
|
|
271
|
-
"pValue",
|
|
272
|
-
"qValue",
|
|
273
|
-
"pointSource",
|
|
274
|
-
]
|
|
275
|
-
matchDF = pd.DataFrame(columns=cols)
|
|
276
|
-
minMatchLengthBPCopy: Optional[int] = minMatchLengthBP
|
|
286
|
+
rng = np.random.default_rng(int(randSeed))
|
|
277
287
|
cascadeLevels = sorted(list(set(cascadeLevels)))
|
|
278
288
|
if weights is not None and len(weights) == len(values):
|
|
279
289
|
values = values * weights
|
|
280
290
|
asinhValues = np.asinh(values, dtype=np.float32)
|
|
281
291
|
asinhNonZeroValues = asinhValues[asinhValues > 0]
|
|
282
|
-
iters = max(iters, 1000)
|
|
283
|
-
defQuantile
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
+
iters = max(int(iters), 1000)
|
|
293
|
+
defQuantile = 0.75
|
|
294
|
+
chromMin = int(intervals[0])
|
|
295
|
+
chromMax = int(intervals[-1])
|
|
296
|
+
chromMid = chromMin + (chromMax - chromMin) // 2 # for split
|
|
297
|
+
halfLeftMask = intervals < chromMid
|
|
298
|
+
halfRightMask = ~halfLeftMask
|
|
299
|
+
excludeMaskGlobal = np.zeros(len(intervals), dtype=np.uint8)
|
|
300
|
+
if excludeRegionsBedFile is not None:
|
|
301
|
+
excludeMaskGlobal = core.getBedMask(
|
|
302
|
+
chromosome, excludeRegionsBedFile, intervals
|
|
303
|
+
).astype(np.uint8)
|
|
304
|
+
allRows = []
|
|
305
|
+
|
|
306
|
+
def bhFdr(p: np.ndarray) -> np.ndarray:
|
|
307
|
+
m = len(p)
|
|
308
|
+
order = np.argsort(p, kind="mergesort")
|
|
309
|
+
ranked = np.arange(1, m + 1, dtype=float)
|
|
310
|
+
q = (p[order] * m) / ranked
|
|
311
|
+
q = np.minimum.accumulate(q[::-1])[::-1]
|
|
312
|
+
out = np.empty_like(q)
|
|
313
|
+
out[order] = q
|
|
314
|
+
return np.clip(out, 0.0, 1.0)
|
|
315
|
+
|
|
316
|
+
def parseMinSignalThreshold(val):
|
|
317
|
+
if val is None:
|
|
318
|
+
return -1e6
|
|
319
|
+
if isinstance(val, str):
|
|
320
|
+
if val.startswith("q:"):
|
|
321
|
+
qVal = float(val.split("q:")[-1])
|
|
322
|
+
if not (0 <= qVal <= 1):
|
|
323
|
+
raise ValueError(
|
|
324
|
+
f"Quantile {qVal} is out of range"
|
|
325
|
+
)
|
|
326
|
+
return float(
|
|
327
|
+
np.quantile(
|
|
328
|
+
asinhNonZeroValues,
|
|
329
|
+
qVal,
|
|
330
|
+
method="interpolated_inverted_cdf",
|
|
331
|
+
)
|
|
292
332
|
)
|
|
293
|
-
|
|
333
|
+
elif castableToFloat(val):
|
|
334
|
+
v = float(val)
|
|
335
|
+
return -1e6 if v < 0 else float(np.asinh(v))
|
|
336
|
+
else:
|
|
337
|
+
return float(
|
|
338
|
+
np.quantile(
|
|
339
|
+
asinhNonZeroValues,
|
|
340
|
+
defQuantile,
|
|
341
|
+
method="interpolated_inverted_cdf",
|
|
342
|
+
)
|
|
343
|
+
)
|
|
344
|
+
if isinstance(val, (float, int)):
|
|
345
|
+
v = float(val)
|
|
346
|
+
return -1e6 if v < 0 else float(np.asinh(v))
|
|
347
|
+
return float(
|
|
348
|
+
np.quantile(
|
|
349
|
+
asinhNonZeroValues,
|
|
350
|
+
defQuantile,
|
|
351
|
+
method="interpolated_inverted_cdf",
|
|
352
|
+
)
|
|
353
|
+
)
|
|
354
|
+
|
|
355
|
+
def relativeMaxima(
|
|
356
|
+
resp: np.ndarray, orderBins: int
|
|
357
|
+
) -> np.ndarray:
|
|
358
|
+
return signal.argrelmax(resp, order=max(int(orderBins), 1))[0]
|
|
359
|
+
|
|
360
|
+
def sampleBlockMaxima(
|
|
361
|
+
resp: np.ndarray,
|
|
362
|
+
halfMask: np.ndarray,
|
|
363
|
+
relWindowBins: int,
|
|
364
|
+
nsamp: int,
|
|
365
|
+
seed: int,
|
|
366
|
+
):
|
|
367
|
+
exMask = excludeMaskGlobal.astype(np.uint8).copy()
|
|
368
|
+
exMask |= (~halfMask).astype(np.uint8)
|
|
369
|
+
vals = np.array(
|
|
370
|
+
cconsenrich.csampleBlockStats(
|
|
371
|
+
intervals.astype(np.uint32),
|
|
372
|
+
resp,
|
|
373
|
+
int(relWindowBins),
|
|
374
|
+
int(nsamp),
|
|
375
|
+
int(seed),
|
|
376
|
+
exMask.astype(np.uint8),
|
|
377
|
+
),
|
|
378
|
+
dtype=float,
|
|
379
|
+
)
|
|
380
|
+
if len(vals) == 0:
|
|
381
|
+
return vals
|
|
382
|
+
low = np.quantile(vals, 0.001)
|
|
383
|
+
high = np.quantile(vals, 0.999)
|
|
384
|
+
return vals[(vals > low) & (vals < high)]
|
|
385
|
+
|
|
386
|
+
for cascadeLevel in cascadeLevels:
|
|
387
|
+
for templateName in templateNames:
|
|
294
388
|
if templateName not in pw.wavelist(kind="discrete"):
|
|
295
|
-
logger.
|
|
296
|
-
f"
|
|
389
|
+
logger.warning(
|
|
390
|
+
f"Skipping unknown wavelet template: {templateName}"
|
|
297
391
|
)
|
|
298
392
|
continue
|
|
299
393
|
|
|
300
|
-
wav = pw.Wavelet(templateName)
|
|
301
|
-
scalingFunc, waveletFunc,
|
|
302
|
-
|
|
303
|
-
waveletFunc
|
|
394
|
+
wav = pw.Wavelet(str(templateName))
|
|
395
|
+
scalingFunc, waveletFunc, _ = wav.wavefun(
|
|
396
|
+
level=int(cascadeLevel)
|
|
304
397
|
)
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
398
|
+
template = np.array(
|
|
399
|
+
scalingFunc if useScalingFunction else waveletFunc,
|
|
400
|
+
dtype=np.float64,
|
|
401
|
+
)
|
|
402
|
+
template /= np.linalg.norm(template)
|
|
310
403
|
|
|
311
404
|
logger.info(
|
|
312
|
-
f"
|
|
405
|
+
f"\n\tMatching template: {templateName}"
|
|
406
|
+
f"\n\tcascade level: {cascadeLevel}"
|
|
407
|
+
f"\n\ttemplate length: {len(template)}"
|
|
313
408
|
)
|
|
314
409
|
|
|
315
|
-
|
|
410
|
+
# efficient FFT-based cross-correlation
|
|
411
|
+
# (overlap-add convolution, e.g. `scipy.signal.oaconvolve`, may be better for smaller templates, TODO add a check)
|
|
412
|
+
response = signal.fftconvolve(
|
|
316
413
|
values, template[::-1], mode="same"
|
|
317
414
|
)
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
minMatchLengthBP % intervalLengthBP
|
|
415
|
+
thisMinMatchBp = minMatchLengthBP
|
|
416
|
+
if thisMinMatchBp is None or thisMinMatchBp < 1:
|
|
417
|
+
thisMinMatchBp = len(template) * intervalLengthBp
|
|
418
|
+
if thisMinMatchBp % intervalLengthBp != 0:
|
|
419
|
+
thisMinMatchBp += intervalLengthBp - (
|
|
420
|
+
thisMinMatchBp % intervalLengthBp
|
|
325
421
|
)
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
((minMatchLengthBP / intervalLengthBP) / 2) + 1
|
|
422
|
+
relWindowBins = int(
|
|
423
|
+
((thisMinMatchBp / intervalLengthBp) / 2) + 1
|
|
329
424
|
)
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
if excludeRegionsBedFile is not None:
|
|
334
|
-
excludeMask = core.getBedMask(
|
|
335
|
-
chromosome,
|
|
336
|
-
excludeRegionsBedFile,
|
|
337
|
-
intervals,
|
|
338
|
-
)
|
|
339
|
-
|
|
340
|
-
logger.info(
|
|
341
|
-
f"\nSampling {iters} block maxima for template {templateName} at cascade level {cascadeLevel} with (expected) relative maxima window size {relativeMaximaWindow}.\n"
|
|
425
|
+
relWindowBins = max(relWindowBins, 1)
|
|
426
|
+
asinhThreshold = parseMinSignalThreshold(
|
|
427
|
+
minSignalAtMaxima
|
|
342
428
|
)
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
)
|
|
354
|
-
blockMaximaCheck = blockMaxima.copy()[iters:]
|
|
355
|
-
blockMaxima = blockMaxima[:iters]
|
|
356
|
-
blockMaxima = blockMaxima[
|
|
357
|
-
(blockMaxima > np.quantile(blockMaxima, 0.005))
|
|
358
|
-
& (blockMaxima < np.quantile(blockMaxima, 0.995))
|
|
359
|
-
]
|
|
360
|
-
|
|
361
|
-
ecdfBlockMaximaSF = stats.ecdf(blockMaxima).sf
|
|
362
|
-
|
|
363
|
-
responseThreshold = float(1e6)
|
|
364
|
-
arsinhSignalThreshold = float(1e6)
|
|
365
|
-
try:
|
|
366
|
-
# we use 'interpolated_inverted_cdf' in a few spots
|
|
367
|
-
# --- making sure it's supported here, at its first use
|
|
368
|
-
responseThreshold = np.quantile(
|
|
369
|
-
blockMaxima, 1 - alpha, method="interpolated_inverted_cdf"
|
|
429
|
+
for nullMask, testMask, tag in [
|
|
430
|
+
(halfLeftMask, halfRightMask, "R"),
|
|
431
|
+
(halfRightMask, halfLeftMask, "L"),
|
|
432
|
+
]:
|
|
433
|
+
blockMaxima = sampleBlockMaxima(
|
|
434
|
+
response,
|
|
435
|
+
nullMask,
|
|
436
|
+
relWindowBins,
|
|
437
|
+
nsamp=max(iters, 1000),
|
|
438
|
+
seed=rng.integers(1, 10_000),
|
|
370
439
|
)
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
# parse minSignalAtMaxima, set arsinhSignalThreshold
|
|
381
|
-
if minSignalAtMaxima is None:
|
|
382
|
-
# -----we got a `None`-----
|
|
383
|
-
arsinhSignalThreshold = -float(1e6)
|
|
384
|
-
elif isinstance(minSignalAtMaxima, str):
|
|
385
|
-
# -----we got a str-----
|
|
386
|
-
if minSignalAtMaxima.startswith("q:"):
|
|
387
|
-
# case: expected 'q:quantileValue' format
|
|
388
|
-
qVal = float(minSignalAtMaxima.split("q:")[-1])
|
|
389
|
-
if qVal < 0 or qVal > 1:
|
|
390
|
-
raise ValueError(f"Quantile {qVal} is out of range")
|
|
391
|
-
arsinhSignalThreshold = float(
|
|
392
|
-
np.quantile(
|
|
393
|
-
asinhNonZeroValues,
|
|
394
|
-
qVal,
|
|
395
|
-
method="interpolated_inverted_cdf",
|
|
396
|
-
)
|
|
397
|
-
)
|
|
398
|
-
|
|
399
|
-
elif castableToFloat(minSignalAtMaxima):
|
|
400
|
-
# case: numeric in str form (possible due to CLI)
|
|
401
|
-
if float(minSignalAtMaxima) < 0.0:
|
|
402
|
-
# effectively disables threshold
|
|
403
|
-
arsinhSignalThreshold = -float(1e6)
|
|
404
|
-
else:
|
|
405
|
-
# use supplied value
|
|
406
|
-
arsinhSignalThreshold = np.asinh(
|
|
407
|
-
float(minSignalAtMaxima)
|
|
408
|
-
)
|
|
409
|
-
else:
|
|
410
|
-
# case: not in known format, not castable to a float, use defaults
|
|
411
|
-
logger.info(
|
|
412
|
-
f"Couldn't parse `minSignalAtMaxima` value: {minSignalAtMaxima}, using default"
|
|
413
|
-
)
|
|
414
|
-
arsinhSignalThreshold = float(
|
|
415
|
-
np.quantile(
|
|
416
|
-
asinhNonZeroValues,
|
|
417
|
-
defQuantile,
|
|
418
|
-
method="interpolated_inverted_cdf",
|
|
419
|
-
)
|
|
440
|
+
if len(blockMaxima) < 25:
|
|
441
|
+
pooledMask = ~excludeMaskGlobal.astype(bool)
|
|
442
|
+
blockMaxima = sampleBlockMaxima(
|
|
443
|
+
response,
|
|
444
|
+
pooledMask,
|
|
445
|
+
relWindowBins,
|
|
446
|
+
nsamp=max(iters, 1000),
|
|
447
|
+
seed=rng.integers(1, 10_000),
|
|
420
448
|
)
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
arsinhSignalThreshold = np.asinh(float(minSignalAtMaxima))
|
|
431
|
-
# -----
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
relativeMaximaIndices = signal.argrelmax(
|
|
435
|
-
responseSequence, order=relativeMaximaWindow
|
|
436
|
-
)[0]
|
|
437
|
-
|
|
438
|
-
relativeMaximaIndices = relativeMaximaIndices[
|
|
439
|
-
(responseSequence[relativeMaximaIndices] > responseThreshold)
|
|
440
|
-
& (asinhValues[relativeMaximaIndices] > arsinhSignalThreshold)
|
|
441
|
-
]
|
|
442
|
-
|
|
443
|
-
if len(relativeMaximaIndices) == 0:
|
|
444
|
-
logger.info(
|
|
445
|
-
f"no matches were detected using for template {templateName} at cascade level {cascadeLevel}...skipping matching"
|
|
449
|
+
ecdfSf = stats.ecdf(blockMaxima).sf
|
|
450
|
+
candidateIdx = relativeMaxima(response, relWindowBins)
|
|
451
|
+
|
|
452
|
+
candidateMask = (
|
|
453
|
+
(candidateIdx >= relWindowBins)
|
|
454
|
+
& (candidateIdx < len(response) - relWindowBins)
|
|
455
|
+
& (testMask[candidateIdx])
|
|
456
|
+
& (excludeMaskGlobal[candidateIdx] == 0)
|
|
457
|
+
& (asinhValues[candidateIdx] > asinhThreshold)
|
|
446
458
|
)
|
|
447
|
-
continue
|
|
448
459
|
|
|
449
|
-
|
|
450
|
-
if len(
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
460
|
+
candidateIdx = candidateIdx[candidateMask]
|
|
461
|
+
if len(candidateIdx) == 0:
|
|
462
|
+
continue
|
|
463
|
+
if (
|
|
464
|
+
maxNumMatches is not None
|
|
465
|
+
and len(candidateIdx) > maxNumMatches
|
|
466
|
+
):
|
|
467
|
+
candidateIdx = candidateIdx[
|
|
468
|
+
np.argsort(asinhValues[candidateIdx])[
|
|
454
469
|
-maxNumMatches:
|
|
455
470
|
]
|
|
456
471
|
]
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
testKS, _ = stats.kstest(
|
|
462
|
-
ecdfSFCheckVals,
|
|
463
|
-
stats.uniform.cdf,
|
|
464
|
-
alternative="two-sided",
|
|
465
|
-
)
|
|
466
|
-
|
|
467
|
-
logger.info(
|
|
468
|
-
f"\n\tDetected {len(relativeMaximaIndices)} matches (alpha={alpha}, useScalingFunction={useScalingFunction}): {templateName}: level={cascadeLevel}.\n"
|
|
469
|
-
f"\tResponse threshold: {responseThreshold:.3f}, arsinh(Signal Threshold): {arsinhSignalThreshold:.3f}\n"
|
|
470
|
-
f"\t~KS_Statistic~ [ePVals, uniformCDF]: {testKS:.4f}\n"
|
|
471
|
-
f"\n\n{textNullCDF(ecdfSFCheckVals)}\n\n" # lil text-plot histogram of approx. null CDF
|
|
472
|
-
)
|
|
473
|
-
|
|
474
|
-
# starts
|
|
475
|
-
startsIdx = np.maximum(
|
|
476
|
-
relativeMaximaIndices - relativeMaximaWindow, 0
|
|
477
|
-
)
|
|
478
|
-
# ends
|
|
479
|
-
endsIdx = np.minimum(
|
|
480
|
-
len(values) - 1, relativeMaximaIndices + relativeMaximaWindow
|
|
481
|
-
)
|
|
482
|
-
# point source
|
|
483
|
-
pointSourcesIdx = []
|
|
484
|
-
for start_, end_ in zip(startsIdx, endsIdx):
|
|
485
|
-
pointSourcesIdx.append(
|
|
486
|
-
np.argmax(values[start_ : end_ + 1]) + start_
|
|
472
|
+
pEmp = np.clip(
|
|
473
|
+
ecdfSf.evaluate(response[candidateIdx]),
|
|
474
|
+
1.0e-10,
|
|
475
|
+
1.0,
|
|
487
476
|
)
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
ends = intervals[endsIdx]
|
|
491
|
-
pointSources = (intervals[pointSourcesIdx]) + max(
|
|
492
|
-
1, intervalLengthBP // 2
|
|
493
|
-
)
|
|
494
|
-
if (
|
|
495
|
-
recenterAtPointSource
|
|
496
|
-
): # recenter at point source (signal maximum)
|
|
497
|
-
starts = pointSources - (
|
|
498
|
-
relativeMaximaWindow * intervalLengthBP
|
|
477
|
+
startsIdx = np.maximum(
|
|
478
|
+
candidateIdx - relWindowBins, 0
|
|
499
479
|
)
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
1, intervalLengthBP // 2
|
|
503
|
-
)
|
|
504
|
-
# (ucsc browser) score [0,1000]
|
|
505
|
-
sqScores = (1 + responseSequence[relativeMaximaIndices]) ** 2
|
|
506
|
-
minResponse = np.min(sqScores)
|
|
507
|
-
maxResponse = np.max(sqScores)
|
|
508
|
-
rangeResponse = max(maxResponse - minResponse, 1.0)
|
|
509
|
-
scores = (
|
|
510
|
-
250 + 750 * (sqScores - minResponse) / rangeResponse
|
|
511
|
-
).astype(int)
|
|
512
|
-
# feature name
|
|
513
|
-
names = [
|
|
514
|
-
f"{templateName}_{cascadeLevel}_{i}"
|
|
515
|
-
for i in relativeMaximaIndices
|
|
516
|
-
]
|
|
517
|
-
# strand
|
|
518
|
-
strands = ["." for _ in range(len(scores))]
|
|
519
|
-
# p-values in -log10 scale per convention
|
|
520
|
-
pValues = -np.log10(
|
|
521
|
-
np.clip(
|
|
522
|
-
ecdfBlockMaximaSF.evaluate(
|
|
523
|
-
responseSequence[relativeMaximaIndices]
|
|
524
|
-
),
|
|
525
|
-
1e-10,
|
|
526
|
-
1.0,
|
|
480
|
+
endsIdx = np.minimum(
|
|
481
|
+
len(values) - 1, candidateIdx + relWindowBins
|
|
527
482
|
)
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
483
|
+
pointSourcesIdx = []
|
|
484
|
+
for s, e in zip(startsIdx, endsIdx):
|
|
485
|
+
pointSourcesIdx.append(
|
|
486
|
+
np.argmax(values[s : e + 1]) + s
|
|
487
|
+
)
|
|
488
|
+
pointSourcesIdx = np.array(pointSourcesIdx)
|
|
489
|
+
starts = intervals[startsIdx]
|
|
490
|
+
ends = intervals[endsIdx]
|
|
491
|
+
pointSourcesAbs = (intervals[pointSourcesIdx]) + max(
|
|
492
|
+
1, intervalLengthBp // 2
|
|
493
|
+
)
|
|
494
|
+
if recenterAtPointSource:
|
|
495
|
+
starts = pointSourcesAbs - (
|
|
496
|
+
relWindowBins * intervalLengthBp
|
|
497
|
+
)
|
|
498
|
+
ends = pointSourcesAbs + (
|
|
499
|
+
relWindowBins * intervalLengthBp
|
|
500
|
+
)
|
|
501
|
+
pointSourcesRel = (
|
|
502
|
+
intervals[pointSourcesIdx] - starts
|
|
503
|
+
) + max(1, intervalLengthBp // 2)
|
|
504
|
+
sqScores = (1 + response[candidateIdx]) ** 2
|
|
505
|
+
minR, maxR = (
|
|
506
|
+
float(np.min(sqScores)),
|
|
507
|
+
float(np.max(sqScores)),
|
|
508
|
+
)
|
|
509
|
+
rangeR = max(maxR - minR, 1.0)
|
|
510
|
+
scores = (
|
|
511
|
+
250 + 750 * (sqScores - minR) / rangeR
|
|
512
|
+
).astype(int)
|
|
513
|
+
for i, idxVal in enumerate(candidateIdx):
|
|
514
|
+
allRows.append(
|
|
515
|
+
{
|
|
516
|
+
"chromosome": chromosome,
|
|
517
|
+
"start": int(starts[i]),
|
|
518
|
+
"end": int(ends[i]),
|
|
519
|
+
"name": f"{templateName}_{cascadeLevel}_{idxVal}_{tag}",
|
|
520
|
+
"score": int(scores[i]),
|
|
521
|
+
"strand": ".",
|
|
522
|
+
"signal": float(response[idxVal]),
|
|
523
|
+
"p_raw": float(pEmp[i]),
|
|
524
|
+
"pointSource": int(pointSourcesRel[i]),
|
|
525
|
+
}
|
|
526
|
+
)
|
|
546
527
|
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
randSeed_ += 1
|
|
528
|
+
if not allRows:
|
|
529
|
+
logger.warning(
|
|
530
|
+
"No matches detected, returning empty DataFrame."
|
|
531
|
+
)
|
|
552
532
|
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
533
|
+
return pd.DataFrame(
|
|
534
|
+
columns=[
|
|
535
|
+
"chromosome",
|
|
536
|
+
"start",
|
|
537
|
+
"end",
|
|
538
|
+
"name",
|
|
539
|
+
"score",
|
|
540
|
+
"strand",
|
|
541
|
+
"signal",
|
|
542
|
+
"pValue",
|
|
543
|
+
"qValue",
|
|
544
|
+
"pointSource",
|
|
545
|
+
]
|
|
546
|
+
)
|
|
559
547
|
|
|
548
|
+
df = pd.DataFrame(allRows)
|
|
549
|
+
qVals = bhFdr(df["p_raw"].values.astype(float))
|
|
550
|
+
df["pValue"] = -np.log10(
|
|
551
|
+
np.clip(df["p_raw"].values, 1.0e-10, 1.0)
|
|
552
|
+
)
|
|
553
|
+
df["qValue"] = -np.log10(np.clip(qVals, 1.0e-10, 1.0))
|
|
554
|
+
df.drop(columns=["p_raw"], inplace=True)
|
|
555
|
+
df = df[qVals <= alpha].copy()
|
|
556
|
+
df["chromosome"] = df["chromosome"].astype(str)
|
|
557
|
+
df.sort_values(by=["chromosome", "start", "end"], inplace=True)
|
|
558
|
+
df.reset_index(drop=True, inplace=True)
|
|
559
|
+
df = df[
|
|
560
|
+
[
|
|
561
|
+
"chromosome",
|
|
562
|
+
"start",
|
|
563
|
+
"end",
|
|
564
|
+
"name",
|
|
565
|
+
"score",
|
|
566
|
+
"strand",
|
|
567
|
+
"signal",
|
|
568
|
+
"pValue",
|
|
569
|
+
"qValue",
|
|
570
|
+
"pointSource",
|
|
571
|
+
]
|
|
572
|
+
]
|
|
573
|
+
return df
|
|
560
574
|
|
|
561
|
-
def mergeMatches(filePath: str, mergeGapBP: int = 50):
|
|
562
|
-
r"""Merge overlapping or nearby structured peaks (matches) in a narrowPeak file.
|
|
563
575
|
|
|
564
|
-
|
|
576
|
+
def mergeMatches(
|
|
577
|
+
filePath: str,
|
|
578
|
+
mergeGapBP: Optional[int],
|
|
579
|
+
) -> Optional[str]:
|
|
580
|
+
r"""Merge overlapping or nearby structured peaks ('matches') in a narrowPeak file.
|
|
581
|
+
|
|
582
|
+
The harmonic mean of p-values and q-values is computed for each merged region within `mergeGapBP` base pairs.
|
|
583
|
+
The fourth column (name) of each merged peak contains information about the number of features that were merged
|
|
584
|
+
and the range of q-values among them.
|
|
585
|
+
|
|
586
|
+
Expects a `narrowPeak <https://genome.ucsc.edu/FAQ/FAQformat.html#format12>`_ file as input (all numeric columns, '.' for strand if unknown).
|
|
565
587
|
|
|
566
588
|
:param filePath: narrowPeak file containing matches detected with :func:`consenrich.matching.matchWavelet`
|
|
567
589
|
:type filePath: str
|
|
568
|
-
:param mergeGapBP: Maximum gap size (in base pairs) to consider for merging
|
|
569
|
-
:type mergeGapBP: int
|
|
590
|
+
:param mergeGapBP: Maximum gap size (in base pairs) to consider for merging. Defaults to 75 bp if `None` or less than 1.
|
|
591
|
+
:type mergeGapBP: Optional[int]
|
|
570
592
|
|
|
571
|
-
:seealso: :class:`consenrich.core.matchingParams`
|
|
593
|
+
:seealso: :ref:`matching`, :class:`consenrich.core.matchingParams`
|
|
572
594
|
"""
|
|
595
|
+
|
|
596
|
+
if mergeGapBP is None or mergeGapBP < 1:
|
|
597
|
+
mergeGapBP = 75
|
|
598
|
+
|
|
599
|
+
MAX_NEGLOGP = 10.0
|
|
600
|
+
MIN_NEGLOGP = 1.0e-10
|
|
601
|
+
|
|
573
602
|
if not os.path.isfile(filePath):
|
|
574
|
-
logger.
|
|
603
|
+
logger.warning(f"Couldn't access {filePath}...skipping merge")
|
|
575
604
|
return None
|
|
576
605
|
bed = None
|
|
577
606
|
try:
|
|
578
607
|
bed = BedTool(filePath)
|
|
579
608
|
except Exception as ex:
|
|
580
|
-
logger.
|
|
609
|
+
logger.warning(
|
|
581
610
|
f"Couldn't create BedTool for {filePath}:\n{ex}\n\nskipping merge..."
|
|
582
611
|
)
|
|
583
612
|
return None
|
|
584
613
|
if bed is None:
|
|
585
|
-
logger.
|
|
614
|
+
logger.warning(
|
|
615
|
+
f"Couldn't create BedTool for {filePath}...skipping merge"
|
|
616
|
+
)
|
|
586
617
|
return None
|
|
587
618
|
|
|
588
619
|
bed = bed.sort()
|
|
@@ -595,41 +626,86 @@ def mergeMatches(filePath: str, mergeGapBP: int = 50):
|
|
|
595
626
|
end = int(fields[2])
|
|
596
627
|
score = float(fields[4])
|
|
597
628
|
signal = float(fields[6])
|
|
598
|
-
|
|
599
|
-
|
|
629
|
+
pLog10 = float(fields[7])
|
|
630
|
+
qLog10 = float(fields[8])
|
|
600
631
|
peak = int(fields[9])
|
|
601
|
-
|
|
602
|
-
if
|
|
603
|
-
groups[
|
|
632
|
+
clusterID = fields[-1]
|
|
633
|
+
if clusterID not in groups:
|
|
634
|
+
groups[clusterID] = {
|
|
604
635
|
"chrom": chrom,
|
|
605
636
|
"sMin": start,
|
|
606
637
|
"eMax": end,
|
|
607
638
|
"scSum": 0.0,
|
|
608
639
|
"sigSum": 0.0,
|
|
609
|
-
"pSum": 0.0,
|
|
610
|
-
"qSum": 0.0,
|
|
611
640
|
"n": 0,
|
|
612
641
|
"maxS": float("-inf"),
|
|
613
642
|
"peakAbs": -1,
|
|
643
|
+
"pMax": float("-inf"),
|
|
644
|
+
"pTail": 0.0,
|
|
645
|
+
"pHasInf": False,
|
|
646
|
+
"qMax": float("-inf"),
|
|
647
|
+
"qMin": float("inf"),
|
|
648
|
+
"qTail": 0.0,
|
|
649
|
+
"qHasInf": False,
|
|
614
650
|
}
|
|
615
|
-
g = groups[
|
|
651
|
+
g = groups[clusterID]
|
|
616
652
|
if start < g["sMin"]:
|
|
617
653
|
g["sMin"] = start
|
|
618
654
|
if end > g["eMax"]:
|
|
619
655
|
g["eMax"] = end
|
|
620
656
|
g["scSum"] += score
|
|
621
657
|
g["sigSum"] += signal
|
|
622
|
-
g["pSum"] += pval
|
|
623
|
-
g["qSum"] += qval
|
|
624
658
|
g["n"] += 1
|
|
625
|
-
|
|
659
|
+
|
|
660
|
+
if math.isinf(pLog10) or pLog10 >= MAX_NEGLOGP:
|
|
661
|
+
g["pHasInf"] = True
|
|
662
|
+
else:
|
|
663
|
+
if pLog10 > g["pMax"]:
|
|
664
|
+
if g["pMax"] == float("-inf"):
|
|
665
|
+
g["pTail"] = 1.0
|
|
666
|
+
else:
|
|
667
|
+
g["pTail"] = (
|
|
668
|
+
g["pTail"] * (10 ** (g["pMax"] - pLog10))
|
|
669
|
+
+ 1.0
|
|
670
|
+
)
|
|
671
|
+
g["pMax"] = pLog10
|
|
672
|
+
else:
|
|
673
|
+
g["pTail"] += 10 ** (pLog10 - g["pMax"])
|
|
674
|
+
|
|
675
|
+
if (
|
|
676
|
+
math.isinf(qLog10)
|
|
677
|
+
or qLog10 >= MAX_NEGLOGP
|
|
678
|
+
or qLog10 <= MIN_NEGLOGP
|
|
679
|
+
):
|
|
680
|
+
g["qHasInf"] = True
|
|
681
|
+
else:
|
|
682
|
+
if qLog10 < g["qMin"]:
|
|
683
|
+
if qLog10 < MIN_NEGLOGP:
|
|
684
|
+
g["qMin"] = MIN_NEGLOGP
|
|
685
|
+
else:
|
|
686
|
+
g["qMin"] = qLog10
|
|
687
|
+
|
|
688
|
+
if qLog10 > g["qMax"]:
|
|
689
|
+
if g["qMax"] == float("-inf"):
|
|
690
|
+
g["qTail"] = 1.0
|
|
691
|
+
else:
|
|
692
|
+
g["qTail"] = (
|
|
693
|
+
g["qTail"] * (10 ** (g["qMax"] - qLog10))
|
|
694
|
+
+ 1.0
|
|
695
|
+
)
|
|
696
|
+
g["qMax"] = qLog10
|
|
697
|
+
else:
|
|
698
|
+
g["qTail"] += 10 ** (qLog10 - g["qMax"])
|
|
699
|
+
|
|
626
700
|
if signal > g["maxS"]:
|
|
627
701
|
g["maxS"] = signal
|
|
628
702
|
g["peakAbs"] = start + peak if peak >= 0 else -1
|
|
703
|
+
|
|
629
704
|
items = []
|
|
630
|
-
for
|
|
705
|
+
for clusterID, g in groups.items():
|
|
631
706
|
items.append((g["chrom"], g["sMin"], g["eMax"], g))
|
|
632
707
|
items.sort(key=lambda x: (str(x[0]), x[1], x[2]))
|
|
708
|
+
|
|
633
709
|
outPath = f"{filePath.replace('.narrowPeak', '')}.mergedMatches.narrowPeak"
|
|
634
710
|
lines = []
|
|
635
711
|
i = 0
|
|
@@ -642,69 +718,68 @@ def mergeMatches(filePath: str, mergeGapBP: int = 50):
|
|
|
642
718
|
avgScore = 1000
|
|
643
719
|
scoreInt = int(round(avgScore))
|
|
644
720
|
sigAvg = g["sigSum"] / g["n"]
|
|
645
|
-
pAvg = g["pSum"] / g["n"]
|
|
646
|
-
qAvg = g["qSum"] / g["n"]
|
|
647
|
-
pointSource = g["peakAbs"] - sMin if g["peakAbs"] >= 0 else -1
|
|
648
|
-
name = f"mergedPeak{i}"
|
|
649
|
-
lines.append(
|
|
650
|
-
f"{chrom}\t{int(sMin)}\t{int(eMax)}\t{name}\t{scoreInt}\t.\t{sigAvg:.3f}\t{pAvg:.3f}\t{qAvg:.3f}\t{int(pointSource)}"
|
|
651
|
-
)
|
|
652
|
-
with open(outPath, "w") as outF:
|
|
653
|
-
outF.write("\n".join(lines) + ("\n" if lines else ""))
|
|
654
|
-
logger.info(f"Merged matches written to {outPath}")
|
|
655
|
-
return outPath
|
|
656
721
|
|
|
722
|
+
if g["pHasInf"]:
|
|
723
|
+
pHMLog10 = MAX_NEGLOGP
|
|
724
|
+
else:
|
|
725
|
+
if (
|
|
726
|
+
g["pMax"] == float("-inf")
|
|
727
|
+
or not (g["pTail"] > 0.0)
|
|
728
|
+
or math.isnan(g["pTail"])
|
|
729
|
+
):
|
|
730
|
+
pHMLog10 = MIN_NEGLOGP
|
|
731
|
+
else:
|
|
732
|
+
pHMLog10 = -math.log10(g["n"]) + (
|
|
733
|
+
g["pMax"] + math.log10(g["pTail"])
|
|
734
|
+
)
|
|
735
|
+
pHMLog10 = max(
|
|
736
|
+
MIN_NEGLOGP, min(pHMLog10, MAX_NEGLOGP)
|
|
737
|
+
)
|
|
657
738
|
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
)
|
|
665
|
-
|
|
739
|
+
if g["qHasInf"]:
|
|
740
|
+
qHMLog10 = MAX_NEGLOGP
|
|
741
|
+
else:
|
|
742
|
+
if (
|
|
743
|
+
g["qMax"] == float("-inf")
|
|
744
|
+
or not (g["qTail"] > 0.0)
|
|
745
|
+
or math.isnan(g["qTail"])
|
|
746
|
+
):
|
|
747
|
+
qHMLog10 = MIN_NEGLOGP
|
|
748
|
+
else:
|
|
749
|
+
qHMLog10 = -math.log10(g["n"]) + (
|
|
750
|
+
g["qMax"] + math.log10(g["qTail"])
|
|
751
|
+
)
|
|
752
|
+
qHMLog10 = max(
|
|
753
|
+
MIN_NEGLOGP, min(qHMLog10, MAX_NEGLOGP)
|
|
754
|
+
)
|
|
666
755
|
|
|
667
|
-
|
|
668
|
-
|
|
756
|
+
pointSource = (
|
|
757
|
+
g["peakAbs"] - sMin
|
|
758
|
+
if g["peakAbs"] >= 0
|
|
759
|
+
else (eMax - sMin) // 2
|
|
760
|
+
)
|
|
669
761
|
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
binIndex -= 1
|
|
687
|
-
binCounts[binIndex] += 1
|
|
688
|
-
valueSeries = (
|
|
689
|
-
[countValue / len(nullBlockMaximaSFVals) for countValue in binCounts]
|
|
690
|
-
if normalize
|
|
691
|
-
else binCounts[:]
|
|
692
|
-
)
|
|
693
|
-
valueMaximum = max(valueSeries) if valueSeries else 0
|
|
694
|
-
widthScale = (barWidth / valueMaximum) if valueMaximum > 0 else 0
|
|
695
|
-
edgeFormat = f"{{:.{2}f}}"
|
|
696
|
-
rangeLabels = [
|
|
697
|
-
f"[{edgeFormat.format(binEdges[indexValue])},{edgeFormat.format(binEdges[indexValue + 1])})"
|
|
698
|
-
for indexValue in range(binCount)
|
|
699
|
-
]
|
|
700
|
-
labelWidth = max(len(textValue) for textValue in rangeLabels)
|
|
701
|
-
lines = ['Histogram: "1 - ECDF(nullBlockMaxima)"']
|
|
702
|
-
for rangeLabel, seriesValue, countValue in zip(
|
|
703
|
-
rangeLabels, valueSeries, binCounts
|
|
704
|
-
):
|
|
705
|
-
barString = barChar * int(round(seriesValue * widthScale))
|
|
706
|
-
trailingText = f"({countValue}/{len(nullBlockMaximaSFVals)})\t\t"
|
|
762
|
+
qMinLog10 = g["qMin"]
|
|
763
|
+
qMaxLog10 = g["qMax"]
|
|
764
|
+
if math.isfinite(qMinLog10) and qMinLog10 < MIN_NEGLOGP:
|
|
765
|
+
qMinLog10 = MIN_NEGLOGP
|
|
766
|
+
if math.isfinite(qMaxLog10) and qMaxLog10 > MAX_NEGLOGP:
|
|
767
|
+
qMaxLog10 = MAX_NEGLOGP
|
|
768
|
+
elif (
|
|
769
|
+
not math.isfinite(qMaxLog10)
|
|
770
|
+
or not math.isfinite(qMinLog10)
|
|
771
|
+
) or (qMaxLog10 < MIN_NEGLOGP):
|
|
772
|
+
qMinLog10 = 0.0
|
|
773
|
+
qMaxLog10 = 0.0
|
|
774
|
+
|
|
775
|
+
# informative+parsable name
|
|
776
|
+
# e.g., regex: ^consenrichPeak\|i=(?P<i>\d+)\|gap=(?P<gap>\d+)bp\|ct=(?P<ct>\d+)\|qRange=(?P<qmin>\d+\.\d{3})_(?P<qmax>\d+\.\d{3})$
|
|
777
|
+
name = f"consenrichPeak|i={i}|gap={mergeGapBP}bp|ct={g['n']}|qRange={qMinLog10:.3f}_{qMaxLog10:.3f}"
|
|
707
778
|
lines.append(
|
|
708
|
-
f"{
|
|
779
|
+
f"{chrom}\t{int(sMin)}\t{int(eMax)}\t{name}\t{scoreInt}\t.\t{sigAvg:.3f}\t{pHMLog10:.3f}\t{qHMLog10:.3f}\t{int(pointSource)}"
|
|
709
780
|
)
|
|
710
|
-
|
|
781
|
+
|
|
782
|
+
with open(outPath, "w") as outF:
|
|
783
|
+
outF.write("\n".join(lines) + ("\n" if lines else ""))
|
|
784
|
+
logger.info(f"Merged matches written to {outPath}")
|
|
785
|
+
return outPath
|