consenrich 0.7.0b1__cp312-cp312-macosx_11_0_arm64.whl → 0.7.1b2__cp312-cp312-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of consenrich might be problematic. Click here for more details.
- consenrich/cconsenrich.c +174 -174
- consenrich/cconsenrich.cpython-312-darwin.so +0 -0
- consenrich/consenrich.py +273 -77
- consenrich/core.py +11 -9
- consenrich/matching.py +513 -373
- {consenrich-0.7.0b1.dist-info → consenrich-0.7.1b2.dist-info}/METADATA +1 -1
- {consenrich-0.7.0b1.dist-info → consenrich-0.7.1b2.dist-info}/RECORD +11 -11
- {consenrich-0.7.0b1.dist-info → consenrich-0.7.1b2.dist-info}/WHEEL +0 -0
- {consenrich-0.7.0b1.dist-info → consenrich-0.7.1b2.dist-info}/entry_points.txt +0 -0
- {consenrich-0.7.0b1.dist-info → consenrich-0.7.1b2.dist-info}/licenses/LICENSE +0 -0
- {consenrich-0.7.0b1.dist-info → consenrich-0.7.1b2.dist-info}/top_level.txt +0 -0
consenrich/matching.py
CHANGED
|
@@ -3,6 +3,7 @@ r"""Module implementing (experimental) 'structured peak detection' features usin
|
|
|
3
3
|
|
|
4
4
|
import logging
|
|
5
5
|
import os
|
|
6
|
+
import math
|
|
6
7
|
from pybedtools import BedTool
|
|
7
8
|
from typing import List, Optional
|
|
8
9
|
|
|
@@ -23,13 +24,64 @@ logging.basicConfig(
|
|
|
23
24
|
logger = logging.getLogger(__name__)
|
|
24
25
|
|
|
25
26
|
|
|
27
|
+
def autoMinLengthIntervals(
|
|
28
|
+
values: np.ndarray, initLen: int = 3
|
|
29
|
+
) -> int:
|
|
30
|
+
r"""Determines a minimum matching length (in interval units) based on the input signal values.
|
|
31
|
+
|
|
32
|
+
Returns the mean length of non-zero contiguous segments in a log-scaled/centered version of `values`
|
|
33
|
+
|
|
34
|
+
:param values: A 1D array of signal-like values.
|
|
35
|
+
:type values: np.ndarray
|
|
36
|
+
:param initLen: Initial minimum length (in intervals). Defaults to 3.
|
|
37
|
+
:type initLen: int
|
|
38
|
+
:return: Estimated minimum matching length (in intervals)
|
|
39
|
+
:rtype: int
|
|
40
|
+
|
|
41
|
+
"""
|
|
42
|
+
trValues = np.asinh(values) - signal.medfilt(
|
|
43
|
+
np.asinh(values),
|
|
44
|
+
kernel_size=
|
|
45
|
+
max(
|
|
46
|
+
(2 * initLen) + 1,
|
|
47
|
+
2 * (int(len(values) * 0.005)) + 1,
|
|
48
|
+
)
|
|
49
|
+
)
|
|
50
|
+
nz = trValues[trValues > 0]
|
|
51
|
+
if len(nz) == 0:
|
|
52
|
+
return initLen
|
|
53
|
+
thr = np.quantile(nz, 0.90, method="interpolated_inverted_cdf")
|
|
54
|
+
mask = nz >= thr
|
|
55
|
+
if not np.any(mask):
|
|
56
|
+
return initLen
|
|
57
|
+
idx = np.flatnonzero(np.diff(np.r_[False, mask, False]))
|
|
58
|
+
runs = idx.reshape(-1, 2)
|
|
59
|
+
widths = runs[:, 1] - runs[:, 0]
|
|
60
|
+
widths = widths[widths >= initLen]
|
|
61
|
+
if len(widths) == 0:
|
|
62
|
+
return initLen
|
|
63
|
+
return int(np.mean(widths))
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def scalarClip(value: float, low: float, high: float) -> float:
|
|
67
|
+
return low if value < low else high if value > high else value
|
|
68
|
+
|
|
69
|
+
|
|
26
70
|
def castableToFloat(value) -> bool:
|
|
27
71
|
if value is None:
|
|
28
72
|
return False
|
|
29
73
|
if isinstance(value, bool):
|
|
30
74
|
return False
|
|
31
75
|
if isinstance(value, str):
|
|
32
|
-
if value.lower().replace(
|
|
76
|
+
if value.lower().replace(" ", "") in [
|
|
77
|
+
"nan",
|
|
78
|
+
"inf",
|
|
79
|
+
"-inf",
|
|
80
|
+
"infinity",
|
|
81
|
+
"-infinity",
|
|
82
|
+
"",
|
|
83
|
+
" ",
|
|
84
|
+
]:
|
|
33
85
|
return False
|
|
34
86
|
|
|
35
87
|
try:
|
|
@@ -75,7 +127,11 @@ def matchExistingBedGraph(
|
|
|
75
127
|
)
|
|
76
128
|
|
|
77
129
|
if mergeGapBP is None:
|
|
78
|
-
mergeGapBP = (
|
|
130
|
+
mergeGapBP = (
|
|
131
|
+
(minMatchLengthBP // 2) + 1
|
|
132
|
+
if minMatchLengthBP is not None
|
|
133
|
+
else 75
|
|
134
|
+
)
|
|
79
135
|
|
|
80
136
|
allowedTemplates = [
|
|
81
137
|
x for x in pw.wavelist(kind="discrete") if "bio" not in x
|
|
@@ -107,7 +163,7 @@ def matchExistingBedGraph(
|
|
|
107
163
|
for chrom_ in sorted(bedGraphDF["chromosome"].unique()):
|
|
108
164
|
df_ = bedGraphDF[bedGraphDF["chromosome"] == chrom_]
|
|
109
165
|
if len(df_) < 5:
|
|
110
|
-
logger.info(f"Skipping {chrom_}:
|
|
166
|
+
logger.info(f"Skipping {chrom_}: less than 5 intervals.")
|
|
111
167
|
continue
|
|
112
168
|
|
|
113
169
|
try:
|
|
@@ -129,7 +185,9 @@ def matchExistingBedGraph(
|
|
|
129
185
|
randSeed=randSeed,
|
|
130
186
|
)
|
|
131
187
|
except Exception as ex:
|
|
132
|
-
logger.info(
|
|
188
|
+
logger.info(
|
|
189
|
+
f"Skipping {chrom_} due to error in matchWavelet: {ex}"
|
|
190
|
+
)
|
|
133
191
|
continue
|
|
134
192
|
|
|
135
193
|
if df__.empty:
|
|
@@ -145,7 +203,9 @@ def matchExistingBedGraph(
|
|
|
145
203
|
outPaths.append(perChromOut)
|
|
146
204
|
|
|
147
205
|
if merge:
|
|
148
|
-
mergedPath = mergeMatches(
|
|
206
|
+
mergedPath = mergeMatches(
|
|
207
|
+
perChromOut, mergeGapBP=mergeGapBP
|
|
208
|
+
)
|
|
149
209
|
if mergedPath is not None:
|
|
150
210
|
logger.info(f"Merged matches written to {mergedPath}")
|
|
151
211
|
outPathsMerged.append(mergedPath)
|
|
@@ -177,7 +237,9 @@ def matchExistingBedGraph(
|
|
|
177
237
|
with open(path, "r") as inF:
|
|
178
238
|
for line in inF:
|
|
179
239
|
outF.write(line)
|
|
180
|
-
logger.info(
|
|
240
|
+
logger.info(
|
|
241
|
+
f"All merged matches written to {outPathMergedAll}"
|
|
242
|
+
)
|
|
181
243
|
|
|
182
244
|
for path_ in outPaths + outPathsMerged:
|
|
183
245
|
try:
|
|
@@ -211,34 +273,38 @@ def matchWavelet(
|
|
|
211
273
|
excludeRegionsBedFile: Optional[str] = None,
|
|
212
274
|
weights: Optional[npt.NDArray[np.float64]] = None,
|
|
213
275
|
) -> pd.DataFrame:
|
|
214
|
-
r"""Detect structured peaks
|
|
276
|
+
r"""Detect structured peaks in Consenrich tracks by matching wavelet- or scaling-function–based templates.
|
|
215
277
|
|
|
216
278
|
:param chromosome: Chromosome name for the input intervals and values.
|
|
217
279
|
:type chromosome: str
|
|
218
|
-
:param values:
|
|
280
|
+
:param values: A 1D array of signal-like values. In this documentation, we refer to values derived from Consenrich,
|
|
281
|
+
but other continuous-valued tracks at evenly spaced genomic intervals may be suitable, too.
|
|
219
282
|
:type values: npt.NDArray[np.float64]
|
|
220
|
-
:param templateNames: A list of str values -- wavelet
|
|
283
|
+
:param templateNames: A list of str values -- each entry references a mother wavelet (or its corresponding scaling function). e.g., `[haar, db2]`
|
|
221
284
|
:type templateNames: List[str]
|
|
222
|
-
:param cascadeLevels:
|
|
223
|
-
the
|
|
285
|
+
:param cascadeLevels: Number of cascade iterations used to approximate each template (wavelet or scaling function).
|
|
286
|
+
Must have the same length as `templateNames`, with each entry aligned to the
|
|
287
|
+
corresponding template. e.g., given templateNames `[haar, db2]`, then `[2,2]` would use 2 cascade levels for both templates.
|
|
224
288
|
:type cascadeLevels: List[int]
|
|
225
289
|
:param iters: Number of random blocks to sample in the response sequence while building
|
|
226
290
|
an empirical null to test significance. See :func:`cconsenrich.csampleBlockStats`.
|
|
227
291
|
:type iters: int
|
|
228
292
|
:param alpha: Primary significance threshold on detected matches. Specifically, the
|
|
229
|
-
|
|
230
|
-
|
|
293
|
+
minimum corr. empirical p-value approximated from randomly sampled blocks in the
|
|
294
|
+
response sequence.
|
|
231
295
|
:type alpha: float
|
|
232
296
|
:param minMatchLengthBP: Within a window of `minMatchLengthBP` length (bp), relative maxima in
|
|
233
297
|
the signal-template convolution must be greater in value than others to qualify as matches.
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
If a `
|
|
298
|
+
If set to a value less than 1, the minimum length is determined via :func:`consenrich.matching.autoMinLengthIntervals`.
|
|
299
|
+
If set to `None`, defaults to 250 bp.
|
|
300
|
+
:type minMatchLengthBP: Optional[int]
|
|
301
|
+
:param minSignalAtMaxima: Secondary significance threshold coupled with `alpha`. Requires the *signal value*
|
|
302
|
+
at relative maxima in the response sequence to be greater than this threshold. Comparisons are made in log-scale
|
|
303
|
+
to temper genome-wide dynamic range. If a `float` value is provided, the minimum signal value must be greater
|
|
304
|
+
than this (absolute) value. *Set to a negative value to disable the threshold*.
|
|
305
|
+
If a `str` value is provided, looks for 'q:quantileValue', e.g., 'q:0.90'. The
|
|
240
306
|
threshold is then set to the corresponding quantile of the non-zero signal estimates.
|
|
241
|
-
Defaults to str value 'q:0.75' --- the
|
|
307
|
+
Defaults to str value 'q:0.75' --- the 75th percentile of signal values.
|
|
242
308
|
:type minSignalAtMaxima: Optional[str | float]
|
|
243
309
|
:param useScalingFunction: If True, use (only) the scaling function to build the matching template.
|
|
244
310
|
If False, use (only) the wavelet function.
|
|
@@ -247,342 +313,372 @@ def matchWavelet(
|
|
|
247
313
|
:type excludeRegionsBedFile: Optional[str]
|
|
248
314
|
|
|
249
315
|
:seealso: :class:`consenrich.core.matchingParams`, :func:`cconsenrich.csampleBlockStats`, :ref:`matching`
|
|
316
|
+
:return: A pandas DataFrame with detected matches
|
|
317
|
+
:rtype: pd.DataFrame
|
|
250
318
|
"""
|
|
251
319
|
|
|
320
|
+
rng = np.random.default_rng(int(randSeed))
|
|
252
321
|
if len(intervals) < 5:
|
|
253
322
|
raise ValueError("`intervals` must be at least length 5")
|
|
254
|
-
if len(values) != len(intervals):
|
|
255
|
-
raise ValueError("`values` must have the same length as `intervals`")
|
|
256
|
-
intervalLengthBP = intervals[1] - intervals[0]
|
|
257
|
-
if not np.all(np.abs(np.diff(intervals)) == intervalLengthBP):
|
|
258
|
-
# FFR: don't change this exception message without updating tests
|
|
259
|
-
# --'spaced' is matched in tests
|
|
260
|
-
raise ValueError("`intervals` must be evenly spaced.")
|
|
261
323
|
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
"end",
|
|
267
|
-
"name",
|
|
268
|
-
"score",
|
|
269
|
-
"strand",
|
|
270
|
-
"signal",
|
|
271
|
-
"pValue",
|
|
272
|
-
"qValue",
|
|
273
|
-
"pointSource",
|
|
274
|
-
]
|
|
275
|
-
matchDF = pd.DataFrame(columns=cols)
|
|
276
|
-
minMatchLengthBPCopy: Optional[int] = minMatchLengthBP
|
|
277
|
-
cascadeLevels = sorted(list(set(cascadeLevels)))
|
|
278
|
-
if weights is not None and len(weights) == len(values):
|
|
279
|
-
values = values * weights
|
|
280
|
-
asinhValues = np.asinh(values, dtype=np.float32)
|
|
281
|
-
asinhNonZeroValues = asinhValues[asinhValues > 0]
|
|
282
|
-
iters = max(iters, 1000)
|
|
283
|
-
defQuantile: float = 0.75
|
|
284
|
-
for l_, cascadeLevel in enumerate(cascadeLevels):
|
|
285
|
-
for t_, templateName in enumerate(templateNames):
|
|
286
|
-
try:
|
|
287
|
-
templateName = str(templateName)
|
|
288
|
-
cascadeLevel = int(cascadeLevel)
|
|
289
|
-
except ValueError:
|
|
290
|
-
logger.info(
|
|
291
|
-
f"Skipping invalid templateName or cascadeLevel: {templateName}, {cascadeLevel}"
|
|
292
|
-
)
|
|
293
|
-
continue
|
|
294
|
-
if templateName not in pw.wavelist(kind="discrete"):
|
|
295
|
-
logger.info(
|
|
296
|
-
f"\nSkipping unknown wavelet template: {templateName}\nAvailable templates: {pw.wavelist(kind='discrete')}"
|
|
297
|
-
)
|
|
298
|
-
continue
|
|
299
|
-
|
|
300
|
-
wav = pw.Wavelet(templateName)
|
|
301
|
-
scalingFunc, waveletFunc, x = wav.wavefun(level=cascadeLevel)
|
|
302
|
-
template = np.array(waveletFunc, dtype=np.float64) / np.linalg.norm(
|
|
303
|
-
waveletFunc
|
|
304
|
-
)
|
|
305
|
-
|
|
306
|
-
if useScalingFunction:
|
|
307
|
-
template = np.array(
|
|
308
|
-
scalingFunc, dtype=np.float64
|
|
309
|
-
) / np.linalg.norm(scalingFunc)
|
|
324
|
+
if len(values) != len(intervals):
|
|
325
|
+
raise ValueError(
|
|
326
|
+
"`values` must have the same length as `intervals`"
|
|
327
|
+
)
|
|
310
328
|
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
329
|
+
if len(templateNames) != len(cascadeLevels):
|
|
330
|
+
raise ValueError(
|
|
331
|
+
"\n\t`templateNames` and `cascadeLevels` must have the same length."
|
|
332
|
+
"\n\tSet products are not supported, i.e., each template needs an explicitly defined cascade level."
|
|
333
|
+
"\t\ne.g., for `templateNames = [haar, db2]`, use `cascadeLevels = [2, 2]`, not `[2]`.\n"
|
|
334
|
+
)
|
|
314
335
|
|
|
315
|
-
|
|
316
|
-
values, template[::-1], mode="same"
|
|
317
|
-
)
|
|
336
|
+
intervalLengthBp = intervals[1] - intervals[0]
|
|
318
337
|
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
)
|
|
338
|
+
if minMatchLengthBP is not None and minMatchLengthBP < 1:
|
|
339
|
+
minMatchLengthBP = (
|
|
340
|
+
autoMinLengthIntervals(values) * int(intervalLengthBp)
|
|
341
|
+
)
|
|
342
|
+
elif minMatchLengthBP is None:
|
|
343
|
+
minMatchLengthBP = 250
|
|
326
344
|
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
relativeMaximaWindow = max(relativeMaximaWindow, 1)
|
|
331
|
-
|
|
332
|
-
excludeMask = np.zeros(len(intervals), dtype=np.uint8)
|
|
333
|
-
if excludeRegionsBedFile is not None:
|
|
334
|
-
excludeMask = core.getBedMask(
|
|
335
|
-
chromosome,
|
|
336
|
-
excludeRegionsBedFile,
|
|
337
|
-
intervals,
|
|
338
|
-
)
|
|
345
|
+
logger.info(
|
|
346
|
+
f"\n\tUsing minMatchLengthBP: {minMatchLengthBP}"
|
|
347
|
+
)
|
|
339
348
|
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
relativeMaximaWindow,
|
|
348
|
-
iters * 2,
|
|
349
|
-
randSeed_,
|
|
350
|
-
excludeMask.astype(np.uint8),
|
|
351
|
-
),
|
|
352
|
-
dtype=float,
|
|
349
|
+
if not np.all(np.abs(np.diff(intervals)) == intervalLengthBp):
|
|
350
|
+
raise ValueError("`intervals` must be evenly spaced.")
|
|
351
|
+
|
|
352
|
+
if weights is not None:
|
|
353
|
+
if len(weights) != len(values):
|
|
354
|
+
logger.warning(
|
|
355
|
+
f"`weights` length {len(weights)} does not match `values` length {len(values)}. Ignoring..."
|
|
353
356
|
)
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
np.quantile(
|
|
393
|
-
asinhNonZeroValues,
|
|
394
|
-
qVal,
|
|
395
|
-
method="interpolated_inverted_cdf",
|
|
396
|
-
)
|
|
357
|
+
else:
|
|
358
|
+
values = values * weights
|
|
359
|
+
|
|
360
|
+
asinhValues = np.asinh(values, dtype=np.float32)
|
|
361
|
+
asinhNonZeroValues = asinhValues[asinhValues > 0]
|
|
362
|
+
iters = max(int(iters), 1000)
|
|
363
|
+
defQuantile = 0.75
|
|
364
|
+
chromMin = int(intervals[0])
|
|
365
|
+
chromMax = int(intervals[-1])
|
|
366
|
+
chromMid = chromMin + (chromMax - chromMin) // 2 # for split
|
|
367
|
+
halfLeftMask = intervals < chromMid
|
|
368
|
+
halfRightMask = ~halfLeftMask
|
|
369
|
+
excludeMaskGlobal = np.zeros(len(intervals), dtype=np.uint8)
|
|
370
|
+
if excludeRegionsBedFile is not None:
|
|
371
|
+
excludeMaskGlobal = core.getBedMask(
|
|
372
|
+
chromosome, excludeRegionsBedFile, intervals
|
|
373
|
+
).astype(np.uint8)
|
|
374
|
+
allRows = []
|
|
375
|
+
|
|
376
|
+
def bhFdr(p: np.ndarray) -> np.ndarray:
|
|
377
|
+
m = len(p)
|
|
378
|
+
order = np.argsort(p, kind="mergesort")
|
|
379
|
+
ranked = np.arange(1, m + 1, dtype=float)
|
|
380
|
+
q = (p[order] * m) / ranked
|
|
381
|
+
q = np.minimum.accumulate(q[::-1])[::-1]
|
|
382
|
+
out = np.empty_like(q)
|
|
383
|
+
out[order] = q
|
|
384
|
+
return np.clip(out, 0.0, 1.0)
|
|
385
|
+
|
|
386
|
+
def parseMinSignalThreshold(val):
|
|
387
|
+
if val is None:
|
|
388
|
+
return -1e6
|
|
389
|
+
if isinstance(val, str):
|
|
390
|
+
if val.startswith("q:"):
|
|
391
|
+
qVal = float(val.split("q:")[-1])
|
|
392
|
+
if not (0 <= qVal <= 1):
|
|
393
|
+
raise ValueError(
|
|
394
|
+
f"Quantile {qVal} is out of range"
|
|
397
395
|
)
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
arsinhSignalThreshold = -float(1e6)
|
|
404
|
-
else:
|
|
405
|
-
# use supplied value
|
|
406
|
-
arsinhSignalThreshold = np.asinh(
|
|
407
|
-
float(minSignalAtMaxima)
|
|
408
|
-
)
|
|
409
|
-
else:
|
|
410
|
-
# case: not in known format, not castable to a float, use defaults
|
|
411
|
-
logger.info(
|
|
412
|
-
f"Couldn't parse `minSignalAtMaxima` value: {minSignalAtMaxima}, using default"
|
|
396
|
+
return float(
|
|
397
|
+
np.quantile(
|
|
398
|
+
asinhNonZeroValues,
|
|
399
|
+
qVal,
|
|
400
|
+
method="interpolated_inverted_cdf",
|
|
413
401
|
)
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
402
|
+
)
|
|
403
|
+
elif castableToFloat(val):
|
|
404
|
+
v = float(val)
|
|
405
|
+
return -1e6 if v < 0 else float(np.asinh(v))
|
|
406
|
+
else:
|
|
407
|
+
return float(
|
|
408
|
+
np.quantile(
|
|
409
|
+
asinhNonZeroValues,
|
|
410
|
+
defQuantile,
|
|
411
|
+
method="interpolated_inverted_cdf",
|
|
420
412
|
)
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
relativeMaximaIndices = signal.argrelmax(
|
|
435
|
-
responseSequence, order=relativeMaximaWindow
|
|
436
|
-
)[0]
|
|
413
|
+
)
|
|
414
|
+
if isinstance(val, (float, int)):
|
|
415
|
+
v = float(val)
|
|
416
|
+
return -1e6 if v < 0 else float(np.asinh(v))
|
|
417
|
+
return float(
|
|
418
|
+
np.quantile(
|
|
419
|
+
asinhNonZeroValues,
|
|
420
|
+
defQuantile,
|
|
421
|
+
method="interpolated_inverted_cdf",
|
|
422
|
+
)
|
|
423
|
+
)
|
|
437
424
|
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
425
|
+
def relativeMaxima(
|
|
426
|
+
resp: np.ndarray, orderBins: int
|
|
427
|
+
) -> np.ndarray:
|
|
428
|
+
return signal.argrelmax(resp, order=max(int(orderBins), 1))[0]
|
|
429
|
+
|
|
430
|
+
def sampleBlockMaxima(
|
|
431
|
+
resp: np.ndarray,
|
|
432
|
+
halfMask: np.ndarray,
|
|
433
|
+
relWindowBins: int,
|
|
434
|
+
nsamp: int,
|
|
435
|
+
seed: int,
|
|
436
|
+
):
|
|
437
|
+
exMask = excludeMaskGlobal.astype(np.uint8).copy()
|
|
438
|
+
exMask |= (~halfMask).astype(np.uint8)
|
|
439
|
+
vals = np.array(
|
|
440
|
+
cconsenrich.csampleBlockStats(
|
|
441
|
+
intervals.astype(np.uint32),
|
|
442
|
+
resp,
|
|
443
|
+
int(relWindowBins),
|
|
444
|
+
int(nsamp),
|
|
445
|
+
int(seed),
|
|
446
|
+
exMask.astype(np.uint8),
|
|
447
|
+
),
|
|
448
|
+
dtype=float,
|
|
449
|
+
)
|
|
450
|
+
if len(vals) == 0:
|
|
451
|
+
return vals
|
|
452
|
+
low = np.quantile(vals, 0.001)
|
|
453
|
+
high = np.quantile(vals, 0.999)
|
|
454
|
+
return vals[(vals > low) & (vals < high)]
|
|
455
|
+
|
|
456
|
+
for templateName, cascadeLevel in zip(
|
|
457
|
+
templateNames, cascadeLevels
|
|
458
|
+
):
|
|
459
|
+
if templateName not in pw.wavelist(kind="discrete"):
|
|
460
|
+
logger.warning(
|
|
461
|
+
f"Skipping unknown wavelet template: {templateName}"
|
|
462
|
+
)
|
|
463
|
+
continue
|
|
442
464
|
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
465
|
+
wav = pw.Wavelet(str(templateName))
|
|
466
|
+
scalingFunc, waveletFunc, _ = wav.wavefun(
|
|
467
|
+
level=int(cascadeLevel)
|
|
468
|
+
)
|
|
469
|
+
template = np.array(
|
|
470
|
+
scalingFunc if useScalingFunction else waveletFunc,
|
|
471
|
+
dtype=np.float64,
|
|
472
|
+
)
|
|
473
|
+
template /= np.linalg.norm(template)
|
|
448
474
|
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
-maxNumMatches:
|
|
455
|
-
]
|
|
456
|
-
]
|
|
475
|
+
logger.info(
|
|
476
|
+
f"\n\tMatching template: {templateName}"
|
|
477
|
+
f"\n\tcascade level: {cascadeLevel}"
|
|
478
|
+
f"\n\ttemplate length: {len(template)}"
|
|
479
|
+
)
|
|
457
480
|
|
|
458
|
-
|
|
459
|
-
|
|
481
|
+
# efficient FFT-based cross-correlation
|
|
482
|
+
# (OA may be better for smaller templates, TODO add a check)
|
|
483
|
+
response = signal.fftconvolve(
|
|
484
|
+
values, template[::-1], mode="same"
|
|
485
|
+
)
|
|
486
|
+
thisMinMatchBp = minMatchLengthBP
|
|
487
|
+
if thisMinMatchBp is None or thisMinMatchBp < 1:
|
|
488
|
+
thisMinMatchBp = len(template) * intervalLengthBp
|
|
489
|
+
if thisMinMatchBp % intervalLengthBp != 0:
|
|
490
|
+
thisMinMatchBp += intervalLengthBp - (
|
|
491
|
+
thisMinMatchBp % intervalLengthBp
|
|
460
492
|
)
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
493
|
+
relWindowBins = int(
|
|
494
|
+
((thisMinMatchBp / intervalLengthBp) / 2) + 1
|
|
495
|
+
)
|
|
496
|
+
relWindowBins = max(relWindowBins, 1)
|
|
497
|
+
asinhThreshold = parseMinSignalThreshold(minSignalAtMaxima)
|
|
498
|
+
for nullMask, testMask, tag in [
|
|
499
|
+
(halfLeftMask, halfRightMask, "R"),
|
|
500
|
+
(halfRightMask, halfLeftMask, "L"),
|
|
501
|
+
]:
|
|
502
|
+
blockMaxima = sampleBlockMaxima(
|
|
503
|
+
response,
|
|
504
|
+
nullMask,
|
|
505
|
+
relWindowBins,
|
|
506
|
+
nsamp=max(iters, 1000),
|
|
507
|
+
seed=rng.integers(1, 10_000),
|
|
465
508
|
)
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
509
|
+
if len(blockMaxima) < 25:
|
|
510
|
+
pooledMask = ~excludeMaskGlobal.astype(bool)
|
|
511
|
+
blockMaxima = sampleBlockMaxima(
|
|
512
|
+
response,
|
|
513
|
+
pooledMask,
|
|
514
|
+
relWindowBins,
|
|
515
|
+
nsamp=max(iters, 1000),
|
|
516
|
+
seed=rng.integers(1, 10_000),
|
|
517
|
+
)
|
|
518
|
+
ecdfSf = stats.ecdf(blockMaxima).sf
|
|
519
|
+
candidateIdx = relativeMaxima(response, relWindowBins)
|
|
520
|
+
|
|
521
|
+
candidateMask = (
|
|
522
|
+
(candidateIdx >= relWindowBins)
|
|
523
|
+
& (candidateIdx < len(response) - relWindowBins)
|
|
524
|
+
& (testMask[candidateIdx])
|
|
525
|
+
& (excludeMaskGlobal[candidateIdx] == 0)
|
|
526
|
+
& (asinhValues[candidateIdx] > asinhThreshold)
|
|
472
527
|
)
|
|
473
528
|
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
529
|
+
candidateIdx = candidateIdx[candidateMask]
|
|
530
|
+
if len(candidateIdx) == 0:
|
|
531
|
+
continue
|
|
532
|
+
if (
|
|
533
|
+
maxNumMatches is not None
|
|
534
|
+
and len(candidateIdx) > maxNumMatches
|
|
535
|
+
):
|
|
536
|
+
candidateIdx = candidateIdx[
|
|
537
|
+
np.argsort(asinhValues[candidateIdx])[
|
|
538
|
+
-maxNumMatches:
|
|
539
|
+
]
|
|
540
|
+
]
|
|
541
|
+
pEmp = np.clip(
|
|
542
|
+
ecdfSf.evaluate(response[candidateIdx]),
|
|
543
|
+
1.0e-10,
|
|
544
|
+
1.0,
|
|
477
545
|
)
|
|
478
|
-
|
|
546
|
+
startsIdx = np.maximum(candidateIdx - relWindowBins, 0)
|
|
479
547
|
endsIdx = np.minimum(
|
|
480
|
-
len(values) - 1,
|
|
548
|
+
len(values) - 1, candidateIdx + relWindowBins
|
|
481
549
|
)
|
|
482
|
-
# point source
|
|
483
550
|
pointSourcesIdx = []
|
|
484
|
-
for
|
|
551
|
+
for s, e in zip(startsIdx, endsIdx):
|
|
485
552
|
pointSourcesIdx.append(
|
|
486
|
-
np.argmax(values[
|
|
553
|
+
np.argmax(values[s : e + 1]) + s
|
|
487
554
|
)
|
|
488
555
|
pointSourcesIdx = np.array(pointSourcesIdx)
|
|
489
556
|
starts = intervals[startsIdx]
|
|
490
557
|
ends = intervals[endsIdx]
|
|
491
|
-
|
|
492
|
-
1,
|
|
558
|
+
pointSourcesAbs = (intervals[pointSourcesIdx]) + max(
|
|
559
|
+
1, intervalLengthBp // 2
|
|
493
560
|
)
|
|
494
|
-
if
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
starts = pointSources - (
|
|
498
|
-
relativeMaximaWindow * intervalLengthBP
|
|
561
|
+
if recenterAtPointSource:
|
|
562
|
+
starts = pointSourcesAbs - (
|
|
563
|
+
relWindowBins * intervalLengthBp
|
|
499
564
|
)
|
|
500
|
-
ends =
|
|
501
|
-
|
|
502
|
-
1, intervalLengthBP // 2
|
|
503
|
-
)
|
|
504
|
-
# (ucsc browser) score [0,1000]
|
|
505
|
-
sqScores = (1 + responseSequence[relativeMaximaIndices]) ** 2
|
|
506
|
-
minResponse = np.min(sqScores)
|
|
507
|
-
maxResponse = np.max(sqScores)
|
|
508
|
-
rangeResponse = max(maxResponse - minResponse, 1.0)
|
|
509
|
-
scores = (
|
|
510
|
-
250 + 750 * (sqScores - minResponse) / rangeResponse
|
|
511
|
-
).astype(int)
|
|
512
|
-
# feature name
|
|
513
|
-
names = [
|
|
514
|
-
f"{templateName}_{cascadeLevel}_{i}"
|
|
515
|
-
for i in relativeMaximaIndices
|
|
516
|
-
]
|
|
517
|
-
# strand
|
|
518
|
-
strands = ["." for _ in range(len(scores))]
|
|
519
|
-
# p-values in -log10 scale per convention
|
|
520
|
-
pValues = -np.log10(
|
|
521
|
-
np.clip(
|
|
522
|
-
ecdfBlockMaximaSF.evaluate(
|
|
523
|
-
responseSequence[relativeMaximaIndices]
|
|
524
|
-
),
|
|
525
|
-
1e-10,
|
|
526
|
-
1.0,
|
|
565
|
+
ends = pointSourcesAbs + (
|
|
566
|
+
relWindowBins * intervalLengthBp
|
|
527
567
|
)
|
|
568
|
+
pointSourcesRel = (
|
|
569
|
+
intervals[pointSourcesIdx] - starts
|
|
570
|
+
) + max(1, intervalLengthBp // 2)
|
|
571
|
+
sqScores = (1 + response[candidateIdx]) ** 2
|
|
572
|
+
minR, maxR = (
|
|
573
|
+
float(np.min(sqScores)),
|
|
574
|
+
float(np.max(sqScores)),
|
|
528
575
|
)
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
)
|
|
576
|
+
rangeR = max(maxR - minR, 1.0)
|
|
577
|
+
scores = (250 + 750 * (sqScores - minR) / rangeR).astype(int)
|
|
578
|
+
for i, idxVal in enumerate(candidateIdx):
|
|
579
|
+
allRows.append(
|
|
580
|
+
{
|
|
581
|
+
"chromosome": chromosome,
|
|
582
|
+
"start": int(starts[i]),
|
|
583
|
+
"end": int(ends[i]),
|
|
584
|
+
"name": f"{templateName}_{cascadeLevel}_{idxVal}_{tag}",
|
|
585
|
+
"score": int(scores[i]),
|
|
586
|
+
"strand": ".",
|
|
587
|
+
"signal": float(response[idxVal]),
|
|
588
|
+
"p_raw": float(pEmp[i]),
|
|
589
|
+
"pointSource": int(pointSourcesRel[i]),
|
|
590
|
+
}
|
|
591
|
+
)
|
|
546
592
|
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
randSeed_ += 1
|
|
593
|
+
if not allRows:
|
|
594
|
+
logger.warning(
|
|
595
|
+
"No matches detected, returning empty DataFrame."
|
|
596
|
+
)
|
|
552
597
|
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
598
|
+
return pd.DataFrame(
|
|
599
|
+
columns=[
|
|
600
|
+
"chromosome",
|
|
601
|
+
"start",
|
|
602
|
+
"end",
|
|
603
|
+
"name",
|
|
604
|
+
"score",
|
|
605
|
+
"strand",
|
|
606
|
+
"signal",
|
|
607
|
+
"pValue",
|
|
608
|
+
"qValue",
|
|
609
|
+
"pointSource",
|
|
610
|
+
]
|
|
611
|
+
)
|
|
559
612
|
|
|
613
|
+
df = pd.DataFrame(allRows)
|
|
614
|
+
qVals = bhFdr(df["p_raw"].values.astype(float))
|
|
615
|
+
df["pValue"] = -np.log10(
|
|
616
|
+
np.clip(df["p_raw"].values, 1.0e-10, 1.0)
|
|
617
|
+
)
|
|
618
|
+
df["qValue"] = -np.log10(np.clip(qVals, 1.0e-10, 1.0))
|
|
619
|
+
df.drop(columns=["p_raw"], inplace=True)
|
|
620
|
+
df = df[qVals <= alpha].copy()
|
|
621
|
+
df["chromosome"] = df["chromosome"].astype(str)
|
|
622
|
+
df.sort_values(by=["chromosome", "start", "end"], inplace=True)
|
|
623
|
+
df.reset_index(drop=True, inplace=True)
|
|
624
|
+
df = df[
|
|
625
|
+
[
|
|
626
|
+
"chromosome",
|
|
627
|
+
"start",
|
|
628
|
+
"end",
|
|
629
|
+
"name",
|
|
630
|
+
"score",
|
|
631
|
+
"strand",
|
|
632
|
+
"signal",
|
|
633
|
+
"pValue",
|
|
634
|
+
"qValue",
|
|
635
|
+
"pointSource",
|
|
636
|
+
]
|
|
637
|
+
]
|
|
638
|
+
return df
|
|
560
639
|
|
|
561
|
-
def mergeMatches(filePath: str, mergeGapBP: int = 50):
|
|
562
|
-
r"""Merge overlapping or nearby structured peaks (matches) in a narrowPeak file.
|
|
563
640
|
|
|
564
|
-
|
|
641
|
+
def mergeMatches(
|
|
642
|
+
filePath: str,
|
|
643
|
+
mergeGapBP: Optional[int],
|
|
644
|
+
) -> Optional[str]:
|
|
645
|
+
r"""Merge overlapping or nearby structured peaks ('matches') in a narrowPeak file.
|
|
646
|
+
|
|
647
|
+
The harmonic mean of p-values and q-values is computed for each merged region within `mergeGapBP` base pairs.
|
|
648
|
+
The fourth column (name) of each merged peak contains information about the number of features that were merged
|
|
649
|
+
and the range of q-values among them.
|
|
650
|
+
|
|
651
|
+
Expects a `narrowPeak <https://genome.ucsc.edu/FAQ/FAQformat.html#format12>`_ file as input (all numeric columns, '.' for strand if unknown).
|
|
565
652
|
|
|
566
653
|
:param filePath: narrowPeak file containing matches detected with :func:`consenrich.matching.matchWavelet`
|
|
567
654
|
:type filePath: str
|
|
568
|
-
:param mergeGapBP: Maximum gap size (in base pairs) to consider for merging
|
|
569
|
-
:type mergeGapBP: int
|
|
655
|
+
:param mergeGapBP: Maximum gap size (in base pairs) to consider for merging. Defaults to 75 bp if `None` or less than 1.
|
|
656
|
+
:type mergeGapBP: Optional[int]
|
|
570
657
|
|
|
571
|
-
:seealso: :class:`consenrich.core.matchingParams`
|
|
658
|
+
:seealso: :ref:`matching`, :class:`consenrich.core.matchingParams`
|
|
572
659
|
"""
|
|
660
|
+
|
|
661
|
+
if mergeGapBP is None or mergeGapBP < 1:
|
|
662
|
+
mergeGapBP = 75
|
|
663
|
+
|
|
664
|
+
MAX_NEGLOGP = 10.0
|
|
665
|
+
MIN_NEGLOGP = 1.0e-10
|
|
666
|
+
|
|
573
667
|
if not os.path.isfile(filePath):
|
|
574
|
-
logger.
|
|
668
|
+
logger.warning(f"Couldn't access {filePath}...skipping merge")
|
|
575
669
|
return None
|
|
576
670
|
bed = None
|
|
577
671
|
try:
|
|
578
672
|
bed = BedTool(filePath)
|
|
579
673
|
except Exception as ex:
|
|
580
|
-
logger.
|
|
674
|
+
logger.warning(
|
|
581
675
|
f"Couldn't create BedTool for {filePath}:\n{ex}\n\nskipping merge..."
|
|
582
676
|
)
|
|
583
677
|
return None
|
|
584
678
|
if bed is None:
|
|
585
|
-
logger.
|
|
679
|
+
logger.warning(
|
|
680
|
+
f"Couldn't create BedTool for {filePath}...skipping merge"
|
|
681
|
+
)
|
|
586
682
|
return None
|
|
587
683
|
|
|
588
684
|
bed = bed.sort()
|
|
@@ -595,41 +691,86 @@ def mergeMatches(filePath: str, mergeGapBP: int = 50):
|
|
|
595
691
|
end = int(fields[2])
|
|
596
692
|
score = float(fields[4])
|
|
597
693
|
signal = float(fields[6])
|
|
598
|
-
|
|
599
|
-
|
|
694
|
+
pLog10 = float(fields[7])
|
|
695
|
+
qLog10 = float(fields[8])
|
|
600
696
|
peak = int(fields[9])
|
|
601
|
-
|
|
602
|
-
if
|
|
603
|
-
groups[
|
|
697
|
+
clusterID = fields[-1]
|
|
698
|
+
if clusterID not in groups:
|
|
699
|
+
groups[clusterID] = {
|
|
604
700
|
"chrom": chrom,
|
|
605
701
|
"sMin": start,
|
|
606
702
|
"eMax": end,
|
|
607
703
|
"scSum": 0.0,
|
|
608
704
|
"sigSum": 0.0,
|
|
609
|
-
"pSum": 0.0,
|
|
610
|
-
"qSum": 0.0,
|
|
611
705
|
"n": 0,
|
|
612
706
|
"maxS": float("-inf"),
|
|
613
707
|
"peakAbs": -1,
|
|
708
|
+
"pMax": float("-inf"),
|
|
709
|
+
"pTail": 0.0,
|
|
710
|
+
"pHasInf": False,
|
|
711
|
+
"qMax": float("-inf"),
|
|
712
|
+
"qMin": float("inf"),
|
|
713
|
+
"qTail": 0.0,
|
|
714
|
+
"qHasInf": False,
|
|
614
715
|
}
|
|
615
|
-
g = groups[
|
|
716
|
+
g = groups[clusterID]
|
|
616
717
|
if start < g["sMin"]:
|
|
617
718
|
g["sMin"] = start
|
|
618
719
|
if end > g["eMax"]:
|
|
619
720
|
g["eMax"] = end
|
|
620
721
|
g["scSum"] += score
|
|
621
722
|
g["sigSum"] += signal
|
|
622
|
-
g["pSum"] += pval
|
|
623
|
-
g["qSum"] += qval
|
|
624
723
|
g["n"] += 1
|
|
625
|
-
|
|
724
|
+
|
|
725
|
+
if math.isinf(pLog10) or pLog10 >= MAX_NEGLOGP:
|
|
726
|
+
g["pHasInf"] = True
|
|
727
|
+
else:
|
|
728
|
+
if pLog10 > g["pMax"]:
|
|
729
|
+
if g["pMax"] == float("-inf"):
|
|
730
|
+
g["pTail"] = 1.0
|
|
731
|
+
else:
|
|
732
|
+
g["pTail"] = (
|
|
733
|
+
g["pTail"] * (10 ** (g["pMax"] - pLog10))
|
|
734
|
+
+ 1.0
|
|
735
|
+
)
|
|
736
|
+
g["pMax"] = pLog10
|
|
737
|
+
else:
|
|
738
|
+
g["pTail"] += 10 ** (pLog10 - g["pMax"])
|
|
739
|
+
|
|
740
|
+
if (
|
|
741
|
+
math.isinf(qLog10)
|
|
742
|
+
or qLog10 >= MAX_NEGLOGP
|
|
743
|
+
or qLog10 <= MIN_NEGLOGP
|
|
744
|
+
):
|
|
745
|
+
g["qHasInf"] = True
|
|
746
|
+
else:
|
|
747
|
+
if qLog10 < g["qMin"]:
|
|
748
|
+
if qLog10 < MIN_NEGLOGP:
|
|
749
|
+
g["qMin"] = MIN_NEGLOGP
|
|
750
|
+
else:
|
|
751
|
+
g["qMin"] = qLog10
|
|
752
|
+
|
|
753
|
+
if qLog10 > g["qMax"]:
|
|
754
|
+
if g["qMax"] == float("-inf"):
|
|
755
|
+
g["qTail"] = 1.0
|
|
756
|
+
else:
|
|
757
|
+
g["qTail"] = (
|
|
758
|
+
g["qTail"] * (10 ** (g["qMax"] - qLog10))
|
|
759
|
+
+ 1.0
|
|
760
|
+
)
|
|
761
|
+
g["qMax"] = qLog10
|
|
762
|
+
else:
|
|
763
|
+
g["qTail"] += 10 ** (qLog10 - g["qMax"])
|
|
764
|
+
|
|
626
765
|
if signal > g["maxS"]:
|
|
627
766
|
g["maxS"] = signal
|
|
628
767
|
g["peakAbs"] = start + peak if peak >= 0 else -1
|
|
768
|
+
|
|
629
769
|
items = []
|
|
630
|
-
for
|
|
770
|
+
for clusterID, g in groups.items():
|
|
631
771
|
items.append((g["chrom"], g["sMin"], g["eMax"], g))
|
|
632
772
|
items.sort(key=lambda x: (str(x[0]), x[1], x[2]))
|
|
773
|
+
|
|
633
774
|
outPath = f"{filePath.replace('.narrowPeak', '')}.mergedMatches.narrowPeak"
|
|
634
775
|
lines = []
|
|
635
776
|
i = 0
|
|
@@ -642,69 +783,68 @@ def mergeMatches(filePath: str, mergeGapBP: int = 50):
|
|
|
642
783
|
avgScore = 1000
|
|
643
784
|
scoreInt = int(round(avgScore))
|
|
644
785
|
sigAvg = g["sigSum"] / g["n"]
|
|
645
|
-
pAvg = g["pSum"] / g["n"]
|
|
646
|
-
qAvg = g["qSum"] / g["n"]
|
|
647
|
-
pointSource = g["peakAbs"] - sMin if g["peakAbs"] >= 0 else -1
|
|
648
|
-
name = f"mergedPeak{i}"
|
|
649
|
-
lines.append(
|
|
650
|
-
f"{chrom}\t{int(sMin)}\t{int(eMax)}\t{name}\t{scoreInt}\t.\t{sigAvg:.3f}\t{pAvg:.3f}\t{qAvg:.3f}\t{int(pointSource)}"
|
|
651
|
-
)
|
|
652
|
-
with open(outPath, "w") as outF:
|
|
653
|
-
outF.write("\n".join(lines) + ("\n" if lines else ""))
|
|
654
|
-
logger.info(f"Merged matches written to {outPath}")
|
|
655
|
-
return outPath
|
|
656
786
|
|
|
787
|
+
if g["pHasInf"]:
|
|
788
|
+
pHMLog10 = MAX_NEGLOGP
|
|
789
|
+
else:
|
|
790
|
+
if (
|
|
791
|
+
g["pMax"] == float("-inf")
|
|
792
|
+
or not (g["pTail"] > 0.0)
|
|
793
|
+
or math.isnan(g["pTail"])
|
|
794
|
+
):
|
|
795
|
+
pHMLog10 = MIN_NEGLOGP
|
|
796
|
+
else:
|
|
797
|
+
pHMLog10 = -math.log10(g["n"]) + (
|
|
798
|
+
g["pMax"] + math.log10(g["pTail"])
|
|
799
|
+
)
|
|
800
|
+
pHMLog10 = max(
|
|
801
|
+
MIN_NEGLOGP, min(pHMLog10, MAX_NEGLOGP)
|
|
802
|
+
)
|
|
657
803
|
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
)
|
|
665
|
-
|
|
804
|
+
if g["qHasInf"]:
|
|
805
|
+
qHMLog10 = MAX_NEGLOGP
|
|
806
|
+
else:
|
|
807
|
+
if (
|
|
808
|
+
g["qMax"] == float("-inf")
|
|
809
|
+
or not (g["qTail"] > 0.0)
|
|
810
|
+
or math.isnan(g["qTail"])
|
|
811
|
+
):
|
|
812
|
+
qHMLog10 = MIN_NEGLOGP
|
|
813
|
+
else:
|
|
814
|
+
qHMLog10 = -math.log10(g["n"]) + (
|
|
815
|
+
g["qMax"] + math.log10(g["qTail"])
|
|
816
|
+
)
|
|
817
|
+
qHMLog10 = max(
|
|
818
|
+
MIN_NEGLOGP, min(qHMLog10, MAX_NEGLOGP)
|
|
819
|
+
)
|
|
666
820
|
|
|
667
|
-
|
|
668
|
-
|
|
821
|
+
pointSource = (
|
|
822
|
+
g["peakAbs"] - sMin
|
|
823
|
+
if g["peakAbs"] >= 0
|
|
824
|
+
else (eMax - sMin) // 2
|
|
825
|
+
)
|
|
669
826
|
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
binIndex -= 1
|
|
687
|
-
binCounts[binIndex] += 1
|
|
688
|
-
valueSeries = (
|
|
689
|
-
[countValue / len(nullBlockMaximaSFVals) for countValue in binCounts]
|
|
690
|
-
if normalize
|
|
691
|
-
else binCounts[:]
|
|
692
|
-
)
|
|
693
|
-
valueMaximum = max(valueSeries) if valueSeries else 0
|
|
694
|
-
widthScale = (barWidth / valueMaximum) if valueMaximum > 0 else 0
|
|
695
|
-
edgeFormat = f"{{:.{2}f}}"
|
|
696
|
-
rangeLabels = [
|
|
697
|
-
f"[{edgeFormat.format(binEdges[indexValue])},{edgeFormat.format(binEdges[indexValue + 1])})"
|
|
698
|
-
for indexValue in range(binCount)
|
|
699
|
-
]
|
|
700
|
-
labelWidth = max(len(textValue) for textValue in rangeLabels)
|
|
701
|
-
lines = ['Histogram: "1 - ECDF(nullBlockMaxima)"']
|
|
702
|
-
for rangeLabel, seriesValue, countValue in zip(
|
|
703
|
-
rangeLabels, valueSeries, binCounts
|
|
704
|
-
):
|
|
705
|
-
barString = barChar * int(round(seriesValue * widthScale))
|
|
706
|
-
trailingText = f"({countValue}/{len(nullBlockMaximaSFVals)})\t\t"
|
|
827
|
+
qMinLog10 = g["qMin"]
|
|
828
|
+
qMaxLog10 = g["qMax"]
|
|
829
|
+
if math.isfinite(qMinLog10) and qMinLog10 < MIN_NEGLOGP:
|
|
830
|
+
qMinLog10 = MIN_NEGLOGP
|
|
831
|
+
if math.isfinite(qMaxLog10) and qMaxLog10 > MAX_NEGLOGP:
|
|
832
|
+
qMaxLog10 = MAX_NEGLOGP
|
|
833
|
+
elif (
|
|
834
|
+
not math.isfinite(qMaxLog10)
|
|
835
|
+
or not math.isfinite(qMinLog10)
|
|
836
|
+
) or (qMaxLog10 < MIN_NEGLOGP):
|
|
837
|
+
qMinLog10 = 0.0
|
|
838
|
+
qMaxLog10 = 0.0
|
|
839
|
+
|
|
840
|
+
# informative+parsable name
|
|
841
|
+
# e.g., regex: ^consenrichPeak\|i=(?P<i>\d+)\|gap=(?P<gap>\d+)bp\|ct=(?P<ct>\d+)\|qRange=(?P<qmin>\d+\.\d{3})_(?P<qmax>\d+\.\d{3})$
|
|
842
|
+
name = f"consenrichPeak|i={i}|gap={mergeGapBP}bp|ct={g['n']}|qRange={qMinLog10:.3f}_{qMaxLog10:.3f}"
|
|
707
843
|
lines.append(
|
|
708
|
-
f"{
|
|
844
|
+
f"{chrom}\t{int(sMin)}\t{int(eMax)}\t{name}\t{scoreInt}\t.\t{sigAvg:.3f}\t{pHMLog10:.3f}\t{qHMLog10:.3f}\t{int(pointSource)}"
|
|
709
845
|
)
|
|
710
|
-
|
|
846
|
+
|
|
847
|
+
with open(outPath, "w") as outF:
|
|
848
|
+
outF.write("\n".join(lines) + ("\n" if lines else ""))
|
|
849
|
+
logger.info(f"Merged matches written to {outPath}")
|
|
850
|
+
return outPath
|