consenrich 0.7.11b2__cp314-cp314-macosx_15_0_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- consenrich/.dylibs/libomp.dylib +0 -0
- consenrich/__init__.py +11 -0
- consenrich/cconsenrich.c +50610 -0
- consenrich/cconsenrich.cpython-314-darwin.so +0 -0
- consenrich/cconsenrich.pyx +1065 -0
- consenrich/consenrich.py +1802 -0
- consenrich/constants.py +172 -0
- consenrich/core.py +2068 -0
- consenrich/data/ce10.sizes +6 -0
- consenrich/data/ce10_blacklist.bed +100 -0
- consenrich/data/ce10_sparse.bed +11828 -0
- consenrich/data/ce11.sizes +6 -0
- consenrich/data/ce11_blacklist.bed +97 -0
- consenrich/data/ce11_sparse.bed +11828 -0
- consenrich/data/dm6.sizes +7 -0
- consenrich/data/dm6_blacklist.bed +182 -0
- consenrich/data/dm6_sparse.bed +20000 -0
- consenrich/data/hg19.sizes +24 -0
- consenrich/data/hg19_blacklist.bed +834 -0
- consenrich/data/hg19_sparse.bed +288358 -0
- consenrich/data/hg38.sizes +24 -0
- consenrich/data/hg38_blacklist.bed +636 -0
- consenrich/data/hg38_sparse.bed +288699 -0
- consenrich/data/mm10.sizes +21 -0
- consenrich/data/mm10_blacklist.bed +3435 -0
- consenrich/data/mm10_sparse.bed +100400 -0
- consenrich/data/mm39.sizes +21 -0
- consenrich/data/mm39_blacklist.bed +3360 -0
- consenrich/data/mm39_sparse.bed +100381 -0
- consenrich/detrorm.py +297 -0
- consenrich/matching.py +929 -0
- consenrich/misc_util.py +122 -0
- consenrich-0.7.11b2.dist-info/METADATA +66 -0
- consenrich-0.7.11b2.dist-info/RECORD +38 -0
- consenrich-0.7.11b2.dist-info/WHEEL +6 -0
- consenrich-0.7.11b2.dist-info/entry_points.txt +2 -0
- consenrich-0.7.11b2.dist-info/licenses/LICENSE +21 -0
- consenrich-0.7.11b2.dist-info/top_level.txt +1 -0
consenrich/matching.py
ADDED
@@ -0,0 +1,929 @@
# -*- coding: utf-8 -*-
r"""Module implementing (experimental) 'structured peak detection' features using wavelet-based templates."""

import logging
import os
import math
from pybedtools import BedTool
from typing import List, Optional

import pandas as pd
import pywt as pw
import numpy as np
import numpy.typing as npt

from scipy import signal, stats

from . import cconsenrich
from . import core as core

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(module)s.%(funcName)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

def _FDR(pVals: np.ndarray, method: str | None = "bh") -> np.ndarray:
    # can use "bh" or the more conservative Benjamini-Yekutieli ("by") to
    # control FDR under arbitrary dependencies between tests
    if method is None:
        return pVals
    return stats.false_discovery_control(pVals, method=method.lower())

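# A quick illustration of `_FDR` output (a sketch; values are from scipy's
# Benjamini-Hochberg procedure and are easy to verify by hand):
#
#   >>> _FDR(np.array([0.001, 0.01, 0.02, 0.8]), method="bh")
#   array([0.004     , 0.02      , 0.02666667, 0.8       ])
#
# i.e., the k-th smallest p-value is mapped to min over j >= k of p_(j) * n / j.
# method="by" additionally scales by the harmonic number H_n, which remains
# valid under arbitrary dependence between tests.
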
def autoMinLengthIntervals(
    values: np.ndarray,
    initLen: int = 3,
    cutoffQuantile: float = 0.90,
    isLogScale: bool = False,
) -> int:
    r"""Determine a minimum matching length (in interval units) based on the input signal values.

    Returns the average length of contiguous high-signal segments in a log-scaled, median-detrended version of `values`.

    :param values: A 1D array of signal-like values.
    :type values: np.ndarray
    :param initLen: Initial minimum length (in intervals). Defaults to 3.
    :type initLen: int
    :param cutoffQuantile: Quantile of the positive detrended values below which segments are ignored. Defaults to 0.90.
    :type cutoffQuantile: float
    :param isLogScale: Whether `values` are already log-scaled (skips the arsinh transform).
    :type isLogScale: bool
    :return: Estimated minimum matching length (in intervals)
    :rtype: int
    """
    values_ = values.astype(np.float64).copy()
    if not isLogScale:
        np.asinh(values_, out=values_)

    trValues = values_ - signal.medfilt(
        values_,
        kernel_size=max(
            (2 * initLen) + 1,
            2 * (int(len(values_) * 0.05)) + 1,
        ),
    )

    # just consider stretches of positive signal
    nz = trValues[trValues > 0]
    if len(nz) == 0:
        return initLen
    # ... mask out values below the cutoff quantile
    thr = np.quantile(
        nz, cutoffQuantile, method="interpolated_inverted_cdf"
    )
    mask = nz >= thr
    if not np.any(mask):
        return initLen

    idx = np.flatnonzero(np.diff(np.r_[False, mask, False]))
    runs = idx.reshape(-1, 2)
    widths = runs[:, 1] - runs[:, 0]
    widths = widths[widths >= initLen]

    if len(widths) == 0:
        return initLen

    return int(np.mean(widths))

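# How the run-length step in autoMinLengthIntervals works (a small worked
# example, not part of the package API): padding the boolean mask with False
# on both ends makes each run of True contribute exactly one rising and one
# falling edge to np.diff, so the nonzero indices pair up as (start, end):
#
#   >>> mask = np.array([0, 1, 1, 1, 0, 1, 1, 0], dtype=bool)
#   >>> np.flatnonzero(np.diff(np.r_[False, mask, False])).reshape(-1, 2)
#   array([[1, 4],
#          [5, 7]])
#
# giving run widths 4 - 1 = 3 and 7 - 5 = 2.
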
def scalarClip(value: float, low: float, high: float) -> float:
    return low if value < low else high if value > high else value


def castableToFloat(value) -> bool:
    if value is None:
        return False
    if isinstance(value, bool):
        return False
    if isinstance(value, str):
        if value.lower().replace(" ", "") in [
            "nan",
            "inf",
            "-inf",
            "infinity",
            "-infinity",
            "",
        ]:
            return False

    try:
        if np.isfinite(float(value)):
            return True
    except Exception:
        return False
    return False

def matchWavelet(
    chromosome: str,
    intervals: npt.NDArray[int],
    values: npt.NDArray[np.float64],
    templateNames: List[str],
    cascadeLevels: List[int],
    iters: int,
    alpha: float = 0.05,
    minMatchLengthBP: Optional[int] = 250,
    maxNumMatches: Optional[int] = 100_000,
    minSignalAtMaxima: Optional[float | str] = "q:0.75",
    randSeed: int = 42,
    recenterAtPointSource: bool = True,
    useScalingFunction: bool = True,
    excludeRegionsBedFile: Optional[str] = None,
    weights: Optional[npt.NDArray[np.float64]] = None,
    eps: float = 1.0e-2,
    isLogScale: bool = False,
    autoLengthQuantile: float = 0.90,
) -> pd.DataFrame:
    r"""Detect structured peaks in Consenrich tracks by matching wavelet- or scaling-function-based templates.

    :param chromosome: Chromosome name for the input intervals and values.
    :type chromosome: str
    :param intervals: Evenly spaced genomic start coordinates corresponding to `values`.
    :type intervals: npt.NDArray[int]
    :param values: A 1D array of signal-like values. In this documentation, we refer to values derived from Consenrich,
        but other continuous-valued tracks at evenly spaced genomic intervals may be suitable, too.
    :type values: npt.NDArray[np.float64]
    :param templateNames: A list of str values -- each entry references a mother wavelet (or its corresponding scaling function), e.g., `[haar, db2]`.
    :type templateNames: List[str]
    :param cascadeLevels: Number of cascade iterations used to approximate each template (wavelet or scaling function).
        Must have the same length as `templateNames`, with each entry aligned to the
        corresponding template. e.g., given templateNames `[haar, db2]`, then `[2, 2]` would use 2 cascade levels for both templates.
    :type cascadeLevels: List[int]
    :param iters: Number of random blocks to sample in the response sequence while building
        an empirical null to test significance within chromosomes. See :func:`cconsenrich.csampleBlockStats`.
    :type iters: int
    :param alpha: Primary significance threshold on detected matches. Specifically, the
        minimum corrected empirical p-value approximated from randomly sampled blocks in the
        response sequence.
    :type alpha: float
    :param minMatchLengthBP: Within a window of `minMatchLengthBP` length (bp), relative maxima in
        the signal-template convolution must be greater in value than all others to qualify as matches.
        If set to a value less than 1, the minimum length is determined via :func:`consenrich.matching.autoMinLengthIntervals`.
        If set to `None`, defaults to 147 bp (nucleosome size).
    :type minMatchLengthBP: Optional[int]
    :param maxNumMatches: Maximum number of candidate matches retained (per template and chromosome half);
        the highest-signal candidates are kept.
    :type maxNumMatches: Optional[int]
    :param minSignalAtMaxima: Secondary significance threshold coupled with :math:`\alpha`. Requires the *signal value*
        at relative maxima in the response sequence to be greater than a threshold :math:`\pm \epsilon`. Comparisons are
        made in log scale (arsinh). If a `float` value is provided, the signal value at a maximum must be greater
        than this value.
        If a `str` value is provided, it must take the form 'q:quantileValue', e.g., 'q:0.90'. The
        threshold is then set to the corresponding quantile of the non-zero signal estimates.
        Defaults to the str value 'q:0.75' --- the 75th percentile of signal values.
    :type minSignalAtMaxima: Optional[str | float]
    :param randSeed: Seed for the random number generator used to sample null blocks.
    :type randSeed: int
    :param recenterAtPointSource: If True, recenter detected matches at the point source (max value).
    :type recenterAtPointSource: bool
    :param useScalingFunction: If True, use (only) the scaling function to build the matching template.
        If False, use (only) the wavelet function.
    :type useScalingFunction: bool
    :param excludeRegionsBedFile: A BED file with regions to exclude from matching.
    :type excludeRegionsBedFile: Optional[str]
    :param weights: Optional weights to apply to `values` prior to matching. Must have the same length as `values`.
    :type weights: Optional[npt.NDArray[np.float64]]
    :param eps: Tolerance parameter for relative maxima detection in the response sequence. Set to zero to enforce strict
        inequalities when identifying discrete relative maxima.
    :type eps: float
    :param isLogScale: Whether the input values have already been transformed. Used to avoid double/redundant transformations.
    :type isLogScale: bool
    :param autoLengthQuantile: Quantile forwarded to :func:`consenrich.matching.autoMinLengthIntervals` when the
        minimum match length is determined automatically.
    :type autoLengthQuantile: float
    :seealso: :class:`consenrich.core.matchingParams`, :func:`cconsenrich.csampleBlockStats`, :ref:`matching`
    :return: A pandas DataFrame with detected matches
    :rtype: pd.DataFrame
    """

    rng = np.random.default_rng(int(randSeed))
    if len(intervals) < 5:
        raise ValueError("`intervals` must be at least length 5")

    if len(values) != len(intervals):
        raise ValueError(
            "`values` must have the same length as `intervals`"
        )

    if len(templateNames) != len(cascadeLevels):
        raise ValueError(
            "\n\t`templateNames` and `cascadeLevels` must have the same length."
            "\n\tSet products are not supported, i.e., each template needs an explicitly defined cascade level."
            "\t\ne.g., for `templateNames = [haar, db2]`, use `cascadeLevels = [2, 2]`, not `[2]`.\n"
        )

    intervalLengthBp = intervals[1] - intervals[0]

    if minMatchLengthBP is not None and minMatchLengthBP < 1:
        minMatchLengthBP = autoMinLengthIntervals(
            values,
            cutoffQuantile=autoLengthQuantile,
            isLogScale=isLogScale,
        ) * int(intervalLengthBp)
    elif minMatchLengthBP is None:
        minMatchLengthBP = 147  # default to nucleosome size

    logger.info(f"\n\tUsing minMatchLengthBP: {minMatchLengthBP}")

    if not np.all(np.abs(np.diff(intervals)) == intervalLengthBp):
        raise ValueError("`intervals` must be evenly spaced.")

    if weights is not None:
        if len(weights) != len(values):
            logger.warning(
                f"`weights` length {len(weights)} does not match `values` length {len(values)}. Ignoring..."
            )
        else:
            values = values * weights

    if not isLogScale:
        asinhValues = np.asinh(values, dtype=np.float32)
    else:
        asinhValues = values.astype(np.float32)
    asinhNonZeroValues = asinhValues[asinhValues > 0]

    iters = max(int(iters), 1000)
    defQuantile = 0.75
    chromMin = int(intervals[0])
    chromMax = int(intervals[-1])
    chromMid = chromMin + (chromMax - chromMin) // 2  # for the left/right split
    halfLeftMask = intervals < chromMid
    halfRightMask = ~halfLeftMask
    excludeMaskGlobal = np.zeros(len(intervals), dtype=np.uint8)
    if excludeRegionsBedFile is not None:
        excludeMaskGlobal = core.getBedMask(
            chromosome, excludeRegionsBedFile, intervals
        ).astype(np.uint8)
    allRows = []

    def parseMinSignalThreshold(val):
        if val is None:
            return -1e6
        if isinstance(val, str):
            if val.startswith("q:"):
                qVal = float(val.split("q:")[-1])
                if not (0 <= qVal <= 1):
                    raise ValueError(
                        f"Quantile {qVal} is out of range"
                    )
                return float(
                    np.quantile(
                        asinhNonZeroValues,
                        qVal,
                        method="interpolated_inverted_cdf",
                    )
                )
            elif castableToFloat(val):
                v = float(val)
                return -1e6 if v < 0 else float(np.asinh(v))
            else:
                return float(
                    np.quantile(
                        asinhNonZeroValues,
                        defQuantile,
                        method="interpolated_inverted_cdf",
                    )
                )
        if isinstance(val, (float, int)):
            v = float(val)
            return -1e6 if v < 0 else float(np.asinh(v))
        return float(
            np.quantile(
                asinhNonZeroValues,
                defQuantile,
                method="interpolated_inverted_cdf",
            )
        )

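    # e.g., with the default minSignalAtMaxima="q:0.75", the threshold returned
    # above is the 75th percentile of the nonzero arsinh-scaled signal, so a
    # candidate maximum must sit in the upper quartile of observed signal to be
    # kept; a plain float like 2.0 is compared after the same arsinh transform.
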
    def relativeMaxima(
        resp: np.ndarray, orderBins: int, eps: Optional[float] = None
    ) -> np.ndarray:
        order_: int = max(int(orderBins), 1)
        if eps is None:
            eps = np.finfo(resp.dtype).eps * 10

        def ge_with_tol(a, b):
            return a > (b - eps)

        # get the initial set using the loosened criterion
        idx = signal.argrelextrema(
            resp, comparator=ge_with_tol, order=order_
        )[0]
        if idx.size == 0:
            return idx

        if eps > 0.0:
            groups = []
            start, prev = idx[0], idx[0]
            for x in idx[1:]:
                # case: still contiguous
                if x == prev + 1:
                    prev = x
                else:
                    # case: a gap --> break off from the previous group
                    groups.append((start, prev))
                    start = x
                    prev = x
            groups.append((start, prev))

            centers: list[int] = []
            for s, e in groups:
                if s == e:
                    centers.append(s)
                else:
                    # for each group of tied indices, pick the center
                    centers.append((s + e) // 2)

            return np.asarray(centers, dtype=np.intp)

        return idx

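    # relativeMaxima with eps > 0 treats near-ties as plateaus and keeps one
    # index per plateau. A toy case (sketch): for resp = [0, 1, 1, 1, 0] and
    # order 1, the loosened comparator a > (b - eps) flags indices 1, 2, and 3;
    # they are contiguous, so only the center index 2 is returned.
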
    def sampleBlockMaxima(
        resp: np.ndarray,
        halfMask: np.ndarray,
        relWindowBins: int,
        nsamp: int,
        seed: int,
        eps: float,
    ):
        exMask = excludeMaskGlobal.astype(np.uint8).copy()
        exMask |= (~halfMask).astype(np.uint8)
        vals = np.array(
            cconsenrich.csampleBlockStats(
                intervals.astype(np.uint32),
                resp,
                int(relWindowBins),
                int(nsamp),
                int(seed),
                exMask.astype(np.uint8),
                np.float64(eps if eps is not None else 0.0),
            ),
            dtype=float,
        )
        if len(vals) == 0:
            return vals
        low = np.quantile(vals, 0.001)
        high = np.quantile(vals, 0.999)
        return vals[(vals > low) & (vals < high)]

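    # sampleBlockMaxima (above) draws `nsamp` random blocks of the response on
    # the held-out chromosome half via cconsenrich.csampleBlockStats and keeps
    # one statistic per block; trimming to the (0.1%, 99.9%) quantile range
    # stabilizes the empirical null against a handful of extreme blocks.
    # Candidate maxima on the *other* half are then assigned empirical
    # p-values from the survival function of this null (`ecdfSf` below).
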
    for templateName, cascadeLevel in zip(
        templateNames, cascadeLevels
    ):
        if templateName not in pw.wavelist(kind="discrete"):
            logger.warning(
                f"Skipping unknown wavelet template: {templateName}"
            )
            continue

        wav = pw.Wavelet(str(templateName))
        scalingFunc, waveletFunc, _ = wav.wavefun(
            level=int(cascadeLevel)
        )
        template = np.array(
            scalingFunc if useScalingFunction else waveletFunc,
            dtype=np.float64,
        )
        template /= np.linalg.norm(template)

        logger.info(
            f"\n\tMatching template: {templateName}"
            f"\n\tcascade level: {cascadeLevel}"
            f"\n\ttemplate length: {len(template)}"
        )

        # efficient FFT-based cross-correlation
        # (overlap-add may be better for smaller templates; TODO: add a check)
        response = signal.fftconvolve(
            values, template[::-1], mode="same"
        )
        thisMinMatchBp = minMatchLengthBP
        if thisMinMatchBp is None or thisMinMatchBp < 1:
            thisMinMatchBp = len(template) * intervalLengthBp
        if thisMinMatchBp % intervalLengthBp != 0:
            thisMinMatchBp += intervalLengthBp - (
                thisMinMatchBp % intervalLengthBp
            )
        relWindowBins = int(
            ((thisMinMatchBp / intervalLengthBp) / 2) + 1
        )
        relWindowBins = max(relWindowBins, 1)
        asinhThreshold = parseMinSignalThreshold(minSignalAtMaxima)
        for nullMask, testMask, tag in [
            (halfLeftMask, halfRightMask, "R"),
            (halfRightMask, halfLeftMask, "L"),
        ]:
            blockMaxima = sampleBlockMaxima(
                response,
                nullMask,
                relWindowBins,
                nsamp=max(iters, 1000),
                seed=rng.integers(1, 10_000),
                eps=eps,
            )
            if len(blockMaxima) < 25:
                pooledMask = ~excludeMaskGlobal.astype(bool)
                blockMaxima = sampleBlockMaxima(
                    response,
                    pooledMask,
                    relWindowBins,
                    nsamp=max(iters, 1000),
                    seed=rng.integers(1, 10_000),
                    eps=eps,
                )
            ecdfSf = stats.ecdf(blockMaxima).sf
            candidateIdx = relativeMaxima(
                response, relWindowBins, eps=eps
            )

            candidateMask = (
                (candidateIdx >= relWindowBins)
                & (candidateIdx < len(response) - relWindowBins)
                & (testMask[candidateIdx])
                & (excludeMaskGlobal[candidateIdx] == 0)
                & (asinhValues[candidateIdx] > asinhThreshold)
            )

            candidateIdx = candidateIdx[candidateMask]
            if len(candidateIdx) == 0:
                continue
            if (
                maxNumMatches is not None
                and len(candidateIdx) > maxNumMatches
            ):
                candidateIdx = candidateIdx[
                    np.argsort(asinhValues[candidateIdx])[-maxNumMatches:]
                ]
            pEmp = np.clip(
                ecdfSf.evaluate(response[candidateIdx]),
                np.finfo(np.float32).tiny,
                1.0,
            )
            startsIdx = np.maximum(candidateIdx - relWindowBins, 0)
            endsIdx = np.minimum(
                len(values) - 1, candidateIdx + relWindowBins
            )
            pointSourcesIdx = []
            for s, e in zip(startsIdx, endsIdx):
                pointSourcesIdx.append(
                    np.argmax(values[s : e + 1]) + s
                )
            pointSourcesIdx = np.array(pointSourcesIdx)
            starts = intervals[startsIdx]
            ends = intervals[endsIdx]
            pointSourcesAbs = (intervals[pointSourcesIdx]) + max(
                1, intervalLengthBp // 2
            )
            if recenterAtPointSource:
                starts = pointSourcesAbs - (
                    relWindowBins * intervalLengthBp
                )
                ends = pointSourcesAbs + (
                    relWindowBins * intervalLengthBp
                )
            pointSourcesRel = (
                intervals[pointSourcesIdx] - starts
            ) + max(1, intervalLengthBp // 2)
            sqScores = (1 + response[candidateIdx]) ** 2
            minR, maxR = (
                float(np.min(sqScores)),
                float(np.max(sqScores)),
            )
            rangeR = max(maxR - minR, 1.0)
            scores = (250 + 750 * (sqScores - minR) / rangeR).astype(int)
            for i, idxVal in enumerate(candidateIdx):
                allRows.append(
                    {
                        "chromosome": chromosome,
                        "start": int(starts[i]),
                        "end": int(ends[i]),
                        "name": f"{templateName}_{cascadeLevel}_{idxVal}_{tag}",
                        "score": int(scores[i]),
                        "strand": ".",
                        "signal": float(response[idxVal]),
                        "p_raw": float(pEmp[i]),
                        "pointSource": int(pointSourcesRel[i]),
                    }
                )

    if not allRows:
        logger.warning(
            "No matches detected, returning empty DataFrame."
        )
        return pd.DataFrame(
            columns=[
                "chromosome",
                "start",
                "end",
                "name",
                "score",
                "strand",
                "signal",
                "pValue",
                "qValue",
                "pointSource",
            ]
        )

    df = pd.DataFrame(allRows)
    qVals = _FDR(df["p_raw"].values.astype(float))
    df["pValue"] = -np.log10(
        np.clip(df["p_raw"].values, np.finfo(np.float32).tiny, 1.0)
    )
    df["qValue"] = -np.log10(
        np.clip(qVals, np.finfo(np.float32).tiny, 1.0)
    )
    df.drop(columns=["p_raw"], inplace=True)
    df = df[qVals <= alpha].copy()
    df["chromosome"] = df["chromosome"].astype(str)
    df.sort_values(by=["chromosome", "start", "end"], inplace=True)
    df.reset_index(drop=True, inplace=True)
    df = df[
        [
            "chromosome",
            "start",
            "end",
            "name",
            "score",
            "strand",
            "signal",
            "pValue",
            "qValue",
            "pointSource",
        ]
    ]
    return df

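# A minimal end-to-end sketch of calling matchWavelet directly on a synthetic
# track (hypothetical values; real inputs would come from a Consenrich run):
# 25 bp steps, one planted block of elevated signal, and a Haar
# scaling-function template approximated at cascade level 2.
#
#   >>> intervals = np.arange(0, 250_000, 25)
#   >>> values = np.random.default_rng(0).exponential(0.1, len(intervals))
#   >>> values[4_000:4_020] += 5.0
#   >>> hits = matchWavelet("chr19", intervals, values, ["haar"], [2],
#   ...                     iters=2_000, alpha=0.05)
#   >>> list(hits.columns)
#   ['chromosome', 'start', 'end', 'name', 'score', 'strand', 'signal',
#    'pValue', 'qValue', 'pointSource']
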
def mergeMatches(
    filePath: str,
    mergeGapBP: Optional[int] = -1,
) -> Optional[str]:
    r"""Merge overlapping or nearby structured peaks ('matches') in a narrowPeak file.

    The harmonic mean of p-values and q-values is computed over the features merged
    within `mergeGapBP` base pairs of one another. The fourth column (name) of each
    merged peak records the number of features that were merged and the range of
    q-values among them.

    Expects a `narrowPeak <https://genome.ucsc.edu/FAQ/FAQformat.html#format12>`_ file as input (all numeric columns, '.' for strand if unknown).

    :param filePath: narrowPeak file containing matches detected with :func:`consenrich.matching.matchWavelet`
    :type filePath: str
    :param mergeGapBP: Maximum gap size (in base pairs) to consider for merging.
    :type mergeGapBP: Optional[int]
    :return: Path to the merged narrowPeak file, or None if merging was skipped.
    :rtype: Optional[str]

    :seealso: :ref:`matching`, :class:`consenrich.core.matchingParams`
    """
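    # Note on the aggregation below: for a cluster of n features with
    # -log10 p-values L_i, the harmonic-mean p-value is
    #     HM = n / sum_i (1 / p_i) = n / sum_i 10**L_i,
    # so -log10(HM) = -log10(n) + L_max + log10(sum_i 10**(L_i - L_max)).
    # The running pair (pMax, pTail) tracks L_max and sum_i 10**(L_i - L_max)
    # incrementally (a log-sum-exp trick), avoiding overflow when some L_i
    # are large; the same scheme is applied to the q-values.
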
    if mergeGapBP is None or mergeGapBP < 1:
        mergeGapBP = 147
        logger.info(f"Setting mergeGapBP = {mergeGapBP} bp")

    MAX_NEGLOGP = 10.0
    MIN_NEGLOGP = 1.0e-10

    if not os.path.isfile(filePath):
        logger.warning(f"Couldn't access {filePath}...skipping merge")
        return None
    bed = None
    try:
        bed = BedTool(filePath)
    except Exception as ex:
        logger.warning(
            f"Couldn't create BedTool for {filePath}:\n{ex}\n\nskipping merge..."
        )
        return None
    if bed is None:
        logger.warning(
            f"Couldn't create BedTool for {filePath}...skipping merge"
        )
        return None

    bed = bed.sort()
    clustered = bed.cluster(d=mergeGapBP)
    groups = {}
    for f in clustered:
        fields = f.fields
        chrom = fields[0]
        start = int(fields[1])
        end = int(fields[2])
        score = float(fields[4])
        signal = float(fields[6])
        pLog10 = float(fields[7])
        qLog10 = float(fields[8])
        peak = int(fields[9])
        clusterID = fields[-1]
        if clusterID not in groups:
            groups[clusterID] = {
                "chrom": chrom,
                "sMin": start,
                "eMax": end,
                "scSum": 0.0,
                "sigSum": 0.0,
                "n": 0,
                "maxS": float("-inf"),
                "peakAbs": -1,
                "pMax": float("-inf"),
                "pTail": 0.0,
                "pHasInf": False,
                "qMax": float("-inf"),
                "qMin": float("inf"),
                "qTail": 0.0,
                "qHasInf": False,
            }
        g = groups[clusterID]
        if start < g["sMin"]:
            g["sMin"] = start
        if end > g["eMax"]:
            g["eMax"] = end
        g["scSum"] += score
        g["sigSum"] += signal
        g["n"] += 1

        if math.isinf(pLog10) or pLog10 >= MAX_NEGLOGP:
            g["pHasInf"] = True
        else:
            if pLog10 > g["pMax"]:
                if g["pMax"] == float("-inf"):
                    g["pTail"] = 1.0
                else:
                    g["pTail"] = (
                        g["pTail"] * (10 ** (g["pMax"] - pLog10))
                        + 1.0
                    )
                g["pMax"] = pLog10
            else:
                g["pTail"] += 10 ** (pLog10 - g["pMax"])

        if (
            math.isinf(qLog10)
            or qLog10 >= MAX_NEGLOGP
            or qLog10 <= MIN_NEGLOGP
        ):
            g["qHasInf"] = True
        else:
            if qLog10 < g["qMin"]:
                if qLog10 < MIN_NEGLOGP:
                    g["qMin"] = MIN_NEGLOGP
                else:
                    g["qMin"] = qLog10

            if qLog10 > g["qMax"]:
                if g["qMax"] == float("-inf"):
                    g["qTail"] = 1.0
                else:
                    g["qTail"] = (
                        g["qTail"] * (10 ** (g["qMax"] - qLog10))
                        + 1.0
                    )
                g["qMax"] = qLog10
            else:
                g["qTail"] += 10 ** (qLog10 - g["qMax"])

        if signal > g["maxS"]:
            g["maxS"] = signal
            g["peakAbs"] = start + peak if peak >= 0 else -1

    items = []
    for clusterID, g in groups.items():
        items.append((g["chrom"], g["sMin"], g["eMax"], g))
    items.sort(key=lambda x: (str(x[0]), x[1], x[2]))

    outPath = f"{filePath.replace('.narrowPeak', '')}.mergedMatches.narrowPeak"
    lines = []
    i = 0
    for chrom, sMin, eMax, g in items:
        i += 1
        avgScore = g["scSum"] / g["n"]
        if avgScore < 0:
            avgScore = 0
        if avgScore > 1000:
            avgScore = 1000
        scoreInt = int(round(avgScore))
        sigAvg = g["sigSum"] / g["n"]

        if g["pHasInf"]:
            pHMLog10 = MAX_NEGLOGP
        else:
            if (
                g["pMax"] == float("-inf")
                or not (g["pTail"] > 0.0)
                or math.isnan(g["pTail"])
            ):
                pHMLog10 = MIN_NEGLOGP
            else:
                pHMLog10 = -math.log10(g["n"]) + (
                    g["pMax"] + math.log10(g["pTail"])
                )
                pHMLog10 = max(
                    MIN_NEGLOGP, min(pHMLog10, MAX_NEGLOGP)
                )

        if g["qHasInf"]:
            qHMLog10 = MAX_NEGLOGP
        else:
            if (
                g["qMax"] == float("-inf")
                or not (g["qTail"] > 0.0)
                or math.isnan(g["qTail"])
            ):
                qHMLog10 = MIN_NEGLOGP
            else:
                qHMLog10 = -math.log10(g["n"]) + (
                    g["qMax"] + math.log10(g["qTail"])
                )
                qHMLog10 = max(
                    MIN_NEGLOGP, min(qHMLog10, MAX_NEGLOGP)
                )

        pointSource = (
            g["peakAbs"] - sMin
            if g["peakAbs"] >= 0
            else (eMax - sMin) // 2
        )

        qMinLog10 = g["qMin"]
        qMaxLog10 = g["qMax"]
        if math.isfinite(qMinLog10) and qMinLog10 < MIN_NEGLOGP:
            qMinLog10 = MIN_NEGLOGP
        if math.isfinite(qMaxLog10) and qMaxLog10 > MAX_NEGLOGP:
            qMaxLog10 = MAX_NEGLOGP
        elif (
            not math.isfinite(qMaxLog10)
            or not math.isfinite(qMinLog10)
        ) or (qMaxLog10 < MIN_NEGLOGP):
            qMinLog10 = 0.0
            qMaxLog10 = 0.0

        # informative + parsable name
        # e.g., regex: ^consenrichPeak\|i=(?P<i>\d+)\|gap=(?P<gap>\d+)bp\|ct=(?P<ct>\d+)\|qRange=(?P<qmin>\d+\.\d{3})_(?P<qmax>\d+\.\d{3})$
        name = f"consenrichPeak|i={i}|gap={mergeGapBP}bp|ct={g['n']}|qRange={qMinLog10:.3f}_{qMaxLog10:.3f}"
        lines.append(
            f"{chrom}\t{int(sMin)}\t{int(eMax)}\t{name}\t{scoreInt}\t.\t{sigAvg:.3f}\t{pHMLog10:.3f}\t{qHMLog10:.3f}\t{int(pointSource)}"
        )

    with open(outPath, "w") as outF:
        outF.write("\n".join(lines) + ("\n" if lines else ""))
    logger.info(f"Merged matches written to {outPath}")
    return outPath

def runMatchingAlgorithm(
    bedGraphFile: str,
    templateNames: List[str],
    cascadeLevels: List[int],
    iters: int,
    alpha: float = 0.05,
    minMatchLengthBP: Optional[int] = 250,
    maxNumMatches: Optional[int] = 100_000,
    minSignalAtMaxima: Optional[float | str] = "q:0.75",
    randSeed: int = 42,
    recenterAtPointSource: bool = True,
    useScalingFunction: bool = True,
    excludeRegionsBedFile: Optional[str] = None,
    weightsBedGraph: str | None = None,
    eps: float = 1.0e-2,
    isLogScale: bool = False,
    autoLengthQuantile: float = 0.90,
    mergeGapBP: int | None = -1,
    methodFDR: str | None = None,
    merge: bool = True,
):
    r"""Wraps :func:`matchWavelet` for genome-wide matching given a bedGraph file."""
    gwideDF = pd.DataFrame()
    chromosomes = (
        pd.read_csv(
            bedGraphFile,
            sep="\t",
            header=None,
            names=["chromosome", "start", "end", "value"],
            dtype={
                "chromosome": str,
                "start": np.uint32,
                "end": np.uint32,
                "value": np.float64,
            },
        )["chromosome"]
        .unique()
        .tolist()
    )

    avgMinMatchLengths = []

    for c_, chromosome_ in enumerate(chromosomes):
        cols = ["chromosome", "start", "end", "value"]
        chromBedGraphDF = pd.read_csv(
            bedGraphFile,
            sep="\t",
            header=None,
            names=cols,
            dtype={
                "chromosome": str,
                "start": np.uint32,
                "end": np.uint32,
                "value": np.float64,
            },
        )
        chromBedGraphDF = chromBedGraphDF[
            chromBedGraphDF["chromosome"] == chromosome_
        ]
        chromIntervals = chromBedGraphDF["start"].to_numpy()
        chromValues = chromBedGraphDF["value"].to_numpy()
        del chromBedGraphDF

        weightsDF = pd.DataFrame()
        weights = np.ones_like(chromValues, dtype=np.float64)
        if weightsBedGraph is not None and os.path.exists(
            weightsBedGraph
        ):
            try:
                weightsDF = pd.read_csv(
                    weightsBedGraph,
                    sep="\t",
                    header=None,
                    names=cols,
                    dtype={
                        "chromosome": str,
                        "start": np.uint32,
                        "end": np.uint32,
                        "value": np.float64,
                    },
                )
                weights = weightsDF[
                    weightsDF["chromosome"] == chromosome_
                ]
                weights = 1 / np.sqrt(
                    weights["value"].to_numpy() + 1.0
                )
            except Exception as ex:
                logger.warning(
                    f"Failed to parse weights from {weightsBedGraph}: {ex}. Ignoring weights..."
                )
        del weightsDF

        if minMatchLengthBP is not None and minMatchLengthBP < 1:
            minMatchLengthBP_ = autoMinLengthIntervals(
                chromValues,
                cutoffQuantile=autoLengthQuantile,
                isLogScale=isLogScale,
            ) * int(chromIntervals[1] - chromIntervals[0])
        else:
            minMatchLengthBP_ = minMatchLengthBP

        avgMinMatchLengths.append(minMatchLengthBP_)

        df__ = matchWavelet(
            chromosome_,
            chromIntervals,
            chromValues,
            templateNames,
            cascadeLevels,
            iters,
            1.0,  # keep all matches for the later genome-wide correction
            minMatchLengthBP_,
            maxNumMatches,
            minSignalAtMaxima,
            randSeed,
            recenterAtPointSource,
            useScalingFunction,
            excludeRegionsBedFile,
            weights,
            eps,
            isLogScale,
        )
        if df__.empty:
            logger.info(f"No matches detected on {chromosome_}.")
            continue
        gwideDF = pd.concat(
            [gwideDF, df__], axis=0, ignore_index=True
        )

    if gwideDF.empty:
        logger.warning("Empty matching results over `chromosomes`.")
        return gwideDF
    naturalScalePValues = 10 ** (
        -gwideDF["pValue"].values.astype(float)
    )
    qVals = _FDR(naturalScalePValues, method=methodFDR)
    gwideDF["qValue"] = -np.log10(
        np.clip(qVals, np.finfo(np.float32).tiny, 1.0)
    )
    gwideDF = gwideDF[qVals <= alpha].copy()
    gwideDF.sort_values(
        by=["chromosome", "start", "end"], inplace=True
    )
    tempNarrowPeak = f"{bedGraphFile}_matches.narrowPeak".replace(
        ".bedGraph", ""
    )
    gwideDF.to_csv(
        tempNarrowPeak,
        sep="\t",
        index=False,
        header=False,
    )

    if mergeGapBP is None or mergeGapBP < 1:
        mergeGapBP = max((np.median(avgMinMatchLengths).astype(int) // 2), 147)

    mergedPath = None
    if merge:
        mergedPath = mergeMatches(tempNarrowPeak, mergeGapBP=mergeGapBP)
        if mergedPath is not None and os.path.isfile(mergedPath):
            logger.info(f"Merged matches written to {mergedPath}")

    return mergedPath

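# Typical entry point (a sketch; the bedGraph path is a placeholder): run
# genome-wide matching on a Consenrich track, let the minimum match length be
# chosen automatically per chromosome, and merge nearby matches:
#
#   >>> runMatchingAlgorithm(
#   ...     "consenrich_state.bedGraph",  # hypothetical file
#   ...     templateNames=["haar", "db2"],
#   ...     cascadeLevels=[2, 2],
#   ...     iters=5_000,
#   ...     alpha=0.05,
#   ...     minMatchLengthBP=-1,  # < 1 => choose length per chromosome
#   ...     methodFDR="bh",
#   ... )
#
# This writes <prefix>_matches.narrowPeak, then (with merge=True, the default)
# a .mergedMatches.narrowPeak alongside it, and returns the merged file path.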