consenrich 0.6.3b1__cp314-cp314-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of consenrich might be problematic.
- consenrich/__init__.py +11 -0
- consenrich/cconsenrich.c +48856 -0
- consenrich/cconsenrich.cpython-314-darwin.so +0 -0
- consenrich/cconsenrich.pyx +836 -0
- consenrich/consenrich.py +923 -0
- consenrich/constants.py +168 -0
- consenrich/core.py +1320 -0
- consenrich/data/ce10.sizes +6 -0
- consenrich/data/ce10_blacklist.bed +100 -0
- consenrich/data/ce10_sparse.bed +11828 -0
- consenrich/data/ce11.sizes +6 -0
- consenrich/data/ce11_blacklist.bed +97 -0
- consenrich/data/ce11_sparse.bed +11828 -0
- consenrich/data/dm6.sizes +7 -0
- consenrich/data/dm6_blacklist.bed +182 -0
- consenrich/data/dm6_sparse.bed +20000 -0
- consenrich/data/hg19.sizes +24 -0
- consenrich/data/hg19_blacklist.bed +834 -0
- consenrich/data/hg19_sparse.bed +288358 -0
- consenrich/data/hg38.sizes +24 -0
- consenrich/data/hg38_blacklist.bed +636 -0
- consenrich/data/hg38_sparse.bed +288699 -0
- consenrich/data/mm10.sizes +21 -0
- consenrich/data/mm10_blacklist.bed +3435 -0
- consenrich/data/mm10_sparse.bed +100400 -0
- consenrich/data/mm39.sizes +21 -0
- consenrich/data/mm39_blacklist.bed +3360 -0
- consenrich/data/mm39_sparse.bed +100381 -0
- consenrich/detrorm.py +230 -0
- consenrich/matching.py +710 -0
- consenrich/misc_util.py +90 -0
- consenrich-0.6.3b1.dist-info/METADATA +65 -0
- consenrich-0.6.3b1.dist-info/RECORD +37 -0
- consenrich-0.6.3b1.dist-info/WHEEL +6 -0
- consenrich-0.6.3b1.dist-info/entry_points.txt +2 -0
- consenrich-0.6.3b1.dist-info/licenses/LICENSE +21 -0
- consenrich-0.6.3b1.dist-info/top_level.txt +1 -0
consenrich/matching.py
ADDED
@@ -0,0 +1,710 @@
# -*- coding: utf-8 -*-
r"""Module implementing (experimental) 'structured peak detection' features using wavelet-based templates."""

import logging
import os
from pybedtools import BedTool
from typing import List, Optional

import pandas as pd
import pywt as pw
import numpy as np
import numpy.typing as npt

from scipy import signal, stats

from . import cconsenrich
from . import core as core

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(module)s.%(funcName)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)


def castableToFloat(value) -> bool:
    r"""Return True if `value` can be cast to a finite float."""
    if value is None:
        return False
    if isinstance(value, bool):
        return False
    if isinstance(value, str):
        if value.lower().replace(" ", "") in ["nan", "inf", "-inf", "infinity", "-infinity", ""]:
            return False

    try:
        return bool(np.isfinite(float(value)))
    except Exception:
        return False
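
# --- Editor's sketch (not part of the released file) ---
# Expected behavior of `castableToFloat` on CLI-style inputs; `matchWavelet`
# relies on it below when parsing `minSignalAtMaxima` values like "0.5" vs. "q:0.75".
def _exampleCastableToFloat() -> None:
    assert castableToFloat("0.5") and castableToFloat(3)
    assert not castableToFloat("q:0.75")  # quantile strings take a separate branch
    assert not castableToFloat("nan") and not castableToFloat(None)
    assert not castableToFloat(True)  # bools are rejected explicitly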


def matchExistingBedGraph(
    bedGraphFile: str,
    templateName: str,
    cascadeLevel: int,
    alpha: float = 0.05,
    minMatchLengthBP: Optional[int] = 250,
    iters: int = 25_000,
    minSignalAtMaxima: Optional[float | str] = "q:0.75",
    maxNumMatches: Optional[int] = 100_000,
    recenterAtPointSource: bool = True,
    useScalingFunction: bool = True,
    excludeRegionsBedFile: Optional[str] = None,
    mergeGapBP: int = 50,
    merge: bool = True,
    weights: Optional[npt.NDArray[np.float64]] = None,
    randSeed: int = 42,
) -> Optional[str]:
    r"""Match discrete templates in a bedGraph file of Consenrich estimates.

    This function is a simple wrapper. See :func:`consenrich.matching.matchWavelet` for details on parameters.

    :param bedGraphFile: A bedGraph file with 'consensus' signal estimates derived from multiple samples, e.g., from Consenrich. The suffix '.bedGraph' is required.
    :type bedGraphFile: str

    :seealso: :func:`consenrich.matching.matchWavelet`, :class:`consenrich.core.matchingParams`, :ref:`matching`
    """
    if not os.path.isfile(bedGraphFile):
        raise FileNotFoundError(f"Couldn't access {bedGraphFile}")
    if not bedGraphFile.endswith(".bedGraph"):
        raise ValueError(
            f"Please use a suffix '.bedGraph' for `bedGraphFile`, got: {bedGraphFile}"
        )

    allowedTemplates = [
        x for x in pw.wavelist(kind="discrete") if "bio" not in x
    ]
    if templateName not in allowedTemplates:
        raise ValueError(
            f"Unknown wavelet template: {templateName}\nAvailable templates: {allowedTemplates}"
        )

    cols = ["chromosome", "start", "end", "value"]
    bedGraphDF = pd.read_csv(
        bedGraphFile,
        sep="\t",
        header=None,
        names=cols,
        dtype={
            "chromosome": str,
            "start": np.uint32,
            "end": np.uint32,
            "value": np.float64,
        },
    )

    outPaths: List[str] = []
    outPathsMerged: List[str] = []
    outPathAll: Optional[str] = None
    outPathMergedAll: Optional[str] = None

    for chrom_ in sorted(bedGraphDF["chromosome"].unique()):
        df_ = bedGraphDF[bedGraphDF["chromosome"] == chrom_]
        if len(df_) < 5:
            logger.info(f"Skipping {chrom_}: fewer than 5 rows.")
            continue

        try:
            df__ = matchWavelet(
                chrom_,
                df_["start"].to_numpy(),
                df_["value"].to_numpy(),
                [templateName],
                [cascadeLevel],
                iters,
                alpha,
                minMatchLengthBP,
                maxNumMatches,
                recenterAtPointSource=recenterAtPointSource,
                useScalingFunction=useScalingFunction,
                excludeRegionsBedFile=excludeRegionsBedFile,
                weights=weights,
                minSignalAtMaxima=minSignalAtMaxima,
                randSeed=randSeed,
            )
        except Exception as ex:
            logger.info(f"Skipping {chrom_} due to error in matchWavelet: {ex}")
            continue

        if df__.empty:
            logger.info(f"No matches detected on {chrom_}.")
            continue

        perChromOut = bedGraphFile.replace(
            ".bedGraph",
            f".{chrom_}.matched.{templateName}_lvl{cascadeLevel}.narrowPeak",
        )
        df__.to_csv(perChromOut, sep="\t", index=False, header=False)
        logger.info(f"Matches written to {perChromOut}")
        outPaths.append(perChromOut)

        if merge:
            mergedPath = mergeMatches(perChromOut, mergeGapBP=mergeGapBP)
            if mergedPath is not None:
                logger.info(f"Merged matches written to {mergedPath}")
                outPathsMerged.append(mergedPath)

    if len(outPaths) == 0 and len(outPathsMerged) == 0:
        raise ValueError("No matches were detected.")

    if len(outPaths) > 0:
        outPathAll = (
            f"{bedGraphFile.replace('.bedGraph', '')}"
            f".allChroms.matched.{templateName}_lvl{cascadeLevel}.narrowPeak"
        )
        with open(outPathAll, "w") as outF:
            for path_ in outPaths:
                if os.path.isfile(path_):
                    with open(path_, "r") as inF:
                        for line in inF:
                            outF.write(line)
        logger.info(f"All unmerged matches written to {outPathAll}")

    if merge and len(outPathsMerged) > 0:
        outPathMergedAll = (
            f"{bedGraphFile.replace('.bedGraph', '')}"
            f".allChroms.matched.{templateName}_lvl{cascadeLevel}.mergedMatches.narrowPeak"
        )
        with open(outPathMergedAll, "w") as outF:
            for path in outPathsMerged:
                if os.path.isfile(path):
                    with open(path, "r") as inF:
                        for line in inF:
                            outF.write(line)
        logger.info(f"All merged matches written to {outPathMergedAll}")

    for path_ in outPaths + outPathsMerged:
        try:
            if os.path.isfile(path_):
                os.remove(path_)
        except Exception:
            pass

    if merge and outPathMergedAll:
        return outPathMergedAll
    if outPathAll:
        return outPathAll
    logger.warning("No matches were detected...returning `None`")
    return None
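
# --- Editor's sketch (not part of the released file) ---
# Minimal end-to-end call. The input path is hypothetical; any fixed-step
# bedGraph of Consenrich estimates works. Returns the path of a genome-wide
# narrowPeak file (merged matches by default), or None.
def _exampleMatchExistingBedGraph() -> Optional[str]:
    return matchExistingBedGraph(
        "consenrichOutput.bedGraph",  # hypothetical file name
        templateName="db2",
        cascadeLevel=2,
        alpha=0.05,
    )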


def matchWavelet(
    chromosome: str,
    intervals: npt.NDArray[np.integer],
    values: npt.NDArray[np.float64],
    templateNames: List[str],
    cascadeLevels: List[int],
    iters: int,
    alpha: float = 0.05,
    minMatchLengthBP: Optional[int] = 250,
    maxNumMatches: Optional[int] = 100_000,
    minSignalAtMaxima: Optional[float | str] = "q:0.75",
    randSeed: int = 42,
    recenterAtPointSource: bool = True,
    useScalingFunction: bool = True,
    excludeRegionsBedFile: Optional[str] = None,
    weights: Optional[npt.NDArray[np.float64]] = None,
) -> pd.DataFrame:
    r"""Detect structured peaks by cross-correlating Consenrich tracks with wavelet- or scaling-function templates.

    See :ref:`matching` for an overview of the approach.

    :param chromosome: Chromosome name for the input intervals and values.
    :type chromosome: str
    :param intervals: Start positions (bp) of the evenly spaced, fixed-width genomic intervals on `chromosome`.
    :type intervals: npt.NDArray[np.integer]
    :param values: 'Consensus' signal estimates derived from multiple samples, e.g., from Consenrich.
    :type values: npt.NDArray[np.float64]
    :param templateNames: A list of str values -- wavelet bases used for matching, e.g., `[haar, db2, sym4]`
    :type templateNames: List[str]
    :param cascadeLevels: A list of int values -- the number of cascade iterations used for approximating
        the scaling/wavelet functions.
    :type cascadeLevels: List[int]
    :param iters: Number of random blocks to sample in the response sequence while building
        an empirical null to test significance. See :func:`cconsenrich.csampleBlockStats`.
    :type iters: int
    :param alpha: Primary significance threshold on detected matches. Specifically, the
        :math:`1 - \alpha` quantile of an empirical null distribution. The empirical null
        distribution is built from cross-correlation values over randomly sampled blocks.
    :type alpha: float
    :param minMatchLengthBP: Within a window of `minMatchLengthBP` base pairs, a relative maximum in
        the signal-template convolution must exceed all other values to qualify as a match.
        *If `None` or negative, the window defaults to the template length.*
    :type minMatchLengthBP: Optional[int]
    :param maxNumMatches: If set, retain at most this many matches, keeping those with the
        greatest signal values.
    :type maxNumMatches: Optional[int]
    :param minSignalAtMaxima: Secondary significance threshold coupled with `alpha`. Require the *signal value*
        at relative maxima in the response sequence to be greater than this threshold. Comparisons are made on
        the asinh scale. If a `float` value is provided, the minimum signal value must be greater than this
        (absolute) value. *Set to a negative value to disable the threshold*.
        If a `str` value is provided, looks for 'q:quantileValue', e.g., 'q:0.75'. The
        threshold is then set to the corresponding quantile of the non-zero signal estimates.
        Defaults to str value 'q:0.75' --- the 75th percentile of the non-zero signal estimates.
    :type minSignalAtMaxima: Optional[str | float]
    :param randSeed: Seed for the random block sampling; incremented between template/level combinations.
    :type randSeed: int
    :param recenterAtPointSource: If True, recenter each match at its point source (the signal maximum within the match).
    :type recenterAtPointSource: bool
    :param useScalingFunction: If True, use (only) the scaling function to build the matching template.
        If False, use (only) the wavelet function.
    :type useScalingFunction: bool
    :param excludeRegionsBedFile: A BED file with regions to exclude from matching
    :type excludeRegionsBedFile: Optional[str]
    :param weights: Optional per-interval weights multiplied into `values` before matching.
    :type weights: Optional[npt.NDArray[np.float64]]

    :seealso: :class:`consenrich.core.matchingParams`, :func:`cconsenrich.csampleBlockStats`, :ref:`matching`
    """

    if len(intervals) < 5:
        raise ValueError("`intervals` must be at least length 5")
    if len(values) != len(intervals):
        raise ValueError("`values` must have the same length as `intervals`")
    intervalLengthBP = intervals[1] - intervals[0]
    if not np.all(np.abs(np.diff(intervals)) == intervalLengthBP):
        # FFR: don't change this exception message without updating tests
        # --'spaced' is matched in tests
        raise ValueError("`intervals` must be evenly spaced.")

    randSeed_: int = int(randSeed)
    cols = [
        "chromosome",
        "start",
        "end",
        "name",
        "score",
        "strand",
        "signal",
        "pValue",
        "qValue",
        "pointSource",
    ]
    matchDF = pd.DataFrame(columns=cols)
    minMatchLengthBPCopy: Optional[int] = minMatchLengthBP
    cascadeLevels = sorted(list(set(cascadeLevels)))
    if weights is not None and len(weights) == len(values):
        values = values * weights
    asinhValues = np.asinh(values, dtype=np.float32)
    asinhNonZeroValues = asinhValues[asinhValues > 0]
    iters = max(iters, 1000)
    defQuantile: float = 0.75
    for l_, cascadeLevel in enumerate(cascadeLevels):
        for t_, templateName in enumerate(templateNames):
            try:
                templateName = str(templateName)
                cascadeLevel = int(cascadeLevel)
            except ValueError:
                logger.info(
                    f"Skipping invalid templateName or cascadeLevel: {templateName}, {cascadeLevel}"
                )
                continue
            if templateName not in pw.wavelist(kind="discrete"):
                logger.info(
                    f"\nSkipping unknown wavelet template: {templateName}\nAvailable templates: {pw.wavelist(kind='discrete')}"
                )
                continue

            wav = pw.Wavelet(templateName)
            scalingFunc, waveletFunc, x = wav.wavefun(level=cascadeLevel)
            template = np.array(waveletFunc, dtype=np.float64) / np.linalg.norm(
                waveletFunc
            )

            if useScalingFunction:
                template = np.array(
                    scalingFunc, dtype=np.float64
                ) / np.linalg.norm(scalingFunc)

            logger.info(
                f"Matching: template: {templateName}, cascade level: {cascadeLevel}, template length: {len(template)}, scaling: {useScalingFunction}, wavelet: {not useScalingFunction}"
            )

            responseSequence: npt.NDArray[np.float64] = signal.fftconvolve(
                values, template[::-1], mode="same"
            )

            minMatchLengthBP = minMatchLengthBPCopy
            if minMatchLengthBP is None or minMatchLengthBP < 1:
                minMatchLengthBP = len(template) * intervalLengthBP
            if minMatchLengthBP % intervalLengthBP != 0:
                minMatchLengthBP += intervalLengthBP - (
                    minMatchLengthBP % intervalLengthBP
                )

            relativeMaximaWindow = int(
                ((minMatchLengthBP / intervalLengthBP) / 2) + 1
            )
            relativeMaximaWindow = max(relativeMaximaWindow, 1)

            excludeMask = np.zeros(len(intervals), dtype=np.uint8)
            if excludeRegionsBedFile is not None:
                excludeMask = core.getBedMask(
                    chromosome,
                    excludeRegionsBedFile,
                    intervals,
                )

            logger.info(
                f"\nSampling {iters} block maxima for template {templateName} at cascade level {cascadeLevel} with (expected) relative maxima window size {relativeMaximaWindow}.\n"
            )
            blockMaxima = np.array(
                cconsenrich.csampleBlockStats(
                    intervals.astype(np.uint32),
                    responseSequence,
                    relativeMaximaWindow,
                    iters * 2,
                    randSeed_,
                    excludeMask.astype(np.uint8),
                ),
                dtype=float,
            )
            # hold out the second half of the samples to sanity-check the ECDF below
            blockMaximaCheck = blockMaxima.copy()[iters:]
            blockMaxima = blockMaxima[:iters]
            blockMaxima = blockMaxima[
                (blockMaxima > np.quantile(blockMaxima, 0.005))
                & (blockMaxima < np.quantile(blockMaxima, 0.995))
            ]

            ecdfBlockMaximaSF = stats.ecdf(blockMaxima).sf

            responseThreshold = float(1e6)
            arsinhSignalThreshold = float(1e6)
            try:
                # we use 'interpolated_inverted_cdf' in a few spots
                # --- making sure it's supported here, at its first use
                responseThreshold = np.quantile(
                    blockMaxima, 1 - alpha, method="interpolated_inverted_cdf"
                )
            except (TypeError, ValueError, KeyError) as err_:
                logger.warning(
                    f"\nError computing response threshold with alpha={alpha}:\n{err_}\n"
                    f"\nIs `blockMaxima` empty?"
                    f"\nIs NumPy older than 1.22.0 (May 2022)?"
                    f"\nIs `alpha` in (0,1)?\n"
                )
                raise

            # parse minSignalAtMaxima, set arsinhSignalThreshold
            if minSignalAtMaxima is None:
                # -----we got a `None`-----
                arsinhSignalThreshold = -float(1e6)
            elif isinstance(minSignalAtMaxima, str):
                # -----we got a str-----
                if minSignalAtMaxima.startswith("q:"):
                    # case: expected 'q:quantileValue' format
                    qVal = float(minSignalAtMaxima.split("q:")[-1])
                    if qVal < 0 or qVal > 1:
                        raise ValueError(f"Quantile {qVal} is out of range")
                    arsinhSignalThreshold = float(
                        np.quantile(
                            asinhNonZeroValues,
                            qVal,
                            method="interpolated_inverted_cdf",
                        )
                    )
                elif castableToFloat(minSignalAtMaxima):
                    # case: numeric in str form (possible due to CLI)
                    if float(minSignalAtMaxima) < 0.0:
                        # effectively disables threshold
                        arsinhSignalThreshold = -float(1e6)
                    else:
                        # use supplied value
                        arsinhSignalThreshold = np.asinh(
                            float(minSignalAtMaxima)
                        )
                else:
                    # case: not in known format, not castable to a float, use defaults
                    logger.info(
                        f"Couldn't parse `minSignalAtMaxima` value: {minSignalAtMaxima}, using default"
                    )
                    arsinhSignalThreshold = float(
                        np.quantile(
                            asinhNonZeroValues,
                            defQuantile,
                            method="interpolated_inverted_cdf",
                        )
                    )
                # -----
            elif isinstance(minSignalAtMaxima, (float, int)):
                # -----we got an int or float-----
                if float(minSignalAtMaxima) < 0.0:
                    # effectively disables threshold
                    arsinhSignalThreshold = -float(1e6)
                else:
                    # use supplied value
                    arsinhSignalThreshold = np.asinh(float(minSignalAtMaxima))
                # -----

            relativeMaximaIndices = signal.argrelmax(
                responseSequence, order=relativeMaximaWindow
            )[0]

            relativeMaximaIndices = relativeMaximaIndices[
                (responseSequence[relativeMaximaIndices] > responseThreshold)
                & (asinhValues[relativeMaximaIndices] > arsinhSignalThreshold)
            ]

            if len(relativeMaximaIndices) == 0:
                logger.info(
                    f"No matches were detected for template {templateName} at cascade level {cascadeLevel}...skipping matching"
                )
                continue

            if maxNumMatches is not None:
                if len(relativeMaximaIndices) > maxNumMatches:
                    # take the greatest maxNumMatches (by 'signal')
                    relativeMaximaIndices = relativeMaximaIndices[
                        np.argsort(asinhValues[relativeMaximaIndices])[
                            -maxNumMatches:
                        ]
                    ]

            ecdfSFCheckVals: npt.NDArray[np.float64] = ecdfBlockMaximaSF.evaluate(
                blockMaximaCheck
            )
            testKS, _ = stats.kstest(
                ecdfSFCheckVals,
                stats.uniform.cdf,
                alternative="two-sided",
            )

            logger.info(
                f"\n\tDetected {len(relativeMaximaIndices)} matches (alpha={alpha}, useScalingFunction={useScalingFunction}): {templateName}: level={cascadeLevel}.\n"
                f"\tResponse threshold: {responseThreshold:.3f}, arsinh(Signal Threshold): {arsinhSignalThreshold:.3f}\n"
                f"\t~KS_Statistic~ [ePVals, uniformCDF]: {testKS:.4f}\n"
                f"\n\n{textNullCDF(ecdfSFCheckVals)}\n\n"  # lil text-plot histogram of approx. null CDF
            )

            # starts
            startsIdx = np.maximum(
                relativeMaximaIndices - relativeMaximaWindow, 0
            )
            # ends
            endsIdx = np.minimum(
                len(values) - 1, relativeMaximaIndices + relativeMaximaWindow
            )
            # point source
            pointSourcesIdx = []
            for start_, end_ in zip(startsIdx, endsIdx):
                pointSourcesIdx.append(
                    np.argmax(values[start_ : end_ + 1]) + start_
                )
            pointSourcesIdx = np.array(pointSourcesIdx)
            starts = intervals[startsIdx]
            ends = intervals[endsIdx]
            # note: when `recenterAtPointSource` is False, this remains an
            # absolute coordinate in the emitted pointSource column
            pointSources = (intervals[pointSourcesIdx]) + max(
                1, intervalLengthBP // 2
            )
            if recenterAtPointSource:
                # recenter at point source (signal maximum)
                starts = pointSources - (
                    relativeMaximaWindow * intervalLengthBP
                )
                ends = pointSources + (relativeMaximaWindow * intervalLengthBP)
                pointSources = (intervals[pointSourcesIdx] - starts) + max(
                    1, intervalLengthBP // 2
                )
            # (ucsc browser) score [0,1000]
            sqScores = (1 + responseSequence[relativeMaximaIndices]) ** 2
            minResponse = np.min(sqScores)
            maxResponse = np.max(sqScores)
            rangeResponse = max(maxResponse - minResponse, 1.0)
            scores = (
                250 + 750 * (sqScores - minResponse) / rangeResponse
            ).astype(int)
            # feature name
            names = [
                f"{templateName}_{cascadeLevel}_{i}"
                for i in relativeMaximaIndices
            ]
            # strand
            strands = ["." for _ in range(len(scores))]
            # p-values in -log10 scale per convention
            pValues = -np.log10(
                np.clip(
                    ecdfBlockMaximaSF.evaluate(
                        responseSequence[relativeMaximaIndices]
                    ),
                    1e-10,
                    1.0,
                )
            )
            # q-values (ignored)
            qValues = np.array(np.ones_like(pValues) * -1.0)

            tempDF = pd.DataFrame(
                {
                    "chromosome": [chromosome] * len(relativeMaximaIndices),
                    "start": starts.astype(int),
                    "end": ends.astype(int),
                    "name": names,
                    "score": scores,
                    "strand": strands,
                    "signal": responseSequence[relativeMaximaIndices],
                    "pValue": pValues,
                    "qValue": qValues,
                    "pointSource": pointSources.astype(int),
                }
            )

            if matchDF.empty:
                matchDF = tempDF
            else:
                matchDF = pd.concat([matchDF, tempDF], ignore_index=True)
            randSeed_ += 1

    if matchDF.empty:
        logger.info("No matches detected, returning empty DataFrame.")
        return matchDF
    matchDF.sort_values(by=["chromosome", "start", "end"], inplace=True)
    matchDF.reset_index(drop=True, inplace=True)
    return matchDF
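
# --- Editor's sketch (not part of the released file) ---
# Synthetic example: plant two bumps shaped like the db2 scaling function in
# low-level noise on an even 25 bp grid, then recover them. pywt's cascade
# algorithm (pw.Wavelet("db2").wavefun(level=2)) returns
# (scalingFunc, waveletFunc, xGrid); matchWavelet cross-correlates the
# unit-norm template against `values` via signal.fftconvolve. Assumes the
# bundled cconsenrich extension is importable for the block sampling.
def _exampleMatchWavelet() -> pd.DataFrame:
    rng = np.random.default_rng(0)
    intervals = np.arange(0, 25 * 20_000, 25, dtype=np.uint32)  # even 25 bp steps
    values = np.abs(rng.normal(0.1, 0.05, intervals.size))
    scalingFunc, _, _ = pw.Wavelet("db2").wavefun(level=2)
    bump = np.asarray(scalingFunc) / np.linalg.norm(scalingFunc)
    for centerIdx in (5_000, 12_000):  # array indices, not bp
        values[centerIdx : centerIdx + bump.size] += 5.0 * bump
    return matchWavelet(
        "chrTest",
        intervals,
        values,
        templateNames=["db2"],
        cascadeLevels=[2],
        iters=5_000,
        alpha=0.05,
    )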


def mergeMatches(filePath: str, mergeGapBP: int = 50) -> Optional[str]:
    r"""Merge overlapping or nearby structured peaks (matches) in a narrowPeak file.

    Where features overlap or fall within `mergeGapBP` base pairs of one another, the feature with the greatest signal defines the new summit/pointSource.

    :param filePath: narrowPeak file containing matches detected with :func:`consenrich.matching.matchWavelet`
    :type filePath: str
    :param mergeGapBP: Maximum gap size (in base pairs) to consider for merging
    :type mergeGapBP: int

    :seealso: :class:`consenrich.core.matchingParams`
    """
    if not os.path.isfile(filePath):
        logger.info(f"Couldn't access {filePath}...skipping merge")
        return None
    bed = None
    try:
        bed = BedTool(filePath)
    except Exception as ex:
        logger.info(
            f"Couldn't create BedTool for {filePath}:\n{ex}\n\nskipping merge..."
        )
        return None
    if bed is None:
        logger.info(f"Couldn't create BedTool for {filePath}...skipping merge")
        return None

    bed = bed.sort()
    clustered = bed.cluster(d=mergeGapBP)
    groups = {}
    for f in clustered:
        fields = f.fields
        chrom = fields[0]
        start = int(fields[1])
        end = int(fields[2])
        score = float(fields[4])
        signalValue = float(fields[6])  # renamed from `signal` to avoid shadowing scipy.signal
        pval = float(fields[7])
        qval = float(fields[8])
        peak = int(fields[9])
        clId = fields[-1]
        if clId not in groups:
            groups[clId] = {
                "chrom": chrom,
                "sMin": start,
                "eMax": end,
                "scSum": 0.0,
                "sigSum": 0.0,
                "pSum": 0.0,
                "qSum": 0.0,
                "n": 0,
                "maxS": float("-inf"),
                "peakAbs": -1,
            }
        g = groups[clId]
        if start < g["sMin"]:
            g["sMin"] = start
        if end > g["eMax"]:
            g["eMax"] = end
        g["scSum"] += score
        g["sigSum"] += signalValue
        g["pSum"] += pval
        g["qSum"] += qval
        g["n"] += 1
        # scan for largest signal, FFR: consider using the p-val in the future
        if signalValue > g["maxS"]:
            g["maxS"] = signalValue
            g["peakAbs"] = start + peak if peak >= 0 else -1
    items = []
    for clId, g in groups.items():
        items.append((g["chrom"], g["sMin"], g["eMax"], g))
    items.sort(key=lambda x: (str(x[0]), x[1], x[2]))
    outPath = f"{filePath.replace('.narrowPeak', '')}.mergedMatches.narrowPeak"
    lines = []
    i = 0
    for chrom, sMin, eMax, g in items:
        i += 1
        avgScore = g["scSum"] / g["n"]
        if avgScore < 0:
            avgScore = 0
        if avgScore > 1000:
            avgScore = 1000
        scoreInt = int(round(avgScore))
        sigAvg = g["sigSum"] / g["n"]
        pAvg = g["pSum"] / g["n"]
        qAvg = g["qSum"] / g["n"]
        pointSource = g["peakAbs"] - sMin if g["peakAbs"] >= 0 else -1
        name = f"mergedPeak{i}"
        lines.append(
            f"{chrom}\t{int(sMin)}\t{int(eMax)}\t{name}\t{scoreInt}\t.\t{sigAvg:.3f}\t{pAvg:.3f}\t{qAvg:.3f}\t{int(pointSource)}"
        )
    with open(outPath, "w") as outF:
        outF.write("\n".join(lines) + ("\n" if lines else ""))
    logger.info(f"Merged matches written to {outPath}")
    return outPath
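
# --- Editor's sketch (not part of the released file) ---
# mergeMatches clusters features within `mergeGapBP` via bedtools `cluster`
# (pybedtools), averages score/signal/p/q per cluster, and keeps the summit
# of the highest-signal member. Requires a bedtools binary on PATH.
def _exampleMergeMatches() -> Optional[str]:
    rows = [
        "chrTest\t100\t300\tdb2_2_1\t500\t.\t4.0\t3.0\t-1.0\t100",
        "chrTest\t320\t500\tdb2_2_2\t700\t.\t6.0\t3.5\t-1.0\t90",  # 20 bp gap
    ]
    with open("toyMatches.narrowPeak", "w") as f:  # hypothetical scratch file
        f.write("\n".join(rows) + "\n")
    # expected: one merged feature spanning 100-500; the second row has the
    # larger signal, so pointSource = (320 + 90) - 100 = 310
    return mergeMatches("toyMatches.narrowPeak", mergeGapBP=50)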


def textNullCDF(
    nullBlockMaximaSFVals: npt.NDArray[np.float64],
    binCount: int = 20,
    barWidth: int = 50,
    barChar: str = "\u25a2",
    normalize: bool = False,
) -> str:
    r"""Render a text histogram of the distribution 1 - ECDF(nullBlockMaxima).

    Called by :func:`consenrich.matching.matchWavelet`. Ideally resembles
    a uniform(0,1) distribution.

    :seealso: :func:`consenrich.matching.matchWavelet`, :func:`cconsenrich.csampleBlockStats`
    """
    valueLower, valueUpper = (
        min(nullBlockMaximaSFVals),
        max(nullBlockMaximaSFVals),
    )
    binCount = max(1, int(binCount))
    binStep = (valueUpper - valueLower) / binCount
    if binStep == 0:
        # degenerate case: all values identical; avoid division by zero below
        binStep = 1.0
    binEdges = [
        valueLower + indexValue * binStep for indexValue in range(binCount)
    ]
    binEdges.append(valueUpper)
    binCounts = [0] * binCount
    for numericValue in nullBlockMaximaSFVals:
        binIndex = int((numericValue - valueLower) / binStep)
        if binIndex == binCount:
            binIndex -= 1
        binCounts[binIndex] += 1
    valueSeries = (
        [countValue / len(nullBlockMaximaSFVals) for countValue in binCounts]
        if normalize
        else binCounts[:]
    )
    valueMaximum = max(valueSeries) if valueSeries else 0
    widthScale = (barWidth / valueMaximum) if valueMaximum > 0 else 0
    edgeFormat = "{:.2f}"
    rangeLabels = [
        f"[{edgeFormat.format(binEdges[indexValue])},{edgeFormat.format(binEdges[indexValue + 1])})"
        for indexValue in range(binCount)
    ]
    labelWidth = max(len(textValue) for textValue in rangeLabels)
    lines = ['Histogram: "1 - ECDF(nullBlockMaxima)"']
    for rangeLabel, seriesValue, countValue in zip(
        rangeLabels, valueSeries, binCounts
    ):
        barString = barChar * int(round(seriesValue * widthScale))
        trailingText = f"({countValue}/{len(nullBlockMaximaSFVals)})\t\t"
        lines.append(
            f"{rangeLabel.rjust(labelWidth)} | {barString}{trailingText.ljust(10)}"
        )
    return "\n".join(lines)
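
# --- Editor's sketch (not part of the released file) ---
# With survival-function values drawn uniformly on (0, 1) -- the ideal,
# well-calibrated null -- each bin of the text histogram should be roughly level.
def _exampleTextNullCDF() -> None:
    rng = np.random.default_rng(0)
    print(textNullCDF(rng.uniform(0.0, 1.0, 10_000), binCount=10))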