consenrich 0.7.1b1__cp312-cp312-macosx_11_0_arm64.whl → 0.7.1b2__cp312-cp312-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this release has been flagged as potentially problematic on the registry.
- consenrich/cconsenrich.c +174 -174
- consenrich/cconsenrich.cpython-312-darwin.so +0 -0
- consenrich/consenrich.py +83 -23
- consenrich/core.py +6 -4
- consenrich/matching.py +199 -134
- {consenrich-0.7.1b1.dist-info → consenrich-0.7.1b2.dist-info}/METADATA +1 -1
- {consenrich-0.7.1b1.dist-info → consenrich-0.7.1b2.dist-info}/RECORD +11 -11
- {consenrich-0.7.1b1.dist-info → consenrich-0.7.1b2.dist-info}/WHEEL +0 -0
- {consenrich-0.7.1b1.dist-info → consenrich-0.7.1b2.dist-info}/entry_points.txt +0 -0
- {consenrich-0.7.1b1.dist-info → consenrich-0.7.1b2.dist-info}/licenses/LICENSE +0 -0
- {consenrich-0.7.1b1.dist-info → consenrich-0.7.1b2.dist-info}/top_level.txt +0 -0
consenrich/cconsenrich.cpython-312-darwin.so
CHANGED

Binary file (no text diff shown)
consenrich/consenrich.py
CHANGED
@@ -346,9 +346,39 @@ def readConfig(config_path: str) -> Dict[str, Any]:
     minQ_default = (
         minR_default / (len(inputParams.bamFiles))
     ) + 0.10  # protect condition number
+
     matchingExcludeRegionsBedFile_default: Optional[str] = (
         genomeParams.blacklistFile
     )
+
+    # apply less aggressive *default* detrending/background removal
+    # ...IF input controls are present. In either case, respect
+    # ...user-specified params
+    detrendWindowLengthBP_: int = -1
+    detrendSavitzkyGolayDegree_: int = -1
+
+    if (
+        inputParams.bamFilesControl is not None
+        and len(inputParams.bamFilesControl) > 0
+    ):
+        detrendWindowLengthBP_ = config.get(
+            "detrendParams.detrendWindowLengthBP",
+            25_000,
+        )
+        detrendSavitzkyGolayDegree_ = config.get(
+            "detrendParams.detrendSavitzkyGolayDegree",
+            1,
+        )
+    else:
+        detrendWindowLengthBP_ = config.get(
+            "detrendParams.detrendWindowLengthBP",
+            10_000,
+        )
+        detrendSavitzkyGolayDegree_ = config.get(
+            "detrendParams.detrendSavitzkyGolayDegree",
+            2,
+        )
+
     return {
         "experimentName": config.get(
             "experimentName", "consenrichExperiment"
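The hunk above changes how default detrending parameters are chosen: when input control BAMs are present, the defaults relax to a 25 kb window with Savitzky-Golay degree 1, otherwise they stay at 10 kb and degree 2; explicitly configured values are respected either way. A minimal sketch of that selection logic, assuming a flat config dict with dotted keys as the `config.get(...)` calls suggest (`config`, `bamFilesControl`, and `ctrl.bam` are hypothetical stand-ins):

# Sketch only, not the package's API surface.
config = {}                      # hypothetical: no user-specified detrend keys
bamFilesControl = ["ctrl.bam"]   # hypothetical control inputs

hasControls = bamFilesControl is not None and len(bamFilesControl) > 0
windowDefault, degreeDefault = (25_000, 1) if hasControls else (10_000, 2)

detrendWindowLengthBP_ = config.get("detrendParams.detrendWindowLengthBP", windowDefault)
detrendSavitzkyGolayDegree_ = config.get("detrendParams.detrendSavitzkyGolayDegree", degreeDefault)
print(detrendWindowLengthBP_, detrendSavitzkyGolayDegree_)  # -> 25000 1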
@@ -378,35 +408,44 @@ def readConfig(config_path: str) -> Dict[str, Any]:
             noGlobal=config.get("observationParams.noGlobal", False),
             numNearest=config.get("observationParams.numNearest", 25),
             localWeight=config.get(
-                "observationParams.localWeight",
+                "observationParams.localWeight",
+                0.333,
             ),
             globalWeight=config.get(
-                "observationParams.globalWeight",
+                "observationParams.globalWeight",
+                0.667,
             ),
             approximationWindowLengthBP=config.get(
-                "observationParams.approximationWindowLengthBP",
+                "observationParams.approximationWindowLengthBP",
+                10000,
             ),
             lowPassWindowLengthBP=config.get(
-                "observationParams.lowPassWindowLengthBP",
+                "observationParams.lowPassWindowLengthBP",
+                20000,
             ),
             lowPassFilterType=config.get(
-                "observationParams.lowPassFilterType",
+                "observationParams.lowPassFilterType",
+                "median",
             ),
             returnCenter=config.get(
-                "observationParams.returnCenter",
+                "observationParams.returnCenter",
+                True,
             ),
         ),
         "stateArgs": core.stateParams(
             stateInit=config.get("stateParams.stateInit", 0.0),
             stateCovarInit=config.get(
-                "stateParams.stateCovarInit",
+                "stateParams.stateCovarInit",
+                100.0,
             ),
             boundState=config.get("stateParams.boundState", True),
             stateLowerBound=config.get(
-                "stateParams.stateLowerBound",
+                "stateParams.stateLowerBound",
+                0.0,
             ),
             stateUpperBound=config.get(
-                "stateParams.stateUpperBound",
+                "stateParams.stateUpperBound",
+                10000.0,
             ),
         ),
         "samArgs": core.samParams(
@@ -434,32 +473,37 @@ def readConfig(config_path: str) -> Dict[str, Any]:
                 else 0,
             ),
             countEndsOnly=config.get(
-                "samParams.countEndsOnly",
+                "samParams.countEndsOnly",
+                False,
             ),
         ),
         "detrendArgs": core.detrendParams(
-            detrendWindowLengthBP=config.get(
-                "detrendParams.detrendWindowLengthBP", 10000
-            ),
+            detrendWindowLengthBP=detrendWindowLengthBP_,
             detrendTrackPercentile=config.get(
-                "detrendParams.detrendTrackPercentile",
+                "detrendParams.detrendTrackPercentile",
+                75,
             ),
             usePolyFilter=config.get(
-                "detrendParams.usePolyFilter",
+                "detrendParams.usePolyFilter",
+                False,
            ),
             detrendSavitzkyGolayDegree=config.get(
-                "detrendParams.detrendSavitzkyGolayDegree",
+                "detrendParams.detrendSavitzkyGolayDegree",
+                detrendSavitzkyGolayDegree_,
             ),
             useOrderStatFilter=config.get(
-                "detrendParams.useOrderStatFilter",
+                "detrendParams.useOrderStatFilter",
+                True,
             ),
         ),
         "matchingArgs": core.matchingParams(
             templateNames=config.get(
-                "matchingParams.templateNames",
+                "matchingParams.templateNames",
+                [],
             ),
             cascadeLevels=config.get(
-                "matchingParams.cascadeLevels",
+                "matchingParams.cascadeLevels",
+                [],
             ),
             iters=config.get("matchingParams.iters", 25_000),
             alpha=config.get("matchingParams.alpha", 0.05),
@@ -692,6 +736,9 @@ def main():
     scaleDown = countingArgs.scaleDown
     extendBP_ = core.resolveExtendBP(samArgs.extendBP, bamFiles)
     initialTreatmentScaleFactors = []
+    minMatchLengthBP_: Optional[int] = matchingArgs.minMatchLengthBP
+    mergeGapBP_: Optional[int] = matchingArgs.mergeGapBP
+
     if args.verbose:
         try:
             logger.info("Configuration:\n")
@@ -1021,6 +1068,18 @@ def main():
             )
             try:
                 if matchingEnabled:
+                    if (
+                        minMatchLengthBP_ is None
+                        or minMatchLengthBP_ <= 0
+                    ):
+                        minMatchLengthBP_ = (
+                            matching.autoMinLengthIntervals(x_)
+                            * (intervals[1] - intervals[0])
+                        )
+
+                    if mergeGapBP_ is None:
+                        mergeGapBP_ = int(minMatchLengthBP_ / 2) + 1
+
                     matchingDF = matching.matchWavelet(
                         chromosome,
                         intervals,
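The added block resolves matching lengths before calling matchWavelet: an unset or non-positive minMatchLengthBP is derived from matching.autoMinLengthIntervals on the signal and scaled by the genomic bin width, and an unset mergeGapBP falls back to half of that value plus one. A minimal sketch of the fallback arithmetic with toy values (`intervals` and the auto-length are hypothetical stand-ins for the variables used above):

import numpy as np

intervals = np.arange(0, 10_000, 25)   # hypothetical evenly spaced bins, 25 bp wide
binWidth = intervals[1] - intervals[0]

autoLenIntervals = 12                  # stand-in for matching.autoMinLengthIntervals(x_)
minMatchLengthBP_ = None               # i.e., not set in the config

if minMatchLengthBP_ is None or minMatchLengthBP_ <= 0:
    minMatchLengthBP_ = autoLenIntervals * binWidth    # 12 bins * 25 bp = 300 bp

mergeGapBP_ = None
if mergeGapBP_ is None:
    mergeGapBP_ = int(minMatchLengthBP_ / 2) + 1        # 151 bp
print(minMatchLengthBP_, mergeGapBP_)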
@@ -1029,7 +1088,7 @@ def main():
                         matchingArgs.cascadeLevels,
                         matchingArgs.iters,
                         matchingArgs.alpha,
-
+                        minMatchLengthBP_,
                         matchingArgs.maxNumMatches,
                         matchingArgs.minSignalAtMaxima,
                         useScalingFunction=matchingArgs.useScalingFunction,
@@ -1055,10 +1114,11 @@ def main():
            if matchingEnabled and matchingArgs.merge:
                try:
                    mergeGapBP_ = matchingArgs.mergeGapBP
-                    if mergeGapBP_ is None:
+                    if mergeGapBP_ is None or mergeGapBP_ <= 0:
                        mergeGapBP_ = (
-                            int(
-                            if
+                            int(minMatchLengthBP_ / 2) + 1
+                            if minMatchLengthBP_ is not None
+                            and minMatchLengthBP_ >= 0
                            else 75
                        )
                    matching.mergeMatches(
consenrich/core.py
CHANGED
@@ -317,10 +317,11 @@ class matchingParams(NamedTuple):
 
     See :ref:`matching` for an overview of the approach.
 
-    :param templateNames: A list of str values -- wavelet
+    :param templateNames: A list of str values -- each entry references a mother wavelet (or its corresponding scaling function). e.g., `[haar, db2]`
     :type templateNames: List[str]
-    :param cascadeLevels:
-        the
+    :param cascadeLevels: Number of cascade iterations used to approximate each template (wavelet or scaling function).
+        Must have the same length as `templateNames`, with each entry aligned to the
+        corresponding template. e.g., given templateNames `[haar, db2]`, then `[2,2]` would use 2 cascade levels for both templates.
     :type cascadeLevels: List[int]
     :param iters: Number of random blocks to sample in the response sequence while building
         an empirical null to test significance. See :func:`cconsenrich.csampleBlockStats`.
@@ -331,7 +332,8 @@ class matchingParams(NamedTuple):
     :type alpha: float
     :param minMatchLengthBP: Within a window of `minMatchLengthBP` length (bp), relative maxima in
         the signal-template convolution must be greater in value than others to qualify as matches.
-
+        If set to a value less than 1, the minimum length is determined via :func:`consenrich.matching.autoMinLengthIntervals`.
+        If set to `None`, defaults to 250 bp.
     :param minSignalAtMaxima: Secondary significance threshold coupled with `alpha`. Requires the *signal value*
         at relative maxima in the response sequence to be greater than this threshold. Comparisons are made in log-scale
         to temper genome-wide dynamic range. If a `float` value is provided, the minimum signal value must be greater
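The clarified docstring pins down the contract between templateNames and cascadeLevels (equal-length, element-aligned lists) and the new minMatchLengthBP fallbacks. A hypothetical flat, dotted-key fragment consistent with the readConfig defaults shown earlier; the keys mirror the diff, while the values and the on-disk config format are illustrative assumptions only:

# Hypothetical entries as consumed by readConfig in consenrich.py.
matchingConfig = {
    "matchingParams.templateNames": ["haar", "db2"],  # two templates
    "matchingParams.cascadeLevels": [2, 2],           # one cascade level per template (same length)
    "matchingParams.minMatchLengthBP": 0,             # <1 -> auto via matching.autoMinLengthIntervals
    "matchingParams.iters": 25_000,
    "matchingParams.alpha": 0.05,
}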
consenrich/matching.py
CHANGED
@@ -24,6 +24,45 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)
 
 
+def autoMinLengthIntervals(
+    values: np.ndarray, initLen: int = 3
+) -> int:
+    r"""Determines a minimum matching length (in interval units) based on the input signal values.
+
+    Returns the mean length of non-zero contiguous segments in a log-scaled/centered version of `values`
+
+    :param values: A 1D array of signal-like values.
+    :type values: np.ndarray
+    :param initLen: Initial minimum length (in intervals). Defaults to 3.
+    :type initLen: int
+    :return: Estimated minimum matching length (in intervals)
+    :rtype: int
+
+    """
+    trValues = np.asinh(values) - signal.medfilt(
+        np.asinh(values),
+        kernel_size=
+        max(
+            (2 * initLen) + 1,
+            2 * (int(len(values) * 0.005)) + 1,
+        )
+    )
+    nz = trValues[trValues > 0]
+    if len(nz) == 0:
+        return initLen
+    thr = np.quantile(nz, 0.90, method="interpolated_inverted_cdf")
+    mask = nz >= thr
+    if not np.any(mask):
+        return initLen
+    idx = np.flatnonzero(np.diff(np.r_[False, mask, False]))
+    runs = idx.reshape(-1, 2)
+    widths = runs[:, 1] - runs[:, 0]
+    widths = widths[widths >= initLen]
+    if len(widths) == 0:
+        return initLen
+    return int(np.mean(widths))
+
+
 def scalarClip(value: float, low: float, high: float) -> float:
     return low if value < low else high if value > high else value
 
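autoMinLengthIntervals returns a length in interval (bin) units; both matchWavelet and the driver in consenrich.py convert it to base pairs by multiplying by the bin width. A small usage sketch, assuming the consenrich package from this wheel is installed; the toy signal and the 25 bp bin width are illustrative:

import numpy as np
from consenrich import matching  # the module diffed here

rng = np.random.default_rng(7)
values = np.abs(rng.normal(size=20_000))     # toy stand-in for a Consenrich track
values[5_000:5_040] += 5.0                   # one enriched stretch, ~40 bins wide

binWidthBP = 25                              # hypothetical spacing of the genomic intervals
minLenIntervals = matching.autoMinLengthIntervals(values)   # length in bins
minMatchLengthBP = minLenIntervals * binWidthBP             # length in bp, as used downstream
print(minLenIntervals, minMatchLengthBP)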
@@ -124,7 +163,7 @@ def matchExistingBedGraph(
     for chrom_ in sorted(bedGraphDF["chromosome"].unique()):
         df_ = bedGraphDF[bedGraphDF["chromosome"] == chrom_]
         if len(df_) < 5:
-            logger.info(f"Skipping {chrom_}:
+            logger.info(f"Skipping {chrom_}: less than 5 intervals.")
            continue
 
        try:
@@ -234,17 +273,18 @@ def matchWavelet(
     excludeRegionsBedFile: Optional[str] = None,
     weights: Optional[npt.NDArray[np.float64]] = None,
 ) -> pd.DataFrame:
-    r"""Detect structured peaks
+    r"""Detect structured peaks in Consenrich tracks by matching wavelet- or scaling-function–based templates.
 
     :param chromosome: Chromosome name for the input intervals and values.
     :type chromosome: str
     :param values: A 1D array of signal-like values. In this documentation, we refer to values derived from Consenrich,
         but other continuous-valued tracks at evenly spaced genomic intervals may be suitable, too.
     :type values: npt.NDArray[np.float64]
-    :param templateNames: A list of str values -- wavelet
+    :param templateNames: A list of str values -- each entry references a mother wavelet (or its corresponding scaling function). e.g., `[haar, db2]`
     :type templateNames: List[str]
-    :param cascadeLevels:
-        the
+    :param cascadeLevels: Number of cascade iterations used to approximate each template (wavelet or scaling function).
+        Must have the same length as `templateNames`, with each entry aligned to the
+        corresponding template. e.g., given templateNames `[haar, db2]`, then `[2,2]` would use 2 cascade levels for both templates.
     :type cascadeLevels: List[int]
     :param iters: Number of random blocks to sample in the response sequence while building
         an empirical null to test significance. See :func:`cconsenrich.csampleBlockStats`.
@@ -255,7 +295,9 @@ def matchWavelet(
     :type alpha: float
     :param minMatchLengthBP: Within a window of `minMatchLengthBP` length (bp), relative maxima in
         the signal-template convolution must be greater in value than others to qualify as matches.
-
+        If set to a value less than 1, the minimum length is determined via :func:`consenrich.matching.autoMinLengthIntervals`.
+        If set to `None`, defaults to 250 bp.
+    :type minMatchLengthBP: Optional[int]
     :param minSignalAtMaxima: Secondary significance threshold coupled with `alpha`. Requires the *signal value*
         at relative maxima in the response sequence to be greater than this threshold. Comparisons are made in log-scale
         to temper genome-wide dynamic range. If a `float` value is provided, the minimum signal value must be greater
@@ -274,19 +316,47 @@ def matchWavelet(
     :return: A pandas DataFrame with detected matches
     :rtype: pd.DataFrame
     """
+
+    rng = np.random.default_rng(int(randSeed))
     if len(intervals) < 5:
         raise ValueError("`intervals` must be at least length 5")
+
     if len(values) != len(intervals):
         raise ValueError(
             "`values` must have the same length as `intervals`"
         )
+
+    if len(templateNames) != len(cascadeLevels):
+        raise ValueError(
+            "\n\t`templateNames` and `cascadeLevels` must have the same length."
+            "\n\tSet products are not supported, i.e., each template needs an explicitly defined cascade level."
+            "\t\ne.g., for `templateNames = [haar, db2]`, use `cascadeLevels = [2, 2]`, not `[2]`.\n"
+        )
+
     intervalLengthBp = intervals[1] - intervals[0]
+
+    if minMatchLengthBP is not None and minMatchLengthBP < 1:
+        minMatchLengthBP = (
+            autoMinLengthIntervals(values) * int(intervalLengthBp)
+        )
+    elif minMatchLengthBP is None:
+        minMatchLengthBP = 250
+
+    logger.info(
+        f"\n\tUsing minMatchLengthBP: {minMatchLengthBP}"
+    )
+
     if not np.all(np.abs(np.diff(intervals)) == intervalLengthBp):
         raise ValueError("`intervals` must be evenly spaced.")
-
-
-
-
+
+    if weights is not None:
+        if len(weights) != len(values):
+            logger.warning(
+                f"`weights` length {len(weights)} does not match `values` length {len(values)}. Ignoring..."
+            )
+        else:
+            values = values * weights
+
     asinhValues = np.asinh(values, dtype=np.float32)
     asinhNonZeroValues = asinhValues[asinhValues > 0]
     iters = max(int(iters), 1000)
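The new weights branch pre-scales the signal element-wise, but only when the lengths agree; a mismatched weight vector is ignored with a warning. A minimal, standalone sketch of that contract (it uses its own logger rather than the module's, and the arrays are illustrative):

import logging
import numpy as np

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("weights-sketch")

values = np.arange(10, dtype=np.float64)
goodWeights = np.full(10, 0.5)   # same length -> applied multiplicatively
badWeights = np.full(7, 0.5)     # wrong length -> ignored with a warning

for w in (goodWeights, badWeights):
    if len(w) != len(values):
        log.warning("`weights` length %d does not match `values` length %d. Ignoring...", len(w), len(values))
    else:
        values = values * w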
@@ -383,147 +453,142 @@ def matchWavelet(
         high = np.quantile(vals, 0.999)
         return vals[(vals > low) & (vals < high)]
 
-    for cascadeLevel in
-
-
-
-
-
-            continue
-
-        wav = pw.Wavelet(str(templateName))
-        scalingFunc, waveletFunc, _ = wav.wavefun(
-            level=int(cascadeLevel)
-        )
-        template = np.array(
-            scalingFunc if useScalingFunction else waveletFunc,
-            dtype=np.float64,
+    for templateName, cascadeLevel in zip(
+        templateNames, cascadeLevels
+    ):
+        if templateName not in pw.wavelist(kind="discrete"):
+            logger.warning(
+                f"Skipping unknown wavelet template: {templateName}"
             )
-
+            continue
 
-
-
-
-
-
+        wav = pw.Wavelet(str(templateName))
+        scalingFunc, waveletFunc, _ = wav.wavefun(
+            level=int(cascadeLevel)
+        )
+        template = np.array(
+            scalingFunc if useScalingFunction else waveletFunc,
+            dtype=np.float64,
+        )
+        template /= np.linalg.norm(template)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        logger.info(
+            f"\n\tMatching template: {templateName}"
+            f"\n\tcascade level: {cascadeLevel}"
+            f"\n\ttemplate length: {len(template)}"
+        )
+
+        # efficient FFT-based cross-correlation
+        # (OA may be better for smaller templates, TODO add a check)
+        response = signal.fftconvolve(
+            values, template[::-1], mode="same"
+        )
+        thisMinMatchBp = minMatchLengthBP
+        if thisMinMatchBp is None or thisMinMatchBp < 1:
+            thisMinMatchBp = len(template) * intervalLengthBp
+        if thisMinMatchBp % intervalLengthBp != 0:
+            thisMinMatchBp += intervalLengthBp - (
+                thisMinMatchBp % intervalLengthBp
            )
-
-
-
+        relWindowBins = int(
+            ((thisMinMatchBp / intervalLengthBp) / 2) + 1
+        )
+        relWindowBins = max(relWindowBins, 1)
+        asinhThreshold = parseMinSignalThreshold(minSignalAtMaxima)
+        for nullMask, testMask, tag in [
+            (halfLeftMask, halfRightMask, "R"),
+            (halfRightMask, halfLeftMask, "L"),
+        ]:
+            blockMaxima = sampleBlockMaxima(
+                response,
+                nullMask,
+                relWindowBins,
+                nsamp=max(iters, 1000),
+                seed=rng.integers(1, 10_000),
            )
-
-
-                (halfRightMask, halfLeftMask, "L"),
-            ]:
+            if len(blockMaxima) < 25:
+                pooledMask = ~excludeMaskGlobal.astype(bool)
                 blockMaxima = sampleBlockMaxima(
                     response,
-
+                    pooledMask,
                     relWindowBins,
                     nsamp=max(iters, 1000),
                     seed=rng.integers(1, 10_000),
                 )
-
-
-
-
-
-
-
-
-
-
-            candidateIdx = relativeMaxima(response, relWindowBins)
-
-            candidateMask = (
-                (candidateIdx >= relWindowBins)
-                & (candidateIdx < len(response) - relWindowBins)
-                & (testMask[candidateIdx])
-                & (excludeMaskGlobal[candidateIdx] == 0)
-                & (asinhValues[candidateIdx] > asinhThreshold)
-            )
+            ecdfSf = stats.ecdf(blockMaxima).sf
+            candidateIdx = relativeMaxima(response, relWindowBins)
+
+            candidateMask = (
+                (candidateIdx >= relWindowBins)
+                & (candidateIdx < len(response) - relWindowBins)
+                & (testMask[candidateIdx])
+                & (excludeMaskGlobal[candidateIdx] == 0)
+                & (asinhValues[candidateIdx] > asinhThreshold)
+            )
 
-
-
-
-
-
-
-
-
-
-
-            ]
+            candidateIdx = candidateIdx[candidateMask]
+            if len(candidateIdx) == 0:
+                continue
+            if (
+                maxNumMatches is not None
+                and len(candidateIdx) > maxNumMatches
+            ):
+                candidateIdx = candidateIdx[
+                    np.argsort(asinhValues[candidateIdx])[
+                        -maxNumMatches:
                     ]
-
-
-
-
-
-
-
+                ]
+            pEmp = np.clip(
+                ecdfSf.evaluate(response[candidateIdx]),
+                1.0e-10,
+                1.0,
+            )
+            startsIdx = np.maximum(candidateIdx - relWindowBins, 0)
+            endsIdx = np.minimum(
+                len(values) - 1, candidateIdx + relWindowBins
+            )
+            pointSourcesIdx = []
+            for s, e in zip(startsIdx, endsIdx):
+                pointSourcesIdx.append(
+                    np.argmax(values[s : e + 1]) + s
                 )
-
-
+            pointSourcesIdx = np.array(pointSourcesIdx)
+            starts = intervals[startsIdx]
+            ends = intervals[endsIdx]
+            pointSourcesAbs = (intervals[pointSourcesIdx]) + max(
+                1, intervalLengthBp // 2
+            )
+            if recenterAtPointSource:
+                starts = pointSourcesAbs - (
+                    relWindowBins * intervalLengthBp
                 )
-
-
-                pointSourcesIdx.append(
-                    np.argmax(values[s : e + 1]) + s
-                )
-            pointSourcesIdx = np.array(pointSourcesIdx)
-            starts = intervals[startsIdx]
-            ends = intervals[endsIdx]
-            pointSourcesAbs = (intervals[pointSourcesIdx]) + max(
-                1, intervalLengthBp // 2
+                ends = pointSourcesAbs + (
+                    relWindowBins * intervalLengthBp
                 )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            pointSourcesRel = (
+                intervals[pointSourcesIdx] - starts
+            ) + max(1, intervalLengthBp // 2)
+            sqScores = (1 + response[candidateIdx]) ** 2
+            minR, maxR = (
+                float(np.min(sqScores)),
+                float(np.max(sqScores)),
+            )
+            rangeR = max(maxR - minR, 1.0)
+            scores = (250 + 750 * (sqScores - minR) / rangeR).astype(int)
+            for i, idxVal in enumerate(candidateIdx):
+                allRows.append(
+                    {
+                        "chromosome": chromosome,
+                        "start": int(starts[i]),
+                        "end": int(ends[i]),
+                        "name": f"{templateName}_{cascadeLevel}_{idxVal}_{tag}",
+                        "score": int(scores[i]),
+                        "strand": ".",
+                        "signal": float(response[idxVal]),
+                        "p_raw": float(pEmp[i]),
+                        "pointSource": int(pointSourcesRel[i]),
+                    }
                 )
-            rangeR = max(maxR - minR, 1.0)
-            scores = (
-                250 + 750 * (sqScores - minR) / rangeR
-            ).astype(int)
-            for i, idxVal in enumerate(candidateIdx):
-                allRows.append(
-                    {
-                        "chromosome": chromosome,
-                        "start": int(starts[i]),
-                        "end": int(ends[i]),
-                        "name": f"{templateName}_{cascadeLevel}_{idxVal}_{tag}",
-                        "score": int(scores[i]),
-                        "strand": ".",
-                        "signal": float(response[idxVal]),
-                        "p_raw": float(pEmp[i]),
-                        "pointSource": int(pointSourcesRel[i]),
-                    }
-                )
 
     if not allRows:
         logger.warning(