consenrich 0.7.4b3__cp311-cp311-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of consenrich might be problematic; see the registry's release notes for more details.

consenrich/core.py ADDED
@@ -0,0 +1,1441 @@
1
+ # -*- coding: utf-8 -*-
2
+ r"""
3
+ Consenrich core functions and classes.
4
+
5
+ """
6
+
7
+ import logging
8
+ import os
9
+ from tempfile import NamedTemporaryFile
10
+ from typing import (
11
+ Callable,
12
+ List,
13
+ Optional,
14
+ Tuple,
15
+ DefaultDict,
16
+ Any,
17
+ NamedTuple,
18
+ )
19
+
20
+ import numpy as np
21
+ import numpy.typing as npt
22
+ import pybedtools as bed
23
+ from scipy import signal, ndimage
24
+
25
+ from . import cconsenrich
26
+
27
# Configure logging at import time with a module/function-aware format.
# NOTE(review): calling basicConfig() in a library module configures the
# embedding application's root logger -- confirm this is intentional.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(module)s.%(funcName)s - %(levelname)s - %(message)s",
)

# Module-level logger used by all functions in this file.
logger = logging.getLogger(__name__)
33
+
34
+
35
def resolveExtendBP(extendBP, bamFiles: List[str]) -> List[int]:
    r"""Normalize `extendBP` into one integer extension value per BAM file.

    Accepts ``None`` (no extension), a comma-separated string, a list of
    numbers, or a single number (broadcast to all files), and validates the
    result against the number of BAM files.

    :param extendBP: ``None``, ``str``, ``list``, or a single number.
    :param bamFiles: BAM file paths; determines the expected output length.
    :return: One extension value (bp) per entry of `bamFiles`.
    :raises ValueError: If a string cannot be parsed, or a list length is
        neither 0, 1, nor ``len(bamFiles)``.
    :raises TypeError: If `extendBP` has an unsupported type.
    """
    sampleCount = len(bamFiles)

    # A string is first parsed into a list, then handled by the list branch.
    if isinstance(extendBP, str):
        compact = extendBP.replace(" ", "")
        try:
            extendBP = (
                [int(token) for token in compact.split(",")]
                if compact
                else []
            )
        except ValueError:
            raise ValueError(
                "`extendBP` string must be comma-separated values (castable to integers)"
            )

    if extendBP is None:
        return [0] * sampleCount

    if isinstance(extendBP, list):
        parsed = [int(item) for item in extendBP]
        if not parsed:
            return [0] * sampleCount
        if len(parsed) == 1:
            # single value broadcast to every BAM file
            return parsed * sampleCount
        if len(parsed) == sampleCount:
            return parsed
        raise ValueError(
            f"extendBP length {len(parsed)} does not match number of bamFiles {sampleCount}; "
            f"provide 0, 1, or {sampleCount} values."
        )

    if isinstance(extendBP, (int, float)):
        return [int(extendBP)] * sampleCount

    raise TypeError(
        f"Invalid extendBP type: {type(extendBP).__name__}. "
        "Expecting a single number (broadcast), a list of numbers matching `bamFiles`."
    )
71
+
72
+
73
class processParams(NamedTuple):
    r"""Parameters related to the process model of Consenrich.

    The process model governs the signal and variance propagation
    through the state transition :math:`\mathbf{F} \in \mathbb{R}^{2 \times 2}`
    and process noise covariance :math:`\mathbf{Q}_{[i]} \in \mathbb{R}^{2 \times 2}`
    matrices.

    :param deltaF: Scales the signal and variance propagation between adjacent genomic intervals.
    :type deltaF: float
    :param minQ: Minimum process noise level (diagonal in :math:`\mathbf{Q}_{[i]}`)
        for each state variable.
    :type minQ: float
    :param maxQ: Upper counterpart to `minQ` -- presumably caps the diagonal of
        :math:`\mathbf{Q}_{[i]}`; confirm against the filter implementation.
    :type maxQ: float
    :param offDiagQ: Off-diagonal value of :math:`\mathbf{Q}_{[i]}` -- presumably;
        confirm against the filter implementation.
    :type offDiagQ: float
    :param dStatAlpha: Threshold on the deviation between the data and estimated signal -- used to determine whether the process noise is scaled up.
    :type dStatAlpha: float
    :param dStatd: Constant :math:`d` in the scaling expression :math:`\sqrt{d|D_{[i]} - \alpha_D| + c}`
        that is used to up/down-scale the process noise covariance in the event of a model mismatch.
    :type dStatd: float
    :param dStatPC: Constant :math:`c` in the scaling expression :math:`\sqrt{d|D_{[i]} - \alpha_D| + c}`
        that is used to up/down-scale the process noise covariance in the event of a model mismatch.
    :type dStatPC: float
    :param scaleResidualsByP11: If `True`, the primary state variances (posterior) :math:`\widetilde{P}_{[i], (11)}, i=1\ldots n` are included in the inverse-variance (precision) weighting of residuals :math:`\widetilde{\mathbf{y}}_{[i]}, i=1\ldots n`.
        If `False`, only the per-sample *observation noise levels* will be used in the precision-weighting. Note that this does not affect `raw` residuals output (See :class:`outputParams`).
    :type scaleResidualsByP11: Optional[bool]

    """

    deltaF: float
    minQ: float
    maxQ: float
    offDiagQ: float
    dStatAlpha: float
    dStatd: float
    dStatPC: float
    scaleResidualsByP11: Optional[bool] = True
107
+
108
+
109
class observationParams(NamedTuple):
    r"""Parameters related to the observation model of Consenrich.

    The observation model is used to integrate sequence alignment count
    data from the multiple input samples and account for region-and-sample-specific
    noise processes corrupting data. The observation model matrix
    :math:`\mathbf{H} \in \mathbb{R}^{m \times 2}` maps from the state dimension (2)
    to the dimension of measurements/data (:math:`m`).

    :param minR: The minimum observation noise level for each sample
        :math:`j=1\ldots m` in the observation noise covariance
        matrix :math:`\mathbf{R}_{[i, (11:mm)]}`.
    :type minR: float
    :param maxR: The maximum observation noise level per sample -- used as the
        upper clip on noise-level tracks (see :func:`getAverageLocalVarianceTrack`).
    :type maxR: float
    :param useConstantNoiseLevel: If True, presumably a single constant noise
        level is used across intervals -- confirm against the filter implementation.
    :type useConstantNoiseLevel: bool
    :param numNearest: The number of nearest nearby 'sparse' features to use for local
        variance calculation. Ignored if `useALV` is True.
    :type numNearest: int
    :param localWeight: The coefficient for the local noise level (based on the local surrounding window / `numNearest` features) used in the weighted sum measuring sample-specific noise level at the current interval.
    :type localWeight: float
    :param globalWeight: The coefficient for the global noise level (based on all genomic intervals :math:`i=1\ldots n`) used in the weighted sum measuring sample-specific noise level at the current interval.
    :type globalWeight: float
    :param approximationWindowLengthBP: The length of the local variance approximation window in base pairs (BP)
        for the local variance calculation.
    :type approximationWindowLengthBP: int
    :param lowPassWindowLengthBP: Window length (bp) of the low-pass filter applied to
        the local variance track (see :func:`getAverageLocalVarianceTrack`).
    :type lowPassWindowLengthBP: int
    :param sparseBedFile: The path to a BED file of 'sparse' regions for the local variance calculation. For genomes with default resources in `src/consenrich/data`, this may be left as `None`,
        and a default annotation that is devoid of putative regulatory elements (ENCODE cCREs) will be used. Users can instead supply a custom BED file or set `observationParams.useALV` to `True`
        to avoid predefined annotations.
    :type sparseBedFile: str, optional
    :param noGlobal: If True, only the 'local' variances are used to approximate observation noise
        covariance :math:`\mathbf{R}_{[:, (11:mm)]}`.
    :type noGlobal: bool
    :param useALV: Whether to use average local variance (ALV) heuristic *exclusively* to approximate observation noise
        covariances per-sample, per-interval. Note that unrestricted ALV (i.e., without masking previously annotated high-signal regions) is comparatively vulnerable to inflated noise estimates in large enriched genomic domains.
    :type useALV: bool
    :param lowPassFilterType: The type of low-pass filter to use (e.g., 'median', 'mean') in the ALV calculation (:func:`consenrich.core.getAverageLocalVarianceTrack`).
    :type lowPassFilterType: Optional[str]
    :param returnCenter: Semantics not evident from this module -- presumably
        controls returning a centered statistic; confirm against usage.
    :type returnCenter: bool
    """

    minR: float
    maxR: float
    useALV: bool
    useConstantNoiseLevel: bool
    noGlobal: bool
    numNearest: int
    localWeight: float
    globalWeight: float
    approximationWindowLengthBP: int
    lowPassWindowLengthBP: int
    lowPassFilterType: Optional[str]
    returnCenter: bool
158
+
159
+
160
class stateParams(NamedTuple):
    r"""Parameters related to state and uncertainty bounds and initialization.

    :param stateInit: Initial value of the 'primary' state/signal at the first genomic interval: :math:`x_{[1]}`
    :type stateInit: float
    :param stateCovarInit: Initial state covariance scale. Note, the *initial* state uncertainty :math:`\mathbf{P}_{[1]}` is a multiple of the identity matrix :math:`\mathbf{I}`. Final results are typically insensitive to this parameter, since the filter effectively 'forgets' its initialization after processing a moderate number of intervals and backward smoothing.
    :type stateCovarInit: float
    :param boundState: If True, the primary state estimate for :math:`x_{[i]}` is reported within `stateLowerBound` and `stateUpperBound`. Note that the internal filtering is unaffected.
    :type boundState: bool
    :param stateLowerBound: Lower bound for the state estimate.
    :type stateLowerBound: float
    :param stateUpperBound: Upper bound for the state estimate.
    :type stateUpperBound: float
    """

    stateInit: float
    stateCovarInit: float
    boundState: bool
    stateLowerBound: float
    stateUpperBound: float
180
+
181
+
182
class samParams(NamedTuple):
    r"""Parameters related to reading BAM files

    :param samThreads: The number of threads to use for reading BAM files.
    :type samThreads: int
    :param samFlagExclude: The SAM flag to exclude certain reads.
    :type samFlagExclude: int
    :param oneReadPerBin: If 1, only the interval with the greatest read overlap is incremented.
    :type oneReadPerBin: int
    :param chunkSize: maximum number of intervals' data to hold in memory before flushing to disk.
    :type chunkSize: int
    :param offsetStr: A string of two comma-separated integers -- first for the 5' shift on forward strand, second for the 5' shift on reverse strand.
    :type offsetStr: str
    :param extendBP: A list of integers specifying the number of base pairs to extend reads for each BAM file after shifting per `offsetStr`.
        If all BAM files share the same expected frag. length, can supply a single numeric value to be broadcasted. `None` (the default) means
        no extension and is equivalent to an empty list. Ignored for PE reads.
    :type extendBP: Optional[List[int]]
    :param maxInsertSize: Maximum frag length/insert for paired-end reads.
    :type maxInsertSize: int
    :param pairedEndMode: If > 0, only proper pairs are counted subject to `maxInsertSize`.
    :type pairedEndMode: int
    :param inferFragmentLength: Intended for single-end data: if > 0, the maximum correlation lag
        (avg.) between *strand-specific* read tracks is taken as the fragment length estimate and used to
        extend reads from 5'. Ignored if `pairedEndMode > 0` or `extendBP` set. This parameter is particularly
        important when targeting broader marks (e.g., ChIP-seq H3K27me3).
    :type inferFragmentLength: int
    :param countEndsOnly: If True, only the 5' ends of reads are counted. Overrides `inferFragmentLength` and `pairedEndMode`.
    :type countEndsOnly: Optional[bool]

    .. tip::

        For an overview of SAM flags, see https://broadinstitute.github.io/picard/explain-flags.html

    """

    samThreads: int
    samFlagExclude: int
    oneReadPerBin: int
    chunkSize: int
    offsetStr: Optional[str] = "0,0"
    # Default is None (treated as "no extension" by resolveExtendBP) rather
    # than a mutable [] literal, which would be a single shared list object.
    extendBP: Optional[List[int]] = None
    maxInsertSize: Optional[int] = 1000
    pairedEndMode: Optional[int] = 0
    inferFragmentLength: Optional[int] = 0
    countEndsOnly: Optional[bool] = False
226
+
227
+
228
class detrendParams(NamedTuple):
    r"""Parameters related to detrending and background-removal

    :param useOrderStatFilter: Whether to use a local/moving order statistic (percentile filter) to model and remove trends in the read density data.
    :type useOrderStatFilter: bool
    :param usePolyFilter: Whether to use a low-degree polynomial fit to model and remove trends in the read density data.
    :type usePolyFilter: bool
    :param detrendSavitzkyGolayDegree: The polynomial degree of the Savitzky-Golay filter to use for detrending
    :type detrendSavitzkyGolayDegree: int
    :param detrendTrackPercentile: The percentile to use for the local/moving order-statistic filter.
        Decrease for broad marks + sparse data if `useOrderStatFilter` is True.
    :type detrendTrackPercentile: float
    :param detrendWindowLengthBP: The length of the window in base pairs for detrending.
        Increase for broader marks + sparse data.
    :type detrendWindowLengthBP: int
    """

    useOrderStatFilter: bool
    usePolyFilter: bool
    detrendTrackPercentile: float
    detrendSavitzkyGolayDegree: int
    detrendWindowLengthBP: int
250
+
251
+
252
class inputParams(NamedTuple):
    r"""Parameters related to the input data for Consenrich.

    :param bamFiles: A list of paths to distinct coordinate-sorted and indexed BAM files.
    :type bamFiles: List[str]

    :param bamFilesControl: A list of paths to distinct coordinate-sorted and
        indexed control BAM files. e.g., IgG control inputs for ChIP-seq.

    :type bamFilesControl: List[str], optional

    :param pairedEnd: Presumably flags paired-end sequencing data -- confirm
        against how callers translate this into `samParams.pairedEndMode`.
    :type pairedEnd: bool, optional

    """

    bamFiles: List[str]
    bamFilesControl: Optional[List[str]]
    pairedEnd: Optional[bool]
268
+
269
+
270
class genomeParams(NamedTuple):
    r"""Specify assembly-specific resources, parameters.

    :param genomeName: If supplied, default resources for the assembly (sizes file, blacklist, and 'sparse' regions) in `src/consenrich/data` are used.
        ``ce10, ce11, dm6, hg19, hg38, mm10, mm39`` have default resources available.
    :type genomeName: str
    :param chromSizesFile: A two-column TSV-like file with chromosome names and sizes (in base pairs).
    :type chromSizesFile: str
    :param blacklistFile: A BED file with regions to exclude.
    :type blacklistFile: str, optional
    :param sparseBedFile: A BED file with 'sparse regions' used to estimate noise levels -- ignored if `observationParams.useALV` is True. 'Sparse regions' broadly refers to genomic intervals devoid of the targeted signal, based on prior annotations.
        Users may supply a custom BED file and/or set `observationParams.useALV` to `True` to avoid relying on predefined annotations.
    :type sparseBedFile: str, optional
    :param chromosomes: A list of chromosome names to analyze. If None, all chromosomes in `chromSizesFile` are used.
    :type chromosomes: List[str]
    :param excludeChroms: Chromosome names to exclude from analysis -- presumably;
        confirm against the chromosome-selection logic.
    :type excludeChroms: List[str]
    :param excludeForNorm: Chromosome names excluded when computing normalization
        factors -- presumably; confirm against the normalization logic.
    :type excludeForNorm: List[str]
    """

    genomeName: str
    chromSizesFile: str
    blacklistFile: Optional[str]
    sparseBedFile: Optional[str]
    chromosomes: List[str]
    excludeChroms: List[str]
    excludeForNorm: List[str]
294
+
295
+
296
class countingParams(NamedTuple):
    r"""Parameters related to counting reads in genomic intervals.

    :param stepSize: Step size (bp) for the genomic intervals (AKA bin size, interval length, width, etc.)
    :type stepSize: int
    :param scaleDown: If using paired treatment and control BAM files, whether to
        scale down the larger of the two before computing the difference/ratio
    :type scaleDown: bool, optional
    :param scaleFactors: Scale factors for the read counts.
    :type scaleFactors: List[float], optional
    :param scaleFactorsControl: Scale factors for the control read counts.
    :type scaleFactorsControl: List[float], optional
    :param numReads: Number of reads to sample.
    :type numReads: int
    :param applyAsinh: If true, :math:`\textsf{arsinh}(x)` applied to counts :math:`x` for each supplied BAM file (log-like for large values and linear near the origin).
    :type applyAsinh: bool, optional
    :param applyLog: If true, :math:`\textsf{log}(x + 1)` applied to counts :math:`x` for each supplied BAM file.
    :type applyLog: bool, optional
    :param rescaleToTreatmentCoverage: Deprecated; retained only for backward
        compatibility of the field layout.
    :type rescaleToTreatmentCoverage: bool, optional
    """

    stepSize: int
    scaleDown: Optional[bool]
    scaleFactors: Optional[List[float]]
    scaleFactorsControl: Optional[List[float]]
    numReads: int
    applyAsinh: Optional[bool]
    applyLog: Optional[bool]
    rescaleToTreatmentCoverage: Optional[bool] = False  # deprecated
324
+
325
+
326
class matchingParams(NamedTuple):
    r"""Parameters related to the matching algorithm.

    See :ref:`matching` for an overview of the approach.

    :param templateNames: A list of str values -- each entry references a mother wavelet (or its corresponding scaling function). e.g., `[haar, db2]`
    :type templateNames: List[str]
    :param cascadeLevels: Number of cascade iterations used to approximate each template (wavelet or scaling function).
        Must have the same length as `templateNames`, with each entry aligned to the
        corresponding template. e.g., given templateNames `[haar, db2]`, then `[2,2]` would use 2 cascade levels for both templates.
    :type cascadeLevels: List[int]
    :param iters: Number of random blocks to sample in the response sequence while building
        an empirical null to test significance. See :func:`cconsenrich.csampleBlockStats`.
    :type iters: int
    :param alpha: Primary significance threshold on detected matches. Specifically, the
        minimum corrected empirical p-value approximated from randomly sampled blocks in the
        response sequence.
    :type alpha: float
    :param minMatchLengthBP: Within a window of `minMatchLengthBP` length (bp), relative maxima in
        the signal-template convolution must be greater in value than others to qualify as matches.
        If set to a value less than 1, the minimum length is determined via :func:`consenrich.matching.autoMinLengthIntervals`.
        If set to `None`, defaults to 250 bp.
    :type minMatchLengthBP: Optional[int]
    :param maxNumMatches: Upper bound on the number of reported matches --
        presumably; confirm against the matching implementation.
    :type maxNumMatches: Optional[int]
    :param minSignalAtMaxima: Secondary significance threshold coupled with `alpha`. Requires the *signal value*
        at relative maxima in the response sequence to be greater than this threshold. Comparisons are made in log-scale
        to temper genome-wide dynamic range. If a `float` value is provided, the minimum signal value must be greater
        than this (absolute) value. *Set to a negative value to disable the threshold*.
        If a `str` value is provided, looks for 'q:quantileValue', e.g., 'q:0.90'. The
        threshold is then set to the corresponding quantile of the non-zero signal estimates.
    :type minSignalAtMaxima: Optional[str | float]
    :param merge: If True, presumably merges nearby detected matches -- confirm
        against the matching implementation.
    :type merge: Optional[bool]
    :param mergeGapBP: Maximum gap (bp) between matches to merge -- presumably
        used with `merge`; confirm.
    :type mergeGapBP: Optional[int]
    :param useScalingFunction: If True, use (only) the scaling function to build the matching template.
        If False, use (only) the wavelet function.
    :type useScalingFunction: bool
    :param excludeRegionsBedFile: A BED file with regions to exclude from matching
    :type excludeRegionsBedFile: Optional[str]
    :param penalizeBy: Specify a positional metric to scale/weight signal values by when matching.
        For example, 'absResiduals' divides signal values by :math:`|\widetilde{y}_i|` at each
        position :math:`i`, thereby down-weighting positions where the signal estimate deviates from
        the data after accounting for observation noise. 'stateUncertainty' divides signal values by
        the square root of the primary state variance :math:`\sqrt{\widetilde{P}_{i,(11)}}` at each position :math:`i`,
        thereby down-weighting positions where the posterior state uncertainty is high. 'muncTrace' divides signal values by
        the square root of the average observation noise trace :math:`\sqrt{\frac{\textsf{Trace}\left(\mathbf{R}_{[i]}\right)}{m}}` at each position :math:`i`,
    :type penalizeBy: Optional[str]
    :param randSeed: Seed for the random block sampling (default 42).
    :type randSeed: Optional[int]
    :param eps: Tolerance parameter for relative maxima detection in the response sequence. Set to zero to enforce strict
        inequalities when identifying discrete relative maxima.
    :type eps: float
    :seealso: :func:`cconsenrich.csampleBlockStats`, :ref:`matching`, :class:`outputParams`.
    """

    templateNames: List[str]
    cascadeLevels: List[int]
    iters: int
    alpha: float
    useScalingFunction: Optional[bool]
    minMatchLengthBP: Optional[int]
    maxNumMatches: Optional[int]
    minSignalAtMaxima: Optional[str | float]
    merge: Optional[bool]
    mergeGapBP: Optional[int]
    excludeRegionsBedFile: Optional[str]
    penalizeBy: Optional[str]
    randSeed: Optional[int] = 42
    eps: Optional[float] = 1.0e-2
388
+
389
+
390
class outputParams(NamedTuple):
    r"""Parameters related to output files.

    :param convertToBigWig: If True, output bedGraph files are converted to bigWig format.
    :type convertToBigWig: bool
    :param roundDigits: Number of decimal places to round output values (bedGraph)
    :type roundDigits: int
    :param writeResiduals: If True, write to a separate bedGraph the mean of precision-weighted residuals at each interval. These may be interpreted as
        a measure of model mismatch. Where these quantities are large (+-) the estimated signal and uncertainty do not explain the observed deviation from the data.
    :type writeResiduals: bool
    :param writeRawResiduals: If True, write to a separate bedGraph the pointwise avg. of post-fit residuals at each interval. These values are not 'precision-weighted'.
    :type writeRawResiduals: bool
    :param writeMuncTrace: If True, write to a separate bedGraph :math:`\sqrt{\frac{\textsf{Trace}\left(\mathbf{R}_{[i]}\right)}{m}}` -- that is, square root of the 'average' observation noise level at each interval :math:`i=1\ldots n`, where :math:`m` is the number of samples/tracks.
    :type writeMuncTrace: bool
    :param writeStateStd: If True, write to a separate bedGraph estimated 'standard deviation' of the primary state, :math:`\sqrt{\widetilde{P}_{i,(11)}}`, at each interval. Note that an absolute Gaussian interpretation of this metric depends on sample size, arguments in :class:`processParams` and :class:`observationParams`, etc.
        In any case, this metric may be interpreted as a relative measure of uncertainty in the state estimate at each interval.
    :type writeStateStd: bool

    :seealso: :class:`processParams` (``scaleResidualsByP11`` controls how the
        residuals written by `writeResiduals` are precision-weighted).
    """

    convertToBigWig: bool
    roundDigits: int
    writeResiduals: bool
    writeRawResiduals: bool
    writeMuncTrace: bool
    writeStateStd: bool
415
+
416
+
417
+ def _numIntervals(start: int, end: int, step: int) -> int:
418
+ # helper for consistency
419
+ length = max(0, end - start)
420
+ return (length + step) // step
421
+
422
+
423
def getChromRanges(
    bamFile: str,
    chromosome: str,
    chromLength: int,
    samThreads: int,
    samFlagExclude: int,
) -> Tuple[int, int]:
    r"""Locate the coordinate span covered by mapped reads on one chromosome.

    :param bamFile: See :class:`inputParams`.
    :type bamFile: str
    :param chromosome: the chromosome to read in `bamFile`.
    :type chromosome: str
    :param chromLength: Base pair length of the chromosome.
    :type chromLength: int
    :param samThreads: See :class:`samParams`.
    :type samThreads: int
    :param samFlagExclude: See :class:`samParams`.
    :type samFlagExclude: int
    :return: Tuple of start and end positions (nucleotide coordinates) in the chromosome.
    :rtype: Tuple[int, int]

    :seealso: :func:`getChromRangesJoint`, :func:`cconsenrich.cgetFirstChromRead`, :func:`cconsenrich.cgetLastChromRead`
    """
    # Both boundary queries are delegated to the Cython layer.
    commonArgs = (bamFile, chromosome, chromLength, samThreads, samFlagExclude)
    firstRead: int = cconsenrich.cgetFirstChromRead(*commonArgs)
    lastRead: int = cconsenrich.cgetLastChromRead(*commonArgs)
    return firstRead, lastRead
454
+
455
+
456
def getChromRangesJoint(
    bamFiles: List[str],
    chromosome: str,
    chromSize: int,
    samThreads: int,
    samFlagExclude: int,
) -> Tuple[int, int]:
    r"""Reconcile one counting range across several BAM files: the earliest
    first-read position and the latest last-read position over all files.

    :param bamFiles: List of BAM files to read.
    :type bamFiles: List[str]
    :param chromosome: Chromosome to read.
    :type chromosome: str
    :param chromSize: Size of the chromosome.
    :type chromSize: int
    :param samThreads: Number of threads to use for reading the BAM files.
    :type samThreads: int
    :param samFlagExclude: SAM flag to exclude certain reads.
    :type samFlagExclude: int
    :return: Tuple of start and end positions.
    :rtype: Tuple[int, int]

    :seealso: :func:`getChromRanges`, :func:`cconsenrich.cgetFirstChromRead`, :func:`cconsenrich.cgetLastChromRead`
    """
    perFileRanges = [
        getChromRanges(
            bamFile,
            chromosome,
            chromLength=chromSize,
            samThreads=samThreads,
            samFlagExclude=samFlagExclude,
        )
        for bamFile in bamFiles
    ]
    # unzip into parallel tuples; raises ValueError on an empty file list,
    # as the original min()/max() over empty sequences did
    starts, ends = zip(*perFileRanges)
    return min(starts), max(ends)
494
+
495
+
496
def getReadLength(
    bamFile: str,
    numReads: int,
    maxIterations: int,
    samThreads: int,
    samFlagExclude: int,
) -> int:
    r"""Infer the read length from mapped reads in a BAM file.

    Samples at least `numReads` reads passing criteria given by `samFlagExclude`
    and returns the median read length.

    :param bamFile: See :class:`inputParams`.
    :type bamFile: str
    :param numReads: Number of reads to sample.
    :type numReads: int
    :param maxIterations: Maximum number of iterations to perform.
    :type maxIterations: int
    :param samThreads: See :class:`samParams`.
    :type samThreads: int
    :param samFlagExclude: See :class:`samParams`.
    :type samFlagExclude: int
    :return: The median read length.
    :rtype: int

    :raises ValueError: If the read length cannot be determined after scanning `maxIterations` reads.

    :seealso: :func:`cconsenrich.cgetReadLength`
    """
    medianLength = cconsenrich.cgetReadLength(
        bamFile, numReads, samThreads, maxIterations, samFlagExclude
    )
    # the Cython helper signals failure with a zero length
    if not medianLength:
        raise ValueError(
            f"Failed to determine read length in {bamFile}. Revise `numReads`, and/or `samFlagExclude` parameters?"
        )
    return medianLength
533
+
534
+
535
def getReadLengths(
    bamFiles: List[str],
    numReads: int,
    maxIterations: int,
    samThreads: int,
    samFlagExclude: int,
) -> List[int]:
    r"""Infer the median read length of every BAM file in `bamFiles`.

    :seealso: :func:`getReadLength`
    """
    lengths: List[int] = []
    for bamFile in bamFiles:
        lengths.append(
            getReadLength(
                bamFile,
                numReads=numReads,
                maxIterations=maxIterations,
                samThreads=samThreads,
                samFlagExclude=samFlagExclude,
            )
        )
    return lengths
556
+
557
+
558
def readBamSegments(
    bamFiles: List[str],
    chromosome: str,
    start: int,
    end: int,
    stepSize: int,
    readLengths: List[int],
    scaleFactors: List[float],
    oneReadPerBin: int,
    samThreads: int,
    samFlagExclude: int,
    offsetStr: Optional[str] = "0,0",
    applyAsinh: Optional[bool] = False,
    applyLog: Optional[bool] = False,
    extendBP: Optional[List[int]] = None,
    maxInsertSize: Optional[int] = 1000,
    pairedEndMode: Optional[int] = 0,
    inferFragmentLength: Optional[int] = 0,
    countEndsOnly: Optional[bool] = False,
) -> npt.NDArray[np.float32]:
    r"""Calculate tracks of read counts (or a function thereof) for each BAM file.

    See :func:`cconsenrich.creadBamSegment` for the underlying implementation in Cython.

    :param bamFiles: See :class:`inputParams`.
    :type bamFiles: List[str]
    :param chromosome: Chromosome to read.
    :type chromosome: str
    :param start: Start position of the genomic segment.
    :type start: int
    :param end: End position of the genomic segment.
    :type end: int
    :param readLengths: List of read lengths for each BAM file.
    :type readLengths: List[int]
    :param scaleFactors: List of scale factors for each BAM file.
    :type scaleFactors: List[float]
    :param stepSize: See :class:`countingParams`.
    :type stepSize: int
    :param oneReadPerBin: See :class:`samParams`.
    :type oneReadPerBin: int
    :param samThreads: See :class:`samParams`.
    :type samThreads: int
    :param samFlagExclude: See :class:`samParams`.
    :type samFlagExclude: int
    :param offsetStr: See :class:`samParams`. `None` or empty is treated as "0,0".
    :type offsetStr: str
    :param extendBP: See :class:`samParams`. `None` (default) means no extension.
    :type extendBP: Optional[List[int]]
    :param maxInsertSize: See :class:`samParams`.
    :type maxInsertSize: int
    :param pairedEndMode: See :class:`samParams`.
    :type pairedEndMode: int
    :param inferFragmentLength: See :class:`samParams`.
    :type inferFragmentLength: int
    :param countEndsOnly: If True, only the 5' ends of reads are counted. This overrides `inferFragmentLength` and `pairedEndMode`.
    :type countEndsOnly: Optional[bool]
    :raises ValueError: If `bamFiles` is empty, list-argument lengths disagree,
        or `offsetStr` does not contain exactly two comma-separated integers.
    """
    if len(bamFiles) == 0:
        raise ValueError("bamFiles list is empty")

    if len(readLengths) != len(bamFiles) or len(scaleFactors) != len(
        bamFiles
    ):
        raise ValueError(
            "readLengths and scaleFactors must match bamFiles length"
        )

    extendPerFile = resolveExtendBP(extendBP, bamFiles)

    # Fix: the original `str(offsetStr) or "0,0"` could never fall back,
    # since str(None) == "None" is truthy and then int("None") raised.
    offsetTokens = str(offsetStr or "0,0").replace(" ", "").split(",")
    if len(offsetTokens) != 2:
        raise ValueError(
            "`offsetStr` must contain exactly two comma-separated integers"
        )
    forwardOffset, reverseOffset = int(offsetTokens[0]), int(offsetTokens[1])

    numIntervals = ((end - start) + stepSize - 1) // stepSize
    counts = np.empty((len(bamFiles), numIntervals), dtype=np.float32)

    if countEndsOnly:
        # counting only 5' ends overrides fragment-based extension/pairing
        inferFragmentLength = 0
        pairedEndMode = 0

    for fileIdx, bamFile in enumerate(bamFiles):
        logger.info(f"Reading {chromosome}: {bamFile}")
        track = cconsenrich.creadBamSegment(
            bamFile,
            chromosome,
            start,
            end,
            stepSize,
            readLengths[fileIdx],
            oneReadPerBin,
            samThreads,
            samFlagExclude,
            forwardOffset,
            reverseOffset,
            extendPerFile[fileIdx],
            maxInsertSize,
            pairedEndMode,
            inferFragmentLength,
        )
        counts[fileIdx, :] = track
        counts[fileIdx, :] *= np.float32(scaleFactors[fileIdx])
        # optional variance-stabilizing transforms (asinh takes precedence)
        if applyAsinh:
            counts[fileIdx, :] = np.arcsinh(counts[fileIdx, :])
        elif applyLog:
            counts[fileIdx, :] = np.log1p(counts[fileIdx, :])
    return counts
666
+
667
+
668
+
669
def getAverageLocalVarianceTrack(
    values: np.ndarray,
    stepSize: int,
    approximationWindowLengthBP: int,
    lowPassWindowLengthBP: int,
    minR: float,
    maxR: float,
    lowPassFilterType: Optional[str] = "median",
) -> npt.NDArray[np.float32]:
    r"""Generate positional noise-level tracks with first and second moments approximated from local windows

    First, computes a moving average of ``values`` using a bp-length window
    ``approximationWindowLengthBP`` and a moving average of ``values**2`` over the
    same window. Their difference is used to approximate the local variance. A low-pass filter
    (median or mean) with window ``lowPassWindowLengthBP`` then smooths the variance track.
    Finally, the track is clipped to ``[minR, maxR]`` to yield the local noise level track.

    :param values: 1D array of read-density-based values for a single sample.
    :type values: np.ndarray
    :param stepSize: Bin size (bp).
    :type stepSize: int
    :param approximationWindowLengthBP: Window (bp) for local mean and second-moment. See :class:`observationParams`.
    :type approximationWindowLengthBP: int
    :param lowPassWindowLengthBP: Window (bp) for the low-pass filter on the variance track. See :class:`observationParams`.
    :type lowPassWindowLengthBP: int
    :param minR: Lower clip for the returned noise level. See :class:`observationParams`.
    :type minR: float
    :param maxR: Upper clip for the returned noise level. See :class:`observationParams`.
    :type maxR: float
    :param lowPassFilterType: ``"median"`` (default, also used when ``None``) or ``"mean"``.
        Type of low-pass filter to use for smoothing the local variance track. See :class:`observationParams`.
    :type lowPassFilterType: Optional[str]
    :return: Local noise level per interval.
    :rtype: npt.NDArray[np.float32]
    :raises ValueError: If ``lowPassFilterType`` is neither ``None``, ``"median"``, nor ``"mean"``.

    :seealso: :class:`observationParams`
    """
    values = np.asarray(values, dtype=np.float32)
    windowLength = int(approximationWindowLengthBP / stepSize)
    # uniform_filter expects an odd window so the filter is centered
    if windowLength % 2 == 0:
        windowLength += 1
    if len(values) < 3:
        # too short for windowed moments: fall back to a constant track.
        # clip to [minR, maxR] for consistency with the long-input path
        # (previously only the lower bound was enforced here).
        constVar = float(np.clip(np.var(values), minR, maxR))
        return np.full_like(values, constVar, dtype=np.float32)

    # first get a simple moving average of the values
    localMeanTrack: npt.NDArray[np.float32] = ndimage.uniform_filter(
        values, size=windowLength, mode="nearest"
    )

    # ~ E[X_i^2] - E[X_i]^2 ~
    localVarTrack: npt.NDArray[np.float32] = (
        ndimage.uniform_filter(
            values**2, size=windowLength, mode="nearest"
        )
        - localMeanTrack**2
    )

    # safe-guard: floating-point cancellation in the difference of
    # convolutions can produce small negative values; variance is >= 0.
    localVarTrack = np.maximum(localVarTrack, 0.0)

    # low-pass filter on the local variance track: positional 'noise level' track
    lpassWindowLength = int(lowPassWindowLengthBP / stepSize)
    if lpassWindowLength % 2 == 0:
        lpassWindowLength += 1

    filterName = (
        "median"
        if lowPassFilterType is None
        else str(lowPassFilterType).lower()
    )
    if filterName == "median":
        noiseLevel = ndimage.median_filter(
            localVarTrack, size=lpassWindowLength
        )
    elif filterName == "mean":
        noiseLevel = ndimage.uniform_filter(
            localVarTrack, size=lpassWindowLength
        )
    else:
        # previously an unknown filter type silently produced an all-zero
        # track (i.e. minR everywhere after clipping); fail loudly instead.
        raise ValueError(
            f"Unrecognized lowPassFilterType: {lowPassFilterType!r}; "
            "expected 'median', 'mean', or None"
        )

    return np.clip(noiseLevel, minR, maxR).astype(np.float32)
757
+
758
+
759
def constructMatrixF(deltaF: float) -> npt.NDArray[np.float32]:
    r"""Build the state transition matrix for the process model.

    The matrix is the 2x2 identity with ``deltaF`` in the (0, 1) entry,
    i.e. a constant-velocity style transition.

    :param deltaF: See :class:`processParams`.
    :type deltaF: float
    :return: The state transition matrix :math:`\mathbf{F}`
    :rtype: npt.NDArray[np.float32]

    :seealso: :class:`processParams`
    """
    transitionMatrix = np.array(
        [[1.0, deltaF], [0.0, 1.0]], dtype=np.float32
    )
    return transitionMatrix
772
+
773
+
774
def constructMatrixQ(
    minDiagQ: float, offDiagQ: float = 0.0
) -> npt.NDArray[np.float32]:
    r"""Build the initial process noise covariance matrix :math:`\mathbf{Q}_{[1]}`.

    Constructs a symmetric 2x2 matrix with ``minDiagQ`` on the diagonal
    and ``offDiagQ`` in both off-diagonal entries.

    :param minDiagQ: See :class:`processParams`.
    :type minDiagQ: float
    :param offDiagQ: See :class:`processParams`.
    :type offDiagQ: float
    :return: The initial process noise covariance matrix :math:`\mathbf{Q}_{[1]}`.
    :rtype: npt.NDArray[np.float32]

    :seealso: :class:`processParams`
    """
    diagValue = np.float32(minDiagQ)
    offValue = np.float32(offDiagQ)
    return np.array(
        [[diagValue, offValue], [offValue, diagValue]],
        dtype=np.float32,
    )
798
+
799
+
800
def constructMatrixH(
    m: int, coefficients: Optional[np.ndarray] = None
) -> npt.NDArray[np.float32]:
    r"""Build the observation model matrix :math:`\mathbf{H}`.

    Column 0 holds per-sample weights (all ones unless ``coefficients``
    is given); column 1 is zero, so only the primary state is observed.

    :param m: Number of observations.
    :type m: int
    :param coefficients: Optional coefficients for the observation model,
        which can be used to weight the observations manually. A list or
        array of length ``m``.
    :type coefficients: Optional[np.ndarray]
    :return: The observation model matrix :math:`\mathbf{H}` of shape ``(m, 2)``.
    :rtype: npt.NDArray[np.float32]

    :seealso: :class:`observationParams`, class:`inputParams`
    """
    if coefficients is None:
        weights = np.ones(m, dtype=np.float32)
    else:
        weights = np.asarray(coefficients, dtype=np.float32)
    observationMatrix = np.zeros((m, 2), dtype=np.float32)
    observationMatrix[:, 0] = weights
    return observationMatrix
823
+
824
+
825
def runConsenrich(
    matrixData: np.ndarray,
    matrixMunc: np.ndarray,
    deltaF: float,
    minQ: float,
    maxQ: float,
    offDiagQ: float,
    dStatAlpha: float,
    dStatd: float,
    dStatPC: float,
    stateInit: float,
    stateCovarInit: float,
    boundState: bool,
    stateLowerBound: float,
    stateUpperBound: float,
    chunkSize: int,
    progressIter: int,
    coefficientsH: Optional[np.ndarray] = None,
    residualCovarInversionFunc: Optional[Callable] = None,
    adjustProcessNoiseFunc: Optional[Callable] = None,
) -> Tuple[
    npt.NDArray[np.float32],
    npt.NDArray[np.float32],
    npt.NDArray[np.float32],
]:
    r"""Run consenrich on a contiguous segment (e.g. a chromosome) of read-density-based data.
    Completes the forward and backward passes given data and approximated observation noise
    covariance matrices :math:`\mathbf{R}_{[1:n, (11:mm)]}`.

    This is the primary function implementing the core Consenrich algorithm. Users requiring specialized
    preprocessing may prefer to call this function programmatically on their own preprocessed data rather
    than using the command-line interface.


    :param matrixData: Read density data for a single chromosome or general contiguous segment,
        possibly preprocessed. Two-dimensional array of shape :math:`m \times n` where :math:`m`
        is the number of samples/tracks and :math:`n` the number of genomic intervals.
    :type matrixData: np.ndarray
    :param matrixMunc: Uncertainty estimates for the read coverage data.
        Two-dimensional array of shape :math:`m \times n` where :math:`m` is the number of samples/tracks
        and :math:`n` the number of genomic intervals. See :func:`getMuncTrack`.
    :type matrixMunc: np.ndarray
    :param deltaF: See :class:`processParams`.
    :type deltaF: float
    :param minQ: See :class:`processParams`.
    :type minQ: float
    :param maxQ: See :class:`processParams`.
    :type maxQ: float
    :param offDiagQ: See :class:`processParams`.
    :type offDiagQ: float
    :param dStatAlpha: See :class:`processParams`.
    :type dStatAlpha: float
    :param dStatd: See :class:`processParams`.
    :type dStatd: float
    :param dStatPC: See :class:`processParams`.
    :type dStatPC: float
    :param stateInit: See :class:`stateParams`.
    :type stateInit: float
    :param stateCovarInit: See :class:`stateParams`.
    :type stateCovarInit: float
    :param boundState: If True, clip the primary (first) smoothed state component to
        ``[stateLowerBound, stateUpperBound]`` after the backward pass. See :class:`stateParams`.
    :type boundState: bool
    :param stateLowerBound: Lower clip bound applied when ``boundState`` is True. See :class:`stateParams`.
    :type stateLowerBound: float
    :param stateUpperBound: Upper clip bound applied when ``boundState`` is True. See :class:`stateParams`.
    :type stateUpperBound: float
    :param chunkSize: Number of genomic intervals' data to keep in memory before flushing to disk.
    :type chunkSize: int
    :param progressIter: The number of iterations after which to log progress.
    :type progressIter: int
    :param coefficientsH: Optional coefficients for the observation model matrix :math:`\mathbf{H}`.
        If None, the coefficients are set to 1.0 for all samples.
    :type coefficientsH: Optional[np.ndarray]
    :param residualCovarInversionFunc: Callable function to invert the observation covariance matrix :math:`\mathbf{E}_{[i]}`.
        If None, defaults to :func:`cconsenrich.cinvertMatrixE`.
    :type residualCovarInversionFunc: Optional[Callable]
    :param adjustProcessNoiseFunc: Function to adjust the process noise covariance matrix :math:`\mathbf{Q}_{[i]}`.
        If None, defaults to :func:`cconsenrich.updateProcessNoiseCovariance`.
    :type adjustProcessNoiseFunc: Optional[Callable]
    :return: Tuple of three numpy arrays:
        - state estimates :math:`\widetilde{\mathbf{x}}_{[i]}` of shape :math:`n \times 2`
        - state covariance estimates :math:`\widetilde{\mathbf{P}}_{[i]}` of shape :math:`n \times 2 \times 2`
        - post-fit residuals :math:`\widetilde{\mathbf{y}}_{[i]}` of shape :math:`n \times m`
    :rtype: Tuple[np.ndarray, np.ndarray, np.ndarray]

    :raises ValueError: If the number of samples in `matrixData` is not equal to the number of samples in `matrixMunc`.
    :seealso: :class:`observationParams`, :class:`processParams`, :class:`stateParams`
    """
    matrixData = np.ascontiguousarray(matrixData, dtype=np.float32)
    matrixMunc = np.ascontiguousarray(matrixMunc, dtype=np.float32)
    # NOTE(review): for 1-D input both m and n collapse to 1 here, which
    # discards the array's length — confirm callers always pass 2-D data.
    m: int = 1 if matrixData.ndim == 1 else matrixData.shape[0]
    n: int = 1 if matrixData.ndim == 1 else matrixData.shape[1]
    inflatedQ: bool = False  # whether Q is currently inflated by the adaptive procedure
    dStat: float = np.float32(0.0)  # per-interval median of normalized squared innovations
    countAdjustments: int = 0  # number of intervals where dStat exceeded dStatAlpha

    # pre-allocated working matrices for the filter recursion
    IKH: np.ndarray = np.zeros(shape=(2, 2), dtype=np.float32)
    matrixEInverse: np.ndarray = np.zeros(
        shape=(m, m), dtype=np.float32
    )
    matrixF: np.ndarray = constructMatrixF(deltaF)
    matrixQ: np.ndarray = constructMatrixQ(minQ, offDiagQ=offDiagQ)
    matrixQCopy: np.ndarray = matrixQ.copy()  # baseline Q for the adaptive adjustment
    matrixP: np.ndarray = np.eye(2, dtype=np.float32) * np.float32(
        stateCovarInit
    )
    matrixH: np.ndarray = constructMatrixH(
        m, coefficients=coefficientsH
    )
    matrixK: np.ndarray = np.zeros((2, m), dtype=np.float32)  # Kalman gain
    vectorX: np.ndarray = np.array([stateInit, 0.0], dtype=np.float32)
    vectorY: np.ndarray = np.zeros(m, dtype=np.float32)  # innovation vector
    matrixI2: np.ndarray = np.eye(2, dtype=np.float32)

    # default to the compiled-extension implementations
    if residualCovarInversionFunc is None:
        residualCovarInversionFunc = cconsenrich.cinvertMatrixE
    if adjustProcessNoiseFunc is None:
        adjustProcessNoiseFunc = (
            cconsenrich.updateProcessNoiseCovariance
        )

    # ==========================
    # forward: 0,1,2,...,n-1
    # ==========================
    # disk-backed buffers keep the memory footprint small for long segments.
    # NOTE(review): the NamedTemporaryFile object stays open for the memmap's
    # lifetime; with delete=True the backing file is removed on close.
    stateForward = np.memmap(
        NamedTemporaryFile(delete=True),
        dtype=np.float32,
        mode="w+",
        shape=(n, 2),
    )
    stateCovarForward = np.memmap(
        NamedTemporaryFile(delete=True),
        dtype=np.float32,
        mode="w+",
        shape=(n, 2, 2),
    )
    pNoiseForward = np.memmap(
        NamedTemporaryFile(delete=True),
        dtype=np.float32,
        mode="w+",
        shape=(n, 2, 2),
    )
    progressIter = max(1, progressIter)  # guard against modulo-by-zero
    for i in range(n):
        if i % progressIter == 0:
            logger.info(f"Forward pass interval: {i + 1}/{n}")
        vectorZ = matrixData[:, i]
        # predict step: propagate state and covariance through F
        vectorX = matrixF @ vectorX
        matrixP = matrixF @ matrixP @ matrixF.T + matrixQ
        # innovation: observed minus predicted observation
        vectorY = vectorZ - (matrixH @ vectorX)
        matrixEInverse = residualCovarInversionFunc(
            matrixMunc[:, i], np.float32(matrixP[0, 0])
        )
        Einv_diag = np.diag(matrixEInverse)
        # D-statistic: median normalized squared innovation across samples
        dStat = np.median((vectorY**2) * Einv_diag)
        countAdjustments = countAdjustments + int(dStat > dStatAlpha)
        # adaptively inflate/deflate Q when innovations are inconsistent
        # with the modeled uncertainty
        matrixQ, inflatedQ = adjustProcessNoiseFunc(
            matrixQ,
            matrixQCopy,
            dStat,
            dStatAlpha,
            dStatd,
            dStatPC,
            inflatedQ,
            maxQ,
            minQ,
        )
        matrixK = (matrixP @ matrixH.T) @ matrixEInverse
        IKH = matrixI2 - (matrixK @ matrixH)

        # update step; Joseph-form covariance update using the diagonal
        # observation noise matrixMunc[:, i]
        vectorX = vectorX + (matrixK @ vectorY)
        matrixP = (IKH) @ matrixP @ (IKH).T + (
            matrixK * matrixMunc[:, i]
        ) @ matrixK.T
        stateForward[i] = vectorX.astype(np.float32)
        stateCovarForward[i] = matrixP.astype(np.float32)
        pNoiseForward[i] = matrixQ.astype(np.float32)

        # periodically flush memmap buffers to disk
        if i % chunkSize == 0 and i > 0:
            stateForward.flush()
            stateCovarForward.flush()
            pNoiseForward.flush()

    stateForward.flush()
    stateCovarForward.flush()
    pNoiseForward.flush()
    stateForwardArr = stateForward
    stateCovarForwardArr = stateCovarForward
    pNoiseForwardArr = pNoiseForward

    # log num. times process noise was adjusted
    logger.info(
        f"`Median(normedInnovations) > α_D` triggered the adaptive procedure at [{round(((1.0 * countAdjustments) / n) * 100.0, 4)}%] of intervals"
    )


    # ==========================
    # backward: n,n-1,n-2,...,0
    # ==========================
    stateSmoothed = np.memmap(
        NamedTemporaryFile(delete=True),
        dtype=np.float32,
        mode="w+",
        shape=(n, 2),
    )
    stateCovarSmoothed = np.memmap(
        NamedTemporaryFile(delete=True),
        dtype=np.float32,
        mode="w+",
        shape=(n, 2, 2),
    )
    postFitResiduals = np.memmap(
        NamedTemporaryFile(delete=True),
        dtype=np.float32,
        mode="w+",
        shape=(n, m),
    )

    # initialize the smoother at the last interval with the forward estimates
    stateSmoothed[-1] = np.float32(stateForwardArr[-1])
    stateCovarSmoothed[-1] = np.float32(stateCovarForwardArr[-1])
    postFitResiduals[-1] = np.float32(
        matrixData[:, -1] - (matrixH @ stateSmoothed[-1])
    )

    for k in range(n - 2, -1, -1):
        if k % progressIter == 0:
            logger.info(f"Backward pass interval: {k + 1}/{n}")
        forwardStatePosterior = stateForwardArr[k]
        forwardCovariancePosterior = stateCovarForwardArr[k]
        # one-step prediction from the forward posterior, using the
        # (adapted) process noise recorded at k+1
        backwardInitialState = matrixF @ forwardStatePosterior
        backwardInitialCovariance = (
            matrixF @ forwardCovariancePosterior @ matrixF.T
            + pNoiseForwardArr[k + 1]
        )

        # smoother gain via a linear solve (avoids forming an explicit inverse)
        smootherGain = np.linalg.solve(
            backwardInitialCovariance.T,
            (forwardCovariancePosterior @ matrixF.T).T,
        ).T
        stateSmoothed[k] = (
            forwardStatePosterior
            + smootherGain
            @ (stateSmoothed[k + 1] - backwardInitialState)
        ).astype(np.float32)

        stateCovarSmoothed[k] = (
            forwardCovariancePosterior
            + smootherGain
            @ (stateCovarSmoothed[k + 1] - backwardInitialCovariance)
            @ smootherGain.T
        ).astype(np.float32)
        postFitResiduals[k] = np.float32(
            matrixData[:, k] - matrixH @ stateSmoothed[k]
        )

        if k % chunkSize == 0 and k > 0:
            stateSmoothed.flush()
            stateCovarSmoothed.flush()
            postFitResiduals.flush()

    stateSmoothed.flush()
    stateCovarSmoothed.flush()
    postFitResiduals.flush()
    # optionally clip the primary state component to user-specified bounds
    if boundState:
        stateSmoothed[:, 0] = np.clip(
            stateSmoothed[:, 0], stateLowerBound, stateUpperBound
        ).astype(np.float32)

    return (
        stateSmoothed[:],
        stateCovarSmoothed[:],
        postFitResiduals[:],
    )
1092
+
1093
+
1094
def getPrimaryState(
    stateVectors: np.ndarray, roundPrecision: int = 3
) -> npt.NDArray[np.float32]:
    r"""Get the primary state estimate from each vector after running Consenrich.

    Extracts the first component of each state vector and rounds it to
    ``roundPrecision`` decimals.

    :param stateVectors: State vectors from :func:`runConsenrich`.
    :type stateVectors: npt.NDArray[np.float32]
    :param roundPrecision: Number of decimals kept in the output.
    :type roundPrecision: int
    :return: A one-dimensional numpy array of the primary state estimates.
    :rtype: npt.NDArray[np.float32]
    """
    # slicing column 0 then forcing float32 contiguity yields a copy,
    # so the input is never mutated
    primaryTrack = np.ascontiguousarray(
        stateVectors[:, 0], dtype=np.float32
    )
    return np.round(primaryTrack, decimals=roundPrecision)
1107
+
1108
+
1109
def getStateCovarTrace(
    stateCovarMatrices: np.ndarray, roundPrecision: int = 3
) -> npt.NDArray[np.float32]:
    r"""Get a one-dimensional array of state covariance traces after running Consenrich

    Delegates the trace computation to the compiled helper
    :func:`cconsenrich.cgetStateCovarTrace` and rounds the result in place.

    :param stateCovarMatrices: Estimated state covariance matrices :math:`\widetilde{\mathbf{P}}_{[i]}`
    :type stateCovarMatrices: np.ndarray
    :param roundPrecision: Number of decimals kept in the output.
    :type roundPrecision: int
    :return: A one-dimensional numpy array of the traces of the state covariance matrices.
    :rtype: npt.NDArray[np.float32]
    """
    # the C helper requires a C-contiguous float32 buffer
    covarBuffer = np.ascontiguousarray(
        stateCovarMatrices, dtype=np.float32
    )
    traceTrack = cconsenrich.cgetStateCovarTrace(covarBuffer)
    np.round(traceTrack, decimals=roundPrecision, out=traceTrack)
    return traceTrack
1125
+
1126
+
1127
def getPrecisionWeightedResidual(
    postFitResiduals: np.ndarray,
    matrixMunc: np.ndarray,
    roundPrecision: int = 3,
    stateCovarSmoothed: Optional[np.ndarray] = None,
) -> npt.NDArray[np.float32]:
    r"""Get a one-dimensional precision-weighted array residuals after running Consenrich.

    Applies an inverse-variance weighting of the post-fit residuals :math:`\widetilde{\mathbf{y}}_{[i]}` and
    returns a one-dimensional array of "precision-weighted residuals". The state-level uncertainty can also be
    incorporated given `stateCovarSmoothed`.

    :param postFitResiduals: Post-fit residuals :math:`\widetilde{\mathbf{y}}_{[i]}` from :func:`runConsenrich`.
    :type postFitResiduals: np.ndarray
    :param matrixMunc: An :math:`m \times n` sample-by-interval matrix -- At genomic intervals :math:`i = 1,2,\ldots,n`, the respective length-:math:`m` column is :math:`\mathbf{R}_{[i,11:mm]}`.
      That is, the observation noise levels for each sample :math:`j=1,2,\ldots,m` at interval :math:`i`. To keep memory usage minimal `matrixMunc` is not returned in full or computed in
      in :func:`runConsenrich`. If using Consenrich programmatically, run :func:`consenrich.core.getMuncTrack` for each sample's count data (rows in the matrix output of :func:`readBamSegments`).
    :type matrixMunc: np.ndarray
    :param stateCovarSmoothed: Smoothed state covariance matrices :math:`\widetilde{\mathbf{P}}_{[i]}` from :func:`runConsenrich`.
    :type stateCovarSmoothed: Optional[np.ndarray]
    :return: A one-dimensional array of "precision-weighted residuals"
    :rtype: npt.NDArray[np.float32]
    :raises ValueError: If `matrixMunc` is not shaped ``(m, n)`` or `stateCovarSmoothed`
        (when provided) is not ``n`` stacked 2x2 matrices.
    """

    n, m = postFitResiduals.shape
    if matrixMunc.shape != (m, n):
        raise ValueError(
            f"matrixMunc should be (m,n)=({m}, {n}): observed {matrixMunc.shape}"
        )
    if stateCovarSmoothed is not None and (
        stateCovarSmoothed.ndim < 3 or len(stateCovarSmoothed) != n
    ):
        raise ValueError(
            "stateCovarSmoothed must be shape (n) x (2,2) (if provided)"
        )

    postFitResiduals_CContig = np.ascontiguousarray(
        postFitResiduals, dtype=np.float32
    )

    # BUGFIX: previously the `stateCovarSmoothed` branch was guarded by
    # `needsCopy`, which is also True whenever `matrixMunc` is read-only --
    # dereferencing `stateCovarSmoothed[:, 0, 0]` then crashed on None.
    addStateUncertainty = stateCovarSmoothed is not None
    # copy whenever we will mutate the buffer in place or cannot write to it
    needsCopy = addStateUncertainty or (not matrixMunc.flags.writeable)

    matrixMunc_CContig = np.array(
        matrixMunc, dtype=np.float32, order="C", copy=needsCopy
    )

    if addStateUncertainty:
        # adds the 'primary' state uncertainty to observation noise covariance :math:`\mathbf{R}_{[i,:]}`
        # primary state uncertainty (0,0) :math:`\mathbf{P}_{[i]} \in \mathbb{R}^{2 \times 2}`
        stateCovarArr00 = np.asarray(
            stateCovarSmoothed[:, 0, 0], dtype=np.float32
        )
        # broadcasts the length-n trace over all m rows
        matrixMunc_CContig += stateCovarArr00

    # floor the variances to keep the inverse weights finite
    np.maximum(
        matrixMunc_CContig, np.float32(1e-8), out=matrixMunc_CContig
    )
    out = cconsenrich.cgetPrecisionWeightedResidual(
        postFitResiduals_CContig, matrixMunc_CContig
    )
    np.round(out, decimals=roundPrecision, out=out)
    return out
1192
+
1193
+
1194
def getMuncTrack(
    chromosome: str,
    intervals: np.ndarray,
    stepSize: int,
    rowValues: np.ndarray,
    minR: float,
    maxR: float,
    useALV: bool,
    useConstantNoiseLevel: bool,
    noGlobal: bool,
    localWeight: float,
    globalWeight: float,
    approximationWindowLengthBP: int,
    lowPassWindowLengthBP: int,
    returnCenter: bool,
    sparseMap: Optional[dict[int, int]] = None,
    lowPassFilterType: Optional[str] = "median",
) -> npt.NDArray[np.float32]:
    r"""Get observation noise variance :math:`R_{[:,jj]}` for the sample :math:`j`.

    Combines a local ALV estimate (see :func:`getAverageLocalVarianceTrack`) with an
    optional global component. If ``useALV`` is True, *only* the ALV is used. If
    ``useConstantNoiseLevel`` is True, a constant track set to the global mean is used.
    When a ``sparseMap`` is provided, local values are aggregated over nearby 'sparse'
    regions before mixing with the global component.

    For heterochromatic or repressive marks (H3K9me3, H3K27me3, MNase-seq, etc.), consider setting
    `useALV=True` to prevent inflated sample-level noise estimates.

    :param chromosome: Tracks are approximated for this chromosome (not referenced in this implementation).
    :type chromosome: str
    :param intervals: Genomic intervals for which to compute the noise track (not referenced in this implementation).
    :param stepSize: See :class:`countingParams`.
    :type stepSize: int
    :param rowValues: Read-density-based values for the sample :math:`j` at the genomic intervals :math:`i=1,2,\ldots,n`.
    :type rowValues: np.ndarray
    :param minR: See :class:`observationParams`.
    :type minR: float
    :param maxR: See :class:`observationParams`.
    :type maxR: float
    :param useALV: See :class:`observationParams`.
    :type useALV: bool
    :param useConstantNoiseLevel: See :class:`observationParams`.
    :type useConstantNoiseLevel: bool
    :param noGlobal: See :class:`observationParams`.
    :type noGlobal: bool
    :param localWeight: See :class:`observationParams`.
    :type localWeight: float
    :param globalWeight: See :class:`observationParams`.
    :type globalWeight: float
    :param approximationWindowLengthBP: See :class:`observationParams` and/or :func:`getAverageLocalVarianceTrack`.
    :type approximationWindowLengthBP: int
    :param lowPassWindowLengthBP: See :class:`observationParams` and/or :func:`getAverageLocalVarianceTrack`.
    :type lowPassWindowLengthBP: int
    :param sparseMap: Optional mapping (dictionary) of interval indices to the nearest sparse regions. See :func:`getSparseMap`.
    :type sparseMap: Optional[dict[int, int]]
    :param lowPassFilterType: The type of low-pass filter to use in average local variance track (e.g., 'median', 'mean').
    :type lowPassFilterType: Optional[str]
    :return: A one-dimensional numpy array of the observation noise track for the sample :math:`j`.
    :rtype: npt.NDArray[np.float32]

    """

    def _bounded(track) -> npt.NDArray[np.float32]:
        # clamp to [minR, maxR] and standardize the dtype
        return np.clip(track, minR, maxR).astype(np.float32)

    localVarTrack = getAverageLocalVarianceTrack(
        rowValues,
        stepSize,
        approximationWindowLengthBP,
        lowPassWindowLengthBP,
        minR,
        maxR,
        lowPassFilterType,
    ).astype(np.float32)

    globalLevel: float = np.float32(np.mean(localVarTrack))

    # ALV-only modes: return the local track directly
    if noGlobal or globalWeight == 0 or useALV:
        return _bounded(localVarTrack)

    # constant-noise mode (parentheses reflect Python's and/or precedence
    # as in the original expression)
    if useConstantNoiseLevel or (
        localWeight == 0 and sparseMap is None
    ):
        return _bounded(globalLevel * np.ones_like(rowValues))

    # aggregate local values over nearby sparse regions when mapped
    if sparseMap is not None:
        localVarTrack = cconsenrich.cSparseAvg(
            localVarTrack, sparseMap
        )

    # weighted mix of the (possibly sparse-averaged) local track and
    # its global mean
    return _bounded(
        localVarTrack * localWeight
        + np.mean(localVarTrack) * globalWeight
    )
1287
+
1288
+
1289
def sparseIntersection(
    chromosome: str, intervals: np.ndarray, sparseBedFile: str
) -> npt.NDArray[np.int64]:
    r"""Returns intervals in the chromosome that overlap with the sparse features.

    Not relevant if `observationParams.useALV` is True.

    Features are sorted, merged, restricted to ``chromosome`` within the span of
    ``intervals``, centered on the step grid via :func:`adjustFeatureBounds`, and
    the grid-aligned start positions are returned.

    :param chromosome: The chromosome name.
    :type chromosome: str
    :param intervals: The genomic intervals to consider.
    :type intervals: np.ndarray
    :param sparseBedFile: Path to the sparse BED file.
    :type sparseBedFile: str
    :return: A numpy array of start positions of the sparse features that overlap with the intervals
    :rtype: np.ndarray[Tuple[Any], np.dtype[Any]]
    """

    # intervals are assumed evenly spaced; infer the bin size
    stepSize: int = intervals[1] - intervals[0]
    start0: int = int(intervals[0])
    last: int = int(intervals[-1])

    # NOTE: the previous implementation built this exact pipeline twice
    # (once with numpy-scalar bounds, once with ints) and discarded the
    # first result; a single pass suffices.
    chromFeatures: bed.BedTool = (
        bed.BedTool(sparseBedFile)
        .sort()
        .merge()
        .filter(
            lambda b: (
                b.chrom == chromosome
                and b.start > start0
                and b.end < last
                and (b.end - b.start) >= stepSize
            )
        )
    )
    # snap each merged feature to a single step-sized bin at its midpoint
    centeredFeatures: bed.BedTool = chromFeatures.each(
        adjustFeatureBounds, stepSize=stepSize
    )

    # keep only starts that land exactly on the interval grid
    centeredStarts = []
    for f in centeredFeatures:
        s = int(f.start)
        if start0 <= s <= last and (s - start0) % stepSize == 0:
            centeredStarts.append(s)
    return np.asarray(centeredStarts, dtype=np.int64)
1348
+
1349
+
1350
def adjustFeatureBounds(
    feature: bed.Interval, stepSize: int
) -> bed.Interval:
    r"""Adjust the start and end positions of a BED feature to be centered around a step.

    Snaps the feature's midpoint onto the step grid (via
    :func:`cconsenrich.stepAdjustment`) and resizes it to one step.
    """
    midpoint = (feature.start + feature.end) // 2
    feature.start = cconsenrich.stepAdjustment(midpoint, stepSize)
    feature.end = feature.start + stepSize
    return feature
1359
+
1360
+
1361
def getSparseMap(
    chromosome: str,
    intervals: np.ndarray,
    numNearest: int,
    sparseBedFile: str,
) -> dict:
    r"""Build a map between each genomic interval and numNearest sparse features

    :param chromosome: The chromosome name. Note, this function only needs to be run once per chromosome.
    :type chromosome: str
    :param intervals: The genomic intervals to map.
    :type intervals: np.ndarray
    :param numNearest: The number of nearest sparse features to consider
    :type numNearest: int
    :param sparseBedFile: path to the sparse BED file.
    :type sparseBedFile: str
    :return: A dictionary mapping each interval index to the indices of the nearest sparse regions.
    :rtype: dict[int, np.ndarray]

    """
    # (removed a no-op `numNearest = numNearest` self-assignment)
    sparseStarts = sparseIntersection(
        chromosome, intervals, sparseBedFile
    )
    # indices of each sparse feature's position within `intervals`
    idxSparseInIntervals = np.searchsorted(
        intervals, sparseStarts, side="left"
    )
    # for each interval, the insertion point into the sparse-start array
    centers = np.searchsorted(sparseStarts, intervals, side="left")
    sparseMap: dict = {}
    for i, (interval, center) in enumerate(zip(intervals, centers)):
        # examine up to numNearest candidates on each side of the
        # insertion point, then keep the numNearest closest by distance
        left = max(0, center - numNearest)
        right = min(len(sparseStarts), center + numNearest)
        candidates = np.arange(left, right)
        dists = np.abs(sparseStarts[candidates] - interval)
        take = np.argsort(dists)[:numNearest]
        sparseMap[i] = idxSparseInIntervals[candidates[take]]
    return sparseMap
1398
+
1399
+
1400
def getBedMask(
    chromosome: str,
    bedFile: str,
    intervals: np.ndarray,
) -> np.ndarray:
    r"""Return a 1/0 mask for intervals overlapping a sorted and merged BED file.

    This function is a wrapper for :func:`cconsenrich.cbedMask`.

    :param chromosome: The chromosome name.
    :type chromosome: str
    :param intervals: chromosome-specific, sorted, non-overlapping start positions of genomic intervals.
        Each interval is assumed `stepSize`.
    :type intervals: np.ndarray
    :param bedFile: Path to a sorted and merged BED file
    :type bedFile: str
    :return: An `intervals`-length mask s.t. True indicates the interval overlaps a feature in the BED file.
    :rtype: np.ndarray
    :raises ValueError: If the BED file is missing, fewer than two intervals are given,
        or the first and last interval spacings disagree.
    """
    if not os.path.exists(bedFile):
        raise ValueError(f"Could not find {bedFile}")
    if len(intervals) < 2:
        raise ValueError(
            "intervals must contain at least two positions"
        )
    bedPath = str(bedFile)

    # the C helper wants uint32 starts; also do a cheap sanity check that
    # the first and last spacings agree (full uniformity is assumed)
    startsU32 = np.asarray(intervals, dtype=np.uint32)
    firstStep = startsU32[1] - startsU32[0]
    lastStep = startsU32[-1] - startsU32[-2]
    if firstStep != lastStep:
        raise ValueError("Intervals are not fixed in size")

    binSize: int = intervals[1] - intervals[0]
    return cconsenrich.cbedMask(
        chromosome,
        bedPath,
        startsU32,
        binSize,
    ).astype(np.bool_)