consenrich 0.6.3b1__cp314-cp314-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of consenrich might be problematic. Click here for more details.

consenrich/core.py ADDED
@@ -0,0 +1,1320 @@
1
+ # -*- coding: utf-8 -*-
2
+ r"""
3
+ Consenrich core functions and classes.
4
+
5
+ """
6
+
7
+ import logging
8
+ import os
9
+ from tempfile import NamedTemporaryFile
10
+ from typing import Callable, List, Optional, Tuple, DefaultDict, Any, NamedTuple
11
+
12
+ import numpy as np
13
+ import numpy.typing as npt
14
+ import pybedtools as bed
15
+ from scipy import signal, ndimage
16
+
17
+ from . import cconsenrich
18
+
19
+ logging.basicConfig(
20
+ level=logging.INFO,
21
+ format="%(asctime)s - %(module)s.%(funcName)s - %(levelname)s - %(message)s",
22
+ )
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
def resolveExtendBP(extendBP, bamFiles: List[str]) -> List[int]:
    r"""Normalize `extendBP` into a per-BAM-file list of integer extension lengths.

    Accepts ``None`` (no extension), a single number (broadcast to all files),
    a comma-separated string, or a list whose length is 0, 1, or ``len(bamFiles)``.

    :param extendBP: The raw user-supplied value. See :class:`samParams`.
    :param bamFiles: See :class:`inputParams`.
    :type bamFiles: List[str]
    :return: One extension length (bp) per BAM file.
    :rtype: List[int]
    :raises ValueError: If a string cannot be parsed or a list length mismatches.
    :raises TypeError: If `extendBP` has an unsupported type.
    """
    numFiles = len(bamFiles)

    # A string is first parsed into a list of ints, then handled below.
    if isinstance(extendBP, str):
        compact = extendBP.replace(" ", "")
        try:
            extendBP = [int(token) for token in compact.split(",")] if compact else []
        except ValueError:
            raise ValueError(
                "`extendBP` string must be comma-separated values (castable to integers)"
            )

    if extendBP is None:
        return [0] * numFiles

    if isinstance(extendBP, list):
        parsed = [int(entry) for entry in extendBP]
        if not parsed:
            return [0] * numFiles
        if len(parsed) == 1:
            # single value: broadcast across all BAM files
            return parsed * numFiles
        if len(parsed) == numFiles:
            return parsed
        raise ValueError(
            f"extendBP length {len(parsed)} does not match number of bamFiles {numFiles}; "
            f"provide 0, 1, or {numFiles} values."
        )

    if isinstance(extendBP, (int, float)):
        return [int(extendBP)] * numFiles

    raise TypeError(
        f"Invalid extendBP type: {type(extendBP).__name__}. "
        "Expecting a single number (broadcast), a list of numbers matching `bamFiles`."
    )
61
+
62
+
63
class processParams(NamedTuple):
    r"""Parameters related to the process model of Consenrich.

    The process model governs the signal and variance propagation
    through the state transition :math:`\mathbf{F} \in \mathbb{R}^{2 \times 2}`
    and process noise covariance :math:`\mathbf{Q}_{[i]} \in \mathbb{R}^{2 \times 2}`
    matrices.

    :param deltaF: Scales the signal and variance propagation between adjacent genomic intervals.
    :type deltaF: float
    :param minQ: Minimum process noise level (diagonal in :math:`\mathbf{Q}_{[i]}`)
        for each state variable. Adjust relative to data quality (more reliable data --> lower minQ).
    :type minQ: float
    :param maxQ: Maximum process noise level (diagonal in :math:`\mathbf{Q}_{[i]}`) --
        presumably the upper clip applied when the process noise is scaled up on
        model mismatch; confirm against :func:`runConsenrich`.
    :type maxQ: float
    :param offDiagQ: Value placed at the off-diagonal entries
        :math:`\mathbf{Q}_{[1], (12)}` and :math:`\mathbf{Q}_{[1], (21)}`
        (see :func:`constructMatrixQ`).
    :type offDiagQ: float
    :param dStatAlpha: Threshold on the deviation between the data and estimated signal -- used to determine whether the process noise is scaled up.
    :type dStatAlpha: float
    :param dStatd: Constant :math:`d` in the scaling expression :math:`\sqrt{d|D_{[i]} - \alpha_D| + c}`
        that is used to up/down-scale the process noise covariance in the event of a model mismatch.
    :type dStatd: float
    :param dStatPC: Constant :math:`c` in the scaling expression :math:`\sqrt{d|D_{[i]} - \alpha_D| + c}`
        that is used to up/down-scale the process noise covariance in the event of a model mismatch.
    :type dStatPC: float
    :param scaleResidualsByP11: If `True`, the primary state variances :math:`\widetilde{P}_{[i], (11)}, i=1\ldots n` are included in the inverse-variance (precision) weighting of residuals :math:`\widetilde{\mathbf{y}}_{[i]}, i=1\ldots n`.
        If `False`, only the per-sample observation noise levels are used to reduce computational overhead.
    :type scaleResidualsByP11: Optional[bool]

    """

    deltaF: float
    minQ: float
    maxQ: float
    offDiagQ: float
    dStatAlpha: float
    dStatd: float
    dStatPC: float
    scaleResidualsByP11: Optional[bool] = False
97
+
98
+
99
class observationParams(NamedTuple):
    r"""Parameters related to the observation model of Consenrich.

    The observation model is used to integrate sequence alignment count
    data from the multiple input samples and account for region-and-sample-specific
    noise processes corrupting data. The observation model matrix
    :math:`\mathbf{H} \in \mathbb{R}^{m \times 2}` maps from the state dimension (2)
    to the dimension of measurements/data (:math:`m`).

    :param minR: The minimum observation noise level for each sample
        :math:`j=1\ldots m` in the observation noise covariance
        matrix :math:`\mathbf{R}_{[i, (11:mm)]}`.
    :type minR: float
    :param maxR: The maximum observation noise level per sample -- upper clip on
        the noise level track (see :func:`getAverageLocalVarianceTrack`).
    :type maxR: float
    :param numNearest: The number of nearest nearby sparse features to use for local
        variance calculation. Ignored if `useALV` is True. The sparse features are read
        from `genomeParams.sparseBedFile` (not a field of this class).
    :type numNearest: int
    :param localWeight: The coefficient for the local noise level (based on the local surrounding window / `numNearest` features) used in the weighted sum measuring sample-specific noise level at the current interval.
    :type localWeight: float
    :param globalWeight: The coefficient for the global noise level (based on all genomic intervals :math:`i=1\ldots n`) used in the weighted sum measuring sample-specific noise level at the current interval.
    :type globalWeight: float
    :param approximationWindowLengthBP: The length of the local approximation window in base pairs (BP)
        for the local variance calculation.
    :type approximationWindowLengthBP: int
    :param lowPassWindowLengthBP: Window length (bp) of the low-pass filter applied to
        the local variance track (see :func:`getAverageLocalVarianceTrack`).
    :type lowPassWindowLengthBP: int
    :param noGlobal: If True, only the 'local' variances are used to approximate observation noise
        covariance :math:`\mathbf{R}_{[:, (11:mm)]}`.
    :type noGlobal: bool
    :param useALV: Whether to use average local variance (ALV) to approximate observation noise
        covariances per-sample, per-interval. Recommended for estimating signals associated with
        repressive/heterochromatic elements.
    :type useALV: bool
    :param useConstantNoiseLevel: Whether to use a constant noise level in the observation model.
    :type useConstantNoiseLevel: bool
    :param lowPassFilterType: The type of low-pass filter to use (e.g., 'median', 'mean').
    :type lowPassFilterType: Optional[str]
    :param returnCenter: Purpose not evident from this module -- TODO confirm against callers.
    :type returnCenter: bool
    """

    minR: float
    maxR: float
    useALV: bool
    useConstantNoiseLevel: bool
    noGlobal: bool
    numNearest: int
    localWeight: float
    globalWeight: float
    approximationWindowLengthBP: int
    lowPassWindowLengthBP: int
    lowPassFilterType: Optional[str]
    returnCenter: bool
148
+
149
+
150
class stateParams(NamedTuple):
    r"""Parameters related to state and uncertainty bounds and initialization.

    :param stateInit: Initial value of the 'primary' state/signal at the first genomic interval: :math:`x_{[1]}`
    :type stateInit: float
    :param stateCovarInit: Initial state covariance scale. Note, the *initial* state uncertainty :math:`\mathbf{P}_{[1]}` is a multiple of the identity matrix :math:`\mathbf{I}`
    :type stateCovarInit: float
    :param boundState: If True, the primary state estimate for :math:`x_{[i]}` is constrained within `stateLowerBound` and `stateUpperBound`.
    :type boundState: bool
    :param stateLowerBound: Lower bound for the state estimate.
    :type stateLowerBound: float
    :param stateUpperBound: Upper bound for the state estimate.
    :type stateUpperBound: float
    """

    stateInit: float
    stateCovarInit: float
    boundState: bool
    stateLowerBound: float
    stateUpperBound: float
170
+
171
+
172
class samParams(NamedTuple):
    r"""Parameters related to reading BAM files

    :param samThreads: The number of threads to use for reading BAM files.
    :type samThreads: int
    :param samFlagExclude: The SAM flag to exclude certain reads.
    :type samFlagExclude: int
    :param oneReadPerBin: If 1, only the interval with the greatest read overlap is incremented.
    :type oneReadPerBin: int
    :param chunkSize: maximum number of intervals' data to hold in memory before flushing to disk.
    :type chunkSize: int
    :param offsetStr: A string of two comma-separated integers -- first for the 5' shift on forward strand, second for the 5' shift on reverse strand.
    :type offsetStr: str
    :param extendBP: A list of integers specifying the number of base pairs to extend reads for each BAM file after shifting per `offsetStr`.
        If all BAM files share the same expected frag. length, can supply a single numeric value to be broadcasted. Ignored for PE reads.
    :type extendBP: List[int]
    :param maxInsertSize: Maximum frag length/insert for paired-end reads.
    :type maxInsertSize: int
    :param pairedEndMode: If > 0, only proper pairs are counted subject to `maxInsertSize`.
    :type pairedEndMode: int
    :param inferFragmentLength: Intended for single-end data: if > 0, the maximum correlation lag
        (avg.) between *strand-specific* read tracks is taken as the fragment length estimate and used to
        extend reads from 5'. Ignored if `pairedEndMode > 0` or `extendBP` set. This parameter is particularly
        important when targeting broader marks (e.g., ChIP-seq H3K27me3).
    :type inferFragmentLength: int

    .. tip::

        For an overview of SAM flags, see https://broadinstitute.github.io/picard/explain-flags.html

    """

    samThreads: int
    samFlagExclude: int
    oneReadPerBin: int
    chunkSize: int
    offsetStr: Optional[str] = "0,0"
    # NOTE(review): this default is a single list object shared by every
    # instance (NamedTuple defaults are evaluated once); safe only as long as
    # no caller mutates it in place.
    extendBP: Optional[List[int]] = []
    maxInsertSize: Optional[int] = 1000
    pairedEndMode: Optional[int] = 0
    inferFragmentLength: Optional[int] = 0
213
+
214
+
215
class detrendParams(NamedTuple):
    r"""Parameters related to detrending and background-removal

    :param useOrderStatFilter: Whether to use a local/moving order statistic (percentile filter) to model and remove trends in the read density data.
    :type useOrderStatFilter: bool
    :param usePolyFilter: Whether to use a low-degree polynomial fit to model and remove trends in the read density data.
    :type usePolyFilter: bool
    :param detrendTrackPercentile: The percentile to use for the local/moving order-statistic filter.
        Decrease for broad marks + sparse data if `useOrderStatFilter` is True.
    :type detrendTrackPercentile: float
    :param detrendSavitzkyGolayDegree: The polynomial degree of the Savitzky-Golay filter to use for detrending
    :type detrendSavitzkyGolayDegree: int
    :param detrendWindowLengthBP: The length of the window in base pairs for detrending.
        Increase for broader marks + sparse data.
    :type detrendWindowLengthBP: int
    """

    useOrderStatFilter: bool
    usePolyFilter: bool
    detrendTrackPercentile: float
    detrendSavitzkyGolayDegree: int
    detrendWindowLengthBP: int
237
+
238
+
239
class inputParams(NamedTuple):
    r"""Parameters related to the input data for Consenrich.

    :param bamFiles: A list of paths to distinct coordinate-sorted and indexed BAM files.
    :type bamFiles: List[str]
    :param bamFilesControl: A list of paths to distinct coordinate-sorted and
        indexed control BAM files. e.g., IgG control inputs for ChIP-seq.
    :type bamFilesControl: List[str], optional
    """

    bamFiles: List[str]
    bamFilesControl: Optional[List[str]]
254
+
255
+
256
class genomeParams(NamedTuple):
    r"""Specify assembly-specific resources, parameters.

    :param genomeName: If supplied, default resources for the assembly (sizes file, blacklist, and 'sparse' regions) in `src/consenrich/data` are used.
        ``ce10, ce11, dm6, hg19, hg38, mm10, mm39`` have default resources available.
    :type genomeName: str
    :param chromSizesFile: A two-column TSV-like file with chromosome names and sizes (in base pairs).
    :type chromSizesFile: str
    :param blacklistFile: A BED file with regions to exclude.
    :type blacklistFile: str, optional
    :param sparseBedFile: A BED file with sparse regions used to estimate noise levels -- ignored if `observationParams.useALV` is True.
    :type sparseBedFile: str, optional
    :param chromosomes: A list of chromosome names to analyze. If None, all chromosomes in `chromSizesFile` are used.
    :type chromosomes: List[str]
    :param excludeChroms: Chromosome names to exclude from analysis -- inferred
        from the name; confirm against callers.
    :type excludeChroms: List[str]
    :param excludeForNorm: Chromosome names excluded when computing
        normalization/scale factors -- inferred from the name; confirm against callers.
    :type excludeForNorm: List[str]
    """

    genomeName: str
    chromSizesFile: str
    blacklistFile: Optional[str]
    sparseBedFile: Optional[str]
    chromosomes: List[str]
    excludeChroms: List[str]
    excludeForNorm: List[str]
279
+
280
+
281
class countingParams(NamedTuple):
    r"""Parameters related to counting reads in genomic intervals.

    :param stepSize: Step size (bp) for the genomic intervals (AKA bin size, interval length, width, etc.)
    :type stepSize: int
    :param scaleDown: If using paired treatment and control BAM files, whether to
        scale down the larger of the two before computing the difference/ratio
    :type scaleDown: bool, optional
    :param scaleFactors: Scale factors for the read counts.
    :type scaleFactors: List[float], optional
    :param scaleFactorsControl: Scale factors for the control read counts.
    :type scaleFactorsControl: List[float], optional
    :param numReads: Number of reads to sample.
    :type numReads: int
    :param applyAsinh: If true, :math:`\textsf{arsinh}(x)` applied to counts :math:`x` (log-like for large values and linear near the origin)
    :type applyAsinh: bool, optional
    :param applyLog: If true, :math:`\textsf{log}(x + 1)` applied to counts :math:`x`
    :type applyLog: bool, optional
    :param rescaleToTreatmentCoverage: Presumably rescales control-adjusted values
        back to the treatment coverage scale -- not used in this module; TODO confirm.
    :type rescaleToTreatmentCoverage: bool, optional
    """

    stepSize: int
    scaleDown: Optional[bool]
    scaleFactors: Optional[List[float]]
    scaleFactorsControl: Optional[List[float]]
    numReads: int
    applyAsinh: Optional[bool]
    applyLog: Optional[bool]
    rescaleToTreatmentCoverage: Optional[bool] = False
309
+
310
+
311
class matchingParams(NamedTuple):
    r"""Parameters related to the matching algorithm packaged with this software.

    See :ref:`matching` for details.

    :param templateNames: A list of str values -- wavelet bases used for matching, e.g., `[haar, db2, sym4]`
    :type templateNames: List[str]
    :param cascadeLevels: A list of int values -- the number of cascade iterations used for approximating
        the scaling/wavelet functions.
    :type cascadeLevels: List[int]
    :param iters: Number of random blocks to sample in the response sequence while building
        an empirical null to test significance. See :func:`cconsenrich.csampleBlockStats`.
    :type iters: int
    :param alpha: Primary significance threshold on detected matches. Specifically, the
        :math:`1 - \alpha` quantile of an empirical null distribution. The empirical null
        distribution is built from cross-correlation values over randomly sampled blocks.
    :type alpha: float
    :param minMatchLengthBP: Within a window of `minMatchLengthBP` length (bp), relative maxima in
        the signal-template convolution must be greater in value than others to qualify as matches.
        *Set to a negative value to disable this filter*.
    :type minMatchLengthBP: int
    :param maxNumMatches: Maximum number of matches to report -- presumably a cap
        applied after ranking; TODO confirm against the matching routine.
    :type maxNumMatches: Optional[int]
    :param minSignalAtMaxima: Secondary significance threshold coupled with `alpha`. Require the *signal value*
        at relative maxima in the response sequence to be greater than this threshold. Comparisons are made in log-scale.
        If a `float` value is provided, the minimum signal value must be greater than this (absolute) value. *Set to a
        negative value to disable the threshold*.
        If a `str` value is provided, looks for 'q:quantileValue', e.g., 'q:0.75'. The
        threshold is then set to the corresponding quantile of the non-zero signal estimates.
        Defaults to str value 'q:0.75' --- the 75th percentile of signal values.
    :type minSignalAtMaxima: Optional[str | float]
    :param merge: If True, merge nearby matched regions -- presumably within a
        `mergeGapBP` tolerance; verify against the matching routine.
    :type merge: bool
    :param mergeGapBP: Maximum gap (bp) between matches merged when `merge` is
        True -- TODO confirm.
    :type mergeGapBP: int
    :param useScalingFunction: If True, use (only) the scaling function to build the matching template.
        If False, use (only) the wavelet function.
    :type useScalingFunction: bool
    :param excludeRegionsBedFile: A BED file with regions to exclude from matching
    :type excludeRegionsBedFile: Optional[str]

    :seealso: :class:`consenrich.core.matchingParams`, :func:`cconsenrich.csampleBlockStats`, :ref:`matching`
    """

    templateNames: List[str]
    cascadeLevels: List[int]
    iters: int
    alpha: float
    minMatchLengthBP: Optional[int]
    maxNumMatches: Optional[int]
    minSignalAtMaxima: Optional[str | float] = "q:0.75"
    merge: bool = False
    mergeGapBP: int = 25
    useScalingFunction: bool = True
    excludeRegionsBedFile: Optional[str] = None
360
+
361
+
362
def _numIntervals(start: int, end: int, step: int) -> int:
    # Fencepost count of step-spaced positions covering [start, end]:
    # floor(length / step) + 1 when step divides length; an empty or inverted
    # range still yields 1.
    # NOTE(review): readBamSegments sizes its count arrays with the ceiling
    # formula ((end - start) + stepSize - 1) // stepSize, which gives one
    # FEWER interval whenever step divides (end - start) exactly -- confirm
    # which convention callers of this helper expect before unifying.
    # helper for consistency
    length = max(0, end - start)
    return (length + step) // step
366
+
367
+
368
def getChromRanges(
    bamFile: str,
    chromosome: str,
    chromLength: int,
    samThreads: int,
    samFlagExclude: int,
) -> Tuple[int, int]:
    r"""Get the start and end positions of reads in a chromosome from a BAM file.

    Delegates to the Cython helpers for the first and last mapped reads.

    :param bamFile: See :class:`inputParams`.
    :type bamFile: str
    :param chromosome: the chromosome to read in `bamFile`.
    :type chromosome: str
    :param chromLength: Base pair length of the chromosome.
    :type chromLength: int
    :param samThreads: See :class:`samParams`.
    :type samThreads: int
    :param samFlagExclude: See :class:`samParams`.
    :type samFlagExclude: int
    :return: Tuple of start and end positions (nucleotide coordinates) in the chromosome.
    :rtype: Tuple[int, int]

    :seealso: :func:`getChromRangesJoint`, :func:`cconsenrich.cgetFirstChromRead`, :func:`cconsenrich.cgetLastChromRead`
    """
    # Both helpers take the identical argument tuple.
    queryArgs = (bamFile, chromosome, chromLength, samThreads, samFlagExclude)
    return (
        cconsenrich.cgetFirstChromRead(*queryArgs),
        cconsenrich.cgetLastChromRead(*queryArgs),
    )
399
+
400
+
401
def getChromRangesJoint(
    bamFiles: List[str],
    chromosome: str,
    chromSize: int,
    samThreads: int,
    samFlagExclude: int,
) -> Tuple[int, int]:
    r"""For multiple BAM files, reconcile a single start and end position over which to count reads,
    where the start and end positions are defined by the first and last reads across all BAM files.

    :param bamFiles: List of BAM files to read.
    :type bamFiles: List[str]
    :param chromosome: Chromosome to read.
    :type chromosome: str
    :param chromSize: Size of the chromosome.
    :type chromSize: int
    :param samThreads: Number of threads to use for reading the BAM files.
    :type samThreads: int
    :param samFlagExclude: SAM flag to exclude certain reads.
    :type samFlagExclude: int
    :return: Tuple of start and end positions.
    :rtype: Tuple[int, int]

    :seealso: :func:`getChromRanges`, :func:`cconsenrich.cgetFirstChromRead`, :func:`cconsenrich.cgetLastChromRead`
    """
    # Collect (start, end) per file, then take the widest span.
    bounds = [
        getChromRanges(
            bamFile,
            chromosome,
            chromLength=chromSize,
            samThreads=samThreads,
            samFlagExclude=samFlagExclude,
        )
        for bamFile in bamFiles
    ]
    starts, ends = zip(*bounds)
    return min(starts), max(ends)
439
+
440
+
441
def getReadLength(
    bamFile: str,
    numReads: int,
    maxIterations: int,
    samThreads: int,
    samFlagExclude: int,
) -> int:
    r"""Infer read length from mapped reads in a BAM file.

    Samples at least `numReads` reads passing criteria given by `samFlagExclude`
    and returns the median read length.

    :param bamFile: See :class:`inputParams`.
    :type bamFile: str
    :param numReads: Number of reads to sample.
    :type numReads: int
    :param maxIterations: Maximum number of iterations to perform.
    :type maxIterations: int
    :param samThreads: See :class:`samParams`.
    :type samThreads: int
    :param samFlagExclude: See :class:`samParams`.
    :type samFlagExclude: int
    :return: The median read length.
    :rtype: int

    :raises ValueError: If the read length cannot be determined after scanning `maxIterations` reads.

    :seealso: :func:`cconsenrich.cgetReadLength`
    """
    medianLength = cconsenrich.cgetReadLength(
        bamFile, numReads, samThreads, maxIterations, samFlagExclude
    )
    # The Cython helper signals failure with a zero length.
    if medianLength != 0:
        return medianLength
    raise ValueError(
        f"Failed to determine read length in {bamFile}. Revise `numReads`, and/or `samFlagExclude` parameters?"
    )
478
+
479
+
480
def getReadLengths(
    bamFiles: List[str],
    numReads: int,
    maxIterations: int,
    samThreads: int,
    samFlagExclude: int,
) -> List[int]:
    r"""Get read lengths for a list of BAM files.

    :seealso: :func:`getReadLength`
    """
    inferredLengths: List[int] = []
    for bamFile in bamFiles:
        inferredLengths.append(
            getReadLength(
                bamFile,
                numReads=numReads,
                maxIterations=maxIterations,
                samThreads=samThreads,
                samFlagExclude=samFlagExclude,
            )
        )
    return inferredLengths
501
+
502
+
503
def readBamSegments(
    bamFiles: List[str],
    chromosome: str,
    start: int,
    end: int,
    stepSize: int,
    readLengths: List[int],
    scaleFactors: List[float],
    oneReadPerBin: int,
    samThreads: int,
    samFlagExclude: int,
    offsetStr: Optional[str] = "0,0",
    applyAsinh: Optional[bool] = False,
    applyLog: Optional[bool] = False,
    extendBP: Optional[List[int]] = None,
    maxInsertSize: Optional[int] = 1000,
    pairedEndMode: Optional[int] = 0,
    inferFragmentLength: Optional[int] = 0,
) -> npt.NDArray[np.float32]:
    r"""Calculate tracks of read counts (or a function thereof) for each BAM file.

    See :func:`cconsenrich.creadBamSegment` for the underlying implementation in Cython.

    :param bamFiles: See :class:`inputParams`.
    :type bamFiles: List[str]
    :param chromosome: Chromosome to read.
    :type chromosome: str
    :param start: Start position of the genomic segment.
    :type start: int
    :param end: End position of the genomic segment.
    :type end: int
    :param stepSize: See :class:`countingParams`.
    :type stepSize: int
    :param readLengths: List of read lengths for each BAM file.
    :type readLengths: List[int]
    :param scaleFactors: List of scale factors for each BAM file.
    :type scaleFactors: List[float]
    :param oneReadPerBin: See :class:`samParams`.
    :type oneReadPerBin: int
    :param samThreads: See :class:`samParams`.
    :type samThreads: int
    :param samFlagExclude: See :class:`samParams`.
    :type samFlagExclude: int
    :param offsetStr: See :class:`samParams`. ``None`` or empty is treated as "0,0".
    :type offsetStr: str
    :param applyAsinh: See :class:`countingParams`.
    :type applyAsinh: bool
    :param applyLog: See :class:`countingParams`. Ignored if `applyAsinh` is set.
    :type applyLog: bool
    :param extendBP: See :class:`samParams`. ``None`` (default) means no extension.
    :type extendBP: List[int]
    :param maxInsertSize: See :class:`samParams`.
    :type maxInsertSize: int
    :param pairedEndMode: See :class:`samParams`.
    :type pairedEndMode: int
    :param inferFragmentLength: See :class:`samParams`.
    :type inferFragmentLength: int
    :return: Array of shape ``(len(bamFiles), numIntervals)`` of scaled
        (and optionally asinh/log1p-transformed) counts.
    :rtype: npt.NDArray[np.float32]
    :raises ValueError: If inputs are inconsistent or `offsetStr` is malformed.
    """

    if len(bamFiles) == 0:
        raise ValueError("bamFiles list is empty")

    if len(readLengths) != len(bamFiles) or len(scaleFactors) != len(bamFiles):
        raise ValueError(
            "readLengths and scaleFactors must match bamFiles length"
        )

    # None (the new default, replacing a shared mutable []) resolves to zero
    # extension for every file.
    extendBP = resolveExtendBP(extendBP, bamFiles)

    # Parse the "forward,reverse" 5' shift pair.
    # Bug fix: the previous guard `str(offsetStr) or "0,0"` could never fall
    # back, since str() of any object (including None) is truthy -- an
    # explicit None used to crash on int("None"). Validate explicitly instead.
    if offsetStr is None or not str(offsetStr).strip():
        offsetStr = "0,0"
    offsetFields = str(offsetStr).replace(" ", "").split(",")
    if len(offsetFields) != 2:
        raise ValueError(
            f"offsetStr must contain exactly two comma-separated integers, got {offsetStr!r}"
        )
    try:
        offsetForward, offsetReverse = (int(x) for x in offsetFields)
    except ValueError:
        raise ValueError(
            f"offsetStr must contain exactly two comma-separated integers, got {offsetStr!r}"
        )

    # ceil((end - start) / stepSize) intervals.
    numIntervals = ((end - start) + stepSize - 1) // stepSize
    counts = np.empty((len(bamFiles), numIntervals), dtype=np.float32)
    for j, bam in enumerate(bamFiles):
        logger.info(f"Reading {chromosome}: {bam}")
        arr = cconsenrich.creadBamSegment(
            bam,
            chromosome,
            start,
            end,
            stepSize,
            readLengths[j],
            oneReadPerBin,
            samThreads,
            samFlagExclude,
            offsetForward,
            offsetReverse,
            extendBP[j],
            maxInsertSize,
            pairedEndMode,
            inferFragmentLength,
        )
        # FFR: use ufuncs?
        counts[j, :] = arr
        counts[j, :] *= np.float32(scaleFactors[j])
        if applyAsinh:
            counts[j, :] = np.arcsinh(counts[j, :])
        elif applyLog:
            counts[j, :] = np.log1p(counts[j, :])
    return counts
597
+
598
+
599
def getAverageLocalVarianceTrack(
    values: np.ndarray,
    stepSize: int,
    approximationWindowLengthBP: int,
    lowPassWindowLengthBP: int,
    minR: float,
    maxR: float,
    lowPassFilterType: Optional[str] = "median",
) -> npt.NDArray[np.float32]:
    r"""Approximate a positional/local noise level track for a single sample's read-density-based values.

    First computes a moving average of ``values`` using a bp-length window
    ``approximationWindowLengthBP`` and a moving average of ``values**2`` over the
    same window. Their difference is used to approximate the local variance. A low-pass filter
    (median or mean) with window ``lowPassWindowLengthBP`` then smooths the variance track.
    Finally, the track is clipped to ``[minR, maxR]`` to yield the local noise level track.

    :param values: 1D array of read-density-based values for a single sample.
    :type values: np.ndarray
    :param stepSize: Bin size (bp).
    :type stepSize: int
    :param approximationWindowLengthBP: Window (bp) for local mean and second-moment. See :class:`observationParams`.
    :type approximationWindowLengthBP: int
    :param lowPassWindowLengthBP: Window (bp) for the low-pass filter on the variance track. See :class:`observationParams`.
    :type lowPassWindowLengthBP: int
    :param minR: Lower clip for the returned noise level. See :class:`observationParams`.
    :type minR: float
    :param maxR: Upper clip for the returned noise level. See :class:`observationParams`.
    :type maxR: float
    :param lowPassFilterType: ``"median"`` (default) or ``"mean"``. ``None`` is treated as ``"median"``.
    :type lowPassFilterType: Optional[str]
    :return: Local noise level per interval.
    :rtype: npt.NDArray[np.float32]
    :raises ValueError: If `lowPassFilterType` is not ``None``, ``"median"``, or ``"mean"``.

    :seealso: :class:`observationParams`
    """
    values = np.asarray(values, dtype=np.float32)

    # Validate the filter choice up front. Previously an unrecognized type
    # silently left the noise track at all-zeros (clipped to a constant minR),
    # masking configuration typos.
    filterName = "median" if lowPassFilterType is None else str(lowPassFilterType).lower()
    if filterName not in ("median", "mean"):
        raise ValueError(
            f"Unrecognized lowPassFilterType: {lowPassFilterType!r}; expected 'median' or 'mean'"
        )

    windowLength = int(approximationWindowLengthBP / stepSize)
    if windowLength % 2 == 0:
        # odd-length windows keep the filters centered
        windowLength += 1

    if len(values) < 3:
        # Too short for a moving window: fall back to the global variance.
        # Bug fix: the constant is now clipped to [minR, maxR] like the main
        # path (previously only the lower bound was enforced).
        constVar = float(np.clip(np.var(values), minR, maxR))
        return np.full_like(values, constVar, dtype=np.float32)

    # first get a simple moving average of the values
    localMeanTrack: npt.NDArray[np.float32] = ndimage.uniform_filter(
        values, size=windowLength, mode="nearest"
    )

    # ~ E[X_i^2] - E[X_i]^2 ~
    localVarTrack: npt.NDArray[np.float32] = (
        ndimage.uniform_filter(values**2, size=windowLength, mode="nearest")
        - localMeanTrack**2
    )

    # safe-guard: the difference of convolutions can go slightly negative
    # from floating-point error / edge effects; variance must be >= 0.
    localVarTrack = np.maximum(localVarTrack, 0.0)

    # low-pass filter on the local variance track: positional 'noise level' track
    lpassWindowLength = int(lowPassWindowLengthBP / stepSize)
    if lpassWindowLength % 2 == 0:
        lpassWindowLength += 1

    if filterName == "median":
        noiseLevel = ndimage.median_filter(localVarTrack, size=lpassWindowLength)
    else:
        noiseLevel = ndimage.uniform_filter(localVarTrack, size=lpassWindowLength)

    return np.clip(noiseLevel, minR, maxR).astype(np.float32)
685
+
686
+
687
def constructMatrixF(deltaF: float) -> npt.NDArray[np.float32]:
    r"""Build the state transition matrix for the process model

    :param deltaF: See :class:`processParams`.
    :type deltaF: float
    :return: The state transition matrix :math:`\mathbf{F}`
    :rtype: npt.NDArray[np.float32]

    :seealso: :class:`processParams`
    """
    # Identity with deltaF coupling the second state variable into the first.
    return np.array(
        [[1.0, deltaF], [0.0, 1.0]],
        dtype=np.float32,
    )
700
+
701
+
702
def constructMatrixQ(
    minDiagQ: float, offDiagQ: float = 0.0
) -> npt.NDArray[np.float32]:
    r"""Build the initial process noise covariance matrix :math:`\mathbf{Q}_{[1]}`.

    :param minDiagQ: See :class:`processParams`.
    :type minDiagQ: float
    :param offDiagQ: See :class:`processParams`.
    :type offDiagQ: float
    :return: The initial process noise covariance matrix :math:`\mathbf{Q}_{[1]}`.
    :rtype: npt.NDArray[np.float32]

    :seealso: :class:`processParams`
    """
    # Symmetric 2x2: minDiagQ on the diagonal, offDiagQ elsewhere.
    return np.array(
        [[minDiagQ, offDiagQ], [offDiagQ, minDiagQ]],
        dtype=np.float32,
    )
724
+
725
+
726
def constructMatrixH(
    m: int, coefficients: Optional[np.ndarray] = None
) -> npt.NDArray[np.float32]:
    r"""Build the observation model matrix :math:`\mathbf{H}`.

    :param m: Number of observations.
    :type m: int
    :param coefficients: Optional coefficients for the observation model,
        which can be used to weight the observations manually.
    :type coefficients: Optional[np.ndarray]
    :return: The observation model matrix :math:`\mathbf{H}`.
    :rtype: npt.NDArray[np.float32]

    :seealso: :class:`observationParams`, class:`inputParams`
    """
    if coefficients is None:
        weights = np.ones(m, dtype=np.float32)
    else:
        # accepts lists and arrays of any numeric dtype
        weights = np.asarray(coefficients, dtype=np.float32)
    matH = np.zeros((m, 2), dtype=np.float32)
    # first column carries the per-sample weights; second column stays zero
    matH[:, 0] = weights
    return matH
749
+
750
+
751
def runConsenrich(
    matrixData: np.ndarray,
    matrixMunc: np.ndarray,
    deltaF: float,
    minQ: float,
    maxQ: float,
    offDiagQ: float,
    dStatAlpha: float,
    dStatd: float,
    dStatPC: float,
    stateInit: float,
    stateCovarInit: float,
    boundState: bool,
    stateLowerBound: float,
    stateUpperBound: float,
    chunkSize: int,
    progressIter: int,
    coefficientsH: Optional[np.ndarray] = None,
    residualCovarInversionFunc: Optional[Callable] = None,
    adjustProcessNoiseFunc: Optional[Callable] = None,
) -> Tuple[
    npt.NDArray[np.float32], npt.NDArray[np.float32], npt.NDArray[np.float32]
]:
    r"""Run consenrich on a contiguous segment (e.g. a chromosome) of read-density-based data.
    Completes the forward and backward passes given data and approximated observation noise
    covariance matrices :math:`\mathbf{R}_{[1:n, (11:mm)]}`.

    :param matrixData: Read density data for a single chromosome or general contiguous segment,
        possibly preprocessed. Two-dimensional array of shape :math:`m \times n` where :math:`m`
        is the number of samples/tracks and :math:`n` the number of genomic intervals.
    :type matrixData: np.ndarray
    :param matrixMunc: Uncertainty estimates for the read coverage data.
        Two-dimensional array of shape :math:`m \times n` where :math:`m` is the number of samples/tracks
        and :math:`n` the number of genomic intervals. See :func:`getMuncTrack`.
    :type matrixMunc: np.ndarray
    :param deltaF: See :class:`processParams`.
    :type deltaF: float
    :param minQ: See :class:`processParams`.
    :type minQ: float
    :param maxQ: See :class:`processParams`.
    :type maxQ: float
    :param offDiagQ: See :class:`processParams`.
    :type offDiagQ: float
    :param dStatAlpha: See :class:`processParams`.
    :type dStatAlpha: float
    :param dStatd: See :class:`processParams`.
    :type dStatd: float
    :param dStatPC: See :class:`processParams`.
    :type dStatPC: float
    :param stateInit: See :class:`stateParams`.
    :type stateInit: float
    :param stateCovarInit: See :class:`stateParams`.
    :type stateCovarInit: float
    :param boundState: If True, clip the primary (first) state component to
        ``[stateLowerBound, stateUpperBound]`` after the backward pass. See :class:`stateParams`.
    :type boundState: bool
    :param stateLowerBound: Lower clip bound applied when `boundState` is True.
    :type stateLowerBound: float
    :param stateUpperBound: Upper clip bound applied when `boundState` is True.
    :type stateUpperBound: float
    :param chunkSize: Number of genomic intervals' data to keep in memory before flushing to disk.
    :type chunkSize: int
    :param progressIter: The number of iterations after which to log progress.
    :type progressIter: int
    :param coefficientsH: Optional coefficients for the observation model matrix :math:`\mathbf{H}`.
        If None, the coefficients are set to 1.0 for all samples.
    :type coefficientsH: Optional[np.ndarray]
    :param residualCovarInversionFunc: Callable function to invert the observation covariance matrix :math:`\mathbf{E}_{[i]}`.
        If None, defaults to :func:`cconsenrich.cinvertMatrixE`.
    :type residualCovarInversionFunc: Optional[Callable]
    :param adjustProcessNoiseFunc: Function to adjust the process noise covariance matrix :math:`\mathbf{Q}_{[i]}`.
        If None, defaults to :func:`cconsenrich.updateProcessNoiseCovariance`.
    :type adjustProcessNoiseFunc: Optional[Callable]
    :return: Tuple of three numpy arrays:
        - state estimates :math:`\widetilde{\mathbf{x}}_{[i]}` of shape :math:`n \times 2`
        - state covariance estimates :math:`\widetilde{\mathbf{P}}_{[i]}` of shape :math:`n \times 2 \times 2`
        - post-fit residuals :math:`\widetilde{\mathbf{y}}_{[i]}` of shape :math:`n \times m`
    :rtype: Tuple[np.ndarray, np.ndarray, np.ndarray]

    :raises ValueError: If the number of samples in `matrixData` is not equal to the number of samples in `matrixMunc`.
    :seealso: :class:`observationParams`, :class:`processParams`, :class:`stateParams`
    """
    matrixData = np.ascontiguousarray(matrixData, dtype=np.float32)
    matrixMunc = np.ascontiguousarray(matrixMunc, dtype=np.float32)
    # NOTE(review): for 1-D input both m and n fall back to 1, yet the loops
    # below index matrixData[:, i] as 2-D — presumably callers always pass a
    # 2-D (m x n) array; confirm 1-D inputs are reshaped upstream.
    m: int = 1 if matrixData.ndim == 1 else matrixData.shape[0]
    n: int = 1 if matrixData.ndim == 1 else matrixData.shape[1]
    inflatedQ: bool = False
    dStat: float = np.float32(0.0)
    IKH: np.ndarray = np.zeros(shape=(2, 2), dtype=np.float32)
    matrixEInverse: np.ndarray = np.zeros(shape=(m, m), dtype=np.float32)

    # Model matrices: F (state transition), Q (process noise, adapted per
    # interval), P (state covariance), H (observation model), K (gain).
    matrixF: np.ndarray = constructMatrixF(deltaF)
    matrixQ: np.ndarray = constructMatrixQ(minQ, offDiagQ=offDiagQ)
    matrixQCopy: np.ndarray = matrixQ.copy()  # baseline Q for the adaptive update
    matrixP: np.ndarray = np.eye(2, dtype=np.float32) * np.float32(
        stateCovarInit
    )
    matrixH: np.ndarray = constructMatrixH(m, coefficients=coefficientsH)
    matrixK: np.ndarray = np.zeros((2, m), dtype=np.float32)
    vectorX: np.ndarray = np.array([stateInit, 0.0], dtype=np.float32)
    vectorY: np.ndarray = np.zeros(m, dtype=np.float32)
    matrixI2: np.ndarray = np.eye(2, dtype=np.float32)

    if residualCovarInversionFunc is None:
        residualCovarInversionFunc = cconsenrich.cinvertMatrixE
    if adjustProcessNoiseFunc is None:
        adjustProcessNoiseFunc = cconsenrich.updateProcessNoiseCovariance

    # ==========================
    # forward: 0,1,2,...,n-1
    # ==========================
    # Per-interval results are spilled to anonymous temp-file-backed memmaps
    # so memory stays bounded for long chromosomes.
    stateForward = np.memmap(
        NamedTemporaryFile(delete=True),
        dtype=np.float32,
        mode="w+",
        shape=(n, 2),
    )
    stateCovarForward = np.memmap(
        NamedTemporaryFile(delete=True),
        dtype=np.float32,
        mode="w+",
        shape=(n, 2, 2),
    )
    pNoiseForward = np.memmap(
        NamedTemporaryFile(delete=True),
        dtype=np.float32,
        mode="w+",
        shape=(n, 2, 2),
    )
    progressIter = max(1, progressIter)  # guard against modulo-by-zero below
    for i in range(n):
        if i % progressIter == 0:
            logger.info(f"Forward pass interval: {i + 1}/{n}")
        # Predict step: propagate state and covariance through F, add Q.
        vectorZ = matrixData[:, i]
        vectorX = matrixF @ vectorX
        matrixP = matrixF @ matrixP @ matrixF.T + matrixQ
        # Innovation (pre-fit residual) against the observation model.
        vectorY = vectorZ - (matrixH @ vectorX)

        matrixEInverse = residualCovarInversionFunc(
            matrixMunc[:, i], np.float32(matrixP[0, 0])
        )
        # D-statistic: median of squared innovations scaled by the inverse
        # residual variance; drives the adaptive process-noise update.
        Einv_diag = np.diag(matrixEInverse)
        dStat = np.median((vectorY**2) * Einv_diag)
        matrixQ, inflatedQ = adjustProcessNoiseFunc(
            matrixQ,
            matrixQCopy,
            dStat,
            dStatAlpha,
            dStatd,
            dStatPC,
            inflatedQ,
            maxQ,
            minQ,
        )
        # Update step: gain, then Joseph-form covariance update
        # (IKH P IKH^T + K R K^T) for numerical symmetry/stability.
        matrixK = (matrixP @ matrixH.T) @ matrixEInverse
        IKH = matrixI2 - (matrixK @ matrixH)

        vectorX = vectorX + (matrixK @ vectorY)
        matrixP = (IKH) @ matrixP @ (IKH).T + (
            matrixK * matrixMunc[:, i]
        ) @ matrixK.T
        stateForward[i] = vectorX.astype(np.float32)
        stateCovarForward[i] = matrixP.astype(np.float32)
        pNoiseForward[i] = matrixQ.astype(np.float32)

        # Periodically flush the memmaps so dirty pages hit disk.
        if i % chunkSize == 0 and i > 0:
            stateForward.flush()
            stateCovarForward.flush()
            pNoiseForward.flush()

    stateForward.flush()
    stateCovarForward.flush()
    pNoiseForward.flush()
    stateForwardArr = stateForward
    stateCovarForwardArr = stateCovarForward
    pNoiseForwardArr = pNoiseForward

    # ==========================
    # backward: n,n-1,n-2,...,0
    # ==========================
    # RTS (Rauch-Tung-Striebel) smoothing pass over the stored forward results.
    stateSmoothed = np.memmap(
        NamedTemporaryFile(delete=True),
        dtype=np.float32,
        mode="w+",
        shape=(n, 2),
    )
    stateCovarSmoothed = np.memmap(
        NamedTemporaryFile(delete=True),
        dtype=np.float32,
        mode="w+",
        shape=(n, 2, 2),
    )
    postFitResiduals = np.memmap(
        NamedTemporaryFile(delete=True),
        dtype=np.float32,
        mode="w+",
        shape=(n, m),
    )

    # Initialize the recursion at the last interval with the forward posterior.
    stateSmoothed[-1] = np.float32(stateForwardArr[-1])
    stateCovarSmoothed[-1] = np.float32(stateCovarForwardArr[-1])
    postFitResiduals[-1] = np.float32(
        matrixData[:, -1] - (matrixH @ stateSmoothed[-1])
    )

    for k in range(n - 2, -1, -1):
        if k % progressIter == 0:
            logger.info(f"Backward pass interval: {k + 1}/{n}")
        forwardStatePosterior = stateForwardArr[k]
        forwardCovariancePosterior = stateCovarForwardArr[k]
        # One-step prediction from interval k using the Q stored at k+1.
        backwardInitialState = matrixF @ forwardStatePosterior
        backwardInitialCovariance = (
            matrixF @ forwardCovariancePosterior @ matrixF.T
            + pNoiseForwardArr[k + 1]
        )

        # Smoother gain G = P_k F^T (P_pred)^{-1}, computed via a linear
        # solve instead of an explicit inverse.
        smootherGain = np.linalg.solve(
            backwardInitialCovariance.T,
            (forwardCovariancePosterior @ matrixF.T).T,
        ).T
        stateSmoothed[k] = (
            forwardStatePosterior
            + smootherGain @ (stateSmoothed[k + 1] - backwardInitialState)
        ).astype(np.float32)

        stateCovarSmoothed[k] = (
            forwardCovariancePosterior
            + smootherGain
            @ (stateCovarSmoothed[k + 1] - backwardInitialCovariance)
            @ smootherGain.T
        ).astype(np.float32)
        postFitResiduals[k] = np.float32(
            matrixData[:, k] - matrixH @ stateSmoothed[k]
        )

        if k % chunkSize == 0 and k > 0:
            stateSmoothed.flush()
            stateCovarSmoothed.flush()
            postFitResiduals.flush()

    stateSmoothed.flush()
    stateCovarSmoothed.flush()
    postFitResiduals.flush()
    # Optional clipping of the primary state component (applied in memory;
    # residuals computed above still reflect the unclipped states).
    if boundState:
        stateSmoothed[:, 0] = np.clip(
            stateSmoothed[:, 0], stateLowerBound, stateUpperBound
        ).astype(np.float32)

    return stateSmoothed[:], stateCovarSmoothed[:], postFitResiduals[:]
993
+
994
+
995
def getPrimaryState(
    stateVectors: np.ndarray, roundPrecision: int = 3
) -> npt.NDArray[np.float32]:
    r"""Extract the primary (first) component of each state vector.

    :param stateVectors: State vectors from :func:`runConsenrich`.
    :type stateVectors: npt.NDArray[np.float32]
    :param roundPrecision: Number of decimal places kept in the output.
    :type roundPrecision: int
    :return: A one-dimensional numpy array of the primary state estimates.
    :rtype: npt.NDArray[np.float32]
    """
    primaryTrack = np.ascontiguousarray(stateVectors[:, 0], dtype=np.float32)
    # round in place and hand back the same buffer
    return np.round(primaryTrack, decimals=roundPrecision, out=primaryTrack)
1008
+
1009
+
1010
def getStateCovarTrace(
    stateCovarMatrices: np.ndarray, roundPrecision: int = 3
) -> npt.NDArray[np.float32]:
    r"""Compute the trace of each estimated state covariance matrix.

    :param stateCovarMatrices: Estimated state covariance matrices
        :math:`\widetilde{\mathbf{P}}_{[i]}` from :func:`runConsenrich`.
    :type stateCovarMatrices: np.ndarray
    :param roundPrecision: Number of decimal places kept in the output.
    :type roundPrecision: int
    :return: A one-dimensional numpy array of covariance traces.
    :rtype: npt.NDArray[np.float32]
    """
    # the C routine expects a C-contiguous float32 array
    covarContiguous = np.ascontiguousarray(stateCovarMatrices, dtype=np.float32)
    traces = cconsenrich.cgetStateCovarTrace(covarContiguous)
    np.round(traces, decimals=roundPrecision, out=traces)
    return traces
1026
+
1027
+
1028
def getPrecisionWeightedResidual(
    postFitResiduals: np.ndarray,
    matrixMunc: np.ndarray,
    roundPrecision: int = 3,
    stateCovarSmoothed: Optional[np.ndarray] = None,
) -> npt.NDArray[np.float32]:
    r"""Get a one-dimensional precision-weighted array residuals after running Consenrich.

    Applies an inverse-variance weighting of the post-fit residuals :math:`\widetilde{\mathbf{y}}_{[i]}` and
    returns a one-dimensional array of "precision-weighted residuals". The state-level uncertainty can also be
    incorporated given `stateCovarSmoothed`.

    :param postFitResiduals: Post-fit residuals :math:`\widetilde{\mathbf{y}}_{[i]}` from :func:`runConsenrich`.
    :type postFitResiduals: np.ndarray
    :param matrixMunc: An :math:`m \times n` sample-by-interval matrix -- At genomic intervals :math:`i = 1,2,\ldots,n`, the respective length-:math:`m` column is :math:`\mathbf{R}_{[i,11:mm]}`.
        That is, the observation noise levels for each sample :math:`j=1,2,\ldots,m` at interval :math:`i`. To keep memory usage minimal `matrixMunc` is not returned in full or computed in
        in :func:`runConsenrich`. If using Consenrich programmatically, run :func:`consenrich.core.getMuncTrack` for each sample's count data (rows in the matrix output of :func:`readBamSegments`).
    :type matrixMunc: np.ndarray
    :param stateCovarSmoothed: Smoothed state covariance matrices :math:`\widetilde{\mathbf{P}}_{[i]}` from :func:`runConsenrich`.
    :type stateCovarSmoothed: Optional[np.ndarray]
    :return: A one-dimensional array of "precision-weighted residuals"
    :rtype: npt.NDArray[np.float32]
    :raises ValueError: If `matrixMunc` is not shaped ``(m, n)`` or
        `stateCovarSmoothed` (when given) is not ``n`` stacked 2x2 matrices.
    """

    n, m = postFitResiduals.shape
    if matrixMunc.shape != (m, n):
        raise ValueError(
            f"matrixMunc should be (m,n)=({m}, {n}): observed {matrixMunc.shape}"
        )
    if stateCovarSmoothed is not None and (
        stateCovarSmoothed.ndim < 3 or len(stateCovarSmoothed) != n
    ):
        raise ValueError(
            "stateCovarSmoothed must be shape (n) x (2,2) (if provided)"
        )

    postFitResiduals_CContig = np.ascontiguousarray(
        postFitResiduals, dtype=np.float32
    )

    # Fix: the original conflated "a private copy is needed" with "state
    # uncertainty should be added" in a single `needsCopy` flag. When
    # matrixMunc was read-only and stateCovarSmoothed was None, it then
    # indexed None and crashed. The two decisions are now independent.
    addStateUncertainty = stateCovarSmoothed is not None
    needsCopy = addStateUncertainty or (not matrixMunc.flags.writeable)

    matrixMunc_CContig = np.array(
        matrixMunc, dtype=np.float32, order="C", copy=needsCopy
    )

    if addStateUncertainty:
        # adds the 'primary' state uncertainty to observation noise covariance :math:`\mathbf{R}_{[i,:]}`
        # primary state uncertainty (0,0) :math:`\mathbf{P}_{[i]} \in \mathbb{R}^{2 \times 2}`
        stateCovarArr00 = np.asarray(
            stateCovarSmoothed[:, 0, 0], dtype=np.float32
        )
        # (m, n) += (n,) broadcasts the per-interval variance across samples
        matrixMunc_CContig += stateCovarArr00

    # floor the variances to avoid division blow-ups in the C routine
    np.maximum(matrixMunc_CContig, np.float32(1e-8), out=matrixMunc_CContig)
    out = cconsenrich.cgetPrecisionWeightedResidual(
        postFitResiduals_CContig, matrixMunc_CContig
    )
    np.round(out, decimals=roundPrecision, out=out)
    return out
1087
+
1088
+
1089
def getMuncTrack(
    chromosome: str,
    intervals: np.ndarray,
    stepSize: int,
    rowValues: np.ndarray,
    minR: float,
    maxR: float,
    useALV: bool,
    useConstantNoiseLevel: bool,
    noGlobal: bool,
    localWeight: float,
    globalWeight: float,
    approximationWindowLengthBP: int,
    lowPassWindowLengthBP: int,
    returnCenter: bool,
    sparseMap: Optional[dict[int, int]] = None,
    lowPassFilterType: Optional[str] = "median",
) -> npt.NDArray[np.float32]:
    r"""Get observation noise variance :math:`R_{[:,jj]}` for the sample :math:`j`.

    Combines a local ALV estimate (see :func:`getAverageLocalVarianceTrack`) with an
    optional global component. If ``useALV`` is True, *only* the ALV is used. If
    ``useConstantNoiseLevel`` is True, a constant track set to the global mean is used.
    When a ``sparseMap`` is provided, local values are aggregated over nearby 'sparse'
    regions before mixing with the global component.

    For heterochromatic or repressive marks (H3K9me3, H3K27me3, MNase-seq, etc.), consider
    setting `useALV=True` to prevent inflated sample-level noise estimates.

    NOTE(review): `chromosome`, `intervals`, and `returnCenter` are not read in
    this body -- presumably kept for interface compatibility; confirm with callers.

    :param chromosome: Tracks are approximated for this chromosome.
    :type chromosome: str
    :param intervals: Genomic intervals for which to compute the noise track.
    :param stepSize: See :class:`countingParams`.
    :type stepSize: int
    :param rowValues: Read-density-based values for the sample :math:`j` at the
        genomic intervals :math:`i=1,2,\ldots,n`.
    :type rowValues: np.ndarray
    :param minR: See :class:`observationParams`.
    :type minR: float
    :param maxR: See :class:`observationParams`.
    :type maxR: float
    :param useALV: See :class:`observationParams`.
    :type useALV: bool
    :param useConstantNoiseLevel: See :class:`observationParams`.
    :type useConstantNoiseLevel: bool
    :param noGlobal: See :class:`observationParams`.
    :type noGlobal: bool
    :param localWeight: See :class:`observationParams`.
    :type localWeight: float
    :param globalWeight: See :class:`observationParams`.
    :type globalWeight: float
    :param approximationWindowLengthBP: See :class:`observationParams` and/or :func:`getAverageLocalVarianceTrack`.
    :type approximationWindowLengthBP: int
    :param lowPassWindowLengthBP: See :class:`observationParams` and/or :func:`getAverageLocalVarianceTrack`.
    :type lowPassWindowLengthBP: int
    :param sparseMap: Optional mapping (dictionary) of interval indices to the nearest sparse regions. See :func:`getSparseMap`.
    :type sparseMap: Optional[dict[int, int]]
    :param lowPassFilterType: The type of low-pass filter to use in average local variance track (e.g., 'median', 'mean').
    :type lowPassFilterType: Optional[str]
    :return: A one-dimensional numpy array of the observation noise track for the sample :math:`j`.
    :rtype: npt.NDArray[np.float32]
    """
    localVarianceTrack = getAverageLocalVarianceTrack(
        rowValues,
        stepSize,
        approximationWindowLengthBP,
        lowPassWindowLengthBP,
        minR,
        maxR,
        lowPassFilterType,
    ).astype(np.float32)

    globalNoise = np.float32(np.mean(localVarianceTrack))

    # Local-only estimate: no global mixing requested (or ALV explicitly chosen).
    if noGlobal or globalWeight == 0 or useALV:
        return np.clip(localVarianceTrack, minR, maxR).astype(np.float32)

    # Constant track at the global mean. Parentheses make the original
    # operator precedence explicit (`and` binds tighter than `or`).
    if useConstantNoiseLevel or (localWeight == 0 and sparseMap is None):
        constantTrack = globalNoise * np.ones_like(rowValues)
        return np.clip(constantTrack, minR, maxR).astype(np.float32)

    # Aggregate local values over nearby sparse regions when a map is given.
    if sparseMap is not None:
        localVarianceTrack = cconsenrich.cSparseAvg(localVarianceTrack, sparseMap)

    mixedTrack = (
        localVarianceTrack * localWeight
        + np.mean(localVarianceTrack) * globalWeight
    )
    return np.clip(mixedTrack, minR, maxR).astype(np.float32)
1176
+
1177
+
1178
def sparseIntersection(
    chromosome: str, intervals: np.ndarray, sparseBedFile: str
) -> npt.NDArray[np.int64]:
    r"""Returns intervals in the chromosome that overlap with the sparse features.

    Not relevant if `observationParams.useALV` is True.

    :param chromosome: The chromosome name.
    :type chromosome: str
    :param intervals: The genomic intervals to consider.
    :type intervals: np.ndarray
    :param sparseBedFile: Path to the sparse BED file.
    :type sparseBedFile: str
    :return: A numpy array of start positions of the sparse features that overlap with the intervals
    :rtype: np.ndarray[Tuple[Any], np.dtype[Any]]
    """
    # Fix: the sort/merge/filter/each pipeline was previously built twice with
    # the first pass discarded -- only the second (int-bound) pass is kept.
    stepSize: int = intervals[1] - intervals[0]
    start0: int = int(intervals[0])
    last: int = int(intervals[-1])
    chromFeatures: bed.BedTool = (
        bed.BedTool(sparseBedFile)
        .sort()
        .merge()
        .filter(
            lambda b: (
                b.chrom == chromosome
                and b.start > start0
                and b.end < last
                and (b.end - b.start) >= stepSize
            )
        )
    )
    # Snap each merged feature to a single step-aligned interval at its center.
    centeredFeatures: bed.BedTool = chromFeatures.each(
        adjustFeatureBounds, stepSize=stepSize
    )
    centeredStarts = []
    for f in centeredFeatures:
        s = int(f.start)
        # keep only starts that land exactly on the interval grid
        if start0 <= s <= last and (s - start0) % stepSize == 0:
            centeredStarts.append(s)
    return np.asarray(centeredStarts, dtype=np.int64)
1237
+
1238
+
1239
def adjustFeatureBounds(feature: bed.Interval, stepSize: int) -> bed.Interval:
    r"""Adjust the start and end positions of a BED feature to be centered around a step."""
    midpoint = (feature.start + feature.end) // 2
    feature.start = cconsenrich.stepAdjustment(midpoint, stepSize)
    feature.end = feature.start + stepSize
    return feature
1246
+
1247
+
1248
def getSparseMap(
    chromosome: str,
    intervals: np.ndarray,
    numNearest: int,
    sparseBedFile: str,
) -> dict:
    r"""Build a map between each genomic interval and numNearest sparse features

    :param chromosome: The chromosome name. Note, this function only needs to be run once per chromosome.
    :type chromosome: str
    :param intervals: The genomic intervals to map.
    :type intervals: np.ndarray
    :param numNearest: The number of nearest sparse features to consider
    :type numNearest: int
    :param sparseBedFile: path to the sparse BED file.
    :type sparseBedFile: str
    :return: A dictionary mapping each interval index to the indices of the nearest sparse regions.
    :rtype: dict[int, np.ndarray]
    """
    # Fix: removed a dead no-op self-assignment (`numNearest = numNearest`).
    sparseStarts = sparseIntersection(chromosome, intervals, sparseBedFile)
    # positions of the sparse starts within the interval grid
    idxSparseInIntervals = np.searchsorted(intervals, sparseStarts, side="left")
    # for each interval, where it would insert into the sparse-start list
    centers = np.searchsorted(sparseStarts, intervals, side="left")
    sparseMap: dict = {}
    for i, (interval, center) in enumerate(zip(intervals, centers)):
        # candidate window of up to 2*numNearest sparse features around `center`
        left = max(0, center - numNearest)
        right = min(len(sparseStarts), center + numNearest)
        candidates = np.arange(left, right)
        dists = np.abs(sparseStarts[candidates] - interval)
        # keep the numNearest candidates closest in genomic distance
        take = np.argsort(dists)[:numNearest]
        sparseMap[i] = idxSparseInIntervals[candidates[take]]
    return sparseMap
1281
+
1282
+
1283
def getBedMask(
    chromosome: str,
    bedFile: str,
    intervals: np.ndarray,
) -> np.ndarray:
    r"""Return a boolean mask over `intervals` marking overlap with BED features.

    Thin wrapper around :func:`cconsenrich.cbedMask`.

    :param chromosome: The chromosome name.
    :type chromosome: str
    :param bedFile: Path to a sorted and merged BED file.
    :type bedFile: str
    :param intervals: chromosome-specific, sorted, non-overlapping start positions
        of genomic intervals. Each interval is assumed `stepSize`.
    :type intervals: np.ndarray
    :return: An `intervals`-length mask s.t. True indicates the interval overlaps
        a feature in the BED file.
    :rtype: np.ndarray
    :raises ValueError: If the BED file is missing, fewer than two intervals are
        given, or the first and last interval steps disagree.
    """
    if not os.path.exists(bedFile):
        raise ValueError(f"Could not find {bedFile}")
    if len(intervals) < 2:
        raise ValueError("intervals must contain at least two positions")

    bedPath = str(bedFile)
    # (possibly redundant) uint32 conversion + cheap constant-step sanity
    # check comparing only the first and last steps
    intervalsU32 = np.asarray(intervals, dtype=np.uint32)
    if (intervalsU32[1] - intervalsU32[0]) != (
        intervalsU32[-1] - intervalsU32[-2]
    ):
        raise ValueError("Intervals are not fixed in size")

    step: int = intervals[1] - intervals[0]
    mask = cconsenrich.cbedMask(chromosome, bedPath, intervalsU32, step)
    return mask.astype(np.bool_)