consenrich 0.7.11b2__cp314-cp314-macosx_15_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of consenrich might be problematic; see the registry's advisory page for more details.

Files changed (38)
  1. consenrich/.dylibs/libomp.dylib +0 -0
  2. consenrich/__init__.py +11 -0
  3. consenrich/cconsenrich.c +50610 -0
  4. consenrich/cconsenrich.cpython-314-darwin.so +0 -0
  5. consenrich/cconsenrich.pyx +1065 -0
  6. consenrich/consenrich.py +1802 -0
  7. consenrich/constants.py +172 -0
  8. consenrich/core.py +2068 -0
  9. consenrich/data/ce10.sizes +6 -0
  10. consenrich/data/ce10_blacklist.bed +100 -0
  11. consenrich/data/ce10_sparse.bed +11828 -0
  12. consenrich/data/ce11.sizes +6 -0
  13. consenrich/data/ce11_blacklist.bed +97 -0
  14. consenrich/data/ce11_sparse.bed +11828 -0
  15. consenrich/data/dm6.sizes +7 -0
  16. consenrich/data/dm6_blacklist.bed +182 -0
  17. consenrich/data/dm6_sparse.bed +20000 -0
  18. consenrich/data/hg19.sizes +24 -0
  19. consenrich/data/hg19_blacklist.bed +834 -0
  20. consenrich/data/hg19_sparse.bed +288358 -0
  21. consenrich/data/hg38.sizes +24 -0
  22. consenrich/data/hg38_blacklist.bed +636 -0
  23. consenrich/data/hg38_sparse.bed +288699 -0
  24. consenrich/data/mm10.sizes +21 -0
  25. consenrich/data/mm10_blacklist.bed +3435 -0
  26. consenrich/data/mm10_sparse.bed +100400 -0
  27. consenrich/data/mm39.sizes +21 -0
  28. consenrich/data/mm39_blacklist.bed +3360 -0
  29. consenrich/data/mm39_sparse.bed +100381 -0
  30. consenrich/detrorm.py +297 -0
  31. consenrich/matching.py +929 -0
  32. consenrich/misc_util.py +122 -0
  33. consenrich-0.7.11b2.dist-info/METADATA +66 -0
  34. consenrich-0.7.11b2.dist-info/RECORD +38 -0
  35. consenrich-0.7.11b2.dist-info/WHEEL +6 -0
  36. consenrich-0.7.11b2.dist-info/entry_points.txt +2 -0
  37. consenrich-0.7.11b2.dist-info/licenses/LICENSE +21 -0
  38. consenrich-0.7.11b2.dist-info/top_level.txt +1 -0
consenrich/core.py ADDED
@@ -0,0 +1,2068 @@
1
+ # -*- coding: utf-8 -*-
2
+ r"""
3
+ Consenrich core functions and classes.
4
+
5
+ """
6
+
7
+ import logging
8
+ import os
9
+ from tempfile import NamedTemporaryFile, TemporaryDirectory
10
+ from typing import (
11
+ Any,
12
+ Callable,
13
+ DefaultDict,
14
+ List,
15
+ NamedTuple,
16
+ Optional,
17
+ Tuple,
18
+ )
19
+
20
+ import matplotlib.pyplot as plt
21
+ import numpy as np
22
+ import numpy.typing as npt
23
+ import pybedtools as bed
24
+ from numpy.lib.stride_tricks import as_strided
25
+ from scipy import ndimage, signal
26
+ from scipy.stats.mstats import trimtail
27
+ from . import cconsenrich
28
+
29
+ logging.basicConfig(
30
+ level=logging.INFO,
31
+ format="%(asctime)s - %(module)s.%(funcName)s - %(levelname)s - %(message)s",
32
+ )
33
+
34
+ logger = logging.getLogger(__name__)
35
+
36
+
37
class plotParams(NamedTuple):
    r"""(Experimental) Parameters related to plotting filter results and diagnostics.

    :param plotPrefix: Prefix for output plot filenames.
    :type plotPrefix: str or None
    :param plotStateEstimatesHistogram: If True, plot a histogram of post-fit primary state estimates.
    :type plotStateEstimatesHistogram: bool
    :param plotResidualsHistogram: If True, plot a histogram of post-fit residuals.
    :type plotResidualsHistogram: bool
    :param plotStateStdHistogram: If True, plot a histogram of the posterior state standard deviations.
    :type plotStateStdHistogram: bool
    :param plotHeightInches: Height of output plots in inches.
    :type plotHeightInches: float
    :param plotWidthInches: Width of output plots in inches.
    :type plotWidthInches: float
    :param plotDPI: DPI of output plots (png).
    :type plotDPI: int
    :param plotDirectory: Directory where plots will be written.
    :type plotDirectory: str or None

    :seealso: :func:`plotStateEstimatesHistogram`, :func:`plotResidualsHistogram`, :func:`plotStateStdHistogram`
    """

    # Plotting is strictly opt-in: every toggle defaults to off / None.
    plotPrefix: str | None = None
    plotStateEstimatesHistogram: bool = False
    plotResidualsHistogram: bool = False
    plotStateStdHistogram: bool = False
    plotHeightInches: float = 6.0
    plotWidthInches: float = 8.0
    plotDPI: int = 300
    plotDirectory: str | None = None
68
+
69
+
70
class processParams(NamedTuple):
    r"""Parameters related to the process model of Consenrich.

    The process model governs the signal and variance propagation
    through the state transition :math:`\mathbf{F} \in \mathbb{R}^{2 \times 2}`
    and process noise covariance :math:`\mathbf{Q}_{[i]} \in \mathbb{R}^{2 \times 2}`
    matrices.

    :param deltaF: Scales the signal and variance propagation between adjacent genomic intervals. If ``< 0`` (default), determined based on stepSize:fragmentLength ratio.
    :type deltaF: float
    :param minQ: Minimum process noise level (diagonal in :math:`\mathbf{Q}_{[i]}`)
        for each state variable. If `minQ < 0` (default), a value based on
        the minimum observation noise level (``observationParams.minR``) is used that
        enforces numerical stability and a worst-case balance between process and observation models
        for the given number of samples.
    :type minQ: float
    :param maxQ: Maximum process noise level.
    :type maxQ: float
    :param offDiagQ: Off-diagonal entry of :math:`\mathbf{Q}_{[i]}`.
        NOTE(review): undocumented in the original; presumably couples noise
        between the two state variables — confirm against the implementation.
    :type offDiagQ: float
    :param dStatAlpha: Threshold on the deviation between the data and estimated signal -- used to determine whether the process noise is scaled up.
    :type dStatAlpha: float
    :param dStatd: Constant :math:`d` in the scaling expression :math:`\sqrt{d|D_{[i]} - \alpha_D| + c}`
        that is used to up/down-scale the process noise covariance in the event of a model mismatch.
    :type dStatd: float
    :param dStatPC: Constant :math:`c` in the scaling expression :math:`\sqrt{d|D_{[i]} - \alpha_D| + c}`
        that is used to up/down-scale the process noise covariance in the event of a model mismatch.
    :type dStatPC: float
    :param dStatUseMean: If `True`, the mean of squared, diagonal-standardized residuals (rather than the median) is used to compute the :math:`D_{[i]}` statistic at each interval :math:`i`.
    :type dStatUseMean: bool
    :param scaleResidualsByP11: If `True`, the primary state variances (posterior) :math:`\widetilde{P}_{[i], (11)}, i=1\ldots n` are included in the inverse-variance (precision) weighting of residuals :math:`\widetilde{\mathbf{y}}_{[i]}, i=1\ldots n`.
        If `False`, only the per-sample *observation noise levels* will be used in the precision-weighting. Note that this does not affect `raw` residuals output (i.e., ``postFitResiduals`` from :func:`consenrich.consenrich.runConsenrich`).
    :type scaleResidualsByP11: Optional[bool]
    """

    deltaF: float
    minQ: float
    maxQ: float
    offDiagQ: float
    dStatAlpha: float
    dStatd: float
    dStatPC: float
    dStatUseMean: bool
    scaleResidualsByP11: Optional[bool]
110
+
111
+
112
class observationParams(NamedTuple):
    r"""Parameters related to the observation model of Consenrich.

    The observation model is used to integrate sequence alignment count
    data from the multiple input samples and account for region-and-sample-specific
    noise processes corrupting data. The observation model matrix
    :math:`\mathbf{H} \in \mathbb{R}^{m \times 2}` maps from the state dimension (2)
    to the dimension of measurements/data (:math:`m`).

    :param minR: Genome-wide lower bound for the local/sample-specific observation noise levels.
        If ``minR < 0`` (default), the minimum noise level is set based on the data as in the left
        tail of empirical noise level estimates.
    :type minR: float
    :param maxR: Genome-wide upper bound for the local/sample-specific observation noise levels.
    :type maxR: float
    :param useALV: Whether to use average local variance (ALV) heuristic *exclusively* to approximate observation noise
        covariances per-sample, per-interval. Note that unrestricted ALV (i.e., without masking previously annotated high-signal regions) is comparatively vulnerable to inflated noise estimates in large enriched genomic domains.
    :type useALV: bool
    :param useConstantNoiseLevel: NOTE(review): undocumented in the original;
        presumably forces a single constant observation noise level rather than
        locally varying estimates — confirm against the implementation.
    :type useConstantNoiseLevel: bool
    :param noGlobal: If True, only the 'local' variances are used to approximate observation noise
        covariance :math:`\mathbf{R}_{[:, (11:mm)]}`.
    :type noGlobal: bool
    :param numNearest: The number of nearest nearby 'sparse' features to use for local
        variance calculation. Ignored if `useALV` is True.
    :type numNearest: int
    :param localWeight: The coefficient for the local noise level (based on the local surrounding window / `numNearest` features) used in the weighted sum measuring sample-specific noise level at the current interval.
    :type localWeight: float
    :param globalWeight: The coefficient for the global noise level (based on all genomic intervals :math:`i=1\ldots n`) used in the weighted sum measuring sample-specific noise level at the current interval.
    :type globalWeight: float
    :param approximationWindowLengthBP: The length of the local variance approximation window in base pairs (BP)
        for the local variance calculation.
    :type approximationWindowLengthBP: int
    :param lowPassWindowLengthBP: Window length in base pairs of the low-pass filter
        used in the ALV calculation (see `lowPassFilterType`).
    :type lowPassWindowLengthBP: int
    :param lowPassFilterType: The type of low-pass filter to use (e.g., 'median', 'mean') in the ALV calculation (:func:`consenrich.core.getAverageLocalVarianceTrack`).
    :type lowPassFilterType: Optional[str]
    :param returnCenter: NOTE(review): undocumented in the original; presumably
        returns a centered statistic from the local window — confirm against
        :func:`consenrich.core.getAverageLocalVarianceTrack`.
    :type returnCenter: bool
    :param shrinkOffset: An offset applied to local lag-1 autocorrelation, :math:`A_{[i,1]}`, such that the shrinkage factor in :func:`consenrich.core.getAverageLocalVarianceTrack`, :math:`1 - A_{[i,1]}^2`, does not reduce the local variance estimate near zero.
        Setting to >= `1` disables shrinkage.
    :type shrinkOffset: Optional[float]
    :param kappaALV: Applicable if ``minR < 0``. Prevent ill-conditioning by bounding the ratios :math:`\frac{R_{[i,j_{\max}]}}{R_{[i,j_{\min}]}}` at each interval :math:`i=1\ldots n`. Values up to `100` will typically retain most of the initial dynamic range while improving stability and mitigating outliers.
    :type kappaALV: Optional[float]

    .. note::
        NOTE(review): the original docstring also described a ``sparseBedFile``
        parameter, but no such field is declared on this class — it is declared
        on :class:`genomeParams`.
    """

    minR: float
    maxR: float
    useALV: bool
    useConstantNoiseLevel: bool
    noGlobal: bool
    numNearest: int
    localWeight: float
    globalWeight: float
    approximationWindowLengthBP: int
    lowPassWindowLengthBP: int
    lowPassFilterType: Optional[str]
    returnCenter: bool
    shrinkOffset: Optional[float]
    kappaALV: Optional[float]
168
+
169
+
170
class stateParams(NamedTuple):
    r"""Parameters related to state and uncertainty bounds and initialization.

    :param stateInit: Initial value of the 'primary' state/signal at the first genomic interval: :math:`x_{[1]}`.
    :type stateInit: float
    :param stateCovarInit: Initial state covariance scale. Note, the *initial* state uncertainty :math:`\mathbf{P}_{[1]}` is a multiple of the identity matrix :math:`\mathbf{I}`. Final results are typically insensitive to this parameter, since the filter effectively 'forgets' its initialization after processing a moderate number of intervals and backward smoothing.
    :type stateCovarInit: float
    :param boundState: If True, the primary state estimate for :math:`x_{[i]}` is reported within `stateLowerBound` and `stateUpperBound`. Note that the internal filtering is unaffected.
    :type boundState: bool
    :param stateLowerBound: Lower bound for the state estimate.
    :type stateLowerBound: float
    :param stateUpperBound: Upper bound for the state estimate.
    :type stateUpperBound: float
    """

    stateInit: float
    stateCovarInit: float
    boundState: bool
    stateLowerBound: float
    stateUpperBound: float
190
+
191
+
192
class samParams(NamedTuple):
    r"""Parameters related to reading BAM files.

    :param samThreads: The number of threads to use for reading BAM files.
    :type samThreads: int
    :param samFlagExclude: The SAM flag to exclude certain reads.
    :type samFlagExclude: int
    :param oneReadPerBin: If 1, only the interval with the greatest read overlap is incremented.
    :type oneReadPerBin: int
    :param chunkSize: maximum number of intervals' data to hold in memory before flushing to disk.
    :type chunkSize: int
    :param offsetStr: A string of two comma-separated integers -- first for the 5' shift on forward strand, second for the 5' shift on reverse strand.
    :type offsetStr: str
    :param maxInsertSize: Maximum frag length/insert to consider when estimating fragment length.
    :type maxInsertSize: int
    :param pairedEndMode: If > 0, use TLEN attribute to determine span of (proper) read pairs and extend reads accordingly.
    :type pairedEndMode: int
    :param inferFragmentLength: Intended for single-end data: if > 0, the maximum correlation lag
        (avg.) between *strand-specific* read tracks is taken as the fragment length estimate and used to
        extend reads from 5'. Ignored if `pairedEndMode > 0`, `countEndsOnly`, or `fragmentLengths` is provided.
        Important when targeting broader marks (e.g., ChIP-seq H3K27me3).
    :type inferFragmentLength: int
    :param countEndsOnly: If True, only the 5' read lengths contribute to counting. Overrides `inferFragmentLength` and `pairedEndMode`.
    :type countEndsOnly: Optional[bool]
    :param minMappingQuality: Minimum mapping quality (MAPQ) for reads to be counted.
    :type minMappingQuality: Optional[int]
    :param minTemplateLength: NOTE(review): undocumented in the original;
        presumably a minimum TLEN filter, with ``-1`` disabling it — confirm
        against the counting implementation.
    :type minTemplateLength: Optional[int]
    :param fragmentLengths: NOTE(review): description missing in the original;
        presumably per-BAM fragment lengths (bp) that, when provided, bypass
        fragment-length inference — confirm against `inferFragmentLength`.
    :type fragmentLengths: Optional[List[int]]

    .. tip::

        For an overview of SAM flags, see https://broadinstitute.github.io/picard/explain-flags.html
    """

    samThreads: int
    samFlagExclude: int
    oneReadPerBin: int
    chunkSize: int
    offsetStr: Optional[str] = "0,0"
    maxInsertSize: Optional[int] = 1000
    pairedEndMode: Optional[int] = 0
    inferFragmentLength: Optional[int] = 0
    countEndsOnly: Optional[bool] = False
    minMappingQuality: Optional[int] = 0
    minTemplateLength: Optional[int] = -1
    fragmentLengths: Optional[List[int]] = None
238
+
239
+
240
class detrendParams(NamedTuple):
    r"""Parameters related to detrending and background-removal after normalizing by sequencing depth.

    :param useOrderStatFilter: Whether to use a local/moving order statistic (percentile filter) to model and remove trends in the read density data.
    :type useOrderStatFilter: bool
    :param usePolyFilter: Whether to use a low-degree polynomial fit to model and remove trends in the read density data.
    :type usePolyFilter: bool
    :param detrendTrackPercentile: The percentile to use for the local/moving order-statistic filter.
        Decrease for broad marks + sparse data if `useOrderStatFilter` is True.
    :type detrendTrackPercentile: float
    :param detrendSavitzkyGolayDegree: The polynomial degree of the Savitzky-Golay filter to use for detrending.
    :type detrendSavitzkyGolayDegree: int
    :param detrendWindowLengthBP: The length of the window in base pairs for detrending.
        Increase for broader marks + sparse data.
    :type detrendWindowLengthBP: int
    """

    useOrderStatFilter: bool
    usePolyFilter: bool
    detrendTrackPercentile: float
    detrendSavitzkyGolayDegree: int
    detrendWindowLengthBP: int
262
+
263
+
264
class inputParams(NamedTuple):
    r"""Parameters related to the input data for Consenrich.

    :param bamFiles: A list of paths to distinct coordinate-sorted and indexed BAM files.
    :type bamFiles: List[str]
    :param bamFilesControl: A list of paths to distinct coordinate-sorted and
        indexed control BAM files. e.g., IgG control inputs for ChIP-seq.
    :type bamFilesControl: List[str], optional
    :param pairedEnd: Deprecated: Paired-end/Single-end is inferred automatically from the alignment flags in input BAM files.
    :type pairedEnd: Optional[bool]
    """

    bamFiles: List[str]
    bamFilesControl: Optional[List[str]]
    pairedEnd: Optional[bool]
280
+
281
+
282
class genomeParams(NamedTuple):
    r"""Specify assembly-specific resources, parameters.

    :param genomeName: If supplied, default resources for the assembly (sizes file, blacklist, and 'sparse' regions) in `src/consenrich/data` are used.
        ``ce10, ce11, dm6, hg19, hg38, mm10, mm39`` have default resources available.
    :type genomeName: str
    :param chromSizesFile: A two-column TSV-like file with chromosome names and sizes (in base pairs).
    :type chromSizesFile: str
    :param blacklistFile: A BED file with regions to exclude.
    :type blacklistFile: str, optional
    :param sparseBedFile: A BED file with 'sparse regions' used to estimate noise levels -- ignored if `observationParams.useALV` is True. 'Sparse regions' broadly refers to genomic intervals devoid of the targeted signal, based on prior annotations.
        Users may supply a custom BED file and/or set `observationParams.useALV` to `True` to avoid relying on predefined annotations.
    :type sparseBedFile: str, optional
    :param chromosomes: A list of chromosome names to analyze. If None, all chromosomes in `chromSizesFile` are used.
    :type chromosomes: List[str]
    :param excludeChroms: NOTE(review): undocumented in the original;
        presumably chromosome names excluded from analysis — confirm usage.
    :type excludeChroms: List[str]
    :param excludeForNorm: NOTE(review): undocumented in the original;
        presumably chromosome names excluded when computing normalization
        factors — confirm usage.
    :type excludeForNorm: List[str]
    """

    genomeName: str
    chromSizesFile: str
    blacklistFile: Optional[str]
    sparseBedFile: Optional[str]
    chromosomes: List[str]
    excludeChroms: List[str]
    excludeForNorm: List[str]
306
+
307
+
308
class countingParams(NamedTuple):
    r"""Parameters related to counting reads in genomic intervals.

    :param stepSize: Size (bp) of genomic intervals (AKA bin size, interval length, width, etc.).
        ``consenrich.py`` defaults to 25 bp, but users may adjust this based on expected sequencing
        depth and expected feature sizes. Lower sequencing depth and/or broader features may warrant
        larger step sizes (e.g., 50-100bp or more).
    :type stepSize: int
    :param scaleDown: If using paired treatment and control BAM files, whether to
        scale down the larger of the two before computing the difference/ratio.
    :type scaleDown: bool, optional
    :param scaleFactors: Scale factors for the read counts.
    :type scaleFactors: List[float], optional
    :param scaleFactorsControl: Scale factors for the control read counts.
    :type scaleFactorsControl: List[float], optional
    :param numReads: Number of reads to sample.
    :type numReads: int
    :param applyAsinh: If true, :math:`\textsf{arsinh}(x)` applied to counts :math:`x` for each supplied BAM file (log-like for large values and linear near the origin).
    :type applyAsinh: bool, optional
    :param applyLog: If true, :math:`\textsf{log}(x + 1)` applied to counts :math:`x` for each supplied BAM file.
    :type applyLog: bool, optional
    :param applySqrt: If true, :math:`\sqrt{x}` applied to counts :math:`x` for each supplied BAM file.
    :type applySqrt: bool, optional
    :param noTransform: Disable all transformations.
    :type noTransform: bool, optional
    :param rescaleToTreatmentCoverage: Deprecated: no effect.
    :type rescaleToTreatmentCoverage: bool, optional
    :param normMethod: NOTE(review): undocumented in the original; presumably
        selects the depth-normalization method for counts — confirm against the
        counting/normalization implementation.
    :type normMethod: str, optional
    :param trimLeftTail: If > 0, quantile of scaled counts to trim from the left tail before computing transformations.
    :type trimLeftTail: float, optional
    :param fragmentLengths: List of fragment lengths (bp) to use for extending reads from 5' ends when counting single-end data.
    :type fragmentLengths: List[int], optional
    :param fragmentLengthsControl: List of fragment lengths (bp) to use for extending reads from 5' ends when counting single-end with control data.
    :type fragmentLengthsControl: List[int], optional
    :param useTreatmentFragmentLengths: If True, use fragment lengths estimated from treatment BAM files for control BAM files, too.
    :type useTreatmentFragmentLengths: bool, optional


    .. admonition:: Treatment vs. Control Fragment Lengths in Single-End Data
        :class: tip
        :collapsible: closed

        For single-end data, cross-correlation-based estimates for fragment length
        in control inputs can be biased due to a comparative lack of structure in
        strand-specific coverage tracks.

        This can create artifacts during counting, so it is common to use the estimated treatment
        fragment length for both treatment and control samples. The argument
        ``observationParams.useTreatmentFragmentLengths`` enables this behavior.

    :seealso: :ref:`calibration`, :class:`samParams`.
    """

    stepSize: int
    scaleDown: Optional[bool]
    scaleFactors: Optional[List[float]]
    scaleFactorsControl: Optional[List[float]]
    numReads: int
    applyAsinh: Optional[bool]
    applyLog: Optional[bool]
    applySqrt: Optional[bool]
    noTransform: Optional[bool]
    rescaleToTreatmentCoverage: Optional[bool]
    normMethod: Optional[str]
    trimLeftTail: Optional[float]
    fragmentLengths: Optional[List[int]]
    fragmentLengthsControl: Optional[List[int]]
    useTreatmentFragmentLengths: Optional[bool]
375
+
376
+
377
class matchingParams(NamedTuple):
    r"""Parameters related to the matching algorithm.

    See :ref:`matching` for an overview of the approach.

    :param templateNames: A list of str values -- each entry references a mother wavelet (or its corresponding scaling function). e.g., `[haar, db2]`
    :type templateNames: List[str]
    :param cascadeLevels: Number of cascade iterations, or 'levels', used to define wavelet-based templates.
        Must have the same length as `templateNames`, with each entry aligned to the
        corresponding template. e.g., given templateNames `[haar, db2]`, then `[2,2]` would use 2 cascade levels for both templates.
    :type cascadeLevels: List[int]
    :param iters: Number of random blocks to sample in the response sequence while building
        an empirical null to test significance. See :func:`cconsenrich.csampleBlockStats`.
    :type iters: int
    :param alpha: Primary significance threshold on detected matches. Specifically, the
        minimum corrected empirical p-value approximated from randomly sampled blocks in the
        response sequence.
    :type alpha: float
    :param useScalingFunction: If True, use (only) the scaling function to build the matching template.
        If False, use (only) the wavelet function.
    :type useScalingFunction: bool
    :param minMatchLengthBP: Within a window of `minMatchLengthBP` length (bp), relative maxima in
        the signal-template convolution :math:`\mathcal{R}_{[\ast]}` must be greater in value than
        others to qualify as matches. If set to a value less than 1, the minimum length is determined
        via :func:`consenrich.matching.autoMinLengthIntervals` (default behavior).
    :type minMatchLengthBP: Optional[int]
    :param maxNumMatches: NOTE(review): undocumented in the original; presumably
        a cap on the number of reported matches — confirm against the matching
        implementation.
    :type maxNumMatches: Optional[int]
    :param minSignalAtMaxima: Secondary/optional threshold coupled with ``alpha``. Requires the *signal value*, :math:`\widetilde{x}_{[i^*]}`,
        at relative maxima in the response sequence, :math:`\mathcal{R}_{[i^*]}`, to be greater than this threshold.
        If a ``str`` value is provided, looks for 'q:quantileValue', e.g., 'q:0.90'. The threshold is then set to the
        corresponding quantile of the non-zero signal estimates in the distribution of transformed values.
    :type minSignalAtMaxima: Optional[str | float]
    :param merge: NOTE(review): undocumented in the original; presumably whether
        nearby matches are merged — confirm (see `mergeGapBP`).
    :type merge: Optional[bool]
    :param mergeGapBP: NOTE(review): undocumented in the original; presumably the
        maximum gap (bp) between matches merged when `merge` is enabled — confirm.
    :type mergeGapBP: Optional[int]
    :param excludeRegionsBedFile: A BED file with regions to exclude from matching.
    :type excludeRegionsBedFile: Optional[str]
    :param penalizeBy: Specify a positional metric to scale signal estimate values by when matching.
        For example, ``stateUncertainty`` divides signal values by the square root of the primary state
        variance :math:`\sqrt{\widetilde{P}_{i,(11)}}` at each position :math:`i`,
        thereby down-weighting positions where the posterior state uncertainty is
        high during matching.
    :type penalizeBy: Optional[str]
    :param randSeed: NOTE(review): undocumented in the original; presumably the
        seed for the random block sampling — confirm against
        :func:`cconsenrich.csampleBlockStats`.
    :type randSeed: Optional[int]
    :param eps: Tolerance parameter for relative maxima detection in the response sequence. Set to zero to enforce strict
        inequalities when identifying discrete relative maxima.
    :type eps: float
    :param autoLengthQuantile: If `minMatchLengthBP < 1`, the minimum match length (``minMatchLengthBP / stepSize``) is determined
        by the quantile in the distribution of non-zero segment lengths (i.e., consecutive intervals with non-zero signal estimates)
        after local standardization.
    :type autoLengthQuantile: float
    :param methodFDR: Method for genome-wide multiple hypothesis testing correction. Can specify either Benjamini-Hochberg ('BH'), the more conservative Benjamini-Yekutieli ('BY') to account for arbitrary dependencies between tests, or None.
    :type methodFDR: str
    :seealso: :func:`cconsenrich.csampleBlockStats`, :ref:`matching`, :class:`outputParams`.
    """

    templateNames: List[str]
    cascadeLevels: List[int]
    iters: int
    alpha: float
    useScalingFunction: Optional[bool]
    minMatchLengthBP: Optional[int]
    maxNumMatches: Optional[int]
    minSignalAtMaxima: Optional[str | float]
    merge: Optional[bool]
    mergeGapBP: Optional[int]
    excludeRegionsBedFile: Optional[str]
    penalizeBy: Optional[str]
    randSeed: Optional[int]
    eps: Optional[float]
    autoLengthQuantile: Optional[float]
    methodFDR: Optional[str]
444
+
445
+
446
class outputParams(NamedTuple):
    r"""Parameters related to output files.

    :param convertToBigWig: If True, output bedGraph files are converted to bigWig format.
    :type convertToBigWig: bool
    :param roundDigits: Number of decimal places to round output values (bedGraph).
    :type roundDigits: int
    :param writeResiduals: If True, write to a separate bedGraph the pointwise avg. of precision-weighted residuals at each interval. These may be interpreted as
        a measure of model mismatch. Where these quantities are larger (+-), there may be more unexplained deviation between the data and fitted model.
    :type writeResiduals: bool
    :param writeMuncTrace: If True, write to a separate bedGraph :math:`\sqrt{\frac{\textsf{Trace}\left(\mathbf{R}_{[i]}\right)}{m}}` -- that is, square root of the 'average' observation noise level at each interval :math:`i=1\ldots n`, where :math:`m` is the number of samples/tracks.
    :type writeMuncTrace: bool
    :param writeStateStd: If True, write to a separate bedGraph the estimated pointwise uncertainty in the primary state, :math:`\sqrt{\widetilde{P}_{i,(11)}}`, on a scale comparable to the estimated signal.
    :type writeStateStd: bool
    """

    convertToBigWig: bool
    roundDigits: int
    writeResiduals: bool
    writeMuncTrace: bool
    writeStateStd: bool
467
+
468
+
469
+ def _numIntervals(start: int, end: int, step: int) -> int:
470
+ # helper for consistency
471
+ length = max(0, end - start)
472
+ return (length + step) // step
473
+
474
+
475
def getChromRanges(
    bamFile: str,
    chromosome: str,
    chromLength: int,
    samThreads: int,
    samFlagExclude: int,
) -> Tuple[int, int]:
    r"""Get the start and end positions of reads in a chromosome from a BAM file.

    :param bamFile: See :class:`inputParams`.
    :type bamFile: str
    :param chromosome: the chromosome to read in `bamFile`.
    :type chromosome: str
    :param chromLength: Base pair length of the chromosome.
    :type chromLength: int
    :param samThreads: See :class:`samParams`.
    :type samThreads: int
    :param samFlagExclude: See :class:`samParams`.
    :type samFlagExclude: int
    :return: Tuple of start and end positions (nucleotide coordinates) in the chromosome.
    :rtype: Tuple[int, int]

    :seealso: :func:`getChromRangesJoint`, :func:`cconsenrich.cgetFirstChromRead`, :func:`cconsenrich.cgetLastChromRead`
    """
    # Both C helpers take the same argument tuple; bundle it once.
    queryArgs = (bamFile, chromosome, chromLength, samThreads, samFlagExclude)
    firstRead: int = cconsenrich.cgetFirstChromRead(*queryArgs)
    lastRead: int = cconsenrich.cgetLastChromRead(*queryArgs)
    return firstRead, lastRead
506
+
507
+
508
def getChromRangesJoint(
    bamFiles: List[str],
    chromosome: str,
    chromSize: int,
    samThreads: int,
    samFlagExclude: int,
) -> Tuple[int, int]:
    r"""For multiple BAM files, reconcile a single start and end position over which to count reads,
    where the start and end positions are defined by the first and last reads across all BAM files.

    :param bamFiles: List of BAM files to read.
    :type bamFiles: List[str]
    :param chromosome: Chromosome to read.
    :type chromosome: str
    :param chromSize: Size of the chromosome.
    :type chromSize: int
    :param samThreads: Number of threads to use for reading the BAM files.
    :type samThreads: int
    :param samFlagExclude: SAM flag to exclude certain reads.
    :type samFlagExclude: int
    :return: Tuple of start and end positions.
    :rtype: Tuple[int, int]

    :seealso: :func:`getChromRanges`, :func:`cconsenrich.cgetFirstChromRead`, :func:`cconsenrich.cgetLastChromRead`
    """
    # Collect each file's (first, last) read span, then take the union.
    spans = [
        getChromRanges(
            bamFile,
            chromosome,
            chromLength=chromSize,
            samThreads=samThreads,
            samFlagExclude=samFlagExclude,
        )
        for bamFile in bamFiles
    ]
    firsts, lasts = zip(*spans)
    return min(firsts), max(lasts)
546
+
547
+
548
def getReadLength(
    bamFile: str,
    numReads: int,
    maxIterations: int,
    samThreads: int,
    samFlagExclude: int,
) -> int:
    r"""Infer read length from mapped reads in a BAM file.

    Samples at least `numReads` reads passing criteria given by `samFlagExclude`
    and returns the median read length.

    :param bamFile: See :class:`inputParams`.
    :type bamFile: str
    :param numReads: Number of reads to sample.
    :type numReads: int
    :param maxIterations: Maximum number of iterations to perform.
    :type maxIterations: int
    :param samThreads: See :class:`samParams`.
    :type samThreads: int
    :param samFlagExclude: See :class:`samParams`.
    :type samFlagExclude: int
    :return: The median read length.
    :rtype: int

    :raises ValueError: If the read length cannot be determined after scanning `maxIterations` reads.

    :seealso: :func:`cconsenrich.cgetReadLength`
    """
    estimatedLength: int = cconsenrich.cgetReadLength(
        bamFile, numReads, samThreads, maxIterations, samFlagExclude
    )
    # A zero sentinel from the C helper means no usable read was found.
    if estimatedLength == 0:
        raise ValueError(
            f"Failed to determine read length in {bamFile}. Revise `numReads`, and/or `samFlagExclude` parameters?"
        )
    return estimatedLength
585
+
586
+
587
def getReadLengths(
    bamFiles: List[str],
    numReads: int,
    maxIterations: int,
    samThreads: int,
    samFlagExclude: int,
) -> List[int]:
    r"""Get read lengths for a list of BAM files.

    :seealso: :func:`getReadLength`
    """
    lengths: List[int] = []
    for bamFile in bamFiles:
        # Same sampling criteria applied uniformly to every BAM file.
        lengths.append(
            getReadLength(
                bamFile,
                numReads=numReads,
                maxIterations=maxIterations,
                samThreads=samThreads,
                samFlagExclude=samFlagExclude,
            )
        )
    return lengths
608
+
609
+
610
def readBamSegments(
    bamFiles: List[str],
    chromosome: str,
    start: int,
    end: int,
    stepSize: int,
    readLengths: List[int],
    scaleFactors: List[float],
    oneReadPerBin: int,
    samThreads: int,
    samFlagExclude: int,
    offsetStr: Optional[str] = "0,0",
    applyAsinh: Optional[bool] = False,
    applyLog: Optional[bool] = False,
    applySqrt: Optional[bool] = False,
    maxInsertSize: Optional[int] = 1000,
    pairedEndMode: Optional[int] = 0,
    inferFragmentLength: Optional[int] = 0,
    countEndsOnly: Optional[bool] = False,
    minMappingQuality: Optional[int] = 0,
    minTemplateLength: Optional[int] = -1,
    trimLeftTail: Optional[float] = 0.0,
    fragmentLengths: Optional[List[int]] = None,
) -> npt.NDArray[np.float32]:
    r"""Calculate tracks of read counts (or a function thereof) for each BAM file.

    See :func:`cconsenrich.creadBamSegment` for the underlying implementation in Cython.
    Note that read counts are scaled by `scaleFactors` and possibly transformed if
    any of `applyAsinh`, `applyLog`, `applySqrt`. Note that these transformations are mutually
    exclusive and may affect interpretation of results.

    :param bamFiles: See :class:`inputParams`.
    :type bamFiles: List[str]
    :param chromosome: Chromosome to read.
    :type chromosome: str
    :param start: Start position of the genomic segment.
    :type start: int
    :param end: End position of the genomic segment.
    :type end: int
    :param readLengths: List of read lengths for each BAM file.
    :type readLengths: List[int]
    :param scaleFactors: List of scale factors for each BAM file.
    :type scaleFactors: List[float]
    :param stepSize: See :class:`countingParams`.
    :type stepSize: int
    :param oneReadPerBin: See :class:`samParams`.
    :type oneReadPerBin: int
    :param samThreads: See :class:`samParams`.
    :type samThreads: int
    :param samFlagExclude: See :class:`samParams`.
    :type samFlagExclude: int
    :param offsetStr: See :class:`samParams`. Two comma-separated integers, e.g. ``"4,-5"``.
    :type offsetStr: str
    :param maxInsertSize: See :class:`samParams`.
    :type maxInsertSize: int
    :param pairedEndMode: See :class:`samParams`.
    :type pairedEndMode: int
    :param inferFragmentLength: See :class:`samParams`.
    :type inferFragmentLength: int
    :param minMappingQuality: See :class:`samParams`.
    :type minMappingQuality: int
    :param minTemplateLength: See :class:`samParams`.
    :type minTemplateLength: Optional[int]
    :param fragmentLengths: If supplied, a list of estimated fragment lengths for each BAM file.
        In single-end mode, these are values are used to extend reads. They are ignored in paired-end
        mode, where each proper pair `TLEN` is counted.
    :type fragmentLengths: Optional[List[int]]
    :raises ValueError: If the segment/step sizes are invalid, `bamFiles` is empty,
        the per-file lists do not match `bamFiles`, or `offsetStr` is malformed.
    """

    segmentSize_ = end - start
    if stepSize <= 0 or segmentSize_ <= 0:
        raise ValueError(
            "Invalid stepSize or genomic segment specified (end <= start)"
        )

    if len(bamFiles) == 0:
        raise ValueError("bamFiles list is empty")

    if len(readLengths) != len(bamFiles) or len(scaleFactors) != len(
        bamFiles
    ):
        raise ValueError(
            "readLengths and scaleFactors must match bamFiles length"
        )

    # FIX: previously `str(offsetStr) or "0,0"` -- str(None) == "None" is truthy,
    # so the "0,0" fallback was unreachable and offsetStr=None crashed in int().
    if offsetStr:
        offsetTokens = str(offsetStr).replace(" ", "").split(",")
    else:
        offsetTokens = ["0", "0"]
    if len(offsetTokens) != 2:
        raise ValueError(
            f"`offsetStr` must be two comma-separated integers, got {offsetStr!r}"
        )
    # hoist conversion out of the per-BAM loop (and fail fast on bad input)
    offsetA = int(offsetTokens[0])
    offsetB = int(offsetTokens[1])

    numIntervals = ((end - start) + stepSize - 1) // stepSize
    counts = np.empty((len(bamFiles), numIntervals), dtype=np.float32)

    if pairedEndMode:
        # paired-end: fragment extension is unused (TLEN of proper pairs is counted)
        fragmentLengths = [0] * len(bamFiles)
        inferFragmentLength = 0
    if not pairedEndMode and (
        fragmentLengths is None or len(fragmentLengths) == 0
    ):
        # single-end with no supplied fragment lengths: defer inference to the C layer
        inferFragmentLength = 1
        fragmentLengths = [-1] * len(bamFiles)

    if isinstance(countEndsOnly, bool) and countEndsOnly:
        # note: setting this option ignores inferFragmentLength, pairedEndMode
        inferFragmentLength = 0
        pairedEndMode = 0
        fragmentLengths = [0] * len(bamFiles)

    for j, bam in enumerate(bamFiles):
        logger.info(f"Reading {chromosome}: {bam}")
        arr = cconsenrich.creadBamSegment(
            bam,
            chromosome,
            start,
            end,
            stepSize,
            readLengths[j],
            oneReadPerBin,
            samThreads,
            samFlagExclude,
            offsetA,
            offsetB,
            fragmentLengths[j],
            maxInsertSize,
            pairedEndMode,
            inferFragmentLength,
            minMappingQuality,
            minTemplateLength,
        )

        counts[j, :] = arr
        # guard against trimLeftTail=None (declared Optional)
        if trimLeftTail is not None and trimLeftTail > 0.0:
            counts[j, :] = trimtail(
                counts[j, :], trimLeftTail, tail="left"
            )
        counts[j, :] *= np.float32(scaleFactors[j])
        # transformations are mutually exclusive: first matching branch wins
        if applyAsinh:
            np.asinh(counts[j, :], out=counts[j, :])
        elif applyLog:
            np.log1p(counts[j, :], out=counts[j, :])
        elif applySqrt:
            np.sqrt(counts[j, :], out=counts[j, :])
    return counts
752
+
753
+
754
def getAverageLocalVarianceTrack(
    values: np.ndarray,
    stepSize: int,
    approximationWindowLengthBP: int,
    lowPassWindowLengthBP: int,
    minR: float,
    maxR: float,
    lowPassFilterType: Optional[str] = "median",
    shrinkOffset: float = 0.5,
) -> npt.NDArray[np.float32]:
    r"""A moment-based local variance estimator with autocorrelation-based shrinkage for genome-wide sample-specific noise level approximation.

    First, computes a moving average of ``values`` using a bp-length window
    ``approximationWindowLengthBP`` and a moving average of ``values**2`` over the
    same window. Their difference is used to approximate the *initial* 'local variance' before
    autocorrelation-based shrinkage. Finally, a broad/low-pass filter (``median`` or ``mean``)
    with window ``lowPassWindowLengthBP`` then smooths the variance track.

    :param stepSize: see :class:`countingParams`.
    :type stepSize: int
    :param approximationWindowLengthBP: Window (bp) for local mean and second-moment. See :class:`observationParams`.
    :type approximationWindowLengthBP: int
    :param lowPassWindowLengthBP: Window (bp) for the low-pass filter on the variance track. See :class:`observationParams`.
    :type lowPassWindowLengthBP: int
    :param minR: Lower bound for the returned noise level. See :class:`observationParams`.
    :type minR: float
    :param maxR: Upper bound for the returned noise level. See :class:`observationParams`.
    :type maxR: float
    :param lowPassFilterType: ``"median"`` (default) or ``"mean"``. Type of low-pass filter to use for smoothing the final noise level track. See :class:`observationParams`.
    :type lowPassFilterType: Optional[str]
    :param shrinkOffset: Offset applied to lag-1 autocorrelation when shrinking local variance estimates. See :class:`observationParams`.
    :type shrinkOffset: float
    :return: Local noise level per interval, clipped to :math:`[\textsf{minR}, \textsf{maxR}]`.
    :rtype: npt.NDArray[np.float32]

    :seealso: :class:`observationParams`
    """
    values = np.asarray(values, dtype=np.float32)
    windowLength = int(approximationWindowLengthBP / stepSize)
    if windowLength % 2 == 0:
        windowLength += 1

    if len(values) < 3:
        # degenerate segment: return a constant track.
        # FIX: clip to [minR, maxR] (previously only the minR floor was applied,
        # inconsistent with the clipping on the main path below)
        if values.size == 0:
            return np.full_like(values, minR, dtype=np.float32)
        constVar = float(np.clip(np.var(values), minR, maxR))
        return np.full_like(values, constVar, dtype=np.float32)

    # get local mean (simple moving average)
    localMeanTrack: npt.NDArray[np.float32] = ndimage.uniform_filter(
        values, size=windowLength, mode="nearest"
    )

    # apply V[X] ~=~ E[X^2] - (E[X])^2 locally to approximate local variance
    totalVarTrack: npt.NDArray[np.float32] = (
        ndimage.uniform_filter(
            values**2, size=windowLength, mode="nearest"
        )
        - localMeanTrack**2
    )

    np.maximum(totalVarTrack, 0.0, out=totalVarTrack)  # JIC

    noiseLevel: npt.NDArray[np.float32]
    localVarTrack: npt.NDArray[np.float32]

    if abs(shrinkOffset) < 1:
        # Aim is to shrink the local noise variance estimates
        # ...where there's evidence of structure (signal) in the data
        # ...autocorr small --> retain more of the variance estimate
        # ...autocorr large --> more shrinkage

        # shift idx +1
        valuesLag = np.roll(values, 1)
        valuesLag[0] = valuesLag[1]

        # get smooth `x_{[i]} * x_{[i-1]}` and standardize
        localMeanLag: npt.NDArray[np.float32] = (
            ndimage.uniform_filter(
                valuesLag, size=windowLength, mode="nearest"
            )
        )
        smoothProd: npt.NDArray[np.float32] = ndimage.uniform_filter(
            values * valuesLag, size=windowLength, mode="nearest"
        )
        covLag1: npt.NDArray[np.float32] = (
            smoothProd - localMeanTrack * localMeanLag
        )
        rho1: npt.NDArray[np.float32] = np.clip(
            covLag1 / (totalVarTrack + 1.0e-4),
            -1.0 + shrinkOffset,
            1 - shrinkOffset,
        )

        noiseFracEstimate: npt.NDArray[np.float32] = 1.0 - rho1**2
        localVarTrack = totalVarTrack * noiseFracEstimate

    else:
        localVarTrack = totalVarTrack

    np.maximum(localVarTrack, 0.0, out=localVarTrack)
    lpassWindowLength = int(lowPassWindowLengthBP / stepSize)
    if lpassWindowLength % 2 == 0:
        lpassWindowLength += 1

    # FFR: consider making this step optional
    if lowPassFilterType is None or (
        isinstance(lowPassFilterType, str)
        and lowPassFilterType.lower() == "median"
    ):
        noiseLevel = ndimage.median_filter(
            localVarTrack,
            size=lpassWindowLength,
        )
    elif (
        isinstance(lowPassFilterType, str)
        and lowPassFilterType.lower() == "mean"
    ):
        noiseLevel = ndimage.uniform_filter(
            localVarTrack,
            size=lpassWindowLength,
        )
    else:
        logger.warning(
            "Unknown lowPassFilterType, expected `median` or `mean`, defaulting to `median`..."
        )
        noiseLevel = ndimage.median_filter(
            localVarTrack,
            size=lpassWindowLength,
        )

    return np.clip(noiseLevel, minR, maxR).astype(np.float32)
886
+
887
+
888
def constructMatrixF(deltaF: float) -> npt.NDArray[np.float32]:
    r"""Build the state transition matrix for the process model.

    :param deltaF: See :class:`processParams`.
    :type deltaF: float
    :return: The state transition matrix :math:`\mathbf{F}`
    :rtype: npt.NDArray[np.float32]

    :seealso: :class:`processParams`
    """
    # 2x2 constant-velocity style transition: identity plus deltaF coupling term
    return np.array(
        [[1.0, deltaF], [0.0, 1.0]], dtype=np.float32
    )
901
+
902
+
903
def constructMatrixQ(
    minDiagQ: float, offDiagQ: float = 0.0
) -> npt.NDArray[np.float32]:
    r"""Build the initial process noise covariance matrix :math:`\mathbf{Q}_{[1]}`.

    :param minDiagQ: See :class:`processParams`.
    :type minDiagQ: float
    :param offDiagQ: See :class:`processParams`.
    :type offDiagQ: float
    :return: The initial process noise covariance matrix :math:`\mathbf{Q}_{[1]}`.
    :rtype: npt.NDArray[np.float32]

    :seealso: :class:`processParams`
    """
    diagVal = np.float32(minDiagQ)
    offVal = np.float32(offDiagQ)
    # symmetric 2x2: equal diagonal entries, equal off-diagonal entries
    return np.array(
        [[diagVal, offVal], [offVal, diagVal]], dtype=np.float32
    )
927
+
928
+
929
def constructMatrixH(
    m: int, coefficients: Optional[np.ndarray] = None
) -> npt.NDArray[np.float32]:
    r"""Build the observation model matrix :math:`\mathbf{H}`.

    :param m: Number of observations.
    :type m: int
    :param coefficients: Optional coefficients for the observation model,
        which can be used to weight the observations manually.
    :type coefficients: Optional[np.ndarray]
    :return: The observation model matrix :math:`\mathbf{H}`.
    :rtype: npt.NDArray[np.float32]

    :seealso: :class:`observationParams`, class:`inputParams`
    """
    if coefficients is None:
        coefficients = np.ones(m, dtype=np.float32)
    elif isinstance(coefficients, list):
        coefficients = np.array(coefficients, dtype=np.float32)
    # first column carries the per-sample weights; second column is zero
    matrixH = np.zeros((m, 2), dtype=np.float32)
    matrixH[:, 0] = coefficients.astype(np.float32)
    return matrixH
952
+
953
+
954
+ def runConsenrich(
955
+ matrixData: np.ndarray,
956
+ matrixMunc: np.ndarray,
957
+ deltaF: float,
958
+ minQ: float,
959
+ maxQ: float,
960
+ offDiagQ: float,
961
+ dStatAlpha: float,
962
+ dStatd: float,
963
+ dStatPC: float,
964
+ dStatUseMean: bool,
965
+ stateInit: float,
966
+ stateCovarInit: float,
967
+ boundState: bool,
968
+ stateLowerBound: float,
969
+ stateUpperBound: float,
970
+ chunkSize: int,
971
+ progressIter: int,
972
+ coefficientsH: Optional[np.ndarray] = None,
973
+ residualCovarInversionFunc: Optional[Callable] = None,
974
+ adjustProcessNoiseFunc: Optional[Callable] = None,
975
+ covarClip: float = 3.0,
976
+ ) -> Tuple[
977
+ npt.NDArray[np.float32],
978
+ npt.NDArray[np.float32],
979
+ npt.NDArray[np.float32],
980
+ ]:
981
+ r"""Run consenrich on a contiguous segment (e.g. a chromosome) of read-density-based data.
982
+ Completes the forward and backward passes given data and approximated observation noise
983
+ covariance matrices :math:`\mathbf{R}_{[1:n, (11:mm)]}`.
984
+
985
+ This is the primary function implementing the core Consenrich algorithm. Users requiring specialized
986
+ preprocessing may prefer to call this function programmatically on their own preprocessed data rather
987
+ than using the command-line interface.
988
+
989
+
990
+ :param matrixData: Read density data for a single chromosome or general contiguous segment,
991
+ possibly preprocessed. Two-dimensional array of shape :math:`m \times n` where :math:`m`
992
+ is the number of samples/tracks and :math:`n` the number of genomic intervals.
993
+ :type matrixData: np.ndarray
994
+ :param matrixMunc: Uncertainty estimates for the read coverage data.
995
+ Two-dimensional array of shape :math:`m \times n` where :math:`m` is the number of samples/tracks
996
+ and :math:`n` the number of genomic intervals. See :func:`getMuncTrack`.
997
+ :type matrixMunc: np.ndarray
998
+ :param deltaF: See :class:`processParams`.
999
+ :type deltaF: float
1000
+ :param minQ: See :class:`processParams`.
1001
+ :type minQ: float
1002
+ :param maxQ: See :class:`processParams`.
1003
+ :type maxQ: float
1004
+ :param offDiagQ: See :class:`processParams`.
1005
+ :type offDiagQ: float
1006
+ :param dStatAlpha: See :class:`processParams`.
1007
+ :type dStatAlpha: float
1008
+ :param dStatd: See :class:`processParams`.
1009
+ :type dStatd: float
1010
+ :param dStatPC: See :class:`processParams`.
1011
+ :type dStatPC: float
1012
+ :param dStatUseMean: See :class:`processParams`.
1013
+ :type dStatUseMean: bool
1014
+ :param stateInit: See :class:`stateParams`.
1015
+ :type stateInit: float
1016
+ :param stateCovarInit: See :class:`stateParams`.
1017
+ :type stateCovarInit: float
1018
+ :param chunkSize: Number of genomic intervals' data to keep in memory before flushing to disk.
1019
+ :type chunkSize: int
1020
+ :param progressIter: The number of iterations after which to log progress.
1021
+ :type progressIter: int
1022
+ :param coefficientsH: Optional coefficients for the observation model matrix :math:`\mathbf{H}`.
1023
+ If None, the coefficients are set to 1.0 for all samples.
1024
+ :type coefficientsH: Optional[np.ndarray]
1025
+ :param residualCovarInversionFunc: Callable function to invert the observation covariance matrix :math:`\mathbf{E}_{[i]}`.
1026
+ If None, defaults to :func:`cconsenrich.cinvertMatrixE`.
1027
+ :type residualCovarInversionFunc: Optional[Callable]
1028
+ :param adjustProcessNoiseFunc: Function to adjust the process noise covariance matrix :math:`\mathbf{Q}_{[i]}`.
1029
+ If None, defaults to :func:`cconsenrich.updateProcessNoiseCovariance`.
1030
+ :type adjustProcessNoiseFunc: Optional[Callable]
1031
+ :param covarClip: For numerical stability, truncate state/process noise covariances
1032
+ to :math:`[10^{-\textsf{covarClip}}, 10^{\textsf{covarClip}}]`.
1033
+ :type covarClip: float
1034
+ :return: Tuple of three numpy arrays:
1035
+ - post-fit (forward/backward-smoothed) state estimates :math:`\widetilde{\mathbf{x}}_{[i]}` of shape :math:`n \times 2`
1036
+ - post-fit (forward/backward-smoothed) state covariance estimates :math:`\widetilde{\mathbf{P}}_{[i]}` of shape :math:`n \times 2 \times 2`
1037
+ - post-fit residuals (after forward/backward smoothing) :math:`\widetilde{\mathbf{y}}_{[i]}` of shape :math:`n \times m`
1038
+ :rtype: Tuple[np.ndarray, np.ndarray, np.ndarray]
1039
+
1040
+ :raises ValueError: If the number of samples in `matrixData` is not equal to the number of samples in `matrixMunc`.
1041
+ :seealso: :class:`observationParams`, :class:`processParams`, :class:`stateParams`
1042
+ """
1043
+
1044
+ matrixData = np.ascontiguousarray(matrixData, dtype=np.float32)
1045
+ matrixMunc = np.ascontiguousarray(matrixMunc, dtype=np.float32)
1046
+
1047
+ # -------
1048
+ # check edge cases
1049
+ if matrixData.ndim == 1:
1050
+ matrixData = matrixData[None, :]
1051
+ elif matrixData.ndim != 2:
1052
+ raise ValueError(
1053
+ "`matrixData` must be 1D or 2D (got ndim = "
1054
+ f"{matrixData.ndim})"
1055
+ )
1056
+ if matrixMunc.ndim == 1:
1057
+ matrixMunc = matrixMunc[None, :]
1058
+ elif matrixMunc.ndim != 2:
1059
+ raise ValueError(
1060
+ "`matrixMunc` must be 1D or 2D (got ndim = "
1061
+ f"{matrixMunc.ndim})"
1062
+ )
1063
+ if matrixMunc.shape != matrixData.shape:
1064
+ raise ValueError(
1065
+ f"`matrixMunc` shape {matrixMunc.shape} not equal to `matrixData` shape {matrixData.shape}"
1066
+ )
1067
+
1068
+ m, n = matrixData.shape
1069
+ if m < 1 or n < 1:
1070
+ # ideally, we don't get here, but JIC
1071
+ raise ValueError(
1072
+ f"`matrixData` and `matrixMunc` need positive m x n, shape={matrixData.shape})"
1073
+ )
1074
+
1075
+ if n <= 100:
1076
+ logger.warning(
1077
+ f"`matrixData` and `matrixMunc` span very fer genomic intervals (n={n})...is this correct?"
1078
+ )
1079
+
1080
+ if chunkSize < 1:
1081
+ logger.warning(
1082
+ f"`chunkSize` must be positive, setting to 1000000"
1083
+ )
1084
+ chunkSize = 1_000_000
1085
+
1086
+ if chunkSize > n:
1087
+ logger.warning(
1088
+ f"`chunkSize` of {chunkSize} is greater than the number of intervals (n={n}), setting to {n}"
1089
+ )
1090
+ chunkSize = n
1091
+ # -------
1092
+
1093
+ inflatedQ: bool = False
1094
+ dStat: float = np.float32(0.0)
1095
+ y64 = np.empty(m, dtype=np.float64)
1096
+ sq64 = np.empty(m, dtype=np.float64)
1097
+ countAdjustments: int = 0
1098
+
1099
+ IKH: np.ndarray = np.zeros(shape=(2, 2), dtype=np.float32)
1100
+ matrixEInverse: np.ndarray = np.zeros(
1101
+ shape=(m, m), dtype=np.float32
1102
+ )
1103
+ matrixF: np.ndarray = constructMatrixF(deltaF)
1104
+ matrixQ: np.ndarray = constructMatrixQ(minQ, offDiagQ=offDiagQ)
1105
+ matrixQCopy: np.ndarray = matrixQ.copy()
1106
+ matrixP: np.ndarray = np.eye(2, dtype=np.float32) * np.float32(
1107
+ stateCovarInit
1108
+ )
1109
+ matrixP = matrixP.astype(np.float64)
1110
+ matrixH: np.ndarray = constructMatrixH(
1111
+ m, coefficients=coefficientsH
1112
+ )
1113
+ matrixK: np.ndarray = np.zeros((2, m), dtype=np.float32)
1114
+ vectorX: np.ndarray = np.array([stateInit, 0.0], dtype=np.float32)
1115
+ vectorY: np.ndarray = np.zeros(m, dtype=np.float32)
1116
+ matrixI2: np.ndarray = np.eye(2, dtype=np.float32)
1117
+ clipSmall: float = 10 ** (-covarClip)
1118
+ clipBig: float = 10 ** (covarClip)
1119
+ if residualCovarInversionFunc is None:
1120
+ residualCovarInversionFunc = cconsenrich.cinvertMatrixE
1121
+ if adjustProcessNoiseFunc is None:
1122
+ adjustProcessNoiseFunc = (
1123
+ cconsenrich.updateProcessNoiseCovariance
1124
+ )
1125
+
1126
+ # ==========================
1127
+ # forward: 0,1,2,...,n-1
1128
+ # ==========================
1129
+ with TemporaryDirectory() as tempDir_:
1130
+ stateForwardPathMM = os.path.join(
1131
+ tempDir_, "stateForward.dat"
1132
+ )
1133
+ stateCovarForwardPathMM = os.path.join(
1134
+ tempDir_, "stateCovarForward.dat"
1135
+ )
1136
+ pNoiseForwardPathMM = os.path.join(
1137
+ tempDir_, "pNoiseForward.dat"
1138
+ )
1139
+ stateBackwardPathMM = os.path.join(
1140
+ tempDir_, "stateSmoothed.dat"
1141
+ )
1142
+ stateCovarBackwardPathMM = os.path.join(
1143
+ tempDir_, "stateCovarSmoothed.dat"
1144
+ )
1145
+ postFitResidualsPathMM = os.path.join(
1146
+ tempDir_, "postFitResiduals.dat"
1147
+ )
1148
+
1149
+ # ==========================
1150
+ # forward: 0,1,2,...,n-1
1151
+ # ==========================
1152
+ stateForward = np.memmap(
1153
+ stateForwardPathMM,
1154
+ dtype=np.float32,
1155
+ mode="w+",
1156
+ shape=(n, 2),
1157
+ )
1158
+ stateCovarForward = np.memmap(
1159
+ stateCovarForwardPathMM,
1160
+ dtype=np.float32,
1161
+ mode="w+",
1162
+ shape=(n, 2, 2),
1163
+ )
1164
+ pNoiseForward = np.memmap(
1165
+ pNoiseForwardPathMM,
1166
+ dtype=np.float32,
1167
+ mode="w+",
1168
+ shape=(n, 2, 2),
1169
+ )
1170
+
1171
+ progressIter = max(1, progressIter)
1172
+ avgDstat: float = 0.0
1173
+
1174
+ for i in range(n):
1175
+ if i % progressIter == 0 and i > 0:
1176
+ logger.info(f"\nForward pass interval: {i + 1}/{n}, "
1177
+ f"Gain[0,:] (i --> i+1): {1 - IKH[0, 0]:.4f}\n"
1178
+ )
1179
+
1180
+ vectorZ = matrixData[:, i]
1181
+ vectorX = matrixF @ vectorX
1182
+ matrixP = matrixF @ matrixP @ matrixF.T + matrixQ
1183
+ vectorY = vectorZ - (matrixH @ vectorX)
1184
+
1185
+ matrixEInverse = residualCovarInversionFunc(
1186
+ matrixMunc[:, i],
1187
+ np.float32(matrixP[0, 0]),
1188
+ )
1189
+
1190
+ # D_[i] (`dStat`): NIS-like, but w/ median:
1191
+ # ... median(y_[i]^2 * Einv_[i, diag])
1192
+
1193
+ Einv_diag = matrixEInverse.diagonal()
1194
+ # avoid per-iteration allocations
1195
+ np.copyto(y64, vectorY, casting="same_kind")
1196
+ np.square(y64, out=sq64, casting="same_kind")
1197
+ np.multiply(sq64, Einv_diag, out=sq64)
1198
+ dStat = np.float32(np.median(sq64)) if dStatUseMean else np.float32(np.mean(sq64))
1199
+
1200
+ avgDstat += float(dStat)
1201
+ countAdjustments = countAdjustments + int(
1202
+ dStat > dStatAlpha,
1203
+ )
1204
+
1205
+ matrixQ, inflatedQ = adjustProcessNoiseFunc(
1206
+ matrixQ,
1207
+ matrixQCopy,
1208
+ dStat,
1209
+ dStatAlpha,
1210
+ dStatd,
1211
+ dStatPC,
1212
+ inflatedQ,
1213
+ maxQ,
1214
+ minQ,
1215
+ )
1216
+ np.clip(matrixQ, clipSmall, clipBig, out=matrixQ)
1217
+ matrixK = (matrixP @ matrixH.T) @ matrixEInverse
1218
+ IKH = matrixI2 - (matrixK @ matrixH)
1219
+ # update for forward posterior state
1220
+ vectorX = vectorX + (matrixK @ vectorY)
1221
+ # ... and covariance
1222
+ np.clip(
1223
+ (IKH) @ matrixP @ (IKH).T
1224
+ + (matrixK * matrixMunc[:, i]) @ matrixK.T,
1225
+ clipSmall,
1226
+ clipBig,
1227
+ out=matrixP,
1228
+ )
1229
+ stateForward[i] = vectorX.astype(np.float32)
1230
+ stateCovarForward[i] = matrixP.astype(np.float32)
1231
+ pNoiseForward[i] = matrixQ.astype(np.float32)
1232
+
1233
+ if i % chunkSize == 0 and i > 0:
1234
+ stateForward.flush()
1235
+ stateCovarForward.flush()
1236
+ pNoiseForward.flush()
1237
+
1238
+ stateForward.flush()
1239
+ stateCovarForward.flush()
1240
+ pNoiseForward.flush()
1241
+
1242
+ stateForwardArr = stateForward
1243
+ stateCovarForwardArr = stateCovarForward
1244
+ pNoiseForwardArr = pNoiseForward
1245
+
1246
+ avgDstat /= n
1247
+
1248
+ logger.info(
1249
+ f"Average D_[i] statistic over `n` intervals: {avgDstat:.3f}"
1250
+ )
1251
+ logger.info(
1252
+ f"`D_[i] > α_D` triggered adjustments to Q_[i] at "
1253
+ f"[{round(((1.0 * countAdjustments) / n) * 100.0, 4)}%] of intervals"
1254
+ )
1255
+
1256
+ # ==========================
1257
+ # backward: n-1,n-2,...,0
1258
+ # ==========================
1259
+ stateSmoothed = np.memmap(
1260
+ stateBackwardPathMM,
1261
+ dtype=np.float32,
1262
+ mode="w+",
1263
+ shape=(n, 2),
1264
+ )
1265
+ stateCovarSmoothed = np.memmap(
1266
+ stateCovarBackwardPathMM,
1267
+ dtype=np.float32,
1268
+ mode="w+",
1269
+ shape=(n, 2, 2),
1270
+ )
1271
+ postFitResiduals = np.memmap(
1272
+ postFitResidualsPathMM,
1273
+ dtype=np.float32,
1274
+ mode="w+",
1275
+ shape=(n, m),
1276
+ )
1277
+
1278
+ stateSmoothed[-1] = np.float32(stateForwardArr[-1])
1279
+ stateCovarSmoothed[-1] = np.float32(stateCovarForwardArr[-1])
1280
+ postFitResiduals[-1] = np.float32(
1281
+ matrixData[:, -1] - (matrixH @ stateSmoothed[-1])
1282
+ )
1283
+ smootherGain = np.zeros((2, 2), dtype=np.float32)
1284
+
1285
+ for k in range(n - 2, -1, -1):
1286
+ if k % progressIter == 0:
1287
+ logger.info(
1288
+ f"\nBackward pass interval: {k + 1}/{n}, "
1289
+ f"smootherGain[0,0] (i+1 --> i): {smootherGain[0, 0]:.4f}\n"
1290
+ )
1291
+
1292
+ forwardStatePosterior = stateForwardArr[k]
1293
+ forwardCovariancePosterior = stateCovarForwardArr[k]
1294
+
1295
+ backwardInitialState = matrixF @ forwardStatePosterior
1296
+ backwardInitialCovariance = (
1297
+ matrixF @ forwardCovariancePosterior @ matrixF.T
1298
+ + pNoiseForwardArr[k + 1]
1299
+ )
1300
+
1301
+ smootherGain = np.linalg.solve(
1302
+ backwardInitialCovariance.T,
1303
+ (forwardCovariancePosterior @ matrixF.T).T,
1304
+ ).T
1305
+
1306
+ stateSmoothed[k] = (
1307
+ forwardStatePosterior
1308
+ + smootherGain
1309
+ @ (stateSmoothed[k + 1] - backwardInitialState)
1310
+ ).astype(np.float32)
1311
+
1312
+ stateCovarSmoothed[k] = (
1313
+ forwardCovariancePosterior
1314
+ + smootherGain
1315
+ @ (
1316
+ stateCovarSmoothed[k + 1]
1317
+ - backwardInitialCovariance
1318
+ )
1319
+ @ smootherGain.T
1320
+ ).astype(np.float32)
1321
+
1322
+ postFitResiduals[k] = np.float32(
1323
+ matrixData[:, k] - matrixH @ stateSmoothed[k]
1324
+ )
1325
+
1326
+ if k % chunkSize == 0 and k > 0:
1327
+ stateSmoothed.flush()
1328
+ stateCovarSmoothed.flush()
1329
+ postFitResiduals.flush()
1330
+
1331
+ stateSmoothed.flush()
1332
+ stateCovarSmoothed.flush()
1333
+ postFitResiduals.flush()
1334
+
1335
+ if boundState:
1336
+ stateSmoothed[:, 0] = np.clip(
1337
+ stateSmoothed[:, 0],
1338
+ stateLowerBound,
1339
+ stateUpperBound,
1340
+ ).astype(np.float32)
1341
+
1342
+ outStateSmoothed = np.array(stateSmoothed, copy=True)
1343
+ outStateCovarSmoothed = np.array(
1344
+ stateCovarSmoothed, copy=True
1345
+ )
1346
+ outPostFitResiduals = np.array(postFitResiduals, copy=True)
1347
+
1348
+ return (
1349
+ outStateSmoothed,
1350
+ outStateCovarSmoothed,
1351
+ outPostFitResiduals,
1352
+ )
1353
+
1354
+
1355
def getPrimaryState(
    stateVectors: np.ndarray,
    roundPrecision: int = 4,
) -> npt.NDArray[np.float32]:
    r"""Get the primary state estimate from each vector after running Consenrich.

    :param stateVectors: State vectors from :func:`runConsenrich`.
    :type stateVectors: npt.NDArray[np.float32]
    :return: A one-dimensional numpy array of the primary state estimates.
    :rtype: npt.NDArray[np.float32]
    """
    # first state component is the 'primary' (signal) estimate
    primaryTrack = np.ascontiguousarray(
        stateVectors[:, 0], dtype=np.float32
    )
    np.round(primaryTrack, decimals=roundPrecision, out=primaryTrack)
    return primaryTrack
1369
+
1370
+
1371
def getStateCovarTrace(
    stateCovarMatrices: np.ndarray,
    roundPrecision: int = 4,
) -> npt.NDArray[np.float32]:
    r"""Get a one-dimensional array of state covariance traces after running Consenrich

    :param stateCovarMatrices: Estimated state covariance matrices :math:`\widetilde{\mathbf{P}}_{[i]}`
    :type stateCovarMatrices: np.ndarray
    :return: A one-dimensional numpy array of the traces of the state covariance matrices.
    :rtype: npt.NDArray[np.float32]
    """
    # the C routine expects a C-contiguous float32 array
    covarArr = np.ascontiguousarray(
        stateCovarMatrices, dtype=np.float32
    )
    traceTrack = cconsenrich.cgetStateCovarTrace(covarArr)
    np.round(traceTrack, decimals=roundPrecision, out=traceTrack)
    return traceTrack
1388
+
1389
+
1390
def getPrecisionWeightedResidual(
    postFitResiduals: np.ndarray,
    matrixMunc: np.ndarray,
    roundPrecision: int = 4,
    stateCovarSmoothed: Optional[np.ndarray] = None,
) -> npt.NDArray[np.float32]:
    r"""Get a one-dimensional precision-weighted array residuals after running Consenrich.

    Post-fit residuals weighted by the inverse of the observation noise covariance and primary state uncertainty.

    :param postFitResiduals: Post-fit residuals :math:`\widetilde{\mathbf{y}}_{[i]}` from :func:`runConsenrich`.
    :type postFitResiduals: np.ndarray
    :param matrixMunc: An :math:`m \times n` sample-by-interval matrix -- At genomic intervals :math:`i = 1,2,\ldots,n`, the respective length-:math:`m` column is :math:`\mathbf{R}_{[i,11:mm]}`.
        That is, the observation noise levels for each sample :math:`j=1,2,\ldots,m` at interval :math:`i`. To keep memory usage minimal `matrixMunc` is not returned in full or computed in
        in :func:`runConsenrich`. If using Consenrich programmatically, run :func:`consenrich.core.getMuncTrack` for each sample's count data (rows in the matrix output of :func:`readBamSegments`).
    :type matrixMunc: np.ndarray
    :param stateCovarSmoothed: Post-fit (forward/backward-smoothed) state covariance matrices :math:`\widetilde{\mathbf{P}}_{[i]}` from :func:`runConsenrich`.
    :type stateCovarSmoothed: Optional[np.ndarray]
    :return: A one-dimensional array of "precision-weighted residuals"
    :rtype: npt.NDArray[np.float32]

    :raises ValueError: If `matrixMunc` is not shaped ``(m, n)`` relative to `postFitResiduals`,
        or `stateCovarSmoothed` is provided with an incompatible shape.
    """

    n, m = postFitResiduals.shape
    if matrixMunc.shape != (m, n):
        raise ValueError(
            f"matrixMunc should be (m,n)=({m}, {n}): observed {matrixMunc.shape}"
        )
    if stateCovarSmoothed is not None and (
        stateCovarSmoothed.ndim < 3 or len(stateCovarSmoothed) != n
    ):
        raise ValueError(
            "stateCovarSmoothed must be shape (n) x (2,2) (if provided)"
        )

    postFitResiduals_CContig = np.ascontiguousarray(
        postFitResiduals, dtype=np.float32
    )

    # FIX: the previous single `needsCopy` flag conflated "copy the matrix"
    # with "add the state covariance": a read-only `matrixMunc` with
    # stateCovarSmoothed=None crashed on `None[:, 0, 0]`. Track the two
    # conditions separately.
    addStateCovar = stateCovarSmoothed is not None
    needsCopy = addStateCovar or (not matrixMunc.flags.writeable)

    matrixMunc_CContig = np.array(
        matrixMunc, dtype=np.float32, order="C", copy=needsCopy
    )

    if addStateCovar:
        # add the primary-state variance P[0,0] per interval (broadcast over samples)
        stateCovarArr00 = np.asarray(
            stateCovarSmoothed[:, 0, 0], dtype=np.float32
        )
        matrixMunc_CContig += stateCovarArr00

    # floor to keep the downstream division well-defined
    np.maximum(
        matrixMunc_CContig, np.float32(1e-8), out=matrixMunc_CContig
    )
    out = cconsenrich.cgetPrecisionWeightedResidual(
        postFitResiduals_CContig, matrixMunc_CContig
    )
    np.round(out, decimals=roundPrecision, out=out)
    return out
1451
+
1452
+
1453
def getMuncTrack(
    chromosome: str,
    intervals: np.ndarray,
    stepSize: int,
    rowValues: np.ndarray,
    minR: float,
    maxR: float,
    useALV: bool,
    useConstantNoiseLevel: bool,
    noGlobal: bool,
    localWeight: float,
    globalWeight: float,
    approximationWindowLengthBP: int,
    lowPassWindowLengthBP: int,
    returnCenter: bool,
    sparseMap: Optional[dict[int, int]] = None,
    lowPassFilterType: Optional[str] = "median",
    shrinkOffset: float = 0.5,
) -> npt.NDArray[np.float32]:
    r"""Get observation noise variance :math:`R_{[:,jj]}` for the sample :math:`j`.

    Builds a local average-local-variance (ALV) estimate via
    :func:`getAverageLocalVarianceTrack` and optionally mixes it with a global
    component (the chromosome-wide ALV mean). If ``useALV`` is True, only the
    ALV track is returned. If ``useConstantNoiseLevel`` holds, a constant track
    at the global mean is returned. When ``sparseMap`` is provided, local
    values are first aggregated over nearby 'sparse' regions
    (:func:`cconsenrich.cSparseAvg`) before mixing.

    :param chromosome: Tracks are approximated for this chromosome.
    :type chromosome: str
    :param intervals: Genomic intervals for which to compute the noise track.
    :param stepSize: See :class:`countingParams`.
    :type stepSize: int
    :param rowValues: Read-density-based values for the sample :math:`j` at the genomic intervals :math:`i=1,2,\ldots,n`.
    :type rowValues: np.ndarray
    :param minR: Lower bound on the returned variances. See :class:`observationParams`.
    :type minR: float
    :param maxR: Upper bound on the returned variances. See :class:`observationParams`.
    :type maxR: float
    :param useALV: See :class:`observationParams`.
    :type useALV: bool
    :param useConstantNoiseLevel: See :class:`observationParams`.
    :type useConstantNoiseLevel: bool
    :param noGlobal: See :class:`observationParams`.
    :type noGlobal: bool
    :param localWeight: See :class:`observationParams`.
    :type localWeight: float
    :param globalWeight: See :class:`observationParams`.
    :type globalWeight: float
    :param approximationWindowLengthBP: See :class:`observationParams` and/or :func:`getAverageLocalVarianceTrack`.
    :type approximationWindowLengthBP: int
    :param lowPassWindowLengthBP: See :class:`observationParams` and/or :func:`getAverageLocalVarianceTrack`.
    :type lowPassWindowLengthBP: int
    :param sparseMap: Optional mapping of interval indices to nearest sparse regions. See :func:`getSparseMap`.
    :type sparseMap: Optional[dict[int, int]]
    :param lowPassFilterType: Low-pass filter used for the ALV track (e.g., 'median', 'mean').
    :type lowPassFilterType: Optional[str]
    :param shrinkOffset: See :func:`getAverageLocalVarianceTrack`.
    :type shrinkOffset: float
    :return: A one-dimensional numpy array of the observation noise track for the sample :math:`j`.
    :rtype: npt.NDArray[np.float32]
    """

    # FFR: we should consider whether to apply bounds only after mixing local/global
    def clipTrack(track) -> npt.NDArray[np.float32]:
        # Bound into [minR, maxR] and force float32 for downstream Cython code
        return np.clip(track, minR, maxR).astype(np.float32)

    alvTrack = getAverageLocalVarianceTrack(
        rowValues,
        stepSize,
        approximationWindowLengthBP,
        lowPassWindowLengthBP,
        minR,
        maxR,
        lowPassFilterType,
        shrinkOffset=shrinkOffset,
    ).astype(np.float32)

    globalNoise: float = np.float32(np.mean(alvTrack))

    # Pure local estimate: no global component requested/applicable
    if noGlobal or globalWeight == 0 or useALV:
        return clipTrack(alvTrack)

    # NOTE(review): parentheses below spell out Python's default precedence of
    # the original condition (`A or B and C` == `A or (B and C)`) — confirm
    # this grouping (vs. `(A or B) and C`) is the intended one.
    if useConstantNoiseLevel or (
        localWeight == 0 and sparseMap is None
    ):
        return clipTrack(globalNoise * np.ones_like(rowValues))

    if sparseMap is not None:
        # Aggregate the local estimate over nearby sparse regions
        alvTrack = cconsenrich.cSparseAvg(alvTrack, sparseMap)

    mixedTrack = (
        alvTrack * localWeight + np.mean(alvTrack) * globalWeight
    )
    return clipTrack(mixedTrack)
1548
+
1549
+
1550
def sparseIntersection(
    chromosome: str, intervals: np.ndarray, sparseBedFile: str
) -> npt.NDArray[np.int64]:
    r"""Returns intervals in the chromosome that overlap with the sparse features.

    Not relevant if `observationParams.useALV` is True.

    :param chromosome: The chromosome name.
    :type chromosome: str
    :param intervals: The genomic intervals to consider (fixed-step grid of start positions).
    :type intervals: np.ndarray
    :param sparseBedFile: Path to the sparse BED file.
    :type sparseBedFile: str
    :return: A numpy array of start positions of the sparse features that overlap with the intervals
    :rtype: npt.NDArray[np.int64]
    """
    # Fix: the BedTool pipeline was previously built twice (first with the
    # numpy scalars `intervals[0]`/`intervals[-1]` closed over in the filter
    # lambda, then again with plain ints) and the first result was discarded.
    # Only the second, int-bound computation is kept.
    stepSize: int = intervals[1] - intervals[0]
    start0: int = int(intervals[0])
    last: int = int(intervals[-1])

    # Keep merged features strictly inside [start0, last] that span at least
    # one step. Bounds are bound to plain ints since `filter` evaluates lazily.
    chromFeatures: bed.BedTool = (
        bed.BedTool(sparseBedFile)
        .sort()
        .merge()
        .filter(
            lambda b: (
                b.chrom == chromosome
                and b.start > start0
                and b.end < last
                and (b.end - b.start) >= stepSize
            )
        )
    )
    # Snap each feature to a step-aligned window of length stepSize around its midpoint
    centeredFeatures: bed.BedTool = chromFeatures.each(
        adjustFeatureBounds, stepSize=stepSize
    )
    centeredStarts = []
    for f in centeredFeatures:
        s = int(f.start)
        # Keep only starts landing exactly on the interval grid
        if start0 <= s <= last and (s - start0) % stepSize == 0:
            centeredStarts.append(s)
    return np.asarray(centeredStarts, dtype=np.int64)
1609
+
1610
+
1611
def adjustFeatureBounds(
    feature: bed.Interval, stepSize: int
) -> bed.Interval:
    r"""Snap a BED feature in place to a step-aligned window of length `stepSize` centered on its midpoint."""
    midpoint = (feature.start + feature.end) // 2
    feature.start = cconsenrich.stepAdjustment(midpoint, stepSize)
    feature.end = feature.start + stepSize
    return feature
1620
+
1621
+
1622
def getSparseMap(
    chromosome: str,
    intervals: np.ndarray,
    numNearest: int,
    sparseBedFile: str,
) -> dict:
    r"""Build a map between each genomic interval and numNearest sparse features

    :param chromosome: The chromosome name. Note, this function only needs to be run once per chromosome.
    :type chromosome: str
    :param intervals: The genomic intervals to map.
    :type intervals: np.ndarray
    :param numNearest: The number of nearest sparse features to consider
    :type numNearest: int
    :param sparseBedFile: path to the sparse BED file.
    :type sparseBedFile: str
    :return: A dictionary mapping each interval index to the indices of the nearest sparse regions.
    :rtype: dict[int, np.ndarray]
    """
    # Fix: removed the no-op self-assignment `numNearest = numNearest`.
    sparseStarts = sparseIntersection(
        chromosome, intervals, sparseBedFile
    )
    # Index of each sparse start within the interval grid
    idxSparseInIntervals = np.searchsorted(
        intervals, sparseStarts, side="left"
    )
    # For each interval: insertion point into the sorted sparse starts,
    # i.e., the first sparse feature at-or-right-of the interval
    centers = np.searchsorted(sparseStarts, intervals, side="left")
    sparseMap: dict = {}
    for i, (interval, center) in enumerate(zip(intervals, centers)):
        # Consider up to numNearest candidates on each side of the
        # insertion point, then keep the numNearest closest by distance
        left = max(0, center - numNearest)
        right = min(len(sparseStarts), center + numNearest)
        candidates = np.arange(left, right)
        dists = np.abs(sparseStarts[candidates] - interval)
        take = np.argsort(dists)[:numNearest]
        sparseMap[i] = idxSparseInIntervals[candidates[take]]
    return sparseMap
1659
+
1660
+
1661
def getBedMask(
    chromosome: str,
    bedFile: str,
    intervals: np.ndarray,
) -> np.ndarray:
    r"""Return a boolean mask over `intervals` marking overlap with a sorted and merged BED file.

    This function is a wrapper for :func:`cconsenrich.cbedMask`.

    :param chromosome: The chromosome name.
    :type chromosome: str
    :param bedFile: Path to a sorted and merged BED file
    :type bedFile: str
    :param intervals: chromosome-specific, sorted, non-overlapping start positions of
        genomic intervals; each interval is assumed `stepSize` long.
    :type intervals: np.ndarray
    :return: An `intervals`-length mask s.t. True indicates the interval overlaps a feature in the BED file.
    :rtype: np.ndarray
    :raises ValueError: If the BED file is missing, fewer than two positions are
        given, or the interval spacing is not constant.
    """
    if not os.path.exists(bedFile):
        raise ValueError(f"Could not find {bedFile}")
    if len(intervals) < 2:
        raise ValueError(
            "intervals must contain at least two positions"
        )
    bedFilePath = str(bedFile)

    # uint32 copy for the Cython call, plus a cheap constant-step sanity
    # check comparing the first and last gaps
    intervalsU32 = np.asarray(intervals, dtype=np.uint32)
    firstGap = intervalsU32[1] - intervalsU32[0]
    lastGap = intervalsU32[-1] - intervalsU32[-2]
    if firstGap != lastGap:
        raise ValueError("Intervals are not fixed in size")

    stepSize_: int = intervals[1] - intervals[0]
    mask = cconsenrich.cbedMask(
        chromosome,
        bedFilePath,
        intervalsU32,
        stepSize_,
    )
    return mask.astype(np.bool_)
1703
+
1704
+
1705
def autoDeltaF(
    bamFiles: List[str],
    stepSize: int,
    fragmentLengths: Optional[List[int]] = None,
    fallBackFragmentLength: int = 147,
    randomSeed: int = 42,
) -> float:
    r"""(Experimental) Set `deltaF` as the ratio intervalLength:fragmentLength.

    Computes the median fragment length across samples and sets
    `processParams.deltaF = countingArgs.stepSize / medianFragmentLength`.

    Where `stepSize` is small, adjacent genomic intervals may share information
    from the same fragments. This motivates a smaller `deltaF` (i.e., less
    state change between neighboring intervals).

    :param bamFiles: List of sorted/indexed BAM files to estimate fragment lengths from if they are not provided directly.
    :type bamFiles: List[str]
    :param stepSize: Length of genomic intervals/bins. See :class:`countingParams`.
    :type stepSize: int
    :param fragmentLengths: Optional list of fragment lengths (in bp) per sample. If provided, used directly instead of estimating from `bamFiles`.
    :type fragmentLengths: Optional[List[int]]
    :param fallBackFragmentLength: Used if fragment length estimation from a BAM file fails.
    :type fallBackFragmentLength: int
    :param randomSeed: Random seed for fragment length estimation.
    :type randomSeed: int
    :return: Estimated `deltaF` value.
    :rtype: float
    :raises ValueError: If neither input source is usable or estimation fails.
    :seealso: :func:`cconsenrich.cgetFragmentLength`, :class:`processParams`, :class:`countingParams`
    """
    haveExplicitLengths = (
        fragmentLengths is not None
        and len(fragmentLengths) > 0
        and all(isinstance(x, (int, float)) for x in fragmentLengths)
    )

    if haveExplicitLengths:
        avgFragmentLength = np.median(fragmentLengths)
    elif bamFiles is not None and len(bamFiles) > 0:
        # Estimate a per-sample fragment length from each BAM, then
        # take the cross-sample median
        estimates = [
            cconsenrich.cgetFragmentLength(
                bamFile,
                fallBack=fallBackFragmentLength,
                randSeed=randomSeed,
            )
            for bamFile in bamFiles
        ]
        avgFragmentLength = np.median(estimates)
    else:
        raise ValueError(
            "One of `fragmentLengths` or `bamFiles` is required..."
        )

    # `not (> 0)` rather than `<= 0` so that NaN medians also fail here
    if not avgFragmentLength > 0:
        raise ValueError(
            "Average cross-sample fraglen estimation failed"
        )
    deltaF = round(stepSize / float(avgFragmentLength), 4)
    logger.info(f"Setting `processParams.deltaF`={deltaF}")
    return np.float32(deltaF)
1763
+
1764
+
1765
+ def _forPlotsSampleBlockStats(
1766
+ values_: npt.NDArray[np.float32],
1767
+ blockSize_: int,
1768
+ numBlocks_: int,
1769
+ statFunction_: Callable = np.mean,
1770
+ randomSeed_: int = 42,
1771
+ ):
1772
+ r"""Pure python helper for plotting distributions of block-sampled statistics.
1773
+
1774
+ Intended for use in the plotting functions, not as an alternative to
1775
+ the Cython ``cconsenrich.csampleBlockStats`` function used in the
1776
+ `matching` module. Call on 32bit numpy arrays so that copies are not made.
1777
+
1778
+ :param values: One-dimensional array of values to sample blocks from.
1779
+ :type values: np.ndarray
1780
+ :param blockSize: Length of each block to sample.
1781
+ :type blockSize: int
1782
+ :param numBlocks: Number of blocks to sample.
1783
+ :type numBlocks: int
1784
+ """
1785
+ np.random.seed(randomSeed_)
1786
+
1787
+ if type(values_) == npt.NDArray[np.float32]:
1788
+ x = values_
1789
+ else:
1790
+ x = np.ascontiguousarray(values_, dtype=np.float32)
1791
+ n = x.shape[0]
1792
+ if blockSize_ > n:
1793
+ logger.warning(
1794
+ f"`blockSize>values.size`...setting `blockSize` = {max(n // 2, 1)}."
1795
+ )
1796
+ blockSize_ = int(max(n // 2, 1))
1797
+
1798
+ maxStart = n - blockSize_ + 1
1799
+
1800
+ # avoid copies
1801
+ blockView = as_strided(
1802
+ x,
1803
+ shape=(maxStart, blockSize_),
1804
+ strides=(x.strides[0], x.strides[0]),
1805
+ )
1806
+ starts = np.random.randint(0, maxStart, size=numBlocks_)
1807
+ return statFunction_(blockView[starts], axis=1)
1808
+
1809
+
1810
def plotStateEstimatesHistogram(
    chromosome: str,
    plotPrefix: str,
    primaryStateValues: npt.NDArray[np.float32],
    blockSize: int = 10,
    numBlocks: int = 10_000,
    statFunction: Callable = np.mean,
    randomSeed: int = 42,
    roundPrecision: int = 4,
    plotHeightInches: float = 8.0,
    plotWidthInches: float = 10.0,
    plotDPI: int = 300,
    plotDirectory: str | None = None,
) -> str | None:
    r"""(Experimental) Plot a histogram of block-sampled (within-chromosome) primary state estimates.

    :param plotPrefix: Prefixes the output filename
    :type plotPrefix: str
    :param primaryStateValues: 1D 32bit float array of primary state estimates, i.e.,
        ``stateSmoothed[0,:]`` from :func:`runConsenrich`. See also :func:`getPrimaryState`.
    :type primaryStateValues: npt.NDArray[np.float32]
    :param blockSize: Number of contiguous intervals to sample per block.
    :type blockSize: int
    :param numBlocks: Number of samples to draw
    :type numBlocks: int
    :param statFunction: Numpy callable computed on each sampled block (e.g., `np.mean`, `np.median`).
    :type statFunction: Callable
    :param plotDirectory: If provided, saves the plot to this directory. The directory should exist.
    :type plotDirectory: str | None
    :return: Path of the written PNG, or ``None`` on failure.
    """
    if plotDirectory is None:
        plotDirectory = os.getcwd()
    elif not os.path.exists(plotDirectory):
        raise ValueError(
            f"`plotDirectory` {plotDirectory} does not exist"
        )
    elif not os.path.isdir(plotDirectory):
        raise ValueError(
            f"`plotDirectory` {plotDirectory} is not a directory"
        )

    plotFileName = os.path.join(
        plotDirectory,
        f"consenrichPlot_hist_{chromosome}_{plotPrefix}_state.png",
    )

    # Empirical distribution of block statistics over the state track
    blockStats = _forPlotsSampleBlockStats(
        values_=primaryStateValues,
        blockSize_=blockSize,
        numBlocks_=numBlocks,
        statFunction_=statFunction,
        randomSeed_=randomSeed,
    )

    histStyle = dict(
        bins="doane",
        color="blue",
        alpha=0.85,
        edgecolor="black",
        fill=True,
    )
    plt.figure(
        figsize=(plotWidthInches, plotHeightInches), dpi=plotDPI
    )
    plt.hist(blockStats, **histStyle)
    plt.title(
        rf"Histogram: {numBlocks} sampled blocks ({blockSize} contiguous intervals each): Posterior Signal Estimates $\widetilde{{x}}_{{[1 : n]}}$",
    )
    plt.savefig(plotFileName, dpi=plotDPI)
    plt.close()

    # Report success only if the file actually landed on disk
    if not os.path.exists(plotFileName):
        logger.warning(
            f"Failed to create histogram. {plotFileName} not written."
        )
        return None
    logger.info(
        f"Wrote state estimate histogram to {plotFileName}"
    )
    return plotFileName
1888
+
1889
+
1890
def plotResidualsHistogram(
    chromosome: str,
    plotPrefix: str,
    residuals: npt.NDArray[np.float32],
    blockSize: int = 10,
    numBlocks: int = 10_000,
    statFunction: Callable = np.mean,
    randomSeed: int = 42,
    roundPrecision: int = 4,
    plotHeightInches: float = 8.0,
    plotWidthInches: float = 10.0,
    plotDPI: int = 300,
    flattenResiduals: bool = False,
    plotDirectory: str | None = None,
) -> str | None:
    r"""(Experimental) Plot a histogram of within-chromosome post-fit residuals.

    .. note::

        To economically represent residuals across multiple samples, at each genomic interval :math:`i`,
        we randomly select a single sample's residual in vector :math:`\mathbf{y}_{[i]} = \mathbf{Z}_{[:,i]} - \mathbf{H}\widetilde{\mathbf{x}}_{[i]}`
        to obtain a 1D array, :math:`\mathbf{a} \in \mathbb{R}^{1 \times n}`. Then, contiguous blocks :math:`\mathbf{a}_{[k:k+blockSize]}` are sampled
        to compute the desired statistic (e.g., mean, median). These block statistics comprise the empirical distribution plotted in the histogram.

    :param plotPrefix: Prefixes the output filename
    :type plotPrefix: str
    :param residuals: 2D 32bit float array of post-fit residuals.
        NOTE(review): the code below unpacks ``n, m = x.shape`` and draws one
        random second-axis entry per first-axis position; this matches an
        interval-by-sample (:math:`n \times m`) layout as used elsewhere in
        this module rather than sample-by-interval — confirm against callers.
    :type residuals: npt.NDArray[np.float32]
    :param blockSize: Number of contiguous intervals to sample per block.
    :type blockSize: int
    :param numBlocks: Number of samples to draw
    :type numBlocks: int
    :param statFunction: Numpy callable function to compute on each sampled block (e.g., `np.mean`, `np.median`).
    :type statFunction: Callable
    :param flattenResiduals: If True, flattens the 2D residuals array to 1D
        (via `np.ravel`) before sampling blocks. If False, a random entry along
        the second axis is selected for each position on the first axis prior
        to the block sampling.
    :type flattenResiduals: bool
    :param plotDirectory: If provided, saves the plot to this directory. The directory should exist.
    :type plotDirectory: str | None
    :return: Path of the written PNG, or ``None`` on failure.
    """

    # Resolve and validate the output directory (default: current directory)
    if plotDirectory is None:
        plotDirectory = os.getcwd()
    elif not os.path.exists(plotDirectory):
        raise ValueError(
            f"`plotDirectory` {plotDirectory} does not exist"
        )
    elif not os.path.isdir(plotDirectory):
        raise ValueError(
            f"`plotDirectory` {plotDirectory} is not a directory"
        )

    plotFileName = os.path.join(
        plotDirectory,
        f"consenrichPlot_hist_{chromosome}_{plotPrefix}_residuals.png",
    )

    # Contiguous float32 copy (no-op if already contiguous float32)
    x = np.ascontiguousarray(residuals, dtype=np.float32)

    if not flattenResiduals:
        # Reduce the 2D residual matrix to 1D: one randomly chosen
        # second-axis entry per first-axis position (see docstring note
        # on the assumed axis convention).
        n, m = x.shape
        rng = np.random.default_rng(randomSeed)
        sample_idx = rng.integers(0, m, size=n)
        x = x[np.arange(n), sample_idx]
    else:
        x = x.ravel()

    # Empirical distribution of block statistics over the 1D residual track
    binnedResiduals = _forPlotsSampleBlockStats(
        values_=x,
        blockSize_=blockSize,
        numBlocks_=numBlocks,
        statFunction_=statFunction,
        randomSeed_=randomSeed,
    )
    plt.figure(
        figsize=(plotWidthInches, plotHeightInches), dpi=plotDPI
    )
    plt.hist(
        binnedResiduals,
        bins="doane",
        color="blue",
        alpha=0.85,
        edgecolor="black",
        fill=True,
    )
    plt.title(
        rf"Histogram: {numBlocks} sampled blocks ({blockSize} contiguous intervals each): Post-Fit Residuals $\widetilde{{y}}_{{[1 : m, 1 : n]}}$",
    )
    plt.savefig(plotFileName, dpi=plotDPI)
    plt.close()
    # Report success only if the file actually landed on disk
    if os.path.exists(plotFileName):
        logger.info(f"Wrote residuals histogram to {plotFileName}")
        return plotFileName
    logger.warning(
        f"Failed to create histogram. {plotFileName} not written."
    )
    return None
1989
+
1990
+
1991
def plotStateStdHistogram(
    chromosome: str,
    plotPrefix: str,
    stateStd: npt.NDArray[np.float32],
    blockSize: int = 10,
    numBlocks: int = 10_000,
    statFunction: Callable = np.mean,
    randomSeed: int = 42,
    roundPrecision: int = 4,
    plotHeightInches: float = 8.0,
    plotWidthInches: float = 10.0,
    plotDPI: int = 300,
    plotDirectory: str | None = None,
) -> str | None:
    r"""(Experimental) Plot a histogram of block-sampled (within-chromosome) primary state standard deviations, i.e., :math:`\sqrt{\widetilde{\mathbf{P}}_{[i,11]}}`.

    :param plotPrefix: Prefixes the output filename
    :type plotPrefix: str
    :param stateStd: 1D numpy 32bit float array of primary state standard deviations,
        i.e., derived from the first diagonal elements of the :math:`n \times (2 \times 2)`
        array `stateCovarSmoothed` (accessed as ``stateCovarSmoothed[:, 0, 0]``).
    :type stateStd: npt.NDArray[np.float32]
    :param blockSize: Number of contiguous intervals to sample per block.
    :type blockSize: int
    :param numBlocks: Number of samples to draw
    :type numBlocks: int
    :param statFunction: Numpy callable computed on each sampled block (e.g., `np.mean`, `np.median`).
    :type statFunction: Callable
    :param plotDirectory: If provided, saves the plot to this directory. The directory should exist.
    :type plotDirectory: str | None
    :return: Path of the written PNG, or ``None`` on failure.
    """
    if plotDirectory is None:
        plotDirectory = os.getcwd()
    elif not os.path.exists(plotDirectory):
        raise ValueError(
            f"`plotDirectory` {plotDirectory} does not exist"
        )
    elif not os.path.isdir(plotDirectory):
        raise ValueError(
            f"`plotDirectory` {plotDirectory} is not a directory"
        )

    plotFileName = os.path.join(
        plotDirectory,
        f"consenrichPlot_hist_{chromosome}_{plotPrefix}_stateStd.png",
    )

    # Empirical distribution of block statistics over the stddev track
    blockStats = _forPlotsSampleBlockStats(
        values_=stateStd,
        blockSize_=blockSize,
        numBlocks_=numBlocks,
        statFunction_=statFunction,
        randomSeed_=randomSeed,
    )

    histStyle = dict(
        bins="doane",
        color="blue",
        alpha=0.85,
        edgecolor="black",
        fill=True,
    )
    plt.figure(
        figsize=(plotWidthInches, plotHeightInches),
        dpi=plotDPI,
    )
    plt.hist(blockStats, **histStyle)
    plt.title(
        rf"Histogram: {numBlocks} sampled blocks ({blockSize} contiguous intervals each): Posterior State StdDev $\sqrt{{\widetilde{{P}}_{{[1:n,11]}}}}$",
    )
    plt.savefig(plotFileName, dpi=plotDPI)
    plt.close()

    # Report success only if the file actually landed on disk
    if not os.path.exists(plotFileName):
        logger.warning(
            f"Failed to create histogram. {plotFileName} not written."
        )
        return None
    logger.info(f"Wrote state std histogram to {plotFileName}")
    return plotFileName