consenrich 0.7.2b2__cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of consenrich might be problematic. Click here for more details.

@@ -0,0 +1,836 @@
1
+ # -*- coding: utf-8 -*-
2
+ # cython: boundscheck=False, wraparound=False, cdivision=True, nonecheck=False, initializedcheck=False, infer_types=True, language_level=3
3
+ # distutils: language = c
4
+ r"""Cython module for Consenrich core functions.
5
+
6
+ This module contains Cython implementations of core functions used in Consenrich.
7
+ """
8
+
9
+ cimport cython
10
+
11
+ import os
12
+ import numpy as np
13
+ from scipy import ndimage
14
+ import pysam
15
+
16
+ cimport numpy as cnp
17
+ from libc.stdint cimport int64_t, uint8_t, uint16_t, uint32_t, uint64_t
18
+ from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
19
+
20
+ cnp.import_array()
21
+
22
cpdef int stepAdjustment(int value, int stepSize, int pushForward=0):
    r"""Snap ``value`` down to the nearest multiple of ``stepSize``, optionally pushing it forward.

    :param value: The value to adjust.
    :type value: int
    :param stepSize: The step size to adjust to.
    :type stepSize: int
    :param pushForward: If non-zero, pushes the value forward by stepSize
    :type pushForward: int
    :return: The adjusted value.
    :rtype: int
    """
    cdef int snapped = value - (value % stepSize)
    if snapped < 0:
        snapped = 0
    return snapped + pushForward * stepSize
35
+
36
+
37
cpdef uint64_t cgetFirstChromRead(str bamFile, str chromosome, uint64_t chromLength, uint32_t samThreads, int samFlagExclude):
    r"""Get the start position of the first read in a BAM file for a given chromosome.

    :param bamFile: See :func:`consenrich.core.inputParams`.
    :type bamFile: str
    :param chromosome: Chromosome name.
    :type chromosome: str
    :param chromLength: Length of the chromosome in base pairs.
    :type chromLength: uint64_t
    :param samThreads: Number of threads to use for reading the BAM file.
    :type samThreads: uint32_t
    :param samFlagExclude: SAM flags to exclude reads (e.g., unmapped).
    :type samFlagExclude: int
    :return: Start position of the first read in the chromosome, or 0 if no reads are found.
    :rtype: uint64_t
    """
    cdef AlignmentFile aln = AlignmentFile(bamFile, 'rb', threads=samThreads)
    cdef AlignedSegment read
    cdef uint64_t firstPos = 0
    # try/finally so the BAM handle is released even if fetch() raises
    # (the original leaked the handle on exception)
    try:
        for read in aln.fetch(contig=chromosome, start=0, end=chromLength):
            if read.flag & samFlagExclude:
                continue
            firstPos = read.reference_start
            break
    finally:
        aln.close()
    return firstPos
62
+
63
+
64
cpdef uint64_t cgetLastChromRead(str bamFile, str chromosome, uint64_t chromLength, uint32_t samThreads, int samFlagExclude):
    r"""Get the end position of the last read in a BAM file for a given chromosome.

    :param bamFile: See :func:`consenrich.core.inputParams`.
    :type bamFile: str
    :param chromosome: Chromosome name.
    :type chromosome: str
    :param chromLength: Length of the chromosome in base pairs.
    :type chromLength: uint64_t
    :param samThreads: Number of threads to use for reading the BAM file.
    :type samThreads: uint32_t
    :param samFlagExclude: See :class:`consenrich.core.samParams`.
    :type samFlagExclude: int
    :return: End position of the last read in the chromosome, or 0 if no reads are found.
    :rtype: uint64_t
    """
    # only scan the trailing window: at most 1 Mb, or half the chromosome
    cdef uint64_t windowStart = chromLength - min((chromLength // 2), 1_000_000)
    cdef uint64_t lastPos = 0
    cdef AlignmentFile aln = AlignmentFile(bamFile, 'rb', threads=samThreads)
    cdef AlignedSegment read
    # try/finally so the BAM handle is released even if fetch() raises
    # (the original leaked the handle on exception)
    try:
        for read in aln.fetch(contig=chromosome, start=windowStart, end=chromLength):
            if not (read.flag & samFlagExclude):
                lastPos = read.reference_end
    finally:
        aln.close()
    return lastPos
90
+
91
+
92
+
93
cpdef uint32_t cgetReadLength(str bamFile, uint32_t minReads, uint32_t samThreads, uint32_t maxIterations, int samFlagExclude):
    r"""Get the median read length from a BAM file after fetching a specified number of reads.

    :param bamFile: see :class:`consenrich.core.inputParams`.
    :type bamFile: str
    :param minReads: Minimum number of reads to consider for the median calculation.
    :type minReads: uint32_t
    :param samThreads: See :class:`consenrich.core.samParams`.
    :type samThreads: uint32_t
    :param maxIterations: Maximum number of reads to iterate over.
    :type maxIterations: uint32_t
    :param samFlagExclude: See :class:`consenrich.core.samParams`.
    :type samFlagExclude: int
    :return: Median read length from the BAM file, or 0 if fewer than
        ``minReads`` acceptable reads are available.
    :rtype: uint32_t
    """
    cdef uint32_t observedReads = 0
    cdef uint32_t currentIterations = 0
    cdef AlignmentFile aln = AlignmentFile(bamFile, 'rb', threads=samThreads)
    cdef AlignedSegment read
    # readLengths is indexed by observedReads, which is bounded by
    # currentIterations < maxIterations, so the buffer cannot overflow
    cdef cnp.ndarray[cnp.uint32_t, ndim=1] readLengths = np.zeros(maxIterations, dtype=np.uint32)
    # try/finally so the BAM handle is released on every exit path
    # (the original leaked the handle on exception)
    try:
        if <uint32_t>aln.mapped < minReads:
            return 0
        for read in aln.fetch():
            if observedReads >= minReads or currentIterations >= maxIterations:
                break
            if not (read.flag & samFlagExclude):
                # read meets criteria -> record its length
                # (the original kept a second counter `i` that always equaled
                # observedReads; collapsed to a single counter)
                readLengths[observedReads] = read.query_length
                observedReads += 1
            currentIterations += 1
    finally:
        aln.close()
    if observedReads < minReads:
        return 0
    return <uint32_t>np.median(readLengths[:observedReads])
131
+
132
+
133
cdef inline Py_ssize_t floordiv64(int64_t a, int64_t b) nogil:
    # Floor division for signed 64-bit operands under C truncation semantics
    # (cdivision=True): for negative a, negate the ceiling of (-a)/b.
    cdef int64_t quotient
    if a < 0:
        quotient = -((-a + b - 1) // b)
    else:
        quotient = a // b
    return <Py_ssize_t>quotient
138
+
139
+
140
cpdef cnp.uint32_t[:] creadBamSegment(
    str bamFile,
    str chromosome,
    uint32_t start,
    uint32_t end,
    uint32_t stepSize,
    int64_t readLength,
    uint8_t oneReadPerBin,
    uint16_t samThreads,
    uint16_t samFlagExclude,
    int64_t shiftForwardStrand53 = 0,
    int64_t shiftReverseStrand53 = 0,
    int64_t extendBP = 0,
    int64_t maxInsertSize=1000,
    int64_t pairedEndMode=0,
    int64_t inferFragmentLength=0):
    r"""Count reads from a BAM file over fixed-width intervals of ``[start, end)``.

    :param bamFile: See :class:`consenrich.core.inputParams`.
    :param chromosome: Chromosome name.
    :param start: 0-based inclusive start of the segment.
    :param end: 0-based exclusive end of the segment.
    :param stepSize: Width of each counting interval (bin).
    :param readLength: Representative read length; used to derive a minimum
        insert size when ``inferFragmentLength`` is set.
    :param oneReadPerBin: If non-zero, add +1 only at the bin containing the
        fragment midpoint; otherwise +1 at every bin the fragment intersects.
    :param samThreads: Threads used for BAM decompression.
    :param samFlagExclude: Reads with any of these SAM flag bits are skipped.
    :param shiftForwardStrand53: 5'->3' shift applied to forward-strand reads.
    :param shiftReverseStrand53: 5'->3' shift applied to reverse-strand reads.
    :param extendBP: Single-end mode: extend this many bp from the (shifted)
        5' cut site.
    :param maxInsertSize: Paired-end mode: skip templates longer than this.
    :param pairedEndMode: If > 0, count each template once via its
        first-in-pair mate and the TLEN field.
    :param inferFragmentLength: If > 0 (single-end, no explicit ``extendBP``),
        estimate the fragment length by cross-correlation and use it as
        ``extendBP``.
    :return: uint32 counts, one per interval.
    """
    cdef Py_ssize_t numIntervals = <Py_ssize_t>(((end - start) + stepSize - 1) // stepSize)
    cdef cnp.ndarray[cnp.uint32_t, ndim=1] values_np = np.zeros(numIntervals, dtype=np.uint32)
    cdef cnp.uint32_t[::1] values = values_np
    if numIntervals <= 0:
        return values

    # Optionally estimate fragment length for single-end extension; do this
    # before opening this function's own handle so an exception here cannot
    # leak it (cgetFragmentLength manages its own handle).
    if inferFragmentLength > 0 and pairedEndMode == 0 and extendBP == 0:
        extendBP = cgetFragmentLength(bamFile,
                                      chromosome,
                                      <int64_t>start,
                                      <int64_t>end,
                                      samThreads=samThreads,
                                      samFlagExclude=samFlagExclude,
                                      maxInsertSize=maxInsertSize,
                                      minInsertSize=<int64_t>(readLength+1), # xCorr peak > rlen ~~> fraglen
                                      )

    cdef AlignmentFile aln = AlignmentFile(bamFile, 'rb', threads=samThreads)
    cdef AlignedSegment read
    cdef int64_t start64 = start
    cdef int64_t end64 = end
    cdef int64_t step64 = stepSize
    cdef Py_ssize_t index0, index1, b_  # b_ now typed: keeps the bin loop at C speed
    cdef Py_ssize_t lastIndex = numIntervals - 1
    cdef bint readIsForward
    cdef int64_t readStart, readEnd
    cdef int64_t adjStart, adjEnd, fivePrime, mid, midIndex, tlen, atlen
    cdef uint16_t flag
    # single try/finally replaces the original's redundant `with aln` +
    # `finally: aln.close()` double-close
    try:
        for read in aln.fetch(chromosome, start64, end64):
            flag = <uint16_t>read.flag
            if flag & samFlagExclude:
                continue

            readIsForward = (flag & 16) == 0
            readStart = <int64_t>read.reference_start
            readEnd = <int64_t>read.reference_end

            if pairedEndMode > 0:
                if flag & 1 == 0:  # not a paired read
                    continue
                # use first in pair + fragment
                if flag & 128:
                    continue
                if (flag & 8) or read.next_reference_id != read.reference_id:
                    continue
                tlen = <int64_t>read.template_length
                atlen = tlen if tlen >= 0 else -tlen
                if atlen == 0 or atlen > maxInsertSize:
                    continue
                if tlen >= 0:
                    adjStart = readStart
                    adjEnd = readStart + atlen
                else:
                    adjEnd = readEnd
                    adjStart = adjEnd - atlen
                if shiftForwardStrand53 != 0 or shiftReverseStrand53 != 0:
                    if readIsForward:
                        adjStart += shiftForwardStrand53
                        adjEnd += shiftForwardStrand53
                    else:
                        adjStart -= shiftReverseStrand53
                        adjEnd -= shiftReverseStrand53
            else:
                # SE
                if readIsForward:
                    fivePrime = readStart + shiftForwardStrand53
                else:
                    fivePrime = (readEnd - 1) - shiftReverseStrand53

                if extendBP > 0:
                    # from the cut 5' --> 3'
                    if readIsForward:
                        adjStart = fivePrime
                        adjEnd = fivePrime + extendBP
                    else:
                        adjEnd = fivePrime + 1
                        adjStart = adjEnd - extendBP
                elif shiftForwardStrand53 != 0 or shiftReverseStrand53 != 0:
                    if readIsForward:
                        adjStart = readStart + shiftForwardStrand53
                        adjEnd = readEnd + shiftForwardStrand53
                    else:
                        adjStart = readStart - shiftReverseStrand53
                        adjEnd = readEnd - shiftReverseStrand53
                else:
                    adjStart = readStart
                    adjEnd = readEnd

            # clip to the segment; skip fragments falling entirely outside it
            if adjEnd <= start64 or adjStart >= end64:
                continue
            if adjStart < start64:
                adjStart = start64
            if adjEnd > end64:
                adjEnd = end64

            if oneReadPerBin:
                # +1 at midpoint of frag.
                mid = (adjStart + adjEnd) // 2
                midIndex = <Py_ssize_t>((mid - start64) // step64)
                if 0 <= midIndex <= lastIndex:
                    values[midIndex] += <uint32_t>1
            else:
                # +1 every interval intersecting frag
                index0 = <Py_ssize_t>((adjStart - start64) // step64)
                index1 = <Py_ssize_t>(((adjEnd - 1) - start64) // step64)
                if index0 < 0:
                    index0 = <Py_ssize_t>0
                if index1 > lastIndex:
                    index1 = lastIndex
                if index0 > lastIndex or index1 < 0 or index0 > index1:
                    continue
                for b_ in range(index0, index1 + 1):
                    values[b_] += <uint32_t>1
    finally:
        aln.close()

    return values
279
+
280
+
281
+
282
cpdef cnp.ndarray[cnp.float32_t, ndim=2] cinvertMatrixE(cnp.ndarray[cnp.float32_t, ndim=1] muncMatrixIter, cnp.float32_t priorCovarianceOO):
    r"""Invert the residual covariance matrix during the forward pass.

    The covariance is a positive-definite diagonal matrix plus the rank-one
    update :math:`P_{[i|i-1,11]}\,\mathbf{1}\mathbf{1}^{\top}`, so the inverse
    is obtained in closed form via Sherman-Morrison.

    :param muncMatrixIter: The diagonal elements of the covariance matrix at a given genomic interval.
    :type muncMatrixIter: cnp.ndarray[cnp.float32_t, ndim=1]
    :param priorCovarianceOO: The a priori 'primary' state variance :math:`P_{[i|i-1,11]}`.
    :type priorCovarianceOO: cnp.float32_t
    :return: The inverted covariance matrix.
    :rtype: cnp.ndarray[cnp.float32_t, ndim=2]
    """
    cdef int m = muncMatrixIter.size
    cdef cnp.ndarray[cnp.float32_t, ndim=2] inverse = np.empty((m, m), dtype=np.float32)
    # reciprocals of the diagonal (the input holds only the diagonal entries)
    cdef cnp.ndarray[cnp.float32_t, ndim=1] diagInverse = np.empty(m, dtype=np.float32)
    cdef cnp.ndarray[cnp.float32_t, ndim=1] uVec = np.empty(m, dtype=np.float32)
    cdef float rootPrior = np.sqrt(priorCovarianceOO)
    cdef float denom = 1.0
    cdef float scale, uRow
    cdef Py_ssize_t row, col
    # single pass: reciprocals, Sherman-Morrison denominator, and u-vector
    # (the original built these in two loops and noted they could be merged)
    for row in range(m):
        diagInverse[row] = 1.0 / (muncMatrixIter[row])
        denom += priorCovarianceOO * diagInverse[row]
        uVec[row] = rootPrior * diagInverse[row]
    scale = 1.0 / denom
    for row in range(m):
        uRow = uVec[row]
        inverse[row, row] = diagInverse[row] - (scale * uRow * uRow)
        for col in range(row + 1, m):
            inverse[row, col] = -scale * (uRow * uVec[col])
            inverse[col, row] = inverse[row, col]
    return inverse
321
+
322
+
323
cpdef cnp.ndarray[cnp.float32_t, ndim=1] cgetStateCovarTrace(
    cnp.float32_t[:, :, ::1] stateCovarMatrices
    ):
    r"""Trace of each 2x2 state covariance matrix in a stack.

    :param stateCovarMatrices: Contiguous (n, 2, 2) stack of covariances.
    :return: Length-n array; entry k is the sum of the two diagonal elements
        of matrix k.
    :rtype: cnp.ndarray[cnp.float32_t, ndim=1]
    """
    cdef Py_ssize_t numMatrices = stateCovarMatrices.shape[0]
    cdef cnp.ndarray[cnp.float32_t, ndim=1] trace = np.empty(numMatrices, dtype=np.float32)
    cdef cnp.float32_t[::1] traceView = trace
    cdef Py_ssize_t k
    for k in range(numMatrices):
        traceView[k] = stateCovarMatrices[k, 0, 0] + stateCovarMatrices[k, 1, 1]
    return trace
334
+
335
+
336
cpdef cnp.ndarray[cnp.float32_t, ndim=1] cgetPrecisionWeightedResidual(
    cnp.float32_t[:, ::1] postFitResiduals,
    cnp.float32_t[:, ::1] matrixMunc,
    ):
    r"""Precision-weighted average of post-fit residuals at each interval.

    For each of the n intervals, the m per-track residuals are averaged with
    weights 1 / (uncertainty + eps).

    NOTE(review): ``postFitResiduals`` is indexed ``[interval, track]`` while
    ``matrixMunc`` is indexed ``[track, interval]`` -- the inputs appear
    transposed relative to one another; confirm against callers.
    """
    cdef Py_ssize_t numIntervals = postFitResiduals.shape[0]
    cdef Py_ssize_t numTracks = postFitResiduals.shape[1]
    cdef cnp.ndarray[cnp.float32_t, ndim=1] weighted = np.empty(numIntervals, dtype=np.float32)
    cdef cnp.float32_t[::1] weightedView = weighted
    cdef Py_ssize_t i, j
    cdef float weightTotal, weightedSum, weight
    cdef float eps = 1e-12  # guard for zeros
    for i in range(numIntervals):
        weightTotal = 0.0
        weightedSum = 0.0
        for j in range(numTracks):
            weight = 1.0 / (<float>matrixMunc[j, i] + eps)
            weightedSum += (<float>postFitResiduals[i, j]) * weight
            weightTotal += weight
        if weightTotal > 0.0:
            weightedView[i] = <cnp.float32_t>(weightedSum / weightTotal)
        else:
            weightedView[i] = <cnp.float32_t>0.0
    return weighted
358
+
359
+
360
+
361
cpdef tuple updateProcessNoiseCovariance(cnp.ndarray[cnp.float32_t, ndim=2] matrixQ,
                                         cnp.ndarray[cnp.float32_t, ndim=2] matrixQCopy,
                                         float dStat,
                                         float dStatAlpha,
                                         float dStatd,
                                         float dStatPC,
                                         bint inflatedQ,
                                         float maxQ,
                                         float minQ):
    r"""Adjust process noise covariance matrix :math:`\mathbf{Q}_{[i]}` in place.

    When ``dStat`` exceeds ``dStatAlpha`` the matrix is scaled up; once
    ``dStat`` falls back below the threshold an inflated matrix is scaled
    down. Both directions saturate at ``maxQ`` / ``minQ``, rebuilding the
    off-diagonals from ``matrixQCopy`` so the original structure is kept.

    :param matrixQ: Current process noise covariance
    :param matrixQCopy: A copy of the initial original covariance matrix :math:`\mathbf{Q}_{[.]}`
    :param inflatedQ: Flag indicating if the process noise covariance is inflated
    :return: Updated process noise covariance matrix and inflated flag
    :rtype: tuple
    """
    cdef float factor, ratio
    cdef Py_ssize_t r, c
    if dStat > dStatAlpha:
        factor = np.sqrt(dStatd * np.abs(dStat - dStatAlpha) + dStatPC)
        if matrixQ[0, 0] * factor <= maxQ:
            # uniform inflation of all four entries
            for r in range(2):
                for c in range(2):
                    matrixQ[r, c] *= factor
        else:
            # cap reached: pin diagonals at maxQ, rescale off-diagonals
            ratio = maxQ / matrixQCopy[0, 0]
            matrixQ[0, 0] = maxQ
            matrixQ[1, 1] = maxQ
            matrixQ[0, 1] = matrixQCopy[0, 1] * ratio
            matrixQ[1, 0] = matrixQCopy[1, 0] * ratio
        inflatedQ = True

    elif dStat < dStatAlpha and inflatedQ:
        factor = np.sqrt(dStatd * np.abs(dStat - dStatAlpha) + dStatPC)
        if matrixQ[0, 0] / factor >= minQ:
            for r in range(2):
                for c in range(2):
                    matrixQ[r, c] /= factor
        else:
            # we've hit the minimum, no longer 'inflated'
            ratio = minQ / matrixQCopy[0, 0]
            matrixQ[0, 0] = minQ
            matrixQ[1, 1] = minQ
            matrixQ[0, 1] = matrixQCopy[0, 1] * ratio
            matrixQ[1, 0] = matrixQCopy[1, 0] * ratio
            inflatedQ = False
    return matrixQ, inflatedQ
411
+
412
+
413
cdef void _blockMax(double[::1] valuesView,
                    Py_ssize_t[::1] blockStartIndices,
                    Py_ssize_t[::1] blockSizes,
                    double[::1] outputView) noexcept:
    # For each sampled block b, write max(values[start_b : start_b + size_b])
    # into outputView[b]. Assumes every block lies within valuesView.
    cdef Py_ssize_t b, offset, blockStart, blockLength
    cdef double best, candidate
    for b in range(outputView.shape[0]):
        blockStart = blockStartIndices[b]
        blockLength = blockSizes[b]
        best = valuesView[blockStart]
        for offset in range(1, blockLength):
            candidate = valuesView[blockStart + offset]
            if candidate > best:
                best = candidate
        outputView[b] = best
428
+
429
+
430
cpdef double[::1] csampleBlockStats(cnp.ndarray[cnp.uint32_t, ndim=1] intervals,
                                    cnp.ndarray[cnp.float64_t, ndim=1] values,
                                    int expectedBlockSize,
                                    int iters,
                                    int randSeed,
                                    cnp.ndarray[cnp.uint8_t, ndim=1] excludeIdxMask):
    r"""Sample contiguous blocks in the response sequence (xCorr), record maxima, and repeat.

    Used to build an empirical null distribution and determine significance of response outputs.
    The size of blocks is drawn from a truncated geometric distribution, preserving rough equality
    in expectation but allowing for variability to account for the sampling across different phases
    in the response sequence.

    :param intervals: Genomic interval start positions; only its length is used
        to bound block placement.
    :type intervals: cnp.ndarray[cnp.uint32_t, ndim=1]
    :param values: The response sequence to sample from.
    :type values: cnp.ndarray[cnp.float64_t, ndim=1]
    :param expectedBlockSize: The expected size (geometric) of the blocks to sample.
    :type expectedBlockSize: int
    :param iters: The number of blocks to sample.
    :type iters: int
    :param randSeed: Random seed for reproducibility.
    :type randSeed: int
    :param excludeIdxMask: 1/0 mask; windows touching a non-zero entry are not
        used as block start positions.
    :type excludeIdxMask: cnp.ndarray[cnp.uint8_t, ndim=1]
    :return: An array of sampled block maxima.
    :rtype: cnp.ndarray[cnp.float64_t, ndim=1]
    :raises ValueError: If no admissible block start positions exist.
    :seealso: :func:`consenrich.matching.matchWavelet`
    """
    np.random.seed(randSeed)
    cdef cnp.ndarray[cnp.float64_t, ndim=1] valuesArr = np.ascontiguousarray(values, dtype=np.float64)
    cdef double[::1] valuesView = valuesArr
    cdef cnp.ndarray[cnp.intp_t, ndim=1] sizesArr
    cdef cnp.ndarray[cnp.float64_t, ndim=1] out = np.empty(iters, dtype=np.float64)
    cdef Py_ssize_t maxBlockLength, maxSize, minSize
    cdef Py_ssize_t n = <Py_ssize_t>intervals.size
    cdef double maxBlockScale = <double>3.0
    cdef double minBlockScale = <double> (1.0 / 3.0)

    # truncate the geometric sizes to [max(3, E/3), min(3E, n)]
    minSize = <Py_ssize_t> max(3, expectedBlockSize * minBlockScale)
    maxSize = <Py_ssize_t> min(maxBlockScale * expectedBlockSize, n)
    sizesArr = np.random.geometric(1.0 / expectedBlockSize, size=iters).astype(np.intp, copy=False)
    np.clip(sizesArr, minSize, maxSize, out=sizesArr)
    maxBlockLength = sizesArr.max()
    cdef list support = []
    cdef cnp.intp_t i_ = 0
    # admissible start positions: the largest sampled block starting there
    # must not touch any excluded index
    while i_ < n-maxBlockLength:
        if excludeIdxMask[i_:i_ + maxBlockLength].any():
            i_ = i_ + maxBlockLength + 1
            continue
        support.append(i_)
        i_ = i_ + 1

    # np.random.choice on an empty sequence raises an unhelpful error;
    # fail early with a clear message (same exception type)
    if not support:
        raise ValueError(
            "csampleBlockStats: no admissible block start positions "
            "(excludeIdxMask blocks every window); cannot sample block maxima."
        )

    cdef cnp.ndarray[cnp.intp_t, ndim=1] samples = np.random.choice(
        support,
        size=iters,
        replace=True,
        p=None
    ).astype(np.intp)

    cdef Py_ssize_t[::1] startsView = samples
    cdef Py_ssize_t[::1] sizesView = sizesArr
    cdef double[::1] outView = out
    _blockMax(valuesView, startsView, sizesView, outView)
    return out
492
+
493
+
494
cpdef cSparseAvg(cnp.float32_t[::1] trackALV, dict sparseMap):
    r"""Fast access and average of `numNearest` sparse elements.

    See :func:`consenrich.core.getMuncTrack`

    :param trackALV: See :func:`consenrich.core.getAverageLocalVarianceTrack`
    :type trackALV: float[::1]
    :param sparseMap: See :func:`consenrich.core.getSparseMap`
    :type sparseMap: dict[int, np.ndarray]
    :return: array of mean('nearest local variances') same length as `trackALV`
    :rtype: cnp.ndarray[cnp.float32_t, ndim=1]
    """
    cdef Py_ssize_t length = <Py_ssize_t>trackALV.shape[0]
    cdef cnp.ndarray[cnp.float32_t, ndim=1] averaged = np.empty(length, dtype=np.float32)
    cdef Py_ssize_t pos, t, nearestCount
    cdef float accum = 0.0
    cdef cnp.ndarray[cnp.intp_t, ndim=1] nearestIdx
    cdef cnp.intp_t[::1] nearestView
    for pos in range(length):
        # FFR: to avoid the cast, create sparseMap as dict[intp, np.ndarray[intp]]
        nearestIdx = <cnp.ndarray[cnp.intp_t, ndim=1]> sparseMap[pos]
        nearestView = nearestIdx
        # FFR: maybe enforce strict `m == numNearest` in future releases to avoid extra overhead
        nearestCount = nearestView.shape[0]
        if nearestCount == 0:
            # this case probably warrants an exception or np.nan
            averaged[pos] = 0.0
            continue
        accum = 0.0
        with nogil:
            for t in range(nearestCount):
                accum += trackALV[nearestView[t]]
        averaged[pos] = accum / nearestCount

    return averaged
527
+
528
+
529
cpdef int64_t cgetFragmentLength(
    str bamFile,
    str chromosome,
    int64_t start,
    int64_t end,
    uint16_t samThreads=0,
    uint16_t samFlagExclude=3844,
    int64_t maxInsertSize=1000,
    int64_t minInsertSize=50,
    int64_t iters=1000,
    int64_t blockSize=5000,
    int64_t fallBack=147,
    int64_t rollingChunkSize=250,
    int64_t lagStep=5,
    int64_t earlyExit=100,
    int64_t randSeed=42,
    ):
    r"""Estimate the fragment length via strand cross-correlation.

    High-coverage blocks of ``blockSize`` bp are selected with a rolling
    median of per-chunk read counts; within each block the lag maximizing the
    cross-correlation between forward- and reverse-strand coverage is
    recorded, and the median best lag over sampled blocks is returned.

    :param bamFile: See :class:`consenrich.core.inputParams`.
    :param chromosome: Chromosome name.
    :param start: 0-based inclusive start of the scanned region.
    :param end: 0-based exclusive end of the scanned region.
    :param samThreads: BAM decompression threads; if < 1, a value is derived
        from the CPU count (at most 4).
    :param samFlagExclude: Reads with any of these SAM flag bits are skipped.
    :param maxInsertSize: Upper bound for the estimate (and candidate lags).
    :param minInsertSize: Lower bound for the estimate (and candidate lags).
    :param iters: Maximum number of high-coverage chunks considered.
    :param blockSize: Width (bp) of each cross-correlated block.
    :param fallBack: Returned when the region is too small or fewer than 3
        informative blocks are found.
    :param rollingChunkSize: Chunk width (bp) of the coarse coverage track.
    :param lagStep: Stride between candidate lags.
    :param earlyExit: Stop after this many best-lag observations.
    :param randSeed: Random seed for reproducibility.
    :return: Estimated fragment length in bp, clipped to
        ``[minInsertSize, maxInsertSize]``; otherwise ``fallBack``.
    :rtype: int64_t
    """
    np.random.seed(randSeed)
    cdef int64_t regionLen, numRollSteps, numChunks
    cdef cnp.ndarray[cnp.float64_t, ndim=1] rawArr
    cdef cnp.ndarray[cnp.float64_t, ndim=1] medArr
    cdef AlignmentFile aln
    cdef AlignedSegment read_
    cdef list coverageIdxTopK
    cdef list blockCenters
    cdef list bestLags
    cdef int i, j, k, idxVal
    cdef int startIdx, endIdx
    cdef int winSize, takeK
    cdef int blockHalf, readFlag
    cdef int chosenLag, lag, maxValidLag
    cdef int samThreads_
    cdef Py_ssize_t expandedLen
    cdef int64_t blockStartBP, blockEndBP, readStart, readEnd, med
    cdef double score
    cdef cnp.ndarray[cnp.intp_t, ndim=1] unsortedIdx, sortedIdx, expandedIdx
    cdef cnp.intp_t[::1] expandedIdxView
    cdef cnp.ndarray[cnp.float64_t, ndim=1] unsortedVals
    cdef cnp.ndarray[cnp.uint8_t, ndim=1] seen
    # fwd/rev are produced per-block by np.cumsum, so no preallocation is
    # needed (the original preallocated buffers that were rebound before use)
    cdef cnp.ndarray[cnp.float64_t, ndim=1] fwd
    cdef cnp.ndarray[cnp.float64_t, ndim=1] rev
    cdef cnp.ndarray[cnp.float64_t, ndim=1] fwdDiff = np.zeros(blockSize + 1, dtype=np.float64, order='C')
    cdef cnp.ndarray[cnp.float64_t, ndim=1] revDiff = np.zeros(blockSize + 1, dtype=np.float64, order='C')
    cdef int64_t diffS, diffE = 0
    cdef cnp.ndarray[cnp.float64_t, ndim=1] xCorr
    cdef cnp.ndarray[cnp.uint32_t, ndim=1] bestLagsArr

    earlyExit = min(earlyExit, iters)
    regionLen = end - start

    samThreads_ = <int>samThreads
    numCPUs = os.cpu_count()
    if numCPUs is None:
        numCPUs = 1
    if samThreads < 1:
        samThreads_ = <int>min(max(1, numCPUs // 2), 4)

    if regionLen < blockSize or regionLen <= 0:
        return <int64_t>fallBack

    if maxInsertSize < 1:
        maxInsertSize = 1
    if minInsertSize < 1:
        minInsertSize = 1
    if minInsertSize > maxInsertSize:
        minInsertSize, maxInsertSize = maxInsertSize, minInsertSize

    # first, we build a coarse read coverage track from `start` to `end`
    numRollSteps = regionLen // rollingChunkSize
    if numRollSteps <= 0:
        numRollSteps = 1
    numChunks = <int>numRollSteps

    rawArr = np.zeros(numChunks, dtype=np.float64)
    medArr = np.zeros(numChunks, dtype=np.float64)

    aln = AlignmentFile(bamFile, "rb", threads=samThreads_)
    try:
        for read_ in aln.fetch(chromosome, start, end):
            if (read_.flag & samFlagExclude) != 0:
                continue
            if read_.reference_start < start or read_.reference_end >= end:
                continue
            j = <int>((read_.reference_start - start) // rollingChunkSize)
            if 0 <= j < numChunks:
                rawArr[j] += 1.0

        # second, we apply a rolling order-statistic filter (median);
        # the kernel size is based on blockSize -- we want high-coverage
        # blocks as measured by their local median read counts
        winSize = <int>(blockSize // rollingChunkSize)
        if winSize < 1:
            winSize = 1
        if (winSize & 1) == 0:
            winSize += 1
        medArr[:] = ndimage.median_filter(rawArr, size=winSize, mode="nearest")

        # we pick the largest local-medians and form a block around each
        takeK = iters if iters < numChunks else numChunks
        unsortedIdx = np.argpartition(medArr, -takeK)[-takeK:]
        unsortedVals = medArr[unsortedIdx]
        sortedIdx = unsortedIdx[np.argsort(unsortedVals)[::-1]]
        coverageIdxTopK = sortedIdx[:takeK].tolist()
        expandedLen = takeK * winSize
        expandedIdx = np.empty(expandedLen, dtype=np.intp)
        expandedIdxView = expandedIdx
        k = 0
        for i in range(takeK):
            idxVal = coverageIdxTopK[i]
            startIdx = idxVal - (winSize // 2)
            endIdx = startIdx + winSize
            if startIdx < 0:
                startIdx = 0
                endIdx = winSize if winSize < numChunks else numChunks
            if endIdx > numChunks:
                endIdx = numChunks
                startIdx = endIdx - winSize if winSize <= numChunks else 0
            for j in range(startIdx, endIdx):
                expandedIdxView[k] = j
                k += 1
        if k < expandedLen:
            expandedIdx = expandedIdx[:k]
            expandedIdxView = expandedIdx

        # deduplicate chunk indices, preserving first-seen order
        seen = np.zeros(numChunks, dtype=np.uint8)
        blockCenters = []
        for i in range(expandedIdx.shape[0]):
            j = <int>expandedIdxView[i]
            if seen[j] == 0:
                seen[j] = 1
                blockCenters.append(j)

        if len(blockCenters) > 1:
            blockCenters = np.random.choice(
                blockCenters,
                size=len(blockCenters),
                replace=False,
                p=None
            ).tolist()

        bestLags = []
        blockHalf = blockSize // 2

        for idxVal in blockCenters:
            # this should map back to genomic coordinates
            blockStartBP = start + idxVal * rollingChunkSize + (rollingChunkSize // 2) - blockHalf
            if blockStartBP < start:
                blockStartBP = start
            blockEndBP = blockStartBP + blockSize
            if blockEndBP > end:
                blockEndBP = end
                blockStartBP = blockEndBP - blockSize
            if blockStartBP < start:
                continue

            # now we build strand-specific tracks over each block
            fwdDiff.fill(0.0)
            revDiff.fill(0.0)

            for read_ in aln.fetch(chromosome, blockStartBP, blockEndBP):
                readFlag = read_.flag
                if (readFlag & samFlagExclude) != 0:
                    continue
                readStart = min(read_.reference_start, read_.reference_end)
                readEnd = max(read_.reference_start, read_.reference_end)
                if readStart < blockStartBP or readEnd > blockEndBP:
                    continue
                diffS = readStart - blockStartBP
                diffE = readEnd - blockStartBP
                if (readFlag & 16) == 0:
                    # forward
                    fwdDiff[diffS] += 1.0
                    fwdDiff[diffE] -= 1.0
                else:
                    # reverse
                    revDiff[diffS] += 1.0
                    revDiff[diffE] -= 1.0
            fwd = np.cumsum(fwdDiff[:blockSize], dtype=np.float64)
            rev = np.cumsum(revDiff[:blockSize], dtype=np.float64)

            # maximizes the crossCovar(forward, reverse, lag) wrt lag.
            fwd = (fwd - np.mean(fwd))
            rev = (rev - np.mean(rev))
            maxValidLag = maxInsertSize if (maxInsertSize < blockSize) else (blockSize - 1)
            # checked before the lag loop (the original checked after an
            # already-empty loop and a wasted xCorr allocation)
            if maxValidLag < minInsertSize:
                continue
            xCorr = np.zeros(maxInsertSize + 1, dtype=np.float64)
            for lag in range(minInsertSize, maxValidLag + 1, lagStep):
                score = 0.0
                for i in range(blockSize - lag):
                    score += fwd[i] * rev[i + lag]
                xCorr[lag] = score

            chosenLag = int(np.argmax(xCorr[minInsertSize:maxValidLag + 1])) + minInsertSize

            if chosenLag > 0:
                bestLags.append(chosenLag)
            if len(bestLags) >= earlyExit:
                break

    finally:
        aln.close()

    if len(bestLags) < 3:
        return fallBack

    bestLagsArr = np.asarray(bestLags, dtype=np.uint32)
    med = int(np.median(bestLagsArr))
    if med < minInsertSize:
        med = <int>minInsertSize
    elif med > maxInsertSize:
        med = <int>maxInsertSize
    return med
746
+
747
+
748
cdef inline Py_ssize_t getInsertion(const uint32_t* array_, Py_ssize_t n, uint32_t x) nogil:
    # helper: binary search for the upper-bound insertion point of x in the
    # sorted `array_` (index of the first element strictly greater than x)
    cdef Py_ssize_t lo = 0
    cdef Py_ssize_t hi = n
    cdef Py_ssize_t mid
    while lo < hi:
        mid = lo + ((hi - lo) >> 1)
        if x < array_[mid]:
            hi = mid
        else:
            lo = mid + 1
    return lo
760
+
761
+
762
cdef int maskMembership(const uint32_t* pos, Py_ssize_t numIntervals, const uint32_t* mStarts, const uint32_t* mEnds, Py_ssize_t n, uint8_t* outMask) nogil:
    # Set outMask[i] = 1 iff pos[i] lies in [mStarts[k], mEnds[k]) for the
    # rightmost interval k with mStarts[k] <= pos[i]; assumes the intervals
    # are sorted and non-overlapping (merged).
    cdef Py_ssize_t i, k
    cdef uint32_t p
    for i in range(numIntervals):
        p = pos[i]
        k = getInsertion(mStarts, n, p) - 1
        if k >= 0 and p < mEnds[k]:
            outMask[i] = <uint8_t>1
        else:
            outMask[i] = <uint8_t>0
    return 0
775
+
776
+
777
cpdef cnp.ndarray[cnp.uint8_t, ndim=1] cbedMask(
    str chromosome,
    str bedFile,
    cnp.ndarray[cnp.uint32_t, ndim=1] intervals,
    int stepSize
    ):
    r"""Return a 1/0 mask for intervals overlapping a sorted and merged BED file.

    :param chromosome: Chromosome name.
    :type chromosome: str
    :param bedFile: Path to a sorted and merged BED file.
    :type bedFile: str
    :param intervals: Array of sorted, non-overlapping start positions of genomic intervals.
        Each interval is assumed `stepSize`.
    :type intervals: cnp.ndarray[cnp.uint32_t, ndim=1]
    :param stepSize: Step size between genomic positions in `intervals`.
    :type stepSize: int32_t
    :return: A mask s.t. `1` indicates the corresponding interval overlaps a BED region.
    :rtype: cnp.ndarray[cnp.uint8_t, ndim=1]

    """
    cdef list startsList = []
    cdef list endsList = []
    cdef str line
    cdef list cols
    # `with` guarantees the handle is closed even if parsing raises
    # (replaces the original's manual try/finally around open())
    with open(bedFile, "r") as f:
        for line in f:
            line = line.strip()
            if not line or line[0] == '#':
                continue
            cols = line.split('\t')
            # str.split never returns an empty list, so only the length check
            # is needed (the original also tested `not cols`)
            if len(cols) < 3:
                continue
            if cols[0] != chromosome:
                continue
            startsList.append(int(cols[1]))
            endsList.append(int(cols[2]))
    cdef Py_ssize_t numIntervals = intervals.size
    cdef cnp.ndarray[cnp.uint8_t, ndim=1] mask = np.zeros(numIntervals, dtype=np.uint8)
    if not startsList:
        return mask
    cdef cnp.ndarray[cnp.uint32_t, ndim=1] starts = np.asarray(startsList, dtype=np.uint32)
    cdef cnp.ndarray[cnp.uint32_t, ndim=1] ends = np.asarray(endsList, dtype=np.uint32)
    cdef cnp.uint32_t[:] startsView = starts
    cdef cnp.uint32_t[:] endsView = ends
    cdef cnp.uint32_t[:] posView = intervals
    cdef cnp.uint8_t[:] outView = mask
    # raw pointers for the nogil membership scan
    cdef uint32_t* svPtr = &startsView[0] if starts.size > 0 else <uint32_t*>NULL
    cdef uint32_t* evPtr = &endsView[0] if ends.size > 0 else <uint32_t*>NULL
    cdef uint32_t* posPtr = &posView[0] if numIntervals > 0 else <uint32_t*>NULL
    cdef uint8_t* outPtr = &outView[0] if numIntervals > 0 else <uint8_t*>NULL
    cdef Py_ssize_t n = starts.size
    with nogil:
        if numIntervals > 0 and n > 0:
            maskMembership(posPtr, numIntervals, svPtr, evPtr, n, outPtr)
    return mask