consenrich 0.7.4b3__cp311-cp311-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of consenrich might be problematic. Click here for more details.

@@ -0,0 +1,861 @@
1
+ # -*- coding: utf-8 -*-
2
+ # cython: boundscheck=False, wraparound=False, cdivision=True, nonecheck=False, initializedcheck=False, infer_types=True, language_level=3
3
+ # distutils: language = c
4
+ r"""Cython module for Consenrich core functions.
5
+
6
+ This module contains Cython implementations of core functions used in Consenrich.
7
+ """
8
+
9
+ cimport cython
10
+
11
+ import os
12
+ import numpy as np
13
+ from scipy import ndimage
14
+ import pysam
15
+
16
+ cimport numpy as cnp
17
+ from libc.stdint cimport int64_t, uint8_t, uint16_t, uint32_t, uint64_t
18
+ from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
19
+ from libc.float cimport DBL_EPSILON
20
+
21
+ cnp.import_array()
22
+
23
cpdef int stepAdjustment(int value, int stepSize, int pushForward=0):
    r"""Snap a value down to the nearest multiple of ``stepSize``, optionally pushing it forward.

    :param value: The value to adjust.
    :type value: int
    :param stepSize: The step size to adjust to.
    :type stepSize: int
    :param pushForward: If non-zero, pushes the value forward by ``pushForward * stepSize``.
    :type pushForward: int
    :return: The adjusted value (never below zero before the forward push).
    :rtype: int
    """
    # Drop the remainder to land on a multiple of stepSize, clamp at zero,
    # then apply the optional forward offset.
    cdef int snapped = value - (value % stepSize)
    if snapped < 0:
        snapped = 0
    return snapped + pushForward * stepSize
36
+
37
+
38
cpdef uint64_t cgetFirstChromRead(str bamFile, str chromosome, uint64_t chromLength, uint32_t samThreads, int samFlagExclude):
    r"""Get the start position of the first read in a BAM file for a given chromosome.

    :param bamFile: See :func:`consenrich.core.inputParams`.
    :type bamFile: str
    :param chromosome: Chromosome name.
    :type chromosome: str
    :param chromLength: Length of the chromosome in base pairs.
    :type chromLength: uint64_t
    :param samThreads: Number of threads to use for reading the BAM file.
    :type samThreads: uint32_t
    :param samFlagExclude: SAM flags to exclude reads (e.g., unmapped).
    :type samFlagExclude: int
    :return: Start position of the first read in the chromosome, or 0 if no reads are found.
    :rtype: uint64_t
    """
    cdef AlignmentFile aln = AlignmentFile(bamFile, 'rb', threads=samThreads)
    cdef AlignedSegment read
    # try/finally ensures the BAM handle is released even if fetch() raises
    # (e.g. missing index), matching the pattern used by creadBamSegment.
    try:
        for read in aln.fetch(contig=chromosome, start=0, end=chromLength):
            if not (read.flag & samFlagExclude):
                # first read passing the flag filter wins
                return read.reference_start
    finally:
        aln.close()
    return 0
63
+
64
+
65
cpdef uint64_t cgetLastChromRead(str bamFile, str chromosome, uint64_t chromLength, uint32_t samThreads, int samFlagExclude):
    r"""Get the end position of the last read in a BAM file for a given chromosome.

    :param bamFile: See :func:`consenrich.core.inputParams`.
    :type bamFile: str
    :param chromosome: Chromosome name.
    :type chromosome: str
    :param chromLength: Length of the chromosome in base pairs.
    :type chromLength: uint64_t
    :param samThreads: Number of threads to use for reading the BAM file.
    :type samThreads: uint32_t
    :param samFlagExclude: See :class:`consenrich.core.samParams`.
    :type samFlagExclude: int
    :return: End position of the last read in the chromosome, or 0 if no reads are found.
    :rtype: uint64_t
    """
    # Only scan the trailing window (at most 1 Mb, at most half the chromosome)
    # rather than the whole contig; the last passing read's end is kept.
    cdef uint64_t start_ = chromLength - min((chromLength // 2), 1_000_000)
    cdef uint64_t lastPos = 0
    cdef AlignmentFile aln = AlignmentFile(bamFile, 'rb', threads=samThreads)
    cdef AlignedSegment read
    # try/finally ensures the BAM handle is released even if fetch() raises.
    try:
        for read in aln.fetch(contig=chromosome, start=start_, end=chromLength):
            if not (read.flag & samFlagExclude):
                lastPos = read.reference_end
    finally:
        aln.close()
    return lastPos
91
+
92
+
93
+
94
cpdef uint32_t cgetReadLength(str bamFile, uint32_t minReads, uint32_t samThreads, uint32_t maxIterations, int samFlagExclude):
    r"""Get the median read length from a BAM file after fetching a specified number of reads.

    :param bamFile: see :class:`consenrich.core.inputParams`.
    :type bamFile: str
    :param minReads: Minimum number of reads to consider for the median calculation.
    :type minReads: uint32_t
    :param samThreads: See :class:`consenrich.core.samParams`.
    :type samThreads: uint32_t
    :param maxIterations: Maximum number of reads to iterate over.
    :type maxIterations: uint32_t
    :param samFlagExclude: See :class:`consenrich.core.samParams`.
    :type samFlagExclude: int
    :return: Median read length from the BAM file, or 0 if fewer than
        ``minReads`` passing reads were observed.
    :rtype: uint32_t
    """
    cdef uint32_t observedReads = 0
    cdef uint32_t currentIterations = 0
    cdef AlignmentFile aln = AlignmentFile(bamFile, 'rb', threads=samThreads)
    cdef AlignedSegment read
    # Preallocate for the worst case: every fetched read is accepted.
    cdef cnp.ndarray[cnp.uint32_t, ndim=1] readLengths = np.zeros(maxIterations, dtype=np.uint32)
    cdef uint32_t i = 0
    # try/finally ensures the BAM handle is closed even if aln.mapped (needs an
    # index) or fetch() raises.
    try:
        if <uint32_t>aln.mapped < minReads:
            return 0
        for read in aln.fetch():
            # Stop once we have enough reads or have scanned the iteration budget.
            if observedReads >= minReads or currentIterations >= maxIterations:
                break
            if not (read.flag & samFlagExclude):
                # meets criteria -> record its length
                readLengths[i] = read.query_length
                observedReads += 1
                i += 1
            currentIterations += 1
    finally:
        aln.close()
    if observedReads < minReads:
        return 0
    return <uint32_t>np.median(readLengths[:observedReads])
132
+
133
+
134
cdef inline Py_ssize_t floordiv64(int64_t a, int64_t b) nogil:
    # Floor division for signed 64-bit integers. With `cdivision=True` (set in
    # the module header), `//` on C ints truncates toward zero, so negative
    # numerators need correction: compute ceil(-a / b) and negate, which
    # equals floor(a / b). Assumes b > 0.
    if a >= 0:
        return <Py_ssize_t>(a // b)
    else:
        return <Py_ssize_t>(- ((-a + b - 1) // b))
139
+
140
+
141
cpdef cnp.uint32_t[:] creadBamSegment(
    str bamFile,
    str chromosome,
    uint32_t start,
    uint32_t end,
    uint32_t stepSize,
    int64_t readLength,
    uint8_t oneReadPerBin,
    uint16_t samThreads,
    uint16_t samFlagExclude,
    int64_t shiftForwardStrand53 = 0,
    int64_t shiftReverseStrand53 = 0,
    int64_t extendBP = 0,
    int64_t maxInsertSize=1000,
    int64_t pairedEndMode=0,
    int64_t inferFragmentLength=0):
    r"""Count reads in a BAM file for a given chromosome segment.

    Bins reads (or inferred fragments) over ``[start, end)`` in windows of
    ``stepSize`` bp. Reads failing ``samFlagExclude`` are skipped. In
    paired-end mode the fragment implied by the template length of the first
    mate is counted; in single-end mode reads may be strand-shifted and/or
    extended from their 5' cut site.

    :param bamFile: Path to an indexed BAM file.
    :param chromosome: Chromosome name.
    :param start: Segment start (bp).
    :param end: Segment end (bp).
    :param stepSize: Bin width (bp).
    :param readLength: Used only to derive ``minInsertSize`` when inferring
        fragment length.
    :param oneReadPerBin: If non-zero, add +1 only at the fragment midpoint's
        bin; otherwise +1 to every bin the fragment overlaps.
    :param samThreads: htslib decompression threads.
    :param samFlagExclude: SAM flag bitmask; reads matching any bit are skipped.
    :param shiftForwardStrand53: 5'->3' shift applied to forward-strand reads.
    :param shiftReverseStrand53: 5'->3' shift applied to reverse-strand reads.
    :param extendBP: Single-end only: extend ``extendBP`` bases from the 5' cut.
    :param maxInsertSize: Paired-end only: discard templates longer than this.
    :param pairedEndMode: If > 0, count fragments from proper pairs.
    :param inferFragmentLength: If > 0 (and single-end, no explicit extension),
        estimate ``extendBP`` via :func:`cgetFragmentLength`.
    :return: Per-bin counts, length ``ceil((end-start)/stepSize)``.
    :rtype: cnp.uint32_t[:]
    """
    # Number of bins: ceiling division so a partial trailing bin is included.
    cdef Py_ssize_t numIntervals = <Py_ssize_t>(((end - start) + stepSize - 1) // stepSize)

    cdef cnp.ndarray[cnp.uint32_t, ndim=1] values_np = np.zeros(numIntervals, dtype=np.uint32)
    cdef cnp.uint32_t[::1] values = values_np

    if numIntervals <= 0:
        return values

    cdef AlignmentFile aln = AlignmentFile(bamFile, 'rb', threads=samThreads)
    cdef AlignedSegment read
    cdef int64_t start64 = start
    cdef int64_t end64 = end
    cdef int64_t step64 = stepSize
    cdef Py_ssize_t i, index0, index1
    cdef Py_ssize_t lastIndex = numIntervals - 1
    cdef bint readIsForward
    cdef int64_t readStart, readEnd
    cdef int64_t adjStart, adjEnd, fivePrime, mid, midIndex, tlen, atlen
    cdef uint16_t flag
    # Optionally estimate the single-end extension length from the data
    # (cross-correlation peak); only when no explicit extension was given.
    if inferFragmentLength > 0 and pairedEndMode == 0 and extendBP == 0:
        extendBP = cgetFragmentLength(bamFile,
                                      chromosome,
                                      <int64_t>start,
                                      <int64_t>end,
                                      samThreads=samThreads,
                                      samFlagExclude=samFlagExclude,
                                      maxInsertSize=maxInsertSize,
                                      minInsertSize=<int64_t>(readLength+1),  # xCorr peak > rlen ~~> fraglen
                                      )
    try:
        with aln:
            for read in aln.fetch(chromosome, start64, end64):
                flag = <uint16_t>read.flag
                if flag & samFlagExclude:
                    continue

                readIsForward = (flag & 16) == 0  # bit 16 set => reverse strand
                readStart = <int64_t>read.reference_start
                readEnd = <int64_t>read.reference_end

                if pairedEndMode > 0:
                    if flag & 1 == 0:  # not a paired read
                        continue
                    # use first in pair + fragment
                    if flag & 128:
                        continue
                    # mate unmapped or on a different contig: no usable template
                    if (flag & 8) or read.next_reference_id != read.reference_id:
                        continue
                    tlen = <int64_t>read.template_length
                    atlen = tlen if tlen >= 0 else -tlen
                    if atlen == 0 or atlen > maxInsertSize:
                        continue
                    # Reconstruct the fragment span from the signed template length.
                    if tlen >= 0:
                        adjStart = readStart
                        adjEnd = readStart + atlen
                    else:
                        adjEnd = readEnd
                        adjStart = adjEnd - atlen
                    if shiftForwardStrand53 != 0 or shiftReverseStrand53 != 0:
                        if readIsForward:
                            adjStart += shiftForwardStrand53
                            adjEnd += shiftForwardStrand53
                        else:
                            adjStart -= shiftReverseStrand53
                            adjEnd -= shiftReverseStrand53
                else:
                    # SE: locate the 5' end of the read on its strand
                    if readIsForward:
                        fivePrime = readStart + shiftForwardStrand53
                    else:
                        fivePrime = (readEnd - 1) - shiftReverseStrand53

                    if extendBP > 0:
                        # from the cut 5' --> 3'
                        if readIsForward:
                            adjStart = fivePrime
                            adjEnd = fivePrime + extendBP
                        else:
                            adjEnd = fivePrime + 1
                            adjStart = adjEnd - extendBP
                    elif shiftForwardStrand53 != 0 or shiftReverseStrand53 != 0:
                        if readIsForward:
                            adjStart = readStart + shiftForwardStrand53
                            adjEnd = readEnd + shiftForwardStrand53
                        else:
                            adjStart = readStart - shiftReverseStrand53
                            adjEnd = readEnd - shiftReverseStrand53
                    else:
                        adjStart = readStart
                        adjEnd = readEnd

                # Clip the (possibly shifted/extended) span to the segment.
                if adjEnd <= start64 or adjStart >= end64:
                    continue
                if adjStart < start64:
                    adjStart = start64
                if adjEnd > end64:
                    adjEnd = end64

                if oneReadPerBin:
                    # +1 at midpoint of frag.
                    mid = (adjStart + adjEnd) // 2
                    midIndex = <Py_ssize_t>((mid - start64) // step64)
                    if 0 <= midIndex <= lastIndex:
                        values[midIndex] += <uint32_t>1
                else:
                    # +1 every interval intersecting frag
                    index0 = <Py_ssize_t>((adjStart - start64) // step64)
                    index1 = <Py_ssize_t>(((adjEnd - 1) - start64) // step64)
                    if index0 < 0:
                        index0 = <Py_ssize_t>0
                    if index1 > lastIndex:
                        index1 = lastIndex
                    if index0 > lastIndex or index1 < 0 or index0 > index1:
                        continue
                    for b_ in range(index0, index1 + 1):
                        values[b_] += <uint32_t>1

    finally:
        aln.close()

    return values
280
+
281
+
282
+
283
cpdef cnp.ndarray[cnp.float32_t, ndim=2] cinvertMatrixE(cnp.ndarray[cnp.float32_t, ndim=1] muncMatrixIter, cnp.float32_t priorCovarianceOO):
    r"""Invert the residual covariance matrix during the forward pass.

    The residual covariance is ``D + priorCovarianceOO * 1 * 1^T`` where ``D``
    is diagonal (``muncMatrixIter``): a positive-definite diagonal plus a
    rank-one update. Its inverse is computed in closed form via the
    Sherman-Morrison identity: ``D^-1 - (D^-1 u u^T D^-1) / (1 + u^T D^-1 u)``
    with ``u = sqrt(priorCovarianceOO) * 1``.

    :param muncMatrixIter: The diagonal elements of the covariance matrix at a given genomic interval.
    :type muncMatrixIter: cnp.ndarray[cnp.float32_t, ndim=1]
    :param priorCovarianceOO: The a priori 'primary' state variance :math:`P_{[i|i-1,11]}`.
    :type priorCovarianceOO: cnp.float32_t
    :return: The inverted covariance matrix.
    :rtype: cnp.ndarray[cnp.float32_t, ndim=2]
    """

    cdef int m = muncMatrixIter.size
    # we have to invert a P.D. covariance (diagonal) and rank-one (1*priorCovariance) matrix
    cdef cnp.ndarray[cnp.float32_t, ndim=2] inverse = np.empty((m, m), dtype=np.float32)
    # note, not actually an m-dim matrix, just the diagonal elements taken as input
    cdef cnp.ndarray[cnp.float32_t, ndim=1] muncMatrixInverse = np.empty(m, dtype=np.float32)
    cdef float sqrtPrior = np.sqrt(priorCovarianceOO)
    cdef cnp.ndarray[cnp.float32_t, ndim=1] uVec = np.empty(m, dtype=np.float32)
    # divisor accumulates 1 + sum_j priorCovarianceOO / D_jj (the S-M denominator)
    cdef float divisor = 1.0
    cdef float scale
    cdef float uVecI
    cdef Py_ssize_t i, j
    for i in range(m):
        # two birds: build up the trace while taking the reciprocals
        muncMatrixInverse[i] = 1.0/(muncMatrixIter[i])
        divisor += priorCovarianceOO*muncMatrixInverse[i]
    # we can combine these two loops, keeping construction
    # of muncMatrixInverse and uVec separate for now in case
    # we want to parallelize this later
    for i in range(m):
        uVec[i] = sqrtPrior*muncMatrixInverse[i]
    scale = 1.0 / divisor
    # Fill the symmetric inverse: diagonal, then mirrored off-diagonal entries.
    for i in range(m):
        uVecI = uVec[i]
        inverse[i, i] = muncMatrixInverse[i]-(scale*uVecI*uVecI)
        for j in range(i + 1, m):
            inverse[i, j] = -scale * (uVecI*uVec[j])
            inverse[j, i] = inverse[i, j]
    return inverse
322
+
323
+
324
cpdef cnp.ndarray[cnp.float32_t, ndim=1] cgetStateCovarTrace(
    cnp.float32_t[:, :, ::1] stateCovarMatrices
):
    r"""Return the trace of each 2x2 state covariance matrix.

    :param stateCovarMatrices: Stack of per-interval 2x2 covariance matrices.
    :type stateCovarMatrices: cnp.float32_t[:, :, ::1]
    :return: Array of traces (sum of the two diagonal entries), one per matrix.
    :rtype: cnp.ndarray[cnp.float32_t, ndim=1]
    """
    cdef Py_ssize_t count = stateCovarMatrices.shape[0]
    cdef cnp.ndarray[cnp.float32_t, ndim=1] traces = np.empty(count, dtype=np.float32)
    cdef cnp.float32_t[::1] tracesView = traces
    cdef Py_ssize_t k
    for k in range(count):
        # trace = P[0,0] + P[1,1]
        tracesView[k] = stateCovarMatrices[k, 0, 0] + stateCovarMatrices[k, 1, 1]
    return traces
335
+
336
+
337
cpdef cnp.ndarray[cnp.float32_t, ndim=1] cgetPrecisionWeightedResidual(
    cnp.float32_t[:, ::1] postFitResiduals,
    cnp.float32_t[:, ::1] matrixMunc,
):
    r"""Precision-weighted average of post-fit residuals at each interval.

    Each residual is weighted by the reciprocal of its (epsilon-guarded)
    measurement-uncertainty entry; note ``matrixMunc`` is indexed transposed
    relative to ``postFitResiduals``.

    :param postFitResiduals: Residuals, shape (intervals, tracks).
    :param matrixMunc: Measurement uncertainties, shape (tracks, intervals).
    :return: One weighted mean residual per interval.
    :rtype: cnp.ndarray[cnp.float32_t, ndim=1]
    """
    cdef Py_ssize_t numIntervals = postFitResiduals.shape[0]
    cdef Py_ssize_t numTracks = postFitResiduals.shape[1]
    cdef cnp.ndarray[cnp.float32_t, ndim=1] result = np.empty(numIntervals, dtype=np.float32)
    cdef cnp.float32_t[::1] resultView = result
    cdef Py_ssize_t row, col
    cdef float weightSum, weightedSum, weight
    cdef float eps = 1e-12  # guard for zeros
    for row in range(numIntervals):
        weightSum = 0.0
        weightedSum = 0.0
        for col in range(numTracks):
            weight = 1.0 / (<float>matrixMunc[col, row] + eps)
            weightedSum += (<float>postFitResiduals[row, col]) * weight
            weightSum += weight
        if weightSum > 0.0:
            resultView[row] = <cnp.float32_t>(weightedSum / weightSum)
        else:
            resultView[row] = <cnp.float32_t>0.0
    return result
359
+
360
+
361
+
362
cpdef tuple updateProcessNoiseCovariance(cnp.ndarray[cnp.float32_t, ndim=2] matrixQ,
                                         cnp.ndarray[cnp.float32_t, ndim=2] matrixQCopy,
                                         float dStat,
                                         float dStatAlpha,
                                         float dStatd,
                                         float dStatPC,
                                         bint inflatedQ,
                                         float maxQ,
                                         float minQ):
    r"""Adjust process noise covariance matrix :math:`\mathbf{Q}_{[i]}`

    Scales ``matrixQ`` IN PLACE: inflates all four entries when the test
    statistic ``dStat`` exceeds ``dStatAlpha``, deflates when it falls back
    below while in the inflated state; clamps to ``[minQ, maxQ]`` using
    ``matrixQCopy`` to keep off-diagonal entries proportional.

    :param matrixQ: Current process noise covariance (mutated in place)
    :param matrixQCopy: A copy of the initial original covariance matrix :math:`\mathbf{Q}_{[.]}`
    :param dStat: Current test statistic value
    :param dStatAlpha: Threshold for inflation/deflation
    :param dStatd: Slope applied to the excess |dStat - dStatAlpha|
    :param dStatPC: Additive pseudo-count inside the sqrt scale factor
    :param inflatedQ: Flag indicating if the process noise covariance is inflated
    :param maxQ: Upper clamp on Q[0,0]/Q[1,1]
    :param minQ: Lower clamp on Q[0,0]/Q[1,1]
    :return: Updated process noise covariance matrix and inflated flag
    :rtype: tuple
    """

    cdef float scaleQ, fac
    if dStat > dStatAlpha:
        # Inflate: multiplicative factor grows with the excess over alpha.
        scaleQ = np.sqrt(dStatd * np.abs(dStat-dStatAlpha) + dStatPC)
        if matrixQ[0, 0] * scaleQ <= maxQ:
            matrixQ[0, 0] *= scaleQ
            matrixQ[0, 1] *= scaleQ
            matrixQ[1, 0] *= scaleQ
            matrixQ[1, 1] *= scaleQ
        else:
            # Clamp at maxQ; off-diagonals rescaled from the pristine copy so
            # the matrix keeps its original proportions.
            fac = maxQ / matrixQCopy[0, 0]
            matrixQ[0, 0] = maxQ
            matrixQ[0, 1] = matrixQCopy[0, 1] * fac
            matrixQ[1, 0] = matrixQCopy[1, 0] * fac
            matrixQ[1, 1] = maxQ
        inflatedQ = True

    elif dStat < dStatAlpha and inflatedQ:
        # Deflate only while in the inflated state.
        scaleQ = np.sqrt(dStatd * np.abs(dStat-dStatAlpha) + dStatPC)
        if matrixQ[0, 0] / scaleQ >= minQ:
            matrixQ[0, 0] /= scaleQ
            matrixQ[0, 1] /= scaleQ
            matrixQ[1, 0] /= scaleQ
            matrixQ[1, 1] /= scaleQ
        else:
            # we've hit the minimum, no longer 'inflated'
            fac = minQ / matrixQCopy[0, 0]
            matrixQ[0, 0] = minQ
            matrixQ[0, 1] = matrixQCopy[0, 1] * fac
            matrixQ[1, 0] = matrixQCopy[1, 0] * fac
            matrixQ[1, 1] = minQ
            inflatedQ = False
    return matrixQ, inflatedQ
412
+
413
+
414
cdef void _blockMax(double[::1] valuesView,
                    Py_ssize_t[::1] blockStartIndices,
                    Py_ssize_t[::1] blockSizes,
                    double[::1] outputView,
                    double eps = 0.0) noexcept:
    # For each sampled block, write its maximum into outputView. When eps > 0,
    # values within eps of the maximum are treated as tied and the value at the
    # center of the tied span is reported instead of the raw max (phase-robust).
    # Assumes startIndex + blockLength stays within valuesView (caller's duty).
    cdef Py_ssize_t iterIndex, elementIndex, startIndex, blockLength
    cdef double currentMax, currentValue
    cdef Py_ssize_t firstIdx, lastIdx, centerIdx

    for iterIndex in range(outputView.shape[0]):
        startIndex = blockStartIndices[iterIndex]
        blockLength = blockSizes[iterIndex]

        # First pass: plain maximum over the block.
        currentMax = valuesView[startIndex]
        for elementIndex in range(1, blockLength):
            currentValue = valuesView[startIndex + elementIndex]
            if currentValue > currentMax:
                currentMax = currentValue

        firstIdx = -1
        lastIdx = -1
        if eps > 0.0:
            # only run if eps tol is non-zero
            # Second pass: find first/last positions within eps of the max.
            for elementIndex in range(blockLength):
                currentValue = valuesView[startIndex + elementIndex]
                # NOTE: this is intended to mirror the +- eps tol
                if currentValue >= currentMax - eps:
                    if firstIdx == -1:
                        firstIdx = elementIndex
                    lastIdx = elementIndex

        if firstIdx == -1:
            # case: we didn't find a tie or eps == 0
            outputView[iterIndex] = currentMax
        else:
            # case: there's a tie for eps > 0, pick center
            centerIdx = (firstIdx + lastIdx) // 2
            outputView[iterIndex] = valuesView[startIndex + centerIdx]
452
+
453
+
454
cpdef double[::1] csampleBlockStats(cnp.ndarray[cnp.uint32_t, ndim=1] intervals,
                                    cnp.ndarray[cnp.float64_t, ndim=1] values,
                                    int expectedBlockSize,
                                    int iters,
                                    int randSeed,
                                    cnp.ndarray[cnp.uint8_t, ndim=1] excludeIdxMask,
                                    double eps = 0.0):
    r"""Sample contiguous blocks in the response sequence (xCorr), record maxima, and repeat.

    Used to build an empirical null distribution and determine significance of response outputs.
    The size of blocks is drawn from a truncated geometric distribution, preserving rough equality
    in expectation but allowing for variability to account for the sampling across different phases
    in the response sequence.

    :param intervals: Genomic interval starts; only its length is used here.
    :type intervals: cnp.ndarray[cnp.uint32_t, ndim=1]
    :param values: The response sequence to sample from.
    :type values: cnp.ndarray[cnp.float64_t, ndim=1]
    :param expectedBlockSize: The expected size (geometric) of the blocks to sample.
    :type expectedBlockSize: int
    :param iters: The number of blocks to sample.
    :type iters: int
    :param randSeed: Random seed for reproducibility.
    :type randSeed: int
    :param excludeIdxMask: 1/0 mask; block start positions whose window touches
        a masked index are excluded from the sampling support.
    :type excludeIdxMask: cnp.ndarray[cnp.uint8_t, ndim=1]
    :param eps: Tie tolerance forwarded to the block-max kernel.
    :type eps: double
    :return: An array of sampled block maxima.
    :rtype: cnp.ndarray[cnp.float64_t, ndim=1]
    :seealso: :func:`consenrich.matching.matchWavelet`
    """
    np.random.seed(randSeed)
    cdef cnp.ndarray[cnp.float64_t, ndim=1] valuesArr = np.ascontiguousarray(values, dtype=np.float64)
    cdef double[::1] valuesView = valuesArr
    cdef cnp.ndarray[cnp.intp_t, ndim=1] sizesArr
    cdef cnp.ndarray[cnp.intp_t, ndim=1] startsArr
    cdef cnp.ndarray[cnp.float64_t, ndim=1] out = np.empty(iters, dtype=np.float64)
    cdef Py_ssize_t maxBlockLength, maxSize, minSize
    cdef Py_ssize_t n = <Py_ssize_t>intervals.size
    cdef double maxBlockScale = <double>3.0
    cdef double minBlockScale = <double> (1.0 / 3.0)

    # Truncate geometric draws to [max(3, E/3), min(3E, n)].
    minSize = <Py_ssize_t> max(3, expectedBlockSize * minBlockScale)
    maxSize = <Py_ssize_t> min(maxBlockScale * expectedBlockSize, n)
    sizesArr = np.random.geometric(1.0 / expectedBlockSize, size=iters).astype(np.intp, copy=False)
    np.clip(sizesArr, minSize, maxSize, out=sizesArr)
    maxBlockLength = sizesArr.max()
    # Support = start positions whose worst-case window avoids the exclude mask.
    # NOTE(review): if the mask is dense, `support` can end up empty and
    # np.random.choice below would raise — TODO confirm upstream guarantees.
    cdef list support = []
    cdef cnp.intp_t i_ = 0
    while i_ < n-maxBlockLength:
        if excludeIdxMask[i_:i_ + maxBlockLength].any():
            # skip past the masked window entirely
            i_ = i_ + maxBlockLength + 1
            continue
        support.append(i_)
        i_ = i_ + 1

    cdef cnp.ndarray[cnp.intp_t, ndim=1] samples = np.random.choice(
        support,
        size=iters,
        replace=True,
        p=None
    ).astype(np.intp)

    cdef Py_ssize_t[::1] startsView = samples
    cdef Py_ssize_t[::1] sizesView = sizesArr
    cdef double[::1] outView = out
    _blockMax(valuesView, startsView, sizesView, outView, eps)
    return out
517
+
518
+
519
cpdef cSparseAvg(cnp.float32_t[::1] trackALV, dict sparseMap):
    r"""Fast access and average of `numNearest` sparse elements.

    See :func:`consenrich.core.getMuncTrack`

    :param trackALV: See :func:`consenrich.core.getAverageLocalVarianceTrack`
    :type trackALV: float[::1]
    :param sparseMap: See :func:`consenrich.core.getSparseMap`
    :type sparseMap: dict[int, np.ndarray]
    :return: array of mean('nearest local variances') same length as `trackALV`
    :rtype: cnp.ndarray[cnp.float32_t, ndim=1]
    """
    cdef Py_ssize_t n = <Py_ssize_t>trackALV.shape[0]
    cdef cnp.ndarray[cnp.float32_t, ndim=1] out = np.empty(n, dtype=np.float32)
    cdef Py_ssize_t i, j, m
    cdef float sumNearestVariances = 0.0
    cdef cnp.ndarray[cnp.intp_t, ndim=1] idxs
    cdef cnp.intp_t[::1] idx_view
    for i in range(n):
        # Look up this interval's nearest-neighbor index array.
        idxs = <cnp.ndarray[cnp.intp_t, ndim=1]> sparseMap[i]  # FFR: to avoid the cast, create sparseMap as dict[intp, np.ndarray[intp]]
        idx_view = idxs
        m = idx_view.shape[0]  # FFR: maybe enforce strict `m == numNearest` in future releases to avoid extra overhead
        if m == 0:
            # this case probably warrants an exception or np.nan
            out[i] = 0.0
            continue
        sumNearestVariances = 0.0
        # The gather-and-sum is pure C array access, so release the GIL.
        with nogil:
            for j in range(m):
                sumNearestVariances += trackALV[idx_view[j]]
        out[i] = sumNearestVariances/m

    return out
552
+
553
+
554
cpdef int64_t cgetFragmentLength(
    str bamFile,
    str chromosome,
    int64_t start,
    int64_t end,
    uint16_t samThreads=0,
    uint16_t samFlagExclude=3844,
    int64_t maxInsertSize=1000,
    int64_t minInsertSize=50,
    int64_t iters=1000,
    int64_t blockSize=5000,
    int64_t fallBack=147,
    int64_t rollingChunkSize=250,
    int64_t lagStep=5,
    int64_t earlyExit=100,
    int64_t randSeed=42,
    ):
    r"""Estimate fragment length by strand cross-correlation over high-coverage blocks.

    Pipeline: (1) build a coarse per-chunk read-count track over
    ``[start, end)``; (2) median-filter it to find locally high-coverage
    regions; (3) form ``blockSize``-bp blocks around the top-K chunks; (4) in
    each block, compute forward- and reverse-strand coverage and find the lag
    in ``[minInsertSize, maxInsertSize]`` maximizing their cross-covariance;
    (5) return the median best lag (clamped to the insert-size bounds), or
    ``fallBack`` if the region is too small or fewer than 3 blocks yield a lag.

    :param bamFile: Path to an indexed BAM file.
    :param chromosome: Chromosome name.
    :param start: Region start (bp).
    :param end: Region end (bp).
    :param samThreads: htslib threads; if < 1, auto-picks min(max(1, cpus//2), 4).
    :param samFlagExclude: SAM flag bitmask; matching reads are skipped.
    :param maxInsertSize: Largest lag considered.
    :param minInsertSize: Smallest lag considered.
    :param iters: Maximum number of top-coverage chunks to seed blocks from.
    :param blockSize: Width (bp) of each cross-correlation block.
    :param fallBack: Value returned when estimation is not possible.
    :param rollingChunkSize: Chunk width (bp) of the coarse coverage track.
    :param lagStep: Stride between candidate lags.
    :param earlyExit: Stop after this many blocks produced a best lag.
    :param randSeed: Seed for the block-order shuffle.
    :return: Estimated fragment length in bp (or ``fallBack``).
    :rtype: int64_t
    """
    np.random.seed(randSeed)
    cdef int64_t regionLen, numRollSteps, numChunks
    cdef cnp.ndarray[cnp.float64_t, ndim=1] rawArr
    cdef cnp.ndarray[cnp.float64_t, ndim=1] medArr
    cdef AlignmentFile aln
    cdef AlignedSegment read_
    cdef list coverageIdxTopK
    cdef list blockCenters
    cdef list bestLags
    cdef int i, j, k, idxVal
    cdef int startIdx, endIdx
    cdef int winSize, takeK
    cdef int blockHalf, readFlag
    cdef int chosenLag, lag, maxValidLag
    cdef int64_t blockStartBP, blockEndBP, readStart, readEnd, med
    cdef double score
    cdef cnp.ndarray[cnp.intp_t, ndim=1] unsortedIdx, sortedIdx, expandedIdx
    cdef cnp.intp_t[::1] expandedIdxView
    cdef cnp.ndarray[cnp.float64_t, ndim=1] unsortedVals
    cdef cnp.ndarray[cnp.uint8_t, ndim=1] seen
    cdef cnp.ndarray[cnp.float64_t, ndim=1] fwd = np.zeros(blockSize, dtype=np.float64, order='C')
    cdef cnp.ndarray[cnp.float64_t, ndim=1] rev = np.zeros(blockSize, dtype=np.float64, order='C')
    # Difference arrays: +1 at read start, -1 at read end; cumsum gives coverage.
    cdef cnp.ndarray[cnp.float64_t, ndim=1] fwdDiff = np.zeros(blockSize + 1, dtype=np.float64, order='C')
    cdef cnp.ndarray[cnp.float64_t, ndim=1] revDiff = np.zeros(blockSize + 1, dtype=np.float64, order='C')
    cdef int64_t diffS, diffE = 0
    cdef cnp.ndarray[cnp.float64_t, ndim=1] xCorr
    cdef cnp.ndarray[cnp.uint32_t, ndim=1] bestLagsArr

    earlyExit = min(earlyExit, iters)
    regionLen = end - start

    samThreads_ = <int>samThreads
    numCPUs = os.cpu_count()
    if numCPUs is None:
        numCPUs = 1
    if samThreads < 1:
        samThreads_ = <int>min(max(1,numCPUs // 2), 4)

    if regionLen < blockSize or regionLen <= 0:
        return <int64_t>fallBack

    # Sanitize insert-size bounds (positive, min <= max).
    if maxInsertSize < 1:
        maxInsertSize = 1
    if minInsertSize < 1:
        minInsertSize = 1
    if minInsertSize > maxInsertSize:
        minInsertSize, maxInsertSize = maxInsertSize, minInsertSize

    # first, we build a coarse read coverage track from `start` to `end`
    numRollSteps = regionLen // rollingChunkSize
    if numRollSteps <= 0:
        numRollSteps = 1
    numChunks = <int>numRollSteps

    rawArr = np.zeros(numChunks, dtype=np.float64)
    medArr = np.zeros(numChunks, dtype=np.float64)

    aln = AlignmentFile(bamFile, "rb", threads=samThreads_)
    try:
        for read_ in aln.fetch(chromosome, start, end):
            if (read_.flag & samFlagExclude) != 0:
                continue
            if read_.reference_start < start or read_.reference_end >= end:
                continue
            j = <int>((read_.reference_start - start) // rollingChunkSize)
            if 0 <= j < numChunks:
                rawArr[j] += 1.0

        # second, we apply a rolling/moving/local/weywtci order-statistic filter (median)
        # ...the size of the kernel is based on the blockSize -- we want high-coverage
        # ...blocks as measured by their local median read counts
        winSize = <int>(blockSize // rollingChunkSize)
        if winSize < 1:
            winSize = 1
        if (winSize & 1) == 0:
            # median_filter kernel must be odd to be centered
            winSize += 1
        medArr[:] = ndimage.median_filter(rawArr, size=winSize, mode="nearest")

        # we pick the largest local-medians and form a block around each
        takeK = iters if iters < numChunks else numChunks
        unsortedIdx = np.argpartition(medArr, -takeK)[-takeK:]
        unsortedVals = medArr[unsortedIdx]
        sortedIdx = unsortedIdx[np.argsort(unsortedVals)[::-1]]
        coverageIdxTopK = sortedIdx[:takeK].tolist()
        expandedLen = takeK*winSize
        expandedIdx = np.empty(expandedLen, dtype=np.intp)
        expandedIdxView = expandedIdx
        k = 0
        # Expand each top chunk to a winSize-wide window, clamped to bounds.
        for i in range(takeK):
            idxVal = coverageIdxTopK[i]
            startIdx = idxVal - (winSize // 2)
            endIdx = startIdx + winSize
            if startIdx < 0:
                startIdx = 0
                endIdx = winSize if winSize < numChunks else numChunks
            if endIdx > numChunks:
                endIdx = numChunks
                startIdx = endIdx - winSize if winSize <= numChunks else 0
            for j in range(startIdx, endIdx):
                expandedIdxView[k] = j
                k += 1
        if k < expandedLen:
            expandedIdx = expandedIdx[:k]
            expandedIdxView = expandedIdx

        # Deduplicate chunk indices, preserving first-seen order.
        seen = np.zeros(numChunks, dtype=np.uint8)
        blockCenters = []
        for i in range(expandedIdx.shape[0]):
            j = <int>expandedIdxView[i]
            if seen[j] == 0:
                seen[j] = 1
                blockCenters.append(j)

        # Shuffle so the earlyExit cutoff is not biased toward one locus.
        if len(blockCenters) > 1:
            blockCenters = np.random.choice(
                blockCenters,
                size=len(blockCenters),
                replace=False,
                p=None
            ).tolist()

        bestLags = []
        blockHalf = blockSize // 2

        for idxVal in blockCenters:
            # this should map back to genomic coordinates
            blockStartBP = start + idxVal * rollingChunkSize + (rollingChunkSize // 2) - blockHalf
            if blockStartBP < start:
                blockStartBP = start
            blockEndBP = blockStartBP + blockSize
            if blockEndBP > end:
                blockEndBP = end
                blockStartBP = blockEndBP - blockSize
            if blockStartBP < start:
                continue

            # now we build strand-specific tracks over each block
            fwd.fill(0.0)
            fwdDiff.fill(0.0)
            rev.fill(0.0)
            revDiff.fill(0.0)
            readFlag = -1

            for read_ in aln.fetch(chromosome, blockStartBP, blockEndBP):
                readFlag = read_.flag
                if (readFlag & samFlagExclude) != 0:
                    continue
                readStart = min(read_.reference_start, read_.reference_end)
                readEnd = max(read_.reference_start, read_.reference_end)
                diffS = readStart - blockStartBP
                diffE = readEnd - blockStartBP
                strand = readFlag & 16
                if readStart < blockStartBP or readEnd > blockEndBP:
                    # only fully-contained reads contribute
                    continue
                posInBlock = readStart - blockStartBP
                if strand == 0:
                    # forward
                    fwdDiff[diffS] += 1.0
                    fwdDiff[diffE] -= 1.0
                else:
                    # reverse
                    revDiff[diffS] += 1.0
                    revDiff[diffE] -= 1.0
            fwd = np.cumsum(fwdDiff[:len(fwdDiff)-1], dtype=np.float64)
            rev = np.cumsum(revDiff[:len(revDiff)-1], dtype=np.float64)

            # maximizes the crossCovar(forward, reverse, lag) wrt lag.
            fwd = (fwd - np.mean(fwd))
            rev = (rev - np.mean(rev))
            maxValidLag = maxInsertSize if (maxInsertSize < blockSize) else (blockSize - 1)
            xCorr = np.zeros(maxInsertSize + 1, dtype=np.float64)
            for lag in range(minInsertSize, maxValidLag + 1, lagStep):
                score = 0.0
                for i in range(blockSize - lag):
                    score += fwd[i] * rev[i + lag]
                xCorr[lag] = score

            if maxValidLag < minInsertSize:
                continue
            chosenLag = -1
            chosenLag = int(np.argmax(xCorr[minInsertSize:maxValidLag + 1])) + minInsertSize

            if chosenLag > 0:
                bestLags.append(chosenLag)
            if len(bestLags) >= earlyExit:
                break

    finally:
        aln.close()

    # Need a minimal sample of block estimates for a stable median.
    if len(bestLags) < 3:
        return fallBack

    bestLagsArr = np.asarray(bestLags, dtype=np.uint32)
    med = int(np.median(bestLagsArr))
    if med < minInsertSize:
        med = <int>minInsertSize
    elif med > maxInsertSize:
        med = <int>maxInsertSize
    return med
771
+
772
+
773
cdef inline Py_ssize_t getInsertion(const uint32_t* array_, Py_ssize_t n, uint32_t x) nogil:
    # helper: binary search to find insertion point into sorted `array_`
    # (upper-bound semantics: returns the index of the first element strictly
    # greater than x, i.e. equal elements sort before the insertion point)
    cdef Py_ssize_t low = 0
    cdef Py_ssize_t high = n
    cdef Py_ssize_t midpt
    while low < high:
        # overflow-safe midpoint
        midpt = low + ((high - low) >> 1)
        if array_[midpt] <= x:
            low = midpt + 1
        else:
            high = midpt
    return low
785
+
786
+
787
cdef int maskMembership(const uint32_t* pos, Py_ssize_t numIntervals, const uint32_t* mStarts, const uint32_t* mEnds, Py_ssize_t n, uint8_t* outMask) nogil:
    # For each position pos[i], set outMask[i] = 1 iff it falls inside one of
    # the n half-open regions [mStarts[k], mEnds[k]). Assumes regions are
    # sorted by start and non-overlapping (pre-merged BED), so the candidate
    # region is the one whose start precedes pos[i].
    cdef Py_ssize_t i = 0
    cdef Py_ssize_t k
    cdef uint32_t p
    while i < numIntervals:
        p = pos[i]
        # index of the last region starting at or before p (-1 if none)
        k = getInsertion(mStarts, n, p) - 1
        if k >= 0 and p < mEnds[k]:
            outMask[i] = <uint8_t>1
        else:
            outMask[i] = <uint8_t>0
        i += 1
    return 0
800
+
801
+
802
cpdef cnp.ndarray[cnp.uint8_t, ndim=1] cbedMask(
    str chromosome,
    str bedFile,
    cnp.ndarray[cnp.uint32_t, ndim=1] intervals,
    int stepSize
    ):
    r"""Return a 1/0 mask for intervals overlapping a sorted and merged BED file.

    :param chromosome: Chromosome name.
    :type chromosome: str
    :param bedFile: Path to a sorted and merged BED file.
    :type bedFile: str
    :param intervals: Array of sorted, non-overlapping start positions of genomic intervals.
        Each interval is assumed `stepSize`.
    :type intervals: cnp.ndarray[cnp.uint32_t, ndim=1]
    :param stepSize: Step size between genomic positions in `intervals`.
    :type stepSize: int32_t
    :return: A mask s.t. `1` indicates the corresponding interval overlaps a BED region.
    :rtype: cnp.ndarray[cnp.uint8_t, ndim=1]
    """
    cdef list startsList = []
    cdef list endsList = []
    cdef str line
    cdef list cols
    # Context manager closes the file even if parsing raises (replaces the
    # manual open/try/finally). Only records on the requested chromosome,
    # skipping blanks, comment lines, and rows with fewer than 3 columns.
    with open(bedFile, "r") as f:
        for line in f:
            line = line.strip()
            if not line or line[0] == '#':
                continue
            cols = line.split('\t')
            if len(cols) < 3:
                continue
            if cols[0] != chromosome:
                continue
            startsList.append(int(cols[1]))
            endsList.append(int(cols[2]))
    cdef Py_ssize_t numIntervals = intervals.size
    cdef cnp.ndarray[cnp.uint8_t, ndim=1] mask = np.zeros(numIntervals, dtype=np.uint8)
    if not startsList:
        # no regions on this chromosome -> all-zero mask
        return mask
    cdef cnp.ndarray[cnp.uint32_t, ndim=1] starts = np.asarray(startsList, dtype=np.uint32)
    cdef cnp.ndarray[cnp.uint32_t, ndim=1] ends = np.asarray(endsList, dtype=np.uint32)
    cdef cnp.uint32_t[:] startsView = starts
    cdef cnp.uint32_t[:] endsView = ends
    cdef cnp.uint32_t[:] posView = intervals
    cdef cnp.uint8_t[:] outView = mask
    # for nogil: raw pointers into the (guarded non-empty) arrays
    cdef uint32_t* svPtr = &startsView[0] if starts.size > 0 else <uint32_t*>NULL
    cdef uint32_t* evPtr = &endsView[0] if ends.size > 0 else <uint32_t*>NULL
    cdef uint32_t* posPtr = &posView[0] if numIntervals > 0 else <uint32_t*>NULL
    cdef uint8_t* outPtr = &outView[0] if numIntervals > 0 else <uint8_t*>NULL
    cdef Py_ssize_t n = starts.size
    with nogil:
        if numIntervals > 0 and n > 0:
            maskMembership(posPtr, numIntervals, svPtr, evPtr, n, outPtr)
    return mask