consenrich 0.7.11b2__cp314-cp314-macosx_15_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of consenrich might be problematic. Click here for more details.

Files changed (38) hide show
  1. consenrich/.dylibs/libomp.dylib +0 -0
  2. consenrich/__init__.py +11 -0
  3. consenrich/cconsenrich.c +50610 -0
  4. consenrich/cconsenrich.cpython-314-darwin.so +0 -0
  5. consenrich/cconsenrich.pyx +1065 -0
  6. consenrich/consenrich.py +1802 -0
  7. consenrich/constants.py +172 -0
  8. consenrich/core.py +2068 -0
  9. consenrich/data/ce10.sizes +6 -0
  10. consenrich/data/ce10_blacklist.bed +100 -0
  11. consenrich/data/ce10_sparse.bed +11828 -0
  12. consenrich/data/ce11.sizes +6 -0
  13. consenrich/data/ce11_blacklist.bed +97 -0
  14. consenrich/data/ce11_sparse.bed +11828 -0
  15. consenrich/data/dm6.sizes +7 -0
  16. consenrich/data/dm6_blacklist.bed +182 -0
  17. consenrich/data/dm6_sparse.bed +20000 -0
  18. consenrich/data/hg19.sizes +24 -0
  19. consenrich/data/hg19_blacklist.bed +834 -0
  20. consenrich/data/hg19_sparse.bed +288358 -0
  21. consenrich/data/hg38.sizes +24 -0
  22. consenrich/data/hg38_blacklist.bed +636 -0
  23. consenrich/data/hg38_sparse.bed +288699 -0
  24. consenrich/data/mm10.sizes +21 -0
  25. consenrich/data/mm10_blacklist.bed +3435 -0
  26. consenrich/data/mm10_sparse.bed +100400 -0
  27. consenrich/data/mm39.sizes +21 -0
  28. consenrich/data/mm39_blacklist.bed +3360 -0
  29. consenrich/data/mm39_sparse.bed +100381 -0
  30. consenrich/detrorm.py +297 -0
  31. consenrich/matching.py +929 -0
  32. consenrich/misc_util.py +122 -0
  33. consenrich-0.7.11b2.dist-info/METADATA +66 -0
  34. consenrich-0.7.11b2.dist-info/RECORD +38 -0
  35. consenrich-0.7.11b2.dist-info/WHEEL +6 -0
  36. consenrich-0.7.11b2.dist-info/entry_points.txt +2 -0
  37. consenrich-0.7.11b2.dist-info/licenses/LICENSE +21 -0
  38. consenrich-0.7.11b2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1065 @@
1
+ # -*- coding: utf-8 -*-
2
+ # cython: boundscheck=False, wraparound=False, cdivision=True, nonecheck=False, initializedcheck=False, infer_types=True, language_level=3
3
+ # distutils: language = c
4
+ r"""Cython module for Consenrich core functions.
5
+
6
+ This module contains Cython implementations of core functions used in Consenrich.
7
+ """
8
+
9
+ cimport cython
10
+
11
+ import os
12
+ import numpy as np
13
+ from scipy import ndimage
14
+ import pysam
15
+
16
+ cimport numpy as cnp
17
+ from libc.stdint cimport int64_t, uint8_t, uint16_t, uint32_t, uint64_t
18
+ from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
19
+ from libc.float cimport DBL_EPSILON
20
+ from numpy.random import default_rng
21
+ from cython.parallel import prange
22
+ cnp.import_array()
23
+
24
cpdef int stepAdjustment(int value, int stepSize, int pushForward=0):
    r"""Snap a value down to the nearest multiple of ``stepSize``.

    :param value: The value to adjust.
    :type value: int
    :param stepSize: The step size to adjust to.
    :type stepSize: int
    :param pushForward: If non-zero, pushes the snapped value forward by
        ``pushForward * stepSize``.
    :type pushForward: int
    :return: The adjusted value.
    :rtype: int
    """
    cdef int snapped = value - (value % stepSize)
    if snapped < 0:
        # negative inputs clamp to zero before the optional push-forward
        snapped = 0
    return snapped + pushForward * stepSize
37
+
38
+
39
cpdef uint64_t cgetFirstChromRead(str bamFile, str chromosome, uint64_t chromLength, uint32_t samThreads, int samFlagExclude):
    r"""Get the start position of the first read in a BAM file for a given chromosome.

    :param bamFile: See :func:`consenrich.core.inputParams`.
    :type bamFile: str
    :param chromosome: Chromosome name.
    :type chromosome: str
    :param chromLength: Length of the chromosome in base pairs.
    :type chromLength: uint64_t
    :param samThreads: Number of threads to use for reading the BAM file.
    :type samThreads: uint32_t
    :param samFlagExclude: SAM flags to exclude reads (e.g., unmapped, secondary).
    :type samFlagExclude: int
    :return: Start position of the first read in the chromosome, or 0 if no reads are found.
    :rtype: uint64_t
    """

    cdef AlignmentFile aln = AlignmentFile(bamFile, 'rb', threads=samThreads)
    cdef AlignedSegment read
    try:
        for read in aln.fetch(contig=chromosome, start=0, end=chromLength):
            if not (read.flag & samFlagExclude):
                # first read passing the flag filter wins
                return read.reference_start
    finally:
        # close the handle even if fetch() raises (e.g., missing/corrupt
        # index); previously the file handle leaked on exception
        aln.close()
    return 0
64
+
65
+
66
cpdef uint64_t cgetLastChromRead(str bamFile, str chromosome, uint64_t chromLength, uint32_t samThreads, int samFlagExclude):
    r"""Get the end position of the last read in a BAM file for a given chromosome.

    Only the final ``min(chromLength // 2, 1_000_000)`` bp of the chromosome
    are scanned; if no passing read falls in that window, 0 is returned.

    :param bamFile: See :func:`consenrich.core.inputParams`.
    :type bamFile: str
    :param chromosome: Chromosome name.
    :type chromosome: str
    :param chromLength: Length of the chromosome in base pairs.
    :type chromLength: uint64_t
    :param samThreads: Number of threads to use for reading the BAM file.
    :type samThreads: uint32_t
    :param samFlagExclude: See :class:`consenrich.core.samParams`.
    :type samFlagExclude: int
    :return: End position of the last read in the chromosome, or 0 if no reads are found.
    :rtype: uint64_t
    """

    cdef uint64_t start_ = chromLength - min((chromLength // 2), 1_000_000)
    cdef uint64_t lastPos = 0
    cdef AlignmentFile aln = AlignmentFile(bamFile, 'rb', threads=samThreads)
    cdef AlignedSegment read
    try:
        for read in aln.fetch(contig=chromosome, start=start_, end=chromLength):
            if not (read.flag & samFlagExclude):
                # keep overwriting: fetch() yields reads sorted by position,
                # so the last assignment is the last passing read
                lastPos = read.reference_end
    finally:
        # close the handle even if fetch() raises; previously leaked on exception
        aln.close()
    return lastPos
92
+
93
+
94
+
95
cpdef uint32_t cgetReadLength(str bamFile, uint32_t minReads, uint32_t samThreads, uint32_t maxIterations, int samFlagExclude):
    r"""Get the median read length from a BAM file after fetching a specified number of reads.

    :param bamFile: see :class:`consenrich.core.inputParams`.
    :type bamFile: str
    :param minReads: Minimum number of reads to consider for the median calculation.
    :type minReads: uint32_t
    :param samThreads: See :class:`consenrich.core.samParams`.
    :type samThreads: uint32_t
    :param maxIterations: Maximum number of reads to iterate over.
    :type maxIterations: uint32_t
    :param samFlagExclude: See :class:`consenrich.core.samParams`.
    :type samFlagExclude: int
    :return: Median read length from the BAM file, or 0 if fewer than
        ``minReads`` passing reads are available.
    :rtype: uint32_t
    """
    cdef uint32_t observedReads = 0
    cdef uint32_t currentIterations = 0
    cdef AlignmentFile aln = AlignmentFile(bamFile, 'rb', threads=samThreads)
    cdef AlignedSegment read
    cdef cnp.ndarray[cnp.uint32_t, ndim=1] readLengths = np.zeros(maxIterations, dtype=np.uint32)
    cdef uint32_t i = 0
    try:
        # .mapped requires an index and can raise; the try/finally below
        # guarantees the handle is closed either way (previously leaked)
        if <uint32_t>aln.mapped < minReads:
            return 0
        for read in aln.fetch():
            if not (observedReads < minReads and currentIterations < maxIterations):
                break
            if not (read.flag & samFlagExclude):
                # meets criteria -> record its length
                readLengths[i] = read.query_length
                observedReads += 1
                i += 1
            currentIterations += 1
    finally:
        aln.close()
    if observedReads < minReads:
        return 0
    return <uint32_t>np.median(readLengths[:observedReads])
133
+
134
+
135
cdef inline Py_ssize_t floordiv64(int64_t a, int64_t b) nogil:
    # Floor division for signed 64-bit integers, assuming b > 0.
    # Under `cdivision=True`, C `//` truncates toward zero, so negative
    # numerators need the explicit ceil-of-negated form to floor correctly.
    if a >= 0:
        return <Py_ssize_t>(a // b)
    return <Py_ssize_t>(-((-a + b - 1) // b))
140
+
141
+
142
cpdef cnp.float32_t[:] creadBamSegment(
    str bamFile,
    str chromosome,
    uint32_t start,
    uint32_t end,
    uint32_t stepSize,
    int64_t readLength,
    uint8_t oneReadPerBin,
    uint16_t samThreads,
    uint16_t samFlagExclude,
    int64_t shiftForwardStrand53 = 0,
    int64_t shiftReverseStrand53 = 0,
    int64_t extendBP = 0,
    int64_t maxInsertSize=1000,
    int64_t pairedEndMode=0,
    int64_t inferFragmentLength=0,
    int64_t minMappingQuality=0,
    int64_t minTemplateLength=-1,
    uint8_t weightByOverlap=1,
    ):
    r"""Count reads in a BAM file for a given chromosome.

    Bins read (single-end) or fragment (paired-end) coverage over
    ``[start, end)`` into ``ceil((end - start) / stepSize)`` intervals.
    Each read/fragment either increments the bin containing its midpoint
    (``oneReadPerBin`` non-zero), adds its fractional overlap to every bin
    it covers (``weightByOverlap`` non-zero), or adds a full count to every
    overlapped bin.

    In paired-end mode only properly-paired, first-in-pair reads with a
    mate on the same contig are counted, and the fragment span is derived
    from the template length. In single-end mode, ``extendBP`` (possibly
    inferred via :func:`cgetFragmentLength`) extends reads from their
    (optionally shifted) 5' cut site.

    NOTE(review): ``maxInsertSize`` is accepted but never referenced in
    this body; template-length filtering uses ``minTemplateLength`` only
    -- confirm whether an upper bound was intended.

    :return: float32 memoryview of per-bin counts/weights.
    """

    cdef Py_ssize_t numIntervals
    cdef int64_t width = <int64_t>end - <int64_t>start

    if stepSize <= 0 or width <= 0:
        numIntervals = 0
    else:
        # ceil-division so a partial trailing bin still gets counted
        numIntervals = <Py_ssize_t>((width + stepSize - 1) // stepSize)

    cdef cnp.ndarray[cnp.float32_t, ndim=1] values_np = np.zeros(numIntervals, dtype=np.float32)
    cdef cnp.float32_t[::1] values = values_np

    if numIntervals <= 0:
        # degenerate region: return the (empty) buffer without touching the BAM
        return values

    cdef AlignmentFile aln = AlignmentFile(bamFile, 'rb', threads=samThreads)
    cdef AlignedSegment read
    cdef int64_t start64 = start
    cdef int64_t end64 = end
    cdef int64_t step64 = stepSize
    cdef Py_ssize_t i, index0, index1, b_, midIndex
    cdef Py_ssize_t lastIndex = numIntervals - 1
    cdef bint readIsForward
    cdef int64_t readStart, readEnd
    cdef int64_t binStart, binEnd
    cdef int64_t overlapStart, overlapEnd, overlap
    cdef int64_t adjStart, adjEnd, fivePrime, mid, tlen, atlen
    cdef uint16_t flag
    cdef int64_t minTLEN = minTemplateLength
    cdef int minMapQ = <int>minMappingQuality

    if minTLEN < 0:
        # sentinel: default the template-length floor to the read length
        minTLEN = readLength

    if inferFragmentLength > 0 and pairedEndMode <= 0 and extendBP <= 0:
        # single-end with no explicit extension: estimate fragment length
        # genome-wide via cross-correlation / template sampling
        extendBP = cgetFragmentLength(bamFile,
            samThreads = samThreads,
            samFlagExclude=samFlagExclude,
            )
    try:
        with aln:
            for read in aln.fetch(chromosome, start64, end64):
                flag = <uint16_t>read.flag
                if flag & samFlagExclude or read.mapping_quality < minMapQ:
                    continue

                readIsForward = (flag & 16) == 0  # bit 16 = reverse strand
                readStart = <int64_t>read.reference_start
                readEnd = <int64_t>read.reference_end

                if pairedEndMode > 0:
                    if flag & 2 == 0: # not a properly paired read
                        continue
                    # use first in pair + fragment
                    if flag & 128:
                        continue
                    if (flag & 8) or read.next_reference_id != read.reference_id:
                        # mate unmapped or mapped to a different contig
                        continue
                    tlen = <int64_t>read.template_length
                    atlen = tlen if tlen >= 0 else -tlen
                    if atlen == 0 or atlen < minTLEN:
                        continue
                    # fragment span: anchor at the leftmost mapped end
                    if tlen >= 0:
                        adjStart = readStart
                        adjEnd = readStart + atlen
                    else:
                        adjEnd = readEnd
                        adjStart = adjEnd - atlen
                    if shiftForwardStrand53 != 0 or shiftReverseStrand53 != 0:
                        # strand-aware shift of the whole fragment
                        if readIsForward:
                            adjStart += shiftForwardStrand53
                            adjEnd += shiftForwardStrand53
                        else:
                            adjStart -= shiftReverseStrand53
                            adjEnd -= shiftReverseStrand53
                else:
                    # SE
                    if readIsForward:
                        fivePrime = readStart + shiftForwardStrand53
                    else:
                        fivePrime = (readEnd - 1) - shiftReverseStrand53

                    if extendBP > 0:
                        # from the cut 5' --> 3'
                        if readIsForward:
                            adjStart = fivePrime
                            adjEnd = fivePrime + extendBP
                        else:
                            adjEnd = fivePrime + 1
                            adjStart = adjEnd - extendBP
                    elif shiftForwardStrand53 != 0 or shiftReverseStrand53 != 0:
                        if readIsForward:
                            adjStart = readStart + shiftForwardStrand53
                            adjEnd = readEnd + shiftForwardStrand53
                        else:
                            adjStart = readStart - shiftReverseStrand53
                            adjEnd = readEnd - shiftReverseStrand53
                    else:
                        adjStart = readStart
                        adjEnd = readEnd

                # clip the (possibly shifted/extended) span to the region
                if adjEnd <= start64 or adjStart >= end64:
                    continue
                if adjStart < start64:
                    adjStart = start64
                if adjEnd > end64:
                    adjEnd = end64

                if oneReadPerBin:
                    # attribute the whole read/fragment to its midpoint bin
                    mid = (adjStart + adjEnd) // 2
                    midIndex = <Py_ssize_t>((mid - start64) // step64)
                    if 0 <= midIndex <= lastIndex:
                        values[midIndex] += <cnp.float32_t>1.0

                else:
                    # range of bins [index0, index1] the span overlaps
                    index0 = <Py_ssize_t>((adjStart - start64) // step64)
                    index1 = <Py_ssize_t>(((adjEnd - 1) - start64) // step64)
                    if index0 < 0:
                        index0 = 0
                    if index1 > lastIndex:
                        index1 = lastIndex
                    if index0 > lastIndex or index1 < 0 or index0 > index1:
                        continue

                    if weightByOverlap:
                        # fractional contribution: overlap bp / bin width
                        for b_ in range(index0, index1 + 1):
                            binStart = start64 + (<int64_t>b_) * step64
                            binEnd = binStart + step64
                            if binEnd > end64:
                                binEnd = end64

                            overlapStart = adjStart if adjStart > binStart else binStart
                            overlapEnd = adjEnd if adjEnd < binEnd else binEnd
                            overlap = overlapEnd - overlapStart
                            if overlap > 0:
                                values[b_] += (<cnp.float32_t>overlap / <cnp.float32_t>(binEnd - binStart))
                    else:
                        # unweighted: +1 to every overlapped bin
                        for b_ in range(index0, index1 + 1):
                            values[b_] += <cnp.float32_t>1.0


    finally:
        aln.close()

    return values
308
+
309
+
310
cpdef cnp.ndarray[cnp.float32_t, ndim=2] cinvertMatrixE(
    cnp.ndarray[cnp.float32_t, ndim=1] muncMatrixIter,
    cnp.float32_t priorCovarianceOO,
    cnp.float32_t innovationCovariancePadding=1.0e-2):
    r"""Invert the residual covariance matrix during the forward pass.

    The innovation covariance has the form :math:`\mathbf{E} = \mathbf{D} + p\,\mathbf{1}\mathbf{1}^{\top}`,
    where :math:`\mathbf{D}` is diagonal (measurement uncertainties, padded) and
    :math:`p` is the prior 'primary' state variance. The inverse is computed in
    closed form via the Sherman-Morrison identity:
    :math:`\mathbf{E}^{-1} = \mathbf{D}^{-1} - \frac{p\,\mathbf{D}^{-1}\mathbf{1}\mathbf{1}^{\top}\mathbf{D}^{-1}}{1 + p\,\mathbf{1}^{\top}\mathbf{D}^{-1}\mathbf{1}}`,
    i.e. O(m^2) work instead of a general O(m^3) inversion.

    :param muncMatrixIter: The diagonal elements of the covariance matrix at a given genomic interval.
    :type muncMatrixIter: cnp.ndarray[cnp.float32_t, ndim=1]
    :param priorCovarianceOO: The a priori 'primary' state variance :math:`P_{[i|i-1,00]} = \left(\mathbf{F}\mathbf{P}_{[i-1\,|\,i-1]}\mathbf{F}^{\top} + Q_[i]\right)_{[00]}`.
    :type priorCovarianceOO: cnp.float32_t
    :param innovationCovariancePadding: Small value added to the diagonal for numerical stability.
    :type innovationCovariancePadding: cnp.float32_t
    :return: The inverted covariance matrix.
    :rtype: cnp.ndarray[cnp.float32_t, ndim=2]
    """

    cdef int m = muncMatrixIter.size
    # we have to invert a P.D. covariance (diagonal) and rank-one (1*priorCovariance) matrix
    cdef cnp.ndarray[cnp.float32_t, ndim=2] inverse = np.empty((m, m), dtype=np.float32)
    # note, not actually an m-dim matrix, just the diagonal elements taken as input
    cdef cnp.ndarray[cnp.float32_t, ndim=1] muncMatrixInverse = np.empty(m, dtype=np.float32)
    cdef cnp.ndarray[cnp.float32_t, ndim=1] muncArr = np.ascontiguousarray(muncMatrixIter, dtype=np.float32, )

    # (numpy) memoryviews for faster indexing + nogil safety
    cdef cnp.float32_t[::1] munc = muncArr
    cdef cnp.float32_t[::1] muncInv = muncMatrixInverse
    cdef cnp.float32_t[:, ::1] inv = inverse

    # divisor accumulates 1 + p * sum(1/D_ii)  (Sherman-Morrison denominator)
    cdef float divisor = 1.0
    cdef float scale, scaleTimesPrior
    cdef float prior = priorCovarianceOO
    cdef float pad = innovationCovariancePadding
    cdef float inv_i
    cdef float val
    cdef Py_ssize_t i, j

    for i in range(m):
        # two birds: build up the trace while taking the reciprocals
        muncInv[i] = 1.0/(munc[i] + pad)
        divisor += prior*muncInv[i]

    # precompute both scale, scale*prior
    scale = 1.0 / divisor
    scaleTimesPrior = scale * prior

    # ----
    # FFR (I): explore prange(...) options to quickly invoke openMP for both cases
    # FFR (II: add nogil block for prange-less case, too?
    # FFR (III): run prange(m, schedule='static', nogil=True)?
    # ----

    # unless sample size warrants it, no OMP here
    if m < 512:
        for i in range(m):
            inv_i = muncInv[i]
            # diagonal: D^-1_ii - scale*p*(D^-1_ii)^2
            inv[i, i] = inv_i-(scaleTimesPrior*inv_i*inv_i)
            for j in range(i + 1, m):
                # off-diagonal is symmetric: fill both triangles at once
                val = -scaleTimesPrior*inv_i*muncInv[j]
                inv[i, j] = val
                inv[j, i] = val

    # very large sample size --> prange
    else:
        with nogil:
            for i in prange(m, schedule='static'):
                inv_i = muncInv[i]
                inv[i, i] = inv_i-(scaleTimesPrior*inv_i*inv_i)
                for j in range(i + 1, m):
                    val = -scaleTimesPrior * inv_i * muncInv[j]
                    inv[i, j] = val
                    inv[j, i] = val

    return inverse
384
+
385
+
386
cpdef cnp.ndarray[cnp.float32_t, ndim=1] cgetStateCovarTrace(
    cnp.float32_t[:, :, ::1] stateCovarMatrices
    ):
    r"""Return the trace of each 2x2 state covariance matrix.

    :param stateCovarMatrices: C-contiguous array of shape ``(n, 2, 2)``.
    :return: length-``n`` float32 array where entry ``i`` equals
        ``stateCovarMatrices[i, 0, 0] + stateCovarMatrices[i, 1, 1]``.
    :rtype: cnp.ndarray[cnp.float32_t, ndim=1]
    """
    cdef Py_ssize_t numMatrices = stateCovarMatrices.shape[0]
    cdef cnp.ndarray[cnp.float32_t, ndim=1] traces = np.empty(numMatrices, dtype=np.float32)
    cdef cnp.float32_t[::1] tracesView = traces
    cdef Py_ssize_t k

    for k in range(numMatrices):
        tracesView[k] = stateCovarMatrices[k, 0, 0] + stateCovarMatrices[k, 1, 1]

    return traces
397
+
398
+
399
cpdef cnp.ndarray[cnp.float32_t, ndim=1] cgetPrecisionWeightedResidual(
    cnp.float32_t[:, ::1] postFitResiduals,
    cnp.float32_t[:, ::1] matrixMunc,
    ):
    r"""Precision-weighted mean of post-fit residuals per genomic interval.

    :param postFitResiduals: shape ``(n, m)`` -- residuals per interval (rows)
        and track (columns).
    :param matrixMunc: shape ``(m, n)`` -- measurement uncertainties, indexed
        transposed relative to ``postFitResiduals``.
    :return: length-``n`` float32 array; entry ``i`` is the mean of
        ``postFitResiduals[i, :]`` weighted by ``1 / (matrixMunc[:, i] + eps)``.
    :rtype: cnp.ndarray[cnp.float32_t, ndim=1]
    """
    cdef Py_ssize_t numIntervals = postFitResiduals.shape[0]
    cdef Py_ssize_t numTracks = postFitResiduals.shape[1]
    cdef cnp.ndarray[cnp.float32_t, ndim=1] weighted = np.empty(numIntervals, dtype=np.float32)
    cdef cnp.float32_t[::1] weightedView = weighted
    cdef Py_ssize_t i, j
    cdef float weightTotal, residualTotal, weight
    cdef float eps = 1e-12  # guard for zero-variance entries

    for i in range(numIntervals):
        weightTotal = 0.0
        residualTotal = 0.0
        for j in range(numTracks):
            weight = 1.0 / (<float>matrixMunc[j, i] + eps)
            residualTotal += (<float>postFitResiduals[i, j]) * weight
            weightTotal += weight
        if weightTotal > 0.0:
            weightedView[i] = <cnp.float32_t>(residualTotal / weightTotal)
        else:
            # only reachable when numTracks == 0
            weightedView[i] = <cnp.float32_t>0.0

    return weighted
421
+
422
+
423
+
424
cpdef tuple updateProcessNoiseCovariance(cnp.ndarray[cnp.float32_t, ndim=2] matrixQ,
        cnp.ndarray[cnp.float32_t, ndim=2] matrixQCopy,
        float dStat,
        float dStatAlpha,
        float dStatd,
        float dStatPC,
        bint inflatedQ,
        float maxQ,
        float minQ):
    r"""Adjust process noise covariance matrix :math:`\mathbf{Q}_{[i]}`

    When the discrepancy statistic exceeds its threshold, all four entries of
    :math:`\mathbf{Q}` are scaled up by :math:`\sqrt{d \cdot |dStat - \alpha| + PC}`
    (capped so that ``Q[0,0] <= maxQ``); when it drops back below the threshold
    and :math:`\mathbf{Q}` is currently inflated, the same factor scales it down
    (floored at ``minQ``). At either bound, off-diagonals are rescaled from the
    original ``matrixQCopy`` to preserve its correlation structure.

    :param matrixQ: Current process noise covariance
    :param matrixQCopy: A copy of the initial original covariance matrix :math:`\mathbf{Q}_{[.]}`
    :param dStat: Discrepancy statistic for the current interval
    :param dStatAlpha: Threshold on ``dStat`` that triggers inflation
    :param dStatd: Multiplier on the threshold excess inside the sqrt scale factor
    :param dStatPC: Pseudo-count added inside the sqrt scale factor
    :param inflatedQ: Flag indicating if the process noise covariance is inflated
    :param maxQ: Upper bound for ``Q[0,0]`` / ``Q[1,1]``
    :param minQ: Lower bound for ``Q[0,0]`` / ``Q[1,1]``
    :return: Updated process noise covariance matrix and inflated flag
    :rtype: tuple
    """

    cdef float scaleQ, fac
    if dStat > dStatAlpha:
        scaleQ = np.sqrt(dStatd * np.abs(dStat-dStatAlpha) + dStatPC)
        if matrixQ[0, 0] * scaleQ <= maxQ:
            # headroom available: scale the whole matrix uniformly
            matrixQ[0, 0] *= scaleQ
            matrixQ[0, 1] *= scaleQ
            matrixQ[1, 0] *= scaleQ
            matrixQ[1, 1] *= scaleQ
        else:
            # ceiling hit: pin diagonals at maxQ, rescale off-diagonals
            # proportionally from the original matrix
            fac = maxQ / matrixQCopy[0, 0]
            matrixQ[0, 0] = maxQ
            matrixQ[0, 1] = matrixQCopy[0, 1] * fac
            matrixQ[1, 0] = matrixQCopy[1, 0] * fac
            matrixQ[1, 1] = maxQ
        # NOTE(review): placed at branch level so any scale-up marks Q as
        # inflated (required for the deflation branch to ever fire) -- the
        # diff-rendered source lost this indentation; confirm against the
        # released .pyx
        inflatedQ = True

    elif dStat < dStatAlpha and inflatedQ:
        scaleQ = np.sqrt(dStatd * np.abs(dStat-dStatAlpha) + dStatPC)
        if matrixQ[0, 0] / scaleQ >= minQ:
            # deflate uniformly; flag stays set so deflation can continue
            matrixQ[0, 0] /= scaleQ
            matrixQ[0, 1] /= scaleQ
            matrixQ[1, 0] /= scaleQ
            matrixQ[1, 1] /= scaleQ
        else:
            # we've hit the minimum, no longer 'inflated'
            fac = minQ / matrixQCopy[0, 0]
            matrixQ[0, 0] = minQ
            matrixQ[0, 1] = matrixQCopy[0, 1] * fac
            matrixQ[1, 0] = matrixQCopy[1, 0] * fac
            matrixQ[1, 1] = minQ
            inflatedQ = False
    return matrixQ, inflatedQ
474
+
475
+
476
cdef void _blockMax(double[::1] valuesView,
        Py_ssize_t[::1] blockStartIndices,
        Py_ssize_t[::1] blockSizes,
        double[::1] outputView,
        double eps = 0.0) noexcept:
    # For each sampled block, write a representative maximum into outputView.
    #
    #   eps == 0: the plain block maximum.
    #   eps > 0 : entries within eps of the maximum are treated as ties and
    #             the value at the center of the [first, last] tie index
    #             range is reported. Since the true max always satisfies
    #             `value >= max - eps`, the tie scan finds at least one index
    #             whenever eps > 0, so the eps > 0 path always takes the
    #             center-pick branch. The centered value can be slightly
    #             below the true maximum (by at most eps).
    #
    # Assumes each block [start, start + size) lies within valuesView and
    # size >= 1; boundscheck is compiled off, so violations are UB.
    cdef Py_ssize_t iterIndex, elementIndex, startIndex, blockLength
    cdef double currentMax, currentValue
    cdef Py_ssize_t firstIdx, lastIdx, centerIdx

    for iterIndex in range(outputView.shape[0]):
        startIndex = blockStartIndices[iterIndex]
        blockLength = blockSizes[iterIndex]

        # pass 1: plain maximum over the block
        currentMax = valuesView[startIndex]
        for elementIndex in range(1, blockLength):
            currentValue = valuesView[startIndex + elementIndex]
            if currentValue > currentMax:
                currentMax = currentValue

        firstIdx = -1
        lastIdx = -1
        if eps > 0.0:
            # only run if eps tol is non-zero
            # pass 2: locate first/last indices within eps of the max
            for elementIndex in range(blockLength):
                currentValue = valuesView[startIndex + elementIndex]
                # NOTE: this is intended to mirror the +- eps tol
                if currentValue >= currentMax - eps:
                    if firstIdx == -1:
                        firstIdx = elementIndex
                    lastIdx = elementIndex

        if firstIdx == -1:
            # case: we didn't find a tie or eps == 0
            outputView[iterIndex] = currentMax
        else:
            # case: there's a tie for eps > 0, pick center
            centerIdx = (firstIdx + lastIdx) // 2
            outputView[iterIndex] = valuesView[startIndex + centerIdx]
514
+
515
+
516
cpdef double[::1] csampleBlockStats(cnp.ndarray[cnp.uint32_t, ndim=1] intervals,
        cnp.ndarray[cnp.float64_t, ndim=1] values,
        int expectedBlockSize,
        int iters,
        int randSeed,
        cnp.ndarray[cnp.uint8_t, ndim=1] excludeIdxMask,
        double eps = 0.0):
    r"""Sample contiguous blocks in the response sequence (xCorr), record maxima, and repeat.

    Used to build an empirical null distribution and determine significance of response outputs.
    The size of blocks is drawn from a truncated geometric distribution, preserving rough equality
    in expectation but allowing for variability to account for the sampling across different phases
    in the response sequence.

    :param values: The response sequence to sample from.
    :type values: cnp.ndarray[cnp.float64_t, ndim=1]
    :param expectedBlockSize: The expected size (geometric) of the blocks to sample.
    :type expectedBlockSize: int
    :param iters: The number of blocks to sample.
    :type iters: int
    :param randSeed: Random seed for reproducibility.
    :type randSeed: int
    :param excludeIdxMask: uint8 mask over intervals; block start positions whose
        window touches any non-zero entry are excluded from sampling.
    :type excludeIdxMask: cnp.ndarray[cnp.uint8_t, ndim=1]
    :return: An array of sampled block maxima.
    :rtype: cnp.ndarray[cnp.float64_t, ndim=1]
    :raises ValueError: if no admissible block start positions exist (sequence
        shorter than the largest sampled block, or exclusion mask too dense).
    :seealso: :func:`consenrich.matching.matchWavelet`
    """
    np.random.seed(randSeed)
    cdef cnp.ndarray[cnp.float64_t, ndim=1] valuesArr = np.ascontiguousarray(values, dtype=np.float64)
    cdef double[::1] valuesView = valuesArr
    cdef cnp.ndarray[cnp.intp_t, ndim=1] sizesArr
    cdef cnp.ndarray[cnp.intp_t, ndim=1] startsArr
    cdef cnp.ndarray[cnp.float64_t, ndim=1] out = np.empty(iters, dtype=np.float64)
    cdef Py_ssize_t maxBlockLength, maxSize, minSize
    cdef Py_ssize_t n = <Py_ssize_t>intervals.size
    cdef double maxBlockScale = <double>3.0
    cdef double minBlockScale = <double> (1.0 / 3.0)

    # block sizes: geometric with mean expectedBlockSize, truncated to
    # [max(3, size/3), min(3*size, n)]
    minSize = <Py_ssize_t> max(3, expectedBlockSize * minBlockScale)
    maxSize = <Py_ssize_t> min(maxBlockScale * expectedBlockSize, n)
    sizesArr = np.random.geometric(1.0 / expectedBlockSize, size=iters).astype(np.intp, copy=False)
    np.clip(sizesArr, minSize, maxSize, out=sizesArr)
    maxBlockLength = sizesArr.max()
    cdef list support = []
    cdef cnp.intp_t i_ = 0
    # admissible starts: positions whose worst-case window [i_, i_+maxBlockLength)
    # contains no excluded interval; skip past a hit in one jump
    while i_ < n-maxBlockLength:
        if excludeIdxMask[i_:i_ + maxBlockLength].any():
            i_ = i_ + maxBlockLength + 1
            continue
        support.append(i_)
        i_ = i_ + 1

    if not support:
        # previously fell through to np.random.choice([]) with an opaque
        # "a must be non-empty" error; fail with an actionable message
        raise ValueError(
            "csampleBlockStats: no admissible block start positions "
            "(sequence too short for the sampled block sizes, or the "
            "exclusion mask is too dense)"
        )

    cdef cnp.ndarray[cnp.intp_t, ndim=1] samples = np.random.choice(
        support,
        size=iters,
        replace=True,
        p=None
    ).astype(np.intp)

    cdef Py_ssize_t[::1] startsView = samples
    cdef Py_ssize_t[::1] sizesView = sizesArr
    cdef double[::1] outView = out
    _blockMax(valuesView, startsView, sizesView, outView, eps)
    return out
579
+
580
+
581
cpdef cSparseAvg(cnp.float32_t[::1] trackALV, dict sparseMap):
    r"""Fast access and average of `numNearest` sparse elements.

    See :func:`consenrich.core.getMuncTrack`

    :param trackALV: See :func:`consenrich.core.getAverageLocalVarianceTrack`
    :type trackALV: float[::1]
    :param sparseMap: See :func:`consenrich.core.getSparseMap`
    :type sparseMap: dict[int, np.ndarray]
    :return: array of mean('nearest local variances'), same length as `trackALV`
    :rtype: cnp.ndarray[cnp.float32_t, ndim=1]
    """
    cdef Py_ssize_t trackLength = <Py_ssize_t>trackALV.shape[0]
    cdef cnp.ndarray[cnp.float32_t, ndim=1] averaged = np.empty(trackLength, dtype=np.float32)
    cdef Py_ssize_t pos, k, numNeighbors
    cdef float varianceSum = 0.0
    cdef cnp.ndarray[cnp.intp_t, ndim=1] neighborIdx
    cdef cnp.intp_t[::1] neighborView
    for pos in range(trackLength):
        # FFR: to avoid the cast, create sparseMap as dict[intp, np.ndarray[intp]]
        neighborIdx = <cnp.ndarray[cnp.intp_t, ndim=1]> sparseMap[pos]
        neighborView = neighborIdx
        # FFR: maybe enforce strict `numNeighbors == numNearest` in future releases
        numNeighbors = neighborView.shape[0]
        if numNeighbors == 0:
            # empty neighbor list: report 0.0 (arguably should raise or emit NaN)
            averaged[pos] = 0.0
            continue
        varianceSum = 0.0
        with nogil:
            for k in range(numNeighbors):
                varianceSum += trackALV[neighborView[k]]
        averaged[pos] = varianceSum/numNeighbors

    return averaged
614
+
615
+
616
+ cpdef int64_t cgetFragmentLength(
617
+ str bamFile,
618
+ uint16_t samThreads=0,
619
+ uint16_t samFlagExclude=3844,
620
+ int64_t maxInsertSize=1000,
621
+ int64_t iters=1000,
622
+ int64_t blockSize=5000,
623
+ int64_t fallBack=147,
624
+ int64_t rollingChunkSize=250,
625
+ int64_t lagStep=10,
626
+ int64_t earlyExit=250,
627
+ int64_t randSeed=42,
628
+ ):
629
+
630
+ # FFR: standardize, across codebase, random seeding (e.g., np.random.seed vs default_rng)
631
+ cdef object rng = default_rng(randSeed)
632
+ cdef int64_t regionLen, numRollSteps
633
+ cdef int numChunks
634
+ cdef cnp.ndarray[cnp.float64_t, ndim=1] rawArr
635
+ cdef cnp.ndarray[cnp.float64_t, ndim=1] medArr
636
+ cdef AlignmentFile aln
637
+ cdef AlignedSegment readSeg
638
+ cdef list coverageIdxTopK
639
+ cdef list blockCenters
640
+ cdef list bestLags
641
+ cdef int i, j, k, idxVal
642
+ cdef int startIdx, endIdx
643
+ cdef int winSize, takeK
644
+ cdef int blockHalf, readFlag
645
+ cdef int chosenLag, lag, maxValidLag
646
+ cdef int strand
647
+ cdef int expandedLen
648
+ cdef int samThreadsInternal
649
+ cdef int cpuCount
650
+ cdef int64_t blockStartBP, blockEndBP, readStart, readEnd
651
+ cdef int64_t med
652
+ cdef double score
653
+ cdef cnp.ndarray[cnp.intp_t, ndim=1] unsortedIdx, sortedIdx, expandedIdx
654
+ cdef cnp.intp_t[::1] expandedIdxView
655
+ cdef cnp.ndarray[cnp.float64_t, ndim=1] unsortedVals
656
+ cdef cnp.ndarray[cnp.uint8_t, ndim=1] seen
657
+ cdef cnp.ndarray[cnp.float64_t, ndim=1] fwd
658
+ cdef cnp.ndarray[cnp.float64_t, ndim=1] rev
659
+ cdef cnp.ndarray[cnp.float64_t, ndim=1] fwdDiff
660
+ cdef cnp.ndarray[cnp.float64_t, ndim=1] revDiff
661
+ cdef int64_t diffS, diffE
662
+ cdef cnp.ndarray[cnp.uint32_t, ndim=1] bestLagsArr
663
+ cdef bint isPairedEnd = <bint>0
664
+ cdef double avgTemplateLen = <double>0.0
665
+ cdef int64_t templateLenSamples = <int64_t>0
666
+ cdef double avgReadLength = <double>0.0
667
+ cdef int64_t numReadLengthSamples = <int64_t>0
668
+ cdef int64_t minInsertSize
669
+ cdef int64_t requiredSamplesPE
670
+
671
+ # rather than taking `chromosome`, `start`, `end`
672
+ # ... we will just look at BAM contigs present and use
673
+ # ... the three largest to estimate the fragment length
674
+ cdef tuple contigs
675
+ cdef tuple lengths
676
+ cdef Py_ssize_t contigIdx
677
+ cdef str contig
678
+ cdef int64_t contigLen
679
+ cdef object top2ContigsIdx
680
+
681
+ cdef double[::1] fwdView
682
+ cdef double[::1] revView
683
+ cdef double[::1] fwdDiffView
684
+ cdef double[::1] revDiffView
685
+ cdef double runningSum
686
+ cdef double fwdSum
687
+ cdef double revSum
688
+ cdef double fwdMean
689
+ cdef double revMean
690
+ cdef double bestScore
691
+ cdef int bestLag
692
+ cdef int blockLen
693
+ cdef int localMinLag
694
+ cdef int localMaxLag
695
+ cdef int localLagStep
696
+
697
+ earlyExit = min(earlyExit, iters)
698
+
699
+ samThreadsInternal = <int>samThreads
700
+ cpuCount = <uint32_t>os.cpu_count()
701
+ if cpuCount is None:
702
+ cpuCount = 1
703
+ if samThreads < 1:
704
+ samThreadsInternal = <int>min(max(1,cpuCount // 2), 4)
705
+
706
+ aln = AlignmentFile(bamFile, "rb", threads=samThreadsInternal)
707
+ try:
708
+ contigs = aln.references
709
+ lengths = aln.lengths
710
+
711
+ if contigs is None or len(contigs) == 0:
712
+ return <int64_t>fallBack
713
+
714
+ top2ContigsIdx = np.argsort(lengths)[-min(2, len(contigs)):]
715
+
716
+ for contigIdx in top2ContigsIdx:
717
+ contig = contigs[contigIdx]
718
+ for readSeg in aln.fetch(contig):
719
+ if (readSeg.flag & samFlagExclude) != 0:
720
+ continue
721
+ if numReadLengthSamples < iters:
722
+ avgReadLength += readSeg.query_length
723
+ numReadLengthSamples += 1
724
+ else:
725
+ break
726
+
727
+ avgReadLength /= numReadLengthSamples if numReadLengthSamples > 0 else 1
728
+ minInsertSize = <int64_t>(avgReadLength + 0.5)
729
+ if minInsertSize < 1:
730
+ minInsertSize = 1
731
+ if minInsertSize > maxInsertSize:
732
+ minInsertSize = maxInsertSize
733
+
734
+ for contigIdx in top2ContigsIdx:
735
+ contig = contigs[contigIdx]
736
+ for readSeg in aln.fetch(contig):
737
+ if (readSeg.flag & samFlagExclude) != 0:
738
+ continue
739
+ if readSeg.is_paired:
740
+ # skip to the paired-end block below (no xCorr --> average template len)
741
+ isPairedEnd = <bint>1
742
+ break
743
+ if isPairedEnd:
744
+ break
745
+
746
+ if isPairedEnd:
747
+ requiredSamplesPE = max(iters, 1000)
748
+
749
+ for contigIdx in top2ContigsIdx:
750
+ if templateLenSamples >= requiredSamplesPE:
751
+ break
752
+ contig = contigs[contigIdx]
753
+
754
+ for readSeg in aln.fetch(contig):
755
+ if templateLenSamples >= requiredSamplesPE:
756
+ break
757
+ if (readSeg.flag & samFlagExclude) != 0 or (readSeg.flag & 2) == 0:
758
+ # skip any excluded flags, only count proper pairs
759
+ continue
760
+ if readSeg.template_length > 0 and readSeg.is_read1:
761
+ # read1 only: otherwise each pair contributes to the mean twice
762
+ # ...which might reduce breadth of the estimate
763
+ avgTemplateLen += abs(readSeg.template_length)
764
+ templateLenSamples += 1
765
+
766
+ if templateLenSamples < requiredSamplesPE:
767
+ return <int64_t> fallBack
768
+
769
+ avgTemplateLen /= <double>templateLenSamples
770
+
771
+ if avgTemplateLen >= minInsertSize and avgTemplateLen <= maxInsertSize:
772
+ return <int64_t> (avgTemplateLen + 0.5)
773
+ else:
774
+ return <int64_t> fallBack
775
+
776
+ top2ContigsIdx = np.argsort(lengths)[-min(2, len(contigs)):]
777
+ bestLags = []
778
+ blockHalf = blockSize // 2
779
+
780
+ fwd = np.zeros(blockSize, dtype=np.float64, order='C')
781
+ rev = np.zeros(blockSize, dtype=np.float64, order='C')
782
+ fwdDiff = np.zeros(blockSize+1, dtype=np.float64, order='C')
783
+ revDiff = np.zeros(blockSize+1, dtype=np.float64, order='C')
784
+
785
+ fwdView = fwd
786
+ revView = rev
787
+ fwdDiffView = fwdDiff
788
+ revDiffView = revDiff
789
+
790
+ for contigIdx in top2ContigsIdx:
791
+ contig = contigs[contigIdx]
792
+ contigLen = <int64_t>lengths[contigIdx]
793
+ regionLen = contigLen
794
+
795
+ if regionLen < blockSize or regionLen <= 0:
796
+ continue
797
+
798
+ if maxInsertSize < 1:
799
+ maxInsertSize = 1
800
+
801
+ # first, we build a coarse read coverage track from `start` to `end`
802
+ numRollSteps = regionLen // rollingChunkSize
803
+ if numRollSteps <= 0:
804
+ numRollSteps = 1
805
+ numChunks = <int>numRollSteps
806
+
807
+ rawArr = np.zeros(numChunks, dtype=np.float64)
808
+ medArr = np.zeros(numChunks, dtype=np.float64)
809
+
810
+ for readSeg in aln.fetch(contig):
811
+ if (readSeg.flag & samFlagExclude) != 0:
812
+ continue
813
+ j = <int>((readSeg.reference_start) // rollingChunkSize)
814
+ if 0 <= j < numChunks:
815
+ rawArr[j] += 1.0
816
+
817
+ # second, we apply a rolling/moving/local/weywtci order-statistic filter (median)
818
+ # ...the size of the kernel is based on the blockSize -- we want high-coverage
819
+ # ...blocks as measured by their local median read count
820
+ winSize = <int>(blockSize // rollingChunkSize)
821
+ if winSize < 1:
822
+ winSize = 1
823
+ if (winSize & 1) == 0:
824
+ winSize += 1
825
+ medArr[:] = ndimage.median_filter(rawArr, size=winSize, mode="nearest")
826
+
827
+ # we pick the largest local-medians and form a block around each
828
+ takeK = iters if iters < numChunks else numChunks
829
+ unsortedIdx = np.argpartition(medArr, -takeK)[-takeK:]
830
+ unsortedVals = medArr[unsortedIdx]
831
+ sortedIdx = unsortedIdx[np.argsort(unsortedVals)[::-1]]
832
+ coverageIdxTopK = sortedIdx[:takeK].tolist()
833
+
834
+ expandedLen = takeK*winSize
835
+ expandedIdx = np.empty(expandedLen, dtype=np.intp)
836
+ expandedIdxView = expandedIdx
837
+ k = 0
838
+ for i in range(takeK):
839
+ idxVal = coverageIdxTopK[i]
840
+ startIdx = idxVal - (winSize // 2)
841
+ endIdx = startIdx + winSize
842
+ if startIdx < 0:
843
+ startIdx = 0
844
+ endIdx = winSize if winSize < numChunks else numChunks
845
+ if endIdx > numChunks:
846
+ endIdx = numChunks
847
+ startIdx = endIdx - winSize if winSize <= numChunks else 0
848
+ for j in range(startIdx, endIdx):
849
+ expandedIdxView[k] = j
850
+ k += 1
851
+ if k < expandedLen:
852
+ expandedIdx = expandedIdx[:k]
853
+ expandedIdxView = expandedIdx
854
+
855
+ seen = np.zeros(numChunks, dtype=np.uint8)
856
+ blockCenters = []
857
+ for i in range(expandedIdx.shape[0]):
858
+ j = <int>expandedIdxView[i]
859
+ if seen[j] == 0:
860
+ seen[j] = 1
861
+ blockCenters.append(j)
862
+
863
+ if len(blockCenters) > 1:
864
+ rng.shuffle(blockCenters)
865
+
866
+ for idxVal in blockCenters:
867
+ # this should map back to genomic coordinates
868
+ blockStartBP = idxVal * rollingChunkSize + (rollingChunkSize // 2) - blockHalf
869
+ if blockStartBP < 0:
870
+ blockStartBP = 0
871
+ blockEndBP = blockStartBP + blockSize
872
+ if blockEndBP > contigLen:
873
+ blockEndBP = contigLen
874
+ blockStartBP = blockEndBP - blockSize
875
+ if blockStartBP < 0:
876
+ continue
877
+
878
+ # now we build strand-specific tracks
879
+ # ...avoid forward/reverse strand for loops in each block w/ a cumsum
880
+ fwd.fill(0.0)
881
+ fwdDiff.fill(0.0)
882
+ rev.fill(0.0)
883
+ revDiff.fill(0.0)
884
+ readFlag = -1
885
+
886
+ for readSeg in aln.fetch(contig, blockStartBP, blockEndBP):
887
+ readFlag = readSeg.flag
888
+ readStart = <int64_t>readSeg.reference_start
889
+ readEnd = <int64_t>readSeg.reference_end
890
+ if (readFlag & samFlagExclude) != 0:
891
+ continue
892
+ if readStart < blockStartBP or readEnd > blockEndBP:
893
+ continue
894
+ diffS = readStart - blockStartBP
895
+ diffE = readEnd - blockStartBP
896
+ strand = readFlag & 16
897
+ if strand == 0:
898
+ # forward
899
+ # just mark offsets from block start/end
900
+ fwdDiffView[<int>diffS] += 1.0
901
+ fwdDiffView[<int>diffE] -= 1.0
902
+ else:
903
+ # reverse
904
+ # ditto
905
+ revDiffView[<int>diffS] += 1.0
906
+ revDiffView[<int>diffE] -= 1.0
907
+
908
+ maxValidLag = maxInsertSize if (maxInsertSize < blockSize) else (blockSize - 1)
909
+ localMinLag = <int>minInsertSize
910
+ localMaxLag = <int>maxValidLag
911
+ if localMaxLag < localMinLag:
912
+ continue
913
+ localLagStep = <int>lagStep
914
+ if localLagStep < 1:
915
+ localLagStep = 1
916
+
917
+ # now we can get coverage track by summing over diffs
918
+ # maximizes the crossCovar(forward, reverse, lag) wrt lag.
919
+ with nogil:
920
+ runningSum = 0.0
921
+ for i from 0 <= i < blockSize:
922
+ runningSum += fwdDiffView[i]
923
+ fwdView[i] = runningSum
924
+
925
+ runningSum = 0.0
926
+ for i from 0 <= i < blockSize:
927
+ runningSum += revDiffView[i]
928
+ revView[i] = runningSum
929
+
930
+ fwdSum = 0.0
931
+ revSum = 0.0
932
+ for i from 0 <= i < blockSize:
933
+ fwdSum += fwdView[i]
934
+ revSum += revView[i]
935
+
936
+ fwdMean = fwdSum / blockSize
937
+ revMean = revSum / blockSize
938
+
939
+ for i from 0 <= i < blockSize:
940
+ fwdView[i] = fwdView[i] - fwdMean
941
+ revView[i] = revView[i] - revMean
942
+
943
+ bestScore = -1e308
944
+ bestLag = -1
945
+ for lag from localMinLag <= lag <= localMaxLag by localLagStep:
946
+ score = 0.0
947
+ blockLen = blockSize - lag
948
+ for i from 0 <= i < blockLen:
949
+ score += fwdView[i] * revView[i + lag]
950
+ if score > bestScore:
951
+ bestScore = score
952
+ bestLag = lag
953
+
954
+ chosenLag = bestLag
955
+
956
+ if chosenLag > 0 and bestScore != 0.0:
957
+ bestLags.append(chosenLag)
958
+ if len(bestLags) >= earlyExit:
959
+ break
960
+
961
+ finally:
962
+ aln.close()
963
+
964
+ if len(bestLags) < 3:
965
+ return fallBack
966
+
967
+ bestLagsArr = np.asarray(bestLags, dtype=np.uint32)
968
+ med = int(np.median(bestLagsArr) + avgReadLength + 0.5)
969
+ if med < minInsertSize:
970
+ med = <int>minInsertSize
971
+ elif med > maxInsertSize:
972
+ med = <int>maxInsertSize
973
+ return <int64_t>med
974
+
975
+
976
+
977
cdef inline Py_ssize_t getInsertion(const uint32_t* array_, Py_ssize_t n, uint32_t x) nogil:
    # Helper: upper-bound binary search over the sorted `array_`.
    # Returns the index of the first element strictly greater than `x`
    # (i.e., the insertion point that keeps the array sorted, ties to the right).
    cdef Py_ssize_t lo = 0
    cdef Py_ssize_t hi = n
    cdef Py_ssize_t mid
    while lo < hi:
        mid = lo + ((hi - lo) >> 1)
        if x < array_[mid]:
            hi = mid
        else:
            lo = mid + 1
    return lo
989
+
990
+
991
cdef int maskMembership(const uint32_t* pos, Py_ssize_t numIntervals, const uint32_t* mStarts, const uint32_t* mEnds, Py_ssize_t n, uint8_t* outMask) nogil:
    # For each position pos[i], set outMask[i] = 1 iff the position lies inside
    # one of the (sorted, merged) intervals [mStarts[k], mEnds[k]): locate the
    # rightmost interval start <= pos[i], then test against that interval's end.
    cdef Py_ssize_t i
    cdef Py_ssize_t hit
    cdef uint32_t p
    for i in range(numIntervals):
        p = pos[i]
        hit = getInsertion(mStarts, n, p) - 1
        if hit >= 0 and p < mEnds[hit]:
            outMask[i] = <uint8_t>1
        else:
            outMask[i] = <uint8_t>0
    return 0
1004
+
1005
+
1006
cpdef cnp.ndarray[cnp.uint8_t, ndim=1] cbedMask(
    str chromosome,
    str bedFile,
    cnp.ndarray[cnp.uint32_t, ndim=1] intervals,
    int stepSize
):
    r"""Build a 1/0 membership mask for interval start positions against a BED file.

    The BED file is assumed sorted and merged. Only records whose first column
    equals `chromosome` are considered. An interval's mask entry is set to 1
    when its start position falls inside a BED region.

    :param chromosome: Chromosome name to match against BED column 1.
    :type chromosome: str
    :param bedFile: Path to a sorted and merged BED file.
    :type bedFile: str
    :param intervals: Sorted, non-overlapping interval start positions.
    :type intervals: cnp.ndarray[cnp.uint32_t, ndim=1]
    :param stepSize: Step size between genomic positions in `intervals`.
        NOTE(review): retained for interface compatibility — the masking below
        only tests start positions and does not read this value.
    :type stepSize: int32_t
    :return: Mask where `1` marks an interval overlapping a BED region.
    :rtype: cnp.ndarray[cnp.uint8_t, ndim=1]

    """
    cdef list startBounds = []
    cdef list endBounds = []
    cdef object handle = open(bedFile, "r")
    cdef str rawLine
    cdef list fields
    try:
        for rawLine in handle:
            rawLine = rawLine.strip()
            # skip blank lines and '#' comments
            if not rawLine or rawLine[0] == '#':
                continue
            fields = rawLine.split('\t')
            # require chrom/start/end on the requested chromosome
            if len(fields) < 3 or fields[0] != chromosome:
                continue
            startBounds.append(int(fields[1]))
            endBounds.append(int(fields[2]))
    finally:
        handle.close()
    cdef Py_ssize_t numPositions = intervals.size
    cdef cnp.ndarray[cnp.uint8_t, ndim=1] overlapMask = np.zeros(numPositions, dtype=np.uint8)
    # nothing on this chromosome: every entry stays 0
    if not startBounds:
        return overlapMask
    cdef cnp.ndarray[cnp.uint32_t, ndim=1] startArr = np.asarray(startBounds, dtype=np.uint32)
    cdef cnp.ndarray[cnp.uint32_t, ndim=1] endArr = np.asarray(endBounds, dtype=np.uint32)
    cdef cnp.uint32_t[:] startView = startArr
    cdef cnp.uint32_t[:] endView = endArr
    cdef cnp.uint32_t[:] positionView = intervals
    cdef cnp.uint8_t[:] maskView = overlapMask
    # raw pointers so the membership scan can run without the GIL;
    # NULL-guarded against empty buffers
    cdef uint32_t* startPtr = &startView[0] if startArr.size > 0 else <uint32_t*>NULL
    cdef uint32_t* endPtr = &endView[0] if endArr.size > 0 else <uint32_t*>NULL
    cdef uint32_t* positionPtr = &positionView[0] if numPositions > 0 else <uint32_t*>NULL
    cdef uint8_t* maskPtr = &maskView[0] if numPositions > 0 else <uint8_t*>NULL
    cdef Py_ssize_t numRegions = startArr.size
    with nogil:
        if numPositions > 0 and numRegions > 0:
            maskMembership(positionPtr, numPositions, startPtr, endPtr, numRegions, maskPtr)
    return overlapMask