consenrich-0.7.4b2-cp312-cp312-macosx_10_13_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- consenrich/__init__.py +11 -0
- consenrich/cconsenrich.c +48685 -0
- consenrich/cconsenrich.cpython-312-darwin.so +0 -0
- consenrich/cconsenrich.pyx +861 -0
- consenrich/consenrich.py +1381 -0
- consenrich/constants.py +172 -0
- consenrich/core.py +1428 -0
- consenrich/data/ce10.sizes +6 -0
- consenrich/data/ce10_blacklist.bed +100 -0
- consenrich/data/ce10_sparse.bed +11828 -0
- consenrich/data/ce11.sizes +6 -0
- consenrich/data/ce11_blacklist.bed +97 -0
- consenrich/data/ce11_sparse.bed +11828 -0
- consenrich/data/dm6.sizes +7 -0
- consenrich/data/dm6_blacklist.bed +182 -0
- consenrich/data/dm6_sparse.bed +20000 -0
- consenrich/data/hg19.sizes +24 -0
- consenrich/data/hg19_blacklist.bed +834 -0
- consenrich/data/hg19_sparse.bed +288358 -0
- consenrich/data/hg38.sizes +24 -0
- consenrich/data/hg38_blacklist.bed +636 -0
- consenrich/data/hg38_sparse.bed +288699 -0
- consenrich/data/mm10.sizes +21 -0
- consenrich/data/mm10_blacklist.bed +3435 -0
- consenrich/data/mm10_sparse.bed +100400 -0
- consenrich/data/mm39.sizes +21 -0
- consenrich/data/mm39_blacklist.bed +3360 -0
- consenrich/data/mm39_sparse.bed +100381 -0
- consenrich/detrorm.py +249 -0
- consenrich/matching.py +901 -0
- consenrich/misc_util.py +122 -0
- consenrich-0.7.4b2.dist-info/METADATA +65 -0
- consenrich-0.7.4b2.dist-info/RECORD +37 -0
- consenrich-0.7.4b2.dist-info/WHEEL +6 -0
- consenrich-0.7.4b2.dist-info/entry_points.txt +2 -0
- consenrich-0.7.4b2.dist-info/licenses/LICENSE +21 -0
- consenrich-0.7.4b2.dist-info/top_level.txt +1 -0
consenrich/detrorm.py
ADDED
@@ -0,0 +1,249 @@
+# -*- coding: utf-8 -*-
+
+import os
+from typing import List, Optional, Tuple
+import logging
+import re
+import numpy as np
+import pandas as pd
+import pybedtools as bed
+import pysam as sam
+
+from scipy import signal, ndimage
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(module)s.%(funcName)s - %(levelname)s - %(message)s",
+)
+logging.basicConfig(
+    level=logging.WARNING,
+    format="%(asctime)s - %(module)s.%(funcName)s - %(levelname)s - %(message)s",
+)
+logger = logging.getLogger(__name__)
+
+from .misc_util import getChromSizesDict
+from .constants import EFFECTIVE_GENOME_SIZES
+
+
+def getScaleFactor1x(
+    bamFile: str,
+    effectiveGenomeSize: int,
+    readLength: int,
+    excludeChroms: List[str],
+    chromSizesFile: str,
+    samThreads: int,
+) -> float:
+    r"""Generic normalization factor based on effective genome size and number of mapped reads in non-excluded chromosomes.
+
+    :param bamFile: See :class:`consenrich.core.inputParams`.
+    :type bamFile: str
+    :param effectiveGenomeSize: Effective genome size in base pairs. See :func:`consenrich.constants.getEffectiveGenomeSize`.
+    :type effectiveGenomeSize: int
+    :param readLength: read length or fragment length
+    :type readLength: int
+    :param excludeChroms: List of chromosomes to exclude from the analysis.
+    :type excludeChroms: List[str]
+    :param chromSizesFile: Path to the chromosome sizes file.
+    :type chromSizesFile: str
+    :param samThreads: See :class:`consenrich.core.samParams`.
+    :type samThreads: int
+    :return: Scale factor for 1x normalization.
+    :rtype: float
+    """
+    if excludeChroms is not None:
+        if chromSizesFile is None:
+            raise ValueError(
+                "`excludeChroms` is provided...so must be `chromSizesFile`."
+            )
+        chromSizes: dict = getChromSizesDict(chromSizesFile)
+        for chrom in excludeChroms:
+            if chrom not in chromSizes:
+                continue
+            effectiveGenomeSize -= chromSizes[chrom]
+    totalMappedReads: int = -1
+    with sam.AlignmentFile(bamFile, "rb", threads=samThreads) as aln:
+        totalMappedReads = aln.mapped
+        if excludeChroms is not None:
+            idxStats = aln.get_index_statistics()
+            for element in idxStats:
+                if element.contig in excludeChroms:
+                    totalMappedReads -= element.mapped
+    if totalMappedReads <= 0 or effectiveGenomeSize <= 0:
+        raise ValueError(
+            f"Negative EGS after removing excluded chromosomes or no mapped reads: EGS={effectiveGenomeSize}, totalMappedReads={totalMappedReads}."
+        )
+    return round(
+        effectiveGenomeSize / (totalMappedReads * readLength), 4
+    )
+
+
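To see what `getScaleFactor1x` returns in practice, here is a minimal arithmetic sketch; the read count and read length are invented for illustration, and the hg38 effective-size figure is a commonly cited reference value rather than something read from this package.

```python
# Sketch of the 1x ("reads per genomic content") factor computed above:
# effectiveGenomeSize / (totalMappedReads * readLength).
effective_genome_size = 2_913_022_398  # commonly cited hg38 effective size (assumption)
mapped_reads = 25_000_000              # hypothetical mapped-read count
read_length = 100                      # hypothetical read/fragment length in bp
scale_1x = round(effective_genome_size / (mapped_reads * read_length), 4)
print(scale_1x)  # 1.1652 -> multiply per-base counts by this to reach ~1x mean coverage
```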
+def getScaleFactorPerMillion(
+    bamFile: str, excludeChroms: List[str]
+) -> float:
+    r"""Generic normalization factor based on number of mapped reads in non-excluded chromosomes.
+
+    :param bamFile: See :class:`consenrich.core.inputParams`.
+    :type bamFile: str
+    :param excludeChroms: List of chromosomes to exclude when counting mapped reads.
+    :type excludeChroms: List[str]
+    :return: Scale factor accounting for number of mapped reads (only).
+    :rtype: float
+    """
+    if not os.path.exists(bamFile):
+        raise FileNotFoundError(f"BAM file {bamFile} does not exist.")
+    totalMappedReads: int = 0
+    with sam.AlignmentFile(bamFile, "rb") as aln:
+        totalMappedReads = aln.mapped
+        if excludeChroms is not None:
+            idxStats = aln.get_index_statistics()
+            for element in idxStats:
+                if element.contig in excludeChroms:
+                    totalMappedReads -= element.mapped
+    if totalMappedReads <= 0:
+        raise ValueError(
+            f"After removing reads mapping to excluded chroms, totalMappedReads is {totalMappedReads}."
+        )
+    scalePM = round(1_000_000 / totalMappedReads, 4)
+    return scalePM
+
+
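For comparison, `getScaleFactorPerMillion` reduces to 1e6 over the retained mapped-read count. A one-line sketch with a made-up count:

```python
total_mapped_reads = 18_500_000                  # hypothetical count after exclusions
print(round(1_000_000 / total_mapped_reads, 4))  # 0.0541 (counts-per-million style factor)
```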
+def getPairScaleFactors(
+    bamFileA: str,
+    bamFileB: str,
+    effectiveGenomeSizeA: int,
+    effectiveGenomeSizeB: int,
+    readLengthA: int,
+    readLengthB: int,
+    excludeChroms: List[str],
+    chromSizesFile: str,
+    samThreads: int,
+    scaleDown: bool = True,
+) -> Tuple[float, float]:
+    r"""Get scaling constants that normalize two alignment files to each other (e.g. ChIP-seq treatment and control) with respect to sequence coverage.
+
+    :param bamFileA: Path to the first BAM file.
+    :type bamFileA: str
+    :param bamFileB: Path to the second BAM file.
+    :type bamFileB: str
+    :param effectiveGenomeSizeA: Effective genome size for the first BAM file.
+    :type effectiveGenomeSizeA: int
+    :param effectiveGenomeSizeB: Effective genome size for the second BAM file.
+    :type effectiveGenomeSizeB: int
+    :param readLengthA: read length or fragment length for the first BAM file.
+    :type readLengthA: int
+    :param readLengthB: read length or fragment length for the second BAM file.
+    :type readLengthB: int
+    :param excludeChroms: List of chromosomes to exclude from the analysis.
+    :type excludeChroms: List[str]
+    :param chromSizesFile: Path to the chromosome sizes file.
+    :type chromSizesFile: str
+    :param samThreads: Number of threads to use for reading BAM files.
+    :type samThreads: int
+    :return: A tuple containing the scale factors for the first and second BAM files.
+    :rtype: Tuple[float, float]
+    """
+    scaleFactorA = getScaleFactor1x(
+        bamFileA,
+        effectiveGenomeSizeA,
+        readLengthA,
+        excludeChroms,
+        chromSizesFile,
+        samThreads,
+    )
+    scaleFactorB = getScaleFactor1x(
+        bamFileB,
+        effectiveGenomeSizeB,
+        readLengthB,
+        excludeChroms,
+        chromSizesFile,
+        samThreads,
+    )
+    logger.info(
+        f"Initial scale factors: {bamFileA}: {scaleFactorA}, {bamFileB}: {scaleFactorB}"
+    )
+    if not scaleDown:
+        return scaleFactorA, scaleFactorB
+    coverageA = 1 / scaleFactorA
+    coverageB = 1 / scaleFactorB
+    if coverageA < coverageB:
+        scaleFactorB *= coverageA / coverageB
+        scaleFactorA = 1.0
+    else:
+        scaleFactorA *= coverageB / coverageA
+        scaleFactorB = 1.0
+
+    logger.info(
+        f"Final scale factors: {bamFileA}: {scaleFactorA}, {bamFileB}: {scaleFactorB}"
+    )
+
+    ratio = max(scaleFactorA, scaleFactorB) / min(
+        scaleFactorA, scaleFactorB
+    )
+    if ratio > 5.0:
+        logger.warning(
+            f"Scale factors differ > 5x....\n"
+            f"\n\tAre effective genome sizes {effectiveGenomeSizeA} and {effectiveGenomeSizeB} correct?"
+            f"\n\tAre read/fragment lengths {readLengthA},{readLengthB} correct?"
+        )
+    return scaleFactorA, scaleFactorB
+
+
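The `scaleDown` branch in `getPairScaleFactors` rescales whichever library is deeper so both end up at the shallower library's coverage. A small sketch of just that branch, using invented 1x factors rather than real BAM files:

```python
# Hypothetical 1x factors: a smaller factor implies deeper mean coverage.
scale_a, scale_b = 0.8, 2.0
coverage_a, coverage_b = 1 / scale_a, 1 / scale_b  # 1.25x vs 0.5x
if coverage_a < coverage_b:
    scale_b *= coverage_a / coverage_b
    scale_a = 1.0
else:
    # Library A is deeper: scale it down to match B's depth; B is left unscaled.
    scale_a *= coverage_b / coverage_a
    scale_b = 1.0
print(round(scale_a, 4), scale_b)  # 0.32 1.0
```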
+def detrendTrack(
+    values: np.ndarray,
+    stepSize: int,
+    detrendWindowLengthBP: int,
+    useOrderStatFilter: bool,
+    usePolyFilter: bool,
+    detrendTrackPercentile: float,
+    detrendSavitzkyGolayDegree: int,
+) -> np.ndarray:
+    r"""Detrend tracks using either an order statistic filter or a polynomial filter.
+
+    :param values: Values to detrend.
+    :type values: np.ndarray
+    :param stepSize: see :class:`consenrich.core.countingParams`.
+    :type stepSize: int
+    :param detrendWindowLengthBP: See :class:`consenrich.core.detrendParams`.
+    :type detrendWindowLengthBP: int
+    :param useOrderStatFilter: Whether to use a sliding order statistic filter.
+    :type useOrderStatFilter: bool
+    :param usePolyFilter: Whether to use a sliding polynomial/least squares filter.
+    :type usePolyFilter: bool
+    :param detrendTrackPercentile: Percentile to use for the order statistic filter.
+    :type detrendTrackPercentile: float
+    :param detrendSavitzkyGolayDegree: Degree of the polynomial for the Savitzky-Golay/Polynomial filter.
+    :type detrendSavitzkyGolayDegree: int
+    :return: Detrended values.
+    :rtype: np.ndarray
+    :raises ValueError: If the detrend window length is not greater than 3 times the step size
+        or if the values length is less than the detrend window length.
+    """
+    bothSpecified: bool = False
+    size = int(detrendWindowLengthBP / stepSize)
+    if size % 2 == 0:
+        size += 1
+    if size < 3:
+        raise ValueError("Required: windowLengthBP > 3*stepSize.")
+    if len(values) < size:
+        raise ValueError(
+            "values length must be greater than windowLength."
+        )
+
+    if useOrderStatFilter and usePolyFilter:
+        logger.warning(
+            "Both order statistic and polynomial filters specified...using order statistic filter."
+        )
+        bothSpecified = True
+
+    if useOrderStatFilter or bothSpecified:
+        return values - ndimage.percentile_filter(
+            values, detrendTrackPercentile, size=size
+        )
+    elif usePolyFilter:
+        return values - signal.savgol_filter(
+            values, size, detrendSavitzkyGolayDegree
+        )
+
+    return values - ndimage.uniform_filter1d(
+        values, size=size, mode="nearest"
+    )
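Finally, a self-contained sketch of the order-statistic detrending path used by `detrendTrack` (subtracting a sliding low-percentile baseline). The window length, percentile, and synthetic signal are arbitrary choices for the example, not package defaults.

```python
import numpy as np
from scipy import ndimage

rng = np.random.default_rng(0)
x = np.linspace(0, 4 * np.pi, 2000)
# Synthetic track: slow baseline drift + noise + two "enriched" bumps.
track = 5 + 2 * np.sin(x / 4) + rng.normal(0, 0.3, x.size)
track[500:520] += 4.0
track[1400:1430] += 6.0

size = 201  # odd window length in bins, as enforced above
baseline = ndimage.percentile_filter(track, 25, size=size)  # sliding 25th percentile
detrended = track - baseline  # same subtraction pattern as detrendTrack
print(detrended.mean(), detrended.max())  # slow drift removed; the two bumps remain prominent
```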