consenrich 0.7.11b2__cp314-cp314-macosx_15_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of consenrich might be problematic; consult the registry's advisory page for details.

Files changed (38) hide show
  1. consenrich/.dylibs/libomp.dylib +0 -0
  2. consenrich/__init__.py +11 -0
  3. consenrich/cconsenrich.c +50610 -0
  4. consenrich/cconsenrich.cpython-314-darwin.so +0 -0
  5. consenrich/cconsenrich.pyx +1065 -0
  6. consenrich/consenrich.py +1802 -0
  7. consenrich/constants.py +172 -0
  8. consenrich/core.py +2068 -0
  9. consenrich/data/ce10.sizes +6 -0
  10. consenrich/data/ce10_blacklist.bed +100 -0
  11. consenrich/data/ce10_sparse.bed +11828 -0
  12. consenrich/data/ce11.sizes +6 -0
  13. consenrich/data/ce11_blacklist.bed +97 -0
  14. consenrich/data/ce11_sparse.bed +11828 -0
  15. consenrich/data/dm6.sizes +7 -0
  16. consenrich/data/dm6_blacklist.bed +182 -0
  17. consenrich/data/dm6_sparse.bed +20000 -0
  18. consenrich/data/hg19.sizes +24 -0
  19. consenrich/data/hg19_blacklist.bed +834 -0
  20. consenrich/data/hg19_sparse.bed +288358 -0
  21. consenrich/data/hg38.sizes +24 -0
  22. consenrich/data/hg38_blacklist.bed +636 -0
  23. consenrich/data/hg38_sparse.bed +288699 -0
  24. consenrich/data/mm10.sizes +21 -0
  25. consenrich/data/mm10_blacklist.bed +3435 -0
  26. consenrich/data/mm10_sparse.bed +100400 -0
  27. consenrich/data/mm39.sizes +21 -0
  28. consenrich/data/mm39_blacklist.bed +3360 -0
  29. consenrich/data/mm39_sparse.bed +100381 -0
  30. consenrich/detrorm.py +297 -0
  31. consenrich/matching.py +929 -0
  32. consenrich/misc_util.py +122 -0
  33. consenrich-0.7.11b2.dist-info/METADATA +66 -0
  34. consenrich-0.7.11b2.dist-info/RECORD +38 -0
  35. consenrich-0.7.11b2.dist-info/WHEEL +6 -0
  36. consenrich-0.7.11b2.dist-info/entry_points.txt +2 -0
  37. consenrich-0.7.11b2.dist-info/licenses/LICENSE +21 -0
  38. consenrich-0.7.11b2.dist-info/top_level.txt +1 -0
consenrich/detrorm.py ADDED
@@ -0,0 +1,297 @@
# -*- coding: utf-8 -*-

import os
from typing import List, Optional, Tuple
import logging
import re
import numpy as np
import pandas as pd
import pybedtools as bed
import pysam as sam

from scipy import signal, ndimage

# NOTE: logging.basicConfig only configures the root logger on its first call;
# the original duplicate WARNING-level basicConfig call was a no-op and was removed.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(module)s.%(funcName)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

from .misc_util import getChromSizesDict
from .constants import EFFECTIVE_GENOME_SIZES
from .cconsenrich import cgetFragmentLength
def getScaleFactor1x(
    bamFile: str,
    effectiveGenomeSize: int,
    readLength: int,
    excludeChroms: List[str],
    chromSizesFile: str,
    samThreads: int,
) -> float:
    r"""Compute a 1x-coverage normalization factor from the effective genome size
    and the number of reads mapped to non-excluded chromosomes.

    :param bamFile: See :class:`consenrich.core.inputParams`.
    :type bamFile: str
    :param effectiveGenomeSize: Effective genome size in base pairs. See :func:`consenrich.constants.getEffectiveGenomeSize`.
    :type effectiveGenomeSize: int
    :param readLength: read length or fragment length
    :type readLength: int
    :param excludeChroms: List of chromosomes to exclude from the analysis.
    :type excludeChroms: List[str]
    :param chromSizesFile: Path to the chromosome sizes file.
    :type chromSizesFile: str
    :param samThreads: See :class:`consenrich.core.samParams`.
    :type samThreads: int
    :return: Scale factor for 1x normalization.
    :rtype: float
    :raises ValueError: If `excludeChroms` is given without `chromSizesFile`, or if
        no mapped reads / no genome remain after exclusion.
    """
    if excludeChroms is not None:
        if chromSizesFile is None:
            raise ValueError(
                "`excludeChroms` is provided...so must be `chromSizesFile`."
            )
        chromSizes: dict = getChromSizesDict(chromSizesFile)
        # shrink the effective genome size by every excluded chromosome we know about
        effectiveGenomeSize -= sum(
            chromSizes[chrom] for chrom in excludeChroms if chrom in chromSizes
        )

    with sam.AlignmentFile(bamFile, "rb", threads=samThreads) as aln:
        totalMappedReads: int = aln.mapped
        if excludeChroms is not None:
            # subtract reads that map to the excluded chromosomes
            totalMappedReads -= sum(
                stat.mapped
                for stat in aln.get_index_statistics()
                if stat.contig in excludeChroms
            )

    if totalMappedReads <= 0 or effectiveGenomeSize <= 0:
        raise ValueError(
            f"Negative EGS after removing excluded chromosomes or no mapped reads: EGS={effectiveGenomeSize}, totalMappedReads={totalMappedReads}."
        )

    return round(
        effectiveGenomeSize / (totalMappedReads * readLength), 5
    )
def getScaleFactorPerMillion(
    bamFile: str, excludeChroms: List[str], stepSize: int
) -> float:
    r"""Generic normalization factor based on number of mapped reads in non-excluded chromosomes.

    :param bamFile: See :class:`consenrich.core.inputParams`.
    :type bamFile: str
    :param excludeChroms: List of chromosomes to exclude when counting mapped reads.
    :type excludeChroms: List[str]
    :param stepSize: Step/interval size in base pairs. The per-million factor is
        rescaled by ``1000 / stepSize`` so values are comparable across step sizes.
    :type stepSize: int
    :return: Scale factor accounting for number of mapped reads (only).
    :rtype: float
    :raises FileNotFoundError: If ``bamFile`` does not exist.
    :raises ValueError: If no mapped reads remain after removing excluded chromosomes.
    """
    if not os.path.exists(bamFile):
        raise FileNotFoundError(f"BAM file {bamFile} does not exist.")
    totalMappedReads: int = 0
    with sam.AlignmentFile(bamFile, "rb") as aln:
        totalMappedReads = aln.mapped
        if excludeChroms is not None:
            # subtract reads mapping to excluded chromosomes
            for element in aln.get_index_statistics():
                if element.contig in excludeChroms:
                    totalMappedReads -= element.mapped
    if totalMappedReads <= 0:
        raise ValueError(
            f"After removing reads mapping to excluded chroms, totalMappedReads is {totalMappedReads}."
        )
    # reads-per-million, rescaled to a per-kilobase-equivalent step resolution
    return round((1_000_000 / totalMappedReads) * (1000 / stepSize), 5)
def _scaleDownPair(
    scaleFactorA: float, scaleFactorB: float
) -> Tuple[float, float]:
    # Scale the deeper-coverage track down to the shallower one. Coverage is
    # proportional to 1/scaleFactor, so the shallower file keeps a factor of 1.0
    # and the deeper file is shrunk by the coverage ratio.
    coverageA = 1 / scaleFactorA
    coverageB = 1 / scaleFactorB
    if coverageA < coverageB:
        return 1.0, scaleFactorB * (coverageA / coverageB)
    return scaleFactorA * (coverageB / coverageA), 1.0


def getPairScaleFactors(
    bamFileA: str,
    bamFileB: str,
    effectiveGenomeSizeA: int,
    effectiveGenomeSizeB: int,
    readLengthA: int,
    readLengthB: int,
    excludeChroms: List[str],
    chromSizesFile: str,
    samThreads: int,
    stepSize: int,
    scaleDown: bool = False,
    normMethod: str = "EGS",
) -> Tuple[float, float]:
    r"""Get scaling constants that normalize two alignment files to each other (e.g. ChIP-seq treatment and control) with respect to sequence coverage.

    :param bamFileA: Path to the first BAM file.
    :type bamFileA: str
    :param bamFileB: Path to the second BAM file.
    :type bamFileB: str
    :param effectiveGenomeSizeA: Effective genome size for the first BAM file.
    :type effectiveGenomeSizeA: int
    :param effectiveGenomeSizeB: Effective genome size for the second BAM file.
    :type effectiveGenomeSizeB: int
    :param readLengthA: read length or fragment length for the first BAM file.
    :type readLengthA: int
    :param readLengthB: read length or fragment length for the second BAM file.
    :type readLengthB: int
    :param excludeChroms: List of chromosomes to exclude from the analysis.
    :type excludeChroms: List[str]
    :param chromSizesFile: Path to the chromosome sizes file.
    :type chromSizesFile: str
    :param samThreads: Number of threads to use for reading BAM files.
    :type samThreads: int
    :param stepSize: Step size in base pairs (used only for RPKM normalization).
    :type stepSize: int
    :param scaleDown: If True, rescale the pair so the deeper-coverage file is scaled down to the shallower one.
    :type scaleDown: bool
    :param normMethod: Normalization method to use ("RPKM" or "EGS").
    :type normMethod: str
    :return: A tuple containing the scale factors for the first and second BAM files.
    :rtype: Tuple[float, float]
    """
    # RPKM
    if normMethod.upper() == "RPKM":
        scaleFactorA = getScaleFactorPerMillion(
            bamFileA,
            excludeChroms,
            stepSize,
        )
        scaleFactorB = getScaleFactorPerMillion(
            bamFileB,
            excludeChroms,
            stepSize,
        )
        logger.info(
            f"Initial scale factors (per million): {bamFileA}: {scaleFactorA}, {bamFileB}: {scaleFactorB}"
        )

        if not scaleDown:
            return scaleFactorA, scaleFactorB
        scaleFactorA, scaleFactorB = _scaleDownPair(scaleFactorA, scaleFactorB)

        logger.info(
            f"Final scale factors (per million): {bamFileA}: {scaleFactorA}, {bamFileB}: {scaleFactorB}"
        )

        # a >5x spread usually indicates a bad fragment-length estimate
        if max(scaleFactorA, scaleFactorB) / min(scaleFactorA, scaleFactorB) > 5.0:
            logger.warning(
                f"Scale factors differ > 5x....\n"
                f"\n\tAre read/fragment lengths {readLengthA},{readLengthB} correct?"
            )
        return scaleFactorA, scaleFactorB

    # EGS normalization
    scaleFactorA = getScaleFactor1x(
        bamFileA,
        effectiveGenomeSizeA,
        readLengthA,
        excludeChroms,
        chromSizesFile,
        samThreads,
    )
    scaleFactorB = getScaleFactor1x(
        bamFileB,
        effectiveGenomeSizeB,
        readLengthB,
        excludeChroms,
        chromSizesFile,
        samThreads,
    )
    logger.info(
        f"Initial scale factors: {bamFileA}: {scaleFactorA}, {bamFileB}: {scaleFactorB}"
    )
    if not scaleDown:
        return scaleFactorA, scaleFactorB
    scaleFactorA, scaleFactorB = _scaleDownPair(scaleFactorA, scaleFactorB)

    logger.info(
        f"Final scale factors: {bamFileA}: {scaleFactorA}, {bamFileB}: {scaleFactorB}"
    )

    # a >5x spread usually indicates a bad EGS or fragment-length estimate
    if max(scaleFactorA, scaleFactorB) / min(scaleFactorA, scaleFactorB) > 5.0:
        logger.warning(
            f"Scale factors differ > 5x....\n"
            f"\n\tAre effective genome sizes {effectiveGenomeSizeA} and {effectiveGenomeSizeB} correct?"
            f"\n\tAre read/fragment lengths {readLengthA},{readLengthB} correct?"
        )
    return scaleFactorA, scaleFactorB
def detrendTrack(
    values: np.ndarray,
    stepSize: int,
    detrendWindowLengthBP: int,
    useOrderStatFilter: bool,
    usePolyFilter: bool,
    detrendTrackPercentile: float,
    detrendSavitzkyGolayDegree: int,
) -> np.ndarray:
    r"""Remove a slowly-varying trend from a track.

    The trend is estimated with one of three sliding-window filters and
    subtracted from ``values``: an order-statistic (percentile) filter, a
    Savitzky-Golay polynomial filter, or — by default — a uniform (moving
    average) filter. If both the order-statistic and polynomial filters are
    requested, the order-statistic filter wins and a warning is logged.

    :param values: Values to detrend.
    :type values: np.ndarray
    :param stepSize: see :class:`consenrich.core.countingParams`.
    :type stepSize: int
    :param detrendWindowLengthBP: See :class:`consenrich.core.detrendParams`.
    :type detrendWindowLengthBP: int
    :param useOrderStatFilter: Whether to use a sliding order statistic filter.
    :type useOrderStatFilter: bool
    :param usePolyFilter: Whether to use a sliding polynomial/least squares filter.
    :type usePolyFilter: bool
    :param detrendTrackPercentile: Percentile to use for the order statistic filter.
    :type detrendTrackPercentile: float
    :param detrendSavitzkyGolayDegree: Degree of the polynomial for the Savitzky-Golay/Polynomial filter.
    :type detrendSavitzkyGolayDegree: int
    :return: Detrended values.
    :rtype: np.ndarray
    :raises ValueError: If the detrend window length is not greater than 3 times
        the step size or if the values length is less than the detrend window length.
    """
    size = int(detrendWindowLengthBP / stepSize)
    if size % 2 == 0:
        size += 1  # the filters below need an odd, centered window
    if size < 3:
        raise ValueError("Required: windowLengthBP > 3*stepSize.")
    if len(values) < size:
        raise ValueError(
            "values length must be greater than windowLength."
        )

    if useOrderStatFilter and usePolyFilter:
        # order-statistic filter takes precedence when both are requested
        logger.warning(
            "Both order statistic and polynomial filters specified...using order statistic filter."
        )

    if useOrderStatFilter:
        trend = ndimage.percentile_filter(
            values, detrendTrackPercentile, size=size
        )
    elif usePolyFilter:
        trend = signal.savgol_filter(
            values, size, detrendSavitzkyGolayDegree
        )
    else:
        # default: moving-average baseline
        trend = ndimage.uniform_filter1d(
            values, size=size, mode="nearest"
        )
    return values - trend