consenrich 0.7.2b2__cp313-cp313-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of consenrich might be problematic. Click here for more details.
- consenrich/__init__.py +11 -0
- consenrich/cconsenrich.c +48405 -0
- consenrich/cconsenrich.cpython-313-darwin.so +0 -0
- consenrich/cconsenrich.pyx +836 -0
- consenrich/consenrich.py +1165 -0
- consenrich/constants.py +168 -0
- consenrich/core.py +1342 -0
- consenrich/data/ce10.sizes +6 -0
- consenrich/data/ce10_blacklist.bed +100 -0
- consenrich/data/ce10_sparse.bed +11828 -0
- consenrich/data/ce11.sizes +6 -0
- consenrich/data/ce11_blacklist.bed +97 -0
- consenrich/data/ce11_sparse.bed +11828 -0
- consenrich/data/dm6.sizes +7 -0
- consenrich/data/dm6_blacklist.bed +182 -0
- consenrich/data/dm6_sparse.bed +20000 -0
- consenrich/data/hg19.sizes +24 -0
- consenrich/data/hg19_blacklist.bed +834 -0
- consenrich/data/hg19_sparse.bed +288358 -0
- consenrich/data/hg38.sizes +24 -0
- consenrich/data/hg38_blacklist.bed +636 -0
- consenrich/data/hg38_sparse.bed +288699 -0
- consenrich/data/mm10.sizes +21 -0
- consenrich/data/mm10_blacklist.bed +3435 -0
- consenrich/data/mm10_sparse.bed +100400 -0
- consenrich/data/mm39.sizes +21 -0
- consenrich/data/mm39_blacklist.bed +3360 -0
- consenrich/data/mm39_sparse.bed +100381 -0
- consenrich/detrorm.py +239 -0
- consenrich/matching.py +850 -0
- consenrich/misc_util.py +119 -0
- consenrich-0.7.2b2.dist-info/METADATA +65 -0
- consenrich-0.7.2b2.dist-info/RECORD +37 -0
- consenrich-0.7.2b2.dist-info/WHEEL +6 -0
- consenrich-0.7.2b2.dist-info/entry_points.txt +2 -0
- consenrich-0.7.2b2.dist-info/licenses/LICENSE +21 -0
- consenrich-0.7.2b2.dist-info/top_level.txt +1 -0
consenrich/consenrich.py
ADDED
|
@@ -0,0 +1,1165 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import glob
|
|
6
|
+
import logging
|
|
7
|
+
import pprint
|
|
8
|
+
import os
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import List, Optional, Tuple, Dict, Any, Union
|
|
11
|
+
import shutil
|
|
12
|
+
import subprocess
|
|
13
|
+
import sys
|
|
14
|
+
import numpy as np
|
|
15
|
+
import pandas as pd
|
|
16
|
+
import pysam
|
|
17
|
+
import pywt
|
|
18
|
+
import yaml
|
|
19
|
+
|
|
20
|
+
import consenrich.core as core
|
|
21
|
+
import consenrich.misc_util as misc_util
|
|
22
|
+
import consenrich.constants as constants
|
|
23
|
+
import consenrich.detrorm as detrorm
|
|
24
|
+
import consenrich.matching as matching
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
logging.basicConfig(
|
|
28
|
+
level=logging.INFO,
|
|
29
|
+
format="%(asctime)s - %(module)s.%(funcName)s - %(levelname)s - %(message)s",
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
logger = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _listOrEmpty(list_):
|
|
36
|
+
if list_ is None:
|
|
37
|
+
return []
|
|
38
|
+
return list_
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _getMinR(cfg, numBams: int) -> float:
|
|
42
|
+
fallBackMinR: float = 1.0
|
|
43
|
+
try:
|
|
44
|
+
raw = cfg.get("observationParams.minR", None)
|
|
45
|
+
return float(raw) if raw is not None else fallBackMinR
|
|
46
|
+
except (TypeError, ValueError, KeyError):
|
|
47
|
+
logger.warning(
|
|
48
|
+
f"Invalid or missing 'observationParams.minR' in config. Using `{fallBackMinR}`."
|
|
49
|
+
)
|
|
50
|
+
return fallBackMinR
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def checkControlsPresent(inputArgs: core.inputParams) -> bool:
    """Check if control BAM files are present in the input arguments.

    :param inputArgs: core.inputParams object
    :return: True if control BAM files are present, False otherwise.
    """
    # A non-list (including None) can never qualify; a list must be non-empty.
    controls = inputArgs.bamFilesControl
    return isinstance(controls, list) and len(controls) > 0
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def getReadLengths(
    inputArgs: core.inputParams,
    countingArgs: core.countingParams,
    samArgs: core.samParams,
) -> List[int]:
    r"""Get read lengths for each BAM file in the input arguments.

    :param inputArgs: core.inputParams object containing BAM file paths.
    :param countingArgs: core.countingParams object containing number of reads.
    :param samArgs: core.samParams object containing SAM thread and flag exclude parameters.
    :return: List of read lengths for each BAM file.
    :raises ValueError: If no BAM files are provided.
    """
    # Single guard covers both the falsy/None case and the wrong-type case,
    # which were previously validated twice with overlapping conditions.
    if not isinstance(inputArgs.bamFiles, list) or len(inputArgs.bamFiles) == 0:
        raise ValueError(
            "No BAM files provided in the input arguments."
        )

    return [
        core.getReadLength(
            bamFile,
            countingArgs.numReads,
            # 1000: fixed third positional argument to core.getReadLength
            # (its exact meaning is not visible here -- TODO confirm)
            1000,
            samArgs.samThreads,
            samArgs.samFlagExclude,
        )
        for bamFile in inputArgs.bamFiles
    ]
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def checkMatchingEnabled(matchingArgs: core.matchingParams) -> bool:
    """Return True when template matching is configured.

    Matching is considered enabled only when both ``templateNames`` and
    ``cascadeLevels`` are non-empty lists.
    """
    templates = matchingArgs.templateNames
    levels = matchingArgs.cascadeLevels
    # isinstance(None, list) is False, so no separate None checks are needed
    return (
        isinstance(templates, list)
        and len(templates) > 0
        and isinstance(levels, list)
        and len(levels) > 0
    )
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def getEffectiveGenomeSizes(
    genomeArgs: core.genomeParams, readLengths: List[int]
) -> List[int]:
    r"""Get effective genome sizes for the given genome name and read lengths.
    :param genomeArgs: core.genomeParams object
    :param readLengths: List of read lengths for which to get effective genome sizes.
    :return: List of effective genome sizes corresponding to the read lengths.
    """
    name = genomeArgs.genomeName
    if not name or not isinstance(name, str):
        raise ValueError("Genome name must be a non-empty string.")

    if not isinstance(readLengths, list) or not readLengths:
        raise ValueError(
            "Read lengths must be a non-empty list. Try calling `getReadLengths` first."
        )

    # one effective genome size per read length, in input order
    sizes: List[int] = []
    for readLength in readLengths:
        sizes.append(constants.getEffectiveGenomeSize(name, readLength))
    return sizes
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def getInputArgs(config_path: str) -> core.inputParams:
    """Parse the input-file section of a YAML config into ``core.inputParams``.

    Wildcard patterns in BAM entries are expanded with :mod:`glob`. A single
    control BAM is broadcast across all treatment BAMs. Paired-end status is
    auto-detected from the treatment BAMs unless ``inputParams.pairedEnd`` is
    set explicitly in the config.

    :param config_path: Path to the YAML configuration file.
    :return: Populated ``core.inputParams`` object.
    :raises ValueError: If no treatment BAMs are found, or if the number of
        control BAMs is neither 0, 1, nor equal to the number of treatments.
    """

    def _expandWildCards(bamList) -> List[str]:
        # expand glob-style patterns; plain paths pass through unchanged
        expanded: List[str] = []
        for entry in bamList:
            if "*" in entry or "?" in entry or "[" in entry:
                expanded.extend(glob.glob(entry))
            else:
                expanded.append(entry)
        return expanded

    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
    bamFiles = _expandWildCards(config.get("inputParams.bamFiles", []))
    bamFilesControl = _expandWildCards(
        config.get("inputParams.bamFilesControl", [])
    )
    if len(bamFiles) == 0:
        raise ValueError(
            "No BAM files provided in the configuration."
        )
    if (
        len(bamFilesControl) > 0
        and len(bamFilesControl) != len(bamFiles)
        and len(bamFilesControl) != 1
    ):
        raise ValueError(
            "Number of control BAM files must be 0, 1, or the same as number of treatment files"
        )
    if len(bamFilesControl) == 1:
        # If there are multiple bamFiles, but 1 control, control is applied for all treatment files
        logger.info(
            f"Only one control given: Using {bamFilesControl[0]} for all treatment files."
        )
        bamFilesControl = bamFilesControl * len(bamFiles)

    # NOTE: a second re-validation of `bamFiles` previously lived here; it was
    # unreachable (`_expandWildCards` always returns a list and emptiness is
    # already rejected above) and has been removed.

    for bamFile in bamFiles:
        misc_util.checkBamFile(bamFile)

    for bamFile in bamFilesControl:
        misc_util.checkBamFile(bamFile)

    # if we've made it here, we can check pairedEnd
    pairedEndList = misc_util.bamsArePairedEnd(bamFiles)
    _isPairedEnd: Optional[bool] = config.get(
        "inputParams.pairedEnd", None
    )
    if _isPairedEnd is None:
        # only set auto if not provided in config
        _isPairedEnd = all(pairedEndList)
        if _isPairedEnd:
            logger.info("Paired-end BAM files detected")
        else:
            logger.info("One or more single-end BAM files detected")
    return core.inputParams(
        bamFiles=bamFiles,
        bamFilesControl=bamFilesControl,
        pairedEnd=_isPairedEnd,
    )
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def getGenomeArgs(config_path: str) -> core.genomeParams:
    """Build ``core.genomeParams`` from the YAML config.

    Resolves the genome name via ``constants.resolveGenomeName`` and, for a
    recognized genome, fills in the bundled resource files (chrom sizes,
    blacklist, sparse BED). Each of these can be overridden by an explicit
    config entry. The chromosome list comes from the config when given,
    otherwise from the first column of the chrom-sizes file, then is
    filtered by ``genomeParams.excludeChroms``.

    :param config_path: Path to the YAML configuration file.
    :return: Populated ``core.genomeParams`` object.
    :raises FileNotFoundError: If no usable chrom-sizes file exists.
    :raises ValueError: If no chromosomes remain after filtering.
    """
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
    genomeName = config.get("genomeParams.name", None)
    genome = constants.resolveGenomeName(genomeName)
    chromSizesFile: Optional[str] = None
    blacklistFile: Optional[str] = None
    sparseBedFile: Optional[str] = None
    chromosomes: Optional[List[str]] = None
    excludeChroms: List[str] = config.get(
        "genomeParams.excludeChroms", []
    )
    excludeForNorm: List[str] = config.get(
        "genomeParams.excludeForNorm", []
    )
    if genome:
        # recognized genome: default to the resource files shipped with the package
        chromSizesFile = constants.getGenomeResourceFile(
            genome, "sizes"
        )
        blacklistFile = constants.getGenomeResourceFile(
            genome, "blacklist"
        )
        sparseBedFile = constants.getGenomeResourceFile(
            genome, "sparse"
        )
    # explicit config entries override any bundled defaults set above
    if config.get("genomeParams.chromSizesFile", None):
        chromSizesFile = config["genomeParams.chromSizesFile"]
    if config.get("genomeParams.blacklistFile", None):
        blacklistFile = config["genomeParams.blacklistFile"]
    if config.get("genomeParams.sparseBedFile", None):
        sparseBedFile = config["genomeParams.sparseBedFile"]
    if not chromSizesFile or not os.path.exists(chromSizesFile):
        raise FileNotFoundError(
            f"Chromosome sizes file {chromSizesFile} does not exist."
        )
    if config.get("genomeParams.chromosomes", None):
        chromosomes = config["genomeParams.chromosomes"]
    else:
        if chromSizesFile:
            # derive the chromosome list from the first column of the sizes file
            chromosomes = list(
                pd.read_csv(
                    chromSizesFile,
                    sep="\t",
                    header=None,
                    names=["chrom", "size"],
                )["chrom"]
            )
        else:
            raise ValueError(
                "No chromosomes provided in the configuration and no chromosome sizes file specified."
            )
    # drop blank entries and surrounding whitespace
    chromosomes = [
        chrom.strip() for chrom in chromosomes if chrom.strip()
    ]
    if excludeChroms:
        chromosomes = [
            chrom
            for chrom in chromosomes
            if chrom not in excludeChroms
        ]
    if not chromosomes:
        raise ValueError(
            "No valid chromosomes found after excluding specified chromosomes."
        )
    return core.genomeParams(
        genomeName=genome,
        chromSizesFile=chromSizesFile,
        blacklistFile=blacklistFile,
        sparseBedFile=sparseBedFile,
        chromosomes=chromosomes,
        excludeChroms=excludeChroms,
        excludeForNorm=excludeForNorm,
    )
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def getCountingArgs(config_path: str) -> core.countingParams:
    """Parse counting-related options from the YAML config into
    ``core.countingParams``.

    Enforces mutual exclusion of the asinh/log transforms (asinh wins) and
    broadcasts a single control scale factor across all treatments.
    """
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)

    stepSize = config.get("countingParams.stepSize", 25)
    scaleDown = config.get("countingParams.scaleDown", True)
    scaleFactors = config.get("countingParams.scaleFactors", None)
    numReads = config.get("countingParams.numReads", 100)
    scaleFactorsControl = config.get(
        "countingParams.scaleFactorsControl", None
    )
    applyAsinh = config.get("countingParams.applyAsinh", False)
    applyLog = config.get("countingParams.applyLog", False)
    rescaleToTreatmentCoverage = config.get(
        "countingParams.rescaleToTreatmentCoverage", True
    )

    # the two transforms are mutually exclusive; asinh takes precedence
    if applyAsinh and applyLog:
        applyAsinh, applyLog = True, False
        logger.warning(
            "Both `applyAsinh` and `applyLog` are set. Overriding `applyLog` to False."
        )

    if scaleFactors is not None and not isinstance(scaleFactors, list):
        raise ValueError("`scaleFactors` should be a list of floats.")
    if scaleFactorsControl is not None and not isinstance(
        scaleFactorsControl, list
    ):
        raise ValueError(
            "`scaleFactorsControl` should be a list of floats."
        )

    # broadcast a single control factor; otherwise lengths must agree
    if (
        scaleFactors is not None
        and scaleFactorsControl is not None
        and len(scaleFactors) != len(scaleFactorsControl)
    ):
        if len(scaleFactorsControl) != 1:
            raise ValueError(
                "control and treatment scale factors: must be equal length or 1 control"
            )
        scaleFactorsControl = scaleFactorsControl * len(scaleFactors)

    return core.countingParams(
        stepSize=stepSize,
        scaleDown=scaleDown,
        scaleFactors=scaleFactors,
        scaleFactorsControl=scaleFactorsControl,
        numReads=numReads,
        applyAsinh=applyAsinh,
        applyLog=applyLog,
        rescaleToTreatmentCoverage=rescaleToTreatmentCoverage,
    )
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def readConfig(config_path: str) -> Dict[str, Any]:
    """Read the full YAML config and assemble every parameter group.

    Delegates input/genome/counting parsing to ``getInputArgs``,
    ``getGenomeArgs`` and ``getCountingArgs``, derives data-dependent
    defaults (``minR``/``minQ``, detrending aggressiveness based on whether
    control BAMs are present), and returns a dict keyed by argument-group
    name, with each value a populated ``core.*Params`` object.

    :param config_path: Path to the YAML configuration file.
    :return: Dict with keys ``experimentName``, ``genomeArgs``,
        ``inputArgs``, ``countingArgs``, ``processArgs``,
        ``observationArgs``, ``stateArgs``, ``samArgs``, ``detrendArgs``,
        and ``matchingArgs``.
    """
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)

    inputParams = getInputArgs(config_path)
    genomeParams = getGenomeArgs(config_path)
    countingParams = getCountingArgs(config_path)
    minR_default = _getMinR(config, len(inputParams.bamFiles))
    # default minQ scales inversely with sample count
    minQ_default = (
        minR_default / (len(inputParams.bamFiles))
    ) + 0.10  # protect condition number

    # matching excludes the blacklist regions by default
    matchingExcludeRegionsBedFile_default: Optional[str] = (
        genomeParams.blacklistFile
    )

    # apply less aggressive *default* detrending/background removal
    # ...IF input controls are present. In either case, respect
    # ...user-specified params
    detrendWindowLengthBP_: int = -1
    detrendSavitzkyGolayDegree_: int = -1

    if (
        inputParams.bamFilesControl is not None
        and len(inputParams.bamFilesControl) > 0
    ):
        detrendWindowLengthBP_ = config.get(
            "detrendParams.detrendWindowLengthBP",
            25_000,
        )
        detrendSavitzkyGolayDegree_ = config.get(
            "detrendParams.detrendSavitzkyGolayDegree",
            1,
        )
    else:
        detrendWindowLengthBP_ = config.get(
            "detrendParams.detrendWindowLengthBP",
            10_000,
        )
        detrendSavitzkyGolayDegree_ = config.get(
            "detrendParams.detrendSavitzkyGolayDegree",
            2,
        )

    return {
        "experimentName": config.get(
            "experimentName", "consenrichExperiment"
        ),
        "genomeArgs": genomeParams,
        "inputArgs": inputParams,
        "countingArgs": countingParams,
        "processArgs": core.processParams(
            deltaF=config.get("processParams.deltaF", 0.5),
            minQ=config.get("processParams.minQ", minQ_default),
            maxQ=config.get("processParams.maxQ", 500.0),
            offDiagQ=config.get("processParams.offDiagQ", 0.0),
            dStatAlpha=config.get("processParams.dStatAlpha", 3.0),
            dStatd=config.get("processParams.dStatd", 10.0),
            dStatPC=config.get("processParams.dStatPC", 1.0),
            scaleResidualsByP11=config.get(
                "processParams.scaleResidualsByP11", False
            ),
        ),
        "observationArgs": core.observationParams(
            minR=minR_default,
            maxR=config.get("observationParams.maxR", 500.0),
            useALV=config.get("observationParams.useALV", False),
            useConstantNoiseLevel=config.get(
                "observationParams.useConstantNoiseLevel", False
            ),
            noGlobal=config.get("observationParams.noGlobal", False),
            numNearest=config.get("observationParams.numNearest", 25),
            localWeight=config.get(
                "observationParams.localWeight",
                0.333,
            ),
            globalWeight=config.get(
                "observationParams.globalWeight",
                0.667,
            ),
            approximationWindowLengthBP=config.get(
                "observationParams.approximationWindowLengthBP",
                10000,
            ),
            lowPassWindowLengthBP=config.get(
                "observationParams.lowPassWindowLengthBP",
                20000,
            ),
            lowPassFilterType=config.get(
                "observationParams.lowPassFilterType",
                "median",
            ),
            returnCenter=config.get(
                "observationParams.returnCenter",
                True,
            ),
        ),
        "stateArgs": core.stateParams(
            stateInit=config.get("stateParams.stateInit", 0.0),
            stateCovarInit=config.get(
                "stateParams.stateCovarInit",
                100.0,
            ),
            boundState=config.get("stateParams.boundState", True),
            stateLowerBound=config.get(
                "stateParams.stateLowerBound",
                0.0,
            ),
            stateUpperBound=config.get(
                "stateParams.stateUpperBound",
                10000.0,
            ),
        ),
        "samArgs": core.samParams(
            samThreads=config.get("samParams.samThreads", 1),
            # 3844: excludes unmapped/secondary/QC-fail/duplicate/supplementary
            samFlagExclude=config.get(
                "samParams.samFlagExclude", 3844
            ),
            oneReadPerBin=config.get("samParams.oneReadPerBin", 0),
            chunkSize=config.get("samParams.chunkSize", 1000000),
            offsetStr=config.get("samParams.offsetStr", "0,0"),
            extendBP=config.get("samParams.extendBP", []),
            maxInsertSize=config.get("samParams.maxInsertSize", 1000),
            # paired-end mode defaults on when the inputs are paired-end
            pairedEndMode=config.get(
                "samParams.pairedEndMode",
                1
                if inputParams.pairedEnd is not None
                and int(inputParams.pairedEnd) > 0
                else 0,
            ),
            # fragment-length inference defaults on only for single-end inputs
            inferFragmentLength=config.get(
                "samParams.inferFragmentLength",
                1
                if inputParams.pairedEnd is not None
                and int(inputParams.pairedEnd) == 0
                else 0,
            ),
            countEndsOnly=config.get(
                "samParams.countEndsOnly",
                False,
            ),
        ),
        "detrendArgs": core.detrendParams(
            detrendWindowLengthBP=detrendWindowLengthBP_,
            detrendTrackPercentile=config.get(
                "detrendParams.detrendTrackPercentile",
                75,
            ),
            usePolyFilter=config.get(
                "detrendParams.usePolyFilter",
                False,
            ),
            detrendSavitzkyGolayDegree=config.get(
                "detrendParams.detrendSavitzkyGolayDegree",
                detrendSavitzkyGolayDegree_,
            ),
            useOrderStatFilter=config.get(
                "detrendParams.useOrderStatFilter",
                True,
            ),
        ),
        "matchingArgs": core.matchingParams(
            templateNames=config.get(
                "matchingParams.templateNames",
                [],
            ),
            cascadeLevels=config.get(
                "matchingParams.cascadeLevels",
                [],
            ),
            iters=config.get("matchingParams.iters", 25_000),
            alpha=config.get("matchingParams.alpha", 0.05),
            minMatchLengthBP=config.get(
                "matchingParams.minMatchLengthBP", 250
            ),
            maxNumMatches=config.get(
                "matchingParams.maxNumMatches", 100_000
            ),
            minSignalAtMaxima=config.get(
                "matchingParams.minSignalAtMaxima", "q:0.75"
            ),
            merge=config.get("matchingParams.merge", True),
            mergeGapBP=config.get("matchingParams.mergeGapBP", None),
            useScalingFunction=config.get(
                "matchingParams.useScalingFunction", True
            ),
            excludeRegionsBedFile=config.get(
                "matchingParams.excludeRegionsBedFile",
                matchingExcludeRegionsBedFile_default,
            ),
            randSeed=config.get("matchingParams.randSeed", 42),
            penalizeBy=config.get("matchingParams.penalizeBy", None),
        ),
    }
|
|
532
|
+
|
|
533
|
+
|
|
534
|
+
def convertBedGraphToBigWig(experimentName, chromSizesFile):
    """Convert Consenrich bedGraph outputs to bigWig when the UCSC
    ``bedGraphToBigWig`` utility is available on PATH.

    For each of the 'state' and 'residuals' tracks, looks for
    ``consenrichOutput_{experimentName}_{suffix}.bedGraph`` and writes
    ``{experimentName}_consenrich_{suffix}.bw``. All failures are logged as
    warnings; this function never raises.

    :param experimentName: Experiment name embedded in input/output filenames.
    :param chromSizesFile: Chromosome-sizes file required by ``bedGraphToBigWig``.
    """
    suffixes = ["state", "residuals"]
    warningMessage = (
        "Could not find UCSC bedGraphToBigWig binary utility."
        "If you need bigWig files instead of the default, human-readable bedGraph files,"
        "you can download the `bedGraphToBigWig` binary from https://hgdownload.soe.ucsc.edu/admin/exe/<operatingSystem, architecture>"
        "OR install via conda (conda install -c bioconda ucsc-bedgraphtobigwig)."
    )

    logger.info(
        "Attempting to generate bigWig files from bedGraph format..."
    )
    # shutil.which returns None when the binary is absent; it does not raise,
    # so the previous try/except around this call was dead code.
    path_ = shutil.which("bedGraphToBigWig")
    if not path_:
        logger.warning(f"\n{warningMessage}\n")
        return
    logger.info(f"Using bedGraphToBigWig from {path_}")
    for suffix in suffixes:
        bedgraph = (
            f"consenrichOutput_{experimentName}_{suffix}.bedGraph"
        )
        if not os.path.exists(bedgraph):
            logger.warning(
                f"bedGraph file {bedgraph} does not exist. Skipping bigWig conversion."
            )
            continue
        if not os.path.exists(chromSizesFile):
            # chrom sizes are required for every conversion: abort entirely
            logger.warning(
                f"{chromSizesFile} does not exist. Skipping bigWig conversion."
            )
            return
        bigwig = f"{experimentName}_consenrich_{suffix}.bw"
        logger.info(f"Start: {bedgraph} --> {bigwig}...")
        try:
            subprocess.run(
                [path_, bedgraph, chromSizesFile, bigwig], check=True
            )
        except Exception as e:
            # best-effort: log the failed command and move to the next track
            logger.warning(
                f"bedGraph-->bigWig conversion with\n\n\t`bedGraphToBigWig {bedgraph} {chromSizesFile} {bigwig}`\nraised: \n{e}\n\n"
            )
            continue
        # a minimal-size check guards against empty/corrupt outputs
        if os.path.exists(bigwig) and os.path.getsize(bigwig) > 100:
            logger.info(
                f"Finished: converted {bedgraph} to {bigwig}."
            )
|
|
585
|
+
|
|
586
|
+
|
|
587
|
+
def main():
|
|
588
|
+
parser = argparse.ArgumentParser(description="Consenrich CLI")
|
|
589
|
+
parser.add_argument(
|
|
590
|
+
"--config",
|
|
591
|
+
type=str,
|
|
592
|
+
dest="config",
|
|
593
|
+
help="Path to a YAML config file with parameters + arguments defined in `consenrich.core`",
|
|
594
|
+
)
|
|
595
|
+
|
|
596
|
+
# --- Matching-specific command-line arguments ---
|
|
597
|
+
parser.add_argument(
|
|
598
|
+
"--match-bedGraph",
|
|
599
|
+
type=str,
|
|
600
|
+
dest="matchBedGraph",
|
|
601
|
+
help="Path to a bedGraph file of Consenrich estimates to match templates against.\
|
|
602
|
+
If provided, *only* the matching algorithm is run (no other processing).",
|
|
603
|
+
)
|
|
604
|
+
parser.add_argument(
|
|
605
|
+
"--match-template",
|
|
606
|
+
type=str,
|
|
607
|
+
default="haar",
|
|
608
|
+
choices=[
|
|
609
|
+
x
|
|
610
|
+
for x in pywt.wavelist(kind="discrete")
|
|
611
|
+
if "bio" not in x
|
|
612
|
+
],
|
|
613
|
+
dest="matchTemplate",
|
|
614
|
+
)
|
|
615
|
+
parser.add_argument(
|
|
616
|
+
"--match-level", type=int, default=2, dest="matchLevel"
|
|
617
|
+
)
|
|
618
|
+
parser.add_argument(
|
|
619
|
+
"--match-alpha", type=float, default=0.05, dest="matchAlpha"
|
|
620
|
+
)
|
|
621
|
+
parser.add_argument(
|
|
622
|
+
"--match-min-length",
|
|
623
|
+
type=int,
|
|
624
|
+
default=250,
|
|
625
|
+
dest="matchMinMatchLengthBP",
|
|
626
|
+
)
|
|
627
|
+
parser.add_argument(
|
|
628
|
+
"--match-iters", type=int, default=25000, dest="matchIters"
|
|
629
|
+
)
|
|
630
|
+
parser.add_argument(
|
|
631
|
+
"--match-min-signal",
|
|
632
|
+
type=str,
|
|
633
|
+
default="q:0.75",
|
|
634
|
+
dest="matchMinSignalAtMaxima",
|
|
635
|
+
)
|
|
636
|
+
parser.add_argument(
|
|
637
|
+
"--match-max-matches",
|
|
638
|
+
type=int,
|
|
639
|
+
default=100000,
|
|
640
|
+
dest="matchMaxNumMatches",
|
|
641
|
+
)
|
|
642
|
+
parser.add_argument(
|
|
643
|
+
"--match-no-merge", action="store_true", dest="matchNoMerge"
|
|
644
|
+
)
|
|
645
|
+
parser.add_argument(
|
|
646
|
+
"--match-merge-gap",
|
|
647
|
+
type=int,
|
|
648
|
+
default=None,
|
|
649
|
+
dest="matchMergeGapBP",
|
|
650
|
+
)
|
|
651
|
+
parser.add_argument(
|
|
652
|
+
"--match-use-wavelet",
|
|
653
|
+
action="store_true",
|
|
654
|
+
dest="matchUseWavelet",
|
|
655
|
+
)
|
|
656
|
+
parser.add_argument(
|
|
657
|
+
"--match-seed", type=int, default=42, dest="matchRandSeed"
|
|
658
|
+
)
|
|
659
|
+
parser.add_argument(
|
|
660
|
+
"--match-exclude-bed",
|
|
661
|
+
type=str,
|
|
662
|
+
default=None,
|
|
663
|
+
dest="matchExcludeBed",
|
|
664
|
+
)
|
|
665
|
+
parser.add_argument(
|
|
666
|
+
"--verbose", action="store_true", help="If set, logs config"
|
|
667
|
+
)
|
|
668
|
+
args = parser.parse_args()
|
|
669
|
+
|
|
670
|
+
if args.matchBedGraph:
|
|
671
|
+
if not os.path.exists(args.matchBedGraph):
|
|
672
|
+
raise FileNotFoundError(
|
|
673
|
+
f"bedGraph file {args.matchBedGraph} couldn't be found."
|
|
674
|
+
)
|
|
675
|
+
logger.info(
|
|
676
|
+
f"Running matching algorithm using bedGraph file {args.matchBedGraph}..."
|
|
677
|
+
)
|
|
678
|
+
|
|
679
|
+
outName = matching.matchExistingBedGraph(
|
|
680
|
+
args.matchBedGraph,
|
|
681
|
+
args.matchTemplate,
|
|
682
|
+
args.matchLevel,
|
|
683
|
+
alpha=args.matchAlpha,
|
|
684
|
+
minMatchLengthBP=args.matchMinMatchLengthBP,
|
|
685
|
+
iters=args.matchIters,
|
|
686
|
+
minSignalAtMaxima=args.matchMinSignalAtMaxima,
|
|
687
|
+
maxNumMatches=args.matchMaxNumMatches,
|
|
688
|
+
useScalingFunction=(not args.matchUseWavelet),
|
|
689
|
+
merge=(not args.matchNoMerge),
|
|
690
|
+
mergeGapBP=args.matchMergeGapBP,
|
|
691
|
+
excludeRegionsBedFile=args.matchExcludeBed,
|
|
692
|
+
randSeed=args.matchRandSeed,
|
|
693
|
+
)
|
|
694
|
+
logger.info(f"Finished matching. Written to {outName}")
|
|
695
|
+
sys.exit(0)
|
|
696
|
+
|
|
697
|
+
if args.matchBedGraph:
|
|
698
|
+
# this shouldn't happen, but just in case -- matching on previous bedGraph means no other processing
|
|
699
|
+
logger.info(
|
|
700
|
+
"If `--match-bedgraph <path_to_bedgraph>` is provided, only the matching algorithm is run."
|
|
701
|
+
)
|
|
702
|
+
sys.exit(0)
|
|
703
|
+
|
|
704
|
+
if not args.config:
|
|
705
|
+
logger.info(
|
|
706
|
+
"No config file provided, run with `--config <path_to_config.yaml>`"
|
|
707
|
+
)
|
|
708
|
+
logger.info(
|
|
709
|
+
"See documentation: https://nolan-h-hamilton.github.io/Consenrich/"
|
|
710
|
+
)
|
|
711
|
+
sys.exit(1)
|
|
712
|
+
|
|
713
|
+
if not os.path.exists(args.config):
|
|
714
|
+
logger.info(f"Config file {args.config} does not exist.")
|
|
715
|
+
logger.info(
|
|
716
|
+
"See documentation: https://nolan-h-hamilton.github.io/Consenrich/"
|
|
717
|
+
)
|
|
718
|
+
sys.exit(1)
|
|
719
|
+
|
|
720
|
+
config = readConfig(args.config)
|
|
721
|
+
experimentName = config["experimentName"]
|
|
722
|
+
genomeArgs = config["genomeArgs"]
|
|
723
|
+
inputArgs = config["inputArgs"]
|
|
724
|
+
countingArgs = config["countingArgs"]
|
|
725
|
+
processArgs = config["processArgs"]
|
|
726
|
+
observationArgs = config["observationArgs"]
|
|
727
|
+
stateArgs = config["stateArgs"]
|
|
728
|
+
samArgs = config["samArgs"]
|
|
729
|
+
detrendArgs = config["detrendArgs"]
|
|
730
|
+
matchingArgs = config["matchingArgs"]
|
|
731
|
+
bamFiles = inputArgs.bamFiles
|
|
732
|
+
bamFilesControl = inputArgs.bamFilesControl
|
|
733
|
+
numSamples = len(bamFiles)
|
|
734
|
+
numNearest = observationArgs.numNearest
|
|
735
|
+
stepSize = countingArgs.stepSize
|
|
736
|
+
excludeForNorm = genomeArgs.excludeForNorm
|
|
737
|
+
chromSizes = genomeArgs.chromSizesFile
|
|
738
|
+
scaleDown = countingArgs.scaleDown
|
|
739
|
+
extendBP_ = core.resolveExtendBP(samArgs.extendBP, bamFiles)
|
|
740
|
+
initialTreatmentScaleFactors = []
|
|
741
|
+
minMatchLengthBP_: Optional[int] = matchingArgs.minMatchLengthBP
|
|
742
|
+
mergeGapBP_: Optional[int] = matchingArgs.mergeGapBP
|
|
743
|
+
|
|
744
|
+
if args.verbose:
|
|
745
|
+
try:
|
|
746
|
+
logger.info("Configuration:\n")
|
|
747
|
+
config_truncated = {
|
|
748
|
+
k: v
|
|
749
|
+
for k, v in config.items()
|
|
750
|
+
if k
|
|
751
|
+
not in ["inputArgs", "genomeArgs", "countingArgs"]
|
|
752
|
+
}
|
|
753
|
+
config_truncated["experimentName"] = experimentName
|
|
754
|
+
config_truncated["inputArgs"] = inputArgs
|
|
755
|
+
config_truncated["genomeArgs"] = genomeArgs
|
|
756
|
+
config_truncated["countingArgs"] = countingArgs
|
|
757
|
+
config_truncated["processArgs"] = processArgs
|
|
758
|
+
config_truncated["observationArgs"] = observationArgs
|
|
759
|
+
config_truncated["stateArgs"] = stateArgs
|
|
760
|
+
config_truncated["samArgs"] = samArgs
|
|
761
|
+
config_truncated["detrendArgs"] = detrendArgs
|
|
762
|
+
pprint.pprint(config_truncated, indent=4)
|
|
763
|
+
except Exception as e:
|
|
764
|
+
logger.warning(f"Failed to print parsed config:\n{e}\n")
|
|
765
|
+
|
|
766
|
+
controlsPresent = checkControlsPresent(inputArgs)
|
|
767
|
+
if args.verbose:
|
|
768
|
+
logger.info(f"controlsPresent: {controlsPresent}")
|
|
769
|
+
readLengthsBamFiles = getReadLengths(
|
|
770
|
+
inputArgs, countingArgs, samArgs
|
|
771
|
+
)
|
|
772
|
+
effectiveGenomeSizes = getEffectiveGenomeSizes(
|
|
773
|
+
genomeArgs, readLengthsBamFiles
|
|
774
|
+
)
|
|
775
|
+
matchingEnabled = checkMatchingEnabled(matchingArgs)
|
|
776
|
+
if args.verbose:
|
|
777
|
+
logger.info(f"matchingEnabled: {matchingEnabled}")
|
|
778
|
+
scaleFactors = countingArgs.scaleFactors
|
|
779
|
+
scaleFactorsControl = countingArgs.scaleFactorsControl
|
|
780
|
+
|
|
781
|
+
if controlsPresent:
|
|
782
|
+
readLengthsControlBamFiles = [
|
|
783
|
+
core.getReadLength(
|
|
784
|
+
bamFile,
|
|
785
|
+
countingArgs.numReads,
|
|
786
|
+
1000,
|
|
787
|
+
samArgs.samThreads,
|
|
788
|
+
samArgs.samFlagExclude,
|
|
789
|
+
)
|
|
790
|
+
for bamFile in bamFilesControl
|
|
791
|
+
]
|
|
792
|
+
effectiveGenomeSizesControl = [
|
|
793
|
+
constants.getEffectiveGenomeSize(
|
|
794
|
+
genomeArgs.genomeName, readLength
|
|
795
|
+
)
|
|
796
|
+
for readLength in readLengthsControlBamFiles
|
|
797
|
+
]
|
|
798
|
+
|
|
799
|
+
if (
|
|
800
|
+
scaleFactors is not None
|
|
801
|
+
and scaleFactorsControl is not None
|
|
802
|
+
):
|
|
803
|
+
treatScaleFactors = scaleFactors
|
|
804
|
+
controlScaleFactors = scaleFactorsControl
|
|
805
|
+
# still make sure this is accessible
|
|
806
|
+
initialTreatmentScaleFactors = [1.0] * len(bamFiles)
|
|
807
|
+
else:
|
|
808
|
+
try:
|
|
809
|
+
initialTreatmentScaleFactors = [
|
|
810
|
+
detrorm.getScaleFactor1x(
|
|
811
|
+
bamFile,
|
|
812
|
+
effectiveGenomeSize,
|
|
813
|
+
readLength,
|
|
814
|
+
genomeArgs.excludeChroms,
|
|
815
|
+
genomeArgs.chromSizesFile,
|
|
816
|
+
samArgs.samThreads,
|
|
817
|
+
)
|
|
818
|
+
for bamFile, effectiveGenomeSize, readLength in zip(
|
|
819
|
+
bamFiles,
|
|
820
|
+
effectiveGenomeSizes,
|
|
821
|
+
readLengthsBamFiles,
|
|
822
|
+
)
|
|
823
|
+
]
|
|
824
|
+
except Exception:
|
|
825
|
+
initialTreatmentScaleFactors = [1.0] * len(bamFiles)
|
|
826
|
+
|
|
827
|
+
pairScalingFactors = [
|
|
828
|
+
detrorm.getPairScaleFactors(
|
|
829
|
+
bamFileA,
|
|
830
|
+
bamFileB,
|
|
831
|
+
effectiveGenomeSizeA,
|
|
832
|
+
effectiveGenomeSizeB,
|
|
833
|
+
readLengthA,
|
|
834
|
+
readLengthB,
|
|
835
|
+
excludeForNorm,
|
|
836
|
+
chromSizes,
|
|
837
|
+
samArgs.samThreads,
|
|
838
|
+
scaleDown,
|
|
839
|
+
)
|
|
840
|
+
for bamFileA, bamFileB, effectiveGenomeSizeA, effectiveGenomeSizeB, readLengthA, readLengthB in zip(
|
|
841
|
+
bamFiles,
|
|
842
|
+
bamFilesControl,
|
|
843
|
+
effectiveGenomeSizes,
|
|
844
|
+
effectiveGenomeSizesControl,
|
|
845
|
+
readLengthsBamFiles,
|
|
846
|
+
readLengthsControlBamFiles,
|
|
847
|
+
)
|
|
848
|
+
]
|
|
849
|
+
|
|
850
|
+
treatScaleFactors = []
|
|
851
|
+
controlScaleFactors = []
|
|
852
|
+
for scaleFactorA, scaleFactorB in pairScalingFactors:
|
|
853
|
+
treatScaleFactors.append(scaleFactorA)
|
|
854
|
+
controlScaleFactors.append(scaleFactorB)
|
|
855
|
+
|
|
856
|
+
else:
|
|
857
|
+
treatScaleFactors = scaleFactors
|
|
858
|
+
controlScaleFactors = scaleFactorsControl
|
|
859
|
+
|
|
860
|
+
if scaleFactors is None and not controlsPresent:
|
|
861
|
+
scaleFactors = [
|
|
862
|
+
detrorm.getScaleFactor1x(
|
|
863
|
+
bamFile,
|
|
864
|
+
effectiveGenomeSize,
|
|
865
|
+
readLength,
|
|
866
|
+
genomeArgs.excludeChroms,
|
|
867
|
+
genomeArgs.chromSizesFile,
|
|
868
|
+
samArgs.samThreads,
|
|
869
|
+
)
|
|
870
|
+
for bamFile, effectiveGenomeSize, readLength in zip(
|
|
871
|
+
bamFiles, effectiveGenomeSizes, readLengthsBamFiles
|
|
872
|
+
)
|
|
873
|
+
]
|
|
874
|
+
chromSizesDict = misc_util.getChromSizesDict(
|
|
875
|
+
genomeArgs.chromSizesFile,
|
|
876
|
+
excludeChroms=genomeArgs.excludeChroms,
|
|
877
|
+
)
|
|
878
|
+
chromosomes = genomeArgs.chromosomes
|
|
879
|
+
|
|
880
|
+
for c_, chromosome in enumerate(chromosomes):
|
|
881
|
+
chromosomeStart, chromosomeEnd = core.getChromRangesJoint(
|
|
882
|
+
bamFiles,
|
|
883
|
+
chromosome,
|
|
884
|
+
chromSizesDict[chromosome],
|
|
885
|
+
samArgs.samThreads,
|
|
886
|
+
samArgs.samFlagExclude,
|
|
887
|
+
)
|
|
888
|
+
chromosomeStart = max(
|
|
889
|
+
0, (chromosomeStart - (chromosomeStart % stepSize))
|
|
890
|
+
)
|
|
891
|
+
chromosomeEnd = max(
|
|
892
|
+
0, (chromosomeEnd - (chromosomeEnd % stepSize))
|
|
893
|
+
)
|
|
894
|
+
numIntervals = (
|
|
895
|
+
((chromosomeEnd - chromosomeStart) + stepSize) - 1
|
|
896
|
+
) // stepSize
|
|
897
|
+
intervals = np.arange(
|
|
898
|
+
chromosomeStart, chromosomeEnd, stepSize
|
|
899
|
+
)
|
|
900
|
+
chromMat: np.ndarray = np.empty(
|
|
901
|
+
(numSamples, numIntervals), dtype=np.float32
|
|
902
|
+
)
|
|
903
|
+
if controlsPresent:
|
|
904
|
+
j_: int = 0
|
|
905
|
+
finalSF = 1.0
|
|
906
|
+
for bamA, bamB in zip(bamFiles, bamFilesControl):
|
|
907
|
+
logger.info(
|
|
908
|
+
f"Counting (trt,ctrl) for {chromosome}: ({bamA}, {bamB})"
|
|
909
|
+
)
|
|
910
|
+
pairMatrix: np.ndarray = core.readBamSegments(
|
|
911
|
+
[bamA, bamB],
|
|
912
|
+
chromosome,
|
|
913
|
+
chromosomeStart,
|
|
914
|
+
chromosomeEnd,
|
|
915
|
+
stepSize,
|
|
916
|
+
[
|
|
917
|
+
readLengthsBamFiles[j_],
|
|
918
|
+
readLengthsControlBamFiles[j_],
|
|
919
|
+
],
|
|
920
|
+
[treatScaleFactors[j_], controlScaleFactors[j_]],
|
|
921
|
+
samArgs.oneReadPerBin,
|
|
922
|
+
samArgs.samThreads,
|
|
923
|
+
samArgs.samFlagExclude,
|
|
924
|
+
offsetStr=samArgs.offsetStr,
|
|
925
|
+
extendBP=extendBP_[j_],
|
|
926
|
+
maxInsertSize=samArgs.maxInsertSize,
|
|
927
|
+
pairedEndMode=samArgs.pairedEndMode,
|
|
928
|
+
inferFragmentLength=samArgs.inferFragmentLength,
|
|
929
|
+
applyAsinh=countingArgs.applyAsinh,
|
|
930
|
+
applyLog=countingArgs.applyLog,
|
|
931
|
+
countEndsOnly=samArgs.countEndsOnly,
|
|
932
|
+
)
|
|
933
|
+
if countingArgs.rescaleToTreatmentCoverage:
|
|
934
|
+
finalSF = max(
|
|
935
|
+
1.0, initialTreatmentScaleFactors[j_]
|
|
936
|
+
)
|
|
937
|
+
chromMat[j_, :] = finalSF * (
|
|
938
|
+
pairMatrix[0, :] - pairMatrix[1, :]
|
|
939
|
+
)
|
|
940
|
+
j_ += 1
|
|
941
|
+
else:
|
|
942
|
+
chromMat = core.readBamSegments(
|
|
943
|
+
bamFiles,
|
|
944
|
+
chromosome,
|
|
945
|
+
chromosomeStart,
|
|
946
|
+
chromosomeEnd,
|
|
947
|
+
stepSize,
|
|
948
|
+
readLengthsBamFiles,
|
|
949
|
+
scaleFactors,
|
|
950
|
+
samArgs.oneReadPerBin,
|
|
951
|
+
samArgs.samThreads,
|
|
952
|
+
samArgs.samFlagExclude,
|
|
953
|
+
offsetStr=samArgs.offsetStr,
|
|
954
|
+
extendBP=extendBP_,
|
|
955
|
+
maxInsertSize=samArgs.maxInsertSize,
|
|
956
|
+
pairedEndMode=samArgs.pairedEndMode,
|
|
957
|
+
inferFragmentLength=samArgs.inferFragmentLength,
|
|
958
|
+
applyAsinh=countingArgs.applyAsinh,
|
|
959
|
+
applyLog=countingArgs.applyLog,
|
|
960
|
+
countEndsOnly=samArgs.countEndsOnly,
|
|
961
|
+
)
|
|
962
|
+
sparseMap = None
|
|
963
|
+
if genomeArgs.sparseBedFile and not observationArgs.useALV:
|
|
964
|
+
logger.info(
|
|
965
|
+
f"Building sparse mapping for {chromosome}..."
|
|
966
|
+
)
|
|
967
|
+
sparseMap = core.getSparseMap(
|
|
968
|
+
chromosome,
|
|
969
|
+
intervals,
|
|
970
|
+
numNearest,
|
|
971
|
+
genomeArgs.sparseBedFile,
|
|
972
|
+
)
|
|
973
|
+
|
|
974
|
+
muncMat = np.empty_like(chromMat, dtype=np.float32)
|
|
975
|
+
for j in range(numSamples):
|
|
976
|
+
logger.info(
|
|
977
|
+
f"Muncing {j + 1}/{numSamples} for {chromosome}..."
|
|
978
|
+
)
|
|
979
|
+
muncMat[j, :] = core.getMuncTrack(
|
|
980
|
+
chromosome,
|
|
981
|
+
intervals,
|
|
982
|
+
stepSize,
|
|
983
|
+
chromMat[j, :],
|
|
984
|
+
observationArgs.minR,
|
|
985
|
+
observationArgs.maxR,
|
|
986
|
+
observationArgs.useALV,
|
|
987
|
+
observationArgs.useConstantNoiseLevel,
|
|
988
|
+
observationArgs.noGlobal,
|
|
989
|
+
observationArgs.localWeight,
|
|
990
|
+
observationArgs.globalWeight,
|
|
991
|
+
observationArgs.approximationWindowLengthBP,
|
|
992
|
+
observationArgs.lowPassWindowLengthBP,
|
|
993
|
+
observationArgs.returnCenter,
|
|
994
|
+
sparseMap=sparseMap,
|
|
995
|
+
lowPassFilterType=observationArgs.lowPassFilterType,
|
|
996
|
+
)
|
|
997
|
+
chromMat[j, :] = detrorm.detrendTrack(
|
|
998
|
+
chromMat[j, :],
|
|
999
|
+
stepSize,
|
|
1000
|
+
detrendArgs.detrendWindowLengthBP,
|
|
1001
|
+
detrendArgs.useOrderStatFilter,
|
|
1002
|
+
detrendArgs.usePolyFilter,
|
|
1003
|
+
detrendArgs.detrendTrackPercentile,
|
|
1004
|
+
detrendArgs.detrendSavitzkyGolayDegree,
|
|
1005
|
+
)
|
|
1006
|
+
logger.info(f">>>Running consenrich: {chromosome}<<<")
|
|
1007
|
+
|
|
1008
|
+
x, P, y = core.runConsenrich(
|
|
1009
|
+
chromMat,
|
|
1010
|
+
muncMat,
|
|
1011
|
+
processArgs.deltaF,
|
|
1012
|
+
processArgs.minQ,
|
|
1013
|
+
processArgs.maxQ,
|
|
1014
|
+
processArgs.offDiagQ,
|
|
1015
|
+
processArgs.dStatAlpha,
|
|
1016
|
+
processArgs.dStatd,
|
|
1017
|
+
processArgs.dStatPC,
|
|
1018
|
+
stateArgs.stateInit,
|
|
1019
|
+
stateArgs.stateCovarInit,
|
|
1020
|
+
stateArgs.boundState,
|
|
1021
|
+
stateArgs.stateLowerBound,
|
|
1022
|
+
stateArgs.stateUpperBound,
|
|
1023
|
+
samArgs.chunkSize,
|
|
1024
|
+
progressIter=50_000,
|
|
1025
|
+
)
|
|
1026
|
+
logger.info("Done.")
|
|
1027
|
+
|
|
1028
|
+
x_ = core.getPrimaryState(x)
|
|
1029
|
+
y_ = core.getPrecisionWeightedResidual(
|
|
1030
|
+
y,
|
|
1031
|
+
muncMat,
|
|
1032
|
+
stateCovarSmoothed=P
|
|
1033
|
+
if processArgs.scaleResidualsByP11 is not None
|
|
1034
|
+
and processArgs.scaleResidualsByP11
|
|
1035
|
+
else None,
|
|
1036
|
+
)
|
|
1037
|
+
weights_: Optional[np.ndarray] = None
|
|
1038
|
+
if matchingArgs.penalizeBy is not None:
|
|
1039
|
+
if matchingArgs.penalizeBy == "absResiduals":
|
|
1040
|
+
try:
|
|
1041
|
+
weights_ = np.abs(y_)
|
|
1042
|
+
except Exception as e:
|
|
1043
|
+
logger.warning(
|
|
1044
|
+
f"Error computing weights for 'absResiduals': {e}. No weights applied for matching."
|
|
1045
|
+
)
|
|
1046
|
+
weights_ = None
|
|
1047
|
+
elif matchingArgs.penalizeBy == "stateUncertainty":
|
|
1048
|
+
try:
|
|
1049
|
+
weights_ = np.sqrt(P[:, 0, 0])
|
|
1050
|
+
except Exception as e:
|
|
1051
|
+
logger.warning(
|
|
1052
|
+
f"Error computing weights for 'stateUncertainty': {e}. No weights applied for matching."
|
|
1053
|
+
)
|
|
1054
|
+
weights_ = None
|
|
1055
|
+
else:
|
|
1056
|
+
logger.warning(
|
|
1057
|
+
f"Unrecognized `matchingParams.penalizeBy`: {matchingArgs.penalizeBy}. No weights applied."
|
|
1058
|
+
)
|
|
1059
|
+
weights_ = None
|
|
1060
|
+
|
|
1061
|
+
|
|
1062
|
+
df = pd.DataFrame(
|
|
1063
|
+
{
|
|
1064
|
+
"Chromosome": chromosome,
|
|
1065
|
+
"Start": intervals,
|
|
1066
|
+
"End": intervals + stepSize,
|
|
1067
|
+
"State": x_,
|
|
1068
|
+
"Res": y_,
|
|
1069
|
+
}
|
|
1070
|
+
)
|
|
1071
|
+
if c_ == 0 and len(chromosomes) > 1:
|
|
1072
|
+
for file_ in os.listdir("."):
|
|
1073
|
+
if file_.startswith(
|
|
1074
|
+
f"consenrichOutput_{experimentName}"
|
|
1075
|
+
) and (
|
|
1076
|
+
file_.endswith(".bedGraph")
|
|
1077
|
+
or file_.endswith(".narrowPeak")
|
|
1078
|
+
):
|
|
1079
|
+
logger.warning(f"Overwriting: {file_}")
|
|
1080
|
+
os.remove(file_)
|
|
1081
|
+
|
|
1082
|
+
for col, suffix in [("State", "state"), ("Res", "residuals")]:
|
|
1083
|
+
logger.info(
|
|
1084
|
+
f"{chromosome}: writing/appending to: consenrichOutput_{experimentName}_{suffix}.bedGraph"
|
|
1085
|
+
)
|
|
1086
|
+
df[["Chromosome", "Start", "End", col]].to_csv(
|
|
1087
|
+
f"consenrichOutput_{experimentName}_{suffix}.bedGraph",
|
|
1088
|
+
sep="\t",
|
|
1089
|
+
header=False,
|
|
1090
|
+
index=False,
|
|
1091
|
+
mode="a",
|
|
1092
|
+
float_format="%.3f",
|
|
1093
|
+
lineterminator="\n",
|
|
1094
|
+
)
|
|
1095
|
+
try:
|
|
1096
|
+
if matchingEnabled:
|
|
1097
|
+
if (
|
|
1098
|
+
minMatchLengthBP_ is None
|
|
1099
|
+
or minMatchLengthBP_ <= 0
|
|
1100
|
+
):
|
|
1101
|
+
minMatchLengthBP_ = (
|
|
1102
|
+
matching.autoMinLengthIntervals(x_)
|
|
1103
|
+
* (intervals[1] - intervals[0])
|
|
1104
|
+
)
|
|
1105
|
+
|
|
1106
|
+
if mergeGapBP_ is None:
|
|
1107
|
+
mergeGapBP_ = int(minMatchLengthBP_ / 2) + 1
|
|
1108
|
+
|
|
1109
|
+
matchingDF = matching.matchWavelet(
|
|
1110
|
+
chromosome,
|
|
1111
|
+
intervals,
|
|
1112
|
+
x_,
|
|
1113
|
+
matchingArgs.templateNames,
|
|
1114
|
+
matchingArgs.cascadeLevels,
|
|
1115
|
+
matchingArgs.iters,
|
|
1116
|
+
matchingArgs.alpha,
|
|
1117
|
+
minMatchLengthBP_,
|
|
1118
|
+
matchingArgs.maxNumMatches,
|
|
1119
|
+
matchingArgs.minSignalAtMaxima,
|
|
1120
|
+
useScalingFunction=matchingArgs.useScalingFunction,
|
|
1121
|
+
excludeRegionsBedFile=matchingArgs.excludeRegionsBedFile,
|
|
1122
|
+
randSeed=matchingArgs.randSeed,
|
|
1123
|
+
weights=weights_,
|
|
1124
|
+
)
|
|
1125
|
+
if not matchingDF.empty:
|
|
1126
|
+
matchingDF.to_csv(
|
|
1127
|
+
f"consenrichOutput_{experimentName}_matches.narrowPeak",
|
|
1128
|
+
sep="\t",
|
|
1129
|
+
header=False,
|
|
1130
|
+
index=False,
|
|
1131
|
+
mode="a",
|
|
1132
|
+
float_format="%.3f",
|
|
1133
|
+
lineterminator="\n",
|
|
1134
|
+
)
|
|
1135
|
+
except Exception as e:
|
|
1136
|
+
logger.warning(
|
|
1137
|
+
f"Matching routine unsuccessful for {chromosome}...SKIPPING:\n{e}\n\n"
|
|
1138
|
+
)
|
|
1139
|
+
continue
|
|
1140
|
+
logger.info("Finished: output in human-readable format")
|
|
1141
|
+
convertBedGraphToBigWig(experimentName, genomeArgs.chromSizesFile)
|
|
1142
|
+
if matchingEnabled and matchingArgs.merge:
|
|
1143
|
+
try:
|
|
1144
|
+
mergeGapBP_ = matchingArgs.mergeGapBP
|
|
1145
|
+
if mergeGapBP_ is None or mergeGapBP_ <= 0:
|
|
1146
|
+
mergeGapBP_ = (
|
|
1147
|
+
int(minMatchLengthBP_ / 2) + 1
|
|
1148
|
+
if minMatchLengthBP_ is not None
|
|
1149
|
+
and minMatchLengthBP_ >= 0
|
|
1150
|
+
else 75
|
|
1151
|
+
)
|
|
1152
|
+
matching.mergeMatches(
|
|
1153
|
+
f"consenrichOutput_{experimentName}_matches.narrowPeak",
|
|
1154
|
+
mergeGapBP=mergeGapBP_,
|
|
1155
|
+
)
|
|
1156
|
+
|
|
1157
|
+
except Exception as e:
|
|
1158
|
+
logger.warning(
|
|
1159
|
+
f"Failed to merge matches...SKIPPING:\n{e}\n\n"
|
|
1160
|
+
)
|
|
1161
|
+
logger.info("Done.")
|
|
1162
|
+
|
|
1163
|
+
|
|
1164
|
+
# Script entry point: run the consenrich CLI driver when this module is
# executed directly (e.g. `python -m consenrich` or the console script).
if __name__ == "__main__":
    main()
|