consenrich 0.7.4b3__cp311-cp311-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of consenrich might be problematic. Click here for more details.
- consenrich/__init__.py +11 -0
- consenrich/cconsenrich.c +48685 -0
- consenrich/cconsenrich.cpython-311-darwin.so +0 -0
- consenrich/cconsenrich.pyx +861 -0
- consenrich/consenrich.py +1390 -0
- consenrich/constants.py +172 -0
- consenrich/core.py +1441 -0
- consenrich/data/ce10.sizes +6 -0
- consenrich/data/ce10_blacklist.bed +100 -0
- consenrich/data/ce10_sparse.bed +11828 -0
- consenrich/data/ce11.sizes +6 -0
- consenrich/data/ce11_blacklist.bed +97 -0
- consenrich/data/ce11_sparse.bed +11828 -0
- consenrich/data/dm6.sizes +7 -0
- consenrich/data/dm6_blacklist.bed +182 -0
- consenrich/data/dm6_sparse.bed +20000 -0
- consenrich/data/hg19.sizes +24 -0
- consenrich/data/hg19_blacklist.bed +834 -0
- consenrich/data/hg19_sparse.bed +288358 -0
- consenrich/data/hg38.sizes +24 -0
- consenrich/data/hg38_blacklist.bed +636 -0
- consenrich/data/hg38_sparse.bed +288699 -0
- consenrich/data/mm10.sizes +21 -0
- consenrich/data/mm10_blacklist.bed +3435 -0
- consenrich/data/mm10_sparse.bed +100400 -0
- consenrich/data/mm39.sizes +21 -0
- consenrich/data/mm39_blacklist.bed +3360 -0
- consenrich/data/mm39_sparse.bed +100381 -0
- consenrich/detrorm.py +249 -0
- consenrich/matching.py +901 -0
- consenrich/misc_util.py +122 -0
- consenrich-0.7.4b3.dist-info/METADATA +65 -0
- consenrich-0.7.4b3.dist-info/RECORD +37 -0
- consenrich-0.7.4b3.dist-info/WHEEL +6 -0
- consenrich-0.7.4b3.dist-info/entry_points.txt +2 -0
- consenrich-0.7.4b3.dist-info/licenses/LICENSE +21 -0
- consenrich-0.7.4b3.dist-info/top_level.txt +1 -0
consenrich/consenrich.py
ADDED
|
@@ -0,0 +1,1390 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import glob
|
|
6
|
+
import logging
|
|
7
|
+
import pprint
|
|
8
|
+
import os
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from collections.abc import Mapping
|
|
11
|
+
from typing import List, Optional, Tuple, Dict, Any, Union
|
|
12
|
+
import shutil
|
|
13
|
+
import subprocess
|
|
14
|
+
import sys
|
|
15
|
+
import numpy as np
|
|
16
|
+
import pandas as pd
|
|
17
|
+
import pysam
|
|
18
|
+
import pywt
|
|
19
|
+
import yaml
|
|
20
|
+
|
|
21
|
+
import consenrich.core as core
|
|
22
|
+
import consenrich.misc_util as misc_util
|
|
23
|
+
import consenrich.constants as constants
|
|
24
|
+
import consenrich.detrorm as detrorm
|
|
25
|
+
import consenrich.matching as matching
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
logging.basicConfig(
|
|
29
|
+
level=logging.INFO,
|
|
30
|
+
format="%(asctime)s - %(module)s.%(funcName)s - %(levelname)s - %(message)s",
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
logger = logging.getLogger(__name__)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _loadConfig(
|
|
37
|
+
configSource: Union[str, Path, Mapping[str, Any]],
|
|
38
|
+
) -> Dict[str, Any]:
|
|
39
|
+
r"""Load a YAML config from a path or accept an already-parsed mapping.
|
|
40
|
+
|
|
41
|
+
If given a dict-like object, just return it.If given a path, try to load as YAML --> dict
|
|
42
|
+
If given a path, try to load as YAML --> dict
|
|
43
|
+
|
|
44
|
+
"""
|
|
45
|
+
if isinstance(configSource, Mapping):
|
|
46
|
+
configData = configSource
|
|
47
|
+
elif isinstance(configSource, (str, Path)):
|
|
48
|
+
with open(configSource, "r") as fileHandle:
|
|
49
|
+
configData = yaml.safe_load(fileHandle) or {}
|
|
50
|
+
else:
|
|
51
|
+
raise TypeError("`config` must be a path or a mapping/dict.")
|
|
52
|
+
|
|
53
|
+
if not isinstance(configData, Mapping):
|
|
54
|
+
raise TypeError("Top-level YAML must be a mapping/object.")
|
|
55
|
+
return configData
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _cfgGet(
|
|
59
|
+
configMap: Mapping[str, Any],
|
|
60
|
+
dottedKey: str,
|
|
61
|
+
defaultVal: Any = None,
|
|
62
|
+
) -> Any:
|
|
63
|
+
r"""Support both dotted keys and yaml/dict-style nested access for configs."""
|
|
64
|
+
|
|
65
|
+
# e.g., inputParams.bamFiles
|
|
66
|
+
if dottedKey in configMap:
|
|
67
|
+
return configMap[dottedKey]
|
|
68
|
+
|
|
69
|
+
# e.g.,
|
|
70
|
+
# inputParams:
|
|
71
|
+
# bamFiles: [...]
|
|
72
|
+
currentVal: Any = configMap
|
|
73
|
+
for keyPart in dottedKey.split("."):
|
|
74
|
+
if isinstance(currentVal, Mapping) and keyPart in currentVal:
|
|
75
|
+
currentVal = currentVal[keyPart]
|
|
76
|
+
else:
|
|
77
|
+
return defaultVal
|
|
78
|
+
return currentVal
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _listOrEmpty(list_):
|
|
82
|
+
if list_ is None:
|
|
83
|
+
return []
|
|
84
|
+
return list_
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _getMinR(configMap, numBams: int) -> float:
    r"""Read ``observationParams.minR`` from the config as a float.

    :param configMap: Parsed configuration mapping.
    :param numBams: Number of BAM files; not used by this function.
    :return: The configured value converted to ``float``, or ``1.0`` when the
        key is absent/``None`` or the value cannot be converted (the latter
        also logs a warning).
    """
    fallbackMinR: float = 1.0
    try:
        configured = _cfgGet(configMap, "observationParams.minR", None)
        if configured is None:
            return fallbackMinR
        return float(configured)
    except (TypeError, ValueError, KeyError):
        logger.warning(
            f"Invalid or missing 'observationParams.minR' in config. Using `{fallbackMinR}`."
        )
        return fallbackMinR
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def checkControlsPresent(inputArgs: core.inputParams) -> bool:
    """Report whether control BAM files were supplied.

    :param inputArgs: core.inputParams object
    :return: True only if ``inputArgs.bamFilesControl`` is a non-empty list.
    """
    controlFiles = inputArgs.bamFilesControl
    if not isinstance(controlFiles, list):
        return False
    return len(controlFiles) > 0
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def getReadLengths(
    inputArgs: core.inputParams,
    countingArgs: core.countingParams,
    samArgs: core.samParams,
) -> List[int]:
    r"""Collect one read length per treatment BAM file.

    Each file in ``inputArgs.bamFiles`` is passed to ``core.getReadLength``
    together with ``countingArgs.numReads``, the constant ``1000``,
    ``samArgs.samThreads`` and ``samArgs.samFlagExclude``.

    :param inputArgs: core.inputParams object containing BAM file paths.
    :param countingArgs: core.countingParams object containing number of reads.
    :param samArgs: core.samParams object containing SAM thread and flag exclude parameters.
    :return: List of read lengths, in the same order as ``inputArgs.bamFiles``.
    :raises ValueError: If ``inputArgs.bamFiles`` is empty/missing or not a list.
    """
    bamPaths = inputArgs.bamFiles
    if not bamPaths:
        raise ValueError("No BAM files provided in the input arguments.")
    if not isinstance(bamPaths, list) or len(bamPaths) == 0:
        raise ValueError("bam files list is empty")

    readLengths = []
    for bamPath in bamPaths:
        # NOTE(review): the meaning of the literal 1000 is defined by
        # core.getReadLength's third positional parameter -- confirm there.
        readLengths.append(
            core.getReadLength(
                bamPath,
                countingArgs.numReads,
                1000,
                samArgs.samThreads,
                samArgs.samFlagExclude,
            )
        )
    return readLengths
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def checkMatchingEnabled(matchingArgs: core.matchingParams) -> bool:
    r"""Report whether template matching is configured.

    :param matchingArgs: core.matchingParams object
    :return: True only if both ``templateNames`` and ``cascadeLevels`` are
        non-empty lists.
    """
    # Attributes are read lazily, in this order, so the second is only
    # touched when the first already qualifies.
    for fieldName in ("templateNames", "cascadeLevels"):
        fieldValue = getattr(matchingArgs, fieldName)
        if not isinstance(fieldValue, list) or len(fieldValue) == 0:
            return False
    return True
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def getEffectiveGenomeSizes(
    genomeArgs: core.genomeParams, readLengths: List[int]
) -> List[int]:
    r"""Look up one effective genome size per read length.

    :param genomeArgs: core.genomeParams object; its ``genomeName`` is used.
    :param readLengths: List of read lengths for which to get effective genome sizes.
    :return: Results of ``constants.getEffectiveGenomeSize`` for each read
        length, in input order.
    :raises ValueError: If the genome name is not a non-empty string or
        ``readLengths`` is not a non-empty list.
    """
    genomeName = genomeArgs.genomeName
    if not (genomeName and isinstance(genomeName, str)):
        raise ValueError("Genome name must be a non-empty string.")

    if not (isinstance(readLengths, list) and readLengths):
        raise ValueError(
            "Read lengths must be a non-empty list. Try calling `getReadLengths` first."
        )

    effectiveSizes = []
    for readLength in readLengths:
        effectiveSizes.append(
            constants.getEffectiveGenomeSize(genomeName, readLength)
        )
    return effectiveSizes
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def getInputArgs(config_path: str) -> core.inputParams:
    r"""Build ``core.inputParams`` from the ``inputParams`` config section.

    Entries of ``inputParams.bamFiles`` and ``inputParams.bamFilesControl``
    containing ``*``, ``?`` or ``[`` are expanded with ``glob``. A single
    control file is reused for every treatment file. Every resulting path is
    passed to ``misc_util.checkBamFile``.

    :param config_path: Path to the YAML configuration file (an already-parsed
        mapping is also accepted, as in ``_loadConfig``).
    :return: ``core.inputParams`` with ``bamFiles``, ``bamFilesControl`` and
        ``pairedEnd`` set.
    :raises ValueError: If no treatment BAM files remain after expansion, or
        if the number of control files is not 0, 1, or the number of
        treatment files.
    """
    configData = _loadConfig(config_path)

    def expandWildCards(bamList: List[str]) -> List[str]:
        # A bare YAML scalar would otherwise be iterated character by character.
        if isinstance(bamList, str):
            bamList = [bamList]
        expandedList: List[str] = []
        for bamEntry in bamList:
            if any(wildcard in bamEntry for wildcard in "*?["):
                # glob.glob yields matches in arbitrary order; sorting keeps
                # the expanded list reproducible across runs and platforms
                # (presumably treatment/control files are paired by position
                # downstream -- verify against callers).
                matchedList = sorted(glob.glob(bamEntry))
                if not matchedList:
                    logger.warning(
                        f"Pattern {bamEntry} did not match any files."
                    )
                expandedList.extend(matchedList)
            else:
                expandedList.append(bamEntry)
        return expandedList

    bamFilesRaw = _cfgGet(configData, "inputParams.bamFiles", []) or []
    bamFilesControlRaw = (
        _cfgGet(configData, "inputParams.bamFilesControl", []) or []
    )

    bamFiles = expandWildCards(bamFilesRaw)
    bamFilesControl = expandWildCards(bamFilesControlRaw)

    if len(bamFiles) == 0:
        raise ValueError("No BAM files provided in the configuration.")

    if len(bamFilesControl) not in (0, 1, len(bamFiles)):
        raise ValueError(
            "Number of control BAM files must be 0, 1, or the same as number of treatment files"
        )

    if len(bamFilesControl) == 1:
        logger.info(
            f"Only one control given: Using {bamFilesControl[0]} for all treatment files."
        )
        bamFilesControl = bamFilesControl * len(bamFiles)

    for bamFile in bamFiles:
        misc_util.checkBamFile(bamFile)

    for bamFile in bamFilesControl:
        misc_util.checkBamFile(bamFile)

    pairedEndList = misc_util.bamsArePairedEnd(bamFiles)
    pairedEndConfig: Optional[bool] = _cfgGet(
        configData, "inputParams.pairedEnd", None
    )
    if pairedEndConfig is None:
        # No explicit setting: paired-end only if every treatment BAM is.
        pairedEndConfig = all(pairedEndList)
        if pairedEndConfig:
            logger.info("Paired-end BAM files detected")
        else:
            logger.info("One or more single-end BAM files detected")

    return core.inputParams(
        bamFiles=bamFiles,
        bamFilesControl=bamFilesControl,
        pairedEnd=pairedEndConfig,
    )
|
|
253
|
+
|
|
254
|
+
def getOutputArgs(config_path: str) -> core.outputParams:
    r"""Build ``core.outputParams`` from the ``outputParams`` config section.

    ``convertToBigWig`` defaults to whether a ``bedGraphToBigWig`` executable
    is found on ``PATH``; the other fields fall back to fixed defaults.

    :param config_path: Path to the YAML configuration file.
    :return: Populated ``core.outputParams`` object.
    """
    configData = _loadConfig(config_path)

    def lookup(dottedKey: str, fallback: Any) -> Any:
        return _cfgGet(configData, dottedKey, fallback)

    bigWigToolAvailable = bool(shutil.which("bedGraphToBigWig"))

    return core.outputParams(
        convertToBigWig=lookup(
            "outputParams.convertToBigWig", bigWigToolAvailable
        ),
        roundDigits=lookup("outputParams.roundDigits", 3),
        writeResiduals=lookup("outputParams.writeResiduals", True),
        writeRawResiduals=lookup("outputParams.writeRawResiduals", False),
        writeMuncTrace=lookup("outputParams.writeMuncTrace", False),
        writeStateStd=lookup("outputParams.writeStateStd", False),
    )
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
def getGenomeArgs(config_path: str) -> core.genomeParams:
    r"""Build ``core.genomeParams`` from the ``genomeParams`` config section.

    When ``genomeParams.name`` resolves to a known genome, the bundled sizes,
    blacklist and sparse resource files are used as defaults; each can be
    overridden with ``genomeParams.chromSizesFile``, ``.blacklistFile`` and
    ``.sparseBedFile``. If ``genomeParams.chromosomes`` is not given, the
    chromosome names are read from the first column of the sizes file.

    :param config_path: Path to the YAML configuration file.
    :return: Populated ``core.genomeParams`` object.
    :raises FileNotFoundError: If no chromosome sizes file is available or it
        does not exist.
    :raises ValueError: If no chromosomes remain after filtering.
    """
    configData = _loadConfig(config_path)

    def asList(value: Any) -> List[Any]:
        # A YAML scalar (e.g. `chromosomes: chr1`) is treated as a one-item
        # list instead of being iterated character by character.
        if value is None:
            return []
        if isinstance(value, (str, int)):
            return [value]
        return list(value)

    genomeName = _cfgGet(configData, "genomeParams.name", None)
    genomeLabel = constants.resolveGenomeName(genomeName)

    excludeChromsList: List[str] = asList(
        _cfgGet(configData, "genomeParams.excludeChroms", []) or []
    )
    excludeForNormList: List[str] = (
        _cfgGet(configData, "genomeParams.excludeForNorm", []) or []
    )

    chromSizesFile: Optional[str] = None
    blacklistFile: Optional[str] = None
    sparseBedFile: Optional[str] = None
    if genomeLabel:
        chromSizesFile = constants.getGenomeResourceFile(genomeLabel, "sizes")
        blacklistFile = constants.getGenomeResourceFile(
            genomeLabel, "blacklist"
        )
        sparseBedFile = constants.getGenomeResourceFile(genomeLabel, "sparse")

    # Explicit (truthy) config entries take precedence over bundled resources.
    chromSizesFile = (
        _cfgGet(configData, "genomeParams.chromSizesFile", None)
        or chromSizesFile
    )
    blacklistFile = (
        _cfgGet(configData, "genomeParams.blacklistFile", None)
        or blacklistFile
    )
    sparseBedFile = (
        _cfgGet(configData, "genomeParams.sparseBedFile", None)
        or sparseBedFile
    )

    if not chromSizesFile or not os.path.exists(chromSizesFile):
        raise FileNotFoundError(
            f"Chromosome sizes file {chromSizesFile} does not exist."
        )

    chromosomesConfig = _cfgGet(configData, "genomeParams.chromosomes", None)
    if chromosomesConfig is not None:
        chromosomesList = asList(chromosomesConfig)
    else:
        # Read names as plain strings: without `dtype`, numeric names such as
        # "1" are parsed as integers, and without `keep_default_na=False` a
        # name like "NA" becomes NaN -- both break the `.strip()` below.
        chromosomesFrame = pd.read_csv(
            chromSizesFile,
            sep="\t",
            header=None,
            names=["chrom", "size"],
            dtype={"chrom": str},
            keep_default_na=False,
        )
        chromosomesList = list(chromosomesFrame["chrom"])

    # Normalise to stripped, non-empty strings.
    chromosomesList = [
        cleanedName
        for cleanedName in (
            str(chromName).strip()
            for chromName in chromosomesList
            if chromName is not None
        )
        if cleanedName
    ]

    if excludeChromsList:
        excludedNames = {str(name).strip() for name in excludeChromsList}
        chromosomesList = [
            chromName
            for chromName in chromosomesList
            if chromName not in excludedNames
        ]

    if not chromosomesList:
        raise ValueError(
            "No valid chromosomes found after excluding specified chromosomes."
        )

    return core.genomeParams(
        genomeName=genomeLabel,
        chromSizesFile=chromSizesFile,
        blacklistFile=blacklistFile,
        sparseBedFile=sparseBedFile,
        chromosomes=chromosomesList,
        excludeChroms=excludeChromsList,
        excludeForNorm=excludeForNormList,
    )
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
def getCountingArgs(config_path: str) -> core.countingParams:
    r"""Build ``core.countingParams`` from the ``countingParams`` config section.

    If both ``applyAsinh`` and ``applyLog`` are set, ``applyLog`` is turned
    off with a warning. A single control scale factor is repeated to match
    the number of treatment scale factors.

    :param config_path: Path to the YAML configuration file.
    :return: Populated ``core.countingParams`` object.
    :raises ValueError: If either scale-factor entry is not a list, or the two
        lists have different lengths and the control list is not length 1.
    """
    configData = _loadConfig(config_path)

    def lookup(dottedKey: str, fallback: Any) -> Any:
        return _cfgGet(configData, dottedKey, fallback)

    stepSize = lookup("countingParams.stepSize", 25)
    scaleDownFlag = lookup("countingParams.scaleDown", True)
    scaleFactorList = lookup("countingParams.scaleFactors", None)
    numReads = lookup("countingParams.numReads", 100)
    scaleFactorsControlList = lookup(
        "countingParams.scaleFactorsControl", None
    )
    applyAsinhFlag = lookup("countingParams.applyAsinh", False)
    applyLogFlag = lookup("countingParams.applyLog", False)

    # The two transforms are mutually exclusive; asinh wins.
    if applyAsinhFlag and applyLogFlag:
        logger.warning(
            "Both `applyAsinh` and `applyLog` are set. Overriding `applyLog` to False."
        )
        applyAsinhFlag, applyLogFlag = True, False

    rescaleToTreatmentCoverageFlag = lookup(
        "countingParams.rescaleToTreatmentCoverage", False
    )

    if not (scaleFactorList is None or isinstance(scaleFactorList, list)):
        raise ValueError("`scaleFactors` should be a list of floats.")

    if not (
        scaleFactorsControlList is None
        or isinstance(scaleFactorsControlList, list)
    ):
        raise ValueError("`scaleFactorsControl` should be a list of floats.")

    bothGiven = (
        scaleFactorList is not None and scaleFactorsControlList is not None
    )
    if bothGiven and len(scaleFactorList) != len(scaleFactorsControlList):
        if len(scaleFactorsControlList) != 1:
            raise ValueError(
                "control and treatment scale factors: must be equal length or 1 control"
            )
        scaleFactorsControlList = scaleFactorsControlList * len(
            scaleFactorList
        )

    return core.countingParams(
        stepSize=stepSize,
        scaleDown=scaleDownFlag,
        scaleFactors=scaleFactorList,
        scaleFactorsControl=scaleFactorsControlList,
        numReads=numReads,
        applyAsinh=applyAsinhFlag,
        applyLog=applyLogFlag,
        rescaleToTreatmentCoverage=rescaleToTreatmentCoverageFlag,
    )
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
def readConfig(config_path: str) -> Dict[str, Any]:
    r"""Read and parse the configuration file for Consenrich.

    The YAML file is parsed once; the resulting mapping is handed to the
    section helpers (``getInputArgs`` etc.), which accept an already-parsed
    mapping via ``_loadConfig``. Keys missing from the config fall back to
    the defaults written below.

    :param config_path: Path to the YAML configuration file.
    :return: Dictionary with keys ``experimentName``, ``genomeArgs``,
        ``inputArgs``, ``outputArgs``, ``countingArgs``, ``processArgs``,
        ``observationArgs``, ``stateArgs``, ``samArgs``, ``detrendArgs`` and
        ``matchingArgs``.
    """
    configData = _loadConfig(config_path)

    def cfg(dottedKey: str, fallback: Any = None) -> Any:
        return _cfgGet(configData, dottedKey, fallback)

    # Reuse the parsed mapping so the file is not re-read for each section.
    inputParams = getInputArgs(configData)
    outputParams = getOutputArgs(configData)
    genomeParams = getGenomeArgs(configData)
    countingParams = getCountingArgs(configData)

    numBams = len(inputParams.bamFiles)
    minRDefault = _getMinR(configData, numBams)
    minQDefault = (minRDefault / numBams) + 0.10  # conditioning

    # The genome blacklist doubles as the default exclusion set for matching.
    matchingExcludeRegionsFileDefault: Optional[str] = (
        genomeParams.blacklistFile
    )

    # Detrending defaults depend on whether control samples are present.
    hasControls = (
        inputParams.bamFilesControl is not None
        and len(inputParams.bamFilesControl) > 0
    )
    windowDefault, degreeDefault = (
        (25_000, 1) if hasControls else (10_000, 2)
    )
    detrendWindowLengthBp = cfg(
        "detrendParams.detrendWindowLengthBP", windowDefault
    )
    detrendSavitzkyGolayDegree = cfg(
        "detrendParams.detrendSavitzkyGolayDegree", degreeDefault
    )

    experimentName = cfg("experimentName", "consenrichExperiment")

    processArgs = core.processParams(
        deltaF=cfg("processParams.deltaF", 0.5),
        minQ=cfg("processParams.minQ", minQDefault),
        maxQ=cfg("processParams.maxQ", 500.0),
        offDiagQ=cfg("processParams.offDiagQ", 0.0),
        dStatAlpha=cfg("processParams.dStatAlpha", 2.0),
        dStatd=cfg("processParams.dStatd", 1.0),
        dStatPC=cfg("processParams.dStatPC", 1.0),
        scaleResidualsByP11=cfg("processParams.scaleResidualsByP11", True),
    )

    observationArgs = core.observationParams(
        minR=minRDefault,
        maxR=cfg("observationParams.maxR", 500.0),
        useALV=cfg("observationParams.useALV", False),
        useConstantNoiseLevel=cfg(
            "observationParams.useConstantNoiseLevel", False
        ),
        noGlobal=cfg("observationParams.noGlobal", False),
        numNearest=cfg("observationParams.numNearest", 25),
        localWeight=cfg("observationParams.localWeight", 0.333),
        globalWeight=cfg("observationParams.globalWeight", 0.667),
        approximationWindowLengthBP=cfg(
            "observationParams.approximationWindowLengthBP", 10_000
        ),
        lowPassWindowLengthBP=cfg(
            "observationParams.lowPassWindowLengthBP", 20_000
        ),
        lowPassFilterType=cfg(
            "observationParams.lowPassFilterType", "median"
        ),
        returnCenter=cfg("observationParams.returnCenter", True),
    )

    stateArgs = core.stateParams(
        stateInit=cfg("stateParams.stateInit", 0.0),
        stateCovarInit=cfg("stateParams.stateCovarInit", 100.0),
        boundState=cfg("stateParams.boundState", True),
        stateLowerBound=cfg("stateParams.stateLowerBound", 0.0),
        stateUpperBound=cfg("stateParams.stateUpperBound", 10000.0),
    )

    # Paired-end input defaults to paired-end mode; single-end input defaults
    # to inferring the fragment length instead.
    pairedEndKnown = inputParams.pairedEnd is not None
    pairedEndDefault = (
        1 if pairedEndKnown and int(inputParams.pairedEnd) > 0 else 0
    )
    inferFragmentDefault = (
        1 if pairedEndKnown and int(inputParams.pairedEnd) == 0 else 0
    )

    samArgs = core.samParams(
        samThreads=cfg("samParams.samThreads", 1),
        samFlagExclude=cfg("samParams.samFlagExclude", 3844),
        oneReadPerBin=cfg("samParams.oneReadPerBin", 0),
        chunkSize=cfg("samParams.chunkSize", 1_000_000),
        offsetStr=cfg("samParams.offsetStr", "0,0"),
        extendBP=cfg("samParams.extendBP", []),
        maxInsertSize=cfg("samParams.maxInsertSize", 1000),
        pairedEndMode=cfg("samParams.pairedEndMode", pairedEndDefault),
        inferFragmentLength=cfg(
            "samParams.inferFragmentLength", inferFragmentDefault
        ),
        countEndsOnly=cfg("samParams.countEndsOnly", False),
    )

    detrendArgs = core.detrendParams(
        detrendWindowLengthBP=detrendWindowLengthBp,
        detrendTrackPercentile=cfg(
            "detrendParams.detrendTrackPercentile", 75
        ),
        usePolyFilter=cfg("detrendParams.usePolyFilter", False),
        detrendSavitzkyGolayDegree=detrendSavitzkyGolayDegree,
        useOrderStatFilter=cfg("detrendParams.useOrderStatFilter", True),
    )

    matchingArgs = core.matchingParams(
        templateNames=cfg("matchingParams.templateNames", []),
        cascadeLevels=cfg("matchingParams.cascadeLevels", []),
        iters=cfg("matchingParams.iters", 25_000),
        alpha=cfg("matchingParams.alpha", 0.05),
        minMatchLengthBP=cfg("matchingParams.minMatchLengthBP", 250),
        maxNumMatches=cfg("matchingParams.maxNumMatches", 100_000),
        minSignalAtMaxima=cfg("matchingParams.minSignalAtMaxima", "q:0.75"),
        merge=cfg("matchingParams.merge", True),
        mergeGapBP=cfg("matchingParams.mergeGapBP", None),
        useScalingFunction=cfg("matchingParams.useScalingFunction", True),
        excludeRegionsBedFile=cfg(
            "matchingParams.excludeRegionsBedFile",
            matchingExcludeRegionsFileDefault,
        ),
        randSeed=cfg("matchingParams.randSeed", 42),
        penalizeBy=cfg("matchingParams.penalizeBy", None),
        eps=cfg("matchingParams.eps", 1.0e-2),
    )

    return {
        "experimentName": experimentName,
        "genomeArgs": genomeParams,
        "inputArgs": inputParams,
        "outputArgs": outputParams,
        "countingArgs": countingParams,
        "processArgs": processArgs,
        "observationArgs": observationArgs,
        "stateArgs": stateArgs,
        "samArgs": samArgs,
        "detrendArgs": detrendArgs,
        "matchingArgs": matchingArgs,
    }
|
|
710
|
+
|
|
711
|
+
|
|
712
|
+
def convertBedGraphToBigWig(experimentName, chromSizesFile,
                            suffixes: Optional[List[str]] = None):
    r"""Convert Consenrich bedGraph outputs to bigWig with ``bedGraphToBigWig``.

    For each suffix, ``consenrichOutput_{experimentName}_{suffix}.bedGraph``
    in the current working directory is converted to
    ``{experimentName}_consenrich_{suffix}.bw``. Problems are logged as
    warnings rather than raised.

    :param experimentName: Experiment name embedded in the file names.
    :param chromSizesFile: Chromosome sizes file passed to ``bedGraphToBigWig``.
    :param suffixes: Track suffixes to convert; defaults to ``["state"]``.
    :return: None.
    """
    if suffixes is None:
        # at least look for `state` bedGraph
        suffixes = ["state"]

    # Adjacent literals are concatenated verbatim, so each piece carries its
    # own trailing space.
    warningMessage = (
        "Could not find UCSC bedGraphToBigWig binary utility. "
        "If you need bigWig files instead of the default, human-readable bedGraph files, "
        "you can download the `bedGraphToBigWig` binary from https://hgdownload.soe.ucsc.edu/admin/exe/<operatingSystem, architecture> "
        "OR install via conda (conda install -c bioconda ucsc-bedgraphtobigwig)."
    )

    logger.info(
        "Attempting to generate bigWig files from bedGraph format..."
    )
    path_ = shutil.which("bedGraphToBigWig")
    if not path_:
        logger.warning(f"\n{warningMessage}\n")
        return
    logger.info(f"Using bedGraphToBigWig from {path_}")

    for suffix in suffixes:
        bedgraph = f"consenrichOutput_{experimentName}_{suffix}.bedGraph"
        if not os.path.exists(bedgraph):
            logger.warning(
                f"bedGraph file {bedgraph} does not exist. Skipping bigWig conversion."
            )
            continue
        if not os.path.exists(chromSizesFile):
            # Without chromosome sizes no remaining track can be converted.
            logger.warning(
                f"{chromSizesFile} does not exist. Skipping bigWig conversion."
            )
            return

        bigwig = f"{experimentName}_consenrich_{suffix}.bw"
        logger.info(f"Start: {bedgraph} --> {bigwig}...")
        try:
            subprocess.run(
                [path_, bedgraph, chromSizesFile, bigwig], check=True
            )
        except (OSError, subprocess.SubprocessError) as e:
            logger.warning(
                f"bedGraph-->bigWig conversion with\n\n\t`bedGraphToBigWig {bedgraph} {chromSizesFile} {bigwig}`\nraised: \n{e}\n\n"
            )
            continue

        # Only report success for an output that exists and exceeds 100 bytes.
        if os.path.exists(bigwig) and os.path.getsize(bigwig) > 100:
            logger.info(f"Finished: converted {bedgraph} to {bigwig}.")
        else:
            logger.warning(
                f"bedGraphToBigWig exited without error but {bigwig} is missing or nearly empty."
            )
|
|
767
|
+
|
|
768
|
+
|
|
769
|
+
def _buildArgumentParser() -> argparse.ArgumentParser:
    """Build the argument parser for the Consenrich command-line interface.

    Besides ``--config`` and ``--verbose``, the parser exposes a family of
    ``--match-*`` options that are only consulted when ``--match-bedGraph``
    is given (matching-only mode).
    """
    parser = argparse.ArgumentParser(description="Consenrich CLI")
    parser.add_argument(
        "--config",
        type=str,
        dest="config",
        help="Path to a YAML config file with parameters + arguments defined in `consenrich.core`",
    )

    # --- Matching-specific command-line arguments ---
    parser.add_argument(
        "--match-bedGraph",
        type=str,
        dest="matchBedGraph",
        help=(
            "Path to a bedGraph file of Consenrich estimates to match templates against. "
            "If provided, *only* the matching algorithm is run (no other processing). Note that "
            "some features in `consenrich.matching` may not be supported through this CLI interface."
        ),
    )
    parser.add_argument(
        "--match-template",
        type=str,
        default="haar",
        # Every discrete wavelet known to pywt except names containing "bio".
        choices=[
            x for x in pywt.wavelist(kind="discrete") if "bio" not in x
        ],
        dest="matchTemplate",
    )
    parser.add_argument(
        "--match-level", type=int, default=2, dest="matchLevel"
    )
    parser.add_argument(
        "--match-alpha", type=float, default=0.05, dest="matchAlpha"
    )
    parser.add_argument(
        "--match-min-length",
        type=int,
        default=250,
        dest="matchMinMatchLengthBP",
    )
    parser.add_argument(
        "--match-iters", type=int, default=25000, dest="matchIters"
    )
    parser.add_argument(
        "--match-min-signal",
        type=str,
        default="q:0.75",
        dest="matchMinSignalAtMaxima",
    )
    parser.add_argument(
        "--match-max-matches",
        type=int,
        default=100000,
        dest="matchMaxNumMatches",
    )
    parser.add_argument(
        "--match-no-merge", action="store_true", dest="matchNoMerge"
    )
    parser.add_argument(
        "--match-merge-gap",
        type=int,
        default=None,
        dest="matchMergeGapBP",
    )
    parser.add_argument(
        "--match-use-wavelet",
        action="store_true",
        dest="matchUseWavelet",
    )
    parser.add_argument(
        "--match-seed", type=int, default=42, dest="matchRandSeed"
    )
    parser.add_argument(
        "--match-exclude-bed",
        type=str,
        default=None,
        dest="matchExcludeBed",
    )
    parser.add_argument(
        "--verbose", action="store_true", help="If set, logs config"
    )
    return parser


def _getMatchingWeights(penalizeBy, y_, P, muncMat):
    """Return the per-interval weights selected by ``penalizeBy``, or None.

    Recognized values are ``"absResiduals"`` (``|y_|``), ``"stateUncertainty"``
    / ``"stateStdDev"`` (``sqrt(P[:, 0, 0])``) and ``"muncTrace"`` (square
    root of the column-wise mean of ``muncMat``). ``None`` disables weighting.
    An unrecognized value, or any error while computing the weights, is
    logged as a warning and results in ``None`` (no weights).
    """
    if penalizeBy is None:
        return None

    if penalizeBy == "absResiduals":
        label = "absResiduals"

        def compute():
            return np.abs(y_)

    elif penalizeBy in ("stateUncertainty", "stateStdDev"):
        # Both spellings select the same weights and share one warning label.
        label = "stateUncertainty"

        def compute():
            return np.sqrt(P[:, 0, 0])

    elif penalizeBy == "muncTrace":
        label = "muncTrace"

        def compute():
            return np.sqrt(np.mean(muncMat.astype(np.float64), axis=0))

    else:
        logger.warning(
            f"Unrecognized `matchingParams.penalizeBy`: {penalizeBy}. No weights applied."
        )
        return None

    try:
        return compute()
    except Exception as e:
        logger.warning(
            f"Error computing weights for '{label}': {e}. No weights applied for matching."
        )
        return None


def main():
    """Entry point of the Consenrich command-line interface.

    Two modes are supported:

    * ``--match-bedGraph <file>``: run only
      ``matching.matchExistingBedGraph`` on an existing bedGraph using the
      ``--match-*`` options, then exit with status 0.
    * ``--config <yaml>``: read the config, count reads per chromosome for
      every BAM file (treatment minus control when controls are present),
      build the ``muncMat`` tracks, detrend, run ``core.runConsenrich`` and
      append the requested tracks to
      ``consenrichOutput_<experimentName>_<suffix>.bedGraph`` in the current
      working directory. Optionally runs template matching per chromosome,
      converts bedGraph output to bigWig and merges matches.

    Exits with status 1 if no config is given or the config file is missing.

    Raises:
        FileNotFoundError: if the ``--match-bedGraph`` path does not exist.
    """
    args = _buildArgumentParser().parse_args()

    if args.matchBedGraph:
        # Matching-only mode: nothing else is processed.
        if not os.path.exists(args.matchBedGraph):
            raise FileNotFoundError(
                f"bedGraph file {args.matchBedGraph} couldn't be found."
            )
        logger.info(
            f"Running matching algorithm using bedGraph file {args.matchBedGraph}..."
        )

        outName = matching.matchExistingBedGraph(
            args.matchBedGraph,
            args.matchTemplate,
            args.matchLevel,
            alpha=args.matchAlpha,
            minMatchLengthBP=args.matchMinMatchLengthBP,
            iters=args.matchIters,
            minSignalAtMaxima=args.matchMinSignalAtMaxima,
            maxNumMatches=args.matchMaxNumMatches,
            useScalingFunction=(not args.matchUseWavelet),
            merge=(not args.matchNoMerge),
            mergeGapBP=args.matchMergeGapBP,
            excludeRegionsBedFile=args.matchExcludeBed,
            randSeed=args.matchRandSeed,
        )
        logger.info(f"Finished matching. Written to {outName}")
        sys.exit(0)

    if not args.config:
        # Reported at error level because the process exits with a failure status.
        logger.error(
            "No config file provided, run with `--config <path_to_config.yaml>`"
        )
        logger.error(
            "See documentation: https://nolan-h-hamilton.github.io/Consenrich/"
        )
        sys.exit(1)

    if not os.path.exists(args.config):
        logger.error(f"Config file {args.config} does not exist.")
        logger.error(
            "See documentation: https://nolan-h-hamilton.github.io/Consenrich/"
        )
        sys.exit(1)

    config = readConfig(args.config)
    experimentName = config["experimentName"]
    genomeArgs = config["genomeArgs"]
    inputArgs = config["inputArgs"]
    outputArgs = config["outputArgs"]
    countingArgs = config["countingArgs"]
    processArgs = config["processArgs"]
    observationArgs = config["observationArgs"]
    stateArgs = config["stateArgs"]
    samArgs = config["samArgs"]
    detrendArgs = config["detrendArgs"]
    matchingArgs = config["matchingArgs"]
    bamFiles = inputArgs.bamFiles
    bamFilesControl = inputArgs.bamFilesControl
    numSamples = len(bamFiles)
    numNearest = observationArgs.numNearest
    stepSize = countingArgs.stepSize
    excludeForNorm = genomeArgs.excludeForNorm
    chromSizes = genomeArgs.chromSizesFile
    scaleDown = countingArgs.scaleDown
    extendBP_ = core.resolveExtendBP(samArgs.extendBP, bamFiles)
    initialTreatmentScaleFactors = []
    minMatchLengthBP_: Optional[int] = matchingArgs.minMatchLengthBP
    mergeGapBP_: Optional[int] = matchingArgs.mergeGapBP

    if args.verbose:
        try:
            logger.info("Configuration:\n")
            config_truncated = dict(config.items())
            config_truncated.update(
                {
                    "experimentName": experimentName,
                    "inputArgs": inputArgs,
                    "outputArgs": outputArgs,
                    "genomeArgs": genomeArgs,
                    "countingArgs": countingArgs,
                    "processArgs": processArgs,
                    "observationArgs": observationArgs,
                    "stateArgs": stateArgs,
                    "samArgs": samArgs,
                    "detrendArgs": detrendArgs,
                }
            )
            pprint.pprint(config_truncated, indent=8)
        except Exception as e:
            logger.warning(f"Failed to print parsed config:\n{e}\n")

    controlsPresent = checkControlsPresent(inputArgs)
    if args.verbose:
        logger.info(f"controlsPresent: {controlsPresent}")
    readLengthsBamFiles = getReadLengths(
        inputArgs, countingArgs, samArgs
    )
    effectiveGenomeSizes = getEffectiveGenomeSizes(
        genomeArgs, readLengthsBamFiles
    )
    matchingEnabled = checkMatchingEnabled(matchingArgs)
    if args.verbose:
        logger.info(f"matchingEnabled: {matchingEnabled}")
    scaleFactors = countingArgs.scaleFactors
    scaleFactorsControl = countingArgs.scaleFactorsControl

    if controlsPresent:
        readLengthsControlBamFiles = [
            core.getReadLength(
                bamFile,
                countingArgs.numReads,
                1000,
                samArgs.samThreads,
                samArgs.samFlagExclude,
            )
            for bamFile in bamFilesControl
        ]
        effectiveGenomeSizesControl = [
            constants.getEffectiveGenomeSize(
                genomeArgs.genomeName, readLength
            )
            for readLength in readLengthsControlBamFiles
        ]

        if scaleFactors is not None and scaleFactorsControl is not None:
            # User-supplied factors for both treatment and control win.
            treatScaleFactors = scaleFactors
            controlScaleFactors = scaleFactorsControl
            # still make sure this is accessible
            initialTreatmentScaleFactors = [1.0] * len(bamFiles)
        else:
            try:
                initialTreatmentScaleFactors = [
                    detrorm.getScaleFactor1x(
                        bamFile,
                        effectiveGenomeSize,
                        readLength,
                        genomeArgs.excludeChroms,
                        genomeArgs.chromSizesFile,
                        samArgs.samThreads,
                    )
                    for bamFile, effectiveGenomeSize, readLength in zip(
                        bamFiles,
                        effectiveGenomeSizes,
                        readLengthsBamFiles,
                    )
                ]
            except Exception as e:
                # Best-effort only: fall back to unit factors, but say so.
                logger.warning(
                    f"Could not compute initial treatment scale factors; using 1.0 for each sample:\n{e}\n"
                )
                initialTreatmentScaleFactors = [1.0] * len(bamFiles)

            pairScalingFactors = [
                detrorm.getPairScaleFactors(
                    bamFileA,
                    bamFileB,
                    effectiveGenomeSizeA,
                    effectiveGenomeSizeB,
                    readLengthA,
                    readLengthB,
                    excludeForNorm,
                    chromSizes,
                    samArgs.samThreads,
                    scaleDown,
                )
                for (
                    bamFileA,
                    bamFileB,
                    effectiveGenomeSizeA,
                    effectiveGenomeSizeB,
                    readLengthA,
                    readLengthB,
                ) in zip(
                    bamFiles,
                    bamFilesControl,
                    effectiveGenomeSizes,
                    effectiveGenomeSizesControl,
                    readLengthsBamFiles,
                    readLengthsControlBamFiles,
                )
            ]

            treatScaleFactors = []
            controlScaleFactors = []
            for scaleFactorA, scaleFactorB in pairScalingFactors:
                treatScaleFactors.append(scaleFactorA)
                controlScaleFactors.append(scaleFactorB)
    elif scaleFactors is None:
        # No controls and no user-supplied factors: compute one per sample.
        scaleFactors = [
            detrorm.getScaleFactor1x(
                bamFile,
                effectiveGenomeSize,
                readLength,
                genomeArgs.excludeChroms,
                genomeArgs.chromSizesFile,
                samArgs.samThreads,
            )
            for bamFile, effectiveGenomeSize, readLength in zip(
                bamFiles, effectiveGenomeSizes, readLengthsBamFiles
            )
        ]

    chromSizesDict = misc_util.getChromSizesDict(
        genomeArgs.chromSizesFile,
        excludeChroms=genomeArgs.excludeChroms,
    )
    chromosomes = genomeArgs.chromosomes

    # Output tracks depend only on the config, so resolve them once. Doing it
    # here (not inside the loop) also keeps `suffixes` defined for the bigWig
    # step when `chromosomes` is empty.
    optionalTracks = [
        ("Res", "residuals", outputArgs.writeResiduals),
        ("Munc", "muncTraces", outputArgs.writeMuncTrace),
        ("StateStd", "stdDevs", outputArgs.writeStateStd),
        ("RawRes", "rawResiduals", outputArgs.writeRawResiduals),
    ]
    trackCols = ["State"] + [
        col for col, _, enabled in optionalTracks if enabled
    ]
    suffixes = ["state"] + [
        suffix for _, suffix, enabled in optionalTracks if enabled
    ]
    outputPrefix = f"consenrichOutput_{experimentName}"

    for c_, chromosome in enumerate(chromosomes):
        chromosomeStart, chromosomeEnd = core.getChromRangesJoint(
            bamFiles,
            chromosome,
            chromSizesDict[chromosome],
            samArgs.samThreads,
            samArgs.samFlagExclude,
        )
        # Snap both ends down to a multiple of the step size.
        chromosomeStart = max(
            0, (chromosomeStart - (chromosomeStart % stepSize))
        )
        chromosomeEnd = max(
            0, (chromosomeEnd - (chromosomeEnd % stepSize))
        )
        numIntervals = (
            ((chromosomeEnd - chromosomeStart) + stepSize) - 1
        ) // stepSize
        intervals = np.arange(
            chromosomeStart, chromosomeEnd, stepSize
        )
        chromMat: np.ndarray = np.empty(
            (numSamples, numIntervals), dtype=np.float32
        )
        if controlsPresent:
            for j_, (bamA, bamB) in enumerate(
                zip(bamFiles, bamFilesControl)
            ):
                logger.info(
                    f"Counting (trt,ctrl) for {chromosome}: ({bamA}, {bamB})"
                )
                pairMatrix: np.ndarray = core.readBamSegments(
                    [bamA, bamB],
                    chromosome,
                    chromosomeStart,
                    chromosomeEnd,
                    stepSize,
                    [
                        readLengthsBamFiles[j_],
                        readLengthsControlBamFiles[j_],
                    ],
                    [treatScaleFactors[j_], controlScaleFactors[j_]],
                    samArgs.oneReadPerBin,
                    samArgs.samThreads,
                    samArgs.samFlagExclude,
                    offsetStr=samArgs.offsetStr,
                    extendBP=extendBP_[j_],
                    maxInsertSize=samArgs.maxInsertSize,
                    pairedEndMode=samArgs.pairedEndMode,
                    inferFragmentLength=samArgs.inferFragmentLength,
                    applyAsinh=countingArgs.applyAsinh,
                    applyLog=countingArgs.applyLog,
                    countEndsOnly=samArgs.countEndsOnly,
                )

                # Row 0 is the treatment track, row 1 its paired control.
                chromMat[j_, :] = pairMatrix[0, :] - pairMatrix[1, :]
        else:
            chromMat = core.readBamSegments(
                bamFiles,
                chromosome,
                chromosomeStart,
                chromosomeEnd,
                stepSize,
                readLengthsBamFiles,
                scaleFactors,
                samArgs.oneReadPerBin,
                samArgs.samThreads,
                samArgs.samFlagExclude,
                offsetStr=samArgs.offsetStr,
                extendBP=extendBP_,
                maxInsertSize=samArgs.maxInsertSize,
                pairedEndMode=samArgs.pairedEndMode,
                inferFragmentLength=samArgs.inferFragmentLength,
                applyAsinh=countingArgs.applyAsinh,
                applyLog=countingArgs.applyLog,
                countEndsOnly=samArgs.countEndsOnly,
            )

        sparseMap = None
        if genomeArgs.sparseBedFile and not observationArgs.useALV:
            logger.info(f"Building sparse mapping for {chromosome}...")
            sparseMap = core.getSparseMap(
                chromosome,
                intervals,
                numNearest,
                genomeArgs.sparseBedFile,
            )

        muncMat = np.empty_like(chromMat, dtype=np.float32)
        for j in range(numSamples):
            logger.info(
                f"Muncing {j + 1}/{numSamples} for {chromosome}..."
            )
            # The munc track is computed from the track *before* detrending.
            muncMat[j, :] = core.getMuncTrack(
                chromosome,
                intervals,
                stepSize,
                chromMat[j, :],
                observationArgs.minR,
                observationArgs.maxR,
                observationArgs.useALV,
                observationArgs.useConstantNoiseLevel,
                observationArgs.noGlobal,
                observationArgs.localWeight,
                observationArgs.globalWeight,
                observationArgs.approximationWindowLengthBP,
                observationArgs.lowPassWindowLengthBP,
                observationArgs.returnCenter,
                sparseMap=sparseMap,
                lowPassFilterType=observationArgs.lowPassFilterType,
            )
            chromMat[j, :] = detrorm.detrendTrack(
                chromMat[j, :],
                stepSize,
                detrendArgs.detrendWindowLengthBP,
                detrendArgs.useOrderStatFilter,
                detrendArgs.usePolyFilter,
                detrendArgs.detrendTrackPercentile,
                detrendArgs.detrendSavitzkyGolayDegree,
            )
        logger.info(f">>>Running consenrich: {chromosome}<<<")

        x, P, y = core.runConsenrich(
            chromMat,
            muncMat,
            processArgs.deltaF,
            processArgs.minQ,
            processArgs.maxQ,
            processArgs.offDiagQ,
            processArgs.dStatAlpha,
            processArgs.dStatd,
            processArgs.dStatPC,
            stateArgs.stateInit,
            stateArgs.stateCovarInit,
            stateArgs.boundState,
            stateArgs.stateLowerBound,
            stateArgs.stateUpperBound,
            samArgs.chunkSize,
            progressIter=50_000,
        )
        logger.info("Done.")

        x_ = core.getPrimaryState(x)
        y_ = core.getPrecisionWeightedResidual(
            y,
            muncMat,
            stateCovarSmoothed=(
                P if processArgs.scaleResidualsByP11 else None
            ),
        )

        weights_: Optional[np.ndarray] = _getMatchingWeights(
            matchingArgs.penalizeBy, y_, P, muncMat
        )

        # Columns are added in the same order as `trackCols`.
        df = pd.DataFrame(
            {
                "Chromosome": chromosome,
                "Start": intervals,
                "End": intervals + stepSize,
                "State": x_,
            }
        )
        if outputArgs.writeResiduals:
            df["Res"] = y_.astype(np.float32)  # FFR: cast necessary?
        if outputArgs.writeMuncTrace:
            df["Munc"] = np.sqrt(
                np.mean(muncMat.astype(np.float64), axis=0)
            ).astype(np.float32)
        if outputArgs.writeStateStd:
            df["StateStd"] = np.sqrt(P[:, 0, 0]).astype(np.float32)
        if outputArgs.writeRawResiduals:
            df["RawRes"] = np.mean(y, axis=1).astype(np.float32)

        if c_ == 0:
            # Output is appended per chromosome, so stale files from an
            # earlier run with the same experiment name are removed first.
            for file_ in os.listdir("."):
                if file_.startswith(outputPrefix) and file_.endswith(
                    (".bedGraph", ".narrowPeak")
                ):
                    logger.warning(f"Overwriting: {file_}")
                    os.remove(file_)

        for col, suffix in zip(trackCols, suffixes):
            bedGraphPath = f"{outputPrefix}_{suffix}.bedGraph"
            logger.info(
                f"{chromosome}: writing/appending to: {bedGraphPath}"
            )
            df[["Chromosome", "Start", "End", col]].to_csv(
                bedGraphPath,
                sep="\t",
                header=False,
                index=False,
                mode="a",
                float_format="%.3f",
                lineterminator="\n",
            )

        if matchingEnabled:
            try:
                if minMatchLengthBP_ is None or minMatchLengthBP_ <= 0:
                    # Derived once from the first chromosome processed and
                    # then reused for the remaining chromosomes.
                    minMatchLengthBP_ = matching.autoMinLengthIntervals(
                        x_
                    ) * (intervals[1] - intervals[0])

                matchingDF = matching.matchWavelet(
                    chromosome,
                    intervals,
                    x_,
                    matchingArgs.templateNames,
                    matchingArgs.cascadeLevels,
                    matchingArgs.iters,
                    matchingArgs.alpha,
                    minMatchLengthBP_,
                    matchingArgs.maxNumMatches,
                    matchingArgs.minSignalAtMaxima,
                    useScalingFunction=matchingArgs.useScalingFunction,
                    excludeRegionsBedFile=matchingArgs.excludeRegionsBedFile,
                    randSeed=matchingArgs.randSeed,
                    weights=weights_,
                )
                if not matchingDF.empty:
                    matchingDF.to_csv(
                        f"{outputPrefix}_matches.narrowPeak",
                        sep="\t",
                        header=False,
                        index=False,
                        mode="a",
                        float_format=f"%.{outputArgs.roundDigits}f",
                        lineterminator="\n",
                    )
            except Exception as e:
                # A failed matching pass must not abort the remaining chromosomes.
                logger.warning(
                    f"Matching routine unsuccessful for {chromosome}...SKIPPING:\n{e}\n\n"
                )

    logger.info("Finished: output in human-readable format")

    if outputArgs.convertToBigWig:
        convertBedGraphToBigWig(
            experimentName, genomeArgs.chromSizesFile, suffixes=suffixes
        )

    if matchingEnabled and matchingArgs.merge:
        try:
            if mergeGapBP_ is None or mergeGapBP_ <= 0:
                # Default gap: half the minimum match length (+1), or 75 bp
                # when no usable minimum match length is available.
                mergeGapBP_ = (
                    int(minMatchLengthBP_ / 2) + 1
                    if minMatchLengthBP_ is not None
                    and minMatchLengthBP_ >= 0
                    else 75
                )
            matching.mergeMatches(
                f"{outputPrefix}_matches.narrowPeak",
                mergeGapBP=mergeGapBP_,
            )
        except Exception as e:
            logger.warning(
                f"Failed to merge matches...SKIPPING:\n{e}\n\n"
            )
    logger.info("Done.")
|
|
1387
|
+
|
|
1388
|
+
|
|
1389
|
+
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|