consenrich 0.7.5b2__cp314-cp314-macosx_10_15_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of consenrich might be problematic. Click here for more details.

@@ -0,0 +1,1426 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import glob
6
+ import logging
7
+ import pprint
8
+ import os
9
+ from pathlib import Path
10
+ from collections.abc import Mapping
11
+ from typing import List, Optional, Tuple, Dict, Any, Union
12
+ import shutil
13
+ import subprocess
14
+ import sys
15
+ import numpy as np
16
+ import pandas as pd
17
+ import pysam
18
+ import pywt
19
+ import yaml
20
+
21
+ import consenrich.core as core
22
+ import consenrich.misc_util as misc_util
23
+ import consenrich.constants as constants
24
+ import consenrich.detrorm as detrorm
25
+ import consenrich.matching as matching
26
+
27
+
28
# Configure root logging once at import time so all consenrich modules share
# the same timestamped "module.function - level - message" format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(module)s.%(funcName)s - %(levelname)s - %(message)s",
)

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
34
+
35
+
36
+ def _loadConfig(
37
+ configSource: Union[str, Path, Mapping[str, Any]],
38
+ ) -> Dict[str, Any]:
39
+ r"""Load a YAML config from a path or accept an already-parsed mapping.
40
+
41
+ If given a dict-like object, just return it.If given a path, try to load as YAML --> dict
42
+ If given a path, try to load as YAML --> dict
43
+
44
+ """
45
+ if isinstance(configSource, Mapping):
46
+ configData = configSource
47
+ elif isinstance(configSource, (str, Path)):
48
+ with open(configSource, "r") as fileHandle:
49
+ configData = yaml.safe_load(fileHandle) or {}
50
+ else:
51
+ raise TypeError("`config` must be a path or a mapping/dict.")
52
+
53
+ if not isinstance(configData, Mapping):
54
+ raise TypeError("Top-level YAML must be a mapping/object.")
55
+ return configData
56
+
57
+
58
+ def _cfgGet(
59
+ configMap: Mapping[str, Any],
60
+ dottedKey: str,
61
+ defaultVal: Any = None,
62
+ ) -> Any:
63
+ r"""Support both dotted keys and yaml/dict-style nested access for configs."""
64
+
65
+ # e.g., inputParams.bamFiles
66
+ if dottedKey in configMap:
67
+ return configMap[dottedKey]
68
+
69
+ # e.g.,
70
+ # inputParams:
71
+ # bamFiles: [...]
72
+ currentVal: Any = configMap
73
+ for keyPart in dottedKey.split("."):
74
+ if isinstance(currentVal, Mapping) and keyPart in currentVal:
75
+ currentVal = currentVal[keyPart]
76
+ else:
77
+ return defaultVal
78
+ return currentVal
79
+
80
+
81
+ def _listOrEmpty(list_):
82
+ if list_ is None:
83
+ return []
84
+ return list_
85
+
86
+
87
def _getMinR(configMap, numBams: int) -> float:
    """Fetch ``observationParams.minR`` from the config as a float.

    Falls back to 1.0 when the key is missing or the configured value cannot
    be converted to a float. ``numBams`` is currently unused but kept for
    interface stability.
    """
    fallbackMinR: float = 1.0
    try:
        configured = _cfgGet(configMap, "observationParams.minR", None)
        if configured is None:
            return fallbackMinR
        return float(configured)
    except (TypeError, ValueError, KeyError):
        logger.warning(
            f"Invalid or missing 'observationParams.minR' in config. Using `{fallbackMinR}`."
        )
        return fallbackMinR
97
+
98
+
99
def checkControlsPresent(inputArgs: core.inputParams) -> bool:
    """Check if control BAM files are present in the input arguments.

    :param inputArgs: core.inputParams object
    :return: True if control BAM files are present, False otherwise.
    """
    controls = inputArgs.bamFilesControl
    # Only a non-empty *list* of control BAMs counts as "controls present".
    return isinstance(controls, list) and len(controls) > 0
110
+
111
+
112
def getReadLengths(
    inputArgs: core.inputParams,
    countingArgs: core.countingParams,
    samArgs: core.samParams,
) -> List[int]:
    r"""Get read lengths for each BAM file in the input arguments.

    :param inputArgs: core.inputParams object containing BAM file paths.
    :param countingArgs: core.countingParams object containing number of reads.
    :param samArgs: core.samParams object containing SAM thread and flag exclude parameters.
    :return: List of read lengths for each BAM file.
    :raises ValueError: If no BAM files are configured.
    """
    bamFiles = inputArgs.bamFiles
    if not bamFiles:
        raise ValueError("No BAM files provided in the input arguments.")
    if not isinstance(bamFiles, list) or len(bamFiles) == 0:
        raise ValueError("bam files list is empty")

    # Sample `numReads` reads (capped at 1000) from each BAM to estimate
    # its read length.
    readLengths: List[int] = []
    for bamFile in bamFiles:
        readLengths.append(
            core.getReadLength(
                bamFile,
                countingArgs.numReads,
                1000,
                samArgs.samThreads,
                samArgs.samFlagExclude,
            )
        )
    return readLengths
145
+
146
+
147
def checkMatchingEnabled(matchingArgs: core.matchingParams) -> bool:
    """Return True when template matching is configured.

    Matching requires both ``templateNames`` and ``cascadeLevels`` to be
    non-empty lists.
    """
    # Short-circuit left-to-right: cascadeLevels is only inspected when the
    # template list already qualifies.
    return (
        isinstance(matchingArgs.templateNames, list)
        and len(matchingArgs.templateNames) > 0
        and isinstance(matchingArgs.cascadeLevels, list)
        and len(matchingArgs.cascadeLevels) > 0
    )
160
+
161
+
162
def getEffectiveGenomeSizes(
    genomeArgs: core.genomeParams, readLengths: List[int]
) -> List[int]:
    r"""Get effective genome sizes for the given genome name and read lengths.

    :param genomeArgs: core.genomeParams object
    :param readLengths: List of read lengths for which to get effective genome sizes.
    :return: List of effective genome sizes corresponding to the read lengths.
    :raises ValueError: If the genome name is invalid or ``readLengths`` is empty.
    """
    name = genomeArgs.genomeName
    if not name or not isinstance(name, str):
        raise ValueError("Genome name must be a non-empty string.")

    if not isinstance(readLengths, list) or len(readLengths) == 0:
        raise ValueError(
            "Read lengths must be a non-empty list. Try calling `getReadLengths` first."
        )

    sizes: List[int] = []
    for readLength in readLengths:
        sizes.append(constants.getEffectiveGenomeSize(name, readLength))
    return sizes
182
+
183
+
184
def getInputArgs(config_path: str) -> core.inputParams:
    """Build a ``core.inputParams`` from the YAML config.

    Expands glob patterns in the BAM lists, validates the files, broadcasts a
    single control across treatments, and infers paired-end status when it is
    not configured explicitly.
    """
    configData = _loadConfig(config_path)

    def expandWildCards(bamList: List[str]) -> List[str]:
        # Expand shell-style wildcards (*, ?, [...]) in BAM path entries;
        # entries without wildcard characters are kept verbatim.
        expandedList: List[str] = []
        for bamEntry in bamList:
            if "*" in bamEntry or "?" in bamEntry or "[" in bamEntry:
                matchedList = glob.glob(bamEntry)
                expandedList.extend(matchedList)
            else:
                expandedList.append(bamEntry)
        return expandedList

    bamFilesRaw = (
        _cfgGet(configData, "inputParams.bamFiles", []) or []
    )
    bamFilesControlRaw = (
        _cfgGet(configData, "inputParams.bamFilesControl", []) or []
    )

    bamFiles = expandWildCards(bamFilesRaw)
    bamFilesControl = expandWildCards(bamFilesControlRaw)

    if len(bamFiles) == 0:
        raise ValueError(
            "No BAM files provided in the configuration."
        )

    # Controls must be absent, one-per-treatment, or a single shared control.
    if (
        len(bamFilesControl) > 0
        and len(bamFilesControl) != len(bamFiles)
        and len(bamFilesControl) != 1
    ):
        raise ValueError(
            "Number of control BAM files must be 0, 1, or the same as number of treatment files"
        )

    if len(bamFilesControl) == 1:
        logger.info(
            f"Only one control given: Using {bamFilesControl[0]} for all treatment files."
        )
        # Broadcast the single control across all treatment files.
        bamFilesControl = bamFilesControl * len(bamFiles)

    if not bamFiles or not isinstance(bamFiles, list):
        raise ValueError("No BAM files found")

    # Validate each treatment and control BAM (see misc_util.checkBamFile).
    for bamFile in bamFiles:
        misc_util.checkBamFile(bamFile)

    if bamFilesControl:
        for bamFile in bamFilesControl:
            misc_util.checkBamFile(bamFile)

    pairedEndList = misc_util.bamsArePairedEnd(bamFiles)
    pairedEndConfig: Optional[bool] = _cfgGet(
        configData, "inputParams.pairedEnd", None
    )
    # When `inputParams.pairedEnd` is not set, treat the experiment as
    # paired-end only if *all* treatment BAMs are paired-end.
    if pairedEndConfig is None:
        pairedEndConfig = all(pairedEndList)
        if pairedEndConfig:
            logger.info("Paired-end BAM files detected")
        else:
            logger.info("One or more single-end BAM files detected")

    return core.inputParams(
        bamFiles=bamFiles,
        bamFilesControl=bamFilesControl,
        pairedEnd=pairedEndConfig,
    )
253
+
254
+
255
def getOutputArgs(config_path: str) -> core.outputParams:
    """Build a ``core.outputParams`` from the YAML config.

    bigWig conversion defaults to enabled only when the UCSC
    ``bedGraphToBigWig`` utility is found on PATH; all other outputs default
    to their documented flags below.
    """
    configData = _loadConfig(config_path)

    # Default bigWig conversion on only if the converter binary is available.
    bigWigDefault = True if shutil.which("bedGraphToBigWig") else False

    return core.outputParams(
        convertToBigWig=_cfgGet(
            configData, "outputParams.convertToBigWig", bigWigDefault
        ),
        roundDigits=_cfgGet(configData, "outputParams.roundDigits", 3),
        writeResiduals=_cfgGet(
            configData, "outputParams.writeResiduals", True
        ),
        writeRawResiduals=_cfgGet(
            configData, "outputParams.writeRawResiduals", False
        ),
        writeMuncTrace=_cfgGet(
            configData, "outputParams.writeMuncTrace", False
        ),
        writeStateStd=_cfgGet(
            configData, "outputParams.writeStateStd", False
        ),
    )
290
+
291
+
292
def getGenomeArgs(config_path: str) -> core.genomeParams:
    """Build a ``core.genomeParams`` from the YAML config.

    Resolves packaged genome resource files (chrom sizes, blacklist, sparse
    regions) from the genome name, honors per-file overrides from the config,
    then derives the final chromosome list.
    """
    configData = _loadConfig(config_path)

    genomeName = _cfgGet(configData, "genomeParams.name", None)
    genomeLabel = constants.resolveGenomeName(genomeName)

    chromSizesFile: Optional[str] = None
    blacklistFile: Optional[str] = None
    sparseBedFile: Optional[str] = None
    chromosomesList: Optional[List[str]] = None

    excludeChromsList: List[str] = (
        _cfgGet(configData, "genomeParams.excludeChroms", []) or []
    )
    excludeForNormList: List[str] = (
        _cfgGet(configData, "genomeParams.excludeForNorm", []) or []
    )

    # Packaged resource files for a recognized genome label.
    if genomeLabel:
        chromSizesFile = constants.getGenomeResourceFile(
            genomeLabel, "sizes"
        )
        blacklistFile = constants.getGenomeResourceFile(
            genomeLabel, "blacklist"
        )
        sparseBedFile = constants.getGenomeResourceFile(
            genomeLabel, "sparse"
        )

    # Explicit config paths override the packaged defaults.
    chromSizesOverride = _cfgGet(
        configData, "genomeParams.chromSizesFile", None
    )
    if chromSizesOverride:
        chromSizesFile = chromSizesOverride

    blacklistOverride = _cfgGet(
        configData, "genomeParams.blacklistFile", None
    )
    if blacklistOverride:
        blacklistFile = blacklistOverride

    sparseOverride = _cfgGet(
        configData, "genomeParams.sparseBedFile", None
    )
    if sparseOverride:
        sparseBedFile = sparseOverride

    if not chromSizesFile or not os.path.exists(chromSizesFile):
        raise FileNotFoundError(
            f"Chromosome sizes file {chromSizesFile} does not exist."
        )

    # Chromosome list: taken from the config when given; otherwise read from
    # the two-column, tab-separated chrom sizes file.
    chromosomesConfig = _cfgGet(
        configData, "genomeParams.chromosomes", None
    )
    if chromosomesConfig is not None:
        chromosomesList = chromosomesConfig
    else:
        if chromSizesFile:
            chromosomesFrame = pd.read_csv(
                chromSizesFile,
                sep="\t",
                header=None,
                names=["chrom", "size"],
            )
            chromosomesList = list(chromosomesFrame["chrom"])
        else:
            raise ValueError(
                "No chromosomes provided in the configuration and no chromosome sizes file specified."
            )

    # Normalize whitespace and drop empty entries, then apply exclusions.
    chromosomesList = [
        chromName.strip()
        for chromName in chromosomesList
        if chromName and chromName.strip()
    ]
    if excludeChromsList:
        chromosomesList = [
            chromName
            for chromName in chromosomesList
            if chromName not in excludeChromsList
        ]
    if not chromosomesList:
        raise ValueError(
            "No valid chromosomes found after excluding specified chromosomes."
        )

    return core.genomeParams(
        genomeName=genomeLabel,
        chromSizesFile=chromSizesFile,
        blacklistFile=blacklistFile,
        sparseBedFile=sparseBedFile,
        chromosomes=chromosomesList,
        excludeChroms=excludeChromsList,
        excludeForNorm=excludeForNormList,
    )
388
+
389
+
390
def getCountingArgs(config_path: str) -> core.countingParams:
    r"""Build a ``core.countingParams`` from the YAML configuration.

    Validates the scale-factor lists, reconciles the mutually exclusive
    ``applyAsinh``/``applyLog`` transform flags, and normalizes the
    normalization method name.

    :param config_path: Path to the YAML configuration file.
    :return: Populated ``core.countingParams`` object.
    :raises ValueError: If scale-factor options are malformed.
    """
    configData = _loadConfig(config_path)

    stepSize = _cfgGet(configData, "countingParams.stepSize", 25)
    scaleDownFlag = _cfgGet(
        configData, "countingParams.scaleDown", False
    )
    scaleFactorList = _cfgGet(
        configData, "countingParams.scaleFactors", None
    )
    numReads = _cfgGet(configData, "countingParams.numReads", 100)
    scaleFactorsControlList = _cfgGet(
        configData, "countingParams.scaleFactorsControl", None
    )
    applyAsinhFlag = _cfgGet(
        configData, "countingParams.applyAsinh", False
    )
    applyLogFlag = _cfgGet(
        configData, "countingParams.applyLog", False
    )

    # The two variance-stabilizing transforms are mutually exclusive; prefer
    # asinh when both are requested.
    if applyAsinhFlag and applyLogFlag:
        applyAsinhFlag = True
        applyLogFlag = False
        logger.warning(
            "Both `applyAsinh` and `applyLog` are set. Overriding `applyLog=False` & `applyAsinh=True`."
        )

    rescaleToTreatmentCoverageFlag = _cfgGet(
        configData,
        "countingParams.rescaleToTreatmentCoverage",
        False,
    )

    if scaleFactorList is not None and not isinstance(
        scaleFactorList, list
    ):
        raise ValueError("`scaleFactors` should be a list of floats.")

    if scaleFactorsControlList is not None and not isinstance(
        scaleFactorsControlList, list
    ):
        raise ValueError(
            "`scaleFactorsControl` should be a list of floats."
        )

    # A single control scale factor is broadcast across all treatments;
    # otherwise the two lists must be the same length.
    if (
        scaleFactorList is not None
        and scaleFactorsControlList is not None
        and len(scaleFactorList) != len(scaleFactorsControlList)
    ):
        if len(scaleFactorsControlList) == 1:
            scaleFactorsControlList = scaleFactorsControlList * len(
                scaleFactorList
            )
        else:
            raise ValueError(
                "control and treatment scale factors: must be equal length or 1 control"
            )

    normMethod_ = _cfgGet(
        configData, "countingParams.normMethod", "EGS"
    )
    # FIX: guard against non-string configured values before calling
    # .upper(), which previously raised AttributeError; fall back to "EGS".
    if not isinstance(normMethod_, str) or normMethod_.upper() not in (
        "EGS",
        "RPKM",
    ):
        logger.warning(
            "Unknown `countingParams.normMethod`...Using `EGS`...",
        )
        normMethod_ = "EGS"

    return core.countingParams(
        stepSize=stepSize,
        scaleDown=scaleDownFlag,
        scaleFactors=scaleFactorList,
        scaleFactorsControl=scaleFactorsControlList,
        numReads=numReads,
        applyAsinh=applyAsinhFlag,
        applyLog=applyLogFlag,
        rescaleToTreatmentCoverage=rescaleToTreatmentCoverageFlag,
        normMethod=normMethod_,
    )
474
+
475
+
476
def readConfig(config_path: str) -> Dict[str, Any]:
    r"""Read and parse the configuration file for Consenrich.

    :param config_path: Path to the YAML configuration file.
    :return: Dictionary containing all parsed configuration parameters.
    """
    configData = _loadConfig(config_path)

    # Section parsers (each re-reads/parses the same config file).
    inputParams = getInputArgs(config_path)
    outputParams = getOutputArgs(config_path)
    genomeParams = getGenomeArgs(config_path)
    countingParams = getCountingArgs(config_path)

    # Default minimum process noise scales inversely with sample count.
    minRDefault = _getMinR(configData, len(inputParams.bamFiles))
    minQDefault = (
        minRDefault / len(inputParams.bamFiles)
    ) + 0.10  # conditioning

    # Matching excludes blacklisted regions by default.
    matchingExcludeRegionsFileDefault: Optional[str] = (
        genomeParams.blacklistFile
    )

    # Detrending defaults depend on whether control BAMs are present:
    # wider window / lower Savitzky-Golay degree with controls.
    if (
        inputParams.bamFilesControl is not None
        and len(inputParams.bamFilesControl) > 0
    ):
        detrendWindowLengthBp = _cfgGet(
            configData,
            "detrendParams.detrendWindowLengthBP",
            25_000,
        )
        detrendSavitzkyGolayDegree = _cfgGet(
            configData,
            "detrendParams.detrendSavitzkyGolayDegree",
            1,
        )
    else:
        detrendWindowLengthBp = _cfgGet(
            configData,
            "detrendParams.detrendWindowLengthBP",
            10_000,
        )
        detrendSavitzkyGolayDegree = _cfgGet(
            configData,
            "detrendParams.detrendSavitzkyGolayDegree",
            2,
        )

    experimentName = _cfgGet(
        configData, "experimentName", "consenrichExperiment"
    )

    # Process-model (state transition / noise) parameters.
    processArgs = core.processParams(
        deltaF=_cfgGet(configData, "processParams.deltaF", 0.5),
        minQ=_cfgGet(configData, "processParams.minQ", minQDefault),
        maxQ=_cfgGet(configData, "processParams.maxQ", 500.0),
        offDiagQ=_cfgGet(configData, "processParams.offDiagQ", 0.0),
        dStatAlpha=_cfgGet(
            configData, "processParams.dStatAlpha", 2.0
        ),
        dStatd=_cfgGet(configData, "processParams.dStatd", 1.0),
        dStatPC=_cfgGet(configData, "processParams.dStatPC", 1.0),
        scaleResidualsByP11=_cfgGet(
            configData,
            "processParams.scaleResidualsByP11",
            True,
        ),
    )

    # Observation-model (measurement noise) parameters.
    observationArgs = core.observationParams(
        minR=minRDefault,
        maxR=_cfgGet(configData, "observationParams.maxR", 500.0),
        useALV=_cfgGet(configData, "observationParams.useALV", False),
        useConstantNoiseLevel=_cfgGet(
            configData,
            "observationParams.useConstantNoiseLevel",
            False,
        ),
        noGlobal=_cfgGet(
            configData, "observationParams.noGlobal", False
        ),
        numNearest=_cfgGet(
            configData, "observationParams.numNearest", 25
        ),
        localWeight=_cfgGet(
            configData, "observationParams.localWeight", 0.333
        ),
        globalWeight=_cfgGet(
            configData, "observationParams.globalWeight", 0.667
        ),
        approximationWindowLengthBP=_cfgGet(
            configData,
            "observationParams.approximationWindowLengthBP",
            10_000,
        ),
        lowPassWindowLengthBP=_cfgGet(
            configData,
            "observationParams.lowPassWindowLengthBP",
            20_000,
        ),
        lowPassFilterType=_cfgGet(
            configData,
            "observationParams.lowPassFilterType",
            "median",
        ),
        returnCenter=_cfgGet(
            configData, "observationParams.returnCenter", True
        ),
    )

    # State initialization and bounding parameters.
    stateArgs = core.stateParams(
        stateInit=_cfgGet(configData, "stateParams.stateInit", 0.0),
        stateCovarInit=_cfgGet(
            configData, "stateParams.stateCovarInit", 100.0
        ),
        boundState=_cfgGet(
            configData, "stateParams.boundState", True
        ),
        stateLowerBound=_cfgGet(
            configData, "stateParams.stateLowerBound", 0.0
        ),
        stateUpperBound=_cfgGet(
            configData, "stateParams.stateUpperBound", 10000.0
        ),
    )

    # SAM/BAM reading parameters (3844 excludes unmapped/secondary/
    # supplementary/duplicate/QC-fail reads).
    samThreads = _cfgGet(configData, "samParams.samThreads", 1)
    samFlagExclude = _cfgGet(
        configData, "samParams.samFlagExclude", 3844
    )
    oneReadPerBin = _cfgGet(configData, "samParams.oneReadPerBin", 0)
    chunkSize = _cfgGet(configData, "samParams.chunkSize", 1_000_000)
    offsetStr = _cfgGet(configData, "samParams.offsetStr", "0,0")
    extendBpList = _cfgGet(configData, "samParams.extendBP", [])
    maxInsertSize = _cfgGet(
        configData, "samParams.maxInsertSize", 1000
    )

    # Paired-end mode defaults from the (possibly inferred) pairedEnd flag;
    # fragment-length inference defaults on only for single-end data.
    pairedEndDefault = (
        1
        if inputParams.pairedEnd is not None
        and int(inputParams.pairedEnd) > 0
        else 0
    )
    inferFragmentDefault = (
        1
        if inputParams.pairedEnd is not None
        and int(inputParams.pairedEnd) == 0
        else 0
    )

    samArgs = core.samParams(
        samThreads=samThreads,
        samFlagExclude=samFlagExclude,
        oneReadPerBin=oneReadPerBin,
        chunkSize=chunkSize,
        offsetStr=offsetStr,
        extendBP=extendBpList,
        maxInsertSize=maxInsertSize,
        pairedEndMode=_cfgGet(
            configData,
            "samParams.pairedEndMode",
            pairedEndDefault,
        ),
        inferFragmentLength=_cfgGet(
            configData,
            "samParams.inferFragmentLength",
            inferFragmentDefault,
        ),
        countEndsOnly=_cfgGet(
            configData, "samParams.countEndsOnly", False
        ),
    )

    detrendArgs = core.detrendParams(
        detrendWindowLengthBP=detrendWindowLengthBp,
        detrendTrackPercentile=_cfgGet(
            configData,
            "detrendParams.detrendTrackPercentile",
            75,
        ),
        usePolyFilter=_cfgGet(
            configData, "detrendParams.usePolyFilter", False
        ),
        detrendSavitzkyGolayDegree=detrendSavitzkyGolayDegree,
        useOrderStatFilter=_cfgGet(
            configData, "detrendParams.useOrderStatFilter", True
        ),
    )

    # Template-matching parameters (see consenrich.matching).
    matchingArgs = core.matchingParams(
        templateNames=_cfgGet(
            configData, "matchingParams.templateNames", []
        ),
        cascadeLevels=_cfgGet(
            configData, "matchingParams.cascadeLevels", []
        ),
        iters=_cfgGet(configData, "matchingParams.iters", 25_000),
        alpha=_cfgGet(configData, "matchingParams.alpha", 0.05),
        minMatchLengthBP=_cfgGet(
            configData,
            "matchingParams.minMatchLengthBP",
            250,
        ),
        maxNumMatches=_cfgGet(
            configData,
            "matchingParams.maxNumMatches",
            100_000,
        ),
        minSignalAtMaxima=_cfgGet(
            configData,
            "matchingParams.minSignalAtMaxima",
            "q:0.75",
        ),
        merge=_cfgGet(configData, "matchingParams.merge", True),
        mergeGapBP=_cfgGet(
            configData, "matchingParams.mergeGapBP", None
        ),
        useScalingFunction=_cfgGet(
            configData,
            "matchingParams.useScalingFunction",
            True,
        ),
        excludeRegionsBedFile=_cfgGet(
            configData,
            "matchingParams.excludeRegionsBedFile",
            matchingExcludeRegionsFileDefault,
        ),
        randSeed=_cfgGet(configData, "matchingParams.randSeed", 42),
        penalizeBy=_cfgGet(
            configData, "matchingParams.penalizeBy", None
        ),
        eps=_cfgGet(configData, "matchingParams.eps", 1.0e-2),
    )

    return {
        "experimentName": experimentName,
        "genomeArgs": genomeParams,
        "inputArgs": inputParams,
        "outputArgs": outputParams,
        "countingArgs": countingParams,
        "processArgs": processArgs,
        "observationArgs": observationArgs,
        "stateArgs": stateArgs,
        "samArgs": samArgs,
        "detrendArgs": detrendArgs,
        "matchingArgs": matchingArgs,
    }
724
+
725
+
726
def convertBedGraphToBigWig(
    experimentName,
    chromSizesFile,
    suffixes: Optional[List[str]] = None,
):
    """Convert Consenrich bedGraph outputs to bigWig via UCSC's
    ``bedGraphToBigWig`` utility, if the binary is available on PATH.

    :param experimentName: Experiment label used in the output filenames.
    :param chromSizesFile: Chromosome sizes file passed to the converter.
    :param suffixes: Output track suffixes to convert; defaults to ["state"].
    """
    if suffixes is None:
        # at least look for `state` bedGraph
        suffixes = ["state"]
    path_ = ""
    warningMessage = (
        "Could not find UCSC bedGraphToBigWig binary utility."
        "If you need bigWig files instead of the default, human-readable bedGraph files,"
        "you can download the `bedGraphToBigWig` binary from https://hgdownload.soe.ucsc.edu/admin/exe/<operatingSystem, architecture>"
        "OR install via conda (conda install -c bioconda ucsc-bedgraphtobigwig)."
    )

    logger.info(
        "Attempting to generate bigWig files from bedGraph format..."
    )
    try:
        path_ = shutil.which("bedGraphToBigWig")
    except Exception as e:
        # Best-effort: a missing converter only downgrades the output format.
        logger.warning(f"\n{warningMessage}\n")
        return
    if path_ is None or len(path_) == 0:
        logger.warning(f"\n{warningMessage}\n")
        return
    logger.info(f"Using bedGraphToBigWig from {path_}")
    for suffix in suffixes:
        bedgraph = (
            f"consenrichOutput_{experimentName}_{suffix}.bedGraph"
        )
        # A missing bedGraph skips only this suffix...
        if not os.path.exists(bedgraph):
            logger.warning(
                f"bedGraph file {bedgraph} does not exist. Skipping bigWig conversion."
            )
            continue
        # ...but a missing chrom sizes file aborts all remaining conversions.
        if not os.path.exists(chromSizesFile):
            logger.warning(
                f"{chromSizesFile} does not exist. Skipping bigWig conversion."
            )
            return
        bigwig = f"{experimentName}_consenrich_{suffix}.bw"
        logger.info(f"Start: {bedgraph} --> {bigwig}...")
        try:
            subprocess.run(
                [path_, bedgraph, chromSizesFile, bigwig], check=True
            )
        except Exception as e:
            logger.warning(
                f"bedGraph-->bigWig conversion with\n\n\t`bedGraphToBigWig {bedgraph} {chromSizesFile} {bigwig}`\nraised: \n{e}\n\n"
            )
            continue
        # Heuristic success check: a valid bigWig is larger than its header.
        if os.path.exists(bigwig) and os.path.getsize(bigwig) > 100:
            logger.info(
                f"Finished: converted {bedgraph} to {bigwig}."
            )
783
+
784
+
785
+ def main():
786
+ parser = argparse.ArgumentParser(description="Consenrich CLI")
787
+ parser.add_argument(
788
+ "--config",
789
+ type=str,
790
+ dest="config",
791
+ help="Path to a YAML config file with parameters + arguments defined in `consenrich.core`",
792
+ )
793
+
794
+ # --- Matching-specific command-line arguments ---
795
+ parser.add_argument(
796
+ "--match-bedGraph",
797
+ type=str,
798
+ dest="matchBedGraph",
799
+ help="Path to a bedGraph file of Consenrich estimates to match templates against.\
800
+ If provided, *only* the matching algorithm is run (no other processing). Note that \
801
+ some features in `consenrich.matching` may not be supported through this CLI interface.",
802
+ )
803
+ parser.add_argument(
804
+ "--match-template",
805
+ type=str,
806
+ default="haar",
807
+ choices=[
808
+ x
809
+ for x in pywt.wavelist(kind="discrete")
810
+ if "bio" not in x
811
+ ],
812
+ dest="matchTemplate",
813
+ )
814
+ parser.add_argument(
815
+ "--match-level", type=int, default=2, dest="matchLevel"
816
+ )
817
+ parser.add_argument(
818
+ "--match-alpha", type=float, default=0.05, dest="matchAlpha"
819
+ )
820
+ parser.add_argument(
821
+ "--match-min-length",
822
+ type=int,
823
+ default=250,
824
+ dest="matchMinMatchLengthBP",
825
+ )
826
+ parser.add_argument(
827
+ "--match-iters", type=int, default=25000, dest="matchIters"
828
+ )
829
+ parser.add_argument(
830
+ "--match-min-signal",
831
+ type=str,
832
+ default="q:0.75",
833
+ dest="matchMinSignalAtMaxima",
834
+ )
835
+ parser.add_argument(
836
+ "--match-max-matches",
837
+ type=int,
838
+ default=100000,
839
+ dest="matchMaxNumMatches",
840
+ )
841
+ parser.add_argument(
842
+ "--match-no-merge", action="store_true", dest="matchNoMerge"
843
+ )
844
+ parser.add_argument(
845
+ "--match-merge-gap",
846
+ type=int,
847
+ default=None,
848
+ dest="matchMergeGapBP",
849
+ )
850
+ parser.add_argument(
851
+ "--match-use-wavelet",
852
+ action="store_true",
853
+ dest="matchUseWavelet",
854
+ )
855
+ parser.add_argument(
856
+ "--match-seed", type=int, default=42, dest="matchRandSeed"
857
+ )
858
+ parser.add_argument(
859
+ "--match-exclude-bed",
860
+ type=str,
861
+ default=None,
862
+ dest="matchExcludeBed",
863
+ )
864
+ parser.add_argument(
865
+ "--verbose", action="store_true", help="If set, logs config"
866
+ )
867
+ args = parser.parse_args()
868
+
869
+ if args.matchBedGraph:
870
+ if not os.path.exists(args.matchBedGraph):
871
+ raise FileNotFoundError(
872
+ f"bedGraph file {args.matchBedGraph} couldn't be found."
873
+ )
874
+ logger.info(
875
+ f"Running matching algorithm using bedGraph file {args.matchBedGraph}..."
876
+ )
877
+
878
+ outName = matching.matchExistingBedGraph(
879
+ args.matchBedGraph,
880
+ args.matchTemplate,
881
+ args.matchLevel,
882
+ alpha=args.matchAlpha,
883
+ minMatchLengthBP=args.matchMinMatchLengthBP,
884
+ iters=args.matchIters,
885
+ minSignalAtMaxima=args.matchMinSignalAtMaxima,
886
+ maxNumMatches=args.matchMaxNumMatches,
887
+ useScalingFunction=(not args.matchUseWavelet),
888
+ merge=(not args.matchNoMerge),
889
+ mergeGapBP=args.matchMergeGapBP,
890
+ excludeRegionsBedFile=args.matchExcludeBed,
891
+ randSeed=args.matchRandSeed,
892
+ )
893
+ logger.info(f"Finished matching. Written to {outName}")
894
+ sys.exit(0)
895
+
896
+ if args.matchBedGraph:
897
+ # this shouldn't happen, but just in case -- matching on previous bedGraph means no other processing
898
+ logger.info(
899
+ "If `--match-bedgraph <path_to_bedgraph>` is provided, only the matching algorithm is run."
900
+ )
901
+ sys.exit(0)
902
+
903
+ if not args.config:
904
+ logger.info(
905
+ "No config file provided, run with `--config <path_to_config.yaml>`"
906
+ )
907
+ logger.info(
908
+ "See documentation: https://nolan-h-hamilton.github.io/Consenrich/"
909
+ )
910
+ sys.exit(1)
911
+
912
+ if not os.path.exists(args.config):
913
+ logger.info(f"Config file {args.config} does not exist.")
914
+ logger.info(
915
+ "See documentation: https://nolan-h-hamilton.github.io/Consenrich/"
916
+ )
917
+ sys.exit(1)
918
+
919
+ config = readConfig(args.config)
920
+ experimentName = config["experimentName"]
921
+ genomeArgs = config["genomeArgs"]
922
+ inputArgs = config["inputArgs"]
923
+ outputArgs = config["outputArgs"]
924
+ countingArgs = config["countingArgs"]
925
+ processArgs = config["processArgs"]
926
+ observationArgs = config["observationArgs"]
927
+ stateArgs = config["stateArgs"]
928
+ samArgs = config["samArgs"]
929
+ detrendArgs = config["detrendArgs"]
930
+ matchingArgs = config["matchingArgs"]
931
+ bamFiles = inputArgs.bamFiles
932
+ bamFilesControl = inputArgs.bamFilesControl
933
+ numSamples = len(bamFiles)
934
+ numNearest = observationArgs.numNearest
935
+ stepSize = countingArgs.stepSize
936
+ excludeForNorm = genomeArgs.excludeForNorm
937
+ chromSizes = genomeArgs.chromSizesFile
938
+ scaleDown = countingArgs.scaleDown
939
+ extendBP_ = core.resolveExtendBP(samArgs.extendBP, bamFiles)
940
+ initialTreatmentScaleFactors = []
941
+ minMatchLengthBP_: Optional[int] = matchingArgs.minMatchLengthBP
942
+ mergeGapBP_: Optional[int] = matchingArgs.mergeGapBP
943
+
944
+ if args.verbose:
945
+ try:
946
+ logger.info("Configuration:\n")
947
+ config_truncated = {
948
+ k: v
949
+ for k, v in config.items()
950
+ if k
951
+ not in ["inputArgs", "genomeArgs", "countingArgs"]
952
+ }
953
+ config_truncated["experimentName"] = experimentName
954
+ config_truncated["inputArgs"] = inputArgs
955
+ config_truncated["outputArgs"] = outputArgs
956
+ config_truncated["genomeArgs"] = genomeArgs
957
+ config_truncated["countingArgs"] = countingArgs
958
+ config_truncated["processArgs"] = processArgs
959
+ config_truncated["observationArgs"] = observationArgs
960
+ config_truncated["stateArgs"] = stateArgs
961
+ config_truncated["samArgs"] = samArgs
962
+ config_truncated["detrendArgs"] = detrendArgs
963
+ pprint.pprint(config_truncated, indent=8)
964
+ except Exception as e:
965
+ logger.warning(f"Failed to print parsed config:\n{e}\n")
966
+
967
+ controlsPresent = checkControlsPresent(inputArgs)
968
+ if args.verbose:
969
+ logger.info(f"controlsPresent: {controlsPresent}")
970
+ readLengthsBamFiles = getReadLengths(
971
+ inputArgs, countingArgs, samArgs
972
+ )
973
+ effectiveGenomeSizes = getEffectiveGenomeSizes(
974
+ genomeArgs, readLengthsBamFiles
975
+ )
976
+
977
+ matchingEnabled = checkMatchingEnabled(matchingArgs)
978
+ if args.verbose:
979
+ logger.info(f"matchingEnabled: {matchingEnabled}")
980
+ scaleFactors = countingArgs.scaleFactors
981
+ scaleFactorsControl = countingArgs.scaleFactorsControl
982
+
983
+ if controlsPresent:
984
+ readLengthsControlBamFiles = [
985
+ core.getReadLength(
986
+ bamFile,
987
+ countingArgs.numReads,
988
+ 1000,
989
+ samArgs.samThreads,
990
+ samArgs.samFlagExclude,
991
+ )
992
+ for bamFile in bamFilesControl
993
+ ]
994
+ effectiveGenomeSizesControl = [
995
+ constants.getEffectiveGenomeSize(
996
+ genomeArgs.genomeName, readLength
997
+ )
998
+ for readLength in readLengthsControlBamFiles
999
+ ]
1000
+
1001
+ if (
1002
+ scaleFactors is not None
1003
+ and scaleFactorsControl is not None
1004
+ ):
1005
+ treatScaleFactors = scaleFactors
1006
+ controlScaleFactors = scaleFactorsControl
1007
+ # still make sure this is accessible
1008
+ initialTreatmentScaleFactors = [1.0] * len(bamFiles)
1009
+ else:
1010
+ try:
1011
+ initialTreatmentScaleFactors = [
1012
+ detrorm.getScaleFactor1x(
1013
+ bamFile,
1014
+ effectiveGenomeSize,
1015
+ readLength,
1016
+ excludeForNorm,
1017
+ genomeArgs.chromSizesFile,
1018
+ samArgs.samThreads,
1019
+ )
1020
+ for bamFile, effectiveGenomeSize, readLength in zip(
1021
+ bamFiles,
1022
+ effectiveGenomeSizes,
1023
+ readLengthsBamFiles,
1024
+ )
1025
+ ]
1026
+ except Exception:
1027
+ initialTreatmentScaleFactors = [1.0] * len(bamFiles)
1028
+
1029
+ pairScalingFactors = [
1030
+ detrorm.getPairScaleFactors(
1031
+ bamFileA,
1032
+ bamFileB,
1033
+ effectiveGenomeSizeA,
1034
+ effectiveGenomeSizeB,
1035
+ readLengthA,
1036
+ readLengthB,
1037
+ excludeForNorm,
1038
+ chromSizes,
1039
+ samArgs.samThreads,
1040
+ stepSize,
1041
+ scaleDown,
1042
+ normMethod=countingArgs.normMethod,
1043
+ )
1044
+ for bamFileA, bamFileB, effectiveGenomeSizeA, effectiveGenomeSizeB, readLengthA, readLengthB in zip(
1045
+ bamFiles,
1046
+ bamFilesControl,
1047
+ effectiveGenomeSizes,
1048
+ effectiveGenomeSizesControl,
1049
+ readLengthsBamFiles,
1050
+ readLengthsControlBamFiles,
1051
+ )
1052
+ ]
1053
+
1054
+ treatScaleFactors = []
1055
+ controlScaleFactors = []
1056
+ for scaleFactorA, scaleFactorB in pairScalingFactors:
1057
+ treatScaleFactors.append(scaleFactorA)
1058
+ controlScaleFactors.append(scaleFactorB)
1059
+
1060
+ else:
1061
+ treatScaleFactors = scaleFactors
1062
+ controlScaleFactors = scaleFactorsControl
1063
+
1064
+ if scaleFactors is None and not controlsPresent:
1065
+ if countingArgs.normMethod.upper() == "RPKM":
1066
+ scaleFactors = [
1067
+ detrorm.getScaleFactorPerMillion(
1068
+ bamFile, excludeForNorm, stepSize,
1069
+ )
1070
+ for bamFile in bamFiles
1071
+ ]
1072
+ else:
1073
+ scaleFactors = [
1074
+ detrorm.getScaleFactor1x(
1075
+ bamFile,
1076
+ effectiveGenomeSize,
1077
+ readLength,
1078
+ excludeForNorm,
1079
+ genomeArgs.chromSizesFile,
1080
+ samArgs.samThreads,
1081
+ )
1082
+ for bamFile, effectiveGenomeSize, readLength in zip(
1083
+ bamFiles,
1084
+ effectiveGenomeSizes,
1085
+ readLengthsBamFiles,
1086
+ )
1087
+ ]
1088
+ chromSizesDict = misc_util.getChromSizesDict(
1089
+ genomeArgs.chromSizesFile,
1090
+ excludeChroms=genomeArgs.excludeChroms,
1091
+ )
1092
+ chromosomes = genomeArgs.chromosomes
1093
+
1094
+ for c_, chromosome in enumerate(chromosomes):
1095
+ chromosomeStart, chromosomeEnd = core.getChromRangesJoint(
1096
+ bamFiles,
1097
+ chromosome,
1098
+ chromSizesDict[chromosome],
1099
+ samArgs.samThreads,
1100
+ samArgs.samFlagExclude,
1101
+ )
1102
+ chromosomeStart = max(
1103
+ 0, (chromosomeStart - (chromosomeStart % stepSize))
1104
+ )
1105
+ chromosomeEnd = max(
1106
+ 0, (chromosomeEnd - (chromosomeEnd % stepSize))
1107
+ )
1108
+ numIntervals = (
1109
+ ((chromosomeEnd - chromosomeStart) + stepSize) - 1
1110
+ ) // stepSize
1111
+ intervals = np.arange(
1112
+ chromosomeStart, chromosomeEnd, stepSize
1113
+ )
1114
+ chromMat: np.ndarray = np.empty(
1115
+ (numSamples, numIntervals), dtype=np.float32
1116
+ )
1117
+ if controlsPresent:
1118
+ j_: int = 0
1119
+ for bamA, bamB in zip(bamFiles, bamFilesControl):
1120
+ logger.info(
1121
+ f"Counting (trt,ctrl) for {chromosome}: ({bamA}, {bamB})"
1122
+ )
1123
+ pairMatrix: np.ndarray = core.readBamSegments(
1124
+ [bamA, bamB],
1125
+ chromosome,
1126
+ chromosomeStart,
1127
+ chromosomeEnd,
1128
+ stepSize,
1129
+ [
1130
+ readLengthsBamFiles[j_],
1131
+ readLengthsControlBamFiles[j_],
1132
+ ],
1133
+ [treatScaleFactors[j_], controlScaleFactors[j_]],
1134
+ samArgs.oneReadPerBin,
1135
+ samArgs.samThreads,
1136
+ samArgs.samFlagExclude,
1137
+ offsetStr=samArgs.offsetStr,
1138
+ extendBP=extendBP_[j_],
1139
+ maxInsertSize=samArgs.maxInsertSize,
1140
+ pairedEndMode=samArgs.pairedEndMode,
1141
+ inferFragmentLength=samArgs.inferFragmentLength,
1142
+ applyAsinh=countingArgs.applyAsinh,
1143
+ applyLog=countingArgs.applyLog,
1144
+ countEndsOnly=samArgs.countEndsOnly,
1145
+ )
1146
+
1147
+ chromMat[j_, :] = pairMatrix[0, :] - pairMatrix[1, :]
1148
+ j_ += 1
1149
+ else:
1150
+ chromMat = core.readBamSegments(
1151
+ bamFiles,
1152
+ chromosome,
1153
+ chromosomeStart,
1154
+ chromosomeEnd,
1155
+ stepSize,
1156
+ readLengthsBamFiles,
1157
+ scaleFactors,
1158
+ samArgs.oneReadPerBin,
1159
+ samArgs.samThreads,
1160
+ samArgs.samFlagExclude,
1161
+ offsetStr=samArgs.offsetStr,
1162
+ extendBP=extendBP_,
1163
+ maxInsertSize=samArgs.maxInsertSize,
1164
+ pairedEndMode=samArgs.pairedEndMode,
1165
+ inferFragmentLength=samArgs.inferFragmentLength,
1166
+ applyAsinh=countingArgs.applyAsinh,
1167
+ applyLog=countingArgs.applyLog,
1168
+ countEndsOnly=samArgs.countEndsOnly,
1169
+ )
1170
+ sparseMap = None
1171
+ if genomeArgs.sparseBedFile and not observationArgs.useALV:
1172
+ logger.info(
1173
+ f"Building sparse mapping for {chromosome}..."
1174
+ )
1175
+ sparseMap = core.getSparseMap(
1176
+ chromosome,
1177
+ intervals,
1178
+ numNearest,
1179
+ genomeArgs.sparseBedFile,
1180
+ )
1181
+
1182
+ muncMat = np.empty_like(chromMat, dtype=np.float32)
1183
+ for j in range(numSamples):
1184
+ logger.info(
1185
+ f"Muncing {j + 1}/{numSamples} for {chromosome}..."
1186
+ )
1187
+ muncMat[j, :] = core.getMuncTrack(
1188
+ chromosome,
1189
+ intervals,
1190
+ stepSize,
1191
+ chromMat[j, :],
1192
+ observationArgs.minR,
1193
+ observationArgs.maxR,
1194
+ observationArgs.useALV,
1195
+ observationArgs.useConstantNoiseLevel,
1196
+ observationArgs.noGlobal,
1197
+ observationArgs.localWeight,
1198
+ observationArgs.globalWeight,
1199
+ observationArgs.approximationWindowLengthBP,
1200
+ observationArgs.lowPassWindowLengthBP,
1201
+ observationArgs.returnCenter,
1202
+ sparseMap=sparseMap,
1203
+ lowPassFilterType=observationArgs.lowPassFilterType,
1204
+ )
1205
+ chromMat[j, :] = detrorm.detrendTrack(
1206
+ chromMat[j, :],
1207
+ stepSize,
1208
+ detrendArgs.detrendWindowLengthBP,
1209
+ detrendArgs.useOrderStatFilter,
1210
+ detrendArgs.usePolyFilter,
1211
+ detrendArgs.detrendTrackPercentile,
1212
+ detrendArgs.detrendSavitzkyGolayDegree,
1213
+ )
1214
+ logger.info(f">>>Running consenrich: {chromosome}<<<")
1215
+
1216
+ x, P, y = core.runConsenrich(
1217
+ chromMat,
1218
+ muncMat,
1219
+ processArgs.deltaF,
1220
+ processArgs.minQ,
1221
+ processArgs.maxQ,
1222
+ processArgs.offDiagQ,
1223
+ processArgs.dStatAlpha,
1224
+ processArgs.dStatd,
1225
+ processArgs.dStatPC,
1226
+ stateArgs.stateInit,
1227
+ stateArgs.stateCovarInit,
1228
+ stateArgs.boundState,
1229
+ stateArgs.stateLowerBound,
1230
+ stateArgs.stateUpperBound,
1231
+ samArgs.chunkSize,
1232
+ progressIter=50_000,
1233
+ )
1234
+ logger.info("Done.")
1235
+
1236
+ x_ = core.getPrimaryState(x)
1237
+ y_ = core.getPrecisionWeightedResidual(
1238
+ y,
1239
+ muncMat,
1240
+ stateCovarSmoothed=P
1241
+ if processArgs.scaleResidualsByP11 is not None
1242
+ and processArgs.scaleResidualsByP11
1243
+ else None,
1244
+ )
1245
+
1246
+ weights_: Optional[np.ndarray] = None
1247
+ if matchingArgs.penalizeBy is not None:
1248
+ if matchingArgs.penalizeBy.lower() in [
1249
+ "stateuncertainty",
1250
+ "statestddev",
1251
+ "statestd",
1252
+ "p11",
1253
+ ]:
1254
+ try:
1255
+ weights_ = np.sqrt(P[:, 0, 0] + 1.0)
1256
+ except Exception as e:
1257
+ logger.warning(
1258
+ f"Error computing weights for 'stateUncertainty': {e}. No weights applied for matching."
1259
+ )
1260
+ weights_ = None
1261
+ elif matchingArgs.penalizeBy == "muncTrace":
1262
+ try:
1263
+ weights_ = np.sqrt(
1264
+ np.mean(muncMat.astype(np.float64), axis=0)
1265
+ + 1.0,
1266
+ )
1267
+ except Exception as e:
1268
+ logger.warning(
1269
+ f"Error computing weights for 'muncTrace': {e}. No weights applied for matching."
1270
+ )
1271
+ weights_ = None
1272
+ else:
1273
+ logger.warning(
1274
+ f"Unrecognized `matchingParams.penalizeBy`: {matchingArgs.penalizeBy}. No weights applied."
1275
+ )
1276
+ weights_ = None
1277
+
1278
+ df = pd.DataFrame(
1279
+ {
1280
+ "Chromosome": chromosome,
1281
+ "Start": intervals,
1282
+ "End": intervals + stepSize,
1283
+ "State": x_,
1284
+ }
1285
+ )
1286
+
1287
+ if outputArgs.writeResiduals:
1288
+ df["Res"] = y_.astype(np.float32) # FFR: cast necessary?
1289
+ if outputArgs.writeRawResiduals:
1290
+ df["RawRes"] = np.mean(y, axis=1).astype(np.float32)
1291
+ if outputArgs.writeMuncTrace:
1292
+ munc_std = np.sqrt(
1293
+ np.mean(muncMat.astype(np.float64), axis=0)
1294
+ ).astype(np.float32)
1295
+ df["Munc"] = munc_std
1296
+ if outputArgs.writeStateStd:
1297
+ df["StateStd"] = np.sqrt(P[:, 0, 0]).astype(np.float32)
1298
+ cols_ = ["Chromosome", "Start", "End", "State"]
1299
+ if outputArgs.writeResiduals:
1300
+ cols_.append("Res")
1301
+ if outputArgs.writeMuncTrace:
1302
+ cols_.append("Munc")
1303
+ if outputArgs.writeStateStd:
1304
+ cols_.append("StateStd")
1305
+ if outputArgs.writeRawResiduals:
1306
+ cols_.append("RawRes")
1307
+ df = df[cols_]
1308
+ suffixes = ["state"]
1309
+ if outputArgs.writeResiduals:
1310
+ suffixes.append("residuals")
1311
+ if outputArgs.writeMuncTrace:
1312
+ suffixes.append("muncTraces")
1313
+ if outputArgs.writeStateStd:
1314
+ suffixes.append("stdDevs")
1315
+ if outputArgs.writeRawResiduals:
1316
+ suffixes.append("rawResiduals")
1317
+
1318
+ if (c_ == 0 and len(chromosomes) > 1) or (
1319
+ len(chromosomes) == 1
1320
+ ):
1321
+ for file_ in os.listdir("."):
1322
+ if file_.startswith(
1323
+ f"consenrichOutput_{experimentName}"
1324
+ ) and (
1325
+ file_.endswith(".bedGraph")
1326
+ or file_.endswith(".narrowPeak")
1327
+ ):
1328
+ logger.warning(f"Overwriting: {file_}")
1329
+ os.remove(file_)
1330
+
1331
+ for col, suffix in zip(cols_[3:], suffixes):
1332
+ logger.info(
1333
+ f"{chromosome}: writing/appending to: consenrichOutput_{experimentName}_{suffix}.bedGraph"
1334
+ )
1335
+ df[["Chromosome", "Start", "End", col]].to_csv(
1336
+ f"consenrichOutput_{experimentName}_{suffix}.bedGraph",
1337
+ sep="\t",
1338
+ header=False,
1339
+ index=False,
1340
+ mode="a",
1341
+ float_format="%.3f",
1342
+ lineterminator="\n",
1343
+ )
1344
+ try:
1345
+ if matchingEnabled:
1346
+ if (
1347
+ minMatchLengthBP_ is None
1348
+ or minMatchLengthBP_ <= 0
1349
+ ):
1350
+ minMatchLengthBP_ = (
1351
+ matching.autoMinLengthIntervals(x_)
1352
+ * (intervals[1] - intervals[0])
1353
+ )
1354
+
1355
+ if mergeGapBP_ is None:
1356
+ mergeGapBP_ = int(minMatchLengthBP_ / 2) + 1
1357
+
1358
+ matchingDF = matching.matchWavelet(
1359
+ chromosome,
1360
+ intervals,
1361
+ x_,
1362
+ matchingArgs.templateNames,
1363
+ matchingArgs.cascadeLevels,
1364
+ matchingArgs.iters,
1365
+ matchingArgs.alpha,
1366
+ minMatchLengthBP_,
1367
+ matchingArgs.maxNumMatches,
1368
+ matchingArgs.minSignalAtMaxima,
1369
+ useScalingFunction=matchingArgs.useScalingFunction,
1370
+ excludeRegionsBedFile=matchingArgs.excludeRegionsBedFile,
1371
+ randSeed=matchingArgs.randSeed,
1372
+ weights=1.0 / weights_
1373
+ if weights_ is not None
1374
+ else None,
1375
+ eps=matchingArgs.eps,
1376
+ isLogScale=countingArgs.applyLog
1377
+ or countingArgs.applyAsinh,
1378
+ )
1379
+ if not matchingDF.empty:
1380
+ matchingDF.to_csv(
1381
+ f"consenrichOutput_{experimentName}_matches.narrowPeak",
1382
+ sep="\t",
1383
+ header=False,
1384
+ index=False,
1385
+ mode="a",
1386
+ float_format=f"%.{outputArgs.roundDigits}f",
1387
+ lineterminator="\n",
1388
+ )
1389
+ except Exception as e:
1390
+ logger.warning(
1391
+ f"Matching routine unsuccessful for {chromosome}...SKIPPING:\n{e}\n\n"
1392
+ )
1393
+ continue
1394
+ logger.info("Finished: output in human-readable format")
1395
+
1396
+ if outputArgs.convertToBigWig:
1397
+ convertBedGraphToBigWig(
1398
+ experimentName,
1399
+ genomeArgs.chromSizesFile,
1400
+ suffixes=suffixes,
1401
+ )
1402
+
1403
+ if matchingEnabled and matchingArgs.merge:
1404
+ try:
1405
+ mergeGapBP_ = matchingArgs.mergeGapBP
1406
+ if mergeGapBP_ is None or mergeGapBP_ <= 0:
1407
+ mergeGapBP_ = (
1408
+ int(minMatchLengthBP_ / 2) + 1
1409
+ if minMatchLengthBP_ is not None
1410
+ and minMatchLengthBP_ >= 0
1411
+ else 75
1412
+ )
1413
+ matching.mergeMatches(
1414
+ f"consenrichOutput_{experimentName}_matches.narrowPeak",
1415
+ mergeGapBP=mergeGapBP_,
1416
+ )
1417
+
1418
+ except Exception as e:
1419
+ logger.warning(
1420
+ f"Failed to merge matches...SKIPPING:\n{e}\n\n"
1421
+ )
1422
+ logger.info("Done.")
1423
+
1424
+
1425
+ if __name__ == "__main__":
1426
+ main()  # script entry point: invoke main() only when executed directly, not on import