consenrich 0.7.4b2__cp312-cp312-macosx_10_13_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1381 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import glob
6
+ import logging
7
+ import pprint
8
+ import os
9
+ from pathlib import Path
10
+ from collections.abc import Mapping
11
+ from typing import List, Optional, Tuple, Dict, Any, Union
12
+ import shutil
13
+ import subprocess
14
+ import sys
15
+ import numpy as np
16
+ import pandas as pd
17
+ import pysam
18
+ import pywt
19
+ import yaml
20
+
21
+ import consenrich.core as core
22
+ import consenrich.misc_util as misc_util
23
+ import consenrich.constants as constants
24
+ import consenrich.detrorm as detrorm
25
+ import consenrich.matching as matching
26
+
27
+
28
# Configure root logging once at import time so every consenrich module
# emits records in the same format: timestamp, module.function, level, message.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(module)s.%(funcName)s - %(levelname)s - %(message)s",
)

# Module-level logger used throughout this CLI module.
logger = logging.getLogger(__name__)
34
+
35
+
36
+ def _loadConfig(
37
+ configSource: Union[str, Path, Mapping[str, Any]],
38
+ ) -> Dict[str, Any]:
39
+ r"""Load a YAML config from a path or accept an already-parsed mapping.
40
+
41
+ If given a dict-like object, just return it.If given a path, try to load as YAML --> dict
42
+ If given a path, try to load as YAML --> dict
43
+
44
+ """
45
+ if isinstance(configSource, Mapping):
46
+ configData = configSource
47
+ elif isinstance(configSource, (str, Path)):
48
+ with open(configSource, "r") as fileHandle:
49
+ configData = yaml.safe_load(fileHandle) or {}
50
+ else:
51
+ raise TypeError("`config` must be a path or a mapping/dict.")
52
+
53
+ if not isinstance(configData, Mapping):
54
+ raise TypeError("Top-level YAML must be a mapping/object.")
55
+ return configData
56
+
57
+
58
+ def _cfgGet(
59
+ configMap: Mapping[str, Any],
60
+ dottedKey: str,
61
+ defaultVal: Any = None,
62
+ ) -> Any:
63
+ r"""Support both dotted keys and yaml/dict-style nested access for configs."""
64
+
65
+ # e.g., inputParams.bamFiles
66
+ if dottedKey in configMap:
67
+ return configMap[dottedKey]
68
+
69
+ # e.g.,
70
+ # inputParams:
71
+ # bamFiles: [...]
72
+ currentVal: Any = configMap
73
+ for keyPart in dottedKey.split("."):
74
+ if isinstance(currentVal, Mapping) and keyPart in currentVal:
75
+ currentVal = currentVal[keyPart]
76
+ else:
77
+ return defaultVal
78
+ return currentVal
79
+
80
+
81
+ def _listOrEmpty(list_):
82
+ if list_ is None:
83
+ return []
84
+ return list_
85
+
86
+
87
def _getMinR(configMap, numBams: int) -> float:
    """Resolve `observationParams.minR` from the config, defaulting to 1.0.

    NOTE(review): `numBams` is not referenced in this body; it appears to be
    kept for interface stability -- confirm before removing.
    """
    fallbackMinR: float = 1.0
    try:
        configuredVal = _cfgGet(configMap, "observationParams.minR", None)
        if configuredVal is None:
            return fallbackMinR
        return float(configuredVal)
    except (TypeError, ValueError, KeyError):
        logger.warning(
            f"Invalid or missing 'observationParams.minR' in config. Using `{fallbackMinR}`."
        )
        return fallbackMinR
97
+
98
+
99
def checkControlsPresent(inputArgs: core.inputParams) -> bool:
    """Check if control BAM files are present in the input arguments.

    :param inputArgs: core.inputParams object
    :return: True if control BAM files are present, False otherwise.
    """
    controls = inputArgs.bamFilesControl
    # Only a non-empty list counts as "controls present"; None, [],
    # and non-list values all yield False.
    return isinstance(controls, list) and len(controls) > 0
110
+
111
+
112
def getReadLengths(
    inputArgs: core.inputParams,
    countingArgs: core.countingParams,
    samArgs: core.samParams,
) -> List[int]:
    r"""Get read lengths for each BAM file in the input arguments.

    :param inputArgs: core.inputParams object containing BAM file paths.
    :param countingArgs: core.countingParams object containing number of reads.
    :param samArgs: core.samParams object containing SAM thread and flag exclude parameters.
    :return: List of read lengths for each BAM file.
    """
    if not inputArgs.bamFiles:
        raise ValueError(
            "No BAM files provided in the input arguments."
        )

    if (
        not isinstance(inputArgs.bamFiles, list)
        or len(inputArgs.bamFiles) == 0
    ):
        raise ValueError("bam files list is empty")

    readLengths: List[int] = []
    for bamPath in inputArgs.bamFiles:
        # 1000 is the sampling window passed through to core.getReadLength.
        readLengths.append(
            core.getReadLength(
                bamPath,
                countingArgs.numReads,
                1000,
                samArgs.samThreads,
                samArgs.samFlagExclude,
            )
        )
    return readLengths
145
+
146
+
147
def checkMatchingEnabled(matchingArgs: core.matchingParams) -> bool:
    """Return True when template matching is fully configured.

    Matching requires both `templateNames` and `cascadeLevels` to be
    non-empty lists.
    """

    def _isNonEmptyList(value) -> bool:
        # `is not None` is implied by the isinstance check.
        return isinstance(value, list) and len(value) > 0

    return _isNonEmptyList(matchingArgs.templateNames) and _isNonEmptyList(
        matchingArgs.cascadeLevels
    )
160
+
161
+
162
def getEffectiveGenomeSizes(
    genomeArgs: core.genomeParams, readLengths: List[int]
) -> List[int]:
    r"""Get effective genome sizes for the given genome name and read lengths.
    :param genomeArgs: core.genomeParams object
    :param readLengths: List of read lengths for which to get effective genome sizes.
    :return: List of effective genome sizes corresponding to the read lengths.
    """
    genomeName = genomeArgs.genomeName
    if not genomeName or not isinstance(genomeName, str):
        raise ValueError("Genome name must be a non-empty string.")

    if not isinstance(readLengths, list) or len(readLengths) == 0:
        raise ValueError(
            "Read lengths must be a non-empty list. Try calling `getReadLengths` first."
        )

    effectiveSizes: List[int] = []
    for readLen in readLengths:
        effectiveSizes.append(
            constants.getEffectiveGenomeSize(genomeName, readLen)
        )
    return effectiveSizes
182
+
183
+
184
def getInputArgs(config_path: str) -> core.inputParams:
    """Build core.inputParams from the YAML config.

    Reads `inputParams.bamFiles` / `inputParams.bamFilesControl`, expands
    glob wildcards, validates treatment/control counts (0, 1, or equal),
    broadcasts a single control to all treatments, verifies each BAM file,
    and resolves the paired-end flag (auto-detected when not configured).

    :param config_path: Path to the YAML configuration file (or a mapping).
    :return: Populated core.inputParams.
    :raises ValueError: if no BAM files resolve or control count is invalid.
    """
    configData = _loadConfig(config_path)

    def expandWildCards(bamList: List[str]) -> List[str]:
        # Expand shell-style patterns (*, ?, [...]) via glob; plain paths
        # pass through unchanged.
        expandedList: List[str] = []
        for bamEntry in bamList:
            if "*" in bamEntry or "?" in bamEntry or "[" in bamEntry:
                matchedList = glob.glob(bamEntry)
                expandedList.extend(matchedList)
            else:
                expandedList.append(bamEntry)
        return expandedList

    # `or []` guards against an explicit null in the YAML.
    bamFilesRaw = (
        _cfgGet(configData, "inputParams.bamFiles", []) or []
    )
    bamFilesControlRaw = (
        _cfgGet(configData, "inputParams.bamFilesControl", []) or []
    )

    bamFiles = expandWildCards(bamFilesRaw)
    bamFilesControl = expandWildCards(bamFilesControlRaw)

    if len(bamFiles) == 0:
        raise ValueError(
            "No BAM files provided in the configuration."
        )

    # Controls must number 0, 1, or exactly one per treatment file.
    if (
        len(bamFilesControl) > 0
        and len(bamFilesControl) != len(bamFiles)
        and len(bamFilesControl) != 1
    ):
        raise ValueError(
            "Number of control BAM files must be 0, 1, or the same as number of treatment files"
        )

    # A single control is broadcast across every treatment file.
    if len(bamFilesControl) == 1:
        logger.info(
            f"Only one control given: Using {bamFilesControl[0]} for all treatment files."
        )
        bamFilesControl = bamFilesControl * len(bamFiles)

    if not bamFiles or not isinstance(bamFiles, list):
        raise ValueError("No BAM files found")

    # Validate every BAM (existence/index etc. -- delegated to misc_util).
    for bamFile in bamFiles:
        misc_util.checkBamFile(bamFile)

    if bamFilesControl:
        for bamFile in bamFilesControl:
            misc_util.checkBamFile(bamFile)

    # Paired-end: explicit config wins; otherwise all treatment BAMs must
    # be paired-end for the experiment to be treated as paired-end.
    pairedEndList = misc_util.bamsArePairedEnd(bamFiles)
    pairedEndConfig: Optional[bool] = _cfgGet(
        configData, "inputParams.pairedEnd", None
    )
    if pairedEndConfig is None:
        pairedEndConfig = all(pairedEndList)
    if pairedEndConfig:
        logger.info("Paired-end BAM files detected")
    else:
        logger.info("One or more single-end BAM files detected")

    return core.inputParams(
        bamFiles=bamFiles,
        bamFilesControl=bamFilesControl,
        pairedEnd=pairedEndConfig,
    )
253
+
254
def getOutputArgs(config_path: str) -> core.outputParams:
    """Build core.outputParams from the YAML config.

    By default, bigWig conversion is enabled only when the UCSC
    `bedGraphToBigWig` utility is found on PATH.
    """
    configData = _loadConfig(config_path)

    # Default to bigWig output only if the converter binary is available.
    bigWigDefault = bool(shutil.which("bedGraphToBigWig"))

    return core.outputParams(
        convertToBigWig=_cfgGet(
            configData, "outputParams.convertToBigWig", bigWigDefault
        ),
        roundDigits=_cfgGet(configData, "outputParams.roundDigits", 3),
        writeResiduals=_cfgGet(
            configData, "outputParams.writeResiduals", True
        ),
        writeMuncTrace=_cfgGet(
            configData, "outputParams.writeMuncTrace", False
        ),
        writeStateStd=_cfgGet(
            configData, "outputParams.writeStateStd", False
        ),
    )
285
+
286
+
287
def getGenomeArgs(config_path: str) -> core.genomeParams:
    """Build core.genomeParams from the YAML config.

    Resolves the genome label, fills sizes/blacklist/sparse resource files
    from bundled defaults, applies per-file config overrides, and derives
    the chromosome list (from config or from the sizes file) minus any
    excluded chromosomes.

    :param config_path: Path to the YAML configuration file (or a mapping).
    :raises FileNotFoundError: if no usable chromosome sizes file exists.
    :raises ValueError: if no valid chromosomes remain after filtering.
    """
    configData = _loadConfig(config_path)

    genomeName = _cfgGet(configData, "genomeParams.name", None)
    genomeLabel = constants.resolveGenomeName(genomeName)

    chromSizesFile: Optional[str] = None
    blacklistFile: Optional[str] = None
    sparseBedFile: Optional[str] = None
    chromosomesList: Optional[List[str]] = None

    # `or []` guards against explicit nulls in the YAML.
    excludeChromsList: List[str] = (
        _cfgGet(configData, "genomeParams.excludeChroms", []) or []
    )
    excludeForNormList: List[str] = (
        _cfgGet(configData, "genomeParams.excludeForNorm", []) or []
    )

    # Known genome label -> pull bundled resource files as defaults.
    if genomeLabel:
        chromSizesFile = constants.getGenomeResourceFile(
            genomeLabel, "sizes"
        )
        blacklistFile = constants.getGenomeResourceFile(
            genomeLabel, "blacklist"
        )
        sparseBedFile = constants.getGenomeResourceFile(
            genomeLabel, "sparse"
        )

    # Explicit config paths override the bundled defaults.
    chromSizesOverride = _cfgGet(
        configData, "genomeParams.chromSizesFile", None
    )
    if chromSizesOverride:
        chromSizesFile = chromSizesOverride

    blacklistOverride = _cfgGet(
        configData, "genomeParams.blacklistFile", None
    )
    if blacklistOverride:
        blacklistFile = blacklistOverride

    sparseOverride = _cfgGet(
        configData, "genomeParams.sparseBedFile", None
    )
    if sparseOverride:
        sparseBedFile = sparseOverride

    if not chromSizesFile or not os.path.exists(chromSizesFile):
        raise FileNotFoundError(
            f"Chromosome sizes file {chromSizesFile} does not exist."
        )

    # Chromosomes: explicit config list wins; otherwise derive from the
    # two-column (chrom, size) sizes file.
    chromosomesConfig = _cfgGet(
        configData, "genomeParams.chromosomes", None
    )
    if chromosomesConfig is not None:
        chromosomesList = chromosomesConfig
    else:
        if chromSizesFile:
            chromosomesFrame = pd.read_csv(
                chromSizesFile,
                sep="\t",
                header=None,
                names=["chrom", "size"],
            )
            chromosomesList = list(chromosomesFrame["chrom"])
        else:
            raise ValueError(
                "No chromosomes provided in the configuration and no chromosome sizes file specified."
            )

    # Normalize whitespace and drop empty entries.
    chromosomesList = [
        chromName.strip()
        for chromName in chromosomesList
        if chromName and chromName.strip()
    ]
    if excludeChromsList:
        chromosomesList = [
            chromName
            for chromName in chromosomesList
            if chromName not in excludeChromsList
        ]
    if not chromosomesList:
        raise ValueError(
            "No valid chromosomes found after excluding specified chromosomes."
        )

    return core.genomeParams(
        genomeName=genomeLabel,
        chromSizesFile=chromSizesFile,
        blacklistFile=blacklistFile,
        sparseBedFile=sparseBedFile,
        chromosomes=chromosomesList,
        excludeChroms=excludeChromsList,
        excludeForNorm=excludeForNormList,
    )
383
+
384
+
385
def getCountingArgs(config_path: str) -> core.countingParams:
    """Build core.countingParams from the YAML config.

    Resolves count-transform flags (`applyAsinh` wins over `applyLog` when
    both are set) and reconciles treatment/control scale-factor list
    lengths (a single control factor is broadcast to all treatments).
    """
    configData = _loadConfig(config_path)

    stepSize = _cfgGet(configData, "countingParams.stepSize", 25)
    scaleDownFlag = _cfgGet(
        configData, "countingParams.scaleDown", True
    )
    scaleFactorList = _cfgGet(
        configData, "countingParams.scaleFactors", None
    )
    numReads = _cfgGet(configData, "countingParams.numReads", 100)
    scaleFactorsControlList = _cfgGet(
        configData, "countingParams.scaleFactorsControl", None
    )
    applyAsinhFlag = _cfgGet(
        configData, "countingParams.applyAsinh", False
    )
    applyLogFlag = _cfgGet(
        configData, "countingParams.applyLog", False
    )

    # asinh takes precedence when both transforms are requested.
    if applyAsinhFlag and applyLogFlag:
        applyAsinhFlag = True
        applyLogFlag = False
        logger.warning(
            "Both `applyAsinh` and `applyLog` are set. Overriding `applyLog` to False."
        )

    rescaleToTreatmentCoverageFlag = _cfgGet(
        configData,
        "countingParams.rescaleToTreatmentCoverage",
        True,
    )

    if scaleFactorList is not None and not isinstance(
        scaleFactorList, list
    ):
        raise ValueError("`scaleFactors` should be a list of floats.")

    if scaleFactorsControlList is not None and not isinstance(
        scaleFactorsControlList, list
    ):
        raise ValueError(
            "`scaleFactorsControl` should be a list of floats."
        )

    if (
        scaleFactorList is not None
        and scaleFactorsControlList is not None
        and len(scaleFactorList) != len(scaleFactorsControlList)
    ):
        if len(scaleFactorsControlList) != 1:
            raise ValueError(
                "control and treatment scale factors: must be equal length or 1 control"
            )
        # Broadcast the single control factor across all treatments.
        scaleFactorsControlList = scaleFactorsControlList * len(
            scaleFactorList
        )

    return core.countingParams(
        stepSize=stepSize,
        scaleDown=scaleDownFlag,
        scaleFactors=scaleFactorList,
        scaleFactorsControl=scaleFactorsControlList,
        numReads=numReads,
        applyAsinh=applyAsinhFlag,
        applyLog=applyLogFlag,
        rescaleToTreatmentCoverage=rescaleToTreatmentCoverageFlag,
    )
455
+
456
+
457
def readConfig(config_path: str) -> Dict[str, Any]:
    r"""Read and parse the configuration file for Consenrich.

    Delegates to the per-section builders (input/output/genome/counting),
    then assembles process, observation, state, sam, detrend, and matching
    parameter objects with config overrides layered on computed defaults.

    :param config_path: Path to the YAML configuration file.
    :return: Dictionary containing all parsed configuration parameters.
    """
    configData = _loadConfig(config_path)

    inputParams = getInputArgs(config_path)
    outputParams = getOutputArgs(config_path)
    genomeParams = getGenomeArgs(config_path)
    countingParams = getCountingArgs(config_path)

    # Default minQ scales minR by the sample count, plus a small constant.
    minRDefault = _getMinR(configData, len(inputParams.bamFiles))
    minQDefault = (
        minRDefault / len(inputParams.bamFiles)
    ) + 0.10  # conditioning

    # Matching excludes the genome blacklist by default.
    matchingExcludeRegionsFileDefault: Optional[str] = (
        genomeParams.blacklistFile
    )

    # Detrend defaults differ with/without control BAMs: wider window and
    # lower Savitzky-Golay degree when controls are present.
    if (
        inputParams.bamFilesControl is not None
        and len(inputParams.bamFilesControl) > 0
    ):
        detrendWindowLengthBp = _cfgGet(
            configData,
            "detrendParams.detrendWindowLengthBP",
            25_000,
        )
        detrendSavitzkyGolayDegree = _cfgGet(
            configData,
            "detrendParams.detrendSavitzkyGolayDegree",
            1,
        )
    else:
        detrendWindowLengthBp = _cfgGet(
            configData,
            "detrendParams.detrendWindowLengthBP",
            10_000,
        )
        detrendSavitzkyGolayDegree = _cfgGet(
            configData,
            "detrendParams.detrendSavitzkyGolayDegree",
            2,
        )

    experimentName = _cfgGet(
        configData, "experimentName", "consenrichExperiment"
    )

    processArgs = core.processParams(
        deltaF=_cfgGet(configData, "processParams.deltaF", 0.5),
        minQ=_cfgGet(configData, "processParams.minQ", minQDefault),
        maxQ=_cfgGet(configData, "processParams.maxQ", 500.0),
        offDiagQ=_cfgGet(configData, "processParams.offDiagQ", 0.0),
        dStatAlpha=_cfgGet(
            configData, "processParams.dStatAlpha", 3.0
        ),
        dStatd=_cfgGet(configData, "processParams.dStatd", 10.0),
        dStatPC=_cfgGet(configData, "processParams.dStatPC", 1.0),
        scaleResidualsByP11=_cfgGet(
            configData,
            "processParams.scaleResidualsByP11",
            False,
        ),
    )

    observationArgs = core.observationParams(
        minR=minRDefault,
        maxR=_cfgGet(configData, "observationParams.maxR", 500.0),
        useALV=_cfgGet(configData, "observationParams.useALV", False),
        useConstantNoiseLevel=_cfgGet(
            configData,
            "observationParams.useConstantNoiseLevel",
            False,
        ),
        noGlobal=_cfgGet(
            configData, "observationParams.noGlobal", False
        ),
        numNearest=_cfgGet(
            configData, "observationParams.numNearest", 25
        ),
        localWeight=_cfgGet(
            configData, "observationParams.localWeight", 0.333
        ),
        globalWeight=_cfgGet(
            configData, "observationParams.globalWeight", 0.667
        ),
        approximationWindowLengthBP=_cfgGet(
            configData,
            "observationParams.approximationWindowLengthBP",
            10_000,
        ),
        lowPassWindowLengthBP=_cfgGet(
            configData,
            "observationParams.lowPassWindowLengthBP",
            20_000,
        ),
        lowPassFilterType=_cfgGet(
            configData,
            "observationParams.lowPassFilterType",
            "median",
        ),
        returnCenter=_cfgGet(
            configData, "observationParams.returnCenter", True
        ),
    )

    stateArgs = core.stateParams(
        stateInit=_cfgGet(configData, "stateParams.stateInit", 0.0),
        stateCovarInit=_cfgGet(
            configData, "stateParams.stateCovarInit", 100.0
        ),
        boundState=_cfgGet(
            configData, "stateParams.boundState", True
        ),
        stateLowerBound=_cfgGet(
            configData, "stateParams.stateLowerBound", 0.0
        ),
        stateUpperBound=_cfgGet(
            configData, "stateParams.stateUpperBound", 10000.0
        ),
    )

    samThreads = _cfgGet(configData, "samParams.samThreads", 1)
    # 3844 excludes unmapped/secondary/QC-fail/duplicate/supplementary reads.
    samFlagExclude = _cfgGet(
        configData, "samParams.samFlagExclude", 3844
    )
    oneReadPerBin = _cfgGet(configData, "samParams.oneReadPerBin", 0)
    chunkSize = _cfgGet(configData, "samParams.chunkSize", 1_000_000)
    offsetStr = _cfgGet(configData, "samParams.offsetStr", "0,0")
    extendBpList = _cfgGet(configData, "samParams.extendBP", [])
    maxInsertSize = _cfgGet(
        configData, "samParams.maxInsertSize", 1000
    )

    # Paired-end mode defaults on when input BAMs are paired-end;
    # fragment-length inference defaults on only for single-end input.
    pairedEndDefault = (
        1
        if inputParams.pairedEnd is not None
        and int(inputParams.pairedEnd) > 0
        else 0
    )
    inferFragmentDefault = (
        1
        if inputParams.pairedEnd is not None
        and int(inputParams.pairedEnd) == 0
        else 0
    )

    samArgs = core.samParams(
        samThreads=samThreads,
        samFlagExclude=samFlagExclude,
        oneReadPerBin=oneReadPerBin,
        chunkSize=chunkSize,
        offsetStr=offsetStr,
        extendBP=extendBpList,
        maxInsertSize=maxInsertSize,
        pairedEndMode=_cfgGet(
            configData,
            "samParams.pairedEndMode",
            pairedEndDefault,
        ),
        inferFragmentLength=_cfgGet(
            configData,
            "samParams.inferFragmentLength",
            inferFragmentDefault,
        ),
        countEndsOnly=_cfgGet(
            configData, "samParams.countEndsOnly", False
        ),
    )

    detrendArgs = core.detrendParams(
        detrendWindowLengthBP=detrendWindowLengthBp,
        detrendTrackPercentile=_cfgGet(
            configData,
            "detrendParams.detrendTrackPercentile",
            75,
        ),
        usePolyFilter=_cfgGet(
            configData, "detrendParams.usePolyFilter", False
        ),
        detrendSavitzkyGolayDegree=detrendSavitzkyGolayDegree,
        useOrderStatFilter=_cfgGet(
            configData, "detrendParams.useOrderStatFilter", True
        ),
    )

    matchingArgs = core.matchingParams(
        templateNames=_cfgGet(
            configData, "matchingParams.templateNames", []
        ),
        cascadeLevels=_cfgGet(
            configData, "matchingParams.cascadeLevels", []
        ),
        iters=_cfgGet(configData, "matchingParams.iters", 25_000),
        alpha=_cfgGet(configData, "matchingParams.alpha", 0.05),
        minMatchLengthBP=_cfgGet(
            configData,
            "matchingParams.minMatchLengthBP",
            250,
        ),
        maxNumMatches=_cfgGet(
            configData,
            "matchingParams.maxNumMatches",
            100_000,
        ),
        minSignalAtMaxima=_cfgGet(
            configData,
            "matchingParams.minSignalAtMaxima",
            "q:0.75",
        ),
        merge=_cfgGet(configData, "matchingParams.merge", True),
        mergeGapBP=_cfgGet(
            configData, "matchingParams.mergeGapBP", None
        ),
        useScalingFunction=_cfgGet(
            configData,
            "matchingParams.useScalingFunction",
            True,
        ),
        excludeRegionsBedFile=_cfgGet(
            configData,
            "matchingParams.excludeRegionsBedFile",
            matchingExcludeRegionsFileDefault,
        ),
        randSeed=_cfgGet(configData, "matchingParams.randSeed", 42),
        penalizeBy=_cfgGet(
            configData, "matchingParams.penalizeBy", None
        ),
        eps=_cfgGet(configData, "matchingParams.eps", 1.0e-2),
    )

    return {
        "experimentName": experimentName,
        "genomeArgs": genomeParams,
        "inputArgs": inputParams,
        "outputArgs": outputParams,
        "countingArgs": countingParams,
        "processArgs": processArgs,
        "observationArgs": observationArgs,
        "stateArgs": stateArgs,
        "samArgs": samArgs,
        "detrendArgs": detrendArgs,
        "matchingArgs": matchingArgs,
    }
705
+
706
+
707
def convertBedGraphToBigWig(experimentName, chromSizesFile,
        suffixes: Optional[List[str]] = None):
    r"""Convert Consenrich bedGraph outputs to bigWig via UCSC `bedGraphToBigWig`.

    :param experimentName: Experiment label used in bedGraph/bigWig file names.
    :param chromSizesFile: Two-column chromosome sizes file required by the converter.
    :param suffixes: Output track suffixes to convert; defaults to ["state"].

    Best-effort: a missing binary, missing input files, or a failed
    conversion is logged as a warning and never raises.
    """
    if suffixes is None:
        # at least look for `state` bedGraph
        suffixes = ["state"]

    # Fix: the concatenated fragments need trailing spaces, otherwise the
    # logged message runs words together ("utility.If", "files,you", ...).
    warningMessage = (
        "Could not find UCSC bedGraphToBigWig binary utility. "
        "If you need bigWig files instead of the default, human-readable bedGraph files, "
        "you can download the `bedGraphToBigWig` binary from https://hgdownload.soe.ucsc.edu/admin/exe/<operatingSystem, architecture> "
        "OR install via conda (conda install -c bioconda ucsc-bedgraphtobigwig)."
    )

    logger.info(
        "Attempting to generate bigWig files from bedGraph format..."
    )
    # shutil.which does not raise; it returns None when not on PATH.
    path_ = shutil.which("bedGraphToBigWig")
    if not path_:
        logger.warning(f"\n{warningMessage}\n")
        return
    logger.info(f"Using bedGraphToBigWig from {path_}")

    # The sizes file is required for every conversion; check it once
    # instead of per-suffix inside the loop.
    if not os.path.exists(chromSizesFile):
        logger.warning(
            f"{chromSizesFile} does not exist. Skipping bigWig conversion."
        )
        return

    for suffix in suffixes:
        bedgraph = (
            f"consenrichOutput_{experimentName}_{suffix}.bedGraph"
        )
        if not os.path.exists(bedgraph):
            logger.warning(
                f"bedGraph file {bedgraph} does not exist. Skipping bigWig conversion."
            )
            continue
        bigwig = f"{experimentName}_consenrich_{suffix}.bw"
        logger.info(f"Start: {bedgraph} --> {bigwig}...")
        try:
            subprocess.run(
                [path_, bedgraph, chromSizesFile, bigwig], check=True
            )
        except Exception as e:
            logger.warning(
                f"bedGraph-->bigWig conversion with\n\n\t`bedGraphToBigWig {bedgraph} {chromSizesFile} {bigwig}`\nraised: \n{e}\n\n"
            )
            continue
        # Sanity check: a valid bigWig is larger than a bare header.
        if os.path.exists(bigwig) and os.path.getsize(bigwig) > 100:
            logger.info(
                f"Finished: converted {bedgraph} to {bigwig}."
            )
762
+
763
+
764
+ def main():
765
+ parser = argparse.ArgumentParser(description="Consenrich CLI")
766
+ parser.add_argument(
767
+ "--config",
768
+ type=str,
769
+ dest="config",
770
+ help="Path to a YAML config file with parameters + arguments defined in `consenrich.core`",
771
+ )
772
+
773
+ # --- Matching-specific command-line arguments ---
774
+ parser.add_argument(
775
+ "--match-bedGraph",
776
+ type=str,
777
+ dest="matchBedGraph",
778
+ help="Path to a bedGraph file of Consenrich estimates to match templates against.\
779
+ If provided, *only* the matching algorithm is run (no other processing). Note that \
780
+ some features in `consenrich.matching` may not be supported through this CLI interface.",
781
+ )
782
+ parser.add_argument(
783
+ "--match-template",
784
+ type=str,
785
+ default="haar",
786
+ choices=[
787
+ x
788
+ for x in pywt.wavelist(kind="discrete")
789
+ if "bio" not in x
790
+ ],
791
+ dest="matchTemplate",
792
+ )
793
+ parser.add_argument(
794
+ "--match-level", type=int, default=2, dest="matchLevel"
795
+ )
796
+ parser.add_argument(
797
+ "--match-alpha", type=float, default=0.05, dest="matchAlpha"
798
+ )
799
+ parser.add_argument(
800
+ "--match-min-length",
801
+ type=int,
802
+ default=250,
803
+ dest="matchMinMatchLengthBP",
804
+ )
805
+ parser.add_argument(
806
+ "--match-iters", type=int, default=25000, dest="matchIters"
807
+ )
808
+ parser.add_argument(
809
+ "--match-min-signal",
810
+ type=str,
811
+ default="q:0.75",
812
+ dest="matchMinSignalAtMaxima",
813
+ )
814
+ parser.add_argument(
815
+ "--match-max-matches",
816
+ type=int,
817
+ default=100000,
818
+ dest="matchMaxNumMatches",
819
+ )
820
+ parser.add_argument(
821
+ "--match-no-merge", action="store_true", dest="matchNoMerge"
822
+ )
823
+ parser.add_argument(
824
+ "--match-merge-gap",
825
+ type=int,
826
+ default=None,
827
+ dest="matchMergeGapBP",
828
+ )
829
+ parser.add_argument(
830
+ "--match-use-wavelet",
831
+ action="store_true",
832
+ dest="matchUseWavelet",
833
+ )
834
+ parser.add_argument(
835
+ "--match-seed", type=int, default=42, dest="matchRandSeed"
836
+ )
837
+ parser.add_argument(
838
+ "--match-exclude-bed",
839
+ type=str,
840
+ default=None,
841
+ dest="matchExcludeBed",
842
+ )
843
+ parser.add_argument(
844
+ "--verbose", action="store_true", help="If set, logs config"
845
+ )
846
+ args = parser.parse_args()
847
+
848
+ if args.matchBedGraph:
849
+ if not os.path.exists(args.matchBedGraph):
850
+ raise FileNotFoundError(
851
+ f"bedGraph file {args.matchBedGraph} couldn't be found."
852
+ )
853
+ logger.info(
854
+ f"Running matching algorithm using bedGraph file {args.matchBedGraph}..."
855
+ )
856
+
857
+ outName = matching.matchExistingBedGraph(
858
+ args.matchBedGraph,
859
+ args.matchTemplate,
860
+ args.matchLevel,
861
+ alpha=args.matchAlpha,
862
+ minMatchLengthBP=args.matchMinMatchLengthBP,
863
+ iters=args.matchIters,
864
+ minSignalAtMaxima=args.matchMinSignalAtMaxima,
865
+ maxNumMatches=args.matchMaxNumMatches,
866
+ useScalingFunction=(not args.matchUseWavelet),
867
+ merge=(not args.matchNoMerge),
868
+ mergeGapBP=args.matchMergeGapBP,
869
+ excludeRegionsBedFile=args.matchExcludeBed,
870
+ randSeed=args.matchRandSeed,
871
+ )
872
+ logger.info(f"Finished matching. Written to {outName}")
873
+ sys.exit(0)
874
+
875
+ if args.matchBedGraph:
876
+ # this shouldn't happen, but just in case -- matching on previous bedGraph means no other processing
877
+ logger.info(
878
+ "If `--match-bedgraph <path_to_bedgraph>` is provided, only the matching algorithm is run."
879
+ )
880
+ sys.exit(0)
881
+
882
+ if not args.config:
883
+ logger.info(
884
+ "No config file provided, run with `--config <path_to_config.yaml>`"
885
+ )
886
+ logger.info(
887
+ "See documentation: https://nolan-h-hamilton.github.io/Consenrich/"
888
+ )
889
+ sys.exit(1)
890
+
891
+ if not os.path.exists(args.config):
892
+ logger.info(f"Config file {args.config} does not exist.")
893
+ logger.info(
894
+ "See documentation: https://nolan-h-hamilton.github.io/Consenrich/"
895
+ )
896
+ sys.exit(1)
897
+
898
+ config = readConfig(args.config)
899
+ experimentName = config["experimentName"]
900
+ genomeArgs = config["genomeArgs"]
901
+ inputArgs = config["inputArgs"]
902
+ outputArgs = config["outputArgs"]
903
+ countingArgs = config["countingArgs"]
904
+ processArgs = config["processArgs"]
905
+ observationArgs = config["observationArgs"]
906
+ stateArgs = config["stateArgs"]
907
+ samArgs = config["samArgs"]
908
+ detrendArgs = config["detrendArgs"]
909
+ matchingArgs = config["matchingArgs"]
910
+ bamFiles = inputArgs.bamFiles
911
+ bamFilesControl = inputArgs.bamFilesControl
912
+ numSamples = len(bamFiles)
913
+ numNearest = observationArgs.numNearest
914
+ stepSize = countingArgs.stepSize
915
+ excludeForNorm = genomeArgs.excludeForNorm
916
+ chromSizes = genomeArgs.chromSizesFile
917
+ scaleDown = countingArgs.scaleDown
918
+ extendBP_ = core.resolveExtendBP(samArgs.extendBP, bamFiles)
919
+ initialTreatmentScaleFactors = []
920
+ minMatchLengthBP_: Optional[int] = matchingArgs.minMatchLengthBP
921
+ mergeGapBP_: Optional[int] = matchingArgs.mergeGapBP
922
+
923
+ if args.verbose:
924
+ try:
925
+ logger.info("Configuration:\n")
926
+ config_truncated = {
927
+ k: v
928
+ for k, v in config.items()
929
+ if k
930
+ not in ["inputArgs", "genomeArgs", "countingArgs"]
931
+ }
932
+ config_truncated["experimentName"] = experimentName
933
+ config_truncated["inputArgs"] = inputArgs
934
+ config_truncated["outputArgs"] = outputArgs
935
+ config_truncated["genomeArgs"] = genomeArgs
936
+ config_truncated["countingArgs"] = countingArgs
937
+ config_truncated["processArgs"] = processArgs
938
+ config_truncated["observationArgs"] = observationArgs
939
+ config_truncated["stateArgs"] = stateArgs
940
+ config_truncated["samArgs"] = samArgs
941
+ config_truncated["detrendArgs"] = detrendArgs
942
+ pprint.pprint(config_truncated, indent=8)
943
+ except Exception as e:
944
+ logger.warning(f"Failed to print parsed config:\n{e}\n")
945
+
946
+ controlsPresent = checkControlsPresent(inputArgs)
947
+ if args.verbose:
948
+ logger.info(f"controlsPresent: {controlsPresent}")
949
+ readLengthsBamFiles = getReadLengths(
950
+ inputArgs, countingArgs, samArgs
951
+ )
952
+ effectiveGenomeSizes = getEffectiveGenomeSizes(
953
+ genomeArgs, readLengthsBamFiles
954
+ )
955
+ matchingEnabled = checkMatchingEnabled(matchingArgs)
956
+ if args.verbose:
957
+ logger.info(f"matchingEnabled: {matchingEnabled}")
958
+ scaleFactors = countingArgs.scaleFactors
959
+ scaleFactorsControl = countingArgs.scaleFactorsControl
960
+
961
+ if controlsPresent:
962
+ readLengthsControlBamFiles = [
963
+ core.getReadLength(
964
+ bamFile,
965
+ countingArgs.numReads,
966
+ 1000,
967
+ samArgs.samThreads,
968
+ samArgs.samFlagExclude,
969
+ )
970
+ for bamFile in bamFilesControl
971
+ ]
972
+ effectiveGenomeSizesControl = [
973
+ constants.getEffectiveGenomeSize(
974
+ genomeArgs.genomeName, readLength
975
+ )
976
+ for readLength in readLengthsControlBamFiles
977
+ ]
978
+
979
+ if (
980
+ scaleFactors is not None
981
+ and scaleFactorsControl is not None
982
+ ):
983
+ treatScaleFactors = scaleFactors
984
+ controlScaleFactors = scaleFactorsControl
985
+ # still make sure this is accessible
986
+ initialTreatmentScaleFactors = [1.0] * len(bamFiles)
987
+ else:
988
+ try:
989
+ initialTreatmentScaleFactors = [
990
+ detrorm.getScaleFactor1x(
991
+ bamFile,
992
+ effectiveGenomeSize,
993
+ readLength,
994
+ genomeArgs.excludeChroms,
995
+ genomeArgs.chromSizesFile,
996
+ samArgs.samThreads,
997
+ )
998
+ for bamFile, effectiveGenomeSize, readLength in zip(
999
+ bamFiles,
1000
+ effectiveGenomeSizes,
1001
+ readLengthsBamFiles,
1002
+ )
1003
+ ]
1004
+ except Exception:
1005
+ initialTreatmentScaleFactors = [1.0] * len(bamFiles)
1006
+
1007
+ pairScalingFactors = [
1008
+ detrorm.getPairScaleFactors(
1009
+ bamFileA,
1010
+ bamFileB,
1011
+ effectiveGenomeSizeA,
1012
+ effectiveGenomeSizeB,
1013
+ readLengthA,
1014
+ readLengthB,
1015
+ excludeForNorm,
1016
+ chromSizes,
1017
+ samArgs.samThreads,
1018
+ scaleDown,
1019
+ )
1020
+ for bamFileA, bamFileB, effectiveGenomeSizeA, effectiveGenomeSizeB, readLengthA, readLengthB in zip(
1021
+ bamFiles,
1022
+ bamFilesControl,
1023
+ effectiveGenomeSizes,
1024
+ effectiveGenomeSizesControl,
1025
+ readLengthsBamFiles,
1026
+ readLengthsControlBamFiles,
1027
+ )
1028
+ ]
1029
+
1030
+ treatScaleFactors = []
1031
+ controlScaleFactors = []
1032
+ for scaleFactorA, scaleFactorB in pairScalingFactors:
1033
+ treatScaleFactors.append(scaleFactorA)
1034
+ controlScaleFactors.append(scaleFactorB)
1035
+
1036
+ else:
1037
+ treatScaleFactors = scaleFactors
1038
+ controlScaleFactors = scaleFactorsControl
1039
+
1040
+ if scaleFactors is None and not controlsPresent:
1041
+ scaleFactors = [
1042
+ detrorm.getScaleFactor1x(
1043
+ bamFile,
1044
+ effectiveGenomeSize,
1045
+ readLength,
1046
+ genomeArgs.excludeChroms,
1047
+ genomeArgs.chromSizesFile,
1048
+ samArgs.samThreads,
1049
+ )
1050
+ for bamFile, effectiveGenomeSize, readLength in zip(
1051
+ bamFiles, effectiveGenomeSizes, readLengthsBamFiles
1052
+ )
1053
+ ]
1054
+ chromSizesDict = misc_util.getChromSizesDict(
1055
+ genomeArgs.chromSizesFile,
1056
+ excludeChroms=genomeArgs.excludeChroms,
1057
+ )
1058
+ chromosomes = genomeArgs.chromosomes
1059
+
1060
+ for c_, chromosome in enumerate(chromosomes):
1061
+ chromosomeStart, chromosomeEnd = core.getChromRangesJoint(
1062
+ bamFiles,
1063
+ chromosome,
1064
+ chromSizesDict[chromosome],
1065
+ samArgs.samThreads,
1066
+ samArgs.samFlagExclude,
1067
+ )
1068
+ chromosomeStart = max(
1069
+ 0, (chromosomeStart - (chromosomeStart % stepSize))
1070
+ )
1071
+ chromosomeEnd = max(
1072
+ 0, (chromosomeEnd - (chromosomeEnd % stepSize))
1073
+ )
1074
+ numIntervals = (
1075
+ ((chromosomeEnd - chromosomeStart) + stepSize) - 1
1076
+ ) // stepSize
1077
+ intervals = np.arange(
1078
+ chromosomeStart, chromosomeEnd, stepSize
1079
+ )
1080
+ chromMat: np.ndarray = np.empty(
1081
+ (numSamples, numIntervals), dtype=np.float32
1082
+ )
1083
+ if controlsPresent:
1084
+ j_: int = 0
1085
+ finalSF = 1.0
1086
+ for bamA, bamB in zip(bamFiles, bamFilesControl):
1087
+ logger.info(
1088
+ f"Counting (trt,ctrl) for {chromosome}: ({bamA}, {bamB})"
1089
+ )
1090
+ pairMatrix: np.ndarray = core.readBamSegments(
1091
+ [bamA, bamB],
1092
+ chromosome,
1093
+ chromosomeStart,
1094
+ chromosomeEnd,
1095
+ stepSize,
1096
+ [
1097
+ readLengthsBamFiles[j_],
1098
+ readLengthsControlBamFiles[j_],
1099
+ ],
1100
+ [treatScaleFactors[j_], controlScaleFactors[j_]],
1101
+ samArgs.oneReadPerBin,
1102
+ samArgs.samThreads,
1103
+ samArgs.samFlagExclude,
1104
+ offsetStr=samArgs.offsetStr,
1105
+ extendBP=extendBP_[j_],
1106
+ maxInsertSize=samArgs.maxInsertSize,
1107
+ pairedEndMode=samArgs.pairedEndMode,
1108
+ inferFragmentLength=samArgs.inferFragmentLength,
1109
+ applyAsinh=countingArgs.applyAsinh,
1110
+ applyLog=countingArgs.applyLog,
1111
+ countEndsOnly=samArgs.countEndsOnly,
1112
+ )
1113
+ if countingArgs.rescaleToTreatmentCoverage:
1114
+ finalSF = max(
1115
+ 1.0, initialTreatmentScaleFactors[j_]
1116
+ )
1117
+ chromMat[j_, :] = finalSF * (
1118
+ pairMatrix[0, :] - pairMatrix[1, :]
1119
+ )
1120
+ j_ += 1
1121
+ else:
1122
+ chromMat = core.readBamSegments(
1123
+ bamFiles,
1124
+ chromosome,
1125
+ chromosomeStart,
1126
+ chromosomeEnd,
1127
+ stepSize,
1128
+ readLengthsBamFiles,
1129
+ scaleFactors,
1130
+ samArgs.oneReadPerBin,
1131
+ samArgs.samThreads,
1132
+ samArgs.samFlagExclude,
1133
+ offsetStr=samArgs.offsetStr,
1134
+ extendBP=extendBP_,
1135
+ maxInsertSize=samArgs.maxInsertSize,
1136
+ pairedEndMode=samArgs.pairedEndMode,
1137
+ inferFragmentLength=samArgs.inferFragmentLength,
1138
+ applyAsinh=countingArgs.applyAsinh,
1139
+ applyLog=countingArgs.applyLog,
1140
+ countEndsOnly=samArgs.countEndsOnly,
1141
+ )
1142
+ sparseMap = None
1143
+ if genomeArgs.sparseBedFile and not observationArgs.useALV:
1144
+ logger.info(
1145
+ f"Building sparse mapping for {chromosome}..."
1146
+ )
1147
+ sparseMap = core.getSparseMap(
1148
+ chromosome,
1149
+ intervals,
1150
+ numNearest,
1151
+ genomeArgs.sparseBedFile,
1152
+ )
1153
+
1154
+ muncMat = np.empty_like(chromMat, dtype=np.float32)
1155
+ for j in range(numSamples):
1156
+ logger.info(
1157
+ f"Muncing {j + 1}/{numSamples} for {chromosome}..."
1158
+ )
1159
+ muncMat[j, :] = core.getMuncTrack(
1160
+ chromosome,
1161
+ intervals,
1162
+ stepSize,
1163
+ chromMat[j, :],
1164
+ observationArgs.minR,
1165
+ observationArgs.maxR,
1166
+ observationArgs.useALV,
1167
+ observationArgs.useConstantNoiseLevel,
1168
+ observationArgs.noGlobal,
1169
+ observationArgs.localWeight,
1170
+ observationArgs.globalWeight,
1171
+ observationArgs.approximationWindowLengthBP,
1172
+ observationArgs.lowPassWindowLengthBP,
1173
+ observationArgs.returnCenter,
1174
+ sparseMap=sparseMap,
1175
+ lowPassFilterType=observationArgs.lowPassFilterType,
1176
+ )
1177
+ chromMat[j, :] = detrorm.detrendTrack(
1178
+ chromMat[j, :],
1179
+ stepSize,
1180
+ detrendArgs.detrendWindowLengthBP,
1181
+ detrendArgs.useOrderStatFilter,
1182
+ detrendArgs.usePolyFilter,
1183
+ detrendArgs.detrendTrackPercentile,
1184
+ detrendArgs.detrendSavitzkyGolayDegree,
1185
+ )
1186
+ logger.info(f">>>Running consenrich: {chromosome}<<<")
1187
+
1188
+ x, P, y = core.runConsenrich(
1189
+ chromMat,
1190
+ muncMat,
1191
+ processArgs.deltaF,
1192
+ processArgs.minQ,
1193
+ processArgs.maxQ,
1194
+ processArgs.offDiagQ,
1195
+ processArgs.dStatAlpha,
1196
+ processArgs.dStatd,
1197
+ processArgs.dStatPC,
1198
+ stateArgs.stateInit,
1199
+ stateArgs.stateCovarInit,
1200
+ stateArgs.boundState,
1201
+ stateArgs.stateLowerBound,
1202
+ stateArgs.stateUpperBound,
1203
+ samArgs.chunkSize,
1204
+ progressIter=50_000,
1205
+ )
1206
+ logger.info("Done.")
1207
+
1208
+ x_ = core.getPrimaryState(x)
1209
+ y_ = core.getPrecisionWeightedResidual(
1210
+ y,
1211
+ muncMat,
1212
+ stateCovarSmoothed=P
1213
+ if processArgs.scaleResidualsByP11 is not None
1214
+ and processArgs.scaleResidualsByP11
1215
+ else None,
1216
+ )
1217
+
1218
+ weights_: Optional[np.ndarray] = None
1219
+ if matchingArgs.penalizeBy is not None:
1220
+ if matchingArgs.penalizeBy == "absResiduals":
1221
+ try:
1222
+ weights_ = np.abs(y_)
1223
+ except Exception as e:
1224
+ logger.warning(
1225
+ f"Error computing weights for 'absResiduals': {e}. No weights applied for matching."
1226
+ )
1227
+ weights_ = None
1228
+ elif matchingArgs.penalizeBy == "stateUncertainty" or matchingArgs.penalizeBy == "stateStdDev":
1229
+ try:
1230
+ weights_ = np.sqrt(P[:, 0, 0])
1231
+ except Exception as e:
1232
+ logger.warning(
1233
+ f"Error computing weights for 'stateUncertainty': {e}. No weights applied for matching."
1234
+ )
1235
+ weights_ = None
1236
+ elif matchingArgs.penalizeBy == "muncTrace":
1237
+ try:
1238
+ weights_ = np.sqrt(
1239
+ np.trace(muncMat, axis1=0, axis2=1)
1240
+ / numSamples
1241
+ )
1242
+ except Exception as e:
1243
+ logger.warning(
1244
+ f"Error computing weights for 'muncTrace': {e}. No weights applied for matching."
1245
+ )
1246
+ weights_ = None
1247
+ else:
1248
+ logger.warning(
1249
+ f"Unrecognized `matchingParams.penalizeBy`: {matchingArgs.penalizeBy}. No weights applied."
1250
+ )
1251
+ weights_ = None
1252
+
1253
+ df = pd.DataFrame(
1254
+ {
1255
+ "Chromosome": chromosome,
1256
+ "Start": intervals,
1257
+ "End": intervals + stepSize,
1258
+ "State": x_,
1259
+ }
1260
+ )
1261
+
1262
+ if outputArgs.writeResiduals:
1263
+ df["Res"] = y_.astype(np.float32) # FFR: cast necessary?
1264
+ if outputArgs.writeMuncTrace:
1265
+ df["Munc"] = np.sqrt(np.trace(muncMat, axis1=0, axis2=1) / numSamples).astype(np.float32)
1266
+ if outputArgs.writeStateStd:
1267
+ df["StateStd"] = np.sqrt(P[:, 0, 0]).astype(np.float32)
1268
+ cols_ = ["Chromosome", "Start", "End", "State"]
1269
+ if outputArgs.writeResiduals:
1270
+ cols_.append("Res")
1271
+ if outputArgs.writeMuncTrace:
1272
+ cols_.append("Munc")
1273
+ if outputArgs.writeStateStd:
1274
+ cols_.append("StateStd")
1275
+ df = df[cols_]
1276
+ suffixes = ['state']
1277
+ if outputArgs.writeResiduals:
1278
+ suffixes.append('residuals')
1279
+ if outputArgs.writeMuncTrace:
1280
+ suffixes.append('muncTraces')
1281
+ if outputArgs.writeStateStd:
1282
+ suffixes.append('stdDevs')
1283
+
1284
+ if c_ == 0 and len(chromosomes) > 1:
1285
+ for file_ in os.listdir("."):
1286
+ if file_.startswith(
1287
+ f"consenrichOutput_{experimentName}"
1288
+ ) and (
1289
+ file_.endswith(".bedGraph")
1290
+ or file_.endswith(".narrowPeak")
1291
+ ):
1292
+ logger.warning(f"Overwriting: {file_}")
1293
+ os.remove(file_)
1294
+
1295
+ for col, suffix in zip(cols_[3:], suffixes):
1296
+ logger.info(
1297
+ f"{chromosome}: writing/appending to: consenrichOutput_{experimentName}_{suffix}.bedGraph"
1298
+ )
1299
+ df[["Chromosome", "Start", "End", col]].to_csv(
1300
+ f"consenrichOutput_{experimentName}_{suffix}.bedGraph",
1301
+ sep="\t",
1302
+ header=False,
1303
+ index=False,
1304
+ mode="a",
1305
+ float_format="%.3f",
1306
+ lineterminator="\n",
1307
+ )
1308
+ try:
1309
+ if matchingEnabled:
1310
+ if (
1311
+ minMatchLengthBP_ is None
1312
+ or minMatchLengthBP_ <= 0
1313
+ ):
1314
+ minMatchLengthBP_ = (
1315
+ matching.autoMinLengthIntervals(x_)
1316
+ * (intervals[1] - intervals[0])
1317
+ )
1318
+
1319
+ if mergeGapBP_ is None:
1320
+ mergeGapBP_ = int(minMatchLengthBP_ / 2) + 1
1321
+
1322
+ matchingDF = matching.matchWavelet(
1323
+ chromosome,
1324
+ intervals,
1325
+ x_,
1326
+ matchingArgs.templateNames,
1327
+ matchingArgs.cascadeLevels,
1328
+ matchingArgs.iters,
1329
+ matchingArgs.alpha,
1330
+ minMatchLengthBP_,
1331
+ matchingArgs.maxNumMatches,
1332
+ matchingArgs.minSignalAtMaxima,
1333
+ useScalingFunction=matchingArgs.useScalingFunction,
1334
+ excludeRegionsBedFile=matchingArgs.excludeRegionsBedFile,
1335
+ randSeed=matchingArgs.randSeed,
1336
+ weights=weights_,
1337
+ )
1338
+ if not matchingDF.empty:
1339
+ matchingDF.to_csv(
1340
+ f"consenrichOutput_{experimentName}_matches.narrowPeak",
1341
+ sep="\t",
1342
+ header=False,
1343
+ index=False,
1344
+ mode="a",
1345
+ float_format=f"%.{outputArgs.roundDigits}f",
1346
+ lineterminator="\n",
1347
+ )
1348
+ except Exception as e:
1349
+ logger.warning(
1350
+ f"Matching routine unsuccessful for {chromosome}...SKIPPING:\n{e}\n\n"
1351
+ )
1352
+ continue
1353
+ logger.info("Finished: output in human-readable format")
1354
+
1355
+ if outputArgs.convertToBigWig:
1356
+ convertBedGraphToBigWig(experimentName, genomeArgs.chromSizesFile, suffixes=suffixes)
1357
+
1358
+ if matchingEnabled and matchingArgs.merge:
1359
+ try:
1360
+ mergeGapBP_ = matchingArgs.mergeGapBP
1361
+ if mergeGapBP_ is None or mergeGapBP_ <= 0:
1362
+ mergeGapBP_ = (
1363
+ int(minMatchLengthBP_ / 2) + 1
1364
+ if minMatchLengthBP_ is not None
1365
+ and minMatchLengthBP_ >= 0
1366
+ else 75
1367
+ )
1368
+ matching.mergeMatches(
1369
+ f"consenrichOutput_{experimentName}_matches.narrowPeak",
1370
+ mergeGapBP=mergeGapBP_,
1371
+ )
1372
+
1373
+ except Exception as e:
1374
+ logger.warning(
1375
+ f"Failed to merge matches...SKIPPING:\n{e}\n\n"
1376
+ )
1377
+ logger.info("Done.")
1378
+
1379
+
1380
# Script entry point: run the Consenrich CLI pipeline when this module is
# executed directly (e.g. `python -m consenrich ...`); importing the module
# does not trigger a run.
if __name__ == "__main__":
    main()