consenrich 0.7.4b3__cp311-cp311-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of consenrich might be problematic; see the registry's advisory for details.

@@ -0,0 +1,1390 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import glob
6
+ import logging
7
+ import pprint
8
+ import os
9
+ from pathlib import Path
10
+ from collections.abc import Mapping
11
+ from typing import List, Optional, Tuple, Dict, Any, Union
12
+ import shutil
13
+ import subprocess
14
+ import sys
15
+ import numpy as np
16
+ import pandas as pd
17
+ import pysam
18
+ import pywt
19
+ import yaml
20
+
21
+ import consenrich.core as core
22
+ import consenrich.misc_util as misc_util
23
+ import consenrich.constants as constants
24
+ import consenrich.detrorm as detrorm
25
+ import consenrich.matching as matching
26
+
27
+
28
# Configure root logging once at import time; the format tags each record with
# the emitting module and function so log lines from consenrich submodules are
# easy to trace.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(module)s.%(funcName)s - %(levelname)s - %(message)s",
)

# Module-level logger shared by all helpers in this CLI module.
logger = logging.getLogger(__name__)
34
+
35
+
36
+ def _loadConfig(
37
+ configSource: Union[str, Path, Mapping[str, Any]],
38
+ ) -> Dict[str, Any]:
39
+ r"""Load a YAML config from a path or accept an already-parsed mapping.
40
+
41
+ If given a dict-like object, just return it.If given a path, try to load as YAML --> dict
42
+ If given a path, try to load as YAML --> dict
43
+
44
+ """
45
+ if isinstance(configSource, Mapping):
46
+ configData = configSource
47
+ elif isinstance(configSource, (str, Path)):
48
+ with open(configSource, "r") as fileHandle:
49
+ configData = yaml.safe_load(fileHandle) or {}
50
+ else:
51
+ raise TypeError("`config` must be a path or a mapping/dict.")
52
+
53
+ if not isinstance(configData, Mapping):
54
+ raise TypeError("Top-level YAML must be a mapping/object.")
55
+ return configData
56
+
57
+
58
+ def _cfgGet(
59
+ configMap: Mapping[str, Any],
60
+ dottedKey: str,
61
+ defaultVal: Any = None,
62
+ ) -> Any:
63
+ r"""Support both dotted keys and yaml/dict-style nested access for configs."""
64
+
65
+ # e.g., inputParams.bamFiles
66
+ if dottedKey in configMap:
67
+ return configMap[dottedKey]
68
+
69
+ # e.g.,
70
+ # inputParams:
71
+ # bamFiles: [...]
72
+ currentVal: Any = configMap
73
+ for keyPart in dottedKey.split("."):
74
+ if isinstance(currentVal, Mapping) and keyPart in currentVal:
75
+ currentVal = currentVal[keyPart]
76
+ else:
77
+ return defaultVal
78
+ return currentVal
79
+
80
+
81
+ def _listOrEmpty(list_):
82
+ if list_ is None:
83
+ return []
84
+ return list_
85
+
86
+
87
def _getMinR(configMap, numBams: int) -> float:
    """Read ``observationParams.minR`` from the config as a float.

    Falls back to 1.0 when the key is absent or cannot be converted.
    (``numBams`` is accepted for interface compatibility; it is unused here.)
    """
    fallbackMinR: float = 1.0
    try:
        configuredVal = _cfgGet(configMap, "observationParams.minR", None)
        if configuredVal is None:
            return fallbackMinR
        return float(configuredVal)
    except (TypeError, ValueError, KeyError):
        logger.warning(
            f"Invalid or missing 'observationParams.minR' in config. Using `{fallbackMinR}`."
        )
        return fallbackMinR
97
+
98
+
99
def checkControlsPresent(inputArgs: core.inputParams) -> bool:
    """Report whether control BAM files were supplied.

    :param inputArgs: core.inputParams object
    :return: True only when ``bamFilesControl`` is a non-empty list.
    """
    controls = inputArgs.bamFilesControl
    return isinstance(controls, list) and len(controls) > 0
110
+
111
+
112
def getReadLengths(
    inputArgs: core.inputParams,
    countingArgs: core.countingParams,
    samArgs: core.samParams,
) -> List[int]:
    r"""Estimate a read length for every treatment BAM file.

    :param inputArgs: core.inputParams object containing BAM file paths.
    :param countingArgs: core.countingParams object containing number of reads.
    :param samArgs: core.samParams object with SAM thread/flag-exclude settings.
    :return: One read length per BAM file, in input order.
    :raises ValueError: If the treatment BAM list is missing or empty.
    """
    bamPaths = inputArgs.bamFiles
    if not bamPaths:
        raise ValueError(
            "No BAM files provided in the input arguments."
        )
    if not isinstance(bamPaths, list) or len(bamPaths) == 0:
        raise ValueError("bam files list is empty")

    lengths: List[int] = []
    for bamPath in bamPaths:
        lengths.append(
            core.getReadLength(
                bamPath,
                countingArgs.numReads,
                1000,
                samArgs.samThreads,
                samArgs.samFlagExclude,
            )
        )
    return lengths
145
+
146
+
147
def checkMatchingEnabled(matchingArgs: core.matchingParams) -> bool:
    """Return True when template matching is configured.

    Matching requires both ``templateNames`` and ``cascadeLevels`` to be
    non-empty lists.
    """
    def _isNonEmptyList(value) -> bool:
        # mirrors the original three-way check: not None, a list, non-empty
        return (value is not None) and isinstance(value, list) and len(value) > 0

    return _isNonEmptyList(matchingArgs.templateNames) and _isNonEmptyList(
        matchingArgs.cascadeLevels
    )
160
+
161
+
162
def getEffectiveGenomeSizes(
    genomeArgs: core.genomeParams, readLengths: List[int]
) -> List[int]:
    r"""Look up an effective genome size for each read length.

    :param genomeArgs: core.genomeParams object
    :param readLengths: Read lengths to resolve effective genome sizes for.
    :return: Effective genome sizes, one per read length.
    :raises ValueError: On a missing genome name or empty read-length list.
    """
    genomeName = genomeArgs.genomeName
    if not genomeName or not isinstance(genomeName, str):
        raise ValueError("Genome name must be a non-empty string.")

    if not isinstance(readLengths, list) or len(readLengths) == 0:
        raise ValueError(
            "Read lengths must be a non-empty list. Try calling `getReadLengths` first."
        )

    sizes: List[int] = []
    for readLength in readLengths:
        sizes.append(constants.getEffectiveGenomeSize(genomeName, readLength))
    return sizes
182
+
183
+
184
def getInputArgs(config_path: str) -> core.inputParams:
    r"""Build ``core.inputParams`` from the ``inputParams`` config section.

    Resolves wildcard BAM entries, validates treatment/control list shapes,
    checks each BAM via ``misc_util.checkBamFile``, and resolves the
    paired-end flag (explicit config value wins; otherwise inferred from the
    BAMs themselves).

    :param config_path: Path to the YAML configuration file (or a parsed
        mapping accepted by ``_loadConfig``).
    :return: Populated ``core.inputParams``.
    :raises ValueError: If no treatment BAMs resolve, or if the control
        count is neither 0, 1, nor equal to the treatment count.
    """
    configData = _loadConfig(config_path)

    # Expand shell-style glob patterns (*, ?, [...]) in BAM entries;
    # plain paths pass through unchanged.
    def expandWildCards(bamList: List[str]) -> List[str]:
        expandedList: List[str] = []
        for bamEntry in bamList:
            if "*" in bamEntry or "?" in bamEntry or "[" in bamEntry:
                matchedList = glob.glob(bamEntry)
                expandedList.extend(matchedList)
            else:
                expandedList.append(bamEntry)
        return expandedList

    # `or []` guards against an explicit null in the YAML.
    bamFilesRaw = (
        _cfgGet(configData, "inputParams.bamFiles", []) or []
    )
    bamFilesControlRaw = (
        _cfgGet(configData, "inputParams.bamFilesControl", []) or []
    )

    bamFiles = expandWildCards(bamFilesRaw)
    bamFilesControl = expandWildCards(bamFilesControlRaw)

    if len(bamFiles) == 0:
        raise ValueError(
            "No BAM files provided in the configuration."
        )

    # Controls must be absent, a single shared file, or one per treatment.
    if (
        len(bamFilesControl) > 0
        and len(bamFilesControl) != len(bamFiles)
        and len(bamFilesControl) != 1
    ):
        raise ValueError(
            "Number of control BAM files must be 0, 1, or the same as number of treatment files"
        )

    # A single control is broadcast across all treatment files.
    if len(bamFilesControl) == 1:
        logger.info(
            f"Only one control given: Using {bamFilesControl[0]} for all treatment files."
        )
        bamFilesControl = bamFilesControl * len(bamFiles)

    if not bamFiles or not isinstance(bamFiles, list):
        raise ValueError("No BAM files found")

    # Validate each BAM (treatment then control) before returning.
    for bamFile in bamFiles:
        misc_util.checkBamFile(bamFile)

    if bamFilesControl:
        for bamFile in bamFilesControl:
            misc_util.checkBamFile(bamFile)

    # Explicit config value takes precedence; otherwise treat the run as
    # paired-end only when every treatment BAM is paired-end.
    pairedEndList = misc_util.bamsArePairedEnd(bamFiles)
    pairedEndConfig: Optional[bool] = _cfgGet(
        configData, "inputParams.pairedEnd", None
    )
    if pairedEndConfig is None:
        pairedEndConfig = all(pairedEndList)
    if pairedEndConfig:
        logger.info("Paired-end BAM files detected")
    else:
        logger.info("One or more single-end BAM files detected")

    return core.inputParams(
        bamFiles=bamFiles,
        bamFilesControl=bamFilesControl,
        pairedEnd=pairedEndConfig,
    )
253
+
254
def getOutputArgs(config_path: str) -> core.outputParams:
    r"""Assemble ``core.outputParams`` from the ``outputParams`` config section.

    Each field falls back to a default when absent; bigWig conversion
    defaults to enabled only when the ``bedGraphToBigWig`` binary is on PATH.
    """
    configData = _loadConfig(config_path)

    # Default to bigWig conversion only if the UCSC utility is available.
    bigWigDefault = shutil.which("bedGraphToBigWig") is not None

    return core.outputParams(
        convertToBigWig=_cfgGet(
            configData, "outputParams.convertToBigWig", bigWigDefault
        ),
        roundDigits=_cfgGet(configData, "outputParams.roundDigits", 3),
        writeResiduals=_cfgGet(
            configData, "outputParams.writeResiduals", True
        ),
        writeRawResiduals=_cfgGet(
            configData, "outputParams.writeRawResiduals", False
        ),
        writeMuncTrace=_cfgGet(
            configData, "outputParams.writeMuncTrace", False
        ),
        writeStateStd=_cfgGet(
            configData, "outputParams.writeStateStd", False
        ),
    )
290
+
291
+
292
def getGenomeArgs(config_path: str) -> core.genomeParams:
    r"""Build ``core.genomeParams`` from the ``genomeParams`` config section.

    Resolves bundled resource files (sizes/blacklist/sparse) for a known
    genome name, then applies per-file overrides from the config, and finally
    derives the chromosome list (explicit config list, or all chromosomes in
    the sizes file) minus any excluded chromosomes.

    :param config_path: Path to the YAML configuration file (or a parsed
        mapping accepted by ``_loadConfig``).
    :return: Populated ``core.genomeParams``.
    :raises FileNotFoundError: If no usable chromosome sizes file exists.
    :raises ValueError: If no valid chromosomes remain after filtering.
    """
    configData = _loadConfig(config_path)

    genomeName = _cfgGet(configData, "genomeParams.name", None)
    genomeLabel = constants.resolveGenomeName(genomeName)

    chromSizesFile: Optional[str] = None
    blacklistFile: Optional[str] = None
    sparseBedFile: Optional[str] = None
    chromosomesList: Optional[List[str]] = None

    # `or []` guards against explicit nulls in the YAML.
    excludeChromsList: List[str] = (
        _cfgGet(configData, "genomeParams.excludeChroms", []) or []
    )
    excludeForNormList: List[str] = (
        _cfgGet(configData, "genomeParams.excludeForNorm", []) or []
    )

    # Known genome: start from the bundled resource files...
    if genomeLabel:
        chromSizesFile = constants.getGenomeResourceFile(
            genomeLabel, "sizes"
        )
        blacklistFile = constants.getGenomeResourceFile(
            genomeLabel, "blacklist"
        )
        sparseBedFile = constants.getGenomeResourceFile(
            genomeLabel, "sparse"
        )

    # ...then let explicit config paths override each resource individually.
    chromSizesOverride = _cfgGet(
        configData, "genomeParams.chromSizesFile", None
    )
    if chromSizesOverride:
        chromSizesFile = chromSizesOverride

    blacklistOverride = _cfgGet(
        configData, "genomeParams.blacklistFile", None
    )
    if blacklistOverride:
        blacklistFile = blacklistOverride

    sparseOverride = _cfgGet(
        configData, "genomeParams.sparseBedFile", None
    )
    if sparseOverride:
        sparseBedFile = sparseOverride

    # A readable sizes file is mandatory from here on.
    if not chromSizesFile or not os.path.exists(chromSizesFile):
        raise FileNotFoundError(
            f"Chromosome sizes file {chromSizesFile} does not exist."
        )

    # Chromosome list: explicit config value wins; otherwise read every
    # chromosome named in the sizes file (two-column TSV: chrom, size).
    chromosomesConfig = _cfgGet(
        configData, "genomeParams.chromosomes", None
    )
    if chromosomesConfig is not None:
        chromosomesList = chromosomesConfig
    else:
        if chromSizesFile:
            chromosomesFrame = pd.read_csv(
                chromSizesFile,
                sep="\t",
                header=None,
                names=["chrom", "size"],
            )
            chromosomesList = list(chromosomesFrame["chrom"])
        else:
            raise ValueError(
                "No chromosomes provided in the configuration and no chromosome sizes file specified."
            )

    # Drop empty/whitespace-only names, then apply the exclusion list.
    chromosomesList = [
        chromName.strip()
        for chromName in chromosomesList
        if chromName and chromName.strip()
    ]
    if excludeChromsList:
        chromosomesList = [
            chromName
            for chromName in chromosomesList
            if chromName not in excludeChromsList
        ]
    if not chromosomesList:
        raise ValueError(
            "No valid chromosomes found after excluding specified chromosomes."
        )

    return core.genomeParams(
        genomeName=genomeLabel,
        chromSizesFile=chromSizesFile,
        blacklistFile=blacklistFile,
        sparseBedFile=sparseBedFile,
        chromosomes=chromosomesList,
        excludeChroms=excludeChromsList,
        excludeForNorm=excludeForNormList,
    )
388
+
389
+
390
def getCountingArgs(config_path: str) -> core.countingParams:
    r"""Build ``core.countingParams`` from the ``countingParams`` config section.

    Validates scale-factor lists (both must be lists when given; a single
    control factor is broadcast to match the treatment list) and resolves
    the mutually exclusive asinh/log transform flags.

    :param config_path: Path to the YAML configuration file (or a parsed
        mapping accepted by ``_loadConfig``).
    :return: Populated ``core.countingParams``.
    :raises ValueError: On malformed or mismatched scale-factor lists.
    """
    configData = _loadConfig(config_path)

    stepSize = _cfgGet(configData, "countingParams.stepSize", 25)
    scaleDownFlag = _cfgGet(
        configData, "countingParams.scaleDown", True
    )
    scaleFactorList = _cfgGet(
        configData, "countingParams.scaleFactors", None
    )
    numReads = _cfgGet(configData, "countingParams.numReads", 100)
    scaleFactorsControlList = _cfgGet(
        configData, "countingParams.scaleFactorsControl", None
    )
    applyAsinhFlag = _cfgGet(
        configData, "countingParams.applyAsinh", False
    )
    applyLogFlag = _cfgGet(
        configData, "countingParams.applyLog", False
    )

    # asinh and log transforms are mutually exclusive; asinh wins.
    if applyAsinhFlag and applyLogFlag:
        applyAsinhFlag = True
        applyLogFlag = False
        logger.warning(
            "Both `applyAsinh` and `applyLog` are set. Overriding `applyLog` to False."
        )

    rescaleToTreatmentCoverageFlag = _cfgGet(
        configData,
        "countingParams.rescaleToTreatmentCoverage",
        False,
    )

    if scaleFactorList is not None and not isinstance(
        scaleFactorList, list
    ):
        raise ValueError("`scaleFactors` should be a list of floats.")

    if scaleFactorsControlList is not None and not isinstance(
        scaleFactorsControlList, list
    ):
        raise ValueError(
            "`scaleFactorsControl` should be a list of floats."
        )

    # When both lists are supplied with mismatched lengths, a single
    # control factor is broadcast; any other mismatch is an error.
    if (
        scaleFactorList is not None
        and scaleFactorsControlList is not None
        and len(scaleFactorList) != len(scaleFactorsControlList)
    ):
        if len(scaleFactorsControlList) == 1:
            scaleFactorsControlList = scaleFactorsControlList * len(
                scaleFactorList
            )
        else:
            raise ValueError(
                "control and treatment scale factors: must be equal length or 1 control"
            )

    return core.countingParams(
        stepSize=stepSize,
        scaleDown=scaleDownFlag,
        scaleFactors=scaleFactorList,
        scaleFactorsControl=scaleFactorsControlList,
        numReads=numReads,
        applyAsinh=applyAsinhFlag,
        applyLog=applyLogFlag,
        rescaleToTreatmentCoverage=rescaleToTreatmentCoverageFlag,
    )
460
+
461
+
462
def readConfig(config_path: str) -> Dict[str, Any]:
    r"""Read and parse the configuration file for Consenrich.

    Delegates section parsing to the ``get*Args`` helpers, derives
    data-dependent defaults (minQ from minR, detrend settings from
    control presence, SAM paired-end flags from the input BAMs), and
    packs everything into one dictionary keyed by parameter group.

    :param config_path: Path to the YAML configuration file.
    :return: Dictionary containing all parsed configuration parameters.
    """
    configData = _loadConfig(config_path)

    # Parse each config section via its dedicated helper.
    inputParams = getInputArgs(config_path)
    outputParams = getOutputArgs(config_path)
    genomeParams = getGenomeArgs(config_path)
    countingParams = getCountingArgs(config_path)

    # Default process-noise floor scales inversely with sample count;
    # the +0.10 offset is for numerical conditioning.
    minRDefault = _getMinR(configData, len(inputParams.bamFiles))
    minQDefault = (
        minRDefault / len(inputParams.bamFiles)
    ) + 0.10  # conditioning

    # Matching excludes blacklisted regions by default.
    matchingExcludeRegionsFileDefault: Optional[str] = (
        genomeParams.blacklistFile
    )

    # Detrending defaults differ with/without control BAMs: a wider
    # window and lower polynomial degree when controls are present.
    if (
        inputParams.bamFilesControl is not None
        and len(inputParams.bamFilesControl) > 0
    ):
        detrendWindowLengthBp = _cfgGet(
            configData,
            "detrendParams.detrendWindowLengthBP",
            25_000,
        )
        detrendSavitzkyGolayDegree = _cfgGet(
            configData,
            "detrendParams.detrendSavitzkyGolayDegree",
            1,
        )
    else:
        detrendWindowLengthBp = _cfgGet(
            configData,
            "detrendParams.detrendWindowLengthBP",
            10_000,
        )
        detrendSavitzkyGolayDegree = _cfgGet(
            configData,
            "detrendParams.detrendSavitzkyGolayDegree",
            2,
        )

    experimentName = _cfgGet(
        configData, "experimentName", "consenrichExperiment"
    )

    # Process-model (state transition / noise) parameters.
    processArgs = core.processParams(
        deltaF=_cfgGet(configData, "processParams.deltaF", 0.5),
        minQ=_cfgGet(configData, "processParams.minQ", minQDefault),
        maxQ=_cfgGet(configData, "processParams.maxQ", 500.0),
        offDiagQ=_cfgGet(configData, "processParams.offDiagQ", 0.0),
        dStatAlpha=_cfgGet(
            configData, "processParams.dStatAlpha", 2.0
        ),
        dStatd=_cfgGet(configData, "processParams.dStatd", 1.0),
        dStatPC=_cfgGet(configData, "processParams.dStatPC", 1.0),
        scaleResidualsByP11=_cfgGet(
            configData,
            "processParams.scaleResidualsByP11",
            True,
        ),
    )

    # Observation-model (measurement noise) parameters; minR comes from
    # `_getMinR` above so the same value seeds minQDefault.
    observationArgs = core.observationParams(
        minR=minRDefault,
        maxR=_cfgGet(configData, "observationParams.maxR", 500.0),
        useALV=_cfgGet(configData, "observationParams.useALV", False),
        useConstantNoiseLevel=_cfgGet(
            configData,
            "observationParams.useConstantNoiseLevel",
            False,
        ),
        noGlobal=_cfgGet(
            configData, "observationParams.noGlobal", False
        ),
        numNearest=_cfgGet(
            configData, "observationParams.numNearest", 25
        ),
        localWeight=_cfgGet(
            configData, "observationParams.localWeight", 0.333
        ),
        globalWeight=_cfgGet(
            configData, "observationParams.globalWeight", 0.667
        ),
        approximationWindowLengthBP=_cfgGet(
            configData,
            "observationParams.approximationWindowLengthBP",
            10_000,
        ),
        lowPassWindowLengthBP=_cfgGet(
            configData,
            "observationParams.lowPassWindowLengthBP",
            20_000,
        ),
        lowPassFilterType=_cfgGet(
            configData,
            "observationParams.lowPassFilterType",
            "median",
        ),
        returnCenter=_cfgGet(
            configData, "observationParams.returnCenter", True
        ),
    )

    # Initial state estimate / covariance and state bounds.
    stateArgs = core.stateParams(
        stateInit=_cfgGet(configData, "stateParams.stateInit", 0.0),
        stateCovarInit=_cfgGet(
            configData, "stateParams.stateCovarInit", 100.0
        ),
        boundState=_cfgGet(
            configData, "stateParams.boundState", True
        ),
        stateLowerBound=_cfgGet(
            configData, "stateParams.stateLowerBound", 0.0
        ),
        stateUpperBound=_cfgGet(
            configData, "stateParams.stateUpperBound", 10000.0
        ),
    )

    # SAM/BAM access parameters. 3844 excludes unmapped, secondary,
    # QC-fail, duplicate, and supplementary alignments.
    samThreads = _cfgGet(configData, "samParams.samThreads", 1)
    samFlagExclude = _cfgGet(
        configData, "samParams.samFlagExclude", 3844
    )
    oneReadPerBin = _cfgGet(configData, "samParams.oneReadPerBin", 0)
    chunkSize = _cfgGet(configData, "samParams.chunkSize", 1_000_000)
    offsetStr = _cfgGet(configData, "samParams.offsetStr", "0,0")
    extendBpList = _cfgGet(configData, "samParams.extendBP", [])
    maxInsertSize = _cfgGet(
        configData, "samParams.maxInsertSize", 1000
    )

    # Paired-end reads use pair-aware counting; single-end reads get
    # fragment-length inference instead (the two defaults are opposites).
    pairedEndDefault = (
        1
        if inputParams.pairedEnd is not None
        and int(inputParams.pairedEnd) > 0
        else 0
    )
    inferFragmentDefault = (
        1
        if inputParams.pairedEnd is not None
        and int(inputParams.pairedEnd) == 0
        else 0
    )

    samArgs = core.samParams(
        samThreads=samThreads,
        samFlagExclude=samFlagExclude,
        oneReadPerBin=oneReadPerBin,
        chunkSize=chunkSize,
        offsetStr=offsetStr,
        extendBP=extendBpList,
        maxInsertSize=maxInsertSize,
        pairedEndMode=_cfgGet(
            configData,
            "samParams.pairedEndMode",
            pairedEndDefault,
        ),
        inferFragmentLength=_cfgGet(
            configData,
            "samParams.inferFragmentLength",
            inferFragmentDefault,
        ),
        countEndsOnly=_cfgGet(
            configData, "samParams.countEndsOnly", False
        ),
    )

    # Detrending parameters (window/degree resolved above).
    detrendArgs = core.detrendParams(
        detrendWindowLengthBP=detrendWindowLengthBp,
        detrendTrackPercentile=_cfgGet(
            configData,
            "detrendParams.detrendTrackPercentile",
            75,
        ),
        usePolyFilter=_cfgGet(
            configData, "detrendParams.usePolyFilter", False
        ),
        detrendSavitzkyGolayDegree=detrendSavitzkyGolayDegree,
        useOrderStatFilter=_cfgGet(
            configData, "detrendParams.useOrderStatFilter", True
        ),
    )

    # Template-matching parameters; matching is only active when
    # templateNames and cascadeLevels are non-empty (see
    # `checkMatchingEnabled`).
    matchingArgs = core.matchingParams(
        templateNames=_cfgGet(
            configData, "matchingParams.templateNames", []
        ),
        cascadeLevels=_cfgGet(
            configData, "matchingParams.cascadeLevels", []
        ),
        iters=_cfgGet(configData, "matchingParams.iters", 25_000),
        alpha=_cfgGet(configData, "matchingParams.alpha", 0.05),
        minMatchLengthBP=_cfgGet(
            configData,
            "matchingParams.minMatchLengthBP",
            250,
        ),
        maxNumMatches=_cfgGet(
            configData,
            "matchingParams.maxNumMatches",
            100_000,
        ),
        minSignalAtMaxima=_cfgGet(
            configData,
            "matchingParams.minSignalAtMaxima",
            "q:0.75",
        ),
        merge=_cfgGet(configData, "matchingParams.merge", True),
        mergeGapBP=_cfgGet(
            configData, "matchingParams.mergeGapBP", None
        ),
        useScalingFunction=_cfgGet(
            configData,
            "matchingParams.useScalingFunction",
            True,
        ),
        excludeRegionsBedFile=_cfgGet(
            configData,
            "matchingParams.excludeRegionsBedFile",
            matchingExcludeRegionsFileDefault,
        ),
        randSeed=_cfgGet(configData, "matchingParams.randSeed", 42),
        penalizeBy=_cfgGet(
            configData, "matchingParams.penalizeBy", None
        ),
        eps=_cfgGet(configData, "matchingParams.eps", 1.0e-2),
    )

    return {
        "experimentName": experimentName,
        "genomeArgs": genomeParams,
        "inputArgs": inputParams,
        "outputArgs": outputParams,
        "countingArgs": countingParams,
        "processArgs": processArgs,
        "observationArgs": observationArgs,
        "stateArgs": stateArgs,
        "samArgs": samArgs,
        "detrendArgs": detrendArgs,
        "matchingArgs": matchingArgs,
    }
710
+
711
+
712
def convertBedGraphToBigWig(experimentName, chromSizesFile,
                            suffixes: Optional[List[str]] = None):
    r"""Convert Consenrich bedGraph outputs to bigWig via UCSC `bedGraphToBigWig`.

    Looks for ``consenrichOutput_{experimentName}_{suffix}.bedGraph`` for each
    suffix and writes ``{experimentName}_consenrich_{suffix}.bw``. All failures
    are logged as warnings (best effort) rather than raised.

    :param experimentName: Experiment label used in the bedGraph/bigWig names.
    :param chromSizesFile: Two-column chromosome sizes file required by the
        UCSC utility.
    :param suffixes: Track suffixes to convert; defaults to ``["state"]``.
    """
    if suffixes is None:
        # at least look for `state` bedGraph
        suffixes = ["state"]

    # NOTE: each literal ends with a space so the concatenated message reads
    # as separate sentences (previously they ran together).
    warningMessage = (
        "Could not find UCSC bedGraphToBigWig binary utility. "
        "If you need bigWig files instead of the default, human-readable bedGraph files, "
        "you can download the `bedGraphToBigWig` binary from https://hgdownload.soe.ucsc.edu/admin/exe/<operatingSystem, architecture> "
        "OR install via conda (conda install -c bioconda ucsc-bedgraphtobigwig)."
    )

    logger.info(
        "Attempting to generate bigWig files from bedGraph format..."
    )
    try:
        path_ = shutil.which("bedGraphToBigWig")
    except Exception:
        logger.warning(f"\n{warningMessage}\n")
        return
    # shutil.which returns None when the binary is absent.
    if not path_:
        logger.warning(f"\n{warningMessage}\n")
        return
    logger.info(f"Using bedGraphToBigWig from {path_}")
    for suffix in suffixes:
        bedgraph = (
            f"consenrichOutput_{experimentName}_{suffix}.bedGraph"
        )
        if not os.path.exists(bedgraph):
            logger.warning(
                f"bedGraph file {bedgraph} does not exist. Skipping bigWig conversion."
            )
            continue
        if not os.path.exists(chromSizesFile):
            # Sizes file is shared by every conversion: give up entirely.
            logger.warning(
                f"{chromSizesFile} does not exist. Skipping bigWig conversion."
            )
            return
        bigwig = f"{experimentName}_consenrich_{suffix}.bw"
        logger.info(f"Start: {bedgraph} --> {bigwig}...")
        try:
            subprocess.run(
                [path_, bedgraph, chromSizesFile, bigwig], check=True
            )
        except Exception as e:
            logger.warning(
                f"bedGraph-->bigWig conversion with\n\n\t`bedGraphToBigWig {bedgraph} {chromSizesFile} {bigwig}`\nraised: \n{e}\n\n"
            )
            continue
        # >100 bytes is a cheap sanity check that the output is not an
        # empty/stub file before declaring success.
        if os.path.exists(bigwig) and os.path.getsize(bigwig) > 100:
            logger.info(
                f"Finished: converted {bedgraph} to {bigwig}."
            )
767
+
768
+
769
+ def main():
770
+ parser = argparse.ArgumentParser(description="Consenrich CLI")
771
+ parser.add_argument(
772
+ "--config",
773
+ type=str,
774
+ dest="config",
775
+ help="Path to a YAML config file with parameters + arguments defined in `consenrich.core`",
776
+ )
777
+
778
+ # --- Matching-specific command-line arguments ---
779
+ parser.add_argument(
780
+ "--match-bedGraph",
781
+ type=str,
782
+ dest="matchBedGraph",
783
+ help="Path to a bedGraph file of Consenrich estimates to match templates against.\
784
+ If provided, *only* the matching algorithm is run (no other processing). Note that \
785
+ some features in `consenrich.matching` may not be supported through this CLI interface.",
786
+ )
787
+ parser.add_argument(
788
+ "--match-template",
789
+ type=str,
790
+ default="haar",
791
+ choices=[
792
+ x
793
+ for x in pywt.wavelist(kind="discrete")
794
+ if "bio" not in x
795
+ ],
796
+ dest="matchTemplate",
797
+ )
798
+ parser.add_argument(
799
+ "--match-level", type=int, default=2, dest="matchLevel"
800
+ )
801
+ parser.add_argument(
802
+ "--match-alpha", type=float, default=0.05, dest="matchAlpha"
803
+ )
804
+ parser.add_argument(
805
+ "--match-min-length",
806
+ type=int,
807
+ default=250,
808
+ dest="matchMinMatchLengthBP",
809
+ )
810
+ parser.add_argument(
811
+ "--match-iters", type=int, default=25000, dest="matchIters"
812
+ )
813
+ parser.add_argument(
814
+ "--match-min-signal",
815
+ type=str,
816
+ default="q:0.75",
817
+ dest="matchMinSignalAtMaxima",
818
+ )
819
+ parser.add_argument(
820
+ "--match-max-matches",
821
+ type=int,
822
+ default=100000,
823
+ dest="matchMaxNumMatches",
824
+ )
825
+ parser.add_argument(
826
+ "--match-no-merge", action="store_true", dest="matchNoMerge"
827
+ )
828
+ parser.add_argument(
829
+ "--match-merge-gap",
830
+ type=int,
831
+ default=None,
832
+ dest="matchMergeGapBP",
833
+ )
834
+ parser.add_argument(
835
+ "--match-use-wavelet",
836
+ action="store_true",
837
+ dest="matchUseWavelet",
838
+ )
839
+ parser.add_argument(
840
+ "--match-seed", type=int, default=42, dest="matchRandSeed"
841
+ )
842
+ parser.add_argument(
843
+ "--match-exclude-bed",
844
+ type=str,
845
+ default=None,
846
+ dest="matchExcludeBed",
847
+ )
848
+ parser.add_argument(
849
+ "--verbose", action="store_true", help="If set, logs config"
850
+ )
851
+ args = parser.parse_args()
852
+
853
+ if args.matchBedGraph:
854
+ if not os.path.exists(args.matchBedGraph):
855
+ raise FileNotFoundError(
856
+ f"bedGraph file {args.matchBedGraph} couldn't be found."
857
+ )
858
+ logger.info(
859
+ f"Running matching algorithm using bedGraph file {args.matchBedGraph}..."
860
+ )
861
+
862
+ outName = matching.matchExistingBedGraph(
863
+ args.matchBedGraph,
864
+ args.matchTemplate,
865
+ args.matchLevel,
866
+ alpha=args.matchAlpha,
867
+ minMatchLengthBP=args.matchMinMatchLengthBP,
868
+ iters=args.matchIters,
869
+ minSignalAtMaxima=args.matchMinSignalAtMaxima,
870
+ maxNumMatches=args.matchMaxNumMatches,
871
+ useScalingFunction=(not args.matchUseWavelet),
872
+ merge=(not args.matchNoMerge),
873
+ mergeGapBP=args.matchMergeGapBP,
874
+ excludeRegionsBedFile=args.matchExcludeBed,
875
+ randSeed=args.matchRandSeed,
876
+ )
877
+ logger.info(f"Finished matching. Written to {outName}")
878
+ sys.exit(0)
879
+
880
+ if args.matchBedGraph:
881
+ # this shouldn't happen, but just in case -- matching on previous bedGraph means no other processing
882
+ logger.info(
883
+ "If `--match-bedgraph <path_to_bedgraph>` is provided, only the matching algorithm is run."
884
+ )
885
+ sys.exit(0)
886
+
887
+ if not args.config:
888
+ logger.info(
889
+ "No config file provided, run with `--config <path_to_config.yaml>`"
890
+ )
891
+ logger.info(
892
+ "See documentation: https://nolan-h-hamilton.github.io/Consenrich/"
893
+ )
894
+ sys.exit(1)
895
+
896
+ if not os.path.exists(args.config):
897
+ logger.info(f"Config file {args.config} does not exist.")
898
+ logger.info(
899
+ "See documentation: https://nolan-h-hamilton.github.io/Consenrich/"
900
+ )
901
+ sys.exit(1)
902
+
903
+ config = readConfig(args.config)
904
+ experimentName = config["experimentName"]
905
+ genomeArgs = config["genomeArgs"]
906
+ inputArgs = config["inputArgs"]
907
+ outputArgs = config["outputArgs"]
908
+ countingArgs = config["countingArgs"]
909
+ processArgs = config["processArgs"]
910
+ observationArgs = config["observationArgs"]
911
+ stateArgs = config["stateArgs"]
912
+ samArgs = config["samArgs"]
913
+ detrendArgs = config["detrendArgs"]
914
+ matchingArgs = config["matchingArgs"]
915
+ bamFiles = inputArgs.bamFiles
916
+ bamFilesControl = inputArgs.bamFilesControl
917
+ numSamples = len(bamFiles)
918
+ numNearest = observationArgs.numNearest
919
+ stepSize = countingArgs.stepSize
920
+ excludeForNorm = genomeArgs.excludeForNorm
921
+ chromSizes = genomeArgs.chromSizesFile
922
+ scaleDown = countingArgs.scaleDown
923
+ extendBP_ = core.resolveExtendBP(samArgs.extendBP, bamFiles)
924
+ initialTreatmentScaleFactors = []
925
+ minMatchLengthBP_: Optional[int] = matchingArgs.minMatchLengthBP
926
+ mergeGapBP_: Optional[int] = matchingArgs.mergeGapBP
927
+
928
+ if args.verbose:
929
+ try:
930
+ logger.info("Configuration:\n")
931
+ config_truncated = {
932
+ k: v
933
+ for k, v in config.items()
934
+ if k
935
+ not in ["inputArgs", "genomeArgs", "countingArgs"]
936
+ }
937
+ config_truncated["experimentName"] = experimentName
938
+ config_truncated["inputArgs"] = inputArgs
939
+ config_truncated["outputArgs"] = outputArgs
940
+ config_truncated["genomeArgs"] = genomeArgs
941
+ config_truncated["countingArgs"] = countingArgs
942
+ config_truncated["processArgs"] = processArgs
943
+ config_truncated["observationArgs"] = observationArgs
944
+ config_truncated["stateArgs"] = stateArgs
945
+ config_truncated["samArgs"] = samArgs
946
+ config_truncated["detrendArgs"] = detrendArgs
947
+ pprint.pprint(config_truncated, indent=8)
948
+ except Exception as e:
949
+ logger.warning(f"Failed to print parsed config:\n{e}\n")
950
+
951
+ controlsPresent = checkControlsPresent(inputArgs)
952
+ if args.verbose:
953
+ logger.info(f"controlsPresent: {controlsPresent}")
954
+ readLengthsBamFiles = getReadLengths(
955
+ inputArgs, countingArgs, samArgs
956
+ )
957
+ effectiveGenomeSizes = getEffectiveGenomeSizes(
958
+ genomeArgs, readLengthsBamFiles
959
+ )
960
+ matchingEnabled = checkMatchingEnabled(matchingArgs)
961
+ if args.verbose:
962
+ logger.info(f"matchingEnabled: {matchingEnabled}")
963
+ scaleFactors = countingArgs.scaleFactors
964
+ scaleFactorsControl = countingArgs.scaleFactorsControl
965
+
966
+ if controlsPresent:
967
+ readLengthsControlBamFiles = [
968
+ core.getReadLength(
969
+ bamFile,
970
+ countingArgs.numReads,
971
+ 1000,
972
+ samArgs.samThreads,
973
+ samArgs.samFlagExclude,
974
+ )
975
+ for bamFile in bamFilesControl
976
+ ]
977
+ effectiveGenomeSizesControl = [
978
+ constants.getEffectiveGenomeSize(
979
+ genomeArgs.genomeName, readLength
980
+ )
981
+ for readLength in readLengthsControlBamFiles
982
+ ]
983
+
984
+ if (
985
+ scaleFactors is not None
986
+ and scaleFactorsControl is not None
987
+ ):
988
+ treatScaleFactors = scaleFactors
989
+ controlScaleFactors = scaleFactorsControl
990
+ # still make sure this is accessible
991
+ initialTreatmentScaleFactors = [1.0] * len(bamFiles)
992
+ else:
993
+ try:
994
+ initialTreatmentScaleFactors = [
995
+ detrorm.getScaleFactor1x(
996
+ bamFile,
997
+ effectiveGenomeSize,
998
+ readLength,
999
+ genomeArgs.excludeChroms,
1000
+ genomeArgs.chromSizesFile,
1001
+ samArgs.samThreads,
1002
+ )
1003
+ for bamFile, effectiveGenomeSize, readLength in zip(
1004
+ bamFiles,
1005
+ effectiveGenomeSizes,
1006
+ readLengthsBamFiles,
1007
+ )
1008
+ ]
1009
+ except Exception:
1010
+ initialTreatmentScaleFactors = [1.0] * len(bamFiles)
1011
+
1012
+ pairScalingFactors = [
1013
+ detrorm.getPairScaleFactors(
1014
+ bamFileA,
1015
+ bamFileB,
1016
+ effectiveGenomeSizeA,
1017
+ effectiveGenomeSizeB,
1018
+ readLengthA,
1019
+ readLengthB,
1020
+ excludeForNorm,
1021
+ chromSizes,
1022
+ samArgs.samThreads,
1023
+ scaleDown,
1024
+ )
1025
+ for bamFileA, bamFileB, effectiveGenomeSizeA, effectiveGenomeSizeB, readLengthA, readLengthB in zip(
1026
+ bamFiles,
1027
+ bamFilesControl,
1028
+ effectiveGenomeSizes,
1029
+ effectiveGenomeSizesControl,
1030
+ readLengthsBamFiles,
1031
+ readLengthsControlBamFiles,
1032
+ )
1033
+ ]
1034
+
1035
+ treatScaleFactors = []
1036
+ controlScaleFactors = []
1037
+ for scaleFactorA, scaleFactorB in pairScalingFactors:
1038
+ treatScaleFactors.append(scaleFactorA)
1039
+ controlScaleFactors.append(scaleFactorB)
1040
+
1041
+ else:
1042
+ treatScaleFactors = scaleFactors
1043
+ controlScaleFactors = scaleFactorsControl
1044
+
1045
+ if scaleFactors is None and not controlsPresent:
1046
+ scaleFactors = [
1047
+ detrorm.getScaleFactor1x(
1048
+ bamFile,
1049
+ effectiveGenomeSize,
1050
+ readLength,
1051
+ genomeArgs.excludeChroms,
1052
+ genomeArgs.chromSizesFile,
1053
+ samArgs.samThreads,
1054
+ )
1055
+ for bamFile, effectiveGenomeSize, readLength in zip(
1056
+ bamFiles, effectiveGenomeSizes, readLengthsBamFiles
1057
+ )
1058
+ ]
1059
+ chromSizesDict = misc_util.getChromSizesDict(
1060
+ genomeArgs.chromSizesFile,
1061
+ excludeChroms=genomeArgs.excludeChroms,
1062
+ )
1063
+ chromosomes = genomeArgs.chromosomes
1064
+
1065
+ for c_, chromosome in enumerate(chromosomes):
1066
+ chromosomeStart, chromosomeEnd = core.getChromRangesJoint(
1067
+ bamFiles,
1068
+ chromosome,
1069
+ chromSizesDict[chromosome],
1070
+ samArgs.samThreads,
1071
+ samArgs.samFlagExclude,
1072
+ )
1073
+ chromosomeStart = max(
1074
+ 0, (chromosomeStart - (chromosomeStart % stepSize))
1075
+ )
1076
+ chromosomeEnd = max(
1077
+ 0, (chromosomeEnd - (chromosomeEnd % stepSize))
1078
+ )
1079
+ numIntervals = (
1080
+ ((chromosomeEnd - chromosomeStart) + stepSize) - 1
1081
+ ) // stepSize
1082
+ intervals = np.arange(
1083
+ chromosomeStart, chromosomeEnd, stepSize
1084
+ )
1085
+ chromMat: np.ndarray = np.empty(
1086
+ (numSamples, numIntervals), dtype=np.float32
1087
+ )
1088
+ if controlsPresent:
1089
+ j_: int = 0
1090
+ for bamA, bamB in zip(bamFiles, bamFilesControl):
1091
+ logger.info(
1092
+ f"Counting (trt,ctrl) for {chromosome}: ({bamA}, {bamB})"
1093
+ )
1094
+ pairMatrix: np.ndarray = core.readBamSegments(
1095
+ [bamA, bamB],
1096
+ chromosome,
1097
+ chromosomeStart,
1098
+ chromosomeEnd,
1099
+ stepSize,
1100
+ [
1101
+ readLengthsBamFiles[j_],
1102
+ readLengthsControlBamFiles[j_],
1103
+ ],
1104
+ [treatScaleFactors[j_], controlScaleFactors[j_]],
1105
+ samArgs.oneReadPerBin,
1106
+ samArgs.samThreads,
1107
+ samArgs.samFlagExclude,
1108
+ offsetStr=samArgs.offsetStr,
1109
+ extendBP=extendBP_[j_],
1110
+ maxInsertSize=samArgs.maxInsertSize,
1111
+ pairedEndMode=samArgs.pairedEndMode,
1112
+ inferFragmentLength=samArgs.inferFragmentLength,
1113
+ applyAsinh=countingArgs.applyAsinh,
1114
+ applyLog=countingArgs.applyLog,
1115
+ countEndsOnly=samArgs.countEndsOnly,
1116
+ )
1117
+
1118
+ chromMat[j_, :] = (
1119
+ pairMatrix[0, :] - pairMatrix[1, :]
1120
+ )
1121
+ j_ += 1
1122
+ else:
1123
+ chromMat = core.readBamSegments(
1124
+ bamFiles,
1125
+ chromosome,
1126
+ chromosomeStart,
1127
+ chromosomeEnd,
1128
+ stepSize,
1129
+ readLengthsBamFiles,
1130
+ scaleFactors,
1131
+ samArgs.oneReadPerBin,
1132
+ samArgs.samThreads,
1133
+ samArgs.samFlagExclude,
1134
+ offsetStr=samArgs.offsetStr,
1135
+ extendBP=extendBP_,
1136
+ maxInsertSize=samArgs.maxInsertSize,
1137
+ pairedEndMode=samArgs.pairedEndMode,
1138
+ inferFragmentLength=samArgs.inferFragmentLength,
1139
+ applyAsinh=countingArgs.applyAsinh,
1140
+ applyLog=countingArgs.applyLog,
1141
+ countEndsOnly=samArgs.countEndsOnly,
1142
+ )
1143
+ sparseMap = None
1144
+ if genomeArgs.sparseBedFile and not observationArgs.useALV:
1145
+ logger.info(
1146
+ f"Building sparse mapping for {chromosome}..."
1147
+ )
1148
+ sparseMap = core.getSparseMap(
1149
+ chromosome,
1150
+ intervals,
1151
+ numNearest,
1152
+ genomeArgs.sparseBedFile,
1153
+ )
1154
+
1155
+ muncMat = np.empty_like(chromMat, dtype=np.float32)
1156
+ for j in range(numSamples):
1157
+ logger.info(
1158
+ f"Muncing {j + 1}/{numSamples} for {chromosome}..."
1159
+ )
1160
+ muncMat[j, :] = core.getMuncTrack(
1161
+ chromosome,
1162
+ intervals,
1163
+ stepSize,
1164
+ chromMat[j, :],
1165
+ observationArgs.minR,
1166
+ observationArgs.maxR,
1167
+ observationArgs.useALV,
1168
+ observationArgs.useConstantNoiseLevel,
1169
+ observationArgs.noGlobal,
1170
+ observationArgs.localWeight,
1171
+ observationArgs.globalWeight,
1172
+ observationArgs.approximationWindowLengthBP,
1173
+ observationArgs.lowPassWindowLengthBP,
1174
+ observationArgs.returnCenter,
1175
+ sparseMap=sparseMap,
1176
+ lowPassFilterType=observationArgs.lowPassFilterType,
1177
+ )
1178
+ chromMat[j, :] = detrorm.detrendTrack(
1179
+ chromMat[j, :],
1180
+ stepSize,
1181
+ detrendArgs.detrendWindowLengthBP,
1182
+ detrendArgs.useOrderStatFilter,
1183
+ detrendArgs.usePolyFilter,
1184
+ detrendArgs.detrendTrackPercentile,
1185
+ detrendArgs.detrendSavitzkyGolayDegree,
1186
+ )
1187
+ logger.info(f">>>Running consenrich: {chromosome}<<<")
1188
+
1189
+ x, P, y = core.runConsenrich(
1190
+ chromMat,
1191
+ muncMat,
1192
+ processArgs.deltaF,
1193
+ processArgs.minQ,
1194
+ processArgs.maxQ,
1195
+ processArgs.offDiagQ,
1196
+ processArgs.dStatAlpha,
1197
+ processArgs.dStatd,
1198
+ processArgs.dStatPC,
1199
+ stateArgs.stateInit,
1200
+ stateArgs.stateCovarInit,
1201
+ stateArgs.boundState,
1202
+ stateArgs.stateLowerBound,
1203
+ stateArgs.stateUpperBound,
1204
+ samArgs.chunkSize,
1205
+ progressIter=50_000,
1206
+ )
1207
+ logger.info("Done.")
1208
+
1209
+ x_ = core.getPrimaryState(x)
1210
+ y_ = core.getPrecisionWeightedResidual(
1211
+ y,
1212
+ muncMat,
1213
+ stateCovarSmoothed=P
1214
+ if processArgs.scaleResidualsByP11 is not None
1215
+ and processArgs.scaleResidualsByP11
1216
+ else None,
1217
+ )
1218
+
1219
+ weights_: Optional[np.ndarray] = None
1220
+ if matchingArgs.penalizeBy is not None:
1221
+ if matchingArgs.penalizeBy == "absResiduals":
1222
+ try:
1223
+ weights_ = np.abs(y_)
1224
+ except Exception as e:
1225
+ logger.warning(
1226
+ f"Error computing weights for 'absResiduals': {e}. No weights applied for matching."
1227
+ )
1228
+ weights_ = None
1229
+ elif matchingArgs.penalizeBy == "stateUncertainty" or matchingArgs.penalizeBy == "stateStdDev":
1230
+ try:
1231
+ weights_ = np.sqrt(P[:, 0, 0])
1232
+ except Exception as e:
1233
+ logger.warning(
1234
+ f"Error computing weights for 'stateUncertainty': {e}. No weights applied for matching."
1235
+ )
1236
+ weights_ = None
1237
+ elif matchingArgs.penalizeBy == "muncTrace":
1238
+ try:
1239
+ weights_ = np.sqrt(
1240
+ np.mean(muncMat.astype(np.float64), axis=0)
1241
+ )
1242
+ except Exception as e:
1243
+ logger.warning(
1244
+ f"Error computing weights for 'muncTrace': {e}. No weights applied for matching."
1245
+ )
1246
+ weights_ = None
1247
+ else:
1248
+ logger.warning(
1249
+ f"Unrecognized `matchingParams.penalizeBy`: {matchingArgs.penalizeBy}. No weights applied."
1250
+ )
1251
+ weights_ = None
1252
+
1253
+ df = pd.DataFrame(
1254
+ {
1255
+ "Chromosome": chromosome,
1256
+ "Start": intervals,
1257
+ "End": intervals + stepSize,
1258
+ "State": x_,
1259
+ }
1260
+ )
1261
+
1262
+ if outputArgs.writeResiduals:
1263
+ df["Res"] = y_.astype(np.float32) # FFR: cast necessary?
1264
+ if outputArgs.writeRawResiduals:
1265
+ df["RawRes"] = np.mean(y, axis=1).astype(np.float32)
1266
+ if outputArgs.writeMuncTrace:
1267
+ munc_std = np.sqrt(
1268
+ np.mean(muncMat.astype(np.float64), axis=0)
1269
+ ).astype(np.float32)
1270
+ df["Munc"] = munc_std
1271
+ if outputArgs.writeStateStd:
1272
+ df["StateStd"] = np.sqrt(P[:, 0, 0]).astype(np.float32)
1273
+ cols_ = ["Chromosome", "Start", "End", "State"]
1274
+ if outputArgs.writeResiduals:
1275
+ cols_.append("Res")
1276
+ if outputArgs.writeMuncTrace:
1277
+ cols_.append("Munc")
1278
+ if outputArgs.writeStateStd:
1279
+ cols_.append("StateStd")
1280
+ if outputArgs.writeRawResiduals:
1281
+ cols_.append("RawRes")
1282
+ df = df[cols_]
1283
+ suffixes = ['state']
1284
+ if outputArgs.writeResiduals:
1285
+ suffixes.append('residuals')
1286
+ if outputArgs.writeMuncTrace:
1287
+ suffixes.append('muncTraces')
1288
+ if outputArgs.writeStateStd:
1289
+ suffixes.append('stdDevs')
1290
+ if outputArgs.writeRawResiduals:
1291
+ suffixes.append('rawResiduals')
1292
+
1293
+ if (c_ == 0 and len(chromosomes) > 1) or (len(chromosomes) == 1):
1294
+ for file_ in os.listdir("."):
1295
+ if file_.startswith(
1296
+ f"consenrichOutput_{experimentName}"
1297
+ ) and (
1298
+ file_.endswith(".bedGraph")
1299
+ or file_.endswith(".narrowPeak")
1300
+ ):
1301
+ logger.warning(f"Overwriting: {file_}")
1302
+ os.remove(file_)
1303
+
1304
+ for col, suffix in zip(cols_[3:], suffixes):
1305
+ logger.info(
1306
+ f"{chromosome}: writing/appending to: consenrichOutput_{experimentName}_{suffix}.bedGraph"
1307
+ )
1308
+ df[["Chromosome", "Start", "End", col]].to_csv(
1309
+ f"consenrichOutput_{experimentName}_{suffix}.bedGraph",
1310
+ sep="\t",
1311
+ header=False,
1312
+ index=False,
1313
+ mode="a",
1314
+ float_format="%.3f",
1315
+ lineterminator="\n",
1316
+ )
1317
+ try:
1318
+ if matchingEnabled:
1319
+ if (
1320
+ minMatchLengthBP_ is None
1321
+ or minMatchLengthBP_ <= 0
1322
+ ):
1323
+ minMatchLengthBP_ = (
1324
+ matching.autoMinLengthIntervals(x_)
1325
+ * (intervals[1] - intervals[0])
1326
+ )
1327
+
1328
+ if mergeGapBP_ is None:
1329
+ mergeGapBP_ = int(minMatchLengthBP_ / 2) + 1
1330
+
1331
+ matchingDF = matching.matchWavelet(
1332
+ chromosome,
1333
+ intervals,
1334
+ x_,
1335
+ matchingArgs.templateNames,
1336
+ matchingArgs.cascadeLevels,
1337
+ matchingArgs.iters,
1338
+ matchingArgs.alpha,
1339
+ minMatchLengthBP_,
1340
+ matchingArgs.maxNumMatches,
1341
+ matchingArgs.minSignalAtMaxima,
1342
+ useScalingFunction=matchingArgs.useScalingFunction,
1343
+ excludeRegionsBedFile=matchingArgs.excludeRegionsBedFile,
1344
+ randSeed=matchingArgs.randSeed,
1345
+ weights=weights_,
1346
+ )
1347
+ if not matchingDF.empty:
1348
+ matchingDF.to_csv(
1349
+ f"consenrichOutput_{experimentName}_matches.narrowPeak",
1350
+ sep="\t",
1351
+ header=False,
1352
+ index=False,
1353
+ mode="a",
1354
+ float_format=f"%.{outputArgs.roundDigits}f",
1355
+ lineterminator="\n",
1356
+ )
1357
+ except Exception as e:
1358
+ logger.warning(
1359
+ f"Matching routine unsuccessful for {chromosome}...SKIPPING:\n{e}\n\n"
1360
+ )
1361
+ continue
1362
+ logger.info("Finished: output in human-readable format")
1363
+
1364
+ if outputArgs.convertToBigWig:
1365
+ convertBedGraphToBigWig(experimentName, genomeArgs.chromSizesFile, suffixes=suffixes)
1366
+
1367
+ if matchingEnabled and matchingArgs.merge:
1368
+ try:
1369
+ mergeGapBP_ = matchingArgs.mergeGapBP
1370
+ if mergeGapBP_ is None or mergeGapBP_ <= 0:
1371
+ mergeGapBP_ = (
1372
+ int(minMatchLengthBP_ / 2) + 1
1373
+ if minMatchLengthBP_ is not None
1374
+ and minMatchLengthBP_ >= 0
1375
+ else 75
1376
+ )
1377
+ matching.mergeMatches(
1378
+ f"consenrichOutput_{experimentName}_matches.narrowPeak",
1379
+ mergeGapBP=mergeGapBP_,
1380
+ )
1381
+
1382
+ except Exception as e:
1383
+ logger.warning(
1384
+ f"Failed to merge matches...SKIPPING:\n{e}\n\n"
1385
+ )
1386
+ logger.info("Done.")
1387
+
1388
+
1389
# Script entry point: run the Consenrich CLI driver when this module is
# executed directly (e.g. `python -m consenrich ...`); importing the module
# leaves `main()` uncalled.
if __name__ == "__main__":
    main()