consenrich 0.7.11b2__cp314-cp314-macosx_15_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of consenrich might be problematic. Click here for more details.

Files changed (38) hide show
  1. consenrich/.dylibs/libomp.dylib +0 -0
  2. consenrich/__init__.py +11 -0
  3. consenrich/cconsenrich.c +50610 -0
  4. consenrich/cconsenrich.cpython-314-darwin.so +0 -0
  5. consenrich/cconsenrich.pyx +1065 -0
  6. consenrich/consenrich.py +1802 -0
  7. consenrich/constants.py +172 -0
  8. consenrich/core.py +2068 -0
  9. consenrich/data/ce10.sizes +6 -0
  10. consenrich/data/ce10_blacklist.bed +100 -0
  11. consenrich/data/ce10_sparse.bed +11828 -0
  12. consenrich/data/ce11.sizes +6 -0
  13. consenrich/data/ce11_blacklist.bed +97 -0
  14. consenrich/data/ce11_sparse.bed +11828 -0
  15. consenrich/data/dm6.sizes +7 -0
  16. consenrich/data/dm6_blacklist.bed +182 -0
  17. consenrich/data/dm6_sparse.bed +20000 -0
  18. consenrich/data/hg19.sizes +24 -0
  19. consenrich/data/hg19_blacklist.bed +834 -0
  20. consenrich/data/hg19_sparse.bed +288358 -0
  21. consenrich/data/hg38.sizes +24 -0
  22. consenrich/data/hg38_blacklist.bed +636 -0
  23. consenrich/data/hg38_sparse.bed +288699 -0
  24. consenrich/data/mm10.sizes +21 -0
  25. consenrich/data/mm10_blacklist.bed +3435 -0
  26. consenrich/data/mm10_sparse.bed +100400 -0
  27. consenrich/data/mm39.sizes +21 -0
  28. consenrich/data/mm39_blacklist.bed +3360 -0
  29. consenrich/data/mm39_sparse.bed +100381 -0
  30. consenrich/detrorm.py +297 -0
  31. consenrich/matching.py +929 -0
  32. consenrich/misc_util.py +122 -0
  33. consenrich-0.7.11b2.dist-info/METADATA +66 -0
  34. consenrich-0.7.11b2.dist-info/RECORD +38 -0
  35. consenrich-0.7.11b2.dist-info/WHEEL +6 -0
  36. consenrich-0.7.11b2.dist-info/entry_points.txt +2 -0
  37. consenrich-0.7.11b2.dist-info/licenses/LICENSE +21 -0
  38. consenrich-0.7.11b2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1802 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import glob
6
+ import logging
7
+ import pprint
8
+ import os
9
+ from pathlib import Path
10
+ from collections.abc import Mapping
11
+ from typing import List, Optional, Tuple, Dict, Any, Union, Sequence
12
+ import shutil
13
+ import subprocess
14
+ import sys
15
+ import numpy as np
16
+ import pandas as pd
17
+ import pysam
18
+ import pywt
19
+ import yaml
20
+
21
+ import consenrich.core as core
22
+ import consenrich.misc_util as misc_util
23
+ import consenrich.constants as constants
24
+ import consenrich.detrorm as detrorm
25
+ import consenrich.matching as matching
26
+ import consenrich.cconsenrich as cconsenrich
27
+
28
+
29
# Configure root logging once at import time so all consenrich modules share
# a uniform "<time> - <module.func> - <level> - <message>" record format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(module)s.%(funcName)s - %(levelname)s - %(message)s",
)

# Module-level logger used throughout this file.
logger = logging.getLogger(__name__)
35
+
36
+
37
+ def _resolveFragmentLengthPairs(
38
+ treatmentFragmentLengths: Optional[Sequence[Union[int, float]]],
39
+ controlFragmentLengths: Optional[Sequence[Union[int, float]]],
40
+ ) -> Tuple[List[int], List[int]]:
41
+ r"""Assign consistent fragment length estimates to treatment and control BAM files.
42
+
43
+ For single-end data, cross-correlation-based fragment estimates for control inputs
44
+ can be much smaller than for treatment samples due to lack of structure. This creates
45
+ artifacts during signal quantification and normalization steps, and it's common to use
46
+ the treatment fragment length for both treatment and control samples. So we offer that here.
47
+ """
48
+
49
+ if not treatmentFragmentLengths:
50
+ logger.warning(
51
+ "No treatment fragment lengths provided...returning [],[]"
52
+ )
53
+ return [], []
54
+
55
+ n_treat = len(treatmentFragmentLengths)
56
+
57
+ if controlFragmentLengths:
58
+ if len(controlFragmentLengths) == 1 and n_treat > 1:
59
+ controlFragmentLengths = (
60
+ list(controlFragmentLengths) * n_treat
61
+ )
62
+ logger.info(
63
+ "Only one control fragment length provided: broadcasting this value for all control BAM files."
64
+ )
65
+ elif len(controlFragmentLengths) != n_treat:
66
+ logger.warning(
67
+ "Sizes of treatment and control fragment length lists are incompatible...returning [],[]"
68
+ )
69
+ return [], []
70
+ else:
71
+ controlFragmentLengths = list(controlFragmentLengths)
72
+ else:
73
+ controlFragmentLengths = list(treatmentFragmentLengths)
74
+
75
+ finalTreatment = [int(x) for x in treatmentFragmentLengths]
76
+ finalControl = [int(x) for x in treatmentFragmentLengths]
77
+
78
+ return finalTreatment, finalControl
79
+
80
+
81
def loadConfig(
    configSource: Union[str, Path, Mapping[str, Any]],
) -> Dict[str, Any]:
    r"""Load a YAML config from a path, or pass through an already-parsed mapping.

    Dict-like inputs are returned unchanged. String/``Path`` inputs are opened
    and parsed with ``yaml.safe_load`` (an empty document becomes ``{}``).
    The top-level YAML document must itself be a mapping.
    """
    if isinstance(configSource, Mapping):
        parsed = configSource
    elif isinstance(configSource, (str, Path)):
        with open(configSource, "r") as fh:
            parsed = yaml.safe_load(fh) or {}
    else:
        raise TypeError("`config` must be a path or a mapping/dict.")

    if not isinstance(parsed, Mapping):
        raise TypeError("Top-level YAML must be a mapping/object.")
    return parsed
101
+
102
+
103
+ def _cfgGet(
104
+ configMap: Mapping[str, Any],
105
+ dottedKey: str,
106
+ defaultVal: Any = None,
107
+ ) -> Any:
108
+ r"""Support both dotted keys and yaml/dict-style nested access for configs."""
109
+
110
+ # e.g., inputParams.bamFiles
111
+ if dottedKey in configMap:
112
+ return configMap[dottedKey]
113
+
114
+ # e.g.,
115
+ # inputParams:
116
+ # bamFiles: [...]
117
+ currentVal: Any = configMap
118
+ for keyPart in dottedKey.split("."):
119
+ if isinstance(currentVal, Mapping) and keyPart in currentVal:
120
+ currentVal = currentVal[keyPart]
121
+ else:
122
+ return defaultVal
123
+ return currentVal
124
+
125
+
126
+ def _listOrEmpty(list_):
127
+ if list_ is None:
128
+ return []
129
+ return list_
130
+
131
+
132
def checkControlsPresent(inputArgs: core.inputParams) -> bool:
    """Check if control BAM files are present in the input arguments.

    :param inputArgs: core.inputParams object
    :return: True if control BAM files are present, False otherwise.
    """
    controls = inputArgs.bamFilesControl
    # Only a non-empty *list* counts as "controls present".
    return isinstance(controls, list) and len(controls) > 0
143
+
144
+
145
def getReadLengths(
    inputArgs: core.inputParams,
    countingArgs: core.countingParams,
    samArgs: core.samParams,
) -> List[int]:
    r"""Get read lengths for each BAM file in the input arguments.

    :param inputArgs: core.inputParams object containing BAM file paths.
    :param countingArgs: core.countingParams object containing number of reads.
    :param samArgs: core.samParams object containing SAM thread and flag exclude parameters.
    :return: List of read lengths for each BAM file.
    """
    bamPaths = inputArgs.bamFiles
    if not bamPaths:
        raise ValueError(
            "No BAM files provided in the input arguments."
        )
    if not (isinstance(bamPaths, list) and len(bamPaths) > 0):
        raise ValueError("bam files list is empty")

    readLengths: List[int] = []
    for bamPath in bamPaths:
        readLengths.append(
            core.getReadLength(
                bamPath,
                countingArgs.numReads,
                1000,
                samArgs.samThreads,
                samArgs.samFlagExclude,
            )
        )
    return readLengths
178
+
179
+
180
def checkMatchingEnabled(matchingArgs: core.matchingParams) -> bool:
    """Return True when template matching is configured: both `templateNames`
    and `cascadeLevels` are non-empty lists."""
    # Check templates first and bail out early, preserving the original
    # short-circuit (cascadeLevels is only inspected if templates pass).
    if not (
        isinstance(matchingArgs.templateNames, list)
        and len(matchingArgs.templateNames) > 0
    ):
        return False
    return (
        isinstance(matchingArgs.cascadeLevels, list)
        and len(matchingArgs.cascadeLevels) > 0
    )
193
+
194
+
195
def getEffectiveGenomeSizes(
    genomeArgs: core.genomeParams, readLengths: List[int]
) -> List[int]:
    r"""Get effective genome sizes for the given genome name and read lengths.
    :param genomeArgs: core.genomeParams object
    :param readLengths: List of read lengths for which to get effective genome sizes.
    :return: List of effective genome sizes corresponding to the read lengths.
    """
    genomeName = genomeArgs.genomeName
    if not (genomeName and isinstance(genomeName, str)):
        raise ValueError("Genome name must be a non-empty string.")

    if not isinstance(readLengths, list) or len(readLengths) == 0:
        raise ValueError(
            "Read lengths must be a non-empty list. Try calling `getReadLengths` first."
        )

    effectiveSizes: List[int] = []
    for readLength in readLengths:
        effectiveSizes.append(
            constants.getEffectiveGenomeSize(genomeName, readLength)
        )
    return effectiveSizes
215
+
216
+
217
def getInputArgs(config_path: str) -> core.inputParams:
    r"""Build `core.inputParams` from the config.

    Expands glob patterns in the BAM file lists, validates each BAM file,
    reconciles treatment/control counts (a single control is broadcast to all
    treatments), and resolves paired-end status (auto-detected when not set).

    :param config_path: Path to the YAML configuration file (or a parsed mapping).
    :return: populated ``core.inputParams``.
    :raises ValueError: if no treatment BAMs are found or the control count is
        not 0, 1, or equal to the treatment count.
    """
    configData = loadConfig(config_path)

    def expandWildCards(bamList: List[str]) -> List[str]:
        # Expand shell-style wildcard entries; plain paths pass through as-is.
        expandedList: List[str] = []
        for bamEntry in bamList:
            if "*" in bamEntry or "?" in bamEntry or "[" in bamEntry:
                # Bug fix: glob results were previously assigned to a local
                # and never appended, silently dropping every wildcard entry.
                # Sorted for a deterministic file order across runs.
                expandedList.extend(sorted(glob.glob(bamEntry)))
            else:
                expandedList.append(bamEntry)
        return expandedList

    bamFilesRaw = (
        _cfgGet(configData, "inputParams.bamFiles", []) or []
    )
    bamFilesControlRaw = (
        _cfgGet(configData, "inputParams.bamFilesControl", []) or []
    )

    bamFiles = expandWildCards(bamFilesRaw)
    bamFilesControl = expandWildCards(bamFilesControlRaw)

    if len(bamFiles) == 0:
        raise ValueError(
            "No BAM files provided in the configuration."
        )

    if (
        len(bamFilesControl) > 0
        and len(bamFilesControl) != len(bamFiles)
        and len(bamFilesControl) != 1
    ):
        raise ValueError(
            "Number of control BAM files must be 0, 1, or the same as number of treatment files"
        )

    # Broadcast a single control BAM across all treatment files.
    if len(bamFilesControl) == 1:
        logger.info(
            f"Only one control given: Using {bamFilesControl[0]} for all treatment files."
        )
        bamFilesControl = bamFilesControl * len(bamFiles)

    if not bamFiles or not isinstance(bamFiles, list):
        raise ValueError("No BAM files found")

    for bamFile in bamFiles:
        misc_util.checkBamFile(bamFile)

    if bamFilesControl:
        for bamFile in bamFilesControl:
            misc_util.checkBamFile(bamFile)

    # Paired-end status: honor an explicit config value, otherwise treat the
    # run as paired-end only when *all* treatment BAMs are paired-end.
    pairedEndList = misc_util.bamsArePairedEnd(bamFiles)
    pairedEndConfig: Optional[bool] = _cfgGet(
        configData, "inputParams.pairedEnd", None
    )
    if pairedEndConfig is None:
        pairedEndConfig = all(pairedEndList)
    if pairedEndConfig:
        logger.info("Paired-end BAM files detected")
    else:
        logger.info("One or more single-end BAM files detected")

    return core.inputParams(
        bamFiles=bamFiles,
        bamFilesControl=bamFilesControl,
        pairedEnd=pairedEndConfig,
    )
285
+
286
+
287
def getOutputArgs(config_path: str) -> core.outputParams:
    r"""Build `core.outputParams` from the config.

    bigWig conversion defaults to whether the UCSC `bedGraphToBigWig` binary
    is available on PATH; everything else falls back to fixed defaults.
    """
    configData = loadConfig(config_path)

    # Default depends on tool availability; an explicit config value wins.
    bigWigDefault = True if shutil.which("bedGraphToBigWig") else False

    return core.outputParams(
        convertToBigWig=_cfgGet(
            configData, "outputParams.convertToBigWig", bigWigDefault
        ),
        roundDigits=_cfgGet(configData, "outputParams.roundDigits", 3),
        writeResiduals=_cfgGet(
            configData, "outputParams.writeResiduals", True
        ),
        writeMuncTrace=_cfgGet(
            configData, "outputParams.writeMuncTrace", False
        ),
        writeStateStd=_cfgGet(
            configData, "outputParams.writeStateStd", True
        ),
    )
321
+
322
+
323
def getGenomeArgs(config_path: str) -> core.genomeParams:
    r"""Build `core.genomeParams` from the config.

    Resolves bundled resource files (chromosome sizes, blacklist, sparse
    regions) for a recognized genome label, applies per-file overrides from
    the config, and derives the final chromosome list (config-specified or
    read from the sizes file, minus `excludeChroms`).

    :param config_path: Path to the YAML configuration file (or a parsed mapping).
    :raises FileNotFoundError: if no usable chromosome sizes file is found.
    :raises ValueError: if no valid chromosomes remain after filtering.
    """
    configData = loadConfig(config_path)

    genomeName = _cfgGet(configData, "genomeParams.name", None)
    # Canonicalize the user-supplied genome name (e.g. to "hg38", "mm10", ...).
    genomeLabel = constants.resolveGenomeName(genomeName)

    chromSizesFile: Optional[str] = None
    blacklistFile: Optional[str] = None
    sparseBedFile: Optional[str] = None
    chromosomesList: Optional[List[str]] = None

    excludeChromsList: List[str] = (
        _cfgGet(configData, "genomeParams.excludeChroms", []) or []
    )
    excludeForNormList: List[str] = (
        _cfgGet(configData, "genomeParams.excludeForNorm", []) or []
    )

    # Defaults come from packaged resource files when the genome is recognized.
    if genomeLabel:
        chromSizesFile = constants.getGenomeResourceFile(
            genomeLabel, "sizes"
        )
        blacklistFile = constants.getGenomeResourceFile(
            genomeLabel, "blacklist"
        )
        sparseBedFile = constants.getGenomeResourceFile(
            genomeLabel, "sparse"
        )

    # Explicit config paths override the packaged defaults (applied after the
    # defaults above, so ordering matters).
    chromSizesOverride = _cfgGet(
        configData, "genomeParams.chromSizesFile", None
    )
    if chromSizesOverride:
        chromSizesFile = chromSizesOverride

    blacklistOverride = _cfgGet(
        configData, "genomeParams.blacklistFile", None
    )
    if blacklistOverride:
        blacklistFile = blacklistOverride

    sparseOverride = _cfgGet(
        configData, "genomeParams.sparseBedFile", None
    )
    if sparseOverride:
        sparseBedFile = sparseOverride

    if not chromSizesFile or not os.path.exists(chromSizesFile):
        raise FileNotFoundError(
            f"Chromosome sizes file {chromSizesFile} does not exist."
        )

    chromosomesConfig = _cfgGet(
        configData, "genomeParams.chromosomes", None
    )
    if chromosomesConfig is not None:
        chromosomesList = chromosomesConfig
    else:
        # No explicit list: use every chromosome named in the sizes file
        # (two-column TSV: chrom<TAB>size).
        if chromSizesFile:
            chromosomesFrame = pd.read_csv(
                chromSizesFile,
                sep="\t",
                header=None,
                names=["chrom", "size"],
            )
            chromosomesList = list(chromosomesFrame["chrom"])
        else:
            raise ValueError(
                "No chromosomes provided in the configuration and no chromosome sizes file specified."
            )

    # Drop empty/whitespace-only names, then apply the exclusion list.
    chromosomesList = [
        chromName.strip()
        for chromName in chromosomesList
        if chromName and chromName.strip()
    ]
    if excludeChromsList:
        chromosomesList = [
            chromName
            for chromName in chromosomesList
            if chromName not in excludeChromsList
        ]
    if not chromosomesList:
        raise ValueError(
            "No valid chromosomes found after excluding specified chromosomes."
        )

    return core.genomeParams(
        genomeName=genomeLabel,
        chromSizesFile=chromSizesFile,
        blacklistFile=blacklistFile,
        sparseBedFile=sparseBedFile,
        chromosomes=chromosomesList,
        excludeChroms=excludeChromsList,
        excludeForNorm=excludeForNormList,
    )
419
+
420
+
421
def getCountingArgs(config_path: str) -> core.countingParams:
    r"""Build `core.countingParams` from the config.

    Validates and broadcasts scale-factor and fragment-length lists, and
    resolves the mutually exclusive count transforms
    (`applyAsinh`/`applyLog`/`applySqrt`, all suppressed by `noTransform`).

    :param config_path: Path to the YAML configuration file (or a parsed mapping).
    :raises ValueError: on malformed scale-factor or fragment-length settings.
    """
    configData = loadConfig(config_path)

    stepSize = _cfgGet(configData, "countingParams.stepSize", 25)
    scaleDownFlag = _cfgGet(
        configData,
        "countingParams.scaleDown",
        False,
    )
    scaleFactorList = _cfgGet(
        configData, "countingParams.scaleFactors", None
    )
    numReads = _cfgGet(configData, "countingParams.numReads", 100)
    scaleFactorsControlList = _cfgGet(
        configData, "countingParams.scaleFactorsControl", None
    )
    applyAsinhFlag = _cfgGet(
        configData,
        "countingParams.applyAsinh",
        False,
    )
    applyLogFlag = _cfgGet(
        configData,
        "countingParams.applyLog",
        False,
    )
    applySqrtFlag = _cfgGet(
        configData,
        "countingParams.applySqrt",
        False,
    )

    noTransformFlag = _cfgGet(
        configData,
        "countingParams.noTransform",
        False,
    )

    # At most one transform may be active; if several were requested, fall
    # back to sqrt (unless noTransform, which wins below).
    if (
        int(applyAsinhFlag) + int(applyLogFlag) + int(applySqrtFlag)
        > 1
        and not noTransformFlag
    ):
        logger.warning(
            "Only <= 1 of `applyAsinh`, `applyLog`, `applySqrt` can be true...using applySqrt..."
        )
        applyAsinhFlag = False
        applyLogFlag = False
        applySqrtFlag = True

    if noTransformFlag:
        applyAsinhFlag = False
        applyLogFlag = False
        applySqrtFlag = False

    rescaleToTreatmentCoverageFlag = _cfgGet(
        configData,
        "countingParams.rescaleToTreatmentCoverage",
        False,
    )

    trimLeftTail = _cfgGet(
        configData,
        "countingParams.trimLeftTail",
        0.0,
    )

    if scaleFactorList is not None and not isinstance(
        scaleFactorList, list
    ):
        raise ValueError("`scaleFactors` should be a list of floats.")

    if scaleFactorsControlList is not None and not isinstance(
        scaleFactorsControlList, list
    ):
        raise ValueError(
            "`scaleFactorsControl` should be a list of floats."
        )

    # A single control scale factor is broadcast across all treatments;
    # any other length mismatch is an error.
    if (
        scaleFactorList is not None
        and scaleFactorsControlList is not None
        and len(scaleFactorList) != len(scaleFactorsControlList)
    ):
        if len(scaleFactorsControlList) == 1:
            scaleFactorsControlList = scaleFactorsControlList * len(
                scaleFactorList
            )
        else:
            raise ValueError(
                "control and treatment scale factors: must be equal length or 1 control"
            )

    normMethod_ = _cfgGet(
        configData,
        "countingParams.normMethod",
        "EGS",
    )
    # Unknown normalization methods fall back to effective-genome-size.
    if normMethod_.upper() not in ["EGS", "RPKM"]:
        logger.warning(
            f"Unknown `countingParams.normMethod`...Using `EGS`...",
        )
        normMethod_ = "EGS"

    fragmentLengths: Optional[List[int]] = _cfgGet(
        configData,
        "countingParams.fragmentLengths",
        None,
    )
    fragmentLengthsControl: Optional[List[int]] = _cfgGet(
        configData,
        "countingParams.fragmentLengthsControl",
        None,
    )

    if fragmentLengths is not None and not isinstance(
        fragmentLengths, list
    ):
        raise ValueError(
            "`fragmentLengths` should be a list of integers."
        )
    if fragmentLengthsControl is not None and not isinstance(
        fragmentLengthsControl, list
    ):
        raise ValueError(
            "`fragmentLengthsControl` should be a list of integers."
        )
    # Same broadcast rule as the scale factors above.
    if (
        fragmentLengths is not None
        and fragmentLengthsControl is not None
        and len(fragmentLengths) != len(fragmentLengthsControl)
    ):
        if len(fragmentLengthsControl) == 1:
            fragmentLengthsControl = fragmentLengthsControl * len(
                fragmentLengths
            )
        else:
            raise ValueError(
                "control and treatment fragment lengths: must be equal length or 1 control"
            )

    return core.countingParams(
        stepSize=stepSize,
        scaleDown=scaleDownFlag,
        scaleFactors=scaleFactorList,
        scaleFactorsControl=scaleFactorsControlList,
        numReads=numReads,
        applyAsinh=applyAsinhFlag,
        applyLog=applyLogFlag,
        applySqrt=applySqrtFlag,
        rescaleToTreatmentCoverage=rescaleToTreatmentCoverageFlag,
        normMethod=normMethod_,
        noTransform=noTransformFlag,
        trimLeftTail=trimLeftTail,
        fragmentLengths=fragmentLengths,
        fragmentLengthsControl=fragmentLengthsControl,
        useTreatmentFragmentLengths=_cfgGet(
            configData,
            "countingParams.useTreatmentFragmentLengths",
            True,
        ),
    )
583
+
584
+
585
def getPlotArgs(
    config_path: str, experimentName: str
) -> core.plotParams:
    r"""Build `core.plotParams` from the config.

    When at least one histogram plot is requested, ensures the plot directory
    exists, falling back to the current working directory on failure.

    :param config_path: Path to the YAML configuration file (or a parsed mapping).
    :param experimentName: Used for the default plot prefix and directory name.
    """
    configData = loadConfig(config_path)

    plotPrefix_ = _cfgGet(
        configData, "plotParams.plotPrefix", experimentName
    )

    plotStateEstimatesHistogram_ = _cfgGet(
        configData,
        "plotParams.plotStateEstimatesHistogram",
        False,
    )

    plotResidualsHistogram_ = _cfgGet(
        configData,
        "plotParams.plotResidualsHistogram",
        False,
    )

    plotStateStdHistogram_ = _cfgGet(
        configData,
        "plotParams.plotStateStdHistogram",
        False,
    )

    plotHeightInches_ = _cfgGet(
        configData,
        "plotParams.plotHeightInches",
        6.0,
    )

    plotWidthInches_ = _cfgGet(
        configData,
        "plotParams.plotWidthInches",
        8.0,
    )

    plotDPI_ = _cfgGet(
        configData,
        "plotParams.plotDPI",
        300,
    )

    plotDirectory_ = _cfgGet(
        configData,
        "plotParams.plotDirectory",
        os.path.join(
            os.getcwd(), f"{experimentName}_consenrichPlots"
        ),
    )

    # Only create/validate the directory when at least one plot is requested.
    if (
        int(plotStateEstimatesHistogram_)
        + int(plotResidualsHistogram_)
        + int(plotStateStdHistogram_)
        >= 1
    ):
        if plotDirectory_ is not None and (
            not os.path.exists(plotDirectory_)
            or not os.path.isdir(plotDirectory_)
        ):
            try:
                os.makedirs(plotDirectory_, exist_ok=True)
            except Exception as e:
                logger.warning(
                    f"Failed to create {plotDirectory_}:\n\t{e}\nUsing CWD."
                )
                plotDirectory_ = os.getcwd()
        elif plotDirectory_ is None:
            # Config explicitly set a null directory; fall back to CWD.
            plotDirectory_ = os.getcwd()

        elif os.path.exists(plotDirectory_) and os.path.isdir(
            plotDirectory_
        ):
            # NOTE(review): indentation was ambiguous in the reviewed copy;
            # this branch is attached to the inner chain (directory already
            # usable) -- confirm against the original formatting.
            logger.warning(
                f"Using existing plot directory: {plotDirectory_}"
            )
        else:
            logger.warning(
                f"Failed creating/identifying {plotDirectory_}...Using CWD."
            )
            plotDirectory_ = os.getcwd()

    return core.plotParams(
        plotPrefix=plotPrefix_,
        plotStateEstimatesHistogram=plotStateEstimatesHistogram_,
        plotResidualsHistogram=plotResidualsHistogram_,
        plotStateStdHistogram=plotStateStdHistogram_,
        plotHeightInches=plotHeightInches_,
        plotWidthInches=plotWidthInches_,
        plotDPI=plotDPI_,
        plotDirectory=plotDirectory_,
    )
680
+
681
+
682
def readConfig(config_path: str) -> Dict[str, Any]:
    r"""Read and parse the configuration file for Consenrich.

    Delegates to the per-section builders (`getInputArgs`, `getOutputArgs`,
    `getGenomeArgs`, `getCountingArgs`, `getPlotArgs`) and constructs the
    remaining parameter objects inline with their defaults.

    :param config_path: Path to the YAML configuration file.
    :return: Dictionary containing all parsed configuration parameters.
    """
    configData = loadConfig(config_path)

    # Sections with dedicated builders.
    inputParams = getInputArgs(config_path)
    outputParams = getOutputArgs(config_path)
    genomeParams = getGenomeArgs(config_path)
    countingParams = getCountingArgs(config_path)

    # Matching excludes default to the genome blacklist resolved above.
    matchingExcludeRegionsFileDefault: Optional[str] = (
        genomeParams.blacklistFile
    )

    experimentName = _cfgGet(
        configData, "experimentName", "consenrichExperiment"
    )

    processArgs = core.processParams(
        deltaF=_cfgGet(configData, "processParams.deltaF", -1.0),
        minQ=_cfgGet(configData, "processParams.minQ", -1.0),
        maxQ=_cfgGet(configData, "processParams.maxQ", 10_000),
        offDiagQ=_cfgGet(
            configData, "processParams.offDiagQ", 1.0e-3
        ),
        dStatAlpha=_cfgGet(
            configData,
            "processParams.dStatAlpha",
            2.0,
        ),
        dStatd=_cfgGet(configData, "processParams.dStatd", 1.0),
        dStatPC=_cfgGet(configData, "processParams.dStatPC", 1.0),
        dStatUseMean=_cfgGet(
            configData,
            "processParams.dStatUseMean",
            False,
        ),
        scaleResidualsByP11=_cfgGet(
            configData,
            "processParams.scaleResidualsByP11",
            True,
        ),
    )

    plotArgs = getPlotArgs(config_path, experimentName)

    observationArgs = core.observationParams(
        minR=_cfgGet(configData, "observationParams.minR", -1.0),
        maxR=_cfgGet(configData, "observationParams.maxR", 10_000),
        useALV=_cfgGet(configData, "observationParams.useALV", False),
        useConstantNoiseLevel=_cfgGet(
            configData,
            "observationParams.useConstantNoiseLevel",
            False,
        ),
        noGlobal=_cfgGet(
            configData, "observationParams.noGlobal", False
        ),
        numNearest=_cfgGet(
            configData,
            "observationParams.numNearest",
            25,
        ),
        # local/global weights default to roughly a 1:2 split.
        localWeight=_cfgGet(
            configData, "observationParams.localWeight", 0.333,
        ),
        globalWeight=_cfgGet(
            configData, "observationParams.globalWeight", 0.667,
        ),
        approximationWindowLengthBP=_cfgGet(
            configData,
            "observationParams.approximationWindowLengthBP",
            25_000,
        ),
        lowPassWindowLengthBP=_cfgGet(
            configData,
            "observationParams.lowPassWindowLengthBP",
            50_000,
        ),
        lowPassFilterType=_cfgGet(
            configData,
            "observationParams.lowPassFilterType",
            "median",
        ),
        returnCenter=_cfgGet(
            configData, "observationParams.returnCenter", True
        ),
        shrinkOffset=_cfgGet(
            configData,
            "observationParams.shrinkOffset",
            1 - 0.05,
        ),
        kappaALV=_cfgGet(
            configData,
            "observationParams.kappaALV",
            50.0,
        ),
    )

    stateArgs = core.stateParams(
        stateInit=_cfgGet(configData, "stateParams.stateInit", 0.0),
        stateCovarInit=_cfgGet(
            configData,
            "stateParams.stateCovarInit",
            1000.0,
        ),
        boundState=_cfgGet(
            configData,
            "stateParams.boundState",
            True,
        ),
        stateLowerBound=_cfgGet(
            configData,
            "stateParams.stateLowerBound",
            0.0,
        ),
        stateUpperBound=_cfgGet(
            configData,
            "stateParams.stateUpperBound",
            10000.0,
        ),
    )

    samThreads = _cfgGet(configData, "samParams.samThreads", 1)
    # 3844 = unmapped/secondary/QC-fail/duplicate/supplementary exclusion mask.
    samFlagExclude = _cfgGet(
        configData,
        "samParams.samFlagExclude",
        3844,
    )
    minMappingQuality = _cfgGet(
        configData,
        "samParams.minMappingQuality",
        0,
    )
    oneReadPerBin = _cfgGet(configData, "samParams.oneReadPerBin", 0)
    chunkSize = _cfgGet(configData, "samParams.chunkSize", 1_000_000)
    offsetStr = _cfgGet(configData, "samParams.offsetStr", "0,0")
    maxInsertSize = _cfgGet(
        configData,
        "samParams.maxInsertSize",
        1000,
    )

    # Paired-end input defaults to paired-end counting mode; single-end input
    # defaults to inferring the fragment length instead.
    pairedEndDefault = (
        1
        if inputParams.pairedEnd is not None
        and int(inputParams.pairedEnd) > 0
        else 0
    )
    inferFragmentDefault = (
        1
        if inputParams.pairedEnd is not None
        and int(inputParams.pairedEnd) == 0
        else 0
    )

    samArgs = core.samParams(
        samThreads=samThreads,
        samFlagExclude=samFlagExclude,
        oneReadPerBin=oneReadPerBin,
        chunkSize=chunkSize,
        offsetStr=offsetStr,
        maxInsertSize=maxInsertSize,
        pairedEndMode=_cfgGet(
            configData,
            "samParams.pairedEndMode",
            pairedEndDefault,
        ),
        inferFragmentLength=_cfgGet(
            configData,
            "samParams.inferFragmentLength",
            inferFragmentDefault,
        ),
        countEndsOnly=_cfgGet(
            configData, "samParams.countEndsOnly", False
        ),
        minMappingQuality=minMappingQuality,
        minTemplateLength=_cfgGet(
            configData,
            "samParams.minTemplateLength",
            -1,
        ),
    )

    detrendArgs = core.detrendParams(
        detrendWindowLengthBP=_cfgGet(
            configData, "detrendParams.detrendWindowLengthBP", 20_000
        ),
        detrendTrackPercentile=_cfgGet(
            configData,
            "detrendParams.detrendTrackPercentile",
            75.0,
        ),
        usePolyFilter=_cfgGet(
            configData,
            "detrendParams.usePolyFilter",
            False,
        ),
        detrendSavitzkyGolayDegree=_cfgGet(
            configData,
            "detrendParams.detrendSavitzkyGolayDegree",
            0,
        ),
        useOrderStatFilter=_cfgGet(
            configData,
            "detrendParams.useOrderStatFilter",
            True,
        ),
    )

    matchingArgs = core.matchingParams(
        templateNames=_cfgGet(
            configData, "matchingParams.templateNames", []
        ),
        cascadeLevels=_cfgGet(
            configData, "matchingParams.cascadeLevels", []
        ),
        iters=_cfgGet(configData, "matchingParams.iters", 25_000),
        alpha=_cfgGet(configData, "matchingParams.alpha", 0.05),
        minMatchLengthBP=_cfgGet(
            configData,
            "matchingParams.minMatchLengthBP",
            -1,
        ),
        maxNumMatches=_cfgGet(
            configData,
            "matchingParams.maxNumMatches",
            100_000,
        ),
        minSignalAtMaxima=_cfgGet(
            configData,
            "matchingParams.minSignalAtMaxima",
            "q:0.75",
        ),
        merge=_cfgGet(configData, "matchingParams.merge", True),
        mergeGapBP=_cfgGet(
            configData,
            "matchingParams.mergeGapBP",
            -1,
        ),
        useScalingFunction=_cfgGet(
            configData,
            "matchingParams.useScalingFunction",
            True,
        ),
        excludeRegionsBedFile=_cfgGet(
            configData,
            "matchingParams.excludeRegionsBedFile",
            matchingExcludeRegionsFileDefault,
        ),
        randSeed=_cfgGet(configData, "matchingParams.randSeed", 42),
        penalizeBy=_cfgGet(
            configData, "matchingParams.penalizeBy", None
        ),
        eps=_cfgGet(configData, "matchingParams.eps", 1.0e-2),
        autoLengthQuantile=_cfgGet(
            configData,
            "matchingParams.autoLengthQuantile",
            0.90,
        ),
        methodFDR=_cfgGet(
            configData,
            "matchingParams.methodFDR",
            None,
        ),
    )

    return {
        "experimentName": experimentName,
        "genomeArgs": genomeParams,
        "inputArgs": inputParams,
        "outputArgs": outputParams,
        "countingArgs": countingParams,
        "processArgs": processArgs,
        "plotArgs": plotArgs,
        "observationArgs": observationArgs,
        "stateArgs": stateArgs,
        "samArgs": samArgs,
        "detrendArgs": detrendArgs,
        "matchingArgs": matchingArgs,
    }
966
+
967
+
968
def convertBedGraphToBigWig(
    experimentName,
    chromSizesFile,
    suffixes: Optional[List[str]] = None,
):
    r"""Convert Consenrich bedGraph outputs to bigWig with UCSC `bedGraphToBigWig`.

    Logs a warning and skips (or returns) when the binary, a bedGraph file,
    or the chromosome-sizes file cannot be found; never raises.
    """
    if suffixes is None:
        # at least look for `state` bedGraph
        suffixes = ["state"]

    warningMessage = (
        "Could not find UCSC bedGraphToBigWig binary utility."
        "If you need bigWig files instead of the default, human-readable bedGraph files,"
        "you can download the `bedGraphToBigWig` binary from https://hgdownload.soe.ucsc.edu/admin/exe/<operatingSystem, architecture>"
        "OR install via conda (conda install -c bioconda ucsc-bedgraphtobigwig)."
    )

    logger.info(
        "Attempting to generate bigWig files from bedGraph format..."
    )

    binaryPath = ""
    try:
        binaryPath = shutil.which("bedGraphToBigWig")
    except Exception:
        logger.warning(f"\n{warningMessage}\n")
        return
    if not binaryPath:
        logger.warning(f"\n{warningMessage}\n")
        return

    logger.info(f"Using bedGraphToBigWig from {binaryPath}")

    for suffix in suffixes:
        bedgraph = (
            f"consenrichOutput_{experimentName}_{suffix}.bedGraph"
        )
        if not os.path.exists(bedgraph):
            logger.warning(
                f"bedGraph file {bedgraph} does not exist. Skipping bigWig conversion."
            )
            continue
        if not os.path.exists(chromSizesFile):
            logger.warning(
                f"{chromSizesFile} does not exist. Skipping bigWig conversion."
            )
            return

        bigwig = f"{experimentName}_consenrich_{suffix}.bw"
        logger.info(f"Start: {bedgraph} --> {bigwig}...")
        try:
            subprocess.run(
                [binaryPath, bedgraph, chromSizesFile, bigwig], check=True
            )
        except Exception as e:
            logger.warning(
                f"bedGraph-->bigWig conversion with\n\n\t`bedGraphToBigWig {bedgraph} {chromSizesFile} {bigwig}`\nraised: \n{e}\n\n"
            )
            continue
        if os.path.exists(bigwig) and os.path.getsize(bigwig) > 100:
            logger.info(
                f"Finished: converted {bedgraph} to {bigwig}."
            )
1025
+
1026
+
1027
def main():
    """Consenrich command-line entry point.

    Two modes:

    1. ``--match-bedGraph``: run *only* the template-matching algorithm on a
       pre-existing bedGraph of Consenrich estimates, then exit.
    2. ``--config``: run the full pipeline -- per-chromosome read counting,
       detrending, observation-uncertainty ("munc") estimation, the Consenrich
       state estimator, bedGraph/bigWig output, and (optionally) matching.
    """
    parser = argparse.ArgumentParser(description="Consenrich CLI")
    parser.add_argument(
        "--config",
        type=str,
        dest="config",
        help="Path to a YAML config file with parameters + arguments defined in `consenrich.core`",
    )

    # --- Matching-specific command-line arguments ---
    parser.add_argument(
        "--match-bedGraph",
        type=str,
        dest="matchBedGraph",
        help="Path to a bedGraph file of Consenrich estimates to match templates against.\
        If provided, *only* the matching algorithm is run (no other processing). Note that \
        some features in `consenrich.matching` may not be supported through this CLI interface.",
    )
    parser.add_argument(
        "--match-template",
        nargs="+",
        type=str,
        help="List of template names to use in matching. See PyWavelets discrete wavelet families: https://pywavelets.readthedocs.io/en/latest/ref/wavelets.html#discrete-wavelets. \
        Needs to match `--match-level` in length",
        dest="matchTemplate",
    )

    parser.add_argument(
        "--match-level",
        nargs="+",
        type=int,
        help="List of cascade levels to use in matching. Needs to match `--match-template` in length",
        dest="matchLevel",
    )

    parser.add_argument(
        "--match-alpha",
        type=float,
        default=0.05,
        dest="matchAlpha",
        help="Cutoff qualifying candidate matches as significant (FDR-adjusted p-value < alpha).",
    )
    parser.add_argument(
        "--match-min-length",
        type=int,
        default=-1,
        dest="matchMinMatchLengthBP",
        help="Minimum length (bp) qualifying candidate matches. Set to -1 for auto calculation from data",
    )
    parser.add_argument(
        "--match-iters",
        type=int,
        default=50000,
        dest="matchIters",
        help="Number of sampled blocks for estimating null distribution of match scores (cross correlations with templates).",
    )
    parser.add_argument(
        "--match-min-signal",
        type=str,
        default="q:0.75",
        dest="matchMinSignalAtMaxima",
        help="Minimum signal at local maxima in the response sequence that qualifies candidate matches\
        Can be an absolute value (e.g., `50.0`) or a quantile (e.g., `q:0.75` for 75th percentile).",
    )
    parser.add_argument(
        "--match-max-matches",
        type=int,
        default=1000000,
        dest="matchMaxNumMatches",
    )
    parser.add_argument(
        "--match-merge-gap",
        type=int,
        default=-1,
        dest="matchMergeGapBP",
        help="Maximum gap (bp) between candidate matches to merge into a single match.\
        Set to -1 for auto calculation from data.",
    )
    parser.add_argument(
        "--match-use-wavelet",
        action="store_true",
        dest="matchUseWavelet",
        help="If set, use the wavelet function at the given level rather than scaling function.",
    )
    parser.add_argument(
        "--match-seed", type=int, default=42, dest="matchRandSeed"
    )
    parser.add_argument(
        "--match-exclude-bed",
        type=str,
        default=None,
        dest="matchExcludeBed",
    )
    parser.add_argument(
        "--match-auto-length-quantile",
        type=float,
        default=0.90,
        dest="matchAutoLengthQuantile",
        help="Cutoff in standardized values to use when auto-calculating minimum match length and merge gap.",
    )
    parser.add_argument(
        "--match-method-fdr",
        type=str,
        default=None,
        dest="matchMethodFDR",
        help="Method for multiple hypothesis correction of p-values. (bh, by)",
    )
    parser.add_argument(
        "--match-is-log-scale",
        action="store_true",
        dest="matchIsLogScale",
        help="If set, indicates that the input bedGraph has already been transformed.",
    )
    parser.add_argument(
        "--verbose", action="store_true", help="If set, logs config"
    )
    args = parser.parse_args()

    # --- Matching-only mode: short-circuits the full pipeline entirely. ---
    if args.matchBedGraph:
        if not os.path.exists(args.matchBedGraph):
            raise FileNotFoundError(
                f"bedGraph file {args.matchBedGraph} couldn't be found."
            )
        logger.info(
            f"Running matching algorithm using bedGraph file {args.matchBedGraph}..."
        )

        outName = matching.runMatchingAlgorithm(
            args.matchBedGraph,
            args.matchTemplate,
            args.matchLevel,
            alpha=args.matchAlpha,
            minMatchLengthBP=args.matchMinMatchLengthBP,
            iters=args.matchIters,
            minSignalAtMaxima=args.matchMinSignalAtMaxima,
            maxNumMatches=args.matchMaxNumMatches,
            useScalingFunction=(not args.matchUseWavelet),
            mergeGapBP=args.matchMergeGapBP,
            excludeRegionsBedFile=args.matchExcludeBed,
            autoLengthQuantile=args.matchAutoLengthQuantile,
            methodFDR=args.matchMethodFDR.lower()
            if args.matchMethodFDR
            else None,
            isLogScale=args.matchIsLogScale,
            randSeed=args.matchRandSeed,
            merge=True,  # always merge for CLI use -- either way, both files produced
        )
        logger.info(f"Finished matching. Written to {outName}")
        sys.exit(0)

    # --- Full-pipeline mode requires a readable YAML config. ---
    if not args.config:
        logger.info(
            "No config file provided, run with `--config <path_to_config.yaml>`"
        )
        logger.info(
            "See documentation: https://nolan-h-hamilton.github.io/Consenrich/"
        )
        sys.exit(1)

    if not os.path.exists(args.config):
        logger.info(f"Config file {args.config} does not exist.")
        logger.info(
            "See documentation: https://nolan-h-hamilton.github.io/Consenrich/"
        )
        sys.exit(1)

    # Unpack the parsed config into the per-component argument objects
    # (see `consenrich.core` for their definitions).
    config = readConfig(args.config)
    experimentName = config["experimentName"]
    genomeArgs = config["genomeArgs"]
    inputArgs = config["inputArgs"]
    outputArgs = config["outputArgs"]
    countingArgs = config["countingArgs"]
    processArgs = config["processArgs"]
    observationArgs = config["observationArgs"]
    stateArgs = config["stateArgs"]
    samArgs = config["samArgs"]
    detrendArgs = config["detrendArgs"]
    matchingArgs = config["matchingArgs"]
    plotArgs = config["plotArgs"]
    bamFiles = inputArgs.bamFiles
    bamFilesControl = inputArgs.bamFilesControl
    numSamples = len(bamFiles)
    numNearest = observationArgs.numNearest
    stepSize = countingArgs.stepSize
    excludeForNorm = genomeArgs.excludeForNorm
    chromSizes = genomeArgs.chromSizesFile
    scaleDown = countingArgs.scaleDown
    initialTreatmentScaleFactors = []
    minMatchLengthBP_: Optional[int] = matchingArgs.minMatchLengthBP
    # Trailing-underscore copies are working values that may be overwritten
    # below by data-driven ("auto") calculations; the originals in
    # processArgs/observationArgs are kept untouched as the configured intent.
    deltaF_ = processArgs.deltaF
    minR_ = observationArgs.minR
    maxR_ = observationArgs.maxR
    minQ_ = processArgs.minQ
    maxQ_ = processArgs.maxQ
    offDiagQ_ = processArgs.offDiagQ
    muncEps: float = 10e-2  # 0.1; floor/offset used when quantiling munc values

    if args.verbose:
        try:
            logger.info("Initial Configuration:\n")
            # Rebuild the dict so the argument objects print in a fixed order.
            config_truncated = {
                k: v
                for k, v in config.items()
                if k
                not in ["inputArgs", "genomeArgs", "countingArgs"]
            }
            config_truncated["experimentName"] = experimentName
            config_truncated["inputArgs"] = inputArgs
            config_truncated["outputArgs"] = outputArgs
            config_truncated["genomeArgs"] = genomeArgs
            config_truncated["countingArgs"] = countingArgs
            config_truncated["processArgs"] = processArgs
            config_truncated["observationArgs"] = observationArgs
            config_truncated["stateArgs"] = stateArgs
            config_truncated["samArgs"] = samArgs
            config_truncated["detrendArgs"] = detrendArgs
            pprint.pprint(config_truncated, indent=8)
        except Exception as e:
            # best-effort logging only; never abort the run over pretty-printing
            logger.warning(f"Failed to print parsed config:\n{e}\n")

    controlsPresent = checkControlsPresent(inputArgs)
    if args.verbose:
        logger.info(f"controlsPresent: {controlsPresent}")
    readLengthsBamFiles = getReadLengths(
        inputArgs, countingArgs, samArgs
    )
    effectiveGenomeSizes = getEffectiveGenomeSizes(
        genomeArgs, readLengthsBamFiles
    )

    matchingEnabled = checkMatchingEnabled(matchingArgs)
    if args.verbose:
        logger.info(f"matchingEnabled: {matchingEnabled}")
    scaleFactors = countingArgs.scaleFactors
    scaleFactorsControl = countingArgs.scaleFactorsControl

    # --- Fragment lengths: use configured values if given, else estimate
    #     per-BAM from the data via the C extension. ---
    fragmentLengthsTreatment: List[int] = []
    fragmentLengthsControl: List[int] = []

    if countingArgs.fragmentLengths is not None:
        fragmentLengthsTreatment = list(countingArgs.fragmentLengths)
    else:
        for bamFile in bamFiles:
            fragmentLengthsTreatment.append(
                cconsenrich.cgetFragmentLength(
                    bamFile,
                    samThreads=samArgs.samThreads,
                    samFlagExclude=samArgs.samFlagExclude,
                    maxInsertSize=samArgs.maxInsertSize,
                )
            )
            logger.info(
                f"Estimated fragment length for {bamFile}: {fragmentLengthsTreatment[-1]}"
            )
    if controlsPresent:
        # Control samples get their own read lengths / effective genome sizes;
        # these are also consumed later inside the per-chromosome counting loop.
        readLengthsControlBamFiles = [
            core.getReadLength(
                bamFile,
                countingArgs.numReads,
                1000,
                samArgs.samThreads,
                samArgs.samFlagExclude,
            )
            for bamFile in bamFilesControl
        ]
        effectiveGenomeSizesControl = [
            constants.getEffectiveGenomeSize(
                genomeArgs.genomeName, readLength
            )
            for readLength in readLengthsControlBamFiles
        ]

        if countingArgs.fragmentLengthsControl is not None:
            fragmentLengthsControl = list(
                countingArgs.fragmentLengthsControl
            )
        elif not countingArgs.useTreatmentFragmentLengths:
            for bamFile in bamFilesControl:
                fragmentLengthsControl.append(
                    cconsenrich.cgetFragmentLength(
                        bamFile,
                        samThreads=samArgs.samThreads,
                        samFlagExclude=samArgs.samFlagExclude,
                        maxInsertSize=samArgs.maxInsertSize,
                    )
                )
                logger.info(
                    f"Estimated fragment length for {bamFile}: {fragmentLengthsControl[-1]}"
                )
        if countingArgs.useTreatmentFragmentLengths:
            logger.info(
                "`countingParams.useTreatmentFragmentLengths=True`"
                "`\n\t--> using treatment fraglens for control samples, too"
            )
        # Reconcile treatment/control fragment-length lists (e.g. when the
        # control list is empty because treatment lengths are reused).
        fragmentLengthsTreatment, fragmentLengthsControl = _resolveFragmentLengthPairs(
            fragmentLengthsTreatment, fragmentLengthsControl
        )

        # --- Scale factors for paired treatment/control normalization. ---
        if (
            scaleFactors is not None
            and scaleFactorsControl is not None
        ):
            treatScaleFactors = scaleFactors
            controlScaleFactors = scaleFactorsControl
            # still make sure this is accessible
            initialTreatmentScaleFactors = [1.0] * len(bamFiles)
        else:
            try:
                initialTreatmentScaleFactors = [
                    detrorm.getScaleFactor1x(
                        bamFile,
                        effectiveGenomeSize,
                        readLength,
                        excludeForNorm,
                        genomeArgs.chromSizesFile,
                        samArgs.samThreads,
                    )
                    for bamFile, effectiveGenomeSize, readLength in zip(
                        bamFiles,
                        effectiveGenomeSizes,
                        fragmentLengthsTreatment,
                    )
                ]
            except Exception:
                # fall back to unit scaling rather than failing the run
                initialTreatmentScaleFactors = [1.0] * len(bamFiles)

            # NOTE(review): fragment lengths are passed where the parameter
            # names suggest read lengths (readLengthA/readLengthB) -- confirm
            # against `detrorm.getPairScaleFactors`.
            pairScalingFactors = [
                detrorm.getPairScaleFactors(
                    bamFileA,
                    bamFileB,
                    effectiveGenomeSizeA,
                    effectiveGenomeSizeB,
                    readLengthA,
                    readLengthB,
                    excludeForNorm,
                    chromSizes,
                    samArgs.samThreads,
                    stepSize,
                    scaleDown,
                    normMethod=countingArgs.normMethod,
                )
                for bamFileA, bamFileB, effectiveGenomeSizeA, effectiveGenomeSizeB, readLengthA, readLengthB in zip(
                    bamFiles,
                    bamFilesControl,
                    effectiveGenomeSizes,
                    effectiveGenomeSizesControl,
                    fragmentLengthsTreatment,
                    fragmentLengthsControl,
                )
            ]

            treatScaleFactors = []
            controlScaleFactors = []
            for scaleFactorA, scaleFactorB in pairScalingFactors:
                treatScaleFactors.append(scaleFactorA)
                controlScaleFactors.append(scaleFactorB)

    else:
        # No controls: pair-scale factors are irrelevant; may both be None.
        treatScaleFactors = scaleFactors
        controlScaleFactors = scaleFactorsControl

    # --- Treatment-only scale factors (no controls, none configured). ---
    if scaleFactors is None and not controlsPresent:
        if countingArgs.normMethod.upper() == "RPKM":
            scaleFactors = [
                detrorm.getScaleFactorPerMillion(
                    bamFile,
                    excludeForNorm,
                    stepSize,
                )
                for bamFile in bamFiles
            ]
        else:
            scaleFactors = [
                detrorm.getScaleFactor1x(
                    bamFile,
                    effectiveGenomeSize,
                    readLength,
                    excludeForNorm,
                    genomeArgs.chromSizesFile,
                    samArgs.samThreads,
                )
                for bamFile, effectiveGenomeSize, readLength in zip(
                    bamFiles,
                    effectiveGenomeSizes,
                    fragmentLengthsTreatment,
                )
            ]
    chromSizesDict = misc_util.getChromSizesDict(
        genomeArgs.chromSizesFile,
        excludeChroms=genomeArgs.excludeChroms,
    )
    chromosomes = genomeArgs.chromosomes

    # === Main per-chromosome loop: count -> detrend -> munc -> estimate -> write ===
    for c_, chromosome in enumerate(chromosomes):
        chromosomeStart, chromosomeEnd = core.getChromRangesJoint(
            bamFiles,
            chromosome,
            chromSizesDict[chromosome],
            samArgs.samThreads,
            samArgs.samFlagExclude,
        )
        # snap both ends down to a stepSize multiple
        chromosomeStart = max(
            0, (chromosomeStart - (chromosomeStart % stepSize))
        )
        chromosomeEnd = max(
            0, (chromosomeEnd - (chromosomeEnd % stepSize))
        )
        numIntervals = (
            ((chromosomeEnd - chromosomeStart) + stepSize) - 1
        ) // stepSize
        intervals = np.arange(
            chromosomeStart, chromosomeEnd, stepSize
        )

        # deltaF < 0 sentinel --> derive it from the data once, on the first chromosome
        if c_ == 0 and deltaF_ < 0:
            logger.info(
                f"`processParams.deltaF < 0` --> calling core.autoDeltaF()..."
            )
            deltaF_ = core.autoDeltaF(
                bamFiles,
                stepSize,
                fragmentLengths=fragmentLengthsTreatment,
            )

        # chromMat: one row of binned, scaled counts per (treatment) sample
        chromMat: np.ndarray = np.empty(
            (numSamples, numIntervals), dtype=np.float32
        )
        if controlsPresent:
            # With controls, each row is treatment minus matched control.
            j_: int = 0
            for bamA, bamB in zip(bamFiles, bamFilesControl):
                logger.info(
                    f"Counting (trt,ctrl) for {chromosome}: ({bamA}, {bamB})"
                )
                pairMatrix: np.ndarray = core.readBamSegments(
                    [bamA, bamB],
                    chromosome,
                    chromosomeStart,
                    chromosomeEnd,
                    stepSize,
                    [
                        readLengthsBamFiles[j_],
                        readLengthsControlBamFiles[j_],
                    ],
                    [treatScaleFactors[j_], controlScaleFactors[j_]],
                    samArgs.oneReadPerBin,
                    samArgs.samThreads,
                    samArgs.samFlagExclude,
                    offsetStr=samArgs.offsetStr,
                    maxInsertSize=samArgs.maxInsertSize,
                    pairedEndMode=samArgs.pairedEndMode,
                    inferFragmentLength=samArgs.inferFragmentLength,
                    applyAsinh=countingArgs.applyAsinh,
                    applyLog=countingArgs.applyLog,
                    applySqrt=countingArgs.applySqrt,
                    countEndsOnly=samArgs.countEndsOnly,
                    minMappingQuality=samArgs.minMappingQuality,
                    minTemplateLength=samArgs.minTemplateLength,
                    trimLeftTail=countingArgs.trimLeftTail,
                    fragmentLengths=[
                        fragmentLengthsTreatment[j_],
                        fragmentLengthsControl[j_],
                    ],
                )
                chromMat[j_, :] = pairMatrix[0, :] - pairMatrix[1, :]
                j_ += 1
        else:
            chromMat = core.readBamSegments(
                bamFiles,
                chromosome,
                chromosomeStart,
                chromosomeEnd,
                stepSize,
                readLengthsBamFiles,
                scaleFactors,
                samArgs.oneReadPerBin,
                samArgs.samThreads,
                samArgs.samFlagExclude,
                offsetStr=samArgs.offsetStr,
                maxInsertSize=samArgs.maxInsertSize,
                pairedEndMode=samArgs.pairedEndMode,
                inferFragmentLength=samArgs.inferFragmentLength,
                applyAsinh=countingArgs.applyAsinh,
                applyLog=countingArgs.applyLog,
                applySqrt=countingArgs.applySqrt,
                countEndsOnly=samArgs.countEndsOnly,
                minMappingQuality=samArgs.minMappingQuality,
                minTemplateLength=samArgs.minTemplateLength,
                trimLeftTail=countingArgs.trimLeftTail,
                fragmentLengths=fragmentLengthsTreatment,
            )
        sparseMap = None
        if genomeArgs.sparseBedFile and not observationArgs.useALV:
            if c_ == 0:
                logger.info(
                    f"\n\t`useALV={observationArgs.useALV}`\n\t\t--> The local component of sample-specific observation uncertainty tracks will be estimated at each interval from the `numNearest={observationArgs.numNearest}` regions in `sparseBedFile={genomeArgs.sparseBedFile}`...\n"
                )
            sparseMap = core.getSparseMap(
                chromosome,
                intervals,
                numNearest,
                genomeArgs.sparseBedFile,
            )

        # negative --> data-based
        if observationArgs.minR < 0.0 or observationArgs.maxR < 0.0:
            # temporarily use wide-open bounds; tightened from the data below
            minR_ = 0.0
            maxR_ = 1e4
        if processArgs.minQ < 0.0 or processArgs.maxQ < 0.0:
            minQ_ = 0.0
            maxQ_ = 1e4

        # muncMat: per-sample observation-uncertainty ("munc") tracks
        muncMat = np.empty_like(chromMat, dtype=np.float32)
        for j in range(numSamples):
            logger.info(
                f"Muncing {j + 1}/{numSamples} for {chromosome}..."
            )

            # detrending is applied in place, row by row, before munc estimation
            chromMat[j, :] = detrorm.detrendTrack(
                chromMat[j, :],
                stepSize,
                detrendArgs.detrendWindowLengthBP,
                detrendArgs.useOrderStatFilter,
                detrendArgs.usePolyFilter,
                detrendArgs.detrendTrackPercentile,
                detrendArgs.detrendSavitzkyGolayDegree,
            )

            muncMat[j, :] = core.getMuncTrack(
                chromosome,
                intervals,
                stepSize,
                chromMat[j, :],
                minR_,
                maxR_,
                observationArgs.useALV,
                observationArgs.useConstantNoiseLevel,
                observationArgs.noGlobal,
                observationArgs.localWeight,
                observationArgs.globalWeight,
                observationArgs.approximationWindowLengthBP,
                observationArgs.lowPassWindowLengthBP,
                observationArgs.returnCenter,
                sparseMap=sparseMap,
                lowPassFilterType=observationArgs.lowPassFilterType,
                shrinkOffset=observationArgs.shrinkOffset,
            )

        # Auto R bounds: derive minR_ from the 10th percentile of non-trivial
        # munc values, then clip each column into [colMax/kappa, colMax].
        if observationArgs.minR < 0.0 or observationArgs.maxR < 0.0:
            kappa = np.float32(observationArgs.kappaALV)
            minR_ = np.float32(
                np.quantile(muncMat[muncMat > muncEps], 0.10)
            )

            colMax = np.maximum(muncMat.max(axis=0), minR_).astype(
                np.float32
            )
            colMin = np.maximum(
                muncMat.min(axis=0), (colMax / kappa)
            ).astype(np.float32)

            np.clip(muncMat, colMin, colMax, out=muncMat)
            muncMat += muncEps
            muncMat = muncMat.astype(np.float32, copy=False)
        # Reset Q working values from config before the per-chromosome auto logic.
        minQ_ = processArgs.minQ
        maxQ_ = processArgs.maxQ

        # Auto Q bounds: negative configured values request data-driven Q.
        if processArgs.minQ < 0.0 or processArgs.maxQ < 0.0:
            if minR_ is None:
                minR_ = np.float32(
                    np.quantile(muncMat[muncMat > muncEps], 0.10)
                )

            autoMinQ = np.float32(
                (minR_ / numSamples) + offDiagQ_,
            )

            if processArgs.minQ < 0.0:
                minQ_ = autoMinQ
            else:
                minQ_ = np.float32(processArgs.minQ)

            if processArgs.maxQ < 0.0:
                maxQ_ = minQ_
            else:
                maxQ_ = np.float32(max(processArgs.maxQ, minQ_))
        else:
            # both configured: only enforce maxQ >= minQ
            maxQ_ = np.float32(max(maxQ_, minQ_))

        # --- Run the estimator: x = states, P = state covariances, y = residuals ---
        logger.info(f">>>Running consenrich: {chromosome}<<<")
        x, P, y = core.runConsenrich(
            chromMat,
            muncMat,
            deltaF_,
            minQ_,
            maxQ_,
            offDiagQ_,
            processArgs.dStatAlpha,
            processArgs.dStatd,
            processArgs.dStatPC,
            processArgs.dStatUseMean,
            stateArgs.stateInit,
            stateArgs.stateCovarInit,
            stateArgs.boundState,
            stateArgs.stateLowerBound,
            stateArgs.stateUpperBound,
            samArgs.chunkSize,
            progressIter=25_000,
        )
        logger.info("Done.")

        x_ = core.getPrimaryState(x)
        y_ = core.getPrecisionWeightedResidual(
            y,
            muncMat,
            stateCovarSmoothed=P
            if processArgs.scaleResidualsByP11 is not None
            and processArgs.scaleResidualsByP11
            else None,
        )

        # --- Optional diagnostic plots ---
        if plotArgs.plotStateEstimatesHistogram:
            core.plotStateEstimatesHistogram(
                chromosome,
                plotArgs.plotPrefix,
                x_,
                plotDirectory=plotArgs.plotDirectory,
            )

        if plotArgs.plotResidualsHistogram:
            core.plotResidualsHistogram(
                chromosome,
                plotArgs.plotPrefix,
                y,
                plotDirectory=plotArgs.plotDirectory,
            )

        if plotArgs.plotStateStdHistogram:
            core.plotStateStdHistogram(
                chromosome,
                plotArgs.plotPrefix,
                np.sqrt(P[:, 0, 0]),
                plotDirectory=plotArgs.plotDirectory,
            )

        # --- Assemble per-interval output table for this chromosome ---
        df = pd.DataFrame(
            {
                "Chromosome": chromosome,
                "Start": intervals,
                "End": intervals + stepSize,
                "State": x_,
            }
        )

        if outputArgs.writeResiduals:
            df["Res"] = y_.astype(np.float32)  # FFR: cast necessary?
        if outputArgs.writeMuncTrace:
            # RMS of per-sample munc values at each interval
            munc_std = np.sqrt(
                np.mean(muncMat.astype(np.float64), axis=0)
            ).astype(np.float32)
            df["Munc"] = munc_std
        if outputArgs.writeStateStd:
            df["StateStd"] = np.sqrt(P[:, 0, 0]).astype(np.float32)
        cols_ = ["Chromosome", "Start", "End", "State"]
        if outputArgs.writeResiduals:
            cols_.append("Res")
        if outputArgs.writeMuncTrace:
            cols_.append("Munc")
        if outputArgs.writeStateStd:
            cols_.append("StateStd")
        df = df[cols_]
        # suffixes must stay aligned (ordered) with the value columns in cols_[3:]
        suffixes = ["state"]
        if outputArgs.writeResiduals:
            suffixes.append("residuals")
        if outputArgs.writeMuncTrace:
            suffixes.append("muncTraces")
        if outputArgs.writeStateStd:
            suffixes.append("stdDevs")

        # On the first chromosome, remove stale outputs from any previous run,
        # since the writes below append (mode="a").
        if (c_ == 0 and len(chromosomes) > 1) or (
            len(chromosomes) == 1
        ):
            for file_ in os.listdir("."):
                if file_.startswith(
                    f"consenrichOutput_{experimentName}"
                ) and (
                    file_.endswith(".bedGraph")
                    or file_.endswith(".narrowPeak")
                ):
                    logger.warning(f"Overwriting: {file_}")
                    os.remove(file_)

        # One bedGraph per value column, appended across chromosomes.
        for col, suffix in zip(cols_[3:], suffixes):
            logger.info(
                f"{chromosome}: writing/appending to: consenrichOutput_{experimentName}_{suffix}.bedGraph"
            )
            df[["Chromosome", "Start", "End", col]].to_csv(
                f"consenrichOutput_{experimentName}_{suffix}.bedGraph",
                sep="\t",
                header=False,
                index=False,
                mode="a",
                float_format="%.3f",
                lineterminator="\n",
            )

    logger.info("Finished: output in human-readable format")

    if outputArgs.convertToBigWig:
        # `suffixes` carries over from the final loop iteration above
        convertBedGraphToBigWig(
            experimentName,
            genomeArgs.chromSizesFile,
            suffixes=suffixes,
        )

    # --- Optional template matching on the freshly written state track ---
    if matchingEnabled:
        try:
            weightsBedGraph: str | None = None
            logger.info("Running matching algorithm...")
            if matchingArgs.penalizeBy is not None:
                # map the configured penalty source to the corresponding output track
                if matchingArgs.penalizeBy.lower() in [
                    "stateuncertainty",
                    "statestddev",
                    "statestd",
                    "p11",
                ]:
                    weightsBedGraph = f"consenrichOutput_{experimentName}_stdDevs.bedGraph"
                elif matchingArgs.penalizeBy.lower() in [
                    "munc",
                    "munctrace",
                    "avgmunctrace",
                ]:
                    weightsBedGraph = f"consenrichOutput_{experimentName}_muncTraces.bedGraph"
                elif matchingArgs.penalizeBy.lower() == "none":
                    weightsBedGraph = None
                else:
                    # unrecognized value: silently run unweighted
                    weightsBedGraph = None

            outName = matching.runMatchingAlgorithm(
                f"consenrichOutput_{experimentName}_state.bedGraph",
                matchingArgs.templateNames,
                matchingArgs.cascadeLevels,
                matchingArgs.iters,
                alpha=matchingArgs.alpha,
                minMatchLengthBP=minMatchLengthBP_,
                maxNumMatches=matchingArgs.maxNumMatches,
                minSignalAtMaxima=matchingArgs.minSignalAtMaxima,
                useScalingFunction=matchingArgs.useScalingFunction,
                mergeGapBP=matchingArgs.mergeGapBP,
                excludeRegionsBedFile=matchingArgs.excludeRegionsBedFile,
                randSeed=matchingArgs.randSeed,
                weightsBedGraph=weightsBedGraph,
                eps=matchingArgs.eps,
                isLogScale=countingArgs.applyLog
                or countingArgs.applyAsinh
                or countingArgs.applySqrt,
                autoLengthQuantile=matchingArgs.autoLengthQuantile,
                methodFDR=matchingArgs.methodFDR.lower()
                if matchingArgs.methodFDR is not None
                else None,
                merge=matchingArgs.merge,
            )

            logger.info(f"Finished matching. Written to {outName}")
        except Exception as ex_:
            # matching is a post-processing step: never fail the whole run over it
            logger.warning(
                f"Matching algorithm raised an exception:\n\n\t{ex_}\n"
                f"Skipping matching step...try running post-hoc via `consenrich --match-bedGraph <bedGraphFile>`\n"
                f"\tSee ``consenrich -h`` for more details.\n"
            )
1799
+
1800
+
1801
# Script entry point: delegate to the CLI driver when run directly.
if __name__ == "__main__":
    main()