consenrich 0.7.2b2__cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of consenrich has been flagged as potentially problematic; see the registry's advisory page for more details.

@@ -0,0 +1,1165 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import glob
6
+ import logging
7
+ import pprint
8
+ import os
9
+ from pathlib import Path
10
+ from typing import List, Optional, Tuple, Dict, Any, Union
11
+ import shutil
12
+ import subprocess
13
+ import sys
14
+ import numpy as np
15
+ import pandas as pd
16
+ import pysam
17
+ import pywt
18
+ import yaml
19
+
20
+ import consenrich.core as core
21
+ import consenrich.misc_util as misc_util
22
+ import consenrich.constants as constants
23
+ import consenrich.detrorm as detrorm
24
+ import consenrich.matching as matching
25
+
26
+
27
+ logging.basicConfig(
28
+ level=logging.INFO,
29
+ format="%(asctime)s - %(module)s.%(funcName)s - %(levelname)s - %(message)s",
30
+ )
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
+ def _listOrEmpty(list_):
36
+ if list_ is None:
37
+ return []
38
+ return list_
39
+
40
+
41
+ def _getMinR(cfg, numBams: int) -> float:
42
+ fallBackMinR: float = 1.0
43
+ try:
44
+ raw = cfg.get("observationParams.minR", None)
45
+ return float(raw) if raw is not None else fallBackMinR
46
+ except (TypeError, ValueError, KeyError):
47
+ logger.warning(
48
+ f"Invalid or missing 'observationParams.minR' in config. Using `{fallBackMinR}`."
49
+ )
50
+ return fallBackMinR
51
+
52
+
53
def checkControlsPresent(inputArgs: core.inputParams) -> bool:
    """Report whether control BAM files are configured.

    :param inputArgs: core.inputParams object
    :return: True if ``bamFilesControl`` is a non-empty list, False otherwise.
    """
    controls = inputArgs.bamFilesControl
    if not isinstance(controls, list):
        return False
    return len(controls) > 0
64
+
65
+
66
def getReadLengths(
    inputArgs: core.inputParams,
    countingArgs: core.countingParams,
    samArgs: core.samParams,
) -> List[int]:
    r"""Estimate the read length of each treatment BAM file.

    :param inputArgs: core.inputParams object containing BAM file paths.
    :param countingArgs: core.countingParams object containing number of reads.
    :param samArgs: core.samParams object containing SAM thread and flag exclude parameters.
    :return: List of read lengths, one per file in ``inputArgs.bamFiles``.
    :raises ValueError: if no BAM files are provided or the value is not a list.
    """
    bamFiles = inputArgs.bamFiles
    if not bamFiles:
        raise ValueError(
            "No BAM files provided in the input arguments."
        )
    if not isinstance(bamFiles, list) or len(bamFiles) == 0:
        raise ValueError("bam files list is empty")

    readLengths: List[int] = []
    for bamFile in bamFiles:
        # 1000 is the sampling window used by core.getReadLength
        readLengths.append(
            core.getReadLength(
                bamFile,
                countingArgs.numReads,
                1000,
                samArgs.samThreads,
                samArgs.samFlagExclude,
            )
        )
    return readLengths
99
+
100
+
101
def checkMatchingEnabled(matchingArgs: core.matchingParams) -> bool:
    """Return True when template matching is configured.

    Matching is enabled only when both ``templateNames`` and
    ``cascadeLevels`` are non-empty lists.
    """

    def _nonEmptyList(value) -> bool:
        # both attributes must be actual lists with at least one entry
        return (
            value is not None
            and isinstance(value, list)
            and len(value) > 0
        )

    return _nonEmptyList(matchingArgs.templateNames) and _nonEmptyList(
        matchingArgs.cascadeLevels
    )
114
+
115
+
116
def getEffectiveGenomeSizes(
    genomeArgs: core.genomeParams, readLengths: List[int]
) -> List[int]:
    r"""Look up the effective genome size for each read length.

    :param genomeArgs: core.genomeParams object
    :param readLengths: List of read lengths for which to get effective genome sizes.
    :return: List of effective genome sizes corresponding to the read lengths.
    :raises ValueError: on a missing/invalid genome name or empty read-length list.
    """
    genomeName = genomeArgs.genomeName
    if not isinstance(genomeName, str) or not genomeName:
        raise ValueError("Genome name must be a non-empty string.")
    if not isinstance(readLengths, list) or len(readLengths) == 0:
        raise ValueError(
            "Read lengths must be a non-empty list. Try calling `getReadLengths` first."
        )
    sizes: List[int] = []
    for readLength in readLengths:
        sizes.append(
            constants.getEffectiveGenomeSize(genomeName, readLength)
        )
    return sizes
136
+
137
+
138
def getInputArgs(config_path: str) -> core.inputParams:
    """Build ``core.inputParams`` from a YAML config file.

    Expands shell-style wildcards in BAM paths, validates the
    treatment/control pairing (0, 1, or one control per treatment), and
    auto-detects paired-end status when the config does not specify it.

    :param config_path: Path to the YAML configuration file.
    :return: populated ``core.inputParams`` object.
    :raises ValueError: when no treatment BAMs are found, or the number
        of control BAMs is incompatible with the number of treatments.
    """

    def _expandWildCards(bamList) -> List[str]:
        # Glob entries containing wildcard characters; pass plain paths through.
        expanded: List[str] = []
        for entry in bamList:
            if any(ch in entry for ch in ("*", "?", "[")):
                expanded.extend(glob.glob(entry))
            else:
                expanded.append(entry)
        return expanded

    with open(config_path, "r") as f:
        config = yaml.safe_load(f)

    bamFiles = _expandWildCards(config.get("inputParams.bamFiles", []))
    bamFilesControl = _expandWildCards(
        config.get("inputParams.bamFilesControl", [])
    )

    if len(bamFiles) == 0:
        raise ValueError(
            "No BAM files provided in the configuration."
        )

    numControls = len(bamFilesControl)
    if numControls not in (0, 1, len(bamFiles)):
        raise ValueError(
            "Number of control BAM files must be 0, 1, or the same as number of treatment files"
        )
    if numControls == 1:
        # A single control is shared across every treatment file.
        logger.info(
            f"Only one control given: Using {bamFilesControl[0]} for all treatment files."
        )
        bamFilesControl = bamFilesControl * len(bamFiles)

    if (
        not bamFiles
        or not isinstance(bamFiles, list)
        or len(bamFiles) == 0
    ):
        raise ValueError("No BAM files found")

    for bamFile in bamFiles:
        misc_util.checkBamFile(bamFile)
    for bamFile in bamFilesControl:
        misc_util.checkBamFile(bamFile)

    # Files are validated; safe to probe read layout now.
    pairedEndList = misc_util.bamsArePairedEnd(bamFiles)
    _isPairedEnd: Optional[bool] = config.get(
        "inputParams.pairedEnd", None
    )
    if _isPairedEnd is None:
        # Auto-detect only when the config leaves pairedEnd unset.
        _isPairedEnd = all(pairedEndList)
    if _isPairedEnd:
        logger.info("Paired-end BAM files detected")
    else:
        logger.info("One or more single-end BAM files detected")
    return core.inputParams(
        bamFiles=bamFiles,
        bamFilesControl=bamFilesControl,
        pairedEnd=_isPairedEnd,
    )
205
+
206
+
207
def getGenomeArgs(config_path: str) -> core.genomeParams:
    """Build ``core.genomeParams`` from a YAML config file.

    Resolves the genome name, locates bundled resource files (chromosome
    sizes, blacklist, sparse regions), applies any user-specified file
    overrides, and derives the final chromosome list.

    :param config_path: Path to the YAML configuration file.
    :return: populated ``core.genomeParams`` object.
    :raises FileNotFoundError: if no usable chromosome sizes file exists.
    :raises ValueError: if no valid chromosomes remain after filtering.
    """
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
    genomeName = config.get("genomeParams.name", None)
    genome = constants.resolveGenomeName(genomeName)
    chromSizesFile: Optional[str] = None
    blacklistFile: Optional[str] = None
    sparseBedFile: Optional[str] = None
    chromosomes: Optional[List[str]] = None
    excludeChroms: List[str] = config.get(
        "genomeParams.excludeChroms", []
    )
    excludeForNorm: List[str] = config.get(
        "genomeParams.excludeForNorm", []
    )
    # Resolve bundled resource files for a recognized genome first...
    if genome:
        chromSizesFile = constants.getGenomeResourceFile(
            genome, "sizes"
        )
        blacklistFile = constants.getGenomeResourceFile(
            genome, "blacklist"
        )
        sparseBedFile = constants.getGenomeResourceFile(
            genome, "sparse"
        )
    # ...then let explicit config paths override the bundled defaults.
    if config.get("genomeParams.chromSizesFile", None):
        chromSizesFile = config["genomeParams.chromSizesFile"]
    if config.get("genomeParams.blacklistFile", None):
        blacklistFile = config["genomeParams.blacklistFile"]
    if config.get("genomeParams.sparseBedFile", None):
        sparseBedFile = config["genomeParams.sparseBedFile"]
    if not chromSizesFile or not os.path.exists(chromSizesFile):
        raise FileNotFoundError(
            f"Chromosome sizes file {chromSizesFile} does not exist."
        )
    if config.get("genomeParams.chromosomes", None):
        chromosomes = config["genomeParams.chromosomes"]
    else:
        # No explicit list: take every chromosome named in the sizes file.
        if chromSizesFile:
            chromosomes = list(
                pd.read_csv(
                    chromSizesFile,
                    sep="\t",
                    header=None,
                    names=["chrom", "size"],
                )["chrom"]
            )
        else:
            raise ValueError(
                "No chromosomes provided in the configuration and no chromosome sizes file specified."
            )
    # Drop blank entries, then any explicitly excluded chromosomes.
    chromosomes = [
        chrom.strip() for chrom in chromosomes if chrom.strip()
    ]
    if excludeChroms:
        chromosomes = [
            chrom
            for chrom in chromosomes
            if chrom not in excludeChroms
        ]
    if not chromosomes:
        raise ValueError(
            "No valid chromosomes found after excluding specified chromosomes."
        )
    return core.genomeParams(
        genomeName=genome,
        chromSizesFile=chromSizesFile,
        blacklistFile=blacklistFile,
        sparseBedFile=sparseBedFile,
        chromosomes=chromosomes,
        excludeChroms=excludeChroms,
        excludeForNorm=excludeForNorm,
    )
280
+
281
+
282
def getCountingArgs(config_path: str) -> core.countingParams:
    """Build ``core.countingParams`` from a YAML config file.

    Validates scale-factor lists, reconciles mutually exclusive
    transforms (``applyAsinh`` wins over ``applyLog``), and broadcasts a
    single control scale factor across all treatments.

    :param config_path: Path to the YAML configuration file.
    :return: populated ``core.countingParams`` object.
    :raises ValueError: on malformed or mismatched scale-factor lists.
    """
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)

    stepSize = config.get("countingParams.stepSize", 25)
    scaleDown = config.get("countingParams.scaleDown", True)
    scaleFactors = config.get("countingParams.scaleFactors", None)
    numReads = config.get("countingParams.numReads", 100)
    scaleFactorsControl = config.get(
        "countingParams.scaleFactorsControl", None
    )
    applyAsinh = config.get("countingParams.applyAsinh", False)
    applyLog = config.get("countingParams.applyLog", False)
    if applyAsinh and applyLog:
        # asinh takes precedence when both transforms are requested
        applyLog = False
        logger.warning(
            "Both `applyAsinh` and `applyLog` are set. Overriding `applyLog` to False."
        )
    rescaleToTreatmentCoverage = config.get(
        "countingParams.rescaleToTreatmentCoverage", True
    )

    if scaleFactors is not None and not isinstance(
        scaleFactors, list
    ):
        raise ValueError("`scaleFactors` should be a list of floats.")
    if scaleFactorsControl is not None and not isinstance(
        scaleFactorsControl, list
    ):
        raise ValueError(
            "`scaleFactorsControl` should be a list of floats."
        )

    bothGiven = (
        scaleFactors is not None and scaleFactorsControl is not None
    )
    if bothGiven and len(scaleFactors) != len(scaleFactorsControl):
        if len(scaleFactorsControl) == 1:
            # Broadcast the single control factor across all treatments.
            scaleFactorsControl = scaleFactorsControl * len(
                scaleFactors
            )
        else:
            raise ValueError(
                "control and treatment scale factors: must be equal length or 1 control"
            )

    return core.countingParams(
        stepSize=stepSize,
        scaleDown=scaleDown,
        scaleFactors=scaleFactors,
        scaleFactorsControl=scaleFactorsControl,
        numReads=numReads,
        applyAsinh=applyAsinh,
        applyLog=applyLog,
        rescaleToTreatmentCoverage=rescaleToTreatmentCoverage,
    )
336
+
337
+
338
def readConfig(config_path: str) -> Dict[str, Any]:
    """Parse the full YAML config into a dict of parameter objects.

    Delegates input/genome/counting parsing to the dedicated helpers,
    derives context-dependent defaults (minR/minQ, detrending strength),
    and constructs the remaining ``core.*Params`` objects directly.

    :param config_path: Path to the YAML configuration file.
    :return: dict keyed by ``experimentName``, ``genomeArgs``,
        ``inputArgs``, ``countingArgs``, ``processArgs``,
        ``observationArgs``, ``stateArgs``, ``samArgs``,
        ``detrendArgs``, and ``matchingArgs``.
    """
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)

    inputParams = getInputArgs(config_path)
    genomeParams = getGenomeArgs(config_path)
    countingParams = getCountingArgs(config_path)
    minR_default = _getMinR(config, len(inputParams.bamFiles))
    minQ_default = (
        minR_default / (len(inputParams.bamFiles))
    ) + 0.10  # protect condition number

    # Matching excludes the genome blacklist by default.
    matchingExcludeRegionsBedFile_default: Optional[str] = (
        genomeParams.blacklistFile
    )

    # apply less aggressive *default* detrending/background removal
    # ...IF input controls are present. In either case, respect
    # ...user-specified params
    detrendWindowLengthBP_: int = -1
    detrendSavitzkyGolayDegree_: int = -1

    if (
        inputParams.bamFilesControl is not None
        and len(inputParams.bamFilesControl) > 0
    ):
        detrendWindowLengthBP_ = config.get(
            "detrendParams.detrendWindowLengthBP",
            25_000,
        )
        detrendSavitzkyGolayDegree_ = config.get(
            "detrendParams.detrendSavitzkyGolayDegree",
            1,
        )
    else:
        detrendWindowLengthBP_ = config.get(
            "detrendParams.detrendWindowLengthBP",
            10_000,
        )
        detrendSavitzkyGolayDegree_ = config.get(
            "detrendParams.detrendSavitzkyGolayDegree",
            2,
        )

    return {
        "experimentName": config.get(
            "experimentName", "consenrichExperiment"
        ),
        "genomeArgs": genomeParams,
        "inputArgs": inputParams,
        "countingArgs": countingParams,
        "processArgs": core.processParams(
            deltaF=config.get("processParams.deltaF", 0.5),
            minQ=config.get("processParams.minQ", minQ_default),
            maxQ=config.get("processParams.maxQ", 500.0),
            offDiagQ=config.get("processParams.offDiagQ", 0.0),
            dStatAlpha=config.get("processParams.dStatAlpha", 3.0),
            dStatd=config.get("processParams.dStatd", 10.0),
            dStatPC=config.get("processParams.dStatPC", 1.0),
            scaleResidualsByP11=config.get(
                "processParams.scaleResidualsByP11", False
            ),
        ),
        "observationArgs": core.observationParams(
            minR=minR_default,
            maxR=config.get("observationParams.maxR", 500.0),
            useALV=config.get("observationParams.useALV", False),
            useConstantNoiseLevel=config.get(
                "observationParams.useConstantNoiseLevel", False
            ),
            noGlobal=config.get("observationParams.noGlobal", False),
            numNearest=config.get("observationParams.numNearest", 25),
            localWeight=config.get(
                "observationParams.localWeight",
                0.333,
            ),
            globalWeight=config.get(
                "observationParams.globalWeight",
                0.667,
            ),
            approximationWindowLengthBP=config.get(
                "observationParams.approximationWindowLengthBP",
                10000,
            ),
            lowPassWindowLengthBP=config.get(
                "observationParams.lowPassWindowLengthBP",
                20000,
            ),
            lowPassFilterType=config.get(
                "observationParams.lowPassFilterType",
                "median",
            ),
            returnCenter=config.get(
                "observationParams.returnCenter",
                True,
            ),
        ),
        "stateArgs": core.stateParams(
            stateInit=config.get("stateParams.stateInit", 0.0),
            stateCovarInit=config.get(
                "stateParams.stateCovarInit",
                100.0,
            ),
            boundState=config.get("stateParams.boundState", True),
            stateLowerBound=config.get(
                "stateParams.stateLowerBound",
                0.0,
            ),
            stateUpperBound=config.get(
                "stateParams.stateUpperBound",
                10000.0,
            ),
        ),
        "samArgs": core.samParams(
            samThreads=config.get("samParams.samThreads", 1),
            # 3844 excludes unmapped/secondary/QC-fail/duplicate/supplementary
            samFlagExclude=config.get(
                "samParams.samFlagExclude", 3844
            ),
            oneReadPerBin=config.get("samParams.oneReadPerBin", 0),
            chunkSize=config.get("samParams.chunkSize", 1000000),
            offsetStr=config.get("samParams.offsetStr", "0,0"),
            extendBP=config.get("samParams.extendBP", []),
            maxInsertSize=config.get("samParams.maxInsertSize", 1000),
            # Default pairedEndMode on only when paired-end input detected.
            pairedEndMode=config.get(
                "samParams.pairedEndMode",
                1
                if inputParams.pairedEnd is not None
                and int(inputParams.pairedEnd) > 0
                else 0,
            ),
            # Fragment-length inference defaults on for single-end input.
            inferFragmentLength=config.get(
                "samParams.inferFragmentLength",
                1
                if inputParams.pairedEnd is not None
                and int(inputParams.pairedEnd) == 0
                else 0,
            ),
            countEndsOnly=config.get(
                "samParams.countEndsOnly",
                False,
            ),
        ),
        "detrendArgs": core.detrendParams(
            detrendWindowLengthBP=detrendWindowLengthBP_,
            detrendTrackPercentile=config.get(
                "detrendParams.detrendTrackPercentile",
                75,
            ),
            usePolyFilter=config.get(
                "detrendParams.usePolyFilter",
                False,
            ),
            # detrendSavitzkyGolayDegree_ already reflects the config value
            # when set; this second lookup is a harmless re-read.
            detrendSavitzkyGolayDegree=config.get(
                "detrendParams.detrendSavitzkyGolayDegree",
                detrendSavitzkyGolayDegree_,
            ),
            useOrderStatFilter=config.get(
                "detrendParams.useOrderStatFilter",
                True,
            ),
        ),
        "matchingArgs": core.matchingParams(
            templateNames=config.get(
                "matchingParams.templateNames",
                [],
            ),
            cascadeLevels=config.get(
                "matchingParams.cascadeLevels",
                [],
            ),
            iters=config.get("matchingParams.iters", 25_000),
            alpha=config.get("matchingParams.alpha", 0.05),
            minMatchLengthBP=config.get(
                "matchingParams.minMatchLengthBP", 250
            ),
            maxNumMatches=config.get(
                "matchingParams.maxNumMatches", 100_000
            ),
            minSignalAtMaxima=config.get(
                "matchingParams.minSignalAtMaxima", "q:0.75"
            ),
            merge=config.get("matchingParams.merge", True),
            mergeGapBP=config.get("matchingParams.mergeGapBP", None),
            useScalingFunction=config.get(
                "matchingParams.useScalingFunction", True
            ),
            excludeRegionsBedFile=config.get(
                "matchingParams.excludeRegionsBedFile",
                matchingExcludeRegionsBedFile_default,
            ),
            randSeed=config.get("matchingParams.randSeed", 42),
            penalizeBy=config.get("matchingParams.penalizeBy", None),
        ),
    }
532
+
533
+
534
def convertBedGraphToBigWig(experimentName, chromSizesFile):
    """Convert Consenrich bedGraph outputs to bigWig when the UCSC tool is available.

    Looks up the ``bedGraphToBigWig`` binary on PATH and converts the
    ``state`` and ``residuals`` tracks. Every failure mode is logged as
    a warning; the function never raises and always returns ``None``.

    :param experimentName: experiment label embedded in the file names.
    :param chromSizesFile: chromosome sizes file required by the UCSC tool.
    """
    warningMessage = (
        "Could not find UCSC bedGraphToBigWig binary utility."
        "If you need bigWig files instead of the default, human-readable bedGraph files,"
        "you can download the `bedGraphToBigWig` binary from https://hgdownload.soe.ucsc.edu/admin/exe/<operatingSystem, architecture>"
        "OR install via conda (conda install -c bioconda ucsc-bedgraphtobigwig)."
    )

    logger.info(
        "Attempting to generate bigWig files from bedGraph format..."
    )
    try:
        binaryPath = shutil.which("bedGraphToBigWig")
    except Exception:
        logger.warning(f"\n{warningMessage}\n")
        return
    if not binaryPath:
        logger.warning(f"\n{warningMessage}\n")
        return
    logger.info(f"Using bedGraphToBigWig from {binaryPath}")

    for suffix in ("state", "residuals"):
        bedgraph = (
            f"consenrichOutput_{experimentName}_{suffix}.bedGraph"
        )
        if not os.path.exists(bedgraph):
            logger.warning(
                f"bedGraph file {bedgraph} does not exist. Skipping bigWig conversion."
            )
            continue
        if not os.path.exists(chromSizesFile):
            # Without sizes no conversion can succeed: stop entirely.
            logger.warning(
                f"{chromSizesFile} does not exist. Skipping bigWig conversion."
            )
            return
        bigwig = f"{experimentName}_consenrich_{suffix}.bw"
        logger.info(f"Start: {bedgraph} --> {bigwig}...")
        try:
            subprocess.run(
                [binaryPath, bedgraph, chromSizesFile, bigwig],
                check=True,
            )
        except Exception as e:
            logger.warning(
                f"bedGraph-->bigWig conversion with\n\n\t`bedGraphToBigWig {bedgraph} {chromSizesFile} {bigwig}`\nraised: \n{e}\n\n"
            )
            continue
        # Sanity-check the output: a valid bigWig header alone exceeds 100 bytes.
        if os.path.exists(bigwig) and os.path.getsize(bigwig) > 100:
            logger.info(
                f"Finished: converted {bedgraph} to {bigwig}."
            )
585
+
586
+
587
+ def main():
588
+ parser = argparse.ArgumentParser(description="Consenrich CLI")
589
+ parser.add_argument(
590
+ "--config",
591
+ type=str,
592
+ dest="config",
593
+ help="Path to a YAML config file with parameters + arguments defined in `consenrich.core`",
594
+ )
595
+
596
+ # --- Matching-specific command-line arguments ---
597
+ parser.add_argument(
598
+ "--match-bedGraph",
599
+ type=str,
600
+ dest="matchBedGraph",
601
+ help="Path to a bedGraph file of Consenrich estimates to match templates against.\
602
+ If provided, *only* the matching algorithm is run (no other processing).",
603
+ )
604
+ parser.add_argument(
605
+ "--match-template",
606
+ type=str,
607
+ default="haar",
608
+ choices=[
609
+ x
610
+ for x in pywt.wavelist(kind="discrete")
611
+ if "bio" not in x
612
+ ],
613
+ dest="matchTemplate",
614
+ )
615
+ parser.add_argument(
616
+ "--match-level", type=int, default=2, dest="matchLevel"
617
+ )
618
+ parser.add_argument(
619
+ "--match-alpha", type=float, default=0.05, dest="matchAlpha"
620
+ )
621
+ parser.add_argument(
622
+ "--match-min-length",
623
+ type=int,
624
+ default=250,
625
+ dest="matchMinMatchLengthBP",
626
+ )
627
+ parser.add_argument(
628
+ "--match-iters", type=int, default=25000, dest="matchIters"
629
+ )
630
+ parser.add_argument(
631
+ "--match-min-signal",
632
+ type=str,
633
+ default="q:0.75",
634
+ dest="matchMinSignalAtMaxima",
635
+ )
636
+ parser.add_argument(
637
+ "--match-max-matches",
638
+ type=int,
639
+ default=100000,
640
+ dest="matchMaxNumMatches",
641
+ )
642
+ parser.add_argument(
643
+ "--match-no-merge", action="store_true", dest="matchNoMerge"
644
+ )
645
+ parser.add_argument(
646
+ "--match-merge-gap",
647
+ type=int,
648
+ default=None,
649
+ dest="matchMergeGapBP",
650
+ )
651
+ parser.add_argument(
652
+ "--match-use-wavelet",
653
+ action="store_true",
654
+ dest="matchUseWavelet",
655
+ )
656
+ parser.add_argument(
657
+ "--match-seed", type=int, default=42, dest="matchRandSeed"
658
+ )
659
+ parser.add_argument(
660
+ "--match-exclude-bed",
661
+ type=str,
662
+ default=None,
663
+ dest="matchExcludeBed",
664
+ )
665
+ parser.add_argument(
666
+ "--verbose", action="store_true", help="If set, logs config"
667
+ )
668
+ args = parser.parse_args()
669
+
670
+ if args.matchBedGraph:
671
+ if not os.path.exists(args.matchBedGraph):
672
+ raise FileNotFoundError(
673
+ f"bedGraph file {args.matchBedGraph} couldn't be found."
674
+ )
675
+ logger.info(
676
+ f"Running matching algorithm using bedGraph file {args.matchBedGraph}..."
677
+ )
678
+
679
+ outName = matching.matchExistingBedGraph(
680
+ args.matchBedGraph,
681
+ args.matchTemplate,
682
+ args.matchLevel,
683
+ alpha=args.matchAlpha,
684
+ minMatchLengthBP=args.matchMinMatchLengthBP,
685
+ iters=args.matchIters,
686
+ minSignalAtMaxima=args.matchMinSignalAtMaxima,
687
+ maxNumMatches=args.matchMaxNumMatches,
688
+ useScalingFunction=(not args.matchUseWavelet),
689
+ merge=(not args.matchNoMerge),
690
+ mergeGapBP=args.matchMergeGapBP,
691
+ excludeRegionsBedFile=args.matchExcludeBed,
692
+ randSeed=args.matchRandSeed,
693
+ )
694
+ logger.info(f"Finished matching. Written to {outName}")
695
+ sys.exit(0)
696
+
697
+ if args.matchBedGraph:
698
+ # this shouldn't happen, but just in case -- matching on previous bedGraph means no other processing
699
+ logger.info(
700
+ "If `--match-bedgraph <path_to_bedgraph>` is provided, only the matching algorithm is run."
701
+ )
702
+ sys.exit(0)
703
+
704
+ if not args.config:
705
+ logger.info(
706
+ "No config file provided, run with `--config <path_to_config.yaml>`"
707
+ )
708
+ logger.info(
709
+ "See documentation: https://nolan-h-hamilton.github.io/Consenrich/"
710
+ )
711
+ sys.exit(1)
712
+
713
+ if not os.path.exists(args.config):
714
+ logger.info(f"Config file {args.config} does not exist.")
715
+ logger.info(
716
+ "See documentation: https://nolan-h-hamilton.github.io/Consenrich/"
717
+ )
718
+ sys.exit(1)
719
+
720
+ config = readConfig(args.config)
721
+ experimentName = config["experimentName"]
722
+ genomeArgs = config["genomeArgs"]
723
+ inputArgs = config["inputArgs"]
724
+ countingArgs = config["countingArgs"]
725
+ processArgs = config["processArgs"]
726
+ observationArgs = config["observationArgs"]
727
+ stateArgs = config["stateArgs"]
728
+ samArgs = config["samArgs"]
729
+ detrendArgs = config["detrendArgs"]
730
+ matchingArgs = config["matchingArgs"]
731
+ bamFiles = inputArgs.bamFiles
732
+ bamFilesControl = inputArgs.bamFilesControl
733
+ numSamples = len(bamFiles)
734
+ numNearest = observationArgs.numNearest
735
+ stepSize = countingArgs.stepSize
736
+ excludeForNorm = genomeArgs.excludeForNorm
737
+ chromSizes = genomeArgs.chromSizesFile
738
+ scaleDown = countingArgs.scaleDown
739
+ extendBP_ = core.resolveExtendBP(samArgs.extendBP, bamFiles)
740
+ initialTreatmentScaleFactors = []
741
+ minMatchLengthBP_: Optional[int] = matchingArgs.minMatchLengthBP
742
+ mergeGapBP_: Optional[int] = matchingArgs.mergeGapBP
743
+
744
+ if args.verbose:
745
+ try:
746
+ logger.info("Configuration:\n")
747
+ config_truncated = {
748
+ k: v
749
+ for k, v in config.items()
750
+ if k
751
+ not in ["inputArgs", "genomeArgs", "countingArgs"]
752
+ }
753
+ config_truncated["experimentName"] = experimentName
754
+ config_truncated["inputArgs"] = inputArgs
755
+ config_truncated["genomeArgs"] = genomeArgs
756
+ config_truncated["countingArgs"] = countingArgs
757
+ config_truncated["processArgs"] = processArgs
758
+ config_truncated["observationArgs"] = observationArgs
759
+ config_truncated["stateArgs"] = stateArgs
760
+ config_truncated["samArgs"] = samArgs
761
+ config_truncated["detrendArgs"] = detrendArgs
762
+ pprint.pprint(config_truncated, indent=4)
763
+ except Exception as e:
764
+ logger.warning(f"Failed to print parsed config:\n{e}\n")
765
+
766
+ controlsPresent = checkControlsPresent(inputArgs)
767
+ if args.verbose:
768
+ logger.info(f"controlsPresent: {controlsPresent}")
769
+ readLengthsBamFiles = getReadLengths(
770
+ inputArgs, countingArgs, samArgs
771
+ )
772
+ effectiveGenomeSizes = getEffectiveGenomeSizes(
773
+ genomeArgs, readLengthsBamFiles
774
+ )
775
+ matchingEnabled = checkMatchingEnabled(matchingArgs)
776
+ if args.verbose:
777
+ logger.info(f"matchingEnabled: {matchingEnabled}")
778
+ scaleFactors = countingArgs.scaleFactors
779
+ scaleFactorsControl = countingArgs.scaleFactorsControl
780
+
781
+ if controlsPresent:
782
+ readLengthsControlBamFiles = [
783
+ core.getReadLength(
784
+ bamFile,
785
+ countingArgs.numReads,
786
+ 1000,
787
+ samArgs.samThreads,
788
+ samArgs.samFlagExclude,
789
+ )
790
+ for bamFile in bamFilesControl
791
+ ]
792
+ effectiveGenomeSizesControl = [
793
+ constants.getEffectiveGenomeSize(
794
+ genomeArgs.genomeName, readLength
795
+ )
796
+ for readLength in readLengthsControlBamFiles
797
+ ]
798
+
799
+ if (
800
+ scaleFactors is not None
801
+ and scaleFactorsControl is not None
802
+ ):
803
+ treatScaleFactors = scaleFactors
804
+ controlScaleFactors = scaleFactorsControl
805
+ # still make sure this is accessible
806
+ initialTreatmentScaleFactors = [1.0] * len(bamFiles)
807
+ else:
808
+ try:
809
+ initialTreatmentScaleFactors = [
810
+ detrorm.getScaleFactor1x(
811
+ bamFile,
812
+ effectiveGenomeSize,
813
+ readLength,
814
+ genomeArgs.excludeChroms,
815
+ genomeArgs.chromSizesFile,
816
+ samArgs.samThreads,
817
+ )
818
+ for bamFile, effectiveGenomeSize, readLength in zip(
819
+ bamFiles,
820
+ effectiveGenomeSizes,
821
+ readLengthsBamFiles,
822
+ )
823
+ ]
824
+ except Exception:
825
+ initialTreatmentScaleFactors = [1.0] * len(bamFiles)
826
+
827
+ pairScalingFactors = [
828
+ detrorm.getPairScaleFactors(
829
+ bamFileA,
830
+ bamFileB,
831
+ effectiveGenomeSizeA,
832
+ effectiveGenomeSizeB,
833
+ readLengthA,
834
+ readLengthB,
835
+ excludeForNorm,
836
+ chromSizes,
837
+ samArgs.samThreads,
838
+ scaleDown,
839
+ )
840
+ for bamFileA, bamFileB, effectiveGenomeSizeA, effectiveGenomeSizeB, readLengthA, readLengthB in zip(
841
+ bamFiles,
842
+ bamFilesControl,
843
+ effectiveGenomeSizes,
844
+ effectiveGenomeSizesControl,
845
+ readLengthsBamFiles,
846
+ readLengthsControlBamFiles,
847
+ )
848
+ ]
849
+
850
+ treatScaleFactors = []
851
+ controlScaleFactors = []
852
+ for scaleFactorA, scaleFactorB in pairScalingFactors:
853
+ treatScaleFactors.append(scaleFactorA)
854
+ controlScaleFactors.append(scaleFactorB)
855
+
856
+ else:
857
+ treatScaleFactors = scaleFactors
858
+ controlScaleFactors = scaleFactorsControl
859
+
860
+ if scaleFactors is None and not controlsPresent:
861
+ scaleFactors = [
862
+ detrorm.getScaleFactor1x(
863
+ bamFile,
864
+ effectiveGenomeSize,
865
+ readLength,
866
+ genomeArgs.excludeChroms,
867
+ genomeArgs.chromSizesFile,
868
+ samArgs.samThreads,
869
+ )
870
+ for bamFile, effectiveGenomeSize, readLength in zip(
871
+ bamFiles, effectiveGenomeSizes, readLengthsBamFiles
872
+ )
873
+ ]
874
+ chromSizesDict = misc_util.getChromSizesDict(
875
+ genomeArgs.chromSizesFile,
876
+ excludeChroms=genomeArgs.excludeChroms,
877
+ )
878
+ chromosomes = genomeArgs.chromosomes
879
+
880
+ for c_, chromosome in enumerate(chromosomes):
881
+ chromosomeStart, chromosomeEnd = core.getChromRangesJoint(
882
+ bamFiles,
883
+ chromosome,
884
+ chromSizesDict[chromosome],
885
+ samArgs.samThreads,
886
+ samArgs.samFlagExclude,
887
+ )
888
+ chromosomeStart = max(
889
+ 0, (chromosomeStart - (chromosomeStart % stepSize))
890
+ )
891
+ chromosomeEnd = max(
892
+ 0, (chromosomeEnd - (chromosomeEnd % stepSize))
893
+ )
894
+ numIntervals = (
895
+ ((chromosomeEnd - chromosomeStart) + stepSize) - 1
896
+ ) // stepSize
897
+ intervals = np.arange(
898
+ chromosomeStart, chromosomeEnd, stepSize
899
+ )
900
+ chromMat: np.ndarray = np.empty(
901
+ (numSamples, numIntervals), dtype=np.float32
902
+ )
903
+ if controlsPresent:
904
+ j_: int = 0
905
+ finalSF = 1.0
906
+ for bamA, bamB in zip(bamFiles, bamFilesControl):
907
+ logger.info(
908
+ f"Counting (trt,ctrl) for {chromosome}: ({bamA}, {bamB})"
909
+ )
910
+ pairMatrix: np.ndarray = core.readBamSegments(
911
+ [bamA, bamB],
912
+ chromosome,
913
+ chromosomeStart,
914
+ chromosomeEnd,
915
+ stepSize,
916
+ [
917
+ readLengthsBamFiles[j_],
918
+ readLengthsControlBamFiles[j_],
919
+ ],
920
+ [treatScaleFactors[j_], controlScaleFactors[j_]],
921
+ samArgs.oneReadPerBin,
922
+ samArgs.samThreads,
923
+ samArgs.samFlagExclude,
924
+ offsetStr=samArgs.offsetStr,
925
+ extendBP=extendBP_[j_],
926
+ maxInsertSize=samArgs.maxInsertSize,
927
+ pairedEndMode=samArgs.pairedEndMode,
928
+ inferFragmentLength=samArgs.inferFragmentLength,
929
+ applyAsinh=countingArgs.applyAsinh,
930
+ applyLog=countingArgs.applyLog,
931
+ countEndsOnly=samArgs.countEndsOnly,
932
+ )
933
+ if countingArgs.rescaleToTreatmentCoverage:
934
+ finalSF = max(
935
+ 1.0, initialTreatmentScaleFactors[j_]
936
+ )
937
+ chromMat[j_, :] = finalSF * (
938
+ pairMatrix[0, :] - pairMatrix[1, :]
939
+ )
940
+ j_ += 1
941
+ else:
942
+ chromMat = core.readBamSegments(
943
+ bamFiles,
944
+ chromosome,
945
+ chromosomeStart,
946
+ chromosomeEnd,
947
+ stepSize,
948
+ readLengthsBamFiles,
949
+ scaleFactors,
950
+ samArgs.oneReadPerBin,
951
+ samArgs.samThreads,
952
+ samArgs.samFlagExclude,
953
+ offsetStr=samArgs.offsetStr,
954
+ extendBP=extendBP_,
955
+ maxInsertSize=samArgs.maxInsertSize,
956
+ pairedEndMode=samArgs.pairedEndMode,
957
+ inferFragmentLength=samArgs.inferFragmentLength,
958
+ applyAsinh=countingArgs.applyAsinh,
959
+ applyLog=countingArgs.applyLog,
960
+ countEndsOnly=samArgs.countEndsOnly,
961
+ )
962
+ sparseMap = None
963
+ if genomeArgs.sparseBedFile and not observationArgs.useALV:
964
+ logger.info(
965
+ f"Building sparse mapping for {chromosome}..."
966
+ )
967
+ sparseMap = core.getSparseMap(
968
+ chromosome,
969
+ intervals,
970
+ numNearest,
971
+ genomeArgs.sparseBedFile,
972
+ )
973
+
974
+ muncMat = np.empty_like(chromMat, dtype=np.float32)
975
+ for j in range(numSamples):
976
+ logger.info(
977
+ f"Muncing {j + 1}/{numSamples} for {chromosome}..."
978
+ )
979
+ muncMat[j, :] = core.getMuncTrack(
980
+ chromosome,
981
+ intervals,
982
+ stepSize,
983
+ chromMat[j, :],
984
+ observationArgs.minR,
985
+ observationArgs.maxR,
986
+ observationArgs.useALV,
987
+ observationArgs.useConstantNoiseLevel,
988
+ observationArgs.noGlobal,
989
+ observationArgs.localWeight,
990
+ observationArgs.globalWeight,
991
+ observationArgs.approximationWindowLengthBP,
992
+ observationArgs.lowPassWindowLengthBP,
993
+ observationArgs.returnCenter,
994
+ sparseMap=sparseMap,
995
+ lowPassFilterType=observationArgs.lowPassFilterType,
996
+ )
997
+ chromMat[j, :] = detrorm.detrendTrack(
998
+ chromMat[j, :],
999
+ stepSize,
1000
+ detrendArgs.detrendWindowLengthBP,
1001
+ detrendArgs.useOrderStatFilter,
1002
+ detrendArgs.usePolyFilter,
1003
+ detrendArgs.detrendTrackPercentile,
1004
+ detrendArgs.detrendSavitzkyGolayDegree,
1005
+ )
1006
+ logger.info(f">>>Running consenrich: {chromosome}<<<")
1007
+
1008
+ x, P, y = core.runConsenrich(
1009
+ chromMat,
1010
+ muncMat,
1011
+ processArgs.deltaF,
1012
+ processArgs.minQ,
1013
+ processArgs.maxQ,
1014
+ processArgs.offDiagQ,
1015
+ processArgs.dStatAlpha,
1016
+ processArgs.dStatd,
1017
+ processArgs.dStatPC,
1018
+ stateArgs.stateInit,
1019
+ stateArgs.stateCovarInit,
1020
+ stateArgs.boundState,
1021
+ stateArgs.stateLowerBound,
1022
+ stateArgs.stateUpperBound,
1023
+ samArgs.chunkSize,
1024
+ progressIter=50_000,
1025
+ )
1026
+ logger.info("Done.")
1027
+
1028
+ x_ = core.getPrimaryState(x)
1029
+ y_ = core.getPrecisionWeightedResidual(
1030
+ y,
1031
+ muncMat,
1032
+ stateCovarSmoothed=P
1033
+ if processArgs.scaleResidualsByP11 is not None
1034
+ and processArgs.scaleResidualsByP11
1035
+ else None,
1036
+ )
1037
+ weights_: Optional[np.ndarray] = None
1038
+ if matchingArgs.penalizeBy is not None:
1039
+ if matchingArgs.penalizeBy == "absResiduals":
1040
+ try:
1041
+ weights_ = np.abs(y_)
1042
+ except Exception as e:
1043
+ logger.warning(
1044
+ f"Error computing weights for 'absResiduals': {e}. No weights applied for matching."
1045
+ )
1046
+ weights_ = None
1047
+ elif matchingArgs.penalizeBy == "stateUncertainty":
1048
+ try:
1049
+ weights_ = np.sqrt(P[:, 0, 0])
1050
+ except Exception as e:
1051
+ logger.warning(
1052
+ f"Error computing weights for 'stateUncertainty': {e}. No weights applied for matching."
1053
+ )
1054
+ weights_ = None
1055
+ else:
1056
+ logger.warning(
1057
+ f"Unrecognized `matchingParams.penalizeBy`: {matchingArgs.penalizeBy}. No weights applied."
1058
+ )
1059
+ weights_ = None
1060
+
1061
+
1062
+ df = pd.DataFrame(
1063
+ {
1064
+ "Chromosome": chromosome,
1065
+ "Start": intervals,
1066
+ "End": intervals + stepSize,
1067
+ "State": x_,
1068
+ "Res": y_,
1069
+ }
1070
+ )
1071
+ if c_ == 0 and len(chromosomes) > 1:
1072
+ for file_ in os.listdir("."):
1073
+ if file_.startswith(
1074
+ f"consenrichOutput_{experimentName}"
1075
+ ) and (
1076
+ file_.endswith(".bedGraph")
1077
+ or file_.endswith(".narrowPeak")
1078
+ ):
1079
+ logger.warning(f"Overwriting: {file_}")
1080
+ os.remove(file_)
1081
+
1082
+ for col, suffix in [("State", "state"), ("Res", "residuals")]:
1083
+ logger.info(
1084
+ f"{chromosome}: writing/appending to: consenrichOutput_{experimentName}_{suffix}.bedGraph"
1085
+ )
1086
+ df[["Chromosome", "Start", "End", col]].to_csv(
1087
+ f"consenrichOutput_{experimentName}_{suffix}.bedGraph",
1088
+ sep="\t",
1089
+ header=False,
1090
+ index=False,
1091
+ mode="a",
1092
+ float_format="%.3f",
1093
+ lineterminator="\n",
1094
+ )
1095
+ try:
1096
+ if matchingEnabled:
1097
+ if (
1098
+ minMatchLengthBP_ is None
1099
+ or minMatchLengthBP_ <= 0
1100
+ ):
1101
+ minMatchLengthBP_ = (
1102
+ matching.autoMinLengthIntervals(x_)
1103
+ * (intervals[1] - intervals[0])
1104
+ )
1105
+
1106
+ if mergeGapBP_ is None:
1107
+ mergeGapBP_ = int(minMatchLengthBP_ / 2) + 1
1108
+
1109
+ matchingDF = matching.matchWavelet(
1110
+ chromosome,
1111
+ intervals,
1112
+ x_,
1113
+ matchingArgs.templateNames,
1114
+ matchingArgs.cascadeLevels,
1115
+ matchingArgs.iters,
1116
+ matchingArgs.alpha,
1117
+ minMatchLengthBP_,
1118
+ matchingArgs.maxNumMatches,
1119
+ matchingArgs.minSignalAtMaxima,
1120
+ useScalingFunction=matchingArgs.useScalingFunction,
1121
+ excludeRegionsBedFile=matchingArgs.excludeRegionsBedFile,
1122
+ randSeed=matchingArgs.randSeed,
1123
+ weights=weights_,
1124
+ )
1125
+ if not matchingDF.empty:
1126
+ matchingDF.to_csv(
1127
+ f"consenrichOutput_{experimentName}_matches.narrowPeak",
1128
+ sep="\t",
1129
+ header=False,
1130
+ index=False,
1131
+ mode="a",
1132
+ float_format="%.3f",
1133
+ lineterminator="\n",
1134
+ )
1135
+ except Exception as e:
1136
+ logger.warning(
1137
+ f"Matching routine unsuccessful for {chromosome}...SKIPPING:\n{e}\n\n"
1138
+ )
1139
+ continue
1140
+ logger.info("Finished: output in human-readable format")
1141
+ convertBedGraphToBigWig(experimentName, genomeArgs.chromSizesFile)
1142
+ if matchingEnabled and matchingArgs.merge:
1143
+ try:
1144
+ mergeGapBP_ = matchingArgs.mergeGapBP
1145
+ if mergeGapBP_ is None or mergeGapBP_ <= 0:
1146
+ mergeGapBP_ = (
1147
+ int(minMatchLengthBP_ / 2) + 1
1148
+ if minMatchLengthBP_ is not None
1149
+ and minMatchLengthBP_ >= 0
1150
+ else 75
1151
+ )
1152
+ matching.mergeMatches(
1153
+ f"consenrichOutput_{experimentName}_matches.narrowPeak",
1154
+ mergeGapBP=mergeGapBP_,
1155
+ )
1156
+
1157
+ except Exception as e:
1158
+ logger.warning(
1159
+ f"Failed to merge matches...SKIPPING:\n{e}\n\n"
1160
+ )
1161
+ logger.info("Done.")
1162
+
1163
+
1164
+ # Script entry point: invoke the CLI driver only when executed directly
+ # (not when imported as a module).
+ if __name__ == "__main__":
1165
+ main()