consenrich 0.6.3b1__cp314-cp314-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of consenrich might be problematic. Click here for more details.

@@ -0,0 +1,923 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import glob
6
+ import logging
7
+ import pprint
8
+ import os
9
+ from pathlib import Path
10
+ from typing import List, Optional, Tuple, Dict, Any, Union
11
+ import shutil
12
+ import subprocess
13
+ import sys
14
+ import numpy as np
15
+ import pandas as pd
16
+ import pysam
17
+ import pywt
18
+ import yaml
19
+
20
+ import consenrich.core as core
21
+ import consenrich.misc_util as misc_util
22
+ import consenrich.constants as constants
23
+ import consenrich.detrorm as detrorm
24
+ import consenrich.matching as matching
25
+
26
+
27
# Configure the root logger once, at import time: INFO level, with each
# record prefixed by timestamp, originating module/function and level.
_LOG_FORMAT = "%(asctime)s - %(module)s.%(funcName)s - %(levelname)s - %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-level logger used by the functions in this file.
logger = logging.getLogger(__name__)
33
+
34
+
35
+ def _listOrEmpty(list_):
36
+ if list_ is None:
37
+ return []
38
+ return list_
39
+
40
+
41
+ def _getMinR(cfg, numBams: int) -> float:
42
+ try:
43
+ raw = cfg.get("observationParams.minR", None)
44
+ return float(raw) if raw is not None else (1 / numBams) + 1e-4
45
+ except (TypeError, ValueError, KeyError):
46
+ fallBackMinR: float = 1.0e-2
47
+ logger.warning(
48
+ f"Invalid or missing 'observationParams.minR' in config. Using `{fallBackMinR}`."
49
+ )
50
+ return fallBackMinR
51
+
52
+
53
def checkControlsPresent(inputArgs: core.inputParams) -> bool:
    """Report whether control BAM files were supplied.

    :param inputArgs: core.inputParams object; only ``bamFilesControl`` is read.
    :return: True only when ``bamFilesControl`` is a non-empty ``list``.
    """
    controls = inputArgs.bamFilesControl
    if not controls:
        return False
    # Truthy but not a list (e.g. a tuple or a bare string) does not count.
    return isinstance(controls, list) and len(controls) > 0
64
+
65
+
66
def getReadLengths(
    inputArgs: core.inputParams,
    countingArgs: core.countingParams,
    samArgs: core.samParams,
) -> List[int]:
    r"""Compute one read length per treatment BAM file.

    :param inputArgs: core.inputParams object; ``bamFiles`` must be a non-empty list.
    :param countingArgs: core.countingParams object; ``numReads`` is forwarded
        to ``core.getReadLength``.
    :param samArgs: core.samParams object; ``samThreads`` and ``samFlagExclude``
        are forwarded to ``core.getReadLength``.
    :return: List of ``core.getReadLength`` results, in ``bamFiles`` order.
    :raises ValueError: If ``bamFiles`` is empty/falsy or is not a list.
    """
    bamPaths = inputArgs.bamFiles
    if not bamPaths:
        raise ValueError("No BAM files provided in the input arguments.")

    # Emptiness was already rejected above, so only the type can fail here.
    if not isinstance(bamPaths, list):
        raise ValueError("bam files list is empty")

    readLengths = []
    for bamPath in bamPaths:
        readLengths.append(
            core.getReadLength(
                bamPath,
                countingArgs.numReads,
                1000,  # NOTE(review): third positional arg of core.getReadLength -- meaning not visible here
                samArgs.samThreads,
                samArgs.samFlagExclude,
            )
        )
    return readLengths
94
+
95
+
96
def checkMatchingEnabled(matchingArgs: core.matchingParams) -> bool:
    """Report whether template matching should run.

    :param matchingArgs: core.matchingParams object; ``templateNames`` and
        ``cascadeLevels`` are inspected.
    :return: True only when both attributes are non-empty lists.
    """

    def _isNonEmptyList(value) -> bool:
        # ``None`` and non-list containers are both rejected by isinstance.
        return isinstance(value, list) and len(value) > 0

    if not _isNonEmptyList(matchingArgs.templateNames):
        return False
    return _isNonEmptyList(matchingArgs.cascadeLevels)
109
+
110
+
111
def getEffectiveGenomeSizes(
    genomeArgs: core.genomeParams, readLengths: List[int]
) -> List[int]:
    r"""Look up an effective genome size for each read length.

    :param genomeArgs: core.genomeParams object; ``genomeName`` must be a non-empty string.
    :param readLengths: Non-empty list of read lengths.
    :return: ``constants.getEffectiveGenomeSize(genomeName, readLength)`` for
        every entry of ``readLengths``, in order.
    :raises ValueError: If the genome name or the read-length list is invalid.
    """
    name = genomeArgs.genomeName
    if not (name and isinstance(name, str)):
        raise ValueError("Genome name must be a non-empty string.")

    if not (isinstance(readLengths, list) and len(readLengths) > 0):
        raise ValueError(
            "Read lengths must be a non-empty list. Try calling `getReadLengths` first."
        )

    sizes = []
    for length in readLengths:
        sizes.append(constants.getEffectiveGenomeSize(name, length))
    return sizes
131
+
132
+
133
def getInputArgs(config_path: str) -> core.inputParams:
    """Read treatment/control BAM paths from a YAML config and validate them.

    Reads the flat keys ``inputParams.bamFiles`` and ``inputParams.bamFilesControl``.
    Entries containing ``*``, ``?`` or ``[`` are expanded with ``glob``. A single
    control file is repeated so that it pairs with every treatment file.

    :param config_path: Path to the YAML configuration file.
    :return: core.inputParams with ``bamFiles`` and ``bamFilesControl`` lists.
    :raises ValueError: If no treatment BAM files are found, or the number of
        control files is not 0, 1, or equal to the number of treatment files.
    """

    def _expandWildCards(bamList) -> List[str]:
        # A bare string is one path/pattern, not a sequence of characters.
        if isinstance(bamList, str):
            bamList = [bamList]
        expanded: List[str] = []
        # `_listOrEmpty` covers a key that is present but set to null in YAML.
        for entry in _listOrEmpty(bamList):
            if "*" in entry or "?" in entry or "[" in entry:
                # glob returns matches in arbitrary order; sort so the positional
                # treatment/control pairing is reproducible across runs.
                expanded.extend(sorted(glob.glob(entry)))
            else:
                expanded.append(entry)
        return expanded

    with open(config_path, "r") as f:
        # safe_load returns None for an empty file; treat that as "no keys set".
        config = yaml.safe_load(f) or {}

    bamFiles = _expandWildCards(config.get("inputParams.bamFiles", []))
    bamFilesControl = _expandWildCards(
        config.get("inputParams.bamFilesControl", [])
    )

    if len(bamFiles) == 0:
        raise ValueError("No BAM files provided in the configuration.")

    if len(bamFilesControl) not in (0, 1, len(bamFiles)):
        raise ValueError(
            "Number of control BAM files must be 0, 1, or the same as number of treatment files"
        )

    if len(bamFilesControl) == 1:
        # If there are multiple bamFiles, but 1 control, control is applied for all treatment files
        logger.info(
            f"Only one control given: Using {bamFilesControl[0]} for all treatment files."
        )
        bamFilesControl = bamFilesControl * len(bamFiles)

    # Check every distinct file once (a broadcast control appears many times).
    for bamFile in dict.fromkeys(bamFiles + bamFilesControl):
        misc_util.checkBamFile(bamFile)

    return core.inputParams(bamFiles=bamFiles, bamFilesControl=bamFilesControl)
178
+
179
+
180
def getGenomeArgs(config_path: str) -> core.genomeParams:
    """Build genome-related parameters from a YAML config.

    Resource files (chromosome sizes, blacklist, sparse BED) are first taken
    from ``constants.getGenomeResourceFile`` when ``genomeParams.name``
    resolves to a genome, then overridden by any explicit
    ``genomeParams.chromSizesFile`` / ``blacklistFile`` / ``sparseBedFile``
    entries. Chromosomes come from ``genomeParams.chromosomes`` if set,
    otherwise from the first column of the chromosome sizes file, and are
    then filtered by ``genomeParams.excludeChroms``.

    :param config_path: Path to the YAML configuration file.
    :return: Populated core.genomeParams object.
    :raises FileNotFoundError: If no chromosome sizes file is configured or it
        does not exist.
    :raises ValueError: If no chromosomes remain after exclusion.
    """
    with open(config_path, "r") as f:
        # safe_load returns None for an empty file; treat that as "no keys set".
        config = yaml.safe_load(f) or {}

    genomeName = config.get("genomeParams.name", None)
    genome = constants.resolveGenomeName(genomeName)
    chromSizesFile: Optional[str] = None
    blacklistFile: Optional[str] = None
    sparseBedFile: Optional[str] = None
    # Keys present but set to null in YAML come back as None; normalize to [].
    excludeChroms: List[str] = _listOrEmpty(
        config.get("genomeParams.excludeChroms", [])
    )
    excludeForNorm: List[str] = _listOrEmpty(
        config.get("genomeParams.excludeForNorm", [])
    )

    if genome:
        chromSizesFile = constants.getGenomeResourceFile(genome, "sizes")
        blacklistFile = constants.getGenomeResourceFile(genome, "blacklist")
        sparseBedFile = constants.getGenomeResourceFile(genome, "sparse")

    # Explicit (truthy) config entries override the packaged genome resources.
    chromSizesFile = config.get("genomeParams.chromSizesFile", None) or chromSizesFile
    blacklistFile = config.get("genomeParams.blacklistFile", None) or blacklistFile
    sparseBedFile = config.get("genomeParams.sparseBedFile", None) or sparseBedFile

    if not chromSizesFile or not os.path.exists(chromSizesFile):
        raise FileNotFoundError(
            f"Chromosome sizes file {chromSizesFile} does not exist."
        )

    chromosomes = config.get("genomeParams.chromosomes", None)
    if isinstance(chromosomes, str):
        # A bare string is one chromosome name, not a sequence of characters.
        chromosomes = [chromosomes]
    if not chromosomes:
        # Force string dtype: purely numeric names ("1", "2", ...) would
        # otherwise be parsed as integers and break the strip() below.
        chromosomes = (
            pd.read_csv(
                chromSizesFile,
                sep="\t",
                header=None,
                names=["chrom", "size"],
                dtype={"chrom": str},
            )["chrom"]
            .dropna()
            .tolist()
        )

    # str() also covers numeric names given directly in the YAML list.
    chromosomes = [str(chrom).strip() for chrom in chromosomes]
    chromosomes = [chrom for chrom in chromosomes if chrom]
    if excludeChroms:
        chromosomes = [
            chrom for chrom in chromosomes if chrom not in excludeChroms
        ]
    if not chromosomes:
        raise ValueError(
            "No valid chromosomes found after excluding specified chromosomes."
        )

    return core.genomeParams(
        genomeName=genome,
        chromSizesFile=chromSizesFile,
        blacklistFile=blacklistFile,
        sparseBedFile=sparseBedFile,
        chromosomes=chromosomes,
        excludeChroms=excludeChroms,
        excludeForNorm=excludeForNorm,
    )
239
+
240
+
241
def getCountingArgs(config_path: str) -> core.countingParams:
    """Build counting parameters from the ``countingParams.*`` keys of a YAML config.

    Defaults applied when a key is absent: ``stepSize=25``, ``scaleDown=True``,
    ``numReads=100``, ``applyAsinh=False``, ``applyLog=False``,
    ``rescaleToTreatmentCoverage=True``; both scale-factor lists default to ``None``.
    If both ``applyAsinh`` and ``applyLog`` are set, ``applyLog`` is switched off
    with a warning. A single control scale factor is repeated to match the
    number of treatment scale factors.

    :param config_path: Path to the YAML configuration file.
    :return: Populated core.countingParams object.
    :raises ValueError: If a scale-factor entry is not a list, or treatment and
        control lists differ in length and the control list is not of length 1.
    """
    with open(config_path, "r") as handle:
        settings = yaml.safe_load(handle)
    lookup = settings.get

    useAsinh = lookup("countingParams.applyAsinh", False)
    useLog = lookup("countingParams.applyLog", False)
    if useAsinh and useLog:
        # The two transforms are mutually exclusive here; asinh wins.
        useAsinh, useLog = True, False
        logger.warning(
            "Both `applyAsinh` and `applyLog` are set. Overriding `applyLog` to False."
        )

    treatFactors = lookup("countingParams.scaleFactors", None)
    if not (treatFactors is None or isinstance(treatFactors, list)):
        raise ValueError("`scaleFactors` should be a list of floats.")

    controlFactors = lookup("countingParams.scaleFactorsControl", None)
    if not (controlFactors is None or isinstance(controlFactors, list)):
        raise ValueError("`scaleFactorsControl` should be a list of floats.")

    bothGiven = treatFactors is not None and controlFactors is not None
    if bothGiven and len(treatFactors) != len(controlFactors):
        if len(controlFactors) != 1:
            raise ValueError(
                "control and treatment scale factors: must be equal length or 1 control"
            )
        # One control factor is shared by every treatment sample.
        controlFactors = controlFactors * len(treatFactors)

    return core.countingParams(
        stepSize=lookup("countingParams.stepSize", 25),
        scaleDown=lookup("countingParams.scaleDown", True),
        scaleFactors=treatFactors,
        scaleFactorsControl=controlFactors,
        numReads=lookup("countingParams.numReads", 100),
        applyAsinh=useAsinh,
        applyLog=useLog,
        rescaleToTreatmentCoverage=lookup(
            "countingParams.rescaleToTreatmentCoverage", True
        ),
    )
287
+
288
+
289
def readConfig(config_path: str) -> Dict[str, Any]:
    """Parse a YAML config file into the parameter objects consumed by ``main``.

    Input, genome and counting parameters are built by ``getInputArgs``,
    ``getGenomeArgs`` and ``getCountingArgs`` (each of which reads the file
    itself); the remaining parameter groups are built here from flat, dotted
    keys with the defaults shown below.

    :param config_path: Path to the YAML configuration file.
    :return: Dict with keys ``experimentName``, ``genomeArgs``, ``inputArgs``,
        ``countingArgs``, ``processArgs``, ``observationArgs``, ``stateArgs``,
        ``samArgs``, ``detrendArgs`` and ``matchingArgs``.
    """
    with open(config_path, "r") as handle:
        rawConfig = yaml.safe_load(handle)
    opt = rawConfig.get

    inputParams = getInputArgs(config_path)
    genomeParams = getGenomeArgs(config_path)
    countingParams = getCountingArgs(config_path)

    # minR defaults depend on how many treatment BAM files were found.
    minR = _getMinR(rawConfig, len(inputParams.bamFiles))
    # Matching excludes the genome blacklist unless the config names another file.
    defaultExcludeBed: Optional[str] = genomeParams.blacklistFile

    experimentName = opt("experimentName", "consenrichExperiment")

    processParams = core.processParams(
        deltaF=opt("processParams.deltaF", 0.5),
        minQ=opt("processParams.minQ", 0.25),
        maxQ=opt("processParams.maxQ", 500.0),
        offDiagQ=opt("processParams.offDiagQ", 0.0),
        dStatAlpha=opt("processParams.dStatAlpha", 3.0),
        dStatd=opt("processParams.dStatd", 10.0),
        dStatPC=opt("processParams.dStatPC", 2.0),
        scaleResidualsByP11=opt("processParams.scaleResidualsByP11", False),
    )

    observationParams = core.observationParams(
        minR=minR,
        maxR=opt("observationParams.maxR", 500.0),
        useALV=opt("observationParams.useALV", False),
        useConstantNoiseLevel=opt(
            "observationParams.useConstantNoiseLevel", False
        ),
        noGlobal=opt("observationParams.noGlobal", False),
        numNearest=opt("observationParams.numNearest", 25),
        localWeight=opt("observationParams.localWeight", 0.333),
        globalWeight=opt("observationParams.globalWeight", 0.667),
        approximationWindowLengthBP=opt(
            "observationParams.approximationWindowLengthBP", 10000
        ),
        lowPassWindowLengthBP=opt(
            "observationParams.lowPassWindowLengthBP", 20000
        ),
        lowPassFilterType=opt("observationParams.lowPassFilterType", "median"),
        returnCenter=opt("observationParams.returnCenter", True),
    )

    stateParams = core.stateParams(
        stateInit=opt("stateParams.stateInit", 0.0),
        stateCovarInit=opt("stateParams.stateCovarInit", 100.0),
        boundState=opt("stateParams.boundState", True),
        stateLowerBound=opt("stateParams.stateLowerBound", 0.0),
        stateUpperBound=opt("stateParams.stateUpperBound", 10000.0),
    )

    samParams = core.samParams(
        samThreads=opt("samParams.samThreads", 1),
        samFlagExclude=opt("samParams.samFlagExclude", 3844),
        oneReadPerBin=opt("samParams.oneReadPerBin", 0),
        chunkSize=opt("samParams.chunkSize", 1000000),
        offsetStr=opt("samParams.offsetStr", "0,0"),
        extendBP=opt("samParams.extendBP", []),
        maxInsertSize=opt("samParams.maxInsertSize", 1000),
        pairedEndMode=opt("samParams.pairedEndMode", 0),
        inferFragmentLength=opt("samParams.inferFragmentLength", 0),
    )

    detrendParams = core.detrendParams(
        detrendWindowLengthBP=opt("detrendParams.detrendWindowLengthBP", 10000),
        detrendTrackPercentile=opt("detrendParams.detrendTrackPercentile", 75.0),
        usePolyFilter=opt("detrendParams.usePolyFilter", False),
        detrendSavitzkyGolayDegree=opt(
            "detrendParams.detrendSavitzkyGolayDegree", 2
        ),
        useOrderStatFilter=opt("detrendParams.useOrderStatFilter", True),
    )

    matchingParams = core.matchingParams(
        templateNames=opt("matchingParams.templateNames", []),
        cascadeLevels=opt("matchingParams.cascadeLevels", [2]),
        iters=opt("matchingParams.iters", 25_000),
        alpha=opt("matchingParams.alpha", 0.05),
        minMatchLengthBP=opt("matchingParams.minMatchLengthBP", 250),
        maxNumMatches=opt("matchingParams.maxNumMatches", 100_000),
        minSignalAtMaxima=opt("matchingParams.minSignalAtMaxima", "q:0.75"),
        merge=opt("matchingParams.merge", True),
        mergeGapBP=opt("matchingParams.mergeGapBP", 50),
        useScalingFunction=opt("matchingParams.useScalingFunction", True),
        excludeRegionsBedFile=opt(
            "matchingParams.excludeRegionsBedFile", defaultExcludeBed
        ),
    )

    return {
        "experimentName": experimentName,
        "genomeArgs": genomeParams,
        "inputArgs": inputParams,
        "countingArgs": countingParams,
        "processArgs": processParams,
        "observationArgs": observationParams,
        "stateArgs": stateParams,
        "samArgs": samParams,
        "detrendArgs": detrendParams,
        "matchingArgs": matchingParams,
    }
393
+
394
+
395
def convertBedGraphToBigWig(experimentName, chromSizesFile):
    """Convert the state/residuals bedGraph outputs to bigWig, best effort.

    Looks for ``consenrichOutput_{experimentName}_{suffix}.bedGraph`` (suffix in
    ``state``, ``residuals``) in the current working directory and runs the
    external ``bedGraphToBigWig`` tool on each, writing
    ``{experimentName}_consenrich_{suffix}.bw``. Every failure mode (tool not on
    PATH, missing inputs, non-zero exit) is logged as a warning and never raised.

    :param experimentName: Experiment name embedded in input and output file names.
    :param chromSizesFile: Path to the chromosome sizes file passed to the tool.
    :return: None.
    """
    suffixes = ["state", "residuals"]
    # Each fragment ends with a space so the concatenated message stays readable.
    warningMessage = (
        "Could not find UCSC bedGraphToBigWig binary utility. "
        "If you need bigWig files instead of the default, human-readable bedGraph files, "
        "you can download the `bedGraphToBigWig` binary from https://hgdownload.soe.ucsc.edu/admin/exe/<operatingSystem, architecture> "
        "OR install via conda (conda install -c bioconda ucsc-bedgraphtobigwig)."
    )

    logger.info("Attempting to generate bigWig files from bedGraph format...")
    path_ = shutil.which("bedGraphToBigWig")
    if not path_:
        logger.warning(f"\n{warningMessage}\n")
        return
    logger.info(f"Using bedGraphToBigWig from {path_}")

    # The sizes file is the same for every track, so check it once up front.
    if not chromSizesFile or not os.path.exists(chromSizesFile):
        logger.warning(
            f"{chromSizesFile} does not exist. Skipping bigWig conversion."
        )
        return

    for suffix in suffixes:
        bedgraph = f"consenrichOutput_{experimentName}_{suffix}.bedGraph"
        if not os.path.exists(bedgraph):
            logger.warning(
                f"bedGraph file {bedgraph} does not exist. Skipping bigWig conversion."
            )
            continue
        bigwig = f"{experimentName}_consenrich_{suffix}.bw"
        logger.info(f"Start: {bedgraph} --> {bigwig}...")
        try:
            subprocess.run(
                [path_, bedgraph, chromSizesFile, bigwig], check=True
            )
        except Exception as e:
            # Deliberately broad: conversion is optional and must not abort the run.
            logger.warning(
                f"bedGraph-->bigWig conversion with\n\n\t`bedGraphToBigWig {bedgraph} {chromSizesFile} {bigwig}`\nraised: \n{e}\n\n"
            )
            continue
        if os.path.exists(bigwig) and os.path.getsize(bigwig) > 100:
            logger.info(f"Finished: converted {bedgraph} to {bigwig}.")
        else:
            logger.warning(
                f"bedGraphToBigWig exited without error, but {bigwig} is missing or nearly empty."
            )
440
+
441
+
442
def _buildArgumentParser() -> argparse.ArgumentParser:
    """Create the Consenrich command-line parser (config path plus matching options)."""
    parser = argparse.ArgumentParser(description="Consenrich CLI")
    parser.add_argument(
        "--config",
        type=str,
        dest="config",
        help="Path to a YAML config file with parameters + arguments defined in `consenrich.core`",
    )

    # --- Matching-specific command-line arguments ---
    parser.add_argument(
        "--match-bedGraph",
        type=str,
        dest="matchBedGraph",
        help=(
            "Path to a bedGraph file of Consenrich estimates to match templates against. "
            "If provided, *only* the matching algorithm is run (no other processing)."
        ),
    )
    parser.add_argument(
        "--match-template",
        type=str,
        default="haar",
        choices=[x for x in pywt.wavelist(kind="discrete") if "bio" not in x],
        dest="matchTemplate",
    )
    parser.add_argument("--match-level", type=int, default=2, dest="matchLevel")
    parser.add_argument(
        "--match-alpha", type=float, default=0.05, dest="matchAlpha"
    )
    parser.add_argument(
        "--match-min-length",
        type=int,
        default=250,
        dest="matchMinMatchLengthBP",
    )
    parser.add_argument(
        "--match-iters", type=int, default=25000, dest="matchIters"
    )
    parser.add_argument(
        "--match-min-signal",
        type=str,
        default="q:0.75",
        dest="matchMinSignalAtMaxima",
    )
    parser.add_argument(
        "--match-max-matches",
        type=int,
        default=100000,
        dest="matchMaxNumMatches",
    )
    parser.add_argument(
        "--match-no-merge", action="store_true", dest="matchNoMerge"
    )
    parser.add_argument(
        "--match-merge-gap", type=int, default=50, dest="matchMergeGapBP"
    )
    parser.add_argument(
        "--match-use-wavelet", action="store_true", dest="matchUseWavelet"
    )
    parser.add_argument(
        "--match-seed", type=int, default=42, dest="matchRandSeed"
    )
    parser.add_argument(
        "--match-exclude-bed", type=str, default=None, dest="matchExcludeBed"
    )
    parser.add_argument(
        "--verbose", action="store_true", help="If set, logs config"
    )
    return parser


def main():
    """Command-line entry point for Consenrich.

    Two modes:

    * ``--match-bedGraph <file>``: run only ``matching.matchExistingBedGraph``
      on an existing bedGraph and exit with status 0.
    * ``--config <file.yaml>``: read the config, then for each chromosome count
      reads, build the per-sample uncertainty tracks, detrend, run
      ``core.runConsenrich`` and append state/residual tracks to
      ``consenrichOutput_{experimentName}_{state,residuals}.bedGraph`` in the
      current directory (plus ``_matches.narrowPeak`` when matching is enabled).
      Afterwards a bigWig conversion and a match merge are attempted.

    Exits with status 1 when no config is given or the config file is missing.

    :raises FileNotFoundError: If the ``--match-bedGraph`` file does not exist.
    """
    parser = _buildArgumentParser()
    args = parser.parse_args()

    if args.matchBedGraph:
        # Matching on a previous bedGraph means no other processing is done.
        if not os.path.exists(args.matchBedGraph):
            raise FileNotFoundError(
                f"bedGraph file {args.matchBedGraph} couldn't be found."
            )
        logger.info(
            f"Running matching algorithm using bedGraph file {args.matchBedGraph}..."
        )

        outName = matching.matchExistingBedGraph(
            args.matchBedGraph,
            args.matchTemplate,
            args.matchLevel,
            alpha=args.matchAlpha,
            minMatchLengthBP=args.matchMinMatchLengthBP,
            iters=args.matchIters,
            minSignalAtMaxima=args.matchMinSignalAtMaxima,
            maxNumMatches=args.matchMaxNumMatches,
            useScalingFunction=(not args.matchUseWavelet),
            merge=(not args.matchNoMerge),
            mergeGapBP=args.matchMergeGapBP,
            excludeRegionsBedFile=args.matchExcludeBed,
            randSeed=args.matchRandSeed,
        )
        logger.info(f"Finished matching. Written to {outName}")
        sys.exit(0)

    if not args.config:
        logger.info(
            "No config file provided, run with `--config <path_to_config.yaml>`"
        )
        logger.info(
            "See documentation: https://nolan-h-hamilton.github.io/Consenrich/"
        )
        sys.exit(1)

    if not os.path.exists(args.config):
        logger.info(f"Config file {args.config} does not exist.")
        logger.info(
            "See documentation: https://nolan-h-hamilton.github.io/Consenrich/"
        )
        sys.exit(1)

    config = readConfig(args.config)
    experimentName = config["experimentName"]
    genomeArgs = config["genomeArgs"]
    inputArgs = config["inputArgs"]
    countingArgs = config["countingArgs"]
    processArgs = config["processArgs"]
    observationArgs = config["observationArgs"]
    stateArgs = config["stateArgs"]
    samArgs = config["samArgs"]
    detrendArgs = config["detrendArgs"]
    matchingArgs = config["matchingArgs"]
    bamFiles = inputArgs.bamFiles
    bamFilesControl = inputArgs.bamFilesControl
    numSamples = len(bamFiles)
    numNearest = observationArgs.numNearest
    stepSize = countingArgs.stepSize
    excludeForNorm = genomeArgs.excludeForNorm
    chromSizes = genomeArgs.chromSizesFile
    scaleDown = countingArgs.scaleDown
    extendBP_ = core.resolveExtendBP(samArgs.extendBP, bamFiles)
    initialTreatmentScaleFactors = []

    if args.verbose:
        try:
            logger.info("Configuration:\n")
            # `config` already holds every parsed parameter group.
            pprint.pprint(config, indent=4)
        except Exception as e:
            logger.warning(f"Failed to print parsed config:\n{e}\n")

    controlsPresent = checkControlsPresent(inputArgs)
    if args.verbose:
        logger.info(f"controlsPresent: {controlsPresent}")
    readLengthsBamFiles = getReadLengths(inputArgs, countingArgs, samArgs)
    effectiveGenomeSizes = getEffectiveGenomeSizes(
        genomeArgs, readLengthsBamFiles
    )
    matchingEnabled = checkMatchingEnabled(matchingArgs)
    if args.verbose:
        logger.info(f"matchingEnabled: {matchingEnabled}")
    scaleFactors = countingArgs.scaleFactors
    scaleFactorsControl = countingArgs.scaleFactorsControl

    if controlsPresent:
        readLengthsControlBamFiles = [
            core.getReadLength(
                bamFile,
                countingArgs.numReads,
                1000,
                samArgs.samThreads,
                samArgs.samFlagExclude,
            )
            for bamFile in bamFilesControl
        ]
        effectiveGenomeSizesControl = [
            constants.getEffectiveGenomeSize(genomeArgs.genomeName, readLength)
            for readLength in readLengthsControlBamFiles
        ]

        if scaleFactors is not None and scaleFactorsControl is not None:
            # User-supplied factors are used as-is.
            treatScaleFactors = scaleFactors
            controlScaleFactors = scaleFactorsControl
            # still make sure this is accessible
            initialTreatmentScaleFactors = [1.0] * len(bamFiles)
        else:
            try:
                initialTreatmentScaleFactors = [
                    detrorm.getScaleFactor1x(
                        bamFile,
                        effectiveGenomeSize,
                        readLength,
                        genomeArgs.excludeChroms,
                        genomeArgs.chromSizesFile,
                        samArgs.samThreads,
                    )
                    for bamFile, effectiveGenomeSize, readLength in zip(
                        bamFiles, effectiveGenomeSizes, readLengthsBamFiles
                    )
                ]
            except Exception as e:
                # Best effort: fall back to no rescaling, but say so.
                logger.warning(
                    f"Could not compute 1x treatment scale factors; using 1.0 for all samples:\n{e}\n"
                )
                initialTreatmentScaleFactors = [1.0] * len(bamFiles)

            pairScalingFactors = [
                detrorm.getPairScaleFactors(
                    bamFileA,
                    bamFileB,
                    effectiveGenomeSizeA,
                    effectiveGenomeSizeB,
                    readLengthA,
                    readLengthB,
                    excludeForNorm,
                    chromSizes,
                    samArgs.samThreads,
                    scaleDown,
                )
                for bamFileA, bamFileB, effectiveGenomeSizeA, effectiveGenomeSizeB, readLengthA, readLengthB in zip(
                    bamFiles,
                    bamFilesControl,
                    effectiveGenomeSizes,
                    effectiveGenomeSizesControl,
                    readLengthsBamFiles,
                    readLengthsControlBamFiles,
                )
            ]

            treatScaleFactors = []
            controlScaleFactors = []
            for scaleFactorA, scaleFactorB in pairScalingFactors:
                treatScaleFactors.append(scaleFactorA)
                controlScaleFactors.append(scaleFactorB)

    else:
        treatScaleFactors = scaleFactors
        controlScaleFactors = scaleFactorsControl

    if scaleFactors is None and not controlsPresent:
        scaleFactors = [
            detrorm.getScaleFactor1x(
                bamFile,
                effectiveGenomeSize,
                readLength,
                genomeArgs.excludeChroms,
                genomeArgs.chromSizesFile,
                samArgs.samThreads,
            )
            for bamFile, effectiveGenomeSize, readLength in zip(
                bamFiles, effectiveGenomeSizes, readLengthsBamFiles
            )
        ]
    chromSizesDict = misc_util.getChromSizesDict(
        genomeArgs.chromSizesFile, excludeChroms=genomeArgs.excludeChroms
    )
    chromosomes = genomeArgs.chromosomes

    for c_, chromosome in enumerate(chromosomes):
        chromosomeStart, chromosomeEnd = core.getChromRangesJoint(
            bamFiles,
            chromosome,
            chromSizesDict[chromosome],
            samArgs.samThreads,
            samArgs.samFlagExclude,
        )
        # Snap both ends down to a multiple of stepSize.
        chromosomeStart = max(
            0, (chromosomeStart - (chromosomeStart % stepSize))
        )
        chromosomeEnd = max(0, (chromosomeEnd - (chromosomeEnd % stepSize)))
        numIntervals = (
            ((chromosomeEnd - chromosomeStart) + stepSize) - 1
        ) // stepSize
        intervals = np.arange(chromosomeStart, chromosomeEnd, stepSize)
        chromMat: np.ndarray = np.empty(
            (numSamples, numIntervals), dtype=np.float32
        )
        if controlsPresent:
            finalSF = 1.0
            for j_, (bamA, bamB) in enumerate(zip(bamFiles, bamFilesControl)):
                logger.info(
                    f"Counting (trt,ctrl) for {chromosome}: ({bamA}, {bamB})"
                )
                pairMatrix: np.ndarray = core.readBamSegments(
                    [bamA, bamB],
                    chromosome,
                    chromosomeStart,
                    chromosomeEnd,
                    stepSize,
                    [readLengthsBamFiles[j_], readLengthsControlBamFiles[j_]],
                    [treatScaleFactors[j_], controlScaleFactors[j_]],
                    samArgs.oneReadPerBin,
                    samArgs.samThreads,
                    samArgs.samFlagExclude,
                    offsetStr=samArgs.offsetStr,
                    extendBP=extendBP_[j_],
                    maxInsertSize=samArgs.maxInsertSize,
                    pairedEndMode=samArgs.pairedEndMode,
                    inferFragmentLength=samArgs.inferFragmentLength,
                    applyAsinh=countingArgs.applyAsinh,
                    applyLog=countingArgs.applyLog,
                )
                if countingArgs.rescaleToTreatmentCoverage:
                    finalSF = max(1.0, initialTreatmentScaleFactors[j_])
                # Row j_ is the (optionally rescaled) treatment-minus-control track.
                chromMat[j_, :] = finalSF * (
                    pairMatrix[0, :] - pairMatrix[1, :]
                )
        else:
            # NOTE(review): this path passes the raw `samArgs.extendBP`, whereas the
            # control path uses the resolved `extendBP_` -- confirm this is intended.
            chromMat = core.readBamSegments(
                bamFiles,
                chromosome,
                chromosomeStart,
                chromosomeEnd,
                stepSize,
                readLengthsBamFiles,
                scaleFactors,
                samArgs.oneReadPerBin,
                samArgs.samThreads,
                samArgs.samFlagExclude,
                offsetStr=samArgs.offsetStr,
                extendBP=samArgs.extendBP,
                maxInsertSize=samArgs.maxInsertSize,
                pairedEndMode=samArgs.pairedEndMode,
                inferFragmentLength=samArgs.inferFragmentLength,
                applyAsinh=countingArgs.applyAsinh,
                applyLog=countingArgs.applyLog,
            )
        sparseMap = None
        if genomeArgs.sparseBedFile and not observationArgs.useALV:
            logger.info(f"Building sparse mapping for {chromosome}...")
            sparseMap = core.getSparseMap(
                chromosome, intervals, numNearest, genomeArgs.sparseBedFile
            )

        muncMat = np.empty_like(chromMat, dtype=np.float32)
        for j in range(numSamples):
            logger.info(f"Muncing {j + 1}/{numSamples} for {chromosome}...")
            # The uncertainty track is computed from the not-yet-detrended row.
            muncMat[j, :] = core.getMuncTrack(
                chromosome,
                intervals,
                stepSize,
                chromMat[j, :],
                observationArgs.minR,
                observationArgs.maxR,
                observationArgs.useALV,
                observationArgs.useConstantNoiseLevel,
                observationArgs.noGlobal,
                observationArgs.localWeight,
                observationArgs.globalWeight,
                observationArgs.approximationWindowLengthBP,
                observationArgs.lowPassWindowLengthBP,
                observationArgs.returnCenter,
                sparseMap=sparseMap,
                lowPassFilterType=observationArgs.lowPassFilterType,
            )
            chromMat[j, :] = detrorm.detrendTrack(
                chromMat[j, :],
                stepSize,
                detrendArgs.detrendWindowLengthBP,
                detrendArgs.useOrderStatFilter,
                detrendArgs.usePolyFilter,
                detrendArgs.detrendTrackPercentile,
                detrendArgs.detrendSavitzkyGolayDegree,
            )
        logger.info(f">>>Running consenrich: {chromosome}<<<")

        x, P, y = core.runConsenrich(
            chromMat,
            muncMat,
            processArgs.deltaF,
            processArgs.minQ,
            processArgs.maxQ,
            processArgs.offDiagQ,
            processArgs.dStatAlpha,
            processArgs.dStatd,
            processArgs.dStatPC,
            stateArgs.stateInit,
            stateArgs.stateCovarInit,
            stateArgs.boundState,
            stateArgs.stateLowerBound,
            stateArgs.stateUpperBound,
            samArgs.chunkSize,
            progressIter=50_000,
        )
        logger.info("Done.")

        x_ = core.getPrimaryState(x)
        y_ = core.getPrecisionWeightedResidual(
            y,
            muncMat,
            stateCovarSmoothed=P if processArgs.scaleResidualsByP11 else None,
        )

        df = pd.DataFrame(
            {
                "Chromosome": chromosome,
                "Start": intervals,
                "End": intervals + stepSize,
                "State": x_,
                "Res": y_,
            }
        )
        # Outputs are appended per chromosome, so on the first chromosome of a
        # multi-chromosome run remove earlier output files for this experiment.
        if c_ == 0 and len(chromosomes) > 1:
            for file_ in os.listdir("."):
                if file_.startswith(f"consenrichOutput_{experimentName}") and (
                    file_.endswith(".bedGraph") or file_.endswith(".narrowPeak")
                ):
                    logger.warning(f"Overwriting: {file_}")
                    os.remove(file_)

        for col, suffix in [("State", "state"), ("Res", "residuals")]:
            logger.info(
                f"{chromosome}: writing/appending to: consenrichOutput_{experimentName}_{suffix}.bedGraph"
            )
            df[["Chromosome", "Start", "End", col]].to_csv(
                f"consenrichOutput_{experimentName}_{suffix}.bedGraph",
                sep="\t",
                header=False,
                index=False,
                mode="a",
                float_format="%.3f",
                lineterminator="\n",
            )
        try:
            if matchingEnabled:
                matchingDF = matching.matchWavelet(
                    chromosome,
                    intervals,
                    x_,
                    matchingArgs.templateNames,
                    matchingArgs.cascadeLevels,
                    matchingArgs.iters,
                    matchingArgs.alpha,
                    matchingArgs.minMatchLengthBP,
                    matchingArgs.maxNumMatches,
                    matchingArgs.minSignalAtMaxima,
                    useScalingFunction=matchingArgs.useScalingFunction,
                    excludeRegionsBedFile=matchingArgs.excludeRegionsBedFile,
                )
                if not matchingDF.empty:
                    matchingDF.to_csv(
                        f"consenrichOutput_{experimentName}_matches.narrowPeak",
                        sep="\t",
                        header=False,
                        index=False,
                        mode="a",
                        float_format="%.3f",
                        lineterminator="\n",
                    )
        except Exception as e:
            # Matching is optional; a failure must not lose the tracks already written.
            logger.warning(
                f"Matching routine unsuccessful for {chromosome}...SKIPPING:\n{e}\n\n"
            )
    logger.info("Finished: output in human-readable format")
    convertBedGraphToBigWig(experimentName, genomeArgs.chromSizesFile)
    if matchingEnabled and matchingArgs.merge:
        try:
            matching.mergeMatches(
                f"consenrichOutput_{experimentName}_matches.narrowPeak",
                mergeGapBP=matchingArgs.mergeGapBP,
            )

        except Exception as e:
            logger.warning(f"Failed to merge matches...SKIPPING:\n{e}\n\n")
    logger.info("Done.")


if __name__ == "__main__":
    main()