krewlyzer 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- krewlyzer/__init__.py +1 -0
- krewlyzer/cli.py +47 -0
- krewlyzer/fsc.py +450 -0
- krewlyzer/fsd.py +139 -0
- krewlyzer/fsr.py +171 -0
- krewlyzer/helpers.py +187 -0
- krewlyzer/motif.py +275 -0
- krewlyzer/ocf.py +133 -0
- krewlyzer/uxm.py +188 -0
- krewlyzer/wps.py +173 -0
- krewlyzer/wrapper.py +125 -0
- krewlyzer-0.1.0.dist-info/METADATA +15 -0
- krewlyzer-0.1.0.dist-info/RECORD +17 -0
- krewlyzer-0.1.0.dist-info/WHEEL +5 -0
- krewlyzer-0.1.0.dist-info/entry_points.txt +2 -0
- krewlyzer-0.1.0.dist-info/licenses/LICENSE +619 -0
- krewlyzer-0.1.0.dist-info/top_level.txt +1 -0
krewlyzer/fsr.py
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
import typer
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Optional
|
|
4
|
+
import logging
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
import pysam
|
|
8
|
+
import pybedtools
|
|
9
|
+
import numpy as np
|
|
10
|
+
from rich.console import Console
|
|
11
|
+
from rich.logging import RichHandler
|
|
12
|
+
|
|
13
|
+
console = Console()
|
|
14
|
+
logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
|
|
15
|
+
logger = logging.getLogger("fsr")
|
|
16
|
+
|
|
17
|
+
from .helpers import gc_correct
|
|
18
|
+
|
|
19
|
+
def _calc_fsr(bedgz_input, bin_input, windows, continue_n, output_file):
    """
    Internal: Calculate fragment size ratio (FSR) for a single .bed.gz file.
    Writes region-based ratios for short, intermediate, and long fragments.

    Parameters
    ----------
    bedgz_input : str
        Path to a bgzipped, tabix-indexed fragment BED file (columns:
        chrom, start, end, gc — as written by motif.py).
    bin_input : str
        Path to a BED file of fixed-size genome bins.
    windows : int
        Bin width in bp; used only to reconstruct the output region labels.
    continue_n : int
        Number of consecutive bins merged into one output region.
    output_file : str
        Path of the tab-separated FSR table to write.

    Raises
    ------
    typer.Exit
        On any I/O or parsing failure.
    """
    try:
        logger.info(f"input file: {bedgz_input}, {bin_input}")
        try:
            inputbed = pysam.Tabixfile(filename=bedgz_input, mode="r")
        except Exception as e:
            logger.error(f"Could not open {bedgz_input} as Tabix file: {e}")
            raise typer.Exit(1)
        try:
            bins = pybedtools.BedTool(bin_input)
        except Exception as e:
            logger.error(f"Could not load bins from {bin_input}: {e}")
            raise typer.Exit(1)
        length = len(bins)
        # NOTE(review): totals_data is never appended to, and bingc (per-bin
        # mean GC, np.nan for empty bins) is collected but not written out.
        shorts_data, intermediates_data, longs_data, totals_data, bingc = [], [], [], [], []
        chrom = []
        logger.info(f"output file: {output_file}")
        for idx in range(length):
            bin = bins[idx]  # NOTE: shadows the builtin bin()
            try:
                chrom.append(bin.chrom)
                # Probe fetch: raises ValueError when the contig is absent
                # from the tabix index; real iteration happens in the else.
                inputbed.fetch(bin.chrom, bin.start, bin.end)
            except ValueError:
                # Contig missing from the index: record an empty bin.
                bingc.append(np.nan)
                shorts_data.append(0)
                intermediates_data.append(0)
                longs_data.append(0)
            except Exception as e:
                logger.error(f"Error fetching bin {bin}: {e}")
                raise typer.Exit(1)
            else:
                bin_data = []
                gc = []
                try:
                    for read in inputbed.fetch(bin.chrom, bin.start, bin.end):
                        # Fragment length = BED end - start (columns 3 and 2).
                        bin_data.append(int(read.split("\t")[2]) - int(read.split("\t")[1]))
                        if 65 <= int(read.split("\t")[2]) - int(read.split("\t")[1]) <= 400:
                            gc.append(float(read.split("\t")[3]))
                    # Histogram of fragment lengths, indexable up to 400 bp.
                    count = np.bincount(bin_data, minlength=401)
                except Exception as e:
                    logger.error(f"Error processing reads in bin {bin}: {e}")
                    raise typer.Exit(1)
                if len(gc) == 0:
                    bingc.append(np.nan)
                else:
                    bingc.append(np.mean(gc))
                # NOTE(review): slices are half-open, so lengths 150 and 260
                # fall into no class and 400 is excluded from every sum —
                # confirm these boundaries are intentional.
                shorts = sum(count[65:150])
                intermediates = sum(count[151:260])
                longs = sum(count[261:400])
                totals = sum(count[65:400])
                if totals == 0:
                    shorts_data.append(0)
                    intermediates_data.append(0)
                    longs_data.append(0)
                else:
                    shorts_data.append(shorts / totals)
                    intermediates_data.append(intermediates / totals)
                    longs_data.append(longs / totals)
        start = 0
        step = 0
        try:
            with open(output_file, 'w') as fsrfile:
                # NOTE(review): "itermediate-ratio" is a typo in the header,
                # kept as-is because downstream parsers may match it exactly.
                fsrfile.write("region\tshort-ratio\titermediate-ratio\tlong-ratio\n")
                while step < length:
                    # Bins on the current chromosome (assumes the bin file
                    # is grouped by chromosome).
                    num = chrom.count(chrom[step])
                    continues_bin = num // continue_n
                    last_bin = num % continue_n
                    for _ in range(continues_bin):
                        # Region label spans continue_n consecutive windows.
                        bin_start = start * windows
                        bin_end = (start + continue_n) * windows - 1
                        combine_shorts = shorts_data[step: step + continue_n]
                        combine_intermediates = intermediates_data[step: step + continue_n]
                        combine_longs = longs_data[step: step + continue_n]
                        tmp_array = np.zeros(3)
                        tmp_array[0] = np.mean(combine_shorts)
                        tmp_array[1] = np.mean(combine_intermediates)
                        tmp_array[2] = np.mean(combine_longs)
                        region = f"{chrom[step]}:{bin_start}-{bin_end}"
                        temp_str = f"{region}\t" + "\t".join(map(str, tmp_array)) + "\n"
                        fsrfile.write(temp_str)
                        step += continue_n
                        start += continue_n
                    # Leftover bins (< continue_n) at the chromosome end are
                    # skipped, not emitted.
                    if last_bin != 0:
                        step += last_bin
                    # Reset window coordinates for the next chromosome.
                    start = 0
        except Exception as e:
            logger.error(f"Error writing FSR output file: {e}")
            raise typer.Exit(1)
        logger.info(f"FSR calculation complete. Results written to {output_file}")
    except Exception as e:
        logger.error(f"Fatal error in _calc_fsr: {e}")
        raise typer.Exit(1)
|
|
115
|
+
|
|
116
|
+
def fsr(
    bedgz_path: Path = typer.Argument(..., help="Folder containing .bed.gz files (should be the output directory from motif.py)"),
    bin_input: Optional[Path] = typer.Option(None, "--bin-input", "-b", help="Path to bin file (default: data/ChormosomeBins/hg19_window_100kb.bed)"),
    windows: int = typer.Option(100000, "--windows", "-w", help="Window size (default: 100000)"),
    continue_n: int = typer.Option(50, "--continue-n", "-c", help="Consecutive window number (default: 50)"),
    output: Path = typer.Option(..., "--output", "-o", help="Output folder for results"),
    threads: int = typer.Option(1, "--threads", "-t", help="Number of parallel processes (default: 1)")
):
    """
    Calculate fragment size ratio (FSR) features for all .bed.gz files in a folder.

    The input folder should be the output directory produced by motif.py,
    containing the .bed.gz files. One ``<sample>.FSR.txt`` table is written
    to the output directory per input file.

    Raises:
        typer.Exit: on missing inputs or an unwritable output directory.
    """
    # Input checks
    if not bedgz_path.exists():
        logger.error(f"Input directory not found: {bedgz_path}")
        raise typer.Exit(1)
    if bin_input and not bin_input.exists():
        logger.error(f"Bin input file not found: {bin_input}")
        raise typer.Exit(1)
    # Create the output directory once (the original repeated this step).
    try:
        output.mkdir(parents=True, exist_ok=True)
    except Exception as e:
        logger.error(f"Could not create output directory {output}: {e}")
        raise typer.Exit(1)
    bedgz_files = [f for f in bedgz_path.iterdir() if f.suffixes == ['.bed', '.gz']]
    if not bedgz_files:
        logger.error("No .bed.gz files found in the specified folder.")
        raise typer.Exit(1)
    if bin_input is None:
        # Packaged default bin file shipped alongside the module.
        bin_input = Path(__file__).parent / "data" / "ChormosomeBins" / "hg19_window_100kb.bed"
        logger.info(f"No bin_input specified. Using default: {bin_input}")
    if not bin_input.exists():
        logger.error(f"Bin input file does not exist: {bin_input}")
        raise typer.Exit(1)
    logger.info(f"Calculating FSR for {len(bedgz_files)} files...")
    from concurrent.futures import ProcessPoolExecutor, as_completed
    logger.info(f"Starting parallel FSR calculation using {threads} processes...")
    # Submit the module-level _calc_fsr directly. The original wrapped it in
    # a closure defined inside fsr(), which ProcessPoolExecutor cannot
    # pickle, so every submission failed at runtime.
    with ProcessPoolExecutor(max_workers=threads) as executor:
        futures = {}
        for bedgz_file in bedgz_files:
            # <sample>.bed.gz -> <sample>.FSR.txt (stem strips only .gz).
            output_file = output / (bedgz_file.stem.replace('.bed', '') + '.FSR.txt')
            future = executor.submit(
                _calc_fsr, str(bedgz_file), str(bin_input), windows, continue_n, str(output_file)
            )
            futures[future] = (bedgz_file, str(output_file))
        for future in as_completed(futures):
            bedgz_file, result = futures[future]
            try:
                future.result()
                logger.info(f"FSR calculated: {result}")
            except Exception as exc:
                logger.error(f"FSR calculation failed for {bedgz_file}: {exc}")
    logger.info(f"FSR features calculated for {len(bedgz_files)} files.")
|
krewlyzer/helpers.py
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
import pysam
|
|
2
|
+
import itertools
|
|
3
|
+
import os
|
|
4
|
+
import numpy as np
|
|
5
|
+
import pandas as pd
|
|
6
|
+
import math
|
|
7
|
+
from collections import defaultdict
|
|
8
|
+
from rich.logging import RichHandler
|
|
9
|
+
import logging
|
|
10
|
+
from skmisc.loess import loess
|
|
11
|
+
import numpy as np
|
|
12
|
+
|
|
13
|
+
logging.basicConfig(level="INFO", handlers=[RichHandler()], format="%(message)s")
|
|
14
|
+
logger = logging.getLogger("krewlyzer-helpers")
|
|
15
|
+
|
|
16
|
+
def gc_correct(coverage, bias):
    """
    Perform GC bias correction on coverage values using LOESS regression.
    Logs errors and raises commonError if fitting fails.

    Parameters
    ----------
    coverage : sequence of numbers
        Per-bin coverage values.
    bias : sequence of numbers
        Per-bin GC bias values; np.nan marks bins to exclude from fitting.

    Returns
    -------
    list
        Corrected coverage, same length as the input; invalid bins and
        negative corrections are clamped to 0.
    """
    covl = len(coverage)
    # valid[i] marks whether bin i had a usable (non-nan) bias value.
    valid = [True for _ in range(covl)]
    temp_cov = []
    temp_bias = []
    for i in range(covl):
        if np.isnan(bias[i]):
            valid[i] = False
        else:
            temp_cov.append(coverage[i])
            temp_bias.append(bias[i])
    if not temp_cov or not temp_bias:
        logger.error("No valid coverage/bias values for GC correction.")
        raise commonError("No valid coverage/bias values for GC correction.")
    med = np.median(temp_cov)
    correct_cov = []
    try:
        # Dense grid over the observed bias range for the smoothing pass.
        i = np.arange(np.min(temp_bias), np.max(temp_bias), 0.001)
        # First LOESS: coverage as a function of GC bias.
        coverage_trend = loess(temp_bias, temp_cov, span=0.75)
        coverage_trend.fit()
        # Second LOESS smooths the first fit evaluated on the dense grid.
        coverage_model = loess(i, coverage_trend.predict(i, stderror=True).values)
        coverage_model.fit()
        coverage_pred = coverage_model.predict(temp_bias, stderror=True)
        pred = np.array(coverage_pred.values)
        # Subtract the fitted trend and re-center on the median coverage
        # (list - ndarray relies on numpy broadcasting).
        coverage_corrected = temp_cov - pred + med
    except Exception as e:
        logger.error(f"GC correction failed: {e}")
        raise commonError(f"GC correction failed: {e}")
    # Scatter corrected values back to the original bin positions:
    # i walks all bins, j walks the compacted valid-only arrays.
    i, j = 0, 0
    while i < covl:
        if valid[i]:
            if coverage_corrected[j] < 0:
                correct_cov.append(0)
            else:
                correct_cov.append(coverage_corrected[j])
            j += 1
        else:
            # Bins with nan bias get zero corrected coverage.
            correct_cov.append(0)
        i += 1
    return correct_cov
|
|
60
|
+
|
|
61
|
+
class commonError(Exception):
    """Generic krewlyzer error carrying a human-readable message."""

    def __init__(self, message):
        # Log at construction time so the failure is visible even if the
        # exception is swallowed by a broad handler upstream.
        logger.error(f"commonError: {message}")
        self.message = message
        # Forward to Exception so str(e) and e.args carry the message
        # (the original skipped this, making str(e) == "").
        super().__init__(message)
|
|
65
|
+
|
|
66
|
+
def maxCore(nCore=None):
    """
    Clamp a requested worker count to at most 16.

    Returns nCore unchanged when it is falsy (None/0) or already <= 16;
    otherwise logs a warning and returns 16.
    """
    capped = bool(nCore) and nCore > 16
    if not capped:
        return nCore
    logger.warning("Requested nCore > 16; capping to 16.")
    return 16

# Alias kept so the CLI can import either spelling.
max_core = maxCore
|
|
75
|
+
|
|
76
|
+
def rmEndString(x, y):
    """
    Strip any suffix in y from the end of x.

    Parameters
    ----------
    x : str
        Input string (e.g. a filename).
    y : iterable of str
        Candidate suffixes to remove.

    Returns
    -------
    str
        x with each matching suffix removed from the end only.
    """
    for item in y:
        if x.endswith(item):
            # Remove only the trailing occurrence. The original used
            # str.replace, which also deleted matches in the middle of the
            # string (e.g. "my.bam.sorted.bam" -> "my.sorted").
            x = x[: len(x) - len(item)]
    return x
|
|
81
|
+
|
|
82
|
+
def isSoftClipped(cigar):
    """
    Return True if any CIGAR operation is a clip or pad.

    cigar information:
        S BAM_CSOFT_CLIP 4
        H BAM_CHARD_CLIP 5
        P BAM_CPAD      6
    """
    return any(op in (4, 5, 6) for op, _ in cigar)
|
|
93
|
+
|
|
94
|
+
def GCcontent(seq):
    """
    Return the GC fraction of seq, case-insensitively.

    Only A/T/G/C characters contribute to the denominator; an empty or
    all-ambiguous sequence yields 0, and any failure is logged and
    returns 0.
    """
    try:
        upper = seq.upper()
        gc = upper.count("G") + upper.count("C")
        at = upper.count("A") + upper.count("T")
        denominator = gc + at
        if denominator > 0:
            return gc / denominator
        return 0
    except Exception as e:
        logger.error(f"GCcontent calculation failed: {e}")
        return 0
|
|
105
|
+
|
|
106
|
+
def read_pair_generator(bam, region_string=None):
    """
    Generate read pairs in a BAM file or within a region string.
    Reads are added to read_dict until a pair is found.
    Reference: https://www.biostars.org/p/306041/

    Yields (read1, read2) tuples of properly-paired, primary, mapped,
    non-duplicate alignments on the same reference, with no soft/hard
    clipping or padding and a non-zero template length.
    """
    # Maps query name -> [read1, read2] until both mates have been seen.
    read_dict = defaultdict(lambda: [None, None])
    try:
        for read in bam.fetch(region=region_string):
            # Keep only mapped, QC-pass, non-duplicate alignments.
            if read.is_unmapped or read.is_qcfail or read.is_duplicate:
                continue
            if not read.is_paired or not read.is_proper_pair:
                continue
            # Primary alignments only.
            if read.is_secondary or read.is_supplementary:
                continue
            if read.mate_is_unmapped:
                continue
            # Mate must map to the same reference sequence.
            if read.rnext != read.tid:
                continue
            if read.template_length == 0:
                continue
            # Skip alignments with soft/hard clips or pads entirely.
            if isSoftClipped(read.cigar):
                continue
            qname = read.query_name
            if qname not in read_dict:
                # First mate seen: stash it in its slot.
                if read.is_read1:
                    read_dict[qname][0] = read
                else:
                    read_dict[qname][1] = read
            else:
                # Second mate seen: emit the pair as (read1, read2).
                if read.is_read1:
                    yield read, read_dict[qname][1]
                else:
                    yield read_dict[qname][0], read
                del read_dict[qname]
    except Exception as e:
        # NOTE(review): a mid-iteration failure ends the generator early
        # after logging — callers see a truncated stream, not an error.
        logger.error(f"Error during BAM read pair generation: {e}")
        return
|
|
144
|
+
|
|
145
|
+
def reverse_seq(seq):
    """
    Complement each uppercase A/T/C/G in seq; all other characters
    (including lowercase bases and N) pass through unchanged.

    NOTE(review): despite the name, the result keeps the original order —
    the sequence is complemented but not reversed. Confirm this matches
    the reverse-complement semantics the callers expect.
    """
    pairing = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    return ''.join(pairing.get(base, base) for base in seq)
|
|
159
|
+
|
|
160
|
+
def get_End_motif(Emotif, seq1, seq2):
    """
    Tally two fragment-end k-mers into the Emotif counter dict.

    Pairs containing any N/n are skipped, seq2 is complemented via
    reverse_seq, and each k-mer that is already a key of Emotif is
    incremented. Returns the (mutated) Emotif dict.
    """
    joined = seq1 + seq2
    if 'N' in joined or 'n' in joined:
        return Emotif
    seq2 = reverse_seq(seq2)
    for kmer in (seq1, seq2):
        if kmer in Emotif:
            Emotif[kmer] += 1
    return Emotif
|
|
169
|
+
|
|
170
|
+
def calc_MDS(inputEndMotifFile, outputfile):
    """
    Compute the Motif Diversity Score (normalized Shannon entropy) of a
    k-mer frequency table and append it to outputfile.

    Parameters
    ----------
    inputEndMotifFile : str
        Tab-separated file with two columns: k-mer and its frequency
        (as written by the motif extraction step).
    outputfile : str
        File to append a "<input path>\\t<MDS>" line to.
    """
    inputfile = pd.read_table(inputEndMotifFile, header=None, names=['bases', 'frequency'])
    # Infer k from the table size: a complete k-mer table has 4**k rows.
    k_mer = math.log(len(inputfile), 4)
    frequency = inputfile['frequency'].to_numpy()
    # Entropy convention: 0 * log(0) == 0. Masking zero frequencies avoids
    # the nan that -f * log2(f) produced in the original when any k-mer
    # had frequency 0.
    nonzero = frequency[frequency > 0]
    MDS = np.sum(-nonzero * np.log2(nonzero) / np.log2(4 ** k_mer))
    with open(outputfile, 'a') as f:
        f.write(inputEndMotifFile + '\t' + str(MDS) + '\n')
|
|
177
|
+
|
|
178
|
+
def get_Breakpoint_motif(Bpmotif, seq1, seq2):
    """
    Tally two breakpoint-flanking k-mers into the Bpmotif counter dict.

    Pairs containing any N/n are skipped, seq2 is complemented via
    reverse_seq, and each k-mer that is already a key of Bpmotif is
    incremented. Returns the (mutated) Bpmotif dict.
    """
    # seq1 and seq2 do not include N
    joined = seq1 + seq2
    if 'N' in joined or 'n' in joined:
        return Bpmotif
    seq2 = reverse_seq(seq2)
    for kmer in (seq1, seq2):
        if kmer in Bpmotif:
            Bpmotif[kmer] += 1
    return Bpmotif
|
krewlyzer/motif.py
ADDED
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
# motif.py: Extracts motif-based features from BAM files
|
|
2
|
+
import typer
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Optional
|
|
5
|
+
import os
|
|
6
|
+
import pysam
|
|
7
|
+
import numpy as np
|
|
8
|
+
import pandas as pd
|
|
9
|
+
import math
|
|
10
|
+
import pybedtools
|
|
11
|
+
from collections import defaultdict
|
|
12
|
+
from .helpers import (
|
|
13
|
+
reverse_seq,
|
|
14
|
+
get_End_motif,
|
|
15
|
+
get_Breakpoint_motif,
|
|
16
|
+
GCcontent,
|
|
17
|
+
read_pair_generator,
|
|
18
|
+
maxCore,
|
|
19
|
+
rmEndString,
|
|
20
|
+
calc_MDS
|
|
21
|
+
)
|
|
22
|
+
from rich.progress import Progress
|
|
23
|
+
from rich.console import Console
|
|
24
|
+
from rich.logging import RichHandler
|
|
25
|
+
import logging
|
|
26
|
+
|
|
27
|
+
console = Console()
|
|
28
|
+
logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
|
|
29
|
+
logger = logging.getLogger("motif")
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def motif(
    bam_path: Path = typer.Argument(..., help="Path to input BAM file or directory of BAM files (GRCh37 aligned)"),
    genome_reference: Path = typer.Option(..., '-g', help="Path to genome reference file (GRCh37/hg19)"),
    output: Path = typer.Option(..., '-o', help="Output directory"),
    blacklist: Optional[Path] = typer.Option(None, '-b', help="Path to blacklist regions file"),
    map_quality: int = typer.Option(20, '-m', help="Minimum mapping quality"),
    min_length: int = typer.Option(65, '--minlen', help="Minimum fragment length"),
    max_length: int = typer.Option(400, '--maxlen', help="Maximum fragment length"),
    kmer: int = typer.Option(3, '-k', help="K-mer size for motif extraction"),
    chromosomes: Optional[str] = typer.Option(None, '--chromosomes', help="Comma-separated list of chromosomes to process"),
    verbose: bool = typer.Option(False, '--verbose', help="Enable verbose logging"),
    threads: int = typer.Option(1, '--threads', help="Number of parallel processes (default: 1)")
):
    """
    Extract motif-based features from BAM files.

    Extracts end motif, breakpoint motif, and Motif-Diversity Score (MDS)
    from one or more BAM files. If a directory is provided, every .bam file
    in it is processed in parallel with multiple processes. Output files are
    written to the output directory, with EDM, BPM, and MDS subfolders.
    """
    # Input checks
    if not bam_path.exists():
        logger.error(f"Input BAM file or directory not found: {bam_path}")
        raise typer.Exit(1)
    if not genome_reference.exists() or not genome_reference.is_file():
        logger.error(f"Reference genome file not found: {genome_reference}")
        raise typer.Exit(1)
    try:
        output.mkdir(parents=True, exist_ok=True)
    except Exception as e:
        logger.error(f"Could not create output directory {output}: {e}")
        raise typer.Exit(1)
    import concurrent.futures
    if verbose:
        logger.setLevel(logging.DEBUG)
    logger.info(f"Reference genome: {genome_reference}")
    logger.info(f"Output directory: {output}")
    if bam_path.is_dir():
        bam_files = sorted([f for f in bam_path.iterdir() if f.suffix == '.bam'])
        if not bam_files:
            logger.error(f"No BAM files found in directory: {bam_path}")
            raise typer.Exit(1)
        logger.info(f"Processing {len(bam_files)} BAM files in parallel using {threads} processes...")
        # Submit the module-level motif_process directly. The original
        # submitted a closure defined inside this function, which
        # ProcessPoolExecutor cannot pickle, so every job failed at runtime.
        with concurrent.futures.ProcessPoolExecutor(max_workers=threads) as executor:
            futures = {}
            for bam_file in bam_files:
                logger.info(f"Processing BAM: {bam_file}")
                future = executor.submit(
                    motif_process,
                    str(bam_file),
                    str(blacklist) if blacklist else None,
                    str(output / (bam_file.stem + '.bed')),
                    str(genome_reference),
                    chromosomes.split(',') if chromosomes else None,
                    map_quality,
                    kmer,
                    fragFilter=True,
                    minLen=min_length,
                    maxLen=max_length
                )
                futures[future] = bam_file
            for future in concurrent.futures.as_completed(futures):
                bam_file = futures[future]
                try:
                    future.result()
                    logger.info(f"Motif extraction complete for: {bam_file}")
                except Exception as exc:
                    logger.error(f"Motif extraction failed for {bam_file}: {exc}")
        logger.info(f"All BAM files processed.")
    else:
        logger.info(f"Processing BAM: {bam_path}")
        motif_process(
            str(bam_path),
            str(blacklist) if blacklist else None,
            str(output / (bam_path.stem + '.bed')),
            str(genome_reference),
            chromosomes.split(',') if chromosomes else None,
            map_quality,
            kmer,
            fragFilter=True,
            minLen=min_length,
            maxLen=max_length
        )
        logger.info("End motif, Breakpoint motif, and MDS extraction complete.")
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def motif_process(
    bamInput,
    blacklistInput,
    bedOutput,
    genome_reference,
    CHR,
    mapQuality,
    k_mer,
    fragFilter=False,
    minLen=None,
    maxLen=None
):
    """
    Main motif feature extraction process with rich logging and consistent CLI output.

    Parameters
    ----------
    bamInput : str
        Path to a coordinate-accessible BAM file.
    blacklistInput : str or None
        BED of regions whose fragments are removed from the output.
    bedOutput : str
        Destination fragment BED path; EDM/BPM/MDS subfolders are created
        next to it.
    genome_reference : str
        Indexed FASTA used for GC content and breakpoint flanks.
    CHR : list[str] or None
        Chromosomes to keep; None means all BAM references.
    mapQuality : int
        Minimum mapping quality for both mates.
    k_mer : int
        Motif length.
    fragFilter : bool
        When True, drop fragments outside [minLen, maxLen].
    minLen, maxLen : int or None
        Fragment length bounds used by fragFilter.
    """
    from rich.table import Table
    from rich.panel import Panel
    from rich.text import Text
    # EDM/BPM/MDS subfolders live alongside the fragment BED output.
    bedOutput_path = os.path.abspath(bedOutput)
    EDM_output_path = os.path.join(os.path.dirname(bedOutput_path), 'EDM')
    BPM_output_path = os.path.join(os.path.dirname(bedOutput_path), 'BPM')
    MDS_output_path = os.path.join(os.path.dirname(bedOutput_path), 'MDS')
    try:
        os.makedirs(EDM_output_path, exist_ok=True)
        os.makedirs(BPM_output_path, exist_ok=True)
        os.makedirs(MDS_output_path, exist_ok=True)
    except Exception as e:
        logger.error(f"Failed to create output directories: {e}")
        raise typer.Exit(1)
    # Counters keyed by every possible k-mer over ACTG, initialized to 0.
    bases = ['A', 'C', 'T', 'G']
    End_motif = {''.join(i): 0 for i in itertools.product(bases, repeat=k_mer)}
    Breakpoint_motif = {''.join(i): 0 for i in itertools.product(bases, repeat=k_mer)}
    try:
        bamfile = pysam.AlignmentFile(bamInput, 'rb')
    except Exception as e:
        logger.error(f"Failed to open BAM file: {e}")
        raise typer.Exit(1)
    try:
        genome = pysam.FastaFile(genome_reference)
    except Exception as e:
        logger.error(f"Failed to open genome FASTA: {e}")
        raise typer.Exit(1)
    # Fragments are written unsorted to a temp file, then sorted/filtered.
    temp_bed = bedOutput + '.tmp'
    try:
        bedWrite = open(temp_bed, 'w')
    except Exception as e:
        logger.error(f"Failed to open temp BED for writing: {e}")
        raise typer.Exit(1)
    chroms = CHR if CHR else list(bamfile.references)
    logger.info("Extracting motif features from BAM file...")
    # Rough progress total; falls back to 1M when the index has no count.
    total_pairs = bamfile.mapped // 2 if bamfile.mapped else 1000000
    motif_errors = 0
    with Progress(console=console, transient=True) as progress:
        task = progress.add_task("Processing fragments", total=total_pairs)
        for idx, pair in enumerate(read_pair_generator(bamfile)):
            try:
                read1, read2 = pair
                if read1.mapping_quality < mapQuality or read2.mapping_quality < mapQuality or read1.reference_name not in chroms:
                    continue
                read1Start = read1.reference_start
                read1End = read1.reference_end
                read2Start = read2.reference_start
                read2End = read2.reference_end
                # Orient the fragment: 5' end comes from the forward-strand
                # mate, 3' end from the reverse-strand mate.
                if not read1.is_reverse:
                    rstart = read1Start
                    rend = read2End
                    forward_end5 = read1.query_sequence[:k_mer].upper()
                    forward_end3 = read2.query_sequence[-k_mer:].upper()
                else:
                    rstart = read2Start
                    rend = read1End
                    forward_end5 = read2.query_sequence[:k_mer].upper()
                    forward_end3 = read1.query_sequence[-k_mer:].upper()
                if (rstart < 0) or (rend < 0) or (rstart >= rend):
                    continue
                if fragFilter:
                    readLen = rend - rstart
                    # NOTE(review): minLen=0 / maxLen=0 are falsy and would
                    # disable the corresponding bound.
                    if (minLen and readLen < minLen) or (maxLen and readLen > maxLen):
                        continue
                gc = GCcontent(genome.fetch(read1.reference_name, rstart, rend))
                # BED row: chrom, 1-based start/end, fragment GC fraction.
                bedWrite.write(f"{read1.reference_name}\t{rstart+1}\t{rend+1}\t{gc}\n")
                # NOTE(review): forward_end3 is passed twice here, so
                # forward_end5 never contributes to the end-motif counts —
                # confirm whether (forward_end5, forward_end3) was intended.
                End_motif = get_End_motif(End_motif, forward_end3, forward_end3)
                # Breakpoint motif: half from the reference flank, half from
                # the read end, split at pos = ceil(k/2).
                pos = math.ceil(k_mer / 2)
                try:
                    if k_mer % 2 == 0:
                        ref_seq1 = genome.fetch(read1.reference_name, rstart - pos, rstart).upper()
                        ref_seq2 = genome.fetch(read2.reference_name, rend, rend + pos).upper()
                        Breakpoint_motif = get_Breakpoint_motif(Breakpoint_motif, ref_seq1 + forward_end5[:pos], forward_end3[-pos:] + ref_seq2)
                    else:
                        # Odd k: the reference flank is one base shorter.
                        ref_seq1 = genome.fetch(read1.reference_name, rstart - pos + 1, rstart).upper()
                        ref_seq2 = genome.fetch(read2.reference_name, rend, rend + pos - 1).upper()
                        Breakpoint_motif = get_Breakpoint_motif(Breakpoint_motif, ref_seq1 + forward_end5[:pos], forward_end3[-pos:] + ref_seq2)
                except Exception as e:
                    motif_errors += 1
                    logger.warning(f"Motif extraction failed for fragment at {read1.reference_name}:{rstart}-{rend}: {e}")
                    continue
                # Advance the progress bar in coarse 10k steps.
                if idx % 10000 == 0:
                    progress.update(task, advance=10000)
            except Exception as e:
                motif_errors += 1
                logger.error(f"Unexpected error during fragment processing: {e}")
                continue
        progress.update(task, completed=total_pairs)
    bedWrite.close()
    logger.info("Filtering and sorting fragments with blacklist (if provided)...")
    try:
        bedData = pybedtools.BedTool(temp_bed)
        if blacklistInput:
            black_reigon = pybedtools.BedTool(blacklistInput)
            # A=True drops a fragment entirely on any blacklist overlap.
            bedData = bedData.subtract(black_reigon, A=True)
        bedData.sort(output=bedOutput)
        os.remove(temp_bed)
    except Exception as e:
        logger.error(f"Error during BED filtering/sorting: {e}")
        raise typer.Exit(1)
    # Write EndMotif
    edm_file = os.path.join(EDM_output_path, Path(bedOutput).stem + '.EndMotif')
    logger.info(f"Writing End Motif frequencies to {edm_file}")
    try:
        with open(edm_file, 'w') as f:
            # Normalize counts to frequencies; 0 when nothing was counted.
            total = sum(End_motif.values())
            for k, v in End_motif.items():
                f.write(f"{k}\t{v/total if total else 0}\n")
    except Exception as e:
        logger.error(f"Failed to write End Motif output: {e}")
        raise typer.Exit(1)
    # Write BreakPointMotif
    bpm_file = os.path.join(BPM_output_path, Path(bedOutput).stem + '.BreakPointMotif')
    logger.info(f"Writing Breakpoint Motif frequencies to {bpm_file}")
    try:
        with open(bpm_file, 'w') as f:
            total = sum(Breakpoint_motif.values())
            for k, v in Breakpoint_motif.items():
                f.write(f"{k}\t{v/total if total else 0}\n")
    except Exception as e:
        logger.error(f"Failed to write Breakpoint Motif output: {e}")
        raise typer.Exit(1)
    # Write MDS (Motif Diversity Score)
    mds_file = os.path.join(MDS_output_path, Path(bedOutput).stem + '.MDS')
    logger.info(f"Writing Motif Diversity Score to {mds_file}")
    try:
        # Normalized Shannon entropy of the end-motif frequencies; the
        # +1e-12 keeps log2 finite for zero frequencies.
        df = pd.read_csv(edm_file, sep='\t', header=None, names=['motif', 'frequency'])
        freq = df['frequency'].values
        mds = -np.sum(freq * np.log2(freq + 1e-12)) / np.log2(len(freq))
        with open(mds_file, 'w') as f:
            f.write(f"{mds}\n")
    except Exception as e:
        logger.error(f"Failed to write MDS output: {e}")
        raise typer.Exit(1)
    # Print summary
    summary_table = Table(title="Motif Extraction Summary", show_header=True, header_style="bold magenta")
    summary_table.add_column("Output Type", style="bold")
    summary_table.add_column("File Path")
    summary_table.add_row("End Motif (EDM)", edm_file)
    summary_table.add_row("Breakpoint Motif (BPM)", bpm_file)
    summary_table.add_row("Motif Diversity Score (MDS)", mds_file)
    console.print(Panel(summary_table, title="[green]Extraction Complete", subtitle=f"Motif errors: {motif_errors}", expand=False))
    logger.info("Motif feature extraction complete.")
|