krewlyzer 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- krewlyzer/__init__.py +3 -0
- krewlyzer/cli.py +53 -0
- krewlyzer/fsc.py +330 -0
- krewlyzer/fsd.py +170 -0
- krewlyzer/fsr.py +225 -0
- krewlyzer/helpers.py +237 -0
- krewlyzer/mfsd.py +236 -0
- krewlyzer/motif.py +430 -0
- krewlyzer/ocf.py +133 -0
- krewlyzer/uxm.py +188 -0
- krewlyzer/wps.py +264 -0
- krewlyzer/wrapper.py +147 -0
- krewlyzer-0.1.4.dist-info/METADATA +22 -0
- krewlyzer-0.1.4.dist-info/RECORD +18 -0
- krewlyzer-0.1.4.dist-info/WHEEL +5 -0
- krewlyzer-0.1.4.dist-info/entry_points.txt +2 -0
- krewlyzer-0.1.4.dist-info/licenses/LICENSE +619 -0
- krewlyzer-0.1.4.dist-info/top_level.txt +1 -0
krewlyzer/motif.py
ADDED
@@ -0,0 +1,430 @@
import typer
from pathlib import Path
from typing import Optional
import os
import pysam
import numpy as np
import pandas as pd
import math

from collections import defaultdict
import itertools
from .helpers import (
    reverse_complement,
    get_End_motif,
    get_Breakpoint_motif,
    GCcontent,
    read_pair_generator,
    maxCore,
    rmEndString,
    calc_MDS
)
from rich.progress import Progress
from rich.console import Console
from rich.logging import RichHandler
import logging

console = Console()
logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
logger = logging.getLogger("motif")


def motif(
    bam_path: Path = typer.Argument(..., help="Path to input BAM file or directory of BAM files (GRCh37 aligned)"),
    genome_reference: Path = typer.Option(..., '-g', help="Path to genome reference file (GRCh37/hg19)"),
    output: Path = typer.Option(..., '-o', help="Output directory"),
    blacklist: Optional[Path] = typer.Option(None, '-b', help="Path to blacklist regions file"),
    map_quality: int = typer.Option(20, '-m', help="Minimum mapping quality"),
    min_length: int = typer.Option(65, '--minlen', help="Minimum fragment length"),
    max_length: int = typer.Option(400, '--maxlen', help="Maximum fragment length"),
    kmer: int = typer.Option(3, '-k', help="K-mer size for motif extraction"),
    chromosomes: Optional[str] = typer.Option(None, '--chromosomes', help="Comma-separated list of chromosomes to process"),
    verbose: bool = typer.Option(False, '--verbose', help="Enable verbose logging"),
    threads: int = typer.Option(1, '--threads', help="Number of parallel processes (default: 1)")
):
    """
    Extract end motif, breakpoint motif, and Motif Diversity Score (MDS) features from BAM files.

    If a directory is provided, all BAM files in it are processed in parallel using
    multiple processes. Output files are written to the output directory, with EDM,
    BPM, and MDS subfolders.
    """
    # Input checks
    if not bam_path.exists():
        logger.error(f"Input BAM file or directory not found: {bam_path}")
        raise typer.Exit(1)
    if not genome_reference.exists() or not genome_reference.is_file():
        logger.error(f"Reference genome file not found: {genome_reference}")
        raise typer.Exit(1)
    try:
        output.mkdir(parents=True, exist_ok=True)
    except Exception as e:
        logger.error(f"Could not create output directory {output}: {e}")
        raise typer.Exit(1)
    import concurrent.futures
    if verbose:
        logger.setLevel(logging.DEBUG)
    logger.info(f"Reference genome: {genome_reference}")
    logger.info(f"Output directory: {output}")
    if bam_path.is_dir():
        bam_files = sorted([f for f in bam_path.iterdir() if f.suffix == '.bam'])
        if not bam_files:
            logger.error(f"No BAM files found in directory: {bam_path}")
            raise typer.Exit(1)
        logger.info(f"Processing {len(bam_files)} BAM files in parallel using {threads} processes...")
        with concurrent.futures.ProcessPoolExecutor(max_workers=threads) as executor:
            # Submit the module-level motif_process directly: a nested worker
            # function cannot be pickled by ProcessPoolExecutor.
            futures = {
                executor.submit(
                    motif_process,
                    str(bam_file),
                    str(blacklist) if blacklist else None,
                    str(output / (bam_file.stem + '.bed')),
                    str(genome_reference),
                    chromosomes.split(',') if chromosomes else None,
                    map_quality,
                    kmer,
                    fragFilter=True,
                    minLen=min_length,
                    maxLen=max_length
                ): bam_file
                for bam_file in bam_files
            }
            for future in concurrent.futures.as_completed(futures):
                bam_file = futures[future]
                try:
                    future.result()
                    logger.info(f"Motif extraction complete for: {bam_file}")
                except Exception as exc:
                    logger.error(f"Motif extraction failed for {bam_file}: {exc}")
        logger.info("All BAM files processed.")
    else:
        logger.info(f"Processing BAM: {bam_path}")
        motif_process(
            str(bam_path),
            str(blacklist) if blacklist else None,
            str(output / (bam_path.stem + '.bed')),
            str(genome_reference),
            chromosomes.split(',') if chromosomes else None,
            map_quality,
            kmer,
            fragFilter=True,
            minLen=min_length,
            maxLen=max_length
        )
        logger.info("End motif, Breakpoint motif, and MDS extraction complete.")


def motif_process(
    bamInput: str | Path,
    blacklistInput: str | Path | None,
    bedOutput: str | Path,
    genome_reference: str | Path,
    CHR: list[str] | None,
    mapQuality: int,
    k_mer: int,
    fragFilter: bool = False,
    minLen: int | None = None,
    maxLen: int | None = None
):
    """
    Main motif feature extraction process with rich logging and consistent CLI output.
    """
    from rich.table import Table
    from rich.panel import Panel
    from rich.text import Text
    bedOutput_path = os.path.abspath(bedOutput)
    EDM_output_path = os.path.join(os.path.dirname(bedOutput_path), 'EDM')
    BPM_output_path = os.path.join(os.path.dirname(bedOutput_path), 'BPM')
    MDS_output_path = os.path.join(os.path.dirname(bedOutput_path), 'MDS')
    try:
        os.makedirs(EDM_output_path, exist_ok=True)
        os.makedirs(BPM_output_path, exist_ok=True)
        os.makedirs(MDS_output_path, exist_ok=True)
    except Exception as e:
        logger.error(f"Failed to create output directories: {e}")
        raise typer.Exit(1)
    bases = ['A', 'C', 'T', 'G']
    End_motif = {''.join(i): 0 for i in itertools.product(bases, repeat=k_mer)}
    Breakpoint_motif = {''.join(i): 0 for i in itertools.product(bases, repeat=k_mer)}
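    # These tables enumerate all 4**k_mer k-mers (64 trinucleotides for the
    # default k_mer = 3); with k_mer = 2 they would start
    # {'AA': 0, 'AC': 0, 'AT': 0, 'AG': 0, ...} following the base order above.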
    try:
        bamfile = pysam.AlignmentFile(bamInput, 'rb')
    except Exception as e:
        logger.error(f"Failed to open BAM file: {e}")
        raise typer.Exit(1)
    try:
        genome = pysam.FastaFile(genome_reference)
    except Exception as e:
        logger.error(f"Failed to open genome FASTA: {e}")
        raise typer.Exit(1)
    temp_bed = bedOutput + '.tmp'
    try:
        bedWrite = open(temp_bed, 'w')
    except Exception as e:
        logger.error(f"Failed to open temp BED for writing: {e}")
        raise typer.Exit(1)
    chroms = CHR if CHR else list(bamfile.references)
    logger.info("Extracting motif features from BAM file...")
    total_pairs = bamfile.mapped // 2 if bamfile.mapped else 1000000
    motif_errors = 0
    with Progress(console=console, transient=True) as progress:
        task = progress.add_task("Processing fragments", total=total_pairs)
        for idx, pair in enumerate(read_pair_generator(bamfile)):
            try:
                read1, read2 = pair
                if read1.mapping_quality < mapQuality or read2.mapping_quality < mapQuality or read1.reference_name not in chroms:
                    continue
                read1Start = read1.reference_start
                read1End = read1.reference_end
                read2Start = read2.reference_start
                read2End = read2.reference_end

                # Determine fragment coordinates and the k-mer at each 5' end
                if not read1.is_reverse:
                    rstart = read1Start
                    rend = read2End
                    # 5' end of fragment (strand 1)
                    motif_left = read1.query_sequence[:k_mer].upper()
                    # 5' end of fragment (strand 2), i.e. the 5' end of read 2
                    motif_right = read2.query_sequence[:k_mer].upper()
                else:
                    rstart = read2Start
                    rend = read1End
                    # 5' end of fragment (strand 1), i.e. the 5' end of read 2
                    motif_left = read2.query_sequence[:k_mer].upper()
                    # 5' end of fragment (strand 2), i.e. the 5' end of read 1
                    motif_right = read1.query_sequence[:k_mer].upper()

                if (rstart < 0) or (rend < 0) or (rstart >= rend):
                    continue
                if fragFilter:
                    readLen = rend - rstart
                    if (minLen and readLen < minLen) or (maxLen and readLen > maxLen):
                        continue

                # Write BED (0-based start, 0-based exclusive end)
                gc = GCcontent(genome.fetch(read1.reference_name, rstart, rend))
                bedWrite.write(f"{read1.reference_name}\t{rstart}\t{rend}\t{gc}\n")

                # Update end motif counts
                End_motif = get_End_motif(End_motif, motif_left, motif_right)

                # Update breakpoint motif counts. The breakpoint motif captures the
                # genomic context AROUND each nick, ceil(k_mer/2) bases on each side:
                #   left nick  (rstart): [genomic pre][fragment start]
                #   right nick (rend):   [fragment end][genomic post]
                pos = math.ceil(k_mer / 2)

                try:
                    # Left breakpoint: reference bases before rstart, then the first
                    # bases of the fragment. The fragment part comes from the read
                    # itself so the actual fragment end (not the reference) is used.
                    ref_seq_left = genome.fetch(read1.reference_name, rstart - pos, rstart).upper()
                    frag_seq_left = motif_left[:pos]
                    bp_left_seq = ref_seq_left + frag_seq_left

                    # Right breakpoint: reference bases after rend.
                    ref_seq_right = genome.fetch(read1.reference_name, rend, rend + pos).upper()

                    # motif_right is the 5' end of the reverse read, i.e. 5'->3' on
                    # the minus strand, so its reverse complement gives the last
                    # bases of the fragment on the plus strand.
                    frag_seq_right = reverse_complement(motif_right[:pos])
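                    # Worked example (illustrative): with k_mer = 3, pos = 2 and a
                    # reverse read starting "TGA", motif_right[:pos] == "TG" and
                    # reverse_complement("TG") == "CA", the plus-strand fragment end.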

                    bp_right_seq = frag_seq_right + ref_seq_right

                    Breakpoint_motif = get_Breakpoint_motif(Breakpoint_motif, bp_left_seq, bp_right_seq)

                except Exception as e:
                    motif_errors += 1
                    # logger.warning(f"Motif extraction failed for fragment at {read1.reference_name}:{rstart}-{rend}: {e}")
                    continue
                if idx % 10000 == 0:
                    progress.update(task, advance=10000)
            except Exception as e:
                motif_errors += 1
                logger.error(f"Unexpected error during fragment processing: {e}")
                continue
        progress.update(task, completed=total_pairs)
    bedWrite.close()
    logger.info("Filtering and sorting fragments with blacklist (if provided)...")
    try:
        # Load the temp BED into a DataFrame (columns: chrom, start, end, gc)
        df = pd.read_csv(temp_bed, sep='\t', header=None, names=['chrom', 'start', 'end', 'gc'], dtype={'chrom': str, 'start': int, 'end': int, 'gc': float})

        if blacklistInput:
            logger.info(f"Filtering with blacklist: {blacklistInput}")
            # Load the blacklist, assumed to be BED format: chrom, start, end
            bl_df = pd.read_csv(blacklistInput, sep='\t', header=None, usecols=[0, 1, 2], names=['chrom', 'start', 'end'], dtype={'chrom': str, 'start': int, 'end': int})

            # Drop fragments that overlap ANY blacklist interval. Two intervals
            # overlap when max(start1, start2) < min(end1, end2). The check below
            # is vectorized over fragments but iterates over blacklist intervals,
            # i.e. O(N_fragments * M_intervals) per chromosome. Typical blacklists
            # (e.g. ENCODE, a few hundred regions) are small enough that this
            # in-memory approach is competitive with pybedtools without the extra
            # dependency; a very large blacklist would call for an interval tree
            # or a sort-and-sweep instead.
            keep_mask = np.ones(len(df), dtype=bool)

            for chrom, bl_group in bl_df.groupby('chrom'):
                if chrom not in df['chrom'].values:
                    continue

                chrom_mask = (df['chrom'] == chrom)
                chrom_df_indices = df.index[chrom_mask]

                # Starts and ends for this chromosome
                starts = df.loc[chrom_mask, 'start'].values
                ends = df.loc[chrom_mask, 'end'].values

                for _, bl_row in bl_group.iterrows():
                    bl_start = bl_row['start']
                    bl_end = bl_row['end']

                    # Vectorized overlap check against this blacklist interval
                    overlap = (starts < bl_end) & (ends > bl_start)
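                    # E.g. fragments [100, 180), [150, 250) and [300, 400) checked
                    # against blacklist interval [200, 320) give [False, True, True].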

                    if np.any(overlap):
                        # Map the subset mask back to original DataFrame indices
                        drop_indices = chrom_df_indices[overlap]
                        keep_mask[drop_indices] = False

            df = df[keep_mask]

        # Sort
        df.sort_values(by=['chrom', 'start', 'end'], inplace=True)

        # Write to output
        df.to_csv(bedOutput, sep='\t', header=False, index=False)

        os.remove(temp_bed)

        # Compress and index
        logger.info(f"Compressing and indexing {bedOutput}...")
        pysam.tabix_compress(bedOutput, bedOutput + ".gz", force=True)
        pysam.tabix_index(bedOutput + ".gz", preset="bed", force=True)
        # Remove the uncompressed file to save space and avoid confusion
        if os.path.exists(bedOutput):
            os.remove(bedOutput)

    except Exception as e:
        logger.error(f"Error during BED filtering/sorting/compression: {e}")
        raise typer.Exit(1)
    # Write EndMotif
    edm_file = os.path.join(EDM_output_path, Path(bedOutput).stem + '.EndMotif')
    logger.info(f"Writing End Motif frequencies to {edm_file}")
    try:
        with open(edm_file, 'w') as f:
            total = sum(End_motif.values())
            for k, v in End_motif.items():
                f.write(f"{k}\t{v/total if total else 0}\n")
    except Exception as e:
        logger.error(f"Failed to write End Motif output: {e}")
        raise typer.Exit(1)
    # Write BreakPointMotif
    bpm_file = os.path.join(BPM_output_path, Path(bedOutput).stem + '.BreakPointMotif')
    logger.info(f"Writing Breakpoint Motif frequencies to {bpm_file}")
    try:
        with open(bpm_file, 'w') as f:
            total = sum(Breakpoint_motif.values())
            for k, v in Breakpoint_motif.items():
                f.write(f"{k}\t{v/total if total else 0}\n")
    except Exception as e:
        logger.error(f"Failed to write Breakpoint Motif output: {e}")
        raise typer.Exit(1)
    # Write MDS (Motif Diversity Score)
    mds_file = os.path.join(MDS_output_path, Path(bedOutput).stem + '.MDS')
    logger.info(f"Writing Motif Diversity Score to {mds_file}")
    try:
        df = pd.read_csv(edm_file, sep='\t', header=None, names=['motif', 'frequency'])
        freq = df['frequency'].values
        mds = -np.sum(freq * np.log2(freq + 1e-12)) / np.log2(len(freq))
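        # MDS is the Shannon entropy of the end-motif frequencies normalized by
        # log2(4**k_mer), so a uniform distribution scores ~1 and a single
        # dominant motif ~0; the 1e-12 guards against log2(0).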
        with open(mds_file, 'w') as f:
            f.write(f"{mds}\n")
    except Exception as e:
        logger.error(f"Failed to write MDS output: {e}")
        raise typer.Exit(1)
    # Print summary
    summary_table = Table(title="Motif Extraction Summary", show_header=True, header_style="bold magenta")
    summary_table.add_column("Output Type", style="bold")
    summary_table.add_column("File Path")
    summary_table.add_row("End Motif (EDM)", edm_file)
    summary_table.add_row("Breakpoint Motif (BPM)", bpm_file)
    summary_table.add_row("Motif Diversity Score (MDS)", mds_file)
    summary_table.add_row("Fragment BED", bedOutput + ".gz")
    console.print(Panel(summary_table, title="[green]Extraction Complete", subtitle=f"Motif errors: {motif_errors}", expand=False))
    logger.info("Motif feature extraction complete.")
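For orientation, a minimal sketch of calling the extractor directly from Python rather than through the CLI; this snippet is not part of the package, and the file paths are illustrative:

from krewlyzer.motif import motif_process

motif_process(
    "sample.bam",          # bamInput: GRCh37-aligned, paired-end BAM (illustrative path)
    None,                  # blacklistInput: optional blacklist BED
    "out/sample.bed",      # bedOutput: EDM/, BPM/ and MDS/ are created alongside
    "hg19.fa",             # genome_reference: indexed FASTA (illustrative path)
    None,                  # CHR: None processes all references in the BAM
    20,                    # mapQuality
    3,                     # k_mer
    fragFilter=True,
    minLen=65,
    maxLen=400
)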
krewlyzer/ocf.py
ADDED
@@ -0,0 +1,133 @@
import typer
from pathlib import Path
from typing import Optional
import logging
import pysam
import pandas as pd
from collections import defaultdict
from functools import partial
from rich.console import Console
from rich.logging import RichHandler
from concurrent.futures import ProcessPoolExecutor, as_completed
import os

console = Console()
logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
logger = logging.getLogger("ocf")


def calc_ocf(bedgz_file: Path, ocr_file: Path, output_dir: Path) -> None:
    """
    Calculate OCF for a single .bed.gz file and OCR region file.
    Output is per-region .sync.end files and a summary all.ocf.csv.
    """
    try:
        tbx = pysam.TabixFile(str(bedgz_file))
        regions = pd.read_csv(ocr_file, sep="\t", header=None, names=["chr", "start", "end", "description"])
        leftPOS = defaultdict(partial(defaultdict, int))
        rightPOS = defaultdict(partial(defaultdict, int))
        total = defaultdict(lambda: [0, 0])
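        # Two-level counters keyed by region label, then position: e.g.
        # leftPOS["Liver"][42] += 1 auto-creates the inner dict and the count,
        # and unseen positions read back as 0 ("Liver" is an illustrative label).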
        for _, region in regions.iterrows():
            region_Chr, region_Start, region_End, region_Label = (
                region["chr"], region["start"], region["end"], region["description"])
            try:
                fetched_reads = tbx.fetch(region_Chr, region_Start, region_End)
            except ValueError:
                continue
            for row in fetched_reads:
                tmp_row = row.split()
                rstart = int(tmp_row[1])
                rend = int(tmp_row[2])
                if rstart >= region_Start:
                    s = rstart - region_Start
                    leftPOS[region_Label][s] += 1
                    total[region_Label][0] += 1
                if rend <= region_End:
                    e = rend - region_Start + 1
                    rightPOS[region_Label][e] += 1
                    total[region_Label][1] += 1
        Labels = []
        ocf = []
        outputfile = output_dir / 'all.ocf.csv'
        for label in total.keys():
            output = output_dir / f'{label}.sync.end'
            Labels.append(label)
            le = leftPOS[label]
            re = rightPOS[label]
            ts = total[label][0] / 10000 if total[label][0] else 1
            te = total[label][1] / 10000 if total[label][1] else 1
            num = 2000
            with open(output, 'w') as output_write:
                for k in range(num):
                    l = le[k]
                    r = re[k]
                    output_write.write(
                        f"{k - 1000}\t{l}\t{l / ts}\t{r}\t{r / te}\n")
            # OCF calculation
            with open(output, 'r') as o:
                peak = 60
                bin = 10
                trueends = 0
                background = 0
                for line in o.readlines():
                    loc, left, Left, right, Right = line.split()
                    loc = int(loc)
                    if -peak - bin <= loc <= -peak + bin:
                        trueends += float(Right)
                        background += float(Left)
                    elif peak - bin <= loc <= peak + bin:
                        trueends += float(Left)
                        background += float(Right)
                ocf.append(trueends - background)
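                # Illustrative numbers: if the informative strands contribute
                # rates 0.9 + 0.8 inside the two +/-10 bp windows around -60 and
                # +60, and the opposite strands contribute 0.1 + 0.2, then
                # OCF = 1.7 - 0.3 = 1.4.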
        ocf_df = pd.DataFrame({"tissue": Labels, "OCF": ocf})
        ocf_df.to_csv(outputfile, sep="\t", index=None)
        logger.info(f"OCF calculation complete for {bedgz_file}. Results in {output_dir}.")
    except Exception as e:
        logger.error(f"Fatal error in calc_ocf: {e}")
        raise typer.Exit(1)


def _run_ocf_file(bedgz_file: Path, ocr_input: Path, output: Path) -> str:
    # Module-level worker so ProcessPoolExecutor can pickle it
    sample_dir = output / bedgz_file.stem.replace('.bed', '')
    sample_dir.mkdir(exist_ok=True)
    calc_ocf(bedgz_file, ocr_input, sample_dir)
    return str(sample_dir)


def ocf(
    bedgz_path: Path = typer.Argument(..., help="Folder containing .bed.gz files (should be the output directory from motif.py)"),
    ocr_input: Optional[Path] = typer.Option(None, "--ocr-input", "-r", help="Path to open chromatin region BED file (default: packaged tissue file)"),
    output: Path = typer.Option(..., "--output", "-o", help="Output folder for results"),
    threads: int = typer.Option(1, "--threads", "-t", help="Number of parallel processes (default: 1)")
) -> None:
    """
    Calculate orientation-aware cfDNA fragmentation (OCF) features for all .bed.gz files in a folder.
    """
    # Input checks
    if not bedgz_path.exists():
        logger.error(f"Input directory not found: {bedgz_path}")
        raise typer.Exit(1)
    if ocr_input and not ocr_input.exists():
        logger.error(f"OCR region BED file not found: {ocr_input}")
        raise typer.Exit(1)
    try:
        output.mkdir(parents=True, exist_ok=True)
    except Exception as e:
        logger.error(f"Could not create output directory {output}: {e}")
        raise typer.Exit(1)
    # Set default OCR file if not provided
    if ocr_input is None:
        pkg_dir = Path(__file__).parent
        ocr_input = pkg_dir / "data/OpenChromatinRegion/7specificTissue.all.OC.bed"
    bedgz_files = sorted(Path(bedgz_path).glob("*.bed.gz"))
    with ProcessPoolExecutor(max_workers=threads) as executor:
        futures = {executor.submit(_run_ocf_file, bedgz_file, ocr_input, output): bedgz_file for bedgz_file in bedgz_files}
        for future in as_completed(futures):
            bedgz_file = futures[future]
            try:
                result = future.result()
                logger.info(f"OCF calculated: {result}")
            except Exception as exc:
                logger.error(f"OCF calculation failed for {bedgz_file}: {exc}")
    logger.info(f"OCF features calculated for {len(bedgz_files)} files.")
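Downstream of the ocf command, a minimal sketch of collecting the per-sample summaries into one table; this snippet is not part of the package, and the ocf_out directory name is illustrative (each sample subfolder holds an all.ocf.csv with tissue and OCF columns):

from pathlib import Path
import pandas as pd

results = Path("ocf_out")  # illustrative: the -o directory passed to the ocf command
frames = []
for f in sorted(results.glob("*/all.ocf.csv")):
    df = pd.read_csv(f, sep="\t")  # columns: tissue, OCF
    df["sample"] = f.parent.name
    frames.append(df)
combined = pd.concat(frames, ignore_index=True)
print(combined.pivot(index="sample", columns="tissue", values="OCF"))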