krewlyzer 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
krewlyzer/motif.py ADDED
@@ -0,0 +1,430 @@
+ import typer
+ from pathlib import Path
+ from typing import Optional
+ import os
+ import pysam
+ import numpy as np
+ import pandas as pd
+ import math
+
+ from collections import defaultdict
+ import itertools
+ from .helpers import (
+     reverse_complement,
+     get_End_motif,
+     get_Breakpoint_motif,
+     GCcontent,
+     read_pair_generator,
+     maxCore,
+     rmEndString,
+     calc_MDS
+ )
+ from rich.progress import Progress
+ from rich.console import Console
+ from rich.logging import RichHandler
+ import logging
+
+ console = Console()
+ logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
+ logger = logging.getLogger("motif")
+
+
+ def motif(
+     bam_path: Path = typer.Argument(..., help="Path to input BAM file or directory of BAM files (GRCh37 aligned)"),
+     genome_reference: Path = typer.Option(..., '-g', help="Path to genome reference file (GRCh37/hg19)"),
+     output: Path = typer.Option(..., '-o', help="Output directory"),
+     blacklist: Optional[Path] = typer.Option(None, '-b', help="Path to blacklist regions file"),
+     map_quality: int = typer.Option(20, '-m', help="Minimum mapping quality"),
+     min_length: int = typer.Option(65, '--minlen', help="Minimum fragment length"),
+     max_length: int = typer.Option(400, '--maxlen', help="Maximum fragment length"),
+     kmer: int = typer.Option(3, '-k', help="K-mer size for motif extraction"),
+     chromosomes: Optional[str] = typer.Option(None, '--chromosomes', help="Comma-separated list of chromosomes to process"),
+     verbose: bool = typer.Option(False, '--verbose', help="Enable verbose logging"),
+     threads: int = typer.Option(1, '--threads', help="Number of parallel processes (default: 1)")
+ ):
+     """
+     Extract end motif, breakpoint motif, and Motif Diversity Score (MDS) features from BAM files.
+
+     If a directory is provided, all BAM files in it are processed in parallel using multiple processes.
+     Output files are written to the output directory under EDM, BPM, and MDS subfolders.
+     """
+     # Input checks
+     if not bam_path.exists():
+         logger.error(f"Input BAM file or directory not found: {bam_path}")
+         raise typer.Exit(1)
+     if not genome_reference.exists() or not genome_reference.is_file():
+         logger.error(f"Reference genome file not found: {genome_reference}")
+         raise typer.Exit(1)
+     try:
+         output.mkdir(parents=True, exist_ok=True)
+     except Exception as e:
+         logger.error(f"Could not create output directory {output}: {e}")
+         raise typer.Exit(1)
+     import concurrent.futures
+     if verbose:
+         logger.setLevel(logging.DEBUG)
+     logger.info(f"Reference genome: {genome_reference}")
+     logger.info(f"Output directory: {output}")
+     if bam_path.is_dir():
+         bam_files = sorted([f for f in bam_path.iterdir() if f.suffix == '.bam'])
+         if not bam_files:
+             logger.error(f"No BAM files found in directory: {bam_path}")
+             raise typer.Exit(1)
+         logger.info(f"Processing {len(bam_files)} BAM files in parallel using {threads} processes...")
+
+         # NOTE: a locally defined worker is only picklable under the fork start
+         # method; on spawn-based platforms it would need to be module-level.
+         def run_motif_for_bam(bam_file):
+             logger.info(f"Processing BAM: {bam_file}")
+             motif_process(
+                 str(bam_file),
+                 str(blacklist) if blacklist else None,
+                 str(output / (bam_file.stem + '.bed')),
+                 str(genome_reference),
+                 chromosomes.split(',') if chromosomes else None,
+                 map_quality,
+                 kmer,
+                 fragFilter=True,
+                 minLen=min_length,
+                 maxLen=max_length
+             )
+             return str(bam_file)
+
+         with concurrent.futures.ProcessPoolExecutor(max_workers=threads) as executor:
+             futures = {executor.submit(run_motif_for_bam, bam_file): bam_file for bam_file in bam_files}
+             for future in concurrent.futures.as_completed(futures):
+                 bam_file = futures[future]
+                 try:
+                     result = future.result()
+                     logger.info(f"Motif extraction complete for: {result}")
+                 except Exception as exc:
+                     logger.error(f"Motif extraction failed for {bam_file}: {exc}")
+         logger.info("All BAM files processed.")
+     else:
+         logger.info(f"Processing BAM: {bam_path}")
+         motif_process(
+             str(bam_path),
+             str(blacklist) if blacklist else None,
+             str(output / (bam_path.stem + '.bed')),
+             str(genome_reference),
+             chromosomes.split(',') if chromosomes else None,
+             map_quality,
+             kmer,
+             fragFilter=True,
+             minLen=min_length,
+             maxLen=max_length
+         )
+     logger.info("End motif, Breakpoint motif, and MDS extraction complete.")
+
+
+ def motif_process(
+     bamInput: str | Path,
+     blacklistInput: str | Path | None,
+     bedOutput: str | Path,
+     genome_reference: str | Path,
+     CHR: list[str] | None,
+     mapQuality: int,
+     k_mer: int,
+     fragFilter: bool = False,
+     minLen: int | None = None,
+     maxLen: int | None = None
+ ):
+     """
+     Main motif feature extraction process with rich logging and consistent CLI output.
+     """
+     from rich.table import Table
+     from rich.panel import Panel
+     from rich.text import Text
+     bedOutput = str(bedOutput)  # normalize: later steps build paths via string concatenation
+     bedOutput_path = os.path.abspath(bedOutput)
+     EDM_output_path = os.path.join(os.path.dirname(bedOutput_path), 'EDM')
+     BPM_output_path = os.path.join(os.path.dirname(bedOutput_path), 'BPM')
+     MDS_output_path = os.path.join(os.path.dirname(bedOutput_path), 'MDS')
+     try:
+         os.makedirs(EDM_output_path, exist_ok=True)
+         os.makedirs(BPM_output_path, exist_ok=True)
+         os.makedirs(MDS_output_path, exist_ok=True)
+     except Exception as e:
+         logger.error(f"Failed to create output directories: {e}")
+         raise typer.Exit(1)
+     bases = ['A', 'C', 'T', 'G']
+     End_motif = {''.join(i): 0 for i in itertools.product(bases, repeat=k_mer)}
+     Breakpoint_motif = {''.join(i): 0 for i in itertools.product(bases, repeat=k_mer)}
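+     # e.g. k_mer = 3 yields 4**3 = 64 keys per dict: 'AAA', 'AAC', 'AAT', ..., 'GGG'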
+     try:
+         bamfile = pysam.AlignmentFile(bamInput, 'rb')
+     except Exception as e:
+         logger.error(f"Failed to open BAM file: {e}")
+         raise typer.Exit(1)
+     try:
+         genome = pysam.FastaFile(genome_reference)
+     except Exception as e:
+         logger.error(f"Failed to open genome FASTA: {e}")
+         raise typer.Exit(1)
+     temp_bed = bedOutput + '.tmp'
+     try:
+         bedWrite = open(temp_bed, 'w')
+     except Exception as e:
+         logger.error(f"Failed to open temp BED for writing: {e}")
+         raise typer.Exit(1)
+     chroms = CHR if CHR else list(bamfile.references)
+     logger.info("Extracting motif features from BAM file...")
+     # Estimate total pairs for the progress bar from index statistics (fallback: 1M)
+     total_pairs = bamfile.mapped // 2 if bamfile.mapped else 1000000
+     motif_errors = 0
+     with Progress(console=console, transient=True) as progress:
+         task = progress.add_task("Processing fragments", total=total_pairs)
+         for idx, pair in enumerate(read_pair_generator(bamfile)):
+             try:
+                 read1, read2 = pair
+                 if read1.mapping_quality < mapQuality or read2.mapping_quality < mapQuality or read1.reference_name not in chroms:
+                     continue
+                 read1Start = read1.reference_start
+                 read1End = read1.reference_end
+                 read2Start = read2.reference_start
+                 read2End = read2.reference_end
+
+                 # Determine fragment coordinates
+                 if not read1.is_reverse:
+                     rstart = read1Start
+                     rend = read2End
+                     # 5' end of fragment (strand 1) - 5' end of read 1
+                     motif_left = read1.query_sequence[:k_mer].upper()
+                     # 5' end of fragment (strand 2) - 5' end of read 2
+                     motif_right = read2.query_sequence[:k_mer].upper()
+                 else:
+                     rstart = read2Start
+                     rend = read1End
+                     # 5' end of fragment (strand 1) - 5' end of read 2
+                     motif_left = read2.query_sequence[:k_mer].upper()
+                     # 5' end of fragment (strand 2) - 5' end of read 1
+                     motif_right = read1.query_sequence[:k_mer].upper()
+
+                 if (rstart < 0) or (rend < 0) or (rstart >= rend):
+                     continue
+                 if fragFilter:
+                     readLen = rend - rstart
+                     if (minLen and readLen < minLen) or (maxLen and readLen > maxLen):
+                         continue
+
+                 # Write BED (0-based start, 0-based exclusive end)
+                 gc = GCcontent(genome.fetch(read1.reference_name, rstart, rend))
+                 bedWrite.write(f"{read1.reference_name}\t{rstart}\t{rend}\t{gc}\n")
+
+                 # Update End Motif
+                 End_motif = get_End_motif(End_motif, motif_left, motif_right)
+
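+                 # e.g. with k_mer = 3, a forward fragment whose reads begin with
+                 # "CCAGT..." (read 1) and "TGACC..." (read 2) passes motif_left = "CCA"
+                 # and motif_right = "TGA" to get_End_motif
+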
+                 # Update Breakpoint Motif: capture the genomic context AROUND each
+                 # nick, i.e. pos = ceil(k_mer / 2) bases on either side of the
+                 # breakpoint. The left breakpoint is rstart (fragment 5' end); the
+                 # right breakpoint is rend (fragment 3' end on the + strand).
+                 pos = math.ceil(k_mer / 2)
+
+                 try:
+                     # Left nick: [genomic bases upstream of rstart][first pos bases of the fragment]
+                     # The fragment part comes from the read itself, so true fragment
+                     # ends (including variants and errors) are captured.
+                     ref_seq_left = genome.fetch(read1.reference_name, rstart - pos, rstart).upper()
+                     frag_seq_left = motif_left[:pos]
+                     bp_left_seq = ref_seq_left + frag_seq_left
+
+                     # Right nick: [last pos bases of the fragment][genomic bases downstream of rend]
+                     # The fragment part is the leading k-mer of the mate read,
+                     # reverse-complemented to orient it on the + strand.
+                     ref_seq_right = genome.fetch(read1.reference_name, rend, rend + pos).upper()
+                     frag_seq_right = reverse_complement(motif_right[:pos])
+                     bp_right_seq = frag_seq_right + ref_seq_right
+
+                     Breakpoint_motif = get_Breakpoint_motif(Breakpoint_motif, bp_left_seq, bp_right_seq)
+                 except Exception as e:
+                     motif_errors += 1
+                     # logger.warning(f"Motif extraction failed for fragment at {read1.reference_name}:{rstart}-{rend}: {e}")
+                     continue
+                 if idx % 10000 == 0:
+                     progress.update(task, advance=10000)
+             except Exception as e:
+                 motif_errors += 1
+                 logger.error(f"Unexpected error during fragment processing: {e}")
+                 continue
+         progress.update(task, completed=total_pairs)
+     bedWrite.close()
+     logger.info("Filtering and sorting fragments with blacklist (if provided)...")
+     try:
+         # Load the temp BED into a DataFrame (columns: chrom, start, end, gc)
+         df = pd.read_csv(temp_bed, sep='\t', header=None, names=['chrom', 'start', 'end', 'gc'], dtype={'chrom': str, 'start': int, 'end': int, 'gc': float})
+
+         if blacklistInput:
+             logger.info(f"Filtering with blacklist: {blacklistInput}")
+             # Load blacklist; assumed BED format: chrom, start, end
+             bl_df = pd.read_csv(blacklistInput, sep='\t', header=None, usecols=[0, 1, 2], names=['chrom', 'start', 'end'], dtype={'chrom': str, 'start': int, 'end': int})
+
+             # Drop fragments that overlap ANY blacklist interval, chromosome by
+             # chromosome. The scan is O(fragments x blacklist intervals), which is
+             # fast for typical blacklists (e.g. ENCODE, a few hundred regions); a
+             # much larger blacklist would call for an interval tree or sorted sweep.
+             keep_mask = np.ones(len(df), dtype=bool)
+
+             for chrom, bl_group in bl_df.groupby('chrom'):
+                 chrom_mask = (df['chrom'] == chrom)
+                 if not chrom_mask.any():
+                     continue
+                 chrom_df_indices = df.index[chrom_mask]
+                 starts = df.loc[chrom_mask, 'start'].values
+                 ends = df.loc[chrom_mask, 'end'].values
+
+                 for _, bl_row in bl_group.iterrows():
+                     bl_start = bl_row['start']
+                     bl_end = bl_row['end']
+                     # Vectorized half-open overlap check: two intervals overlap
+                     # when each one starts before the other ends
+                     overlap = (starts < bl_end) & (ends > bl_start)
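+                     # e.g. a fragment [100, 180) and a blacklist region [150, 200)
+                     # overlap, since 100 < 200 and 180 > 150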
+                     if np.any(overlap):
+                         # Map the overlapping subset back to original row indices
+                         keep_mask[chrom_df_indices[overlap]] = False
+
+             df = df[keep_mask]
+
+         # Sort and write the final fragment BED
+         df = df.sort_values(by=['chrom', 'start', 'end'])
+         df.to_csv(bedOutput, sep='\t', header=False, index=False)
+
+         os.remove(temp_bed)
+
+         # Compress and index with tabix
+         logger.info(f"Compressing and indexing {bedOutput}...")
+         pysam.tabix_compress(bedOutput, bedOutput + ".gz", force=True)
+         pysam.tabix_index(bedOutput + ".gz", preset="bed", force=True)
+         # Remove the uncompressed file to save space and avoid confusion
+         if os.path.exists(bedOutput):
+             os.remove(bedOutput)
+
+     except Exception as e:
+         logger.error(f"Error during BED filtering/sorting/compression: {e}")
+         raise typer.Exit(1)
+     # Write EndMotif
+     edm_file = os.path.join(EDM_output_path, Path(bedOutput).stem + '.EndMotif')
+     logger.info(f"Writing End Motif frequencies to {edm_file}")
+     try:
+         with open(edm_file, 'w') as f:
+             total = sum(End_motif.values())
+             for k, v in End_motif.items():
+                 f.write(f"{k}\t{v / total if total else 0}\n")
+     except Exception as e:
+         logger.error(f"Failed to write End Motif output: {e}")
+         raise typer.Exit(1)
+     # Write BreakPointMotif
+     bpm_file = os.path.join(BPM_output_path, Path(bedOutput).stem + '.BreakPointMotif')
+     logger.info(f"Writing Breakpoint Motif frequencies to {bpm_file}")
+     try:
+         with open(bpm_file, 'w') as f:
+             total = sum(Breakpoint_motif.values())
+             for k, v in Breakpoint_motif.items():
+                 f.write(f"{k}\t{v / total if total else 0}\n")
+     except Exception as e:
+         logger.error(f"Failed to write Breakpoint Motif output: {e}")
+         raise typer.Exit(1)
+     # Write MDS (Motif Diversity Score): the normalized Shannon entropy of the
+     # end-motif frequency distribution (log base 2, divided by log2 of the
+     # number of motifs, so MDS lies in [0, 1])
+     mds_file = os.path.join(MDS_output_path, Path(bedOutput).stem + '.MDS')
+     logger.info(f"Writing Motif Diversity Score to {mds_file}")
+     try:
+         df = pd.read_csv(edm_file, sep='\t', header=None, names=['motif', 'frequency'])
+         freq = df['frequency'].values
+         # The 1e-12 pseudocount guards against log2(0) for unobserved motifs
+         mds = -np.sum(freq * np.log2(freq + 1e-12)) / np.log2(len(freq))
+         with open(mds_file, 'w') as f:
+             f.write(f"{mds}\n")
+     except Exception as e:
+         logger.error(f"Failed to write MDS output: {e}")
+         raise typer.Exit(1)
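+     # Sanity check: a uniform distribution over n = 4**k_mer motifs gives
+     # MDS = log2(n) / log2(n) = 1.0; a single dominant motif drives MDS toward 0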
+     # Print summary
+     summary_table = Table(title="Motif Extraction Summary", show_header=True, header_style="bold magenta")
+     summary_table.add_column("Output Type", style="bold")
+     summary_table.add_column("File Path")
+     summary_table.add_row("End Motif (EDM)", edm_file)
+     summary_table.add_row("Breakpoint Motif (BPM)", bpm_file)
+     summary_table.add_row("Motif Diversity Score (MDS)", mds_file)
+     summary_table.add_row("Fragment BED", bedOutput + ".gz")
+     console.print(Panel(summary_table, title="[green]Extraction Complete", subtitle=f"Motif errors: {motif_errors}", expand=False))
+     logger.info("Motif feature extraction complete.")
krewlyzer/ocf.py ADDED
@@ -0,0 +1,133 @@
+ import typer
+ from pathlib import Path
+ from typing import Optional
+ import logging
+ import pysam
+ import pandas as pd
+ from collections import defaultdict
+ from functools import partial
+ from rich.console import Console
+ from rich.logging import RichHandler
+ from concurrent.futures import ProcessPoolExecutor, as_completed
+ import os
+
+ console = Console()
+ logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
+ logger = logging.getLogger("ocf")
+
+
+ def calc_ocf(bedgz_file: Path, ocr_file: Path, output_dir: Path) -> None:
+     """
+     Calculate OCF for a single .bed.gz file and OCR region file.
+     Output is per-region .sync.end files and a summary all.ocf.csv.
+     """
+     try:
+         tbx = pysam.TabixFile(str(bedgz_file))
+         regions = pd.read_csv(ocr_file, sep="\t", header=None, names=["chr", "start", "end", "description"])
+         # Per-label counts of fragment end positions, keyed by offset from region start
+         leftPOS = defaultdict(partial(defaultdict, int))
+         rightPOS = defaultdict(partial(defaultdict, int))
+         total = defaultdict(lambda: [0, 0])
+         for _, region in regions.iterrows():
+             region_Chr, region_Start, region_End, region_Label = (
+                 region["chr"], region["start"], region["end"], region["description"])
+             try:
+                 fetched_reads = tbx.fetch(region_Chr, region_Start, region_End)
+             except ValueError:
+                 # Contig absent from the tabix index
+                 continue
+             for row in fetched_reads:
+                 tmp_row = row.split()
+                 rstart = int(tmp_row[1])
+                 rend = int(tmp_row[2])
+                 if rstart >= region_Start:
+                     s = rstart - region_Start
+                     leftPOS[region_Label][s] += 1
+                     total[region_Label][0] += 1
+                 if rend <= region_End:
+                     e = rend - region_Start + 1
+                     rightPOS[region_Label][e] += 1
+                     total[region_Label][1] += 1
+         Labels = []
+         ocf = []
+         outputfile = output_dir / 'all.ocf.csv'
+         for label in total.keys():
+             output = output_dir / f'{label}.sync.end'
+             Labels.append(label)
+             le = leftPOS[label]
+             re = rightPOS[label]
+             # Normalization factors: end counts per 10,000 fragment ends
+             ts = total[label][0] / 10000 if total[label][0] else 1
+             te = total[label][1] / 10000 if total[label][1] else 1
+             num = 2000
+             with open(output, 'w') as output_write:
+                 # One row per offset: k - 1000 spans -1000..999, i.e. positions
+                 # around the center of a 2 kb region, with raw and normalized
+                 # left/right end counts
+                 for k in range(num):
+                     l = le[k]
+                     r = re[k]
+                     output_write.write(f"{k - 1000}\t{l}\t{l / ts}\t{r}\t{r / te}\n")
+             # OCF calculation over the per-position profile just written
+             with open(output, 'r') as o:
+                 peak = 60
+                 win = 10
+                 trueends = 0
+                 background = 0
+                 for line in o:
+                     loc, left, Left, right, Right = line.split()
+                     loc = int(loc)
+                     if -peak - win <= loc <= -peak + win:
+                         trueends += float(Right)
+                         background += float(Left)
+                     elif peak - win <= loc <= peak + win:
+                         trueends += float(Left)
+                         background += float(Right)
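+             # e.g. peak = 60, win = 10 scores the windows [-70, -50] and [50, 70]:
+             # right ends in the upstream window and left ends in the downstream
+             # window count as "true" signal, the opposite orientation as background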
+             ocf.append(trueends - background)
+         ocf_df = pd.DataFrame({"tissue": Labels, "OCF": ocf})
+         ocf_df.to_csv(outputfile, sep="\t", index=False)
+         logger.info(f"OCF calculation complete for {bedgz_file}. Results in {output_dir}.")
+     except Exception as e:
+         logger.error(f"Fatal error in calc_ocf: {e}")
+         raise typer.Exit(1)
+
+
+ def ocf(
+     bedgz_path: Path = typer.Argument(..., help="Folder containing .bed.gz files (should be the output directory from motif.py)"),
+     ocr_input: Optional[Path] = typer.Option(None, "--ocr-input", "-r", help="Path to open chromatin region BED file (default: packaged tissue file)"),
+     output: Path = typer.Option(..., "--output", "-o", help="Output folder for results"),
+     threads: int = typer.Option(1, "--threads", "-t", help="Number of parallel processes (default: 1)")
+ ) -> None:
+     """
+     Calculate orientation-aware cfDNA fragmentation (OCF) features for all .bed.gz files in a folder.
+     """
+     # Input checks
+     if not bedgz_path.exists():
+         logger.error(f"Input directory not found: {bedgz_path}")
+         raise typer.Exit(1)
+     if ocr_input and not ocr_input.exists():
+         logger.error(f"OCR region BED file not found: {ocr_input}")
+         raise typer.Exit(1)
+     try:
+         output.mkdir(parents=True, exist_ok=True)
+     except Exception as e:
+         logger.error(f"Could not create output directory {output}: {e}")
+         raise typer.Exit(1)
+     # Set default OCR file if not provided
+     if ocr_input is None:
+         pkg_dir = Path(__file__).parent
+         ocr_input = pkg_dir / "data/OpenChromatinRegion/7specificTissue.all.OC.bed"
+     bedgz_files = sorted(bedgz_path.glob("*.bed.gz"))
+     if not bedgz_files:
+         logger.error(f"No .bed.gz files found in directory: {bedgz_path}")
+         raise typer.Exit(1)
+
+     # NOTE: a locally defined worker is only picklable under the fork start
+     # method; on spawn-based platforms it would need to be module-level.
+     def run_ocf_file(bedgz_file):
+         sample_dir = output / bedgz_file.stem.replace('.bed', '')
+         sample_dir.mkdir(exist_ok=True)
+         calc_ocf(bedgz_file, ocr_input, sample_dir)
+         return str(sample_dir)
+
+     with ProcessPoolExecutor(max_workers=threads) as executor:
+         futures = {executor.submit(run_ocf_file, bedgz_file): bedgz_file for bedgz_file in bedgz_files}
+         for future in as_completed(futures):
+             bedgz_file = futures[future]
+             try:
+                 result = future.result()
+                 logger.info(f"OCF calculated: {result}")
+             except Exception as exc:
+                 logger.error(f"OCF calculation failed for {bedgz_file}: {exc}")
+     logger.info(f"OCF features calculated for {len(bedgz_files)} files.")