krewlyzer 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
krewlyzer/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """cfDNAFE - Feature extraction tools for circulating tumor DNA"""
krewlyzer/cli.py ADDED
@@ -0,0 +1,47 @@
1
+ """Command-line interface for cfDNAFE"""
2
+
3
+ from typing import Optional
4
+ import typer
5
+ from pathlib import Path
6
+ from rich.console import Console
7
+ from rich.logging import RichHandler
8
+ import logging
9
+
10
+ console = Console()
11
+ logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
12
+ logger = logging.getLogger("krewlyzer-cli")
13
+
14
+ def set_log_level(log_level: str = typer.Option("INFO", "--log-level", help="Logging level: DEBUG, INFO, WARNING, ERROR, CRITICAL")):
15
+ """Set global logging level."""
16
+ level = getattr(logging, log_level.upper(), logging.INFO)
17
+ for handler in logging.root.handlers:
18
+ handler.setLevel(level)
19
+ logging.getLogger().setLevel(level)
20
+
21
+ app = typer.Typer(callback=set_log_level)
22
+
23
+ from .motif import motif
24
+ from .fsc import fsc
25
+ from .fsr import fsr
26
+ from .fsd import fsd
27
+ from .wps import wps
28
+ from .ocf import ocf
29
+ from .uxm import uxm
30
+ from .wrapper import run_all
31
+
32
+ app.command()(motif)
33
+ app.command()(fsc)
34
+ app.command()(fsr)
35
+ app.command()(fsd)
36
+ app.command()(wps)
37
+ app.command()(ocf)
38
+ app.command()(uxm)
39
+ app.command()(run_all)
40
+
41
+ @app.command()
42
+ def version() -> None:
43
+ """Show version information"""
44
+ logger.info("krewlyzer 0.1.0")
45
+
46
+ if __name__ == "__main__":
47
+ app()
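
Editor's note: the Typer app assembled above can be exercised programmatically. A minimal sketch (not part of the package) using Typer's test runner, assuming the wheel is installed as krewlyzer; the "version" subcommand and "--log-level" option come from the code shown:

# Hedged sketch: drive the CLI defined in krewlyzer/cli.py from Python.
from typer.testing import CliRunner
from krewlyzer.cli import app

runner = CliRunner()
# Callback options such as --log-level go before the subcommand name.
result = runner.invoke(app, ["--log-level", "DEBUG", "version"])
assert result.exit_code == 0  # version() only logs "krewlyzer 0.1.0"
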
krewlyzer/fsc.py ADDED
@@ -0,0 +1,450 @@
1
+ import typer
2
+ from pathlib import Path
3
+ from typing import Optional
4
+ import logging
5
+ import sys
+ import os
6
+
7
+ import pysam
8
+ import pybedtools
9
+ import numpy as np
10
+ import pandas as pd
11
+ from skmisc.loess import loess
12
+ from rich.console import Console
13
+ from rich.logging import RichHandler
14
+
15
+ console = Console()
16
+ logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
17
+ logger = logging.getLogger("fsc")
18
+
19
+
20
+ from .helpers import gc_correct
21
+
22
+
23
+ def _calc_fsr(bedgz_input, bin_input, windows, continue_n, output_file):
24
+ """
25
+ Internal: Calculate fragment size ratio (FSR) for a single .bed.gz file.
26
+ Writes region-based ratios for short, intermediate, and long fragments.
27
+ """
28
+ try:
29
+ logger.info(f"input file: {bedgz_input}, {bin_input}")
30
+ try:
31
+ inputbed = pysam.Tabixfile(filename=bedgz_input, mode="r")
32
+ except Exception as e:
33
+ logger.error(f"Could not open {bedgz_input} as Tabix file: {e}")
34
+ raise typer.Exit(1)
35
+ try:
36
+ bins = pybedtools.BedTool(bin_input)
37
+ except Exception as e:
38
+ logger.error(f"Could not load bins from {bin_input}: {e}")
39
+ raise typer.Exit(1)
40
+ length = len(bins)
41
+ shorts_data, intermediates_data, longs_data, totals_data, bingc = [], [], [], [], []
42
+ chrom = []
43
+ logger.info(f"output file: {output_file}")
44
+ for idx in range(length):
45
+ bin = bins[idx]
46
+ try:
47
+ chrom.append(bin.chrom)
48
+ inputbed.fetch(bin.chrom, bin.start, bin.end)
49
+ except ValueError:
50
+ bingc.append(np.nan)
51
+ shorts_data.append(0)
52
+ intermediates_data.append(0)
53
+ longs_data.append(0)
54
+ except Exception as e:
55
+ logger.error(f"Error fetching bin {bin}: {e}")
56
+ raise typer.Exit(1)
57
+ else:
58
+ bin_data = []
59
+ gc = []
60
+ try:
61
+ for read in inputbed.fetch(bin.chrom, bin.start, bin.end):
62
+ bin_data.append(int(read.split("\t")[2]) - int(read.split("\t")[1]))
63
+ if 65 <= int(read.split("\t")[2]) - int(read.split("\t")[1]) <= 400:
64
+ gc.append(float(read.split("\t")[3]))
65
+ count = np.bincount(bin_data, minlength=401)
66
+ except Exception as e:
67
+ logger.error(f"Error processing reads in bin {bin}: {e}")
68
+ raise typer.Exit(1)
69
+ if len(gc) == 0:
70
+ bingc.append(np.nan)
71
+ else:
72
+ bingc.append(np.mean(gc))
73
+ shorts = sum(count[65:150])
74
+ intermediates = sum(count[151:260])
75
+ longs = sum(count[261:400])
76
+ totals = sum(count[65:400])
77
+ if totals == 0:
78
+ shorts_data.append(0)
79
+ intermediates_data.append(0)
80
+ longs_data.append(0)
81
+ else:
82
+ shorts_data.append(shorts / totals)
83
+ intermediates_data.append(intermediates / totals)
84
+ longs_data.append(longs / totals)
85
+ start = 0
86
+ step = 0
87
+ try:
88
+ with open(output_file, 'w') as fsrfile:
89
+ fsrfile.write("region\tshort-ratio\tintermediate-ratio\tlong-ratio\n")
90
+ while step < length:
91
+ num = chrom.count(chrom[step])
92
+ continues_bin = num // continue_n
93
+ last_bin = num % continue_n
94
+ for _ in range(continues_bin):
95
+ bin_start = start * windows
96
+ bin_end = (start + continue_n) * windows - 1
97
+ combine_shorts = shorts_data[step: step + continue_n]
98
+ combine_intermediates = intermediates_data[step: step + continue_n]
99
+ combine_longs = longs_data[step: step + continue_n]
100
+ tmp_array = np.zeros(3)
101
+ tmp_array[0] = np.mean(combine_shorts)
102
+ tmp_array[1] = np.mean(combine_intermediates)
103
+ tmp_array[2] = np.mean(combine_longs)
104
+ region = f"{chrom[step]}:{bin_start}-{bin_end}"
105
+ temp_str = f"{region}\t" + "\t".join(map(str, tmp_array)) + "\n"
106
+ fsrfile.write(temp_str)
107
+ step += continue_n
108
+ start += continue_n
109
+ if last_bin != 0:
110
+ step += last_bin
111
+ start = 0
112
+ except Exception as e:
113
+ logger.error(f"Error writing FSR output file: {e}")
114
+ raise typer.Exit(1)
115
+ logger.info(f"FSR calculation complete. Results written to {output_file}")
116
+ except Exception as e:
117
+ logger.error(f"Fatal error in _calc_fsr: {e}")
118
+ raise typer.Exit(1)
119
+
120
+ def _calc_fsd(bedgz_input, arms_file, output_file):
121
+ """
122
+ Internal: Calculate fragment size distribution (FSD) for a single .bed.gz file.
123
+ Writes region-based fragment size distributions in 5bp bins from 65-399bp.
124
+ """
125
+ try:
126
+ logger.info(f"input file: {bedgz_input}, {arms_file}")
127
+ try:
128
+ inputbed = pysam.Tabixfile(filename=bedgz_input, mode="r")
129
+ except Exception as e:
130
+ logger.error(f"Could not open {bedgz_input} as Tabix file: {e}")
131
+ raise typer.Exit(1)
132
+ try:
133
+ bins = pybedtools.BedTool(arms_file)
134
+ except Exception as e:
135
+ logger.error(f"Could not load bins from {arms_file}: {e}")
136
+ raise typer.Exit(1)
137
+ length = len(bins)
138
+ interval_data = []
139
+ region = []
140
+ logger.info(f"output file: {output_file}")
141
+ for idx in range(length):
142
+ bin = bins[idx]
143
+ region.append(f"{bin.chrom}:{bin.start}-{bin.end}")
144
+ try:
145
+ inputbed.fetch(bin.chrom, bin.start, bin.end)
146
+ except ValueError:
147
+ interval_data.append([0] * 67)
148
+ continue
149
+ except Exception as e:
150
+ logger.error(f"Error fetching bin {bin}: {e}")
151
+ raise typer.Exit(1)
152
+ else:
153
+ bin_data = []
154
+ try:
155
+ for read in inputbed.fetch(bin.chrom, bin.start, bin.end):
156
+ bin_data.append(int(read.split("\t")[2]) - int(read.split("\t")[1]))
157
+ count = np.bincount(bin_data, minlength=401)
158
+ step_size = 5
159
+ start_bin = 65
160
+ end_bin = 400
161
+ bin_len = int((end_bin - start_bin) / step_size)
162
+ temp_bin = []
163
+ for bin_id in range(bin_len):
164
+ temp_bin.append(np.sum(count[(start_bin + step_size * bin_id):(start_bin + step_size * (bin_id + 1))]))
165
+ interval_data.append(temp_bin)
166
+ except Exception as e:
167
+ logger.error(f"Error processing reads in bin {bin}: {e}")
168
+ interval_data.append([0] * 67)
169
+ continue
170
+ try:
171
+ with open(output_file, 'w') as fsdfile:
172
+ sbin = np.arange(65, 400, 5)
173
+ head_str = 'region' + '\t' + '\t'.join([f"{s}-{s+4}" for s in sbin]) + '\n'
174
+ fsdfile.write(head_str)
175
+ for i in range(length):
176
+ arms = interval_data[i]
177
+ score = np.zeros(67)
178
+ if np.sum(arms) != 0:
179
+ score = np.array(arms) / np.sum(arms)
180
+ temp_str = region[i] + '\t' + '\t'.join(map(str, score)) + '\n'
181
+ fsdfile.write(temp_str)
182
+ except Exception as e:
183
+ logger.error(f"Error writing FSD output file: {e}")
184
+ raise typer.Exit(1)
185
+ logger.info(f"FSD calculation complete. Results written to {output_file}")
186
+ except Exception as e:
187
+ logger.error(f"Fatal error in _calc_fsd: {e}")
188
+ raise typer.Exit(1)
189
+
190
+
191
+ """
192
+ Internal: Calculate fragment size coverage (FSC) for a single .bed.gz file.
193
+ Handles errors and logs all steps. Raises typer.Exit(1) on fatal errors.
194
+ """
195
+ try:
196
+ logger.info(f"input file: {bedgz_input}, {bin_input}")
197
+ try:
198
+ inputbed = pysam.Tabixfile(filename=bedgz_input, mode="r")
199
+ except Exception as e:
200
+ logger.error(f"Could not open {bedgz_input} as Tabix file: {e}")
201
+ raise typer.Exit(1)
202
+ try:
203
+ bins = pybedtools.BedTool(bin_input)
204
+ except Exception as e:
205
+ logger.error(f"Could not load bins from {bin_input}: {e}")
206
+ raise typer.Exit(1)
207
+ length = len(bins)
208
+ shorts_data, intermediates_data, longs_data, totals_data, bingc = [], [], [], [], []
209
+ chrom = []
210
+ logger.info(f"output file: {output_file}")
211
+ for idx in range(length):
212
+ bin = bins[idx]
213
+ try:
214
+ chrom.append(bin.chrom)
215
+ inputbed.fetch(bin.chrom, bin.start, bin.end)
216
+ except ValueError:
217
+ bingc.append(np.nan)
218
+ shorts_data.append(0)
219
+ intermediates_data.append(0)
220
+ longs_data.append(0)
221
+ totals_data.append(0)
222
+ except Exception as e:
223
+ logger.error(f"Error fetching bin {bin}: {e}")
224
+ raise typer.Exit(1)
225
+ else:
226
+ bin_data = []
227
+ gc = []
228
+ try:
229
+ for read in inputbed.fetch(bin.chrom, bin.start, bin.end):
230
+ bin_data.append(int(read.split("\t")[2]) - int(read.split("\t")[1]))
231
+ if 65 <= int(read.split("\t")[2]) - int(read.split("\t")[1]) <= 400:
232
+ gc.append(float(read.split("\t")[3]))
233
+ count = np.bincount(bin_data, minlength=401)
234
+ except Exception as e:
235
+ logger.error(f"Error processing reads in bin {bin}: {e}")
236
+ raise typer.Exit(1)
237
+ if len(gc) == 0:
238
+ bingc.append(np.nan)
239
+ else:
240
+ bingc.append(np.mean(gc))
241
+ shorts = sum(count[65:150])
242
+ intermediates = sum(count[151:260])
243
+ longs = sum(count[261:400])
244
+ totals = sum(count[65:400])
245
+ shorts_data.append(shorts)
246
+ intermediates_data.append(intermediates)
247
+ longs_data.append(longs)
248
+ totals_data.append(totals)
249
+ try:
250
+ correct_shorts = gc_correct(shorts_data, bingc)
251
+ correct_intermediates = gc_correct(intermediates_data, bingc)
252
+ correct_longs = gc_correct(longs_data, bingc)
253
+ correct_totals = gc_correct(totals_data, bingc)
254
+ except Exception as e:
255
+ logger.error(f"GC correction failed: {e}")
256
+ raise typer.Exit(1)
257
+ start = 0
258
+ step = 0
259
+ short_s, intermediate_s, long_s, total_s = [], [], [], []
260
+ region = []
261
+ try:
262
+ with open(output_file, 'w') as fscfile:
263
+ fscfile.write(
264
+ "region\tshort-fragment-zscore\titermediate-fragment-zscore\tlong-fragment-zscore\ttotal-fragment-zscore\n"
265
+ )
266
+ while step < length:
267
+ num = chrom.count(chrom[step])
268
+ continues_bin = num // continue_n
269
+ last_bin = num % continue_n
270
+ for _ in range(continues_bin):
271
+ bin_start = start * windows
272
+ bin_end = (start + continue_n) * windows - 1
273
+ combine_shorts = correct_shorts[step: step + continue_n]
274
+ combine_intermediates = correct_intermediates[step: step + continue_n]
275
+ combine_longs = correct_longs[step: step + continue_n]
276
+ combine_totals = correct_totals[step: step + continue_n]
277
+ short_s.append(np.sum(combine_shorts))
278
+ intermediate_s.append(np.sum(combine_intermediates))
279
+ long_s.append(np.sum(combine_longs))
280
+ total_s.append(np.sum(combine_totals))
281
+ region.append(f"{chrom[step]}:{bin_start}-{bin_end}")
282
+ step += continue_n
283
+ start += continue_n
284
+ if last_bin != 0:
285
+ step += last_bin
286
+ start = 0
287
+ try:
288
+ short_z = (np.array(short_s) - np.mean(short_s)) / np.std(short_s)
289
+ intermediate_z = (np.array(intermediate_s) - np.mean(intermediate_s)) / np.std(intermediate_s)
290
+ long_z = (np.array(long_s) - np.mean(long_s)) / np.std(long_s)
291
+ total_z = (np.array(total_s) - np.mean(total_s)) / np.std(total_s)
292
+ except Exception as e:
293
+ logger.error(f"Error calculating z-scores: {e}")
294
+ raise typer.Exit(1)
295
+ for j in range(len(region)):
296
+ temp_str = f"{region[j]}\t{short_z[j]}\t{intermediate_z[j]}\t{long_z[j]}\t{total_z[j]}\n"
297
+ fscfile.write(temp_str)
298
+ except Exception as e:
299
+ logger.error(f"Error writing FSC output file: {e}")
300
+ raise typer.Exit(1)
301
+ logger.info(f"FSC calculation complete. Results written to {output_file}")
302
+ except Exception as e:
303
+ logger.error(f"Fatal error in _calc_fsc: {e}")
304
+ raise typer.Exit(1)
305
+
306
+
307
+ def fsc(
308
+ bedgz_path: Path = typer.Argument(..., help="Folder containing .bed.gz files (should be the output directory from motif.py)"),
309
+ bin_input: Optional[Path] = typer.Option(None, "--bin-input", "-b", help="Path to bin file (default: data/ChormosomeBins/hg19_window_100kb.bed)"),
310
+ windows: int = typer.Option(100000, "--windows", "-w", help="Window size (default: 100000)"),
311
+ continue_n: int = typer.Option(50, "--continue-n", "-c", help="Consecutive window number (default: 50)"),
312
+ output: Path = typer.Option(..., "--output", "-o", help="Output folder for results"),
313
+ threads: int = typer.Option(1, "--threads", "-t", help="Number of parallel processes (default: 1)")
314
+ ):
315
+ """
316
+ Calculate fragment size coverage (FSC) features for all .bed.gz files in a folder.
317
+ """
318
+ # Input checks
319
+ if not bedgz_path.exists():
320
+ logger.error(f"Input directory not found: {bedgz_path}")
321
+ raise typer.Exit(1)
322
+ if bin_input and not bin_input.exists():
323
+ logger.error(f"Bin input file not found: {bin_input}")
324
+ raise typer.Exit(1)
325
+ try:
326
+ output.mkdir(parents=True, exist_ok=True)
327
+ except Exception as e:
328
+ logger.error(f"Could not create output directory {output}: {e}")
329
+ raise typer.Exit(1)
330
+ if not output.exists():
331
+ logger.error(f"Output directory not found: {output}")
332
+ raise typer.Exit(1)
333
+ if not output.is_dir():
334
+ logger.error(f"Output path is not a directory: {output}")
335
+ raise typer.Exit(1)
336
+ if not os.access(output, os.W_OK):
337
+ logger.error(f"Output directory is not writable: {output}")
338
+ raise typer.Exit(1)
339
+
340
+ bedgz_files = [f for f in bedgz_path.iterdir() if f.suffixes == ['.bed', '.gz']]
341
+ if not bedgz_files:
342
+ logger.error("No .bed.gz files found in the specified folder.")
343
+ raise typer.Exit(1)
344
+ if bin_input is None:
345
+ # Use package-relative default
346
+ bin_input = Path(__file__).parent / "data" / "ChormosomeBins" / "hg19_window_100kb.bed"
347
+ logger.info(f"No bin_input specified. Using default: {bin_input}")
348
+ if not bin_input.exists():
349
+ logger.error(f"Bin input file does not exist: {bin_input}")
350
+ raise typer.Exit(1)
351
+ logger.info(f"Calculating FSC for {len(bedgz_files)} files...")
352
+ from concurrent.futures import ProcessPoolExecutor, as_completed
353
+ logger.info(f"Starting parallel FSC calculation using {threads} processes...")
354
+ def run_fsc_file(bedgz_file):
355
+ output_file = output / (bedgz_file.stem.replace('.bed', '') + '.FSC.txt')
356
+ _calc_fsc(str(bedgz_file), str(bin_input), windows, continue_n, str(output_file))
357
+ return str(output_file)
358
+ with ProcessPoolExecutor(max_workers=threads) as executor:
359
+ futures = {executor.submit(run_fsc_file, bedgz_file): bedgz_file for bedgz_file in bedgz_files}
360
+ for future in as_completed(futures):
361
+ bedgz_file = futures[future]
362
+ try:
363
+ result = future.result()
364
+ logger.info(f"FSC calculated: {result}")
365
+ except Exception as exc:
366
+ logger.error(f"FSC calculation failed for {bedgz_file}: {exc}")
367
+ logger.info(f"FSC features calculated for {len(bedgz_files)} files.")
368
+
369
+
370
+ def fsr(
371
+ bedgz_path: Path = typer.Argument(..., help="Folder containing .bed.gz files (should be the output directory from motif.py)"),
372
+ bin_input: Optional[Path] = typer.Option(None, "--bin-input", "-b", help="Path to bin file (default: data/ChormosomeBins/hg19_window_100kb.bed)"),
373
+ windows: int = typer.Option(100000, "--windows", "-w", help="Window size (default: 100000)"),
374
+ continue_n: int = typer.Option(50, "--continue-n", "-c", help="Consecutive window number (default: 50)"),
375
+ output: Path = typer.Option(..., "--output", "-o", help="Output folder for results"),
376
+ threads: int = typer.Option(1, "--threads", "-t", help="Number of threads (default: 1)")
377
+ ):
378
+ """
379
+ Calculate fragment size ratio (FSR) features for all .bed.gz files in a folder.
380
+ The input folder should be the output directory produced by motif.py, containing the .bed.gz files.
381
+ Output files are written to the output directory, one per .bed.gz file.
382
+ """
383
+ if not output.exists():
384
+ output.mkdir(parents=True, exist_ok=True)
385
+ bedgz_files = [f for f in bedgz_path.iterdir() if f.suffixes == ['.bed', '.gz']]
386
+ if not bedgz_files:
387
+ logger.error("No .bed.gz files found in the specified folder.")
388
+ raise typer.Exit(1)
389
+ if bin_input is None:
390
+ bin_input = Path(__file__).parent / "data" / "ChormosomeBins" / "hg19_window_100kb.bed"
391
+ logger.info(f"No bin_input specified. Using default: {bin_input}")
392
+ if not bin_input.exists():
393
+ logger.error(f"Bin input file does not exist: {bin_input}")
394
+ raise typer.Exit(1)
395
+ logger.info(f"Calculating FSR for {len(bedgz_files)} files...")
396
+ from concurrent.futures import ProcessPoolExecutor, as_completed
397
+ logger.info(f"Starting parallel FSR calculation using {threads} processes...")
398
+ def run_fsr_file(bedgz_file):
399
+ output_file = output / (bedgz_file.stem.replace('.bed', '') + '.FSR.txt')
400
+ _calc_fsr(str(bedgz_file), str(bin_input), windows, continue_n, str(output_file))
401
+ return str(output_file)
402
+ with ProcessPoolExecutor(max_workers=threads) as executor:
403
+ futures = {executor.submit(run_fsr_file, bedgz_file): bedgz_file for bedgz_file in bedgz_files}
404
+ for future in as_completed(futures):
405
+ bedgz_file = futures[future]
406
+ try:
407
+ result = future.result()
408
+ logger.info(f"FSR calculated: {result}")
409
+ except Exception as exc:
410
+ logger.error(f"FSR calculation failed for {bedgz_file}: {exc}")
411
+ logger.info(f"FSR features calculated for {len(bedgz_files)} files.")
412
+
413
+
414
+ def fsd(
415
+ bedgz_path: Path = typer.Argument(..., help="Folder containing .bed.gz files (should be the output directory from motif.py)"),
416
+ arms_file: Path = typer.Option(..., "--arms-file", "-a", help="Path to arms/region file (BED format)"),
417
+ output: Path = typer.Option(..., "--output", "-o", help="Output folder for results"),
418
+ threads: int = typer.Option(1, "--threads", "-t", help="Number of threads (default: 1)")
419
+ ):
420
+ """
421
+ Calculate fragment size distribution (FSD) features for all .bed.gz files in a folder.
422
+ The input folder should be the output directory produced by motif.py, containing the .bed.gz files.
423
+ Output files are written to the output directory, one per .bed.gz file.
424
+ """
425
+ if not output.exists():
426
+ output.mkdir(parents=True, exist_ok=True)
427
+ bedgz_files = [f for f in bedgz_path.iterdir() if f.suffixes == ['.bed', '.gz']]
428
+ if not bedgz_files:
429
+ logger.error("No .bed.gz files found in the specified folder.")
430
+ raise typer.Exit(1)
431
+ if not arms_file.exists():
432
+ logger.error(f"Arms/region file does not exist: {arms_file}")
433
+ raise typer.Exit(1)
434
+ logger.info(f"Calculating FSD for {len(bedgz_files)} files...")
435
+ from concurrent.futures import ProcessPoolExecutor, as_completed
436
+ logger.info(f"Starting parallel FSD calculation using {threads} processes...")
437
+ def run_fsd_file(bedgz_file):
438
+ output_file = output / (bedgz_file.stem.replace('.bed', '') + '.FSD.txt')
439
+ _calc_fsd(str(bedgz_file), str(arms_file), str(output_file))
440
+ return str(output_file)
441
+ with ProcessPoolExecutor(max_workers=threads) as executor:
442
+ futures = {executor.submit(run_fsd_file, bedgz_file): bedgz_file for bedgz_file in bedgz_files}
443
+ for future in as_completed(futures):
444
+ bedgz_file = futures[future]
445
+ try:
446
+ result = future.result()
447
+ logger.info(f"FSD calculated: {result}")
448
+ except Exception as exc:
449
+ logger.error(f"FSD calculation failed for {bedgz_file}: {exc}")
450
+ logger.info(f"FSD features calculated for {len(bedgz_files)} files.")
krewlyzer/fsd.py ADDED
@@ -0,0 +1,139 @@
1
+ import typer
2
+ from pathlib import Path
3
+ import logging
+ import os
4
+ import pysam
5
+ import pybedtools
6
+ import numpy as np
7
+ from rich.console import Console
8
+ from rich.logging import RichHandler
9
+
10
+ console = Console()
11
+ logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
12
+ logger = logging.getLogger("fsd")
13
+
14
+ def _calc_fsd(bedgz_input, arms_file, output_file):
15
+ """
16
+ Internal: Calculate fragment size distribution (FSD) for a single .bed.gz file.
17
+ Writes region-based fragment size distributions in 5bp bins from 65-399bp.
18
+ """
19
+ try:
20
+ logger.info(f"input file: {bedgz_input}, {arms_file}")
21
+ try:
22
+ inputbed = pysam.Tabixfile(filename=bedgz_input, mode="r")
23
+ except Exception as e:
24
+ logger.error(f"Could not open {bedgz_input} as Tabix file: {e}")
25
+ raise typer.Exit(1)
26
+ try:
27
+ bins = pybedtools.BedTool(arms_file)
28
+ except Exception as e:
29
+ logger.error(f"Could not load bins from {arms_file}: {e}")
30
+ raise typer.Exit(1)
31
+ length = len(bins)
32
+ interval_data = []
33
+ region = []
34
+ logger.info(f"output file: {output_file}")
35
+ for idx in range(length):
36
+ bin = bins[idx]
37
+ region.append(f"{bin.chrom}:{bin.start}-{bin.end}")
38
+ try:
39
+ inputbed.fetch(bin.chrom, bin.start, bin.end)
40
+ except ValueError:
41
+ interval_data.append([0] * 67)
42
+ continue
43
+ except Exception as e:
44
+ logger.error(f"Error fetching bin {bin}: {e}")
45
+ raise typer.Exit(1)
46
+ else:
47
+ bin_data = []
48
+ try:
49
+ for read in inputbed.fetch(bin.chrom, bin.start, bin.end):
50
+ bin_data.append(int(read.split("\t")[2]) - int(read.split("\t")[1]))
51
+ count = np.bincount(bin_data, minlength=401)
52
+ step_size = 5
53
+ start_bin = 65
54
+ end_bin = 400
55
+ bin_len = int((end_bin - start_bin) / step_size)
56
+ temp_bin = []
57
+ for bin_id in range(bin_len):
58
+ temp_bin.append(np.sum(count[(start_bin + step_size * bin_id):(start_bin + step_size * (bin_id + 1))]))
59
+ interval_data.append(temp_bin)
60
+ except Exception as e:
61
+ logger.error(f"Error processing reads in bin {bin}: {e}")
62
+ interval_data.append([0] * 67)
63
+ continue
64
+ try:
65
+ with open(output_file, 'w') as fsdfile:
66
+ sbin = np.arange(65, 400, 5)
67
+ head_str = 'region' + '\t' + '\t'.join([f"{s}-{s+4}" for s in sbin]) + '\n'
68
+ fsdfile.write(head_str)
69
+ for i in range(length):
70
+ arms = interval_data[i]
71
+ score = np.zeros(67)
72
+ if np.sum(arms) != 0:
73
+ score = np.array(arms) / np.sum(arms)
74
+ temp_str = region[i] + '\t' + '\t'.join(map(str, score)) + '\n'
75
+ fsdfile.write(temp_str)
76
+ except Exception as e:
77
+ logger.error(f"Error writing FSD output file: {e}")
78
+ raise typer.Exit(1)
79
+ logger.info(f"FSD calculation complete. Results written to {output_file}")
80
+ except Exception as e:
81
+ logger.error(f"Fatal error in _calc_fsd: {e}")
82
+ raise typer.Exit(1)
83
+
84
+ def fsd(
85
+ bedgz_path: Path = typer.Argument(..., help="Folder containing .bed.gz files (should be the output directory from motif.py)"),
86
+ arms_file: Path = typer.Option(..., "--arms-file", "-a", help="Path to arms/region file (BED format)"),
87
+ output: Path = typer.Option(..., "--output", "-o", help="Output folder for results"),
88
+ threads: int = typer.Option(1, "--threads", "-t", help="Number of threads (default: 1)")
89
+ ):
90
+ """
91
+ Calculate fragment size distribution (FSD) features for all .bed.gz files in a folder.
92
+ The input folder should be the output directory produced by motif.py, containing the .bed.gz files.
93
+ Output files are written to the output directory, one per .bed.gz file.
94
+ """
95
+ # Input checks
96
+ if not bedgz_path.exists():
97
+ logger.error(f"Input directory not found: {bedgz_path}")
98
+ raise typer.Exit(1)
99
+ if not arms_file.exists():
100
+ logger.error(f"Arms/region file not found: {arms_file}")
101
+ raise typer.Exit(1)
102
+ try:
103
+ output.mkdir(parents=True, exist_ok=True)
104
+ except Exception as e:
105
+ logger.error(f"Could not create output directory {output}: {e}")
106
+ raise typer.Exit(1)
107
+ if not output.exists():
108
+ logger.error(f"Output directory not found: {output}")
109
+ raise typer.Exit(1)
110
+ if not output.is_dir():
111
+ logger.error(f"Output path is not a directory: {output}")
112
+ raise typer.Exit(1)
113
+ if not os.access(output, os.W_OK):
114
+ logger.error(f"Output directory is not writable: {output}")
115
+ raise typer.Exit(1)
116
+ bedgz_files = [f for f in bedgz_path.iterdir() if f.suffixes == ['.bed', '.gz']]
117
+ if not bedgz_files:
118
+ logger.error("No .bed.gz files found in the specified folder.")
119
+ raise typer.Exit(1)
120
+ if not arms_file.exists():
121
+ logger.error(f"Arms/region file does not exist: {arms_file}")
122
+ raise typer.Exit(1)
123
+ logger.info(f"Calculating FSD for {len(bedgz_files)} files...")
124
+ from concurrent.futures import ProcessPoolExecutor, as_completed
125
+ logger.info(f"Starting parallel FSD calculation using {threads} processes...")
126
+ def run_fsd_file(bedgz_file):
127
+ output_file = output / (bedgz_file.stem.replace('.bed', '') + '.FSD.txt')
128
+ _calc_fsd(str(bedgz_file), str(arms_file), str(output_file))
129
+ return str(output_file)
130
+ with ProcessPoolExecutor(max_workers=threads) as executor:
131
+ futures = {executor.submit(run_fsd_file, bedgz_file): bedgz_file for bedgz_file in bedgz_files}
132
+ for future in as_completed(futures):
133
+ bedgz_file = futures[future]
134
+ try:
135
+ result = future.result()
136
+ logger.info(f"FSD calculated: {result}")
137
+ except Exception as exc:
138
+ logger.error(f"FSD calculation failed for {bedgz_file}: {exc}")
139
+ logger.info(f"FSD features calculated for {len(bedgz_files)} files.")