krewlyzer-0.1.4-py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their public registries.
- krewlyzer/__init__.py +3 -0
- krewlyzer/cli.py +53 -0
- krewlyzer/fsc.py +330 -0
- krewlyzer/fsd.py +170 -0
- krewlyzer/fsr.py +225 -0
- krewlyzer/helpers.py +237 -0
- krewlyzer/mfsd.py +236 -0
- krewlyzer/motif.py +430 -0
- krewlyzer/ocf.py +133 -0
- krewlyzer/uxm.py +188 -0
- krewlyzer/wps.py +264 -0
- krewlyzer/wrapper.py +147 -0
- krewlyzer-0.1.4.dist-info/METADATA +22 -0
- krewlyzer-0.1.4.dist-info/RECORD +18 -0
- krewlyzer-0.1.4.dist-info/WHEEL +5 -0
- krewlyzer-0.1.4.dist-info/entry_points.txt +2 -0
- krewlyzer-0.1.4.dist-info/licenses/LICENSE +619 -0
- krewlyzer-0.1.4.dist-info/top_level.txt +1 -0
krewlyzer/__init__.py
ADDED
krewlyzer/cli.py
ADDED
@@ -0,0 +1,53 @@
"""Command-line interface for krewlyzer."""

from typing import Optional
import typer
from pathlib import Path
from rich.console import Console
from rich.logging import RichHandler
import logging

console = Console()
logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
logger = logging.getLogger("krewlyzer-cli")


def set_log_level(log_level: str = typer.Option("INFO", "--log-level", help="Logging level: DEBUG, INFO, WARNING, ERROR, CRITICAL")):
    """Set global logging level."""
    level = getattr(logging, log_level.upper(), logging.INFO)
    for handler in logging.root.handlers:
        handler.setLevel(level)
    logging.getLogger().setLevel(level)


app = typer.Typer(help="krewlyzer: A comprehensive toolkit for ctDNA fragmentomics analysis.")

# Subcommands are imported after the Typer app is created, then registered below.
from krewlyzer.motif import motif
from krewlyzer.fsc import fsc
from krewlyzer.fsr import fsr
from krewlyzer.fsd import fsd
from krewlyzer.wps import wps
from krewlyzer.ocf import ocf
from krewlyzer.uxm import uxm
from krewlyzer.mfsd import mfsd
from krewlyzer.wrapper import run_all
from krewlyzer import __version__

app.command()(motif)
app.command()(fsc)
app.command()(fsr)
app.command()(fsd)
app.command()(wps)
app.command()(ocf)
app.command()(uxm)
app.command()(mfsd)
app.command()(run_all)


# invoke_without_command=True is required so that `krewlyzer --version`
# runs the callback instead of failing with "Missing command."
@app.callback(invoke_without_command=True)
def main(
    version: bool = typer.Option(False, "--version", "-v", help="Show version and exit"),
):
    if version:
        typer.echo(f"krewlyzer version: {__version__}")
        raise typer.Exit()


if __name__ == "__main__":
    app()
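
A quick way to sanity-check the assembled app is Typer's in-process test runner. A minimal sketch, assuming krewlyzer and the modules cli.py imports are installed and importable:

# Minimal sketch: drive the CLI in-process via Typer's test runner.
from typer.testing import CliRunner
from krewlyzer.cli import app

runner = CliRunner()
result = runner.invoke(app, ["--version"])   # same as `krewlyzer --version`
print(result.output)                         # e.g. "krewlyzer version: 0.1.4"
assert result.exit_code == 0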
krewlyzer/fsc.py
ADDED
@@ -0,0 +1,330 @@
import typer
from pathlib import Path
from typing import Optional
import logging
import os
import sys

import pysam

import numpy as np
import pandas as pd
from skmisc.loess import loess
from rich.console import Console
from rich.logging import RichHandler

console = Console()
logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
logger = logging.getLogger("fsc")


from .helpers import gc_correct


def _calc_fsc(
    bedgz_input: str | Path,
    bin_input: str | Path,
    windows: int,
    continue_n: int,
    output_file: str | Path
):
    """
    Internal: Calculate fragment size coverage (FSC) for a single .bed.gz file.
    Optimized with vectorized operations.
    """
    try:
        logger.info(f"Processing {bedgz_input} with bins from {bin_input}")

        # Load bins into a DataFrame; this makes per-chromosome grouping easy later.
        try:
            bins_df = pd.read_csv(bin_input, sep='\t', header=None, usecols=[0, 1, 2],
                                  names=['chrom', 'start', 'end'],
                                  dtype={'chrom': str, 'start': int, 'end': int})
        except Exception as e:
            logger.error(f"Could not load bins from {bin_input}: {e}")
            raise typer.Exit(1)

        try:
            tbx = pysam.TabixFile(filename=bedgz_input, mode="r")
        except Exception as e:
            logger.error(f"Could not open {bedgz_input} as Tabix file: {e}")
            raise typer.Exit(1)

        shorts_data = []
        intermediates_data = []
        longs_data = []
        totals_data = []
        bingc = []

        # Iterate over bins; Tabix handles the random access efficiently.
        for _, bin_row in bins_df.iterrows():
            chrom = bin_row['chrom']
            start = bin_row['start']
            end = bin_row['end']

            try:
                # Fetch fragments overlapping the bin. parser=pysam.asTuple() is
                # slightly faster than splitting the line manually.
                rows = list(tbx.fetch(chrom, start, end, parser=pysam.asTuple()))
            except ValueError:
                # Region not in file (e.g. chromosome not present)
                rows = []
            except Exception as e:
                logger.error(f"Error fetching {chrom}:{start}-{end}: {e}")
                raise typer.Exit(1)

            if not rows:
                bingc.append(np.nan)
                shorts_data.append(0)
                intermediates_data.append(0)
                longs_data.append(0)
                totals_data.append(0)
                continue

            # rows is a list of tuples (chrom, start, end, gc), matching what
            # motif.py writes: f"{read1.reference_name}\t{rstart}\t{rend}\t{gc}\n".
            try:
                # Transpose with zip, then vectorize with numpy.
                _, starts, ends, gcs = zip(*rows)
                starts = np.array(starts, dtype=int)
                ends = np.array(ends, dtype=int)
                gcs = np.array(gcs, dtype=float)

                lengths = ends - starts

                # Only fragments with length in [65, 400] contribute to the bin GC.
                mask = (lengths >= 65) & (lengths <= 400)
                valid_gcs = gcs[mask]

                if len(valid_gcs) == 0:
                    bingc.append(np.nan)
                else:
                    bingc.append(np.mean(valid_gcs))

                # Fragment counts per size class:
                # shorts 65-150, intermediates 151-260, longs 261-400, totals 65-400.
                shorts = np.sum((lengths >= 65) & (lengths <= 150))
                intermediates = np.sum((lengths >= 151) & (lengths <= 260))
                longs = np.sum((lengths >= 261) & (lengths <= 400))
                totals = np.sum(mask)  # 65-400

                shorts_data.append(shorts)
                intermediates_data.append(intermediates)
                longs_data.append(longs)
                totals_data.append(totals)

            except Exception as e:
                logger.error(f"Error processing data in bin {chrom}:{start}-{end}: {e}")
                raise typer.Exit(1)

        # GC correction
        try:
            correct_shorts = gc_correct(shorts_data, bingc)
            correct_intermediates = gc_correct(intermediates_data, bingc)
            correct_longs = gc_correct(longs_data, bingc)
            correct_totals = gc_correct(totals_data, bingc)
        except Exception as e:
            logger.error(f"GC correction failed: {e}")
            raise typer.Exit(1)

        # Aggregation into windows: combine continue_n consecutive bins into one
        # output window. Bins are assumed contiguous and ordered; aggregation
        # restarts at each chromosome boundary.
        df = pd.DataFrame({
            'chrom': bins_df['chrom'],
            'start': bins_df['start'],  # bin start
            'end': bins_df['end'],      # bin end
            'shorts': correct_shorts,
            'intermediates': correct_intermediates,
            'longs': correct_longs,
            'totals': correct_totals
        })

        results = []

        for chrom, group in df.groupby('chrom', sort=False):
            # Non-overlapping block aggregation over blocks of continue_n bins;
            # the trailing partial window on each chromosome is dropped.
            n_bins = len(group)
            n_windows = n_bins // continue_n

            if n_windows == 0:
                continue

            # Truncate to a multiple of continue_n, reshape to
            # (n_windows, continue_n), and sum along axis 1.
            trunc_len = n_windows * continue_n

            shorts_mat = group['shorts'].values[:trunc_len].reshape(n_windows, continue_n)
            inter_mat = group['intermediates'].values[:trunc_len].reshape(n_windows, continue_n)
            longs_mat = group['longs'].values[:trunc_len].reshape(n_windows, continue_n)
            totals_mat = group['totals'].values[:trunc_len].reshape(n_windows, continue_n)

            sum_shorts = shorts_mat.sum(axis=1)
            sum_inter = inter_mat.sum(axis=1)
            sum_longs = longs_mat.sum(axis=1)
            sum_totals = totals_mat.sum(axis=1)

            # Window coordinates: each output window spans continue_n input bins
            # of size `windows` (e.g. 50 x 100 kb = 5 Mb); ends are inclusive.
            window_starts = np.arange(n_windows) * continue_n * windows
            window_ends = (np.arange(n_windows) + 1) * continue_n * windows - 1

            results.append(pd.DataFrame({
                'chrom': chrom,
                'start': window_starts,
                'end': window_ends,
                'short_sum': sum_shorts,
                'inter_sum': sum_inter,
                'long_sum': sum_longs,
                'total_sum': sum_totals
            }))

        if not results:
            logger.warning("No valid windows found.")
            return

        final_df = pd.concat(results, ignore_index=True)

        # Z-scores are computed globally over all windows, not per chromosome.
        final_df['short_z'] = (final_df['short_sum'] - final_df['short_sum'].mean()) / final_df['short_sum'].std()
        final_df['inter_z'] = (final_df['inter_sum'] - final_df['inter_sum'].mean()) / final_df['inter_sum'].std()
        final_df['long_z'] = (final_df['long_sum'] - final_df['long_sum'].mean()) / final_df['long_sum'].std()
        final_df['total_z'] = (final_df['total_sum'] - final_df['total_sum'].mean()) / final_df['total_sum'].std()

        # Write output
        with open(output_file, 'w') as f:
            f.write("region\tshort-fragment-zscore\tintermediate-fragment-zscore\tlong-fragment-zscore\ttotal-fragment-zscore\n")
            for _, row in final_df.iterrows():
                region = f"{row['chrom']}:{int(row['start'])}-{int(row['end'])}"
                f.write(f"{region}\t{row['short_z']:.4f}\t{row['inter_z']:.4f}\t{row['long_z']:.4f}\t{row['total_z']:.4f}\n")

        logger.info(f"FSC calculation complete. Results written to {output_file}")

    except Exception as e:
        logger.error(f"Fatal error in _calc_fsc: {e}")
        raise typer.Exit(1)


def _fsc_worker(bedgz_file: str, bin_input: str, windows: int, continue_n: int, output_file: str) -> str:
    # Module-level worker: ProcessPoolExecutor pickles the submitted callable,
    # and locally defined functions cannot be pickled.
    _calc_fsc(bedgz_file, bin_input, windows, continue_n, output_file)
    return output_file


def fsc(
    bedgz_path: Path = typer.Argument(..., help="Folder containing .bed.gz files (should be the output directory from motif.py)"),
    bin_input: Optional[Path] = typer.Option(None, "--bin-input", "-b", help="Path to bin file (default: data/ChormosomeBins/hg19_window_100kb.bed)"),
    windows: int = typer.Option(100000, "--windows", "-w", help="Window size (default: 100000)"),
    continue_n: int = typer.Option(50, "--continue-n", "-c", help="Consecutive window number (default: 50)"),
    output: Path = typer.Option(..., "--output", "-o", help="Output folder for results"),
    threads: int = typer.Option(1, "--threads", "-t", help="Number of parallel processes (default: 1)")
):
    """
    Calculate fragment size coverage (FSC) features for all .bed.gz files in a folder.
    """
    # Input checks
    if not bedgz_path.exists():
        logger.error(f"Input directory not found: {bedgz_path}")
        raise typer.Exit(1)
    if bin_input and not bin_input.exists():
        logger.error(f"Bin input file not found: {bin_input}")
        raise typer.Exit(1)
    try:
        output.mkdir(parents=True, exist_ok=True)
    except Exception as e:
        logger.error(f"Could not create output directory {output}: {e}")
        raise typer.Exit(1)
    if not output.is_dir():
        logger.error(f"Output path is not a directory: {output}")
        raise typer.Exit(1)
    # pathlib.Path has no is_writable(); use os.access instead.
    if not os.access(output, os.W_OK):
        logger.error(f"Output directory is not writable: {output}")
        raise typer.Exit(1)

    bedgz_files = [f for f in bedgz_path.iterdir() if f.suffixes == ['.bed', '.gz']]
    if not bedgz_files:
        logger.error("No .bed.gz files found in the specified folder.")
        raise typer.Exit(1)
    if bin_input is None:
        # Use package-relative default
        bin_input = Path(__file__).parent / "data" / "ChormosomeBins" / "hg19_window_100kb.bed"
        logger.info(f"No bin_input specified. Using default: {bin_input}")
    if not bin_input.exists():
        logger.error(f"Bin input file does not exist: {bin_input}")
        raise typer.Exit(1)

    logger.info(f"Calculating FSC for {len(bedgz_files)} files...")
    from concurrent.futures import ProcessPoolExecutor, as_completed
    logger.info(f"Starting parallel FSC calculation using {threads} processes...")
    with ProcessPoolExecutor(max_workers=threads) as executor:
        futures = {}
        for bedgz_file in bedgz_files:
            # e.g. sample.bed.gz -> sample.FSC.txt
            output_file = output / (bedgz_file.stem.replace('.bed', '') + '.FSC.txt')
            future = executor.submit(_fsc_worker, str(bedgz_file), str(bin_input), windows, continue_n, str(output_file))
            futures[future] = bedgz_file
        for future in as_completed(futures):
            bedgz_file = futures[future]
            try:
                result = future.result()
                logger.info(f"FSC calculated: {result}")
            except Exception as exc:
                logger.error(f"FSC calculation failed for {bedgz_file}: {exc}")
    logger.info(f"FSC features calculated for {len(bedgz_files)} files.")
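
helpers.py is listed above but its body is not shown in this diff, so the exact gc_correct implementation is unknown. Given the skmisc.loess import in fsc.py, a LOESS-based correction is the likely shape; the following is a hypothetical sketch matching the gc_correct(counts, bingc) call signature used above, not the package's actual code:

# Hypothetical sketch of a LOESS-based GC correction (the real gc_correct
# lives in krewlyzer/helpers.py, which this diff does not display).
import numpy as np
from skmisc.loess import loess

def gc_correct_sketch(counts, gc):
    counts = np.asarray(counts, dtype=float)
    gc = np.asarray(gc, dtype=float)
    corrected = counts.copy()
    valid = ~np.isnan(gc)
    # Fit expected coverage as a smooth function of bin GC content...
    fit = loess(gc[valid], counts[valid], span=0.75)
    fit.fit()
    expected = fit.predict(gc[valid]).values
    # ...then remove the GC trend, re-centering on the overall mean.
    corrected[valid] = counts[valid] - expected + counts[valid].mean()
    return corrected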
krewlyzer/fsd.py
ADDED
@@ -0,0 +1,170 @@
import typer
from pathlib import Path
import logging
import os
import pysam

import numpy as np
from rich.console import Console
from rich.logging import RichHandler

console = Console()
logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
logger = logging.getLogger("fsd")

import pandas as pd


def _calc_fsd(
    bedgz_input: str | Path,
    arms_file: str | Path,
    output_file: str | Path
):
    """
    Internal: Calculate fragment size distribution (FSD) for a single .bed.gz file.
    Optimized with vectorized operations.
    """
    try:
        logger.info(f"Processing {bedgz_input} with regions from {arms_file}")

        # Load regions
        try:
            bins_df = pd.read_csv(arms_file, sep='\t', header=None, usecols=[0, 1, 2],
                                  names=['chrom', 'start', 'end'],
                                  dtype={'chrom': str, 'start': int, 'end': int})
        except Exception as e:
            logger.error(f"Could not load regions from {arms_file}: {e}")
            raise typer.Exit(1)

        try:
            tbx = pysam.TabixFile(filename=bedgz_input, mode="r")
        except Exception as e:
            logger.error(f"Could not open {bedgz_input} as Tabix file: {e}")
            raise typer.Exit(1)

        results = []
        regions = []

        # Histogram bin edges: 65, 70, ..., 400 (68 edges -> 67 bins of 5 bp)
        hist_bins = np.arange(65, 401, 5)

        for _, bin_row in bins_df.iterrows():
            chrom = bin_row['chrom']
            start = bin_row['start']
            end = bin_row['end']
            regions.append(f"{chrom}:{start}-{end}")

            try:
                rows = list(tbx.fetch(chrom, start, end, parser=pysam.asTuple()))
            except ValueError:
                rows = []
            except Exception as e:
                logger.error(f"Error fetching {chrom}:{start}-{end}: {e}")
                raise typer.Exit(1)

            if not rows:
                results.append(np.zeros(67))
                continue

            try:
                # Vectorized parsing
                _, starts, ends, _ = zip(*rows)
                starts = np.array(starts, dtype=int)
                ends = np.array(ends, dtype=int)
                lengths = ends - starts

                # Histogram of fragment lengths
                counts, _ = np.histogram(lengths, bins=hist_bins)

                # Normalize to per-region fractions
                total = np.sum(counts)
                if total > 0:
                    results.append(counts / total)
                else:
                    results.append(np.zeros(67))

            except Exception as e:
                logger.error(f"Error processing data in bin {chrom}:{start}-{end}: {e}")
                results.append(np.zeros(67))
                continue

        # Write output
        try:
            with open(output_file, 'w') as f:
                # Header: bin 0 covers [65, 70), i.e. lengths 65-69 inclusive,
                # so the labels run 65-69, 70-74, ..., 395-399.
                header_bins = [f"{s}-{s+4}" for s in hist_bins[:-1]]
                f.write('region\t' + '\t'.join(header_bins) + '\n')

                for i, region in enumerate(regions):
                    scores = results[i]
                    f.write(f"{region}\t" + "\t".join(map(str, scores)) + "\n")

        except Exception as e:
            logger.error(f"Error writing FSD output file: {e}")
            raise typer.Exit(1)

        logger.info(f"FSD calculation complete. Results written to {output_file}")

    except Exception as e:
        logger.error(f"Fatal error in _calc_fsd: {e}")
        raise typer.Exit(1)


def _fsd_worker(bedgz_file: str, arms_file: str, output_file: str) -> str:
    # Module-level worker: ProcessPoolExecutor pickles the submitted callable,
    # and locally defined functions cannot be pickled.
    _calc_fsd(bedgz_file, arms_file, output_file)
    return output_file


def fsd(
    bedgz_path: Path = typer.Argument(..., help="Folder containing .bed.gz files (should be the output directory from motif.py)"),
    arms_file: Path = typer.Option(..., "--arms-file", "-a", help="Path to arms/region file (BED format)"),
    output: Path = typer.Option(..., "--output", "-o", help="Output folder for results"),
    threads: int = typer.Option(1, "--threads", "-t", help="Number of threads (default: 1)")
):
    """
    Calculate fragment size distribution (FSD) features for all .bed.gz files in a folder.
    The input folder should be the output directory produced by motif.py, containing the .bed.gz files.
    Output files are written to the output directory, one per .bed.gz file.
    """
    # Input checks
    if not bedgz_path.exists():
        logger.error(f"Input directory not found: {bedgz_path}")
        raise typer.Exit(1)
    if not arms_file.exists():
        logger.error(f"Arms/region file not found: {arms_file}")
        raise typer.Exit(1)
    try:
        output.mkdir(parents=True, exist_ok=True)
    except Exception as e:
        logger.error(f"Could not create output directory {output}: {e}")
        raise typer.Exit(1)
    if not output.is_dir():
        logger.error(f"Output path is not a directory: {output}")
        raise typer.Exit(1)
    # pathlib.Path has no is_writable(); use os.access instead.
    if not os.access(output, os.W_OK):
        logger.error(f"Output directory is not writable: {output}")
        raise typer.Exit(1)
    bedgz_files = [f for f in bedgz_path.iterdir() if f.suffixes == ['.bed', '.gz']]
    if not bedgz_files:
        logger.error("No .bed.gz files found in the specified folder.")
        raise typer.Exit(1)
    logger.info(f"Calculating FSD for {len(bedgz_files)} files...")
    from concurrent.futures import ProcessPoolExecutor, as_completed
    logger.info(f"Starting parallel FSD calculation using {threads} processes...")
    with ProcessPoolExecutor(max_workers=threads) as executor:
        futures = {}
        for bedgz_file in bedgz_files:
            # e.g. sample.bed.gz -> sample.FSD.txt
            output_file = output / (bedgz_file.stem.replace('.bed', '') + '.FSD.txt')
            future = executor.submit(_fsd_worker, str(bedgz_file), str(arms_file), str(output_file))
            futures[future] = bedgz_file
        for future in as_completed(futures):
            bedgz_file = futures[future]
            try:
                result = future.result()
                logger.info(f"FSD calculated: {result}")
            except Exception as exc:
                logger.error(f"FSD calculation failed for {bedgz_file}: {exc}")
    logger.info(f"FSD features calculated for {len(bedgz_files)} files.")
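
To make the binning in _calc_fsd concrete: np.arange(65, 401, 5) yields 68 edges and therefore 67 bins of 5 bp. Note that np.histogram treats the last bin as closed on both sides, so a fragment of exactly 400 bp is counted under the 395-399 label. A toy check (not part of the package):

import numpy as np

hist_bins = np.arange(65, 401, 5)            # 68 edges: 65, 70, ..., 400
lengths = np.array([66, 69, 70, 167, 400])   # toy fragment lengths
counts, _ = np.histogram(lengths, bins=hist_bins)

print(len(hist_bins), len(counts))           # 68 67
print(counts[0])                             # 2 -> 66 and 69 fall in [65, 70)
labels = [f"{s}-{s+4}" for s in hist_bins[:-1]]
print(labels[0], labels[-1])                 # 65-69 395-399
print(counts[-1])                            # 1 -> 400 lands in the closed last bin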