krewlyzer-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of krewlyzer might be problematic.
- krewlyzer/__init__.py +1 -0
- krewlyzer/cli.py +47 -0
- krewlyzer/fsc.py +450 -0
- krewlyzer/fsd.py +139 -0
- krewlyzer/fsr.py +171 -0
- krewlyzer/helpers.py +187 -0
- krewlyzer/motif.py +275 -0
- krewlyzer/ocf.py +133 -0
- krewlyzer/uxm.py +188 -0
- krewlyzer/wps.py +173 -0
- krewlyzer/wrapper.py +125 -0
- krewlyzer-0.1.0.dist-info/METADATA +15 -0
- krewlyzer-0.1.0.dist-info/RECORD +17 -0
- krewlyzer-0.1.0.dist-info/WHEEL +5 -0
- krewlyzer-0.1.0.dist-info/entry_points.txt +2 -0
- krewlyzer-0.1.0.dist-info/licenses/LICENSE +619 -0
- krewlyzer-0.1.0.dist-info/top_level.txt +1 -0
krewlyzer/__init__.py
ADDED
@@ -0,0 +1 @@
"""cfDNAFE - Feature extraction tools for circulating tumor DNA"""
krewlyzer/cli.py
ADDED
@@ -0,0 +1,47 @@
"""Command-line interface for cfDNAFE"""

from typing import Optional
import typer
from pathlib import Path
from rich.console import Console
from rich.logging import RichHandler
import logging

console = Console()
logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
logger = logging.getLogger("krewlyzer-cli")

def set_log_level(log_level: str = typer.Option("INFO", "--log-level", help="Logging level: DEBUG, INFO, WARNING, ERROR, CRITICAL")):
    """Set global logging level."""
    level = getattr(logging, log_level.upper(), logging.INFO)
    for handler in logging.root.handlers:
        handler.setLevel(level)
    logging.getLogger().setLevel(level)

app = typer.Typer(callback=set_log_level)

from .motif import motif
from .fsc import fsc
from .fsr import fsr
from .fsd import fsd
from .wps import wps
from .ocf import ocf
from .uxm import uxm
from .wrapper import run_all

app.command()(motif)
app.command()(fsc)
app.command()(fsr)
app.command()(fsd)
app.command()(wps)
app.command()(ocf)
app.command()(uxm)
app.command()(run_all)

@app.command()
def version() -> None:
    """Show version information"""
    logger.info("krewlyzer 0.1.0")

if __name__ == "__main__":
    app()
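The Typer app above registers every feature module as a subcommand, so the CLI can be driven in-process as well as through the console script declared in entry_points.txt (the script name is not shown in this diff and is assumed here to be krewlyzer). A minimal sketch using Typer's test runner:

    from typer.testing import CliRunner
    from krewlyzer.cli import app

    runner = CliRunner()
    result = runner.invoke(app, ["version"])   # the version subcommand logs "krewlyzer 0.1.0"
    assert result.exit_code == 0
    # Subcommands take the options defined in their modules; the paths below are placeholders.
    runner.invoke(app, ["fsc", "bedgz_dir/", "--output", "fsc_out/", "--threads", "4"])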
krewlyzer/fsc.py
ADDED
@@ -0,0 +1,450 @@
import typer
from pathlib import Path
from typing import Optional
import logging
import os

import pysam
import pybedtools
import numpy as np
import pandas as pd
from skmisc.loess import loess
from rich.console import Console
from rich.logging import RichHandler

console = Console()
logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
logger = logging.getLogger("fsc")


from .helpers import gc_correct


def _calc_fsr(bedgz_input, bin_input, windows, continue_n, output_file):
    """
    Internal: Calculate fragment size ratio (FSR) for a single .bed.gz file.
    Writes region-based ratios for short, intermediate, and long fragments.
    """
    try:
        logger.info(f"input file: {bedgz_input}, {bin_input}")
        try:
            inputbed = pysam.Tabixfile(filename=bedgz_input, mode="r")
        except Exception as e:
            logger.error(f"Could not open {bedgz_input} as Tabix file: {e}")
            raise typer.Exit(1)
        try:
            bins = pybedtools.BedTool(bin_input)
        except Exception as e:
            logger.error(f"Could not load bins from {bin_input}: {e}")
            raise typer.Exit(1)
        length = len(bins)
        shorts_data, intermediates_data, longs_data, totals_data, bingc = [], [], [], [], []
        chrom = []
        logger.info(f"output file: {output_file}")
        for idx in range(length):
            bin = bins[idx]
            try:
                chrom.append(bin.chrom)
                inputbed.fetch(bin.chrom, bin.start, bin.end)
            except ValueError:
                bingc.append(np.nan)
                shorts_data.append(0)
                intermediates_data.append(0)
                longs_data.append(0)
            except Exception as e:
                logger.error(f"Error fetching bin {bin}: {e}")
                raise typer.Exit(1)
            else:
                bin_data = []
                gc = []
                try:
                    for read in inputbed.fetch(bin.chrom, bin.start, bin.end):
                        bin_data.append(int(read.split("\t")[2]) - int(read.split("\t")[1]))
                        if 65 <= int(read.split("\t")[2]) - int(read.split("\t")[1]) <= 400:
                            gc.append(float(read.split("\t")[3]))
                    count = np.bincount(bin_data, minlength=401)
                except Exception as e:
                    logger.error(f"Error processing reads in bin {bin}: {e}")
                    raise typer.Exit(1)
                if len(gc) == 0:
                    bingc.append(np.nan)
                else:
                    bingc.append(np.mean(gc))
                shorts = sum(count[65:150])
                intermediates = sum(count[151:260])
                longs = sum(count[261:400])
                totals = sum(count[65:400])
                if totals == 0:
                    shorts_data.append(0)
                    intermediates_data.append(0)
                    longs_data.append(0)
                else:
                    shorts_data.append(shorts / totals)
                    intermediates_data.append(intermediates / totals)
                    longs_data.append(longs / totals)
        start = 0
        step = 0
        try:
            with open(output_file, 'w') as fsrfile:
                fsrfile.write("region\tshort-ratio\tintermediate-ratio\tlong-ratio\n")
                while step < length:
                    num = chrom.count(chrom[step])
                    continues_bin = num // continue_n
                    last_bin = num % continue_n
                    for _ in range(continues_bin):
                        bin_start = start * windows
                        bin_end = (start + continue_n) * windows - 1
                        combine_shorts = shorts_data[step: step + continue_n]
                        combine_intermediates = intermediates_data[step: step + continue_n]
                        combine_longs = longs_data[step: step + continue_n]
                        tmp_array = np.zeros(3)
                        tmp_array[0] = np.mean(combine_shorts)
                        tmp_array[1] = np.mean(combine_intermediates)
                        tmp_array[2] = np.mean(combine_longs)
                        region = f"{chrom[step]}:{bin_start}-{bin_end}"
                        temp_str = f"{region}\t" + "\t".join(map(str, tmp_array)) + "\n"
                        fsrfile.write(temp_str)
                        step += continue_n
                        start += continue_n
                    if last_bin != 0:
                        step += last_bin
                        start = 0
        except Exception as e:
            logger.error(f"Error writing FSR output file: {e}")
            raise typer.Exit(1)
        logger.info(f"FSR calculation complete. Results written to {output_file}")
    except Exception as e:
        logger.error(f"Fatal error in _calc_fsr: {e}")
        raise typer.Exit(1)

def _calc_fsd(bedgz_input, arms_file, output_file):
    """
    Internal: Calculate fragment size distribution (FSD) for a single .bed.gz file.
    Writes region-based fragment size distributions in 5bp bins from 65-399bp.
    """
    try:
        logger.info(f"input file: {bedgz_input}, {arms_file}")
        try:
            inputbed = pysam.Tabixfile(filename=bedgz_input, mode="r")
        except Exception as e:
            logger.error(f"Could not open {bedgz_input} as Tabix file: {e}")
            raise typer.Exit(1)
        try:
            bins = pybedtools.BedTool(arms_file)
        except Exception as e:
            logger.error(f"Could not load bins from {arms_file}: {e}")
            raise typer.Exit(1)
        length = len(bins)
        interval_data = []
        region = []
        logger.info(f"output file: {output_file}")
        for idx in range(length):
            bin = bins[idx]
            region.append(f"{bin.chrom}:{bin.start}-{bin.end}")
            try:
                inputbed.fetch(bin.chrom, bin.start, bin.end)
            except ValueError:
                interval_data.append([0] * 67)
                continue
            except Exception as e:
                logger.error(f"Error fetching bin {bin}: {e}")
                raise typer.Exit(1)
            else:
                bin_data = []
                try:
                    for read in inputbed.fetch(bin.chrom, bin.start, bin.end):
                        bin_data.append(int(read.split("\t")[2]) - int(read.split("\t")[1]))
                    count = np.bincount(bin_data, minlength=401)
                    step_size = 5
                    start_bin = 65
                    end_bin = 400
                    bin_len = int((end_bin - start_bin) / step_size)
                    temp_bin = []
                    for bin_id in range(bin_len):
                        temp_bin.append(np.sum(count[(start_bin + step_size * bin_id):(start_bin + step_size * (bin_id + 1))]))
                    interval_data.append(temp_bin)
                except Exception as e:
                    logger.error(f"Error processing reads in bin {bin}: {e}")
                    interval_data.append([0] * 67)
                    continue
        try:
            with open(output_file, 'w') as fsdfile:
                sbin = np.arange(65, 400, 5)
                head_str = 'region' + '\t' + '\t'.join([f"{s}-{s+4}" for s in sbin]) + '\n'
                fsdfile.write(head_str)
                for i in range(length):
                    arms = interval_data[i]
                    score = np.zeros(67)
                    if np.sum(arms) != 0:
                        score = np.array(arms) / np.sum(arms)
                    temp_str = region[i] + '\t' + '\t'.join(map(str, score)) + '\n'
                    fsdfile.write(temp_str)
        except Exception as e:
            logger.error(f"Error writing FSD output file: {e}")
            raise typer.Exit(1)
        logger.info(f"FSD calculation complete. Results written to {output_file}")
    except Exception as e:
        logger.error(f"Fatal error in _calc_fsd: {e}")
        raise typer.Exit(1)

def _calc_fsc(bedgz_input, bin_input, windows, continue_n, output_file):
    """
    Internal: Calculate fragment size coverage (FSC) for a single .bed.gz file.
    Handles errors and logs all steps. Raises typer.Exit(1) on fatal errors.
    """
    try:
        logger.info(f"input file: {bedgz_input}, {bin_input}")
        try:
            inputbed = pysam.Tabixfile(filename=bedgz_input, mode="r")
        except Exception as e:
            logger.error(f"Could not open {bedgz_input} as Tabix file: {e}")
            raise typer.Exit(1)
        try:
            bins = pybedtools.BedTool(bin_input)
        except Exception as e:
            logger.error(f"Could not load bins from {bin_input}: {e}")
            raise typer.Exit(1)
        length = len(bins)
        shorts_data, intermediates_data, longs_data, totals_data, bingc = [], [], [], [], []
        chrom = []
        logger.info(f"output file: {output_file}")
        for idx in range(length):
            bin = bins[idx]
            try:
                chrom.append(bin.chrom)
                inputbed.fetch(bin.chrom, bin.start, bin.end)
            except ValueError:
                bingc.append(np.nan)
                shorts_data.append(0)
                intermediates_data.append(0)
                longs_data.append(0)
                totals_data.append(0)
            except Exception as e:
                logger.error(f"Error fetching bin {bin}: {e}")
                raise typer.Exit(1)
            else:
                bin_data = []
                gc = []
                try:
                    for read in inputbed.fetch(bin.chrom, bin.start, bin.end):
                        bin_data.append(int(read.split("\t")[2]) - int(read.split("\t")[1]))
                        if 65 <= int(read.split("\t")[2]) - int(read.split("\t")[1]) <= 400:
                            gc.append(float(read.split("\t")[3]))
                    count = np.bincount(bin_data, minlength=401)
                except Exception as e:
                    logger.error(f"Error processing reads in bin {bin}: {e}")
                    raise typer.Exit(1)
                if len(gc) == 0:
                    bingc.append(np.nan)
                else:
                    bingc.append(np.mean(gc))
                shorts = sum(count[65:150])
                intermediates = sum(count[151:260])
                longs = sum(count[261:400])
                totals = sum(count[65:400])
                shorts_data.append(shorts)
                intermediates_data.append(intermediates)
                longs_data.append(longs)
                totals_data.append(totals)
        try:
            correct_shorts = gc_correct(shorts_data, bingc)
            correct_intermediates = gc_correct(intermediates_data, bingc)
            correct_longs = gc_correct(longs_data, bingc)
            correct_totals = gc_correct(totals_data, bingc)
        except Exception as e:
            logger.error(f"GC correction failed: {e}")
            raise typer.Exit(1)
        start = 0
        step = 0
        short_s, intermediate_s, long_s, total_s = [], [], [], []
        region = []
        try:
            with open(output_file, 'w') as fscfile:
                fscfile.write(
                    "region\tshort-fragment-zscore\tintermediate-fragment-zscore\tlong-fragment-zscore\ttotal-fragment-zscore\n"
                )
                while step < length:
                    num = chrom.count(chrom[step])
                    continues_bin = num // continue_n
                    last_bin = num % continue_n
                    for _ in range(continues_bin):
                        bin_start = start * windows
                        bin_end = (start + continue_n) * windows - 1
                        combine_shorts = correct_shorts[step: step + continue_n]
                        combine_intermediates = correct_intermediates[step: step + continue_n]
                        combine_longs = correct_longs[step: step + continue_n]
                        combine_totals = correct_totals[step: step + continue_n]
                        short_s.append(np.sum(combine_shorts))
                        intermediate_s.append(np.sum(combine_intermediates))
                        long_s.append(np.sum(combine_longs))
                        total_s.append(np.sum(combine_totals))
                        region.append(f"{chrom[step]}:{bin_start}-{bin_end}")
                        step += continue_n
                        start += continue_n
                    if last_bin != 0:
                        step += last_bin
                        start = 0
                try:
                    short_z = (np.array(short_s) - np.mean(short_s)) / np.std(short_s)
                    intermediate_z = (np.array(intermediate_s) - np.mean(intermediate_s)) / np.std(intermediate_s)
                    long_z = (np.array(long_s) - np.mean(long_s)) / np.std(long_s)
                    total_z = (np.array(total_s) - np.mean(total_s)) / np.std(total_s)
                except Exception as e:
                    logger.error(f"Error calculating z-scores: {e}")
                    raise typer.Exit(1)
                for j in range(len(region)):
                    temp_str = f"{region[j]}\t{short_z[j]}\t{intermediate_z[j]}\t{long_z[j]}\t{total_z[j]}\n"
                    fscfile.write(temp_str)
        except Exception as e:
            logger.error(f"Error writing FSC output file: {e}")
            raise typer.Exit(1)
        logger.info(f"FSC calculation complete. Results written to {output_file}")
    except Exception as e:
        logger.error(f"Fatal error in _calc_fsc: {e}")
        raise typer.Exit(1)


def fsc(
    bedgz_path: Path = typer.Argument(..., help="Folder containing .bed.gz files (should be the output directory from motif.py)"),
    bin_input: Optional[Path] = typer.Option(None, "--bin-input", "-b", help="Path to bin file (default: data/ChormosomeBins/hg19_window_100kb.bed)"),
    windows: int = typer.Option(100000, "--windows", "-w", help="Window size (default: 100000)"),
    continue_n: int = typer.Option(50, "--continue-n", "-c", help="Consecutive window number (default: 50)"),
    output: Path = typer.Option(..., "--output", "-o", help="Output folder for results"),
    threads: int = typer.Option(1, "--threads", "-t", help="Number of parallel processes (default: 1)")
):
    """
    Calculate fragment size coverage (FSC) features for all .bed.gz files in a folder.
    """
    # Input checks
    if not bedgz_path.exists():
        logger.error(f"Input directory not found: {bedgz_path}")
        raise typer.Exit(1)
    if bin_input and not bin_input.exists():
        logger.error(f"Bin input file not found: {bin_input}")
        raise typer.Exit(1)
    try:
        output.mkdir(parents=True, exist_ok=True)
    except Exception as e:
        logger.error(f"Could not create output directory {output}: {e}")
        raise typer.Exit(1)
    if not output.exists():
        logger.error(f"Output directory not found: {output}")
        raise typer.Exit(1)
    if not output.is_dir():
        logger.error(f"Output path is not a directory: {output}")
        raise typer.Exit(1)
    if not os.access(output, os.W_OK):
        logger.error(f"Output directory is not writable: {output}")
        raise typer.Exit(1)

    bedgz_files = [f for f in bedgz_path.iterdir() if f.suffixes == ['.bed', '.gz']]
    if not bedgz_files:
        logger.error("No .bed.gz files found in the specified folder.")
        raise typer.Exit(1)
    if bin_input is None:
        # Use package-relative default
        bin_input = Path(__file__).parent / "data" / "ChormosomeBins" / "hg19_window_100kb.bed"
        logger.info(f"No bin_input specified. Using default: {bin_input}")
    if not bin_input.exists():
        logger.error(f"Bin input file does not exist: {bin_input}")
        raise typer.Exit(1)
    logger.info(f"Calculating FSC for {len(bedgz_files)} files...")
    from concurrent.futures import ProcessPoolExecutor, as_completed
    logger.info(f"Starting parallel FSC calculation using {threads} processes...")
    def run_fsc_file(bedgz_file):
        output_file = output / (bedgz_file.stem.replace('.bed', '') + '.FSC.txt')
        _calc_fsc(str(bedgz_file), str(bin_input), windows, continue_n, str(output_file))
        return str(output_file)
    with ProcessPoolExecutor(max_workers=threads) as executor:
        futures = {executor.submit(run_fsc_file, bedgz_file): bedgz_file for bedgz_file in bedgz_files}
        for future in as_completed(futures):
            bedgz_file = futures[future]
            try:
                result = future.result()
                logger.info(f"FSC calculated: {result}")
            except Exception as exc:
                logger.error(f"FSC calculation failed for {bedgz_file}: {exc}")
    logger.info(f"FSC features calculated for {len(bedgz_files)} files.")


def fsr(
    bedgz_path: Path = typer.Argument(..., help="Folder containing .bed.gz files (should be the output directory from motif.py)"),
    bin_input: Optional[Path] = typer.Option(None, "--bin-input", "-b", help="Path to bin file (default: data/ChormosomeBins/hg19_window_100kb.bed)"),
    windows: int = typer.Option(100000, "--windows", "-w", help="Window size (default: 100000)"),
    continue_n: int = typer.Option(50, "--continue-n", "-c", help="Consecutive window number (default: 50)"),
    output: Path = typer.Option(..., "--output", "-o", help="Output folder for results"),
    threads: int = typer.Option(1, "--threads", "-t", help="Number of threads (default: 1)")
):
    """
    Calculate fragment size ratio (FSR) features for all .bed.gz files in a folder.
    The input folder should be the output directory produced by motif.py, containing the .bed.gz files.
    Output files are written to the output directory, one per .bed.gz file.
    """
    if not output.exists():
        output.mkdir(parents=True, exist_ok=True)
    bedgz_files = [f for f in bedgz_path.iterdir() if f.suffixes == ['.bed', '.gz']]
    if not bedgz_files:
        logger.error("No .bed.gz files found in the specified folder.")
        raise typer.Exit(1)
    if bin_input is None:
        bin_input = Path(__file__).parent / "data" / "ChormosomeBins" / "hg19_window_100kb.bed"
        logger.info(f"No bin_input specified. Using default: {bin_input}")
    if not bin_input.exists():
        logger.error(f"Bin input file does not exist: {bin_input}")
        raise typer.Exit(1)
    logger.info(f"Calculating FSR for {len(bedgz_files)} files...")
    from concurrent.futures import ProcessPoolExecutor, as_completed
    logger.info(f"Starting parallel FSR calculation using {threads} processes...")
    def run_fsr_file(bedgz_file):
        output_file = output / (bedgz_file.stem.replace('.bed', '') + '.FSR.txt')
        _calc_fsr(str(bedgz_file), str(bin_input), windows, continue_n, str(output_file))
        return str(output_file)
    with ProcessPoolExecutor(max_workers=threads) as executor:
        futures = {executor.submit(run_fsr_file, bedgz_file): bedgz_file for bedgz_file in bedgz_files}
        for future in as_completed(futures):
            bedgz_file = futures[future]
            try:
                result = future.result()
                logger.info(f"FSR calculated: {result}")
            except Exception as exc:
                logger.error(f"FSR calculation failed for {bedgz_file}: {exc}")
    logger.info(f"FSR features calculated for {len(bedgz_files)} files.")


def fsd(
    bedgz_path: Path = typer.Argument(..., help="Folder containing .bed.gz files (should be the output directory from motif.py)"),
    arms_file: Path = typer.Option(..., "--arms-file", "-a", help="Path to arms/region file (BED format)"),
    output: Path = typer.Option(..., "--output", "-o", help="Output folder for results"),
    threads: int = typer.Option(1, "--threads", "-t", help="Number of threads (default: 1)")
):
    """
    Calculate fragment size distribution (FSD) features for all .bed.gz files in a folder.
    The input folder should be the output directory produced by motif.py, containing the .bed.gz files.
    Output files are written to the output directory, one per .bed.gz file.
    """
    if not output.exists():
        output.mkdir(parents=True, exist_ok=True)
    bedgz_files = [f for f in bedgz_path.iterdir() if f.suffixes == ['.bed', '.gz']]
    if not bedgz_files:
        logger.error("No .bed.gz files found in the specified folder.")
        raise typer.Exit(1)
    if not arms_file.exists():
        logger.error(f"Arms/region file does not exist: {arms_file}")
        raise typer.Exit(1)
    logger.info(f"Calculating FSD for {len(bedgz_files)} files...")
    from concurrent.futures import ProcessPoolExecutor, as_completed
    logger.info(f"Starting parallel FSD calculation using {threads} processes...")
    def run_fsd_file(bedgz_file):
        output_file = output / (bedgz_file.stem.replace('.bed', '') + '.FSD.txt')
        _calc_fsd(str(bedgz_file), str(arms_file), str(output_file))
        return str(output_file)
    with ProcessPoolExecutor(max_workers=threads) as executor:
        futures = {executor.submit(run_fsd_file, bedgz_file): bedgz_file for bedgz_file in bedgz_files}
        for future in as_completed(futures):
            bedgz_file = futures[future]
            try:
                result = future.result()
                logger.info(f"FSD calculated: {result}")
            except Exception as exc:
                logger.error(f"FSD calculation failed for {bedgz_file}: {exc}")
    logger.info(f"FSD features calculated for {len(bedgz_files)} files.")
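For orientation: the core of _calc_fsr and _calc_fsc above is a per-window histogram of fragment lengths (end minus start) built with np.bincount, from which fixed slices give the short (65-149 bp), intermediate (151-259 bp) and long (261-399 bp) counts. A minimal sketch with made-up fragment lengths, not taken from the package:

    import numpy as np

    fragment_lengths = [98, 120, 166, 180, 210, 310, 390]   # hypothetical fragments in one window
    count = np.bincount(fragment_lengths, minlength=401)    # index = fragment length in bp

    shorts = count[65:150].sum()          # 65-149 bp
    intermediates = count[151:260].sum()  # 151-259 bp
    longs = count[261:400].sum()          # 261-399 bp
    totals = count[65:400].sum()          # 65-399 bp
    if totals:
        print(shorts / totals, intermediates / totals, longs / totals)   # the three FSR columns

_calc_fsc keeps the raw counts instead of ratios, GC-corrects them with gc_correct from helpers.py, sums them over continue_n consecutive windows, and reports each fragment class as a z-score across the merged regions.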
krewlyzer/fsd.py
ADDED
@@ -0,0 +1,140 @@
import typer
from pathlib import Path
import logging
import os
import pysam
import pybedtools
import numpy as np
from rich.console import Console
from rich.logging import RichHandler

console = Console()
logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
logger = logging.getLogger("fsd")

def _calc_fsd(bedgz_input, arms_file, output_file):
    """
    Internal: Calculate fragment size distribution (FSD) for a single .bed.gz file.
    Writes region-based fragment size distributions in 5bp bins from 65-399bp.
    """
    try:
        logger.info(f"input file: {bedgz_input}, {arms_file}")
        try:
            inputbed = pysam.Tabixfile(filename=bedgz_input, mode="r")
        except Exception as e:
            logger.error(f"Could not open {bedgz_input} as Tabix file: {e}")
            raise typer.Exit(1)
        try:
            bins = pybedtools.BedTool(arms_file)
        except Exception as e:
            logger.error(f"Could not load bins from {arms_file}: {e}")
            raise typer.Exit(1)
        length = len(bins)
        interval_data = []
        region = []
        logger.info(f"output file: {output_file}")
        for idx in range(length):
            bin = bins[idx]
            region.append(f"{bin.chrom}:{bin.start}-{bin.end}")
            try:
                inputbed.fetch(bin.chrom, bin.start, bin.end)
            except ValueError:
                interval_data.append([0] * 67)
                continue
            except Exception as e:
                logger.error(f"Error fetching bin {bin}: {e}")
                raise typer.Exit(1)
            else:
                bin_data = []
                try:
                    for read in inputbed.fetch(bin.chrom, bin.start, bin.end):
                        bin_data.append(int(read.split("\t")[2]) - int(read.split("\t")[1]))
                    count = np.bincount(bin_data, minlength=401)
                    step_size = 5
                    start_bin = 65
                    end_bin = 400
                    bin_len = int((end_bin - start_bin) / step_size)
                    temp_bin = []
                    for bin_id in range(bin_len):
                        temp_bin.append(np.sum(count[(start_bin + step_size * bin_id):(start_bin + step_size * (bin_id + 1))]))
                    interval_data.append(temp_bin)
                except Exception as e:
                    logger.error(f"Error processing reads in bin {bin}: {e}")
                    interval_data.append([0] * 67)
                    continue
        try:
            with open(output_file, 'w') as fsdfile:
                sbin = np.arange(65, 400, 5)
                head_str = 'region' + '\t' + '\t'.join([f"{s}-{s+4}" for s in sbin]) + '\n'
                fsdfile.write(head_str)
                for i in range(length):
                    arms = interval_data[i]
                    score = np.zeros(67)
                    if np.sum(arms) != 0:
                        score = np.array(arms) / np.sum(arms)
                    temp_str = region[i] + '\t' + '\t'.join(map(str, score)) + '\n'
                    fsdfile.write(temp_str)
        except Exception as e:
            logger.error(f"Error writing FSD output file: {e}")
            raise typer.Exit(1)
        logger.info(f"FSD calculation complete. Results written to {output_file}")
    except Exception as e:
        logger.error(f"Fatal error in _calc_fsd: {e}")
        raise typer.Exit(1)

def fsd(
    bedgz_path: Path = typer.Argument(..., help="Folder containing .bed.gz files (should be the output directory from motif.py)"),
    arms_file: Path = typer.Option(..., "--arms-file", "-a", help="Path to arms/region file (BED format)"),
    output: Path = typer.Option(..., "--output", "-o", help="Output folder for results"),
    threads: int = typer.Option(1, "--threads", "-t", help="Number of threads (default: 1)")
):
    """
    Calculate fragment size distribution (FSD) features for all .bed.gz files in a folder.
    The input folder should be the output directory produced by motif.py, containing the .bed.gz files.
    Output files are written to the output directory, one per .bed.gz file.
    """
    # Input checks
    if not bedgz_path.exists():
        logger.error(f"Input directory not found: {bedgz_path}")
        raise typer.Exit(1)
    if not arms_file.exists():
        logger.error(f"Arms/region file not found: {arms_file}")
        raise typer.Exit(1)
    try:
        output.mkdir(parents=True, exist_ok=True)
    except Exception as e:
        logger.error(f"Could not create output directory {output}: {e}")
        raise typer.Exit(1)
    if not output.exists():
        logger.error(f"Output directory not found: {output}")
        raise typer.Exit(1)
    if not output.is_dir():
        logger.error(f"Output path is not a directory: {output}")
        raise typer.Exit(1)
    if not os.access(output, os.W_OK):
        logger.error(f"Output directory is not writable: {output}")
        raise typer.Exit(1)
    bedgz_files = [f for f in bedgz_path.iterdir() if f.suffixes == ['.bed', '.gz']]
    if not bedgz_files:
        logger.error("No .bed.gz files found in the specified folder.")
        raise typer.Exit(1)
    if not arms_file.exists():
        logger.error(f"Arms/region file does not exist: {arms_file}")
        raise typer.Exit(1)
    logger.info(f"Calculating FSD for {len(bedgz_files)} files...")
    from concurrent.futures import ProcessPoolExecutor, as_completed
    logger.info(f"Starting parallel FSD calculation using {threads} processes...")
    def run_fsd_file(bedgz_file):
        output_file = output / (bedgz_file.stem.replace('.bed', '') + '.FSD.txt')
        _calc_fsd(str(bedgz_file), str(arms_file), str(output_file))
        return str(output_file)
    with ProcessPoolExecutor(max_workers=threads) as executor:
        futures = {executor.submit(run_fsd_file, bedgz_file): bedgz_file for bedgz_file in bedgz_files}
        for future in as_completed(futures):
            bedgz_file = futures[future]
            try:
                result = future.result()
                logger.info(f"FSD calculated: {result}")
            except Exception as exc:
                logger.error(f"FSD calculation failed for {bedgz_file}: {exc}")
    logger.info(f"FSD features calculated for {len(bedgz_files)} files.")
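As in fsc.py, _calc_fsd reduces each region to 67 counts (5 bp bins spanning 65-399 bp) and normalises them to fractions, which become the columns headed 65-69, 70-74, and so on. A minimal sketch with made-up fragment lengths, not taken from the package:

    import numpy as np

    fragment_lengths = [100, 102, 167, 168, 169, 320]   # hypothetical fragments in one region
    count = np.bincount(fragment_lengths, minlength=401)

    edges = np.arange(65, 400, 5)                        # 67 bin starts: 65, 70, ..., 395
    binned = np.array([count[s:s + 5].sum() for s in edges])
    score = binned / binned.sum() if binned.sum() else np.zeros(len(edges))
    print(dict(zip([f"{s}-{s + 4}" for s in edges], score)))   # one FSD output row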