krewlyzer-0.1.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
krewlyzer/__init__.py ADDED
@@ -0,0 +1,3 @@
+ """cfDNAFE - Feature extraction tools for circulating tumor DNA"""
+
+ __version__ = "0.1.4"
krewlyzer/cli.py ADDED
@@ -0,0 +1,53 @@
+ """Command-line interface for cfDNAFE"""
+
+ from typing import Optional
+ import typer
+ from pathlib import Path
+ from rich.console import Console
+ from rich.logging import RichHandler
+ import logging
+
+ console = Console()
+ logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
+ logger = logging.getLogger("krewlyzer-cli")
+
+ def set_log_level(log_level: str = typer.Option("INFO", "--log-level", help="Logging level: DEBUG, INFO, WARNING, ERROR, CRITICAL")):
+     """Set global logging level."""
+     level = getattr(logging, log_level.upper(), logging.INFO)
+     for handler in logging.root.handlers:
+         handler.setLevel(level)
+     logging.getLogger().setLevel(level)
+
+ app = typer.Typer(help="krewlyzer: A comprehensive toolkit for ctDNA fragmentomics analysis.")
+
+ from krewlyzer.motif import motif
+ from krewlyzer.fsc import fsc
+ from krewlyzer.fsr import fsr
+ from krewlyzer.fsd import fsd
+ from krewlyzer.wps import wps
+ from krewlyzer.ocf import ocf
+ from krewlyzer.uxm import uxm
+ from krewlyzer.mfsd import mfsd
+ from krewlyzer.wrapper import run_all
+ from krewlyzer import __version__
+
+ app.command()(motif)
+ app.command()(fsc)
+ app.command()(fsr)
+ app.command()(fsd)
+ app.command()(wps)
+ app.command()(ocf)
+ app.command()(uxm)
+ app.command()(mfsd)
+ app.command()(run_all)
+
+ @app.callback()
+ def main(
+     version: bool = typer.Option(False, "--version", "-v", help="Show version and exit"),
+ ):
+     if version:
+         typer.echo(f"krewlyzer version: {__version__}")
+         raise typer.Exit()
+
+ if __name__ == "__main__":
+     app()
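The callback/subcommand wiring above can be exercised in-process with Typer's test runner. The following is a minimal sketch, not part of the wheel, assuming krewlyzer and its dependencies are installed:

    from typer.testing import CliRunner

    from krewlyzer.cli import app

    runner = CliRunner()

    # The --version flag is handled by the main() callback before any subcommand
    result = runner.invoke(app, ["--version"])
    print(result.stdout)  # e.g. "krewlyzer version: 0.1.4"

    # Each function registered via app.command() is reachable as a subcommand
    result = runner.invoke(app, ["fsc", "--help"])
    print(result.stdout)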
krewlyzer/fsc.py ADDED
@@ -0,0 +1,260 @@
+ import typer
+ from pathlib import Path
+ from typing import Optional
+ import logging
+ import os
+
+ import pysam
+
+ import numpy as np
+ import pandas as pd
+ from rich.console import Console
+ from rich.logging import RichHandler
+
+ console = Console()
+ logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
+ logger = logging.getLogger("fsc")
+
+ from .helpers import gc_correct
+
+
+ def _calc_fsc(
+     bedgz_input: str | Path,
+     bin_input: str | Path,
+     windows: int,
+     continue_n: int,
+     output_file: str | Path
+ ):
+     """
+     Internal: Calculate fragment size coverage (FSC) for a single .bed.gz file.
+     Optimized with vectorized operations.
+     """
+     try:
+         logger.info(f"Processing {bedgz_input} with bins from {bin_input}")
+
+         # Load bins into a DataFrame so they can be grouped by chromosome later
+         try:
+             bins_df = pd.read_csv(bin_input, sep='\t', header=None, usecols=[0, 1, 2], names=['chrom', 'start', 'end'], dtype={'chrom': str, 'start': int, 'end': int})
+         except Exception as e:
+             logger.error(f"Could not load bins from {bin_input}: {e}")
+             raise typer.Exit(1)
+
+         try:
+             tbx = pysam.TabixFile(filename=bedgz_input, mode="r")
+         except Exception as e:
+             logger.error(f"Could not open {bedgz_input} as Tabix file: {e}")
+             raise typer.Exit(1)
+
+         shorts_data = []
+         intermediates_data = []
+         longs_data = []
+         totals_data = []
+         bingc = []
+
+         # Tabix handles random access efficiently, so bins are queried one by one
+         for _, bin_row in bins_df.iterrows():
+             chrom = bin_row['chrom']
+             start = bin_row['start']
+             end = bin_row['end']
+
+             try:
+                 # parser=pysam.asTuple() avoids splitting each line manually
+                 rows = list(tbx.fetch(chrom, start, end, parser=pysam.asTuple()))
+             except ValueError:
+                 # Region not in file (e.g. chromosome not present)
+                 rows = []
+             except Exception as e:
+                 logger.error(f"Error fetching {chrom}:{start}-{end}: {e}")
+                 raise typer.Exit(1)
+
+             if not rows:
+                 bingc.append(np.nan)
+                 shorts_data.append(0)
+                 intermediates_data.append(0)
+                 longs_data.append(0)
+                 totals_data.append(0)
+                 continue
+
+             # Each row is a tuple written by motif.py, (chrom, start, end, gc):
+             # column 1 is the fragment start and column 2 the fragment end
+             try:
+                 _, starts, ends, gcs = zip(*rows)
+                 starts = np.array(starts, dtype=int)
+                 ends = np.array(ends, dtype=int)
+                 gcs = np.array(gcs, dtype=float)
+
+                 lengths = ends - starts
+
+                 # Only fragments of 65-400 bp contribute to the bin GC estimate
+                 mask = (lengths >= 65) & (lengths <= 400)
+                 valid_gcs = gcs[mask]
+
+                 if len(valid_gcs) == 0:
+                     bingc.append(np.nan)
+                 else:
+                     bingc.append(np.mean(valid_gcs))
+
+                 # Fragment counts per size class: shorts 65-150,
+                 # intermediates 151-260, longs 261-400, totals 65-400
+                 shorts = np.sum((lengths >= 65) & (lengths <= 150))
+                 intermediates = np.sum((lengths >= 151) & (lengths <= 260))
+                 longs = np.sum((lengths >= 261) & (lengths <= 400))
+                 totals = np.sum(mask)
+
+                 shorts_data.append(shorts)
+                 intermediates_data.append(intermediates)
+                 longs_data.append(longs)
+                 totals_data.append(totals)
+
+             except Exception as e:
+                 logger.error(f"Error processing data in bin {chrom}:{start}-{end}: {e}")
+                 raise typer.Exit(1)
+
+         # GC correction
+         try:
+             correct_shorts = gc_correct(shorts_data, bingc)
+             correct_intermediates = gc_correct(intermediates_data, bingc)
+             correct_longs = gc_correct(longs_data, bingc)
+             correct_totals = gc_correct(totals_data, bingc)
+         except Exception as e:
+             logger.error(f"GC correction failed: {e}")
+             raise typer.Exit(1)
+
+         # Aggregate consecutive bins into non-overlapping windows of
+         # continue_n bins each. Bins are assumed contiguous and ordered;
+         # aggregation restarts at each chromosome boundary and the
+         # trailing partial window is dropped.
+         df = pd.DataFrame({
+             'chrom': bins_df['chrom'],
+             'start': bins_df['start'],  # bin start
+             'end': bins_df['end'],      # bin end
+             'shorts': correct_shorts,
+             'intermediates': correct_intermediates,
+             'longs': correct_longs,
+             'totals': correct_totals
+         })
+
+         results = []
+
+         for chrom, group in df.groupby('chrom', sort=False):
+             n_bins = len(group)
+             n_windows = n_bins // continue_n
+
+             if n_windows == 0:
+                 continue
+
+             # Truncate to a multiple of continue_n, then reshape so each
+             # row is one window and sum across its bins
+             trunc_len = n_windows * continue_n
+
+             shorts_mat = group['shorts'].values[:trunc_len].reshape(n_windows, continue_n)
+             inter_mat = group['intermediates'].values[:trunc_len].reshape(n_windows, continue_n)
+             longs_mat = group['longs'].values[:trunc_len].reshape(n_windows, continue_n)
+             totals_mat = group['totals'].values[:trunc_len].reshape(n_windows, continue_n)
+
+             sum_shorts = shorts_mat.sum(axis=1)
+             sum_inter = inter_mat.sum(axis=1)
+             sum_longs = longs_mat.sum(axis=1)
+             sum_totals = totals_mat.sum(axis=1)
+
+             # Window coordinates: `windows` is the size of one input bin,
+             # so a window of continue_n bins spans continue_n * windows bases
+             window_starts = np.arange(n_windows) * continue_n * windows
+             window_ends = (np.arange(n_windows) + 1) * continue_n * windows - 1
+
+             results.append(pd.DataFrame({
+                 'chrom': chrom,
+                 'start': window_starts,
+                 'end': window_ends,
+                 'short_sum': sum_shorts,
+                 'inter_sum': sum_inter,
+                 'long_sum': sum_longs,
+                 'total_sum': sum_totals
+             }))
+
+         if not results:
+             logger.warning("No valid windows found.")
+             return
+
+         final_df = pd.concat(results, ignore_index=True)
+
+         # Z-scores are computed globally, across windows from all chromosomes
+         final_df['short_z'] = (final_df['short_sum'] - final_df['short_sum'].mean()) / final_df['short_sum'].std()
+         final_df['inter_z'] = (final_df['inter_sum'] - final_df['inter_sum'].mean()) / final_df['inter_sum'].std()
+         final_df['long_z'] = (final_df['long_sum'] - final_df['long_sum'].mean()) / final_df['long_sum'].std()
+         final_df['total_z'] = (final_df['total_sum'] - final_df['total_sum'].mean()) / final_df['total_sum'].std()
+
+         # Write output
+         with open(output_file, 'w') as f:
+             f.write("region\tshort-fragment-zscore\tintermediate-fragment-zscore\tlong-fragment-zscore\ttotal-fragment-zscore\n")
+             for _, row in final_df.iterrows():
+                 region = f"{row['chrom']}:{int(row['start'])}-{int(row['end'])}"
+                 f.write(f"{region}\t{row['short_z']:.4f}\t{row['inter_z']:.4f}\t{row['long_z']:.4f}\t{row['total_z']:.4f}\n")
+
+         logger.info(f"FSC calculation complete. Results written to {output_file}")
+
+     except typer.Exit:
+         raise
+     except Exception as e:
+         logger.error(f"Fatal error in _calc_fsc: {e}")
+         raise typer.Exit(1)
+
+
+ def fsc(
+     bedgz_path: Path = typer.Argument(..., help="Folder containing .bed.gz files (should be the output directory from motif.py)"),
+     bin_input: Optional[Path] = typer.Option(None, "--bin-input", "-b", help="Path to bin file (default: data/ChormosomeBins/hg19_window_100kb.bed)"),
+     windows: int = typer.Option(100000, "--windows", "-w", help="Size of each input bin in bases (default: 100000)"),
+     continue_n: int = typer.Option(50, "--continue-n", "-c", help="Number of consecutive bins aggregated per window (default: 50)"),
+     output: Path = typer.Option(..., "--output", "-o", help="Output folder for results"),
+     threads: int = typer.Option(1, "--threads", "-t", help="Number of parallel processes (default: 1)")
+ ):
+     """
+     Calculate fragment size coverage (FSC) features for all .bed.gz files in a folder.
+     """
+     # Input checks
+     if not bedgz_path.exists():
+         logger.error(f"Input directory not found: {bedgz_path}")
+         raise typer.Exit(1)
+     try:
+         output.mkdir(parents=True, exist_ok=True)
+     except Exception as e:
+         logger.error(f"Could not create output directory {output}: {e}")
+         raise typer.Exit(1)
+     if not output.is_dir():
+         logger.error(f"Output path is not a directory: {output}")
+         raise typer.Exit(1)
+     # pathlib has no is_writable(); os.access is the portable check
+     if not os.access(output, os.W_OK):
+         logger.error(f"Output directory is not writable: {output}")
+         raise typer.Exit(1)
+
+     bedgz_files = [f for f in bedgz_path.iterdir() if f.suffixes == ['.bed', '.gz']]
+     if not bedgz_files:
+         logger.error("No .bed.gz files found in the specified folder.")
+         raise typer.Exit(1)
+     if bin_input is None:
+         # Use package-relative default
+         bin_input = Path(__file__).parent / "data" / "ChormosomeBins" / "hg19_window_100kb.bed"
+         logger.info(f"No bin_input specified. Using default: {bin_input}")
+     if not bin_input.exists():
+         logger.error(f"Bin input file does not exist: {bin_input}")
+         raise typer.Exit(1)
+     logger.info(f"Calculating FSC for {len(bedgz_files)} files...")
+     from concurrent.futures import ProcessPoolExecutor, as_completed
+     logger.info(f"Starting parallel FSC calculation using {threads} processes...")
+     # Submit the module-level _calc_fsc directly: a function defined inside
+     # fsc() could not be pickled for ProcessPoolExecutor workers
+     with ProcessPoolExecutor(max_workers=threads) as executor:
+         futures = {}
+         for bedgz_file in bedgz_files:
+             output_file = output / (bedgz_file.stem.replace('.bed', '') + '.FSC.txt')
+             future = executor.submit(_calc_fsc, str(bedgz_file), str(bin_input), windows, continue_n, str(output_file))
+             futures[future] = (bedgz_file, output_file)
+         for future in as_completed(futures):
+             bedgz_file, output_file = futures[future]
+             try:
+                 future.result()
+                 logger.info(f"FSC calculated: {output_file}")
+             except Exception as exc:
+                 logger.error(f"FSC calculation failed for {bedgz_file}: {exc}")
+     logger.info(f"FSC features calculated for {len(bedgz_files)} files.")
krewlyzer/fsd.py ADDED
@@ -0,0 +1,164 @@
+ import typer
+ from pathlib import Path
+ import logging
+ import os
+
+ import pysam
+
+ import numpy as np
+ import pandas as pd
+ from rich.console import Console
+ from rich.logging import RichHandler
+
+ console = Console()
+ logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
+ logger = logging.getLogger("fsd")
+
+ def _calc_fsd(
+     bedgz_input: str | Path,
+     arms_file: str | Path,
+     output_file: str | Path
+ ):
+     """
+     Internal: Calculate fragment size distribution (FSD) for a single .bed.gz file.
+     Optimized with vectorized operations.
+     """
+     try:
+         logger.info(f"Processing {bedgz_input} with regions from {arms_file}")
+
+         # Load regions
+         try:
+             bins_df = pd.read_csv(arms_file, sep='\t', header=None, usecols=[0, 1, 2], names=['chrom', 'start', 'end'], dtype={'chrom': str, 'start': int, 'end': int})
+         except Exception as e:
+             logger.error(f"Could not load regions from {arms_file}: {e}")
+             raise typer.Exit(1)
+
+         try:
+             tbx = pysam.TabixFile(filename=bedgz_input, mode="r")
+         except Exception as e:
+             logger.error(f"Could not open {bedgz_input} as Tabix file: {e}")
+             raise typer.Exit(1)
+
+         results = []
+         regions = []
+
+         # Histogram bin edges 65, 70, ..., 400 -> 67 bins of width 5
+         hist_bins = np.arange(65, 401, 5)
+
+         for _, bin_row in bins_df.iterrows():
+             chrom = bin_row['chrom']
+             start = bin_row['start']
+             end = bin_row['end']
+             regions.append(f"{chrom}:{start}-{end}")
+
+             try:
+                 rows = list(tbx.fetch(chrom, start, end, parser=pysam.asTuple()))
+             except ValueError:
+                 rows = []
+             except Exception as e:
+                 logger.error(f"Error fetching {chrom}:{start}-{end}: {e}")
+                 raise typer.Exit(1)
+
+             if not rows:
+                 results.append(np.zeros(67))
+                 continue
+
+             try:
+                 # Vectorized parsing: rows are (chrom, start, end, gc) tuples
+                 _, starts, ends, _ = zip(*rows)
+                 starts = np.array(starts, dtype=int)
+                 ends = np.array(ends, dtype=int)
+                 lengths = ends - starts
+
+                 # Histogram of fragment lengths
+                 counts, _ = np.histogram(lengths, bins=hist_bins)
+
+                 # Normalize to per-region fractions
+                 total = np.sum(counts)
+                 if total > 0:
+                     results.append(counts / total)
+                 else:
+                     results.append(np.zeros(67))
+
+             except Exception as e:
+                 logger.error(f"Error processing data in bin {chrom}:{start}-{end}: {e}")
+                 results.append(np.zeros(67))
+                 continue
+
+         # Write output
+         try:
+             with open(output_file, 'w') as f:
+                 # Each bin [s, s+5) covers the inclusive lengths s to s+4,
+                 # so columns are labelled "65-69", "70-74", ..., "395-399"
+                 header_bins = [f"{s}-{s+4}" for s in hist_bins[:-1]]
+                 f.write('region\t' + '\t'.join(header_bins) + '\n')
+
+                 for i, region in enumerate(regions):
+                     scores = results[i]
+                     f.write(f"{region}\t" + "\t".join(map(str, scores)) + "\n")
+
+         except Exception as e:
+             logger.error(f"Error writing FSD output file: {e}")
+             raise typer.Exit(1)
+
+         logger.info(f"FSD calculation complete. Results written to {output_file}")
+
+     except typer.Exit:
+         raise
+     except Exception as e:
+         logger.error(f"Fatal error in _calc_fsd: {e}")
+         raise typer.Exit(1)
+
+ def fsd(
+     bedgz_path: Path = typer.Argument(..., help="Folder containing .bed.gz files (should be the output directory from motif.py)"),
+     arms_file: Path = typer.Option(..., "--arms-file", "-a", help="Path to arms/region file (BED format)"),
+     output: Path = typer.Option(..., "--output", "-o", help="Output folder for results"),
+     threads: int = typer.Option(1, "--threads", "-t", help="Number of parallel processes (default: 1)")
+ ):
+     """
+     Calculate fragment size distribution (FSD) features for all .bed.gz files in a folder.
+     The input folder should be the output directory produced by motif.py, containing the .bed.gz files.
+     Output files are written to the output directory, one per .bed.gz file.
+     """
+     # Input checks
+     if not bedgz_path.exists():
+         logger.error(f"Input directory not found: {bedgz_path}")
+         raise typer.Exit(1)
+     if not arms_file.exists():
+         logger.error(f"Arms/region file not found: {arms_file}")
+         raise typer.Exit(1)
+     try:
+         output.mkdir(parents=True, exist_ok=True)
+     except Exception as e:
+         logger.error(f"Could not create output directory {output}: {e}")
+         raise typer.Exit(1)
+     if not output.is_dir():
+         logger.error(f"Output path is not a directory: {output}")
+         raise typer.Exit(1)
+     # pathlib has no is_writable(); os.access is the portable check
+     if not os.access(output, os.W_OK):
+         logger.error(f"Output directory is not writable: {output}")
+         raise typer.Exit(1)
+     bedgz_files = [f for f in bedgz_path.iterdir() if f.suffixes == ['.bed', '.gz']]
+     if not bedgz_files:
+         logger.error("No .bed.gz files found in the specified folder.")
+         raise typer.Exit(1)
+     logger.info(f"Calculating FSD for {len(bedgz_files)} files...")
+     from concurrent.futures import ProcessPoolExecutor, as_completed
+     logger.info(f"Starting parallel FSD calculation using {threads} processes...")
+     # Submit the module-level _calc_fsd directly: a function defined inside
+     # fsd() could not be pickled for ProcessPoolExecutor workers
+     with ProcessPoolExecutor(max_workers=threads) as executor:
+         futures = {}
+         for bedgz_file in bedgz_files:
+             output_file = output / (bedgz_file.stem.replace('.bed', '') + '.FSD.txt')
+             future = executor.submit(_calc_fsd, str(bedgz_file), str(arms_file), str(output_file))
+             futures[future] = (bedgz_file, output_file)
+         for future in as_completed(futures):
+             bedgz_file, output_file = futures[future]
+             try:
+                 future.result()
+                 logger.info(f"FSD calculated: {output_file}")
+             except Exception as exc:
+                 logger.error(f"FSD calculation failed for {bedgz_file}: {exc}")
+     logger.info(f"FSD features calculated for {len(bedgz_files)} files.")