krewlyzer 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- krewlyzer/__init__.py +3 -0
- krewlyzer/cli.py +53 -0
- krewlyzer/fsc.py +330 -0
- krewlyzer/fsd.py +170 -0
- krewlyzer/fsr.py +225 -0
- krewlyzer/helpers.py +237 -0
- krewlyzer/mfsd.py +236 -0
- krewlyzer/motif.py +430 -0
- krewlyzer/ocf.py +133 -0
- krewlyzer/uxm.py +188 -0
- krewlyzer/wps.py +264 -0
- krewlyzer/wrapper.py +147 -0
- krewlyzer-0.1.4.dist-info/METADATA +22 -0
- krewlyzer-0.1.4.dist-info/RECORD +18 -0
- krewlyzer-0.1.4.dist-info/WHEEL +5 -0
- krewlyzer-0.1.4.dist-info/entry_points.txt +2 -0
- krewlyzer-0.1.4.dist-info/licenses/LICENSE +619 -0
- krewlyzer-0.1.4.dist-info/top_level.txt +1 -0
krewlyzer/uxm.py
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
import typer
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Optional
|
|
4
|
+
import logging
|
|
5
|
+
import pysam
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
from rich.console import Console
|
|
9
|
+
from rich.logging import RichHandler
|
|
10
|
+
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
11
|
+
import os
|
|
12
|
+
|
|
13
|
+
# Rich-backed logging: route root-logger records (INFO and above) through a
# shared RichHandler so CLI output is colorized; module-specific logger "uxm".
console = Console()
logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
logger = logging.getLogger("uxm")
|
|
16
|
+
|
|
17
|
+
def calc_uxm(
    bam_file: Path,
    mark_file: Path,
    output_file: Path,
    map_quality: int,
    min_cpg: int,
    methy_threshold: float,
    unmethy_threshold: float,
    pe_type: str = "PE"
) -> None:
    """
    Calculate UXM fragment-level methylation for a single BAM and marker file.

    For each marker region, fragments are classified by the methylation calls
    in their Bismark ``XM`` tag ("Z" = methylated CpG, "z" = unmethylated)
    into U (mostly unmethylated), X (mixed) or M (mostly methylated) using the
    supplied ratio thresholds. Output is a .UXM.tsv file with region, U, X, M
    proportions.

    Args:
        bam_file: Input BAM (must carry ``XM`` tags; sorted/indexed on demand).
        mark_file: Marker regions in BED format (chrom, start, end, ...).
        output_file: Destination TSV path.
        map_quality: Minimum mapping quality per read.
        min_cpg: Minimum number of CpG calls (Z + z) required per fragment.
        methy_threshold: Ratio at/above which a fragment counts as M.
        unmethy_threshold: Ratio at/below which a fragment counts as U.
        pe_type: "PE" for paired-end, "SE" for single-end.

    Raises:
        typer.Exit: On any fatal error (logged first).
    """
    try:
        bai = str(bam_file) + ".bai"
        if not os.path.exists(bai):
            # fetch() below requires a coordinate-sorted, indexed BAM.
            pysam.sort("-o", str(bam_file), str(bam_file))
            pysam.index(str(bam_file))
            logger.warning(f"Index file {bai} did not exist. Sorted and indexed BAM.")
        # Parse the marker BED directly. (The previous implementation called
        # pybedtools.BedTool without importing pybedtools -> NameError.)
        marks = []
        with open(mark_file) as bed:
            for raw in bed:
                raw = raw.strip()
                if not raw or raw.startswith(("#", "track", "browser")):
                    continue
                fields = raw.split()
                if len(fields) < 3:
                    continue
                marks.append((fields[0], int(fields[1]), int(fields[2])))
        from krewlyzer.helpers import read_pair_generator
        res = []
        # Context manager closes the BAM handle (previously leaked).
        with pysam.AlignmentFile(str(bam_file)) as input_file:
            for chrom, start, end in marks:
                region = f"{chrom}:{start}-{end}"
                try:
                    input_file.fetch(chrom, start, end)
                except ValueError:
                    # Contig absent from the BAM header/index: emit zeros.
                    res.append(f"{region}\t0\t0\t0")
                    continue
                Ufragment = 0
                Xfragment = 0
                Mfragment = 0
                if pe_type == "PE":
                    for read1, read2 in read_pair_generator(input_file, region):
                        if read1 is None or read2 is None:
                            continue
                        if read1.mapping_quality < map_quality or read2.mapping_quality < map_quality:
                            continue
                        try:
                            m1 = read1.get_tag("XM")
                            m2 = read2.get_tag("XM")
                        except KeyError:
                            continue
                        read1Start = read1.reference_start
                        read1End = read1.reference_end
                        read2Start = read2.reference_start
                        read2End = read2.reference_end
                        # cfDNAFE overlap logic: trim the overlapping prefix of
                        # the trailing mate so shared CpGs are not double-counted.
                        if not read1.is_reverse:  # read1 forward, read2 reverse
                            if read2Start < read1End:
                                overlap = read1End - read2Start
                                num_methylated = m1.count("Z") + m2[overlap:].count("Z")
                                num_unmethylated = m1.count("z") + m2[overlap:].count("z")
                            else:
                                num_methylated = m1.count("Z") + m2.count("Z")
                                num_unmethylated = m1.count("z") + m2.count("z")
                        else:  # read1 reverse, read2 forward
                            if read1Start < read2End:
                                overlap = read2End - read1Start
                                num_methylated = m2.count("Z") + m1[overlap:].count("Z")
                                num_unmethylated = m2.count("z") + m1[overlap:].count("z")
                            else:
                                num_methylated = m1.count("Z") + m2.count("Z")
                                num_unmethylated = m1.count("z") + m2.count("z")
                        if num_methylated + num_unmethylated < min_cpg:
                            continue
                        ratio = num_methylated / (num_methylated + num_unmethylated)
                        if ratio >= methy_threshold:
                            Mfragment += 1
                        elif ratio <= unmethy_threshold:
                            Ufragment += 1
                        else:
                            Xfragment += 1
                elif pe_type == "SE":
                    for read in input_file.fetch(chrom, start, end):
                        if read.mapping_quality < map_quality:
                            continue
                        try:
                            m = read.get_tag("XM")
                        except KeyError:
                            continue
                        num_methylated = m.count("Z")
                        num_unmethylated = m.count("z")
                        if num_methylated + num_unmethylated < min_cpg:
                            continue
                        ratio = num_methylated / (num_methylated + num_unmethylated)
                        if ratio >= methy_threshold:
                            Mfragment += 1
                        elif ratio <= unmethy_threshold:
                            Ufragment += 1
                        else:
                            Xfragment += 1
                else:
                    logger.error("type must be SE or PE")
                    raise typer.Exit(1)
                total = Mfragment + Ufragment + Xfragment
                if total == 0:
                    res.append(f"{region}\t0\t0\t0")
                else:
                    # Report U/X/M fragment proportions for this region.
                    res.append(
                        f"{region}\t{Ufragment / total}\t{Xfragment / total}\t{Mfragment / total}"
                    )
        with open(output_file, 'w') as f:
            f.write('region\tU\tX\tM\n')
            for row in res:
                f.write(row + '\n')
        logger.info(f"UXM calculation complete for {bam_file}. Results in {output_file}.")
    except Exception as e:
        logger.error(f"Fatal error in calc_uxm: {e}")
        raise typer.Exit(1)
|
|
132
|
+
|
|
133
|
+
def uxm(
    bam_path: Path = typer.Argument(..., help="Folder containing .bam files for UXM calculation."),
    mark_input: Optional[Path] = typer.Option(None, "--mark-input", "-m", help="Marker BED file (default: packaged atlas)", show_default=False),
    output: Path = typer.Option(..., "--output", "-o", help="Output folder for results"),
    map_quality: int = typer.Option(30, "--map-quality", "-q", help="Minimum mapping quality"),
    min_cpg: int = typer.Option(4, "--min-cpg", "-c", help="Minimum CpG count per fragment"),
    methy_threshold: float = typer.Option(0.75, "--methy-threshold", "-tM", help="Methylation threshold for M fragments"),
    unmethy_threshold: float = typer.Option(0.25, "--unmethy-threshold", "-tU", help="Unmethylation threshold for U fragments"),
    pe_type: str = typer.Option("SE", "--type", help="Fragment type: SE or PE (default: SE)"),
    threads: int = typer.Option(1, "--threads", "-t", help="Number of parallel processes (default: 1)")
) -> None:
    """
    Calculate fragment-level methylation (UXM) features for all BAM files in a folder.

    One ``<sample>.UXM.tsv`` is written per input BAM; BAMs are processed in
    parallel across ``threads`` worker processes. Per-file failures are
    logged and do not abort the remaining files.
    """
    # Input checks
    if not bam_path.exists():
        logger.error(f"Input BAM directory not found: {bam_path}")
        raise typer.Exit(1)
    if mark_input and not mark_input.exists():
        logger.error(f"Marker BED file not found: {mark_input}")
        raise typer.Exit(1)
    try:
        output.mkdir(parents=True, exist_ok=True)
    except Exception as e:
        logger.error(f"Could not create output directory {output}: {e}")
        raise typer.Exit(1)
    if mark_input is None:
        # Fall back to the atlas bundled with the package.
        pkg_dir = Path(__file__).parent
        mark_input = pkg_dir / "data/MethMark/Atlas.U25.l4.hg19.bed"
    # sorted() makes processing order deterministic across platforms.
    bam_files = sorted(Path(bam_path).glob("*.bam"))
    output = Path(output)
    # Submit the module-level calc_uxm directly: the previous nested closure
    # could not be pickled, so every ProcessPoolExecutor submission failed.
    with ProcessPoolExecutor(max_workers=threads) as executor:
        futures = {
            executor.submit(
                calc_uxm,
                bam_file,
                mark_input,
                output / f"{bam_file.stem}.UXM.tsv",  # .stem already drops ".bam"
                map_quality,
                min_cpg,
                methy_threshold,
                unmethy_threshold,
                pe_type,
            ): bam_file
            for bam_file in bam_files
        }
        for future in as_completed(futures):
            bam_file = futures[future]
            try:
                future.result()
                logger.info(f"UXM calculated: {output / (bam_file.stem + '.UXM.tsv')}")
            except Exception as exc:
                logger.error(f"UXM calculation failed for {bam_file}: {exc}")
    logger.info(f"UXM features calculated for {len(bam_files)} files.")
|
krewlyzer/wps.py
ADDED
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
import typer
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
import logging
|
|
4
|
+
import pysam
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
from collections import defaultdict
|
|
8
|
+
import gzip
|
|
9
|
+
from rich.console import Console
|
|
10
|
+
from rich.logging import RichHandler
|
|
11
|
+
|
|
12
|
+
from .helpers import max_core, commonError
|
|
13
|
+
|
|
14
|
+
# Rich-backed logging: route root-logger records (INFO and above) through a
# shared RichHandler so CLI output is colorized; module-specific logger "wps".
console = Console()
logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
logger = logging.getLogger("wps")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
import pandas as pd
|
|
20
|
+
|
|
21
|
+
def _calc_wps(
    bedgz_input,
    tsv_input,
    output_file_pattern: str,
    empty: bool = False,
    protect_input: int = 120,
    min_size: int = 120,
    max_size: int = 180
):
    """
    Calculate Windowed Protection Score (WPS) for a single .bed.gz fragment
    file against a transcript/region file. Writes one gzipped TSV per region
    with columns: chrom, pos, coverage, fragment-endpoint count, WPS.

    NOTE: the previous signature annotated ``bedgz_input``/``tsv_input`` as
    ``str | Path``, which raises TypeError at import time on Python 3.8/3.9
    even though the package declares Requires-Python >=3.8; the union
    annotations were dropped for compatibility.

    Args:
        bedgz_input: tabix-indexed fragment .bed.gz file (str or Path).
        tsv_input: whitespace-separated region file with at least
            (id, chrom, start, end, strand) columns; coordinates are treated
            as 1-based inclusive.
        output_file_pattern: pattern with one ``%s`` slot for the region id.
        empty: also write regions containing no qualifying fragments.
        protect_input: full protection-window width (halved to each side).
        min_size: minimum fragment length kept (inclusive).
        max_size: maximum fragment length kept (inclusive).

    Raises:
        typer.Exit: on any fatal error (logged first).
    """
    try:
        bedgzfile = str(bedgz_input)
        tbx = pysam.TabixFile(bedgzfile)
        try:
            protection = protect_input // 2

            logger.info(f"Processing {bedgz_input} with regions from {tsv_input}")

            with open(tsv_input, 'r') as infile:
                # Autosomes plus X/Y, without the "chr" prefix.
                valid_chroms = set(map(str, list(range(1, 23)) + ["X", "Y"]))

                for line in infile:
                    if not line.strip():
                        continue
                    parts = line.split()
                    if len(parts) < 5:
                        continue
                    cid, chrom, start_str, end_str, strand = parts[:5]
                    chrom = chrom.replace("chr", "")
                    if chrom not in valid_chroms:
                        continue

                    region_start = int(float(start_str))
                    region_end = int(float(end_str))

                    if region_start < 1:
                        continue

                    # Region length (1-based inclusive coordinates).
                    length = region_end - region_start + 1

                    # Per-position accumulators; index 0 == region_start.
                    cov_arr = np.zeros(length, dtype=int)
                    start_arr = np.zeros(length, dtype=int)
                    gcount_arr = np.zeros(length, dtype=int)  # fragments spanning the window
                    total_arr = np.zeros(length, dtype=int)   # fragments overlapping the window

                    # Fetch fragments extending `protection` bp beyond the
                    # region; pysam fetch is 0-based half-open.
                    fetch_start = max(0, region_start - protection - 1)
                    fetch_end = region_end + protection

                    fetch_chrom = "chr" + chrom if not chrom.startswith("chr") else chrom

                    try:
                        rows = list(tbx.fetch(fetch_chrom, fetch_start, fetch_end, parser=pysam.asTuple()))
                    except ValueError:
                        # Retry without the "chr" prefix before giving up.
                        try:
                            rows = list(tbx.fetch(chrom, fetch_start, fetch_end, parser=pysam.asTuple()))
                        except ValueError:
                            rows = []
                    except Exception as e:
                        logger.error(f"Error fetching region {chrom}:{region_start}-{region_end}: {e}")
                        continue

                    if not rows:
                        if not empty:
                            continue

                    for row in rows:
                        # BED: 0-based start, 0-based exclusive end.
                        rstart = int(row[1])
                        rend = int(row[2])
                        lseq = rend - rstart

                        if lseq < min_size or lseq > max_size:
                            continue

                        # 1-based inclusive fragment coordinates.
                        r_start_1 = rstart + 1
                        r_end_1 = rend

                        # 1. Coverage over [r_start_1, r_end_1].
                        ov_start = max(region_start, r_start_1)
                        ov_end = min(region_end, r_end_1)
                        if ov_start <= ov_end:
                            cov_arr[ov_start - region_start:ov_end - region_start + 1] += 1

                        # 2. Fragment endpoints falling inside the region.
                        if region_start <= r_start_1 <= region_end:
                            start_arr[r_start_1 - region_start] += 1
                        if region_start <= r_end_1 <= region_end:
                            start_arr[r_end_1 - region_start] += 1

                        # 3a. Spanning: window [k-P, k+P] lies fully inside the
                        # fragment, i.e. k in [r_start_1 + P, r_end_1 - P].
                        g_ov_start = max(region_start, r_start_1 + protection)
                        g_ov_end = min(region_end, r_end_1 - protection)
                        if g_ov_start <= g_ov_end:
                            gcount_arr[g_ov_start - region_start:g_ov_end - region_start + 1] += 1

                        # 3b. Overlapping: window [k-P, k+P] touches the
                        # fragment, i.e. k in [r_start_1 - P, r_end_1 + P].
                        t_ov_start = max(region_start, r_start_1 - protection)
                        t_ov_end = min(region_end, r_end_1 + protection)
                        if t_ov_start <= t_ov_end:
                            total_arr[t_ov_start - region_start:t_ov_end - region_start + 1] += 1

                    # WPS = spanning - (overlapping - spanning)
                    #     = 2 * spanning - overlapping
                    wps_arr = 2 * gcount_arr - total_arr

                    if np.sum(cov_arr) == 0 and not empty:
                        continue

                    filename = output_file_pattern % cid
                    positions = np.arange(region_start, region_end + 1)

                    # Minus-strand regions are written 3'->5' (rows reversed);
                    # output text is identical to the previous
                    # DataFrame.iterrows() loop but without per-row pandas
                    # overhead (pandas is no longer needed here).
                    order = slice(None, None, -1) if strand == "-" else slice(None)
                    with gzip.open(filename, 'wt') as outfile:
                        for pos, cov, nstart, wp in zip(positions[order], cov_arr[order],
                                                        start_arr[order], wps_arr[order]):
                            outfile.write(f"{chrom}\t{pos}\t{cov}\t{nstart}\t{wp}\n")

            logger.info(f"WPS calculation complete. Results written to pattern: {output_file_pattern}")
        finally:
            # The original leaked the tabix handle.
            tbx.close()
    except Exception as e:
        logger.error(f"Fatal error in _calc_wps: {e}")
        raise typer.Exit(1)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def wps(
    bedgz_path: Path = typer.Argument(..., help="Folder containing .bed.gz files (should be the output directory from motif.py)"),
    tsv_input: Path = typer.Option(None, "--tsv-input", "-t", help="Path to transcript/region file (TSV format)"),
    output: Path = typer.Option(..., "--output", "-o", help="Output folder for results"),
    wpstype: str = typer.Option('L', "--wpstype", "-w", help="WPS type: 'L' for long (default), 'S' for short"),
    empty: bool = typer.Option(False, "--empty", help="Keep files of empty blocks (default: False)"),
    threads: int = typer.Option(1, "--threads", "-p", help="Number of threads (default: 1)")
):
    """
    Calculate Windowed Protection Score (WPS) features for all .bed.gz files in a folder.

    Fragment-size window and protection width are selected by ``wpstype``
    ('L': 120-180 bp fragments, 120 bp protection; otherwise 35-80 bp
    fragments, 16 bp protection). Files are processed in parallel; per-file
    failures are logged without aborting the run.
    """
    # Input checks
    if not bedgz_path.exists():
        logger.error(f"Input directory not found: {bedgz_path}")
        raise typer.Exit(1)
    if tsv_input and not tsv_input.exists():
        logger.error(f"Transcript region file not found: {tsv_input}")
        raise typer.Exit(1)
    try:
        output.mkdir(parents=True, exist_ok=True)
    except Exception as e:
        logger.error(f"Could not create output directory {output}: {e}")
        raise typer.Exit(1)
    # Writability probe. (The original called output.touch() on the directory
    # itself, which does not test that files can be created inside it.)
    probe = output / ".krewlyzer_write_test"
    try:
        probe.touch()
        probe.unlink()
    except Exception as e:
        logger.error(f"Output directory {output} is not writable: {e}")
        raise typer.Exit(1)
    try:
        bedgz_files = list(Path(bedgz_path).glob("*.bed.gz"))
        if not bedgz_files:
            logger.error("No .bed.gz files found in the specified folder.")
            raise typer.Exit(1)
        if tsv_input is None:
            # Default to package data transcriptAnno-hg19-1kb.tsv
            tsv_input = Path(__file__).parent.parent / "data" / "TranscriptAnno" / "transcriptAnno-hg19-1kb.tsv"
            logger.info(f"No tsv_input specified. Using default: {tsv_input}")
        if not tsv_input.exists():
            logger.error(f"Transcript/region file does not exist: {tsv_input}")
            raise typer.Exit(1)
        if wpstype == 'L':
            protect_input, min_size, max_size = 120, 120, 180
        else:
            protect_input, min_size, max_size = 16, 35, 80
        from concurrent.futures import ProcessPoolExecutor, as_completed
        n_procs = max_core(threads) if threads else 1
        logger.info(f"Calculating WPS for {len(bedgz_files)} files using {n_procs} processes...")
        # Submit the module-level _calc_wps directly: the previous nested
        # wps_task closure could not be pickled by ProcessPoolExecutor, so
        # every worker submission failed.
        with ProcessPoolExecutor(max_workers=n_procs) as executor:
            futures = {}
            for bedgz_file in bedgz_files:
                # "sample.bed.gz" -> stem "sample.bed" -> "sample"
                pattern = str(output / (bedgz_file.stem.replace('.bed', '') + ".%s.WPS.tsv.gz"))
                futures[executor.submit(
                    _calc_wps,
                    bedgz_input=str(bedgz_file),
                    tsv_input=str(tsv_input),
                    output_file_pattern=pattern,
                    empty=empty,
                    protect_input=protect_input,
                    min_size=min_size,
                    max_size=max_size,
                )] = bedgz_file
            for future in as_completed(futures):
                try:
                    future.result()
                except Exception as exc:
                    logger.error(f"WPS calculation failed for {futures[future]}: {exc}")
        logger.info(f"WPS features calculated for {len(bedgz_files)} files.")
    except typer.Exit:
        # Deliberate exits above: re-raise without double-logging.
        raise
    except Exception as e:
        logger.error(f"Fatal error in wps CLI: {e}")
        raise typer.Exit(1)
|
krewlyzer/wrapper.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
import typer
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
import logging
|
|
4
|
+
from rich.console import Console
|
|
5
|
+
from rich.logging import RichHandler
|
|
6
|
+
from .motif import motif
|
|
7
|
+
from .fsc import fsc
|
|
8
|
+
from .fsr import fsr
|
|
9
|
+
from .fsd import fsd
|
|
10
|
+
from .wps import wps
|
|
11
|
+
from .ocf import ocf
|
|
12
|
+
from .uxm import uxm
|
|
13
|
+
from .mfsd import mfsd
|
|
14
|
+
from typing import Optional
|
|
15
|
+
|
|
16
|
+
# Rich-backed logging: route root-logger records (INFO and above) through a
# shared RichHandler for colorized CLI output; wrapper-specific logger name.
console = Console()
logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
logger = logging.getLogger("krewlyzer-wrapper")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def run_all(
    bam_file: Path = typer.Argument(..., help="Input BAM file (sorted, indexed)"),
    reference: Path = typer.Option(..., "--reference", "-g", help="Reference genome FASTA file for motif extraction"),
    output: Path = typer.Option(..., "--output", "-o", help="Output directory for all results"),
    variant_input: Optional[Path] = typer.Option(None, "--variant-input", "-v", help="Input VCF/MAF file for mFSD analysis"),
    threads: int = typer.Option(1, "--threads", "-t", help="Number of parallel processes for each step"),
    pe_type: str = typer.Option("SE", "--type", help="Fragment type for UXM: SE or PE (default: SE)")
):
    """
    Run all feature extraction commands (motif, fsc, fsr, fsd, wps, ocf, uxm, mfsd) for a single BAM file.

    Steps run sequentially, each writing into its own subdirectory of
    ``output``. Motif extraction runs first because fsc/fsr/fsd/wps/ocf all
    consume its output directory. Any failed required step logs an error and
    exits with code 1; mFSD is optional and only logs on failure.
    """
    # Input checks
    if not bam_file.exists() or not bam_file.is_file():
        logger.error(f"Input BAM file not found: {bam_file}")
        raise typer.Exit(1)
    if not reference.exists() or not reference.is_file():
        logger.error(f"Reference FASTA file not found: {reference}")
        raise typer.Exit(1)
    try:
        output.mkdir(parents=True, exist_ok=True)
    except Exception as e:
        logger.error(f"Could not create output directory {output}: {e}")
        raise typer.Exit(1)
    # 1. Motif extraction (produces the .bed.gz files consumed by steps 2-6)
    motif_output = output / "motif"
    try:
        motif(
            bam_path=bam_file,
            reference=reference,
            output=motif_output,
            minlen=65,
            maxlen=400,
            k=3,
            verbose=True,
            threads=threads,
        )
    except Exception as e:
        logger.error(f"Motif extraction failed: {e}")
        raise typer.Exit(1)
    # 2. FSC
    fsc_output = output / "fsc"
    try:
        fsc(
            bedgz_path=motif_output,
            output=fsc_output,
            threads=threads
        )
    except Exception as e:
        logger.error(f"FSC calculation failed: {e}")
        raise typer.Exit(1)
    # 3. FSR
    fsr_output = output / "fsr"
    try:
        fsr(
            bedgz_path=motif_output,
            output=fsr_output,
            threads=threads
        )
    except Exception as e:
        logger.error(f"FSR calculation failed: {e}")
        raise typer.Exit(1)
    # 4. FSD
    fsd_output = output / "fsd"
    try:
        fsd(
            bedgz_path=motif_output,
            output=fsd_output,
            arms_file=None,  # presumably falls back to fsd's packaged arm file — verify
            threads=threads
        )
    except Exception as e:
        logger.error(f"FSD calculation failed: {e}")
        raise typer.Exit(1)
    # 5. WPS
    wps_output = output / "wps"
    try:
        wps(
            bedgz_path=motif_output,
            output=wps_output,
            threads=threads
        )
    except Exception as e:
        logger.error(f"WPS calculation failed: {e}")
        raise typer.Exit(1)
    # 6. OCF
    ocf_output = output / "ocf"
    try:
        ocf(
            bedgz_path=motif_output,
            output=ocf_output,
            threads=threads
        )
    except Exception as e:
        logger.error(f"OCF calculation failed: {e}")
        raise typer.Exit(1)
    # 7. UXM
    # NOTE(review): uxm receives bam_file.parent, so every sibling BAM in
    # that directory is processed, not only bam_file — confirm intended.
    uxm_output = output / "uxm"
    try:
        uxm(
            bam_path=bam_file.parent,
            output=uxm_output,
            pe_type=pe_type,
            threads=threads
        )
    except Exception as e:
        logger.error(f"UXM calculation failed: {e}")
        raise typer.Exit(1)

    # 8. mFSD (Optional)
    if variant_input:
        if not variant_input.exists():
            logger.warning(f"Variant input file not found: {variant_input}. Skipping mFSD.")
        else:
            # NOTE(review): output/"mfsd" is not created here; presumably
            # mfsd() creates parent directories for its output path — verify.
            mfsd_output = output / "mfsd" / (bam_file.stem + ".mfsd.tsv")
            try:
                mfsd(
                    bam_path=bam_file,
                    input_file=variant_input,
                    output=mfsd_output,
                    format="auto",
                    map_quality=20
                )
            except Exception as e:
                logger.error(f"mFSD calculation failed: {e}")
                # Don't raise exit here, just log error as it's optional

    logger.info(f"All feature extraction complete. Results saved to {output}")
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: krewlyzer
|
|
3
|
+
Version: 0.1.4
|
|
4
|
+
Summary: Feature extraction tools for circulating tumor DNA from GRCh37 aligned BAM files
|
|
5
|
+
Author-email: Ronak Shah <shahr2@mskcc.org>
|
|
6
|
+
Project-URL: Homepage, https://github.com/msk-access/krewlyzer
|
|
7
|
+
Requires-Python: >=3.8
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Requires-Dist: typer>=0.12.3
|
|
10
|
+
Requires-Dist: pysam>=0.20.0
|
|
11
|
+
Requires-Dist: pandas>=2.0.0
|
|
12
|
+
Requires-Dist: biopython>=1.81
|
|
13
|
+
Requires-Dist: scikit-misc>=0.1.4
|
|
14
|
+
Requires-Dist: scipy>=1.10.0
|
|
15
|
+
Requires-Dist: rich>=13.0.0
|
|
16
|
+
Provides-Extra: docs
|
|
17
|
+
Requires-Dist: mkdocs>=1.5.0; extra == "docs"
|
|
18
|
+
Requires-Dist: mkdocs-material>=9.5.0; extra == "docs"
|
|
19
|
+
Provides-Extra: test
|
|
20
|
+
Requires-Dist: pytest>=7.0.0; extra == "test"
|
|
21
|
+
Requires-Dist: pytest-mock>=3.10.0; extra == "test"
|
|
22
|
+
Dynamic: license-file
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
krewlyzer/__init__.py,sha256=trkZFdHJ1IoLrxZb8PVQe2wSGn_ojPODNqHbjLSL1_c,90
|
|
2
|
+
krewlyzer/cli.py,sha256=4a_gl59n4RguPMFsRi7OZ0-uhAfk7PFDOgFdUvD1a1A,1553
|
|
3
|
+
krewlyzer/fsc.py,sha256=O3VZz-lY-s603Pq-94txL49vBSbxSMPIxOmVykUvHS8,14548
|
|
4
|
+
krewlyzer/fsd.py,sha256=u91sb8WLNv6t0DBwg_SwFL-RLaWPGNb2f4LrBKzh0wQ,6857
|
|
5
|
+
krewlyzer/fsr.py,sha256=ee5ZijYjUAdIvs2TVeBvmm6qQwUCp6XjenVQUSeXCBQ,9411
|
|
6
|
+
krewlyzer/helpers.py,sha256=NA7YBfIRYKj_ZdjiwG96yOyd8IC03VPgBBokfj1a1_g,8540
|
|
7
|
+
krewlyzer/mfsd.py,sha256=C7xAaTPSU-KOL0Mvu5nYBugfCidOVN9YWaFZmDfe0Uw,9451
|
|
8
|
+
krewlyzer/motif.py,sha256=6xCocdFAn0BBssxWzXhGxHac0OHvVKRg6ovQQ-XLr_c,20610
|
|
9
|
+
krewlyzer/ocf.py,sha256=cSIy8ax-EWVONarFZgvunIexbP2q39oY66R0i9Xd9-0,5832
|
|
10
|
+
krewlyzer/uxm.py,sha256=LOyEXCDQimucTTwl2A95dfHJDFyKK0S6MpREpFnOVXI,8482
|
|
11
|
+
krewlyzer/wps.py,sha256=7-u_VJ2kdYfvq-CSuCrCTRoRjlIRMJopB5K0jAr1G-E,11279
|
|
12
|
+
krewlyzer/wrapper.py,sha256=V6BvaxlHl1z7JBMtQC1AteNrTQ9oYf2hHIleje3Cb-M,4715
|
|
13
|
+
krewlyzer-0.1.4.dist-info/licenses/LICENSE,sha256=DuJF49YfFt6g7la7cekWI06XA7ImodNiTEOrEBsOkpk,32365
|
|
14
|
+
krewlyzer-0.1.4.dist-info/METADATA,sha256=Za75HoZckHD7LjPZmbrzI3lckh0YPFztei4E4R9uxiI,767
|
|
15
|
+
krewlyzer-0.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
16
|
+
krewlyzer-0.1.4.dist-info/entry_points.txt,sha256=x9Wngsqelv0MxiUvgDDx3hVNR-MhDbASxwKpNeBiX8I,48
|
|
17
|
+
krewlyzer-0.1.4.dist-info/top_level.txt,sha256=bFO6hK-X3pxPGZ7ewoOVx80u9p_Npyy9NW8XophctdY,10
|
|
18
|
+
krewlyzer-0.1.4.dist-info/RECORD,,
|