krewlyzer 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- krewlyzer/__init__.py +1 -0
- krewlyzer/cli.py +47 -0
- krewlyzer/fsc.py +450 -0
- krewlyzer/fsd.py +139 -0
- krewlyzer/fsr.py +171 -0
- krewlyzer/helpers.py +187 -0
- krewlyzer/motif.py +275 -0
- krewlyzer/ocf.py +133 -0
- krewlyzer/uxm.py +188 -0
- krewlyzer/wps.py +173 -0
- krewlyzer/wrapper.py +125 -0
- krewlyzer-0.1.0.dist-info/METADATA +15 -0
- krewlyzer-0.1.0.dist-info/RECORD +17 -0
- krewlyzer-0.1.0.dist-info/WHEEL +5 -0
- krewlyzer-0.1.0.dist-info/entry_points.txt +2 -0
- krewlyzer-0.1.0.dist-info/licenses/LICENSE +619 -0
- krewlyzer-0.1.0.dist-info/top_level.txt +1 -0
krewlyzer/ocf.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
import typer
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Optional
|
|
4
|
+
import logging
|
|
5
|
+
import pysam
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from collections import defaultdict
|
|
8
|
+
from functools import partial
|
|
9
|
+
from rich.console import Console
|
|
10
|
+
from rich.logging import RichHandler
|
|
11
|
+
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
12
|
+
import os
|
|
13
|
+
|
|
14
|
+
console = Console()
|
|
15
|
+
logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
|
|
16
|
+
logger = logging.getLogger("ocf")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def calc_ocf(bedgz_file: Path, ocr_file: Path, output_dir: Path):
    """
    Calculate OCF (orientation-aware cfDNA fragmentation) for a single
    .bed.gz fragment file against a set of open chromatin regions (OCRs).

    Writes one ``<label>.sync.end`` file per OCR label (per-position left/right
    fragment-end counts, raw and normalized) and a summary ``all.ocf.csv``
    with one OCF value per tissue label.

    Args:
        bedgz_file: Tabix-indexed fragment BED (.bed.gz) file.
        ocr_file: Tab-separated OCR file with columns chr, start, end, description.
        output_dir: Directory that receives the per-label and summary outputs.

    Raises:
        typer.Exit: On any unrecoverable error (logged first).
    """
    try:
        tbx = pysam.TabixFile(str(bedgz_file))
        regions = pd.read_csv(ocr_file, sep="\t", header=None, names=["chr", "start", "end", "description"])
        # Per-label histograms of fragment end positions relative to region start.
        leftPOS = defaultdict(partial(defaultdict, int))
        rightPOS = defaultdict(partial(defaultdict, int))
        # total[label] = [left-end count, right-end count]
        total = defaultdict(lambda: [0, 0])
        for _, region in regions.iterrows():
            region_Chr, region_Start, region_End, region_Label = (
                region["chr"], region["start"], region["end"], region["description"])
            try:
                fetched_reads = tbx.fetch(region_Chr, region_Start, region_End)
            except ValueError:
                # Contig absent from the tabix index; skip this region.
                continue
            for row in fetched_reads:
                tmp_row = row.split()
                rstart = int(tmp_row[1])
                rend = int(tmp_row[2])
                if rstart >= region_Start:
                    s = rstart - region_Start
                    leftPOS[region_Label][s] += 1
                    total[region_Label][0] += 1
                if rend <= region_End:
                    e = rend - region_Start + 1
                    rightPOS[region_Label][e] += 1
                    total[region_Label][1] += 1
        Labels = []
        ocf = []
        outputfile = output_dir / 'all.ocf.csv'
        for label in total.keys():
            output = output_dir / f'{label}.sync.end'
            Labels.append(label)
            left_hist = leftPOS[label]
            right_hist = rightPOS[label]
            # Normalization factors (counts per 10k); fall back to 1 so labels
            # with zero fragments on one side do not divide by zero.
            ts = total[label][0] / 10000 if total[label][0] else 1
            te = total[label][1] / 10000 if total[label][1] else 1
            num = 2000
            with open(output, 'w') as output_write:
                for k in range(num):
                    left_count = left_hist[k]
                    right_count = right_hist[k]
                    # Positions are reported relative to the region center (+/-1000 bp).
                    output_write.write(
                        f"{k - 1000}\t{left_count}\t{left_count / ts}\t{right_count}\t{right_count / te}\n")
            # OCF calculation: compare "true" fragment ends against background
            # within +/- `flank` bp windows centered at the +/- `peak` positions.
            with open(output, 'r') as o:
                peak = 60
                flank = 10  # renamed from `bin` to avoid shadowing the builtin
                trueends = 0
                background = 0
                for line in o.readlines():
                    loc, left, Left, right, Right = line.split()
                    loc = int(loc)
                    if -peak - flank <= loc <= -peak + flank:
                        trueends += float(Right)
                        background += float(Left)
                    elif peak - flank <= loc <= peak + flank:
                        trueends += float(Left)
                        background += float(Right)
                ocf.append(trueends - background)
        # pandas is already imported at module level; the original redundant
        # in-function `import pandas as pd` was removed.
        ocf_df = pd.DataFrame({"tissue": Labels, "OCF": ocf})
        ocf_df.to_csv(outputfile, sep="\t", index=None)
        logger.info(f"OCF calculation complete for {bedgz_file}. Results in {output_dir}.")
    except Exception as e:
        logger.error(f"Fatal error in calc_ocf: {e}")
        raise typer.Exit(1)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _ocf_worker(bedgz_file: Path, ocr_input: Path, output: Path) -> str:
    """Run calc_ocf for one .bed.gz file into its own per-sample directory.

    Defined at module level (rather than as a closure inside ``ocf``) so it
    can be pickled and shipped to ProcessPoolExecutor worker processes —
    nested functions are not picklable and would make every submit() fail.
    """
    sample_dir = output / bedgz_file.stem.replace('.bed', '')
    sample_dir.mkdir(exist_ok=True)
    calc_ocf(bedgz_file, ocr_input, sample_dir)
    return str(sample_dir)


def ocf(
    bedgz_path: Path = typer.Argument(..., help="Folder containing .bed.gz files (should be the output directory from motif.py)"),
    ocr_input: Optional[Path] = typer.Option(None, "--ocr-input", "-r", help="Path to open chromatin region BED file (default: packaged tissue file)"),
    output: Path = typer.Option(..., "--output", "-o", help="Output folder for results"),
    threads: int = typer.Option(1, "--threads", "-t", help="Number of parallel processes (default: 1)")
):
    """
    Calculate orientation-aware cfDNA fragmentation (OCF) features for all .bed.gz files in a folder.

    Each input file gets its own sub-directory under ``output`` containing the
    per-tissue .sync.end files and an all.ocf.csv summary.
    """
    # Input checks
    if not bedgz_path.exists():
        logger.error(f"Input directory not found: {bedgz_path}")
        raise typer.Exit(1)
    if ocr_input and not ocr_input.exists():
        logger.error(f"OCR region BED file not found: {ocr_input}")
        raise typer.Exit(1)
    try:
        output.mkdir(parents=True, exist_ok=True)
    except Exception as e:
        logger.error(f"Could not create output directory {output}: {e}")
        raise typer.Exit(1)
    # Set default OCR file if not provided (packaged tissue-specific regions).
    if ocr_input is None:
        pkg_dir = Path(__file__).parent
        ocr_input = pkg_dir / "data/OpenChromatinRegion/7specificTissue.all.OC.bed"
    bedgz_files = list(Path(bedgz_path).glob("*.bed.gz"))
    output = Path(output)
    # Fan out one worker per input file. The worker is the module-level
    # _ocf_worker: a nested function here would raise a PicklingError on
    # submit(). (The original duplicate output.mkdir call was removed.)
    with ProcessPoolExecutor(max_workers=threads) as executor:
        futures = {executor.submit(_ocf_worker, bedgz_file, ocr_input, output): bedgz_file for bedgz_file in bedgz_files}
        for future in as_completed(futures):
            bedgz_file = futures[future]
            try:
                result = future.result()
                logger.info(f"OCF calculated: {result}")
            except Exception as exc:
                logger.error(f"OCF calculation failed for {bedgz_file}: {exc}")
    logger.info(f"OCF features calculated for {len(bedgz_files)} files.")
|
krewlyzer/uxm.py
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
import typer
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Optional
|
|
4
|
+
import logging
|
|
5
|
+
import pysam
|
|
6
|
+
import pybedtools
|
|
7
|
+
import numpy as np
|
|
8
|
+
from rich.console import Console
|
|
9
|
+
from rich.logging import RichHandler
|
|
10
|
+
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
11
|
+
import os
|
|
12
|
+
|
|
13
|
+
console = Console()
|
|
14
|
+
logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
|
|
15
|
+
logger = logging.getLogger("uxm")
|
|
16
|
+
|
|
17
|
+
def calc_uxm(
    bam_file: Path,
    mark_file: Path,
    output_file: Path,
    map_quality: int,
    min_cpg: int,
    methy_threshold: float,
    unmethy_threshold: float,
    pe_type: str = "PE"
):
    """
    Calculate UXM fragment-level methylation for a single BAM and marker file.
    Output is a .UXM.tsv file with region, U, X, M proportions.

    Fragments are classified by their CpG methylation ratio, read from the
    Bismark-style ``XM`` tag ('Z' = methylated CpG, 'z' = unmethylated CpG):
    M (ratio >= methy_threshold), U (ratio <= unmethy_threshold), X (between).
    Fragments with fewer than ``min_cpg`` CpG calls are skipped.

    Args:
        bam_file: Input BAM; sorted and indexed in place if no .bai exists.
        mark_file: BED file of marker regions to evaluate.
        output_file: Destination .UXM.tsv (header: region, U, X, M).
        map_quality: Minimum mapping quality for a read (pair) to be counted.
        min_cpg: Minimum number of CpG calls (Z + z) per fragment.
        methy_threshold: Ratio at/above which a fragment counts as M.
        unmethy_threshold: Ratio at/below which a fragment counts as U.
        pe_type: "PE" for paired-end (overlap-aware) or "SE" for single-end.

    Raises:
        typer.Exit: On any unrecoverable error (logged first).
    """
    try:
        bai = str(bam_file) + ".bai"
        if not os.path.exists(bai):
            # No index present: sort the BAM in place, then index it.
            pysam.sort("-o", str(bam_file), str(bam_file))
            pysam.index(str(bam_file))
            logger.warning(f"Index file {bai} did not exist. Sorted and indexed BAM.")
        input_file = pysam.AlignmentFile(str(bam_file))
        marks = pybedtools.BedTool(str(mark_file))
        res = []
        for mark in marks:
            region = f"{mark.chrom}:{mark.start}-{mark.end}"
            try:
                # Probe the region first; a ValueError means the contig is not
                # in the BAM header/index, so report zeros for this region.
                input_file.fetch(mark.chrom, mark.start, mark.end)
            except ValueError:
                res.append(f"{region}\t0\t0\t0")
                continue
            Ufragment = 0
            Xfragment = 0
            Mfragment = 0
            if pe_type == "PE":
                from krewlyzer.helpers import read_pair_generator
                region_string = f"{mark.chrom}:{mark.start}-{mark.end}"
                for read1, read2 in read_pair_generator(input_file, region_string):
                    if read1 is None or read2 is None:
                        continue
                    if read1.mapping_quality < map_quality or read2.mapping_quality < map_quality:
                        continue
                    try:
                        # Per-base methylation call strings for both mates.
                        m1 = read1.get_tag("XM")
                        m2 = read2.get_tag("XM")
                    except KeyError:
                        continue
                    read1Start = read1.reference_start
                    read1End = read1.reference_end
                    read2Start = read2.reference_start
                    read2End = read2.reference_end
                    # cfDNAFE logic for overlap: when the mates overlap, trim
                    # the overlapping prefix of one mate's XM string so shared
                    # CpGs are not double-counted.
                    # NOTE(review): the trim offset is computed in reference
                    # coordinates but applied to the XM (query) string, which
                    # assumes no indels within the overlap — confirm upstream.
                    if not read1.is_reverse:  # read1 is forward, read2 is reverse
                        if read2Start < read1End:
                            overlap = read1End - read2Start
                            num_methylated = m1.count("Z") + m2[overlap:].count("Z")
                            num_unmethylated = m1.count("z") + m2[overlap:].count("z")
                        else:
                            num_methylated = m1.count("Z") + m2.count("Z")
                            num_unmethylated = m1.count("z") + m2.count("z")
                    else:  # read1 is reverse, read2 is forward
                        if read1Start < read2End:
                            overlap = read2End - read1Start
                            num_methylated = m2.count("Z") + m1[overlap:].count("Z")
                            num_unmethylated = m2.count("z") + m1[overlap:].count("z")
                        else:
                            num_methylated = m1.count("Z") + m2.count("Z")
                            num_unmethylated = m1.count("z") + m2.count("z")
                    if num_methylated + num_unmethylated < min_cpg:
                        continue
                    ratio = num_methylated / (num_methylated + num_unmethylated)
                    if ratio >= methy_threshold:
                        Mfragment += 1
                    elif ratio <= unmethy_threshold:
                        Ufragment += 1
                    else:
                        Xfragment += 1
            elif pe_type == "SE":
                # Single-end: classify each read independently.
                for read in input_file.fetch(mark.chrom, mark.start, mark.end):
                    if read.mapping_quality < map_quality:
                        continue
                    try:
                        m = read.get_tag("XM")
                    except KeyError:
                        continue
                    num_methylated = m.count("Z")
                    num_unmethylated = m.count("z")
                    if num_methylated + num_unmethylated < min_cpg:
                        continue
                    ratio = num_methylated / (num_methylated + num_unmethylated)
                    if ratio >= methy_threshold:
                        Mfragment += 1
                    elif ratio <= unmethy_threshold:
                        Ufragment += 1
                    else:
                        Xfragment += 1
            else:
                logger.error("type must be SE or PE")
                raise typer.Exit(1)
            total = Mfragment + Ufragment + Xfragment
            if total == 0:
                res.append(f"{region}\t0\t0\t0")
            else:
                # Proportions in fixed output order: U, X, M.
                tmp_array = np.zeros(3)
                tmp_array[0] = Ufragment / total
                tmp_array[1] = Xfragment / total
                tmp_array[2] = Mfragment / total
                res.append(f"{region}\t" + "\t".join(map(str, tmp_array)))
        with open(output_file, 'w') as f:
            f.write('region\tU\tX\tM\n')
            for i in res:
                f.write(i + '\n')
        logger.info(f"UXM calculation complete for {bam_file}. Results in {output_file}.")
    except Exception as e:
        logger.error(f"Fatal error in calc_uxm: {e}")
        raise typer.Exit(1)
|
|
132
|
+
|
|
133
|
+
def _uxm_worker(bam_file, mark_input, output, map_quality, min_cpg,
                methy_threshold, unmethy_threshold, pe_type):
    """Run calc_uxm for one BAM file; return the output file path.

    Defined at module level (rather than as a closure inside ``uxm``) so it
    can be pickled and shipped to ProcessPoolExecutor worker processes —
    nested functions are not picklable and would make every submit() fail.
    """
    sample_prefix = bam_file.stem.replace('.bam', '')
    output_file = output / f"{sample_prefix}.UXM.tsv"
    calc_uxm(
        bam_file,
        mark_input,
        output_file,
        map_quality,
        min_cpg,
        methy_threshold,
        unmethy_threshold,
        pe_type
    )
    return str(output_file)


def uxm(
    bam_path: Path = typer.Argument(..., help="Folder containing .bam files for UXM calculation."),
    mark_input: Optional[Path] = typer.Option(None, "--mark-input", "-m", help="Marker BED file (default: packaged atlas)", show_default=False),
    output: Path = typer.Option(..., "--output", "-o", help="Output folder for results"),
    map_quality: int = typer.Option(30, "--map-quality", "-q", help="Minimum mapping quality"),
    min_cpg: int = typer.Option(4, "--min-cpg", "-c", help="Minimum CpG count per fragment"),
    methy_threshold: float = typer.Option(0.75, "--methy-threshold", "-tM", help="Methylation threshold for M fragments"),
    unmethy_threshold: float = typer.Option(0.25, "--unmethy-threshold", "-tU", help="Unmethylation threshold for U fragments"),
    pe_type: str = typer.Option("SE", "--type", help="Fragment type: SE or PE (default: SE)"),
    threads: int = typer.Option(1, "--threads", "-t", help="Number of parallel processes (default: 1)")
):
    """
    Calculate fragment-level methylation (UXM) features for all BAM files in a folder.

    One <sample>.UXM.tsv is written per input BAM into ``output``.
    """
    # Input checks
    if not bam_path.exists():
        logger.error(f"Input BAM directory not found: {bam_path}")
        raise typer.Exit(1)
    if mark_input and not mark_input.exists():
        logger.error(f"Marker BED file not found: {mark_input}")
        raise typer.Exit(1)
    try:
        output.mkdir(parents=True, exist_ok=True)
    except Exception as e:
        logger.error(f"Could not create output directory {output}: {e}")
        raise typer.Exit(1)
    # Fall back to the packaged methylation atlas when no marker file is given.
    if mark_input is None:
        pkg_dir = Path(__file__).parent
        mark_input = pkg_dir / "data/MethMark/Atlas.U25.l4.hg19.bed"
    bam_files = list(Path(bam_path).glob("*.bam"))
    output = Path(output)
    # Fan out one worker per BAM. The worker is the module-level _uxm_worker:
    # a nested function here would raise a PicklingError on submit().
    # (The original duplicate output.mkdir call was removed.)
    with ProcessPoolExecutor(max_workers=threads) as executor:
        futures = {
            executor.submit(
                _uxm_worker, bam_file, mark_input, output,
                map_quality, min_cpg, methy_threshold, unmethy_threshold, pe_type
            ): bam_file
            for bam_file in bam_files
        }
        for future in as_completed(futures):
            bam_file = futures[future]
            try:
                result = future.result()
                logger.info(f"UXM calculated: {result}")
            except Exception as exc:
                logger.error(f"UXM calculation failed for {bam_file}: {exc}")
    logger.info(f"UXM features calculated for {len(bam_files)} files.")
|
krewlyzer/wps.py
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
import typer
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
import logging
|
|
4
|
+
import pysam
|
|
5
|
+
import pybedtools
|
|
6
|
+
import numpy as np
|
|
7
|
+
from collections import defaultdict
|
|
8
|
+
import gzip
|
|
9
|
+
from rich.console import Console
|
|
10
|
+
from rich.logging import RichHandler
|
|
11
|
+
|
|
12
|
+
from .helpers import max_core, commonError
|
|
13
|
+
|
|
14
|
+
console = Console()
|
|
15
|
+
logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
|
|
16
|
+
logger = logging.getLogger("wps")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _calc_wps(bedgz_input, tsv_input, output_file_pattern, empty=False, protect_input=120, min_size=120, max_size=180):
    """
    Calculate Windowed Protection Score (WPS) for a single .bed.gz file and transcript region file.
    Output is gzipped TSV per region.

    For each region line in ``tsv_input`` (columns: id, chrom, start, end,
    strand), fragments with length in [min_size, max_size] overlapping the
    region (padded by half the protection window) are collected. At each
    position, WPS = (#fragments fully spanning the window) minus (#fragments
    with an endpoint inside it). Output columns per position:
    chrom, pos, coverage, endpoint-count, WPS.

    Args:
        bedgz_input: Tabix-indexed fragment BED (.bed.gz) path.
        tsv_input: Region file path (id, chrom, start, end, strand per line).
        output_file_pattern: Path pattern containing ``%s`` for the region id.
        empty: Keep output files for regions with zero coverage.
        protect_input: Full protection-window width (halved internally).
        min_size: Minimum fragment length to include.
        max_size: Maximum fragment length to include.

    Raises:
        typer.Exit: On any unrecoverable error (logged first).
    """
    try:
        bedgzfile = str(bedgz_input)
        tbx = pysam.TabixFile(bedgzfile)
        protection = protect_input // 2
        with open(tsv_input, 'r') as infile:
            # Fragment BED contigs carry a "chr" prefix; region chroms may not.
            prefix = "chr"
            # Autosomes 1-22 plus X; any other contig is skipped.
            valid_chroms = set(map(str, list(range(1, 23)) + ["X"]))
            logger.info(f"input file: {bedgz_input}, {tsv_input}")
            for line in infile:
                if not line.strip():
                    continue
                parts = line.split()
                if len(parts) < 5:
                    continue
                cid, chrom, start, end, strand = parts[:5]
                chrom = chrom.replace("chr", "")
                if chrom not in valid_chroms:
                    continue
                region_start, region_end = int(float(start)), int(float(end))
                if region_start < 1:
                    continue
                # pos_range[pos] = [coverage count, fragment-endpoint count]
                pos_range = defaultdict(lambda: [0, 0])
                try:
                    from bx.intervals.intersection import Intersecter, Interval
                    filtered_reads = Intersecter()
                    for row in tbx.fetch(prefix + chrom, region_start - protection, region_end + protection):
                        tmp_row = row.split()
                        rstart = int(tmp_row[1])
                        rend = int(tmp_row[2])
                        lseq = rend - rstart
                        # Keep only fragments within the requested size band.
                        if lseq < min_size or lseq > max_size:
                            continue
                        filtered_reads.add_interval(Interval(rstart, rend))
                        for i in range(rstart, rend):
                            if region_start <= i <= region_end:
                                pos_range[i][0] += 1
                        if region_start <= rstart <= region_end:
                            pos_range[rstart][1] += 1
                        if region_start <= rend <= region_end:
                            pos_range[rend][1] += 1
                except Exception as e:
                    logger.error(f"Error fetching region {chrom}:{region_start}-{region_end}: {e}")
                    continue
                filename = output_file_pattern % cid
                with gzip.open(filename, 'wt') as outfile:
                    cov_sites = 0
                    out_lines = []
                    for pos in range(region_start, region_end + 1):
                        rstart, rend = pos - protection, pos + protection
                        gcount, bcount = 0, 0
                        # Fragments spanning the whole window protect it (good);
                        # fragments with an endpoint inside count against it (bad).
                        for read in filtered_reads.find(rstart, rend):
                            if (read.start > rstart) or (read.end < rend):
                                bcount += 1
                            else:
                                gcount += 1
                        cov_count, start_count = pos_range[pos]
                        cov_sites += cov_count
                        out_lines.append(f"{chrom}\t{pos}\t{cov_count}\t{start_count}\t{gcount - bcount}\n")
                    # Minus-strand regions are written reversed so positions
                    # run in transcript orientation.
                    if strand == "-":
                        out_lines = out_lines[::-1]
                    for line in out_lines:
                        outfile.write(line)
                # Drop zero-coverage outputs unless the caller asked to keep them.
                if cov_sites == 0 and not empty:
                    import os
                    os.remove(filename)
        logger.info(f"WPS calculation complete. Results written to pattern: {output_file_pattern}")
    except Exception as e:
        logger.error(f"Fatal error in _calc_wps: {e}")
        raise typer.Exit(1)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _wps_worker(bedgz_file, tsv_input, output, empty, protect_input, min_size, max_size):
    """Run _calc_wps for one .bed.gz file; return a traceback string on failure.

    Defined at module level (rather than as a closure inside ``wps``) so it
    can be pickled and shipped to ProcessPoolExecutor worker processes —
    nested functions are not picklable and would make every submit() fail.
    """
    import traceback
    try:
        output_file_pattern = str(output / (bedgz_file.stem.replace('.bed', '') + ".%s.WPS.tsv.gz"))
        _calc_wps(
            bedgz_input=str(bedgz_file),
            tsv_input=str(tsv_input),
            output_file_pattern=output_file_pattern,
            empty=empty,
            protect_input=protect_input,
            min_size=min_size,
            max_size=max_size
        )
        return None
    except Exception:
        return traceback.format_exc()


def wps(
    bedgz_path: Path = typer.Argument(..., help="Folder containing .bed.gz files (should be the output directory from motif.py)"),
    tsv_input: Path = typer.Option(None, "--tsv-input", "-t", help="Path to transcript/region file (TSV format)"),
    output: Path = typer.Option(..., "--output", "-o", help="Output folder for results"),
    wpstype: str = typer.Option('L', "--wpstype", "-w", help="WPS type: 'L' for long (default), 'S' for short"),
    empty: bool = typer.Option(False, "--empty", help="Keep files of empty blocks (default: False)"),
    threads: int = typer.Option(1, "--threads", "-p", help="Number of threads (default: 1)")
):
    """
    Calculate Windowed Protection Score (WPS) features for all .bed.gz files in a folder.
    """
    # Input checks
    if not bedgz_path.exists():
        logger.error(f"Input directory not found: {bedgz_path}")
        raise typer.Exit(1)
    if tsv_input and not tsv_input.exists():
        logger.error(f"Transcript region file not found: {tsv_input}")
        raise typer.Exit(1)
    try:
        output.mkdir(parents=True, exist_ok=True)
    except Exception as e:
        logger.error(f"Could not create output directory {output}: {e}")
        raise typer.Exit(1)
    # NOTE: the original `output.touch()` "writability check" was removed —
    # Path.touch() on an existing directory is effectively a no-op (os.utime),
    # so it verified nothing.
    try:
        bedgz_files = list(Path(bedgz_path).glob("*.bed.gz"))
        if not bedgz_files:
            logger.error("No .bed.gz files found in the specified folder.")
            raise typer.Exit(1)
        if tsv_input is None:
            # Default to the packaged transcriptAnno-hg19-1kb.tsv. Use the
            # package directory (Path(__file__).parent), consistent with the
            # default-data lookups in ocf.py and uxm.py; the previous
            # `parent.parent` resolved outside the installed package.
            tsv_input = Path(__file__).parent / "data" / "TranscriptAnno" / "transcriptAnno-hg19-1kb.tsv"
            logger.info(f"No tsv_input specified. Using default: {tsv_input}")
        if not tsv_input.exists():
            logger.error(f"Transcript/region file does not exist: {tsv_input}")
            raise typer.Exit(1)
        # Window parameters: 'L' targets mono-nucleosome-sized fragments;
        # anything else uses the short-fragment settings.
        if wpstype == 'L':
            protect_input = 120
            min_size = 120
            max_size = 180
        else:
            protect_input = 16
            min_size = 35
            max_size = 80
        from concurrent.futures import ProcessPoolExecutor, as_completed
        n_procs = max_core(threads) if threads else 1
        logger.info(f"Calculating WPS for {len(bedgz_files)} files using {n_procs} processes...")
        # Submit the module-level worker: a nested task function here would
        # raise a PicklingError for every submitted job.
        with ProcessPoolExecutor(max_workers=n_procs) as executor:
            futures = {
                executor.submit(
                    _wps_worker, bedgz_file, tsv_input, output,
                    empty, protect_input, min_size, max_size
                ): bedgz_file
                for bedgz_file in bedgz_files
            }
            for future in as_completed(futures):
                exc = future.result()
                if exc:
                    logger.error(f"WPS calculation failed for {futures[future]}:\n{exc}")
        logger.info(f"WPS features calculated for {len(bedgz_files)} files.")
    except Exception as e:
        logger.error(f"Fatal error in wps CLI: {e}")
        raise typer.Exit(1)
|
krewlyzer/wrapper.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
import typer
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
import logging
|
|
4
|
+
from rich.console import Console
|
|
5
|
+
from rich.logging import RichHandler
|
|
6
|
+
from .motif import motif
|
|
7
|
+
from .fsc import fsc
|
|
8
|
+
from .fsr import fsr
|
|
9
|
+
from .fsd import fsd
|
|
10
|
+
from .wps import wps
|
|
11
|
+
from .ocf import ocf
|
|
12
|
+
from .uxm import uxm
|
|
13
|
+
|
|
14
|
+
console = Console()
|
|
15
|
+
logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
|
|
16
|
+
logger = logging.getLogger("krewlyzer-wrapper")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def run_all(
    bam_file: Path = typer.Argument(..., help="Input BAM file (sorted, indexed)"),
    reference: Path = typer.Option(..., "--reference", "-g", help="Reference genome FASTA file for motif extraction"),
    output: Path = typer.Option(..., "--output", "-o", help="Output directory for all results"),
    threads: int = typer.Option(1, "--threads", "-t", help="Number of parallel processes for each step"),
    pe_type: str = typer.Option("SE", "--type", help="Fragment type for UXM: SE or PE (default: SE)")
):
    """
    Run all feature extraction commands (motif, fsc, fsr, fsd, wps, ocf, uxm) for a single BAM file.

    Steps run sequentially; the pipeline aborts (exit code 1) on the first
    failing step. Each step writes into its own sub-directory of ``output``.
    """
    # Validate inputs before doing any work.
    if not bam_file.exists() or not bam_file.is_file():
        logger.error(f"Input BAM file not found: {bam_file}")
        raise typer.Exit(1)
    if not reference.exists() or not reference.is_file():
        logger.error(f"Reference FASTA file not found: {reference}")
        raise typer.Exit(1)
    try:
        output.mkdir(parents=True, exist_ok=True)
    except Exception as e:
        logger.error(f"Could not create output directory {output}: {e}")
        raise typer.Exit(1)
    # Motif extraction runs first; the downstream fragmentomics steps all read
    # its .bed.gz output from this directory.
    motif_output = output / "motif"
    pipeline = [
        ("Motif extraction", motif, dict(
            bam_path=bam_file,
            reference=reference,
            output=motif_output,
            minlen=65,
            maxlen=400,
            k=3,
            verbose=True,
            threads=threads,
        )),
        ("FSC calculation", fsc, dict(bedgz_path=motif_output, output=output / "fsc", threads=threads)),
        ("FSR calculation", fsr, dict(bedgz_path=motif_output, output=output / "fsr", threads=threads)),
        ("FSD calculation", fsd, dict(bedgz_path=motif_output, output=output / "fsd", arms_file=None, threads=threads)),
        ("WPS calculation", wps, dict(bedgz_path=motif_output, output=output / "wps", threads=threads)),
        ("OCF calculation", ocf, dict(bedgz_path=motif_output, output=output / "ocf", threads=threads)),
        # UXM reads BAMs directly (not motif output): it scans the folder
        # containing the input BAM file.
        ("UXM calculation", uxm, dict(bam_path=bam_file.parent, output=output / "uxm", pe_type=pe_type, threads=threads)),
    ]
    for step_name, step_fn, step_kwargs in pipeline:
        try:
            step_fn(**step_kwargs)
        except Exception as e:
            logger.error(f"{step_name} failed: {e}")
            raise typer.Exit(1)
    logger.info(f"All feature extraction complete. Results saved to {output}")
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: krewlyzer
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Feature extraction tools for circulating tumor DNA from GRCh37 aligned BAM files
|
|
5
|
+
Author-email: Ronak Shah <shahr2@mskcc.org>
|
|
6
|
+
Project-URL: Homepage, https://github.com/msk-access/krewlyzer
|
|
7
|
+
Requires-Python: >=3.8
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Requires-Dist: typer>=0.12.3
|
|
10
|
+
Requires-Dist: pysam>=0.20.0
|
|
11
|
+
Requires-Dist: pandas>=2.0.0
|
|
12
|
+
Requires-Dist: biopython>=1.81
|
|
13
|
+
Requires-Dist: pybedtools>=0.9.0
|
|
14
|
+
Requires-Dist: scikit-misc>=0.1.4
|
|
15
|
+
Dynamic: license-file
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
krewlyzer/__init__.py,sha256=qpkv4AhtoRCJsN5NGxl1bc5RJo8_l-EduGL1emFww0Y,67
|
|
2
|
+
krewlyzer/cli.py,sha256=MuMUiV79mcotsf73DRDtW3C0wwGhE3jcVvpfjJ3mtA0,1232
|
|
3
|
+
krewlyzer/fsc.py,sha256=NfRiRFxZVVnbGMyWfrYIucz9Em4EhNnWigNdcsekPZk,21688
|
|
4
|
+
krewlyzer/fsd.py,sha256=PVs_aTiitVSHtiEcmbi5ocFe0Xj8g7xmIkcgmC-vXEg,6353
|
|
5
|
+
krewlyzer/fsr.py,sha256=JZg2D3AYCK2Y26SZ7mWo6U-i4zyGrKOolg5m6_DKLvw,8018
|
|
6
|
+
krewlyzer/helpers.py,sha256=-FDOMIh8ANs6lC60vTq_tLrjU4OUdeuRwu1oVpBwyP8,5887
|
|
7
|
+
krewlyzer/motif.py,sha256=yg0RQ2cWSmQyaEqa5VX38DECQ1sWubdi6kDose7q7bg,12490
|
|
8
|
+
krewlyzer/ocf.py,sha256=4XMKjpRE1LlepFzXJcsOIAW3oP9MxQe45PAn3qpdVeA,5816
|
|
9
|
+
krewlyzer/uxm.py,sha256=7ktSFAhn7pqFYwG3UmKP49Sge9Hie0HlP-70X4dcQpc,8483
|
|
10
|
+
krewlyzer/wps.py,sha256=-XDVCbHYyz1uGZUNTkSgBmae8qbK6GZJFzyWFMvjQ2o,7982
|
|
11
|
+
krewlyzer/wrapper.py,sha256=Gpfo-lGY_2hwpcwXN5exVjvhoec8X4wtUrF3YjgHj1A,3833
|
|
12
|
+
krewlyzer-0.1.0.dist-info/licenses/LICENSE,sha256=DuJF49YfFt6g7la7cekWI06XA7ImodNiTEOrEBsOkpk,32365
|
|
13
|
+
krewlyzer-0.1.0.dist-info/METADATA,sha256=RLBb0adv1Ob3XPlfbgpVQnVnS_-srP3RRKtaIy3sDr4,502
|
|
14
|
+
krewlyzer-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
15
|
+
krewlyzer-0.1.0.dist-info/entry_points.txt,sha256=x9Wngsqelv0MxiUvgDDx3hVNR-MhDbASxwKpNeBiX8I,48
|
|
16
|
+
krewlyzer-0.1.0.dist-info/top_level.txt,sha256=bFO6hK-X3pxPGZ7ewoOVx80u9p_Npyy9NW8XophctdY,10
|
|
17
|
+
krewlyzer-0.1.0.dist-info/RECORD,,
|