krewlyzer 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
krewlyzer/ocf.py ADDED
@@ -0,0 +1,133 @@
1
+ import typer
2
+ from pathlib import Path
3
+ from typing import Optional
4
+ import logging
5
+ import pysam
6
+ import pandas as pd
7
+ from collections import defaultdict
8
+ from functools import partial
9
+ from rich.console import Console
10
+ from rich.logging import RichHandler
11
+ from concurrent.futures import ProcessPoolExecutor, as_completed
12
+ import os
13
+
14
+ console = Console()
15
+ logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
16
+ logger = logging.getLogger("ocf")
17
+
18
+
19
def calc_ocf(bedgz_file: Path, ocr_file: Path, output_dir: Path):
    """
    Calculate OCF for a single .bed.gz file and OCR region file.

    Fragment start ("left") and end ("right") positions are tallied per
    region label, relative to each region's start coordinate. For every
    label that received at least one counted end, a ``<label>.sync.end``
    table (2000 rows: offset, raw and normalised left/right counts) is
    written to ``output_dir``; the per-label OCF values are collected in
    ``all.ocf.csv`` (tab-separated, columns ``tissue`` and ``OCF``).

    Args:
        bedgz_file: Fragment file opened with ``pysam.TabixFile``; fields 2
            and 3 of each record are read as fragment start and end.
        ocr_file: Headerless tab-separated file read as
            chr, start, end, description.
        output_dir: Directory that receives the output files. It is not
            created here.

    Raises:
        typer.Exit: With code 1, after logging, if any step fails.
    """
    num_positions = 2000  # rows written per label (indices 0..1999)
    center = 1000         # index reported as offset 0 in the first column
    peak = 60             # centre of the two scoring windows (+/- peak)
    half_bin = 10         # half-width of each scoring window
    try:
        left_pos = defaultdict(partial(defaultdict, int))
        right_pos = defaultdict(partial(defaultdict, int))
        total = defaultdict(lambda: [0, 0])  # label -> [left ends, right ends]

        tbx = pysam.TabixFile(str(bedgz_file))
        try:
            # NOTE: `pd` must stay the module-level import. A function-local
            # `import pandas as pd` further down would make `pd` a local name
            # for the whole function and break this call with
            # UnboundLocalError.
            regions = pd.read_csv(
                ocr_file, sep="\t", header=None,
                names=["chr", "start", "end", "description"])
            for _, region in regions.iterrows():
                region_chr, region_start, region_end, region_label = (
                    region["chr"], region["start"], region["end"], region["description"])
                try:
                    fetched_reads = tbx.fetch(region_chr, region_start, region_end)
                except ValueError:
                    # Region cannot be fetched from this file; skip it.
                    continue
                for row in fetched_reads:
                    fields = row.split()
                    rstart = int(fields[1])
                    rend = int(fields[2])
                    # Only ends that fall inside the region are counted.
                    if rstart >= region_start:
                        left_pos[region_label][rstart - region_start] += 1
                        total[region_label][0] += 1
                    if rend <= region_end:
                        right_pos[region_label][rend - region_start + 1] += 1
                        total[region_label][1] += 1
        finally:
            tbx.close()

        labels = []
        ocf_values = []
        for label, (n_left, n_right) in total.items():
            left_counts = left_pos[label]
            right_counts = right_pos[label]
            # Normalise to counts per 10,000 ends; fall back to 1 so a label
            # with no ends on one side does not divide by zero.
            ts = n_left / 10000 if n_left else 1
            te = n_right / 10000 if n_right else 1
            trueends = 0
            background = 0
            with open(output_dir / f'{label}.sync.end', 'w') as sync_out:
                for k in range(num_positions):
                    loc = k - center
                    left = left_counts[k]
                    right = right_counts[k]
                    left_norm = left / ts
                    right_norm = right / te
                    sync_out.write(
                        f"{loc}\t{left}\t{left_norm}\t{right}\t{right_norm}\n")
                    # OCF is accumulated here rather than by re-reading the
                    # file; the values are the same floats that were written.
                    if -peak - half_bin <= loc <= -peak + half_bin:
                        trueends += right_norm
                        background += left_norm
                    elif peak - half_bin <= loc <= peak + half_bin:
                        trueends += left_norm
                        background += right_norm
            labels.append(label)
            ocf_values.append(trueends - background)

        ocf_df = pd.DataFrame({"tissue": labels, "OCF": ocf_values})
        ocf_df.to_csv(output_dir / 'all.ocf.csv', sep="\t", index=False)
        logger.info(f"OCF calculation complete for {bedgz_file}. Results in {output_dir}.")
    except Exception as e:
        logger.error(f"Fatal error in calc_ocf: {e}")
        raise typer.Exit(1)
89
+
90
+
91
def _run_ocf_file(bedgz_file: Path, ocr_input: Path, output: Path) -> str:
    """Worker: run calc_ocf for one file into its own sub-directory of `output`.

    Defined at module level because ProcessPoolExecutor pickles the callable
    it is given, and functions nested inside another function cannot be
    pickled.
    """
    sample_dir = output / bedgz_file.stem.replace('.bed', '')
    sample_dir.mkdir(exist_ok=True)
    calc_ocf(bedgz_file, ocr_input, sample_dir)
    return str(sample_dir)


def ocf(
    bedgz_path: Path = typer.Argument(..., help="Folder containing .bed.gz files (should be the output directory from motif.py)"),
    ocr_input: Optional[Path] = typer.Option(None, "--ocr-input", "-r", help="Path to open chromatin region BED file (default: packaged tissue file)"),
    output: Path = typer.Option(..., "--output", "-o", help="Output folder for results"),
    threads: int = typer.Option(1, "--threads", "-t", help="Number of parallel processes (default: 1)")
):
    """
    Calculate orientation-aware cfDNA fragmentation (OCF) features for all .bed.gz files in a folder.
    """
    # Input checks
    if not bedgz_path.exists():
        logger.error(f"Input directory not found: {bedgz_path}")
        raise typer.Exit(1)
    if ocr_input is not None and not ocr_input.exists():
        logger.error(f"OCR region BED file not found: {ocr_input}")
        raise typer.Exit(1)
    try:
        output.mkdir(parents=True, exist_ok=True)
    except Exception as e:
        logger.error(f"Could not create output directory {output}: {e}")
        raise typer.Exit(1)
    # Set default OCR file if not provided
    if ocr_input is None:
        pkg_dir = Path(__file__).parent
        ocr_input = pkg_dir / "data/OpenChromatinRegion/7specificTissue.all.OC.bed"

    bedgz_files = list(bedgz_path.glob("*.bed.gz"))
    # One task per input file; a failure is logged for that file and does not
    # stop the remaining ones.
    with ProcessPoolExecutor(max_workers=threads) as executor:
        futures = {
            executor.submit(_run_ocf_file, bedgz_file, ocr_input, output): bedgz_file
            for bedgz_file in bedgz_files
        }
        for future in as_completed(futures):
            bedgz_file = futures[future]
            try:
                result = future.result()
                logger.info(f"OCF calculated: {result}")
            except Exception as exc:
                logger.error(f"OCF calculation failed for {bedgz_file}: {exc}")
    logger.info(f"OCF features calculated for {len(bedgz_files)} files.")
krewlyzer/uxm.py ADDED
@@ -0,0 +1,188 @@
1
+ import typer
2
+ from pathlib import Path
3
+ from typing import Optional
4
+ import logging
5
+ import pysam
6
+ import pybedtools
7
+ import numpy as np
8
+ from rich.console import Console
9
+ from rich.logging import RichHandler
10
+ from concurrent.futures import ProcessPoolExecutor, as_completed
11
+ import os
12
+
13
+ console = Console()
14
+ logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
15
+ logger = logging.getLogger("uxm")
16
+
17
def _classify_fragment(num_methylated, num_unmethylated, min_cpg, methy_threshold, unmethy_threshold):
    """Return 'M', 'U' or 'X' for a fragment, or None if it has too few CpG calls."""
    covered = num_methylated + num_unmethylated
    # covered == 0 would divide by zero below (reachable when min_cpg <= 0).
    if covered == 0 or covered < min_cpg:
        return None
    ratio = num_methylated / covered
    if ratio >= methy_threshold:
        return "M"
    if ratio <= unmethy_threshold:
        return "U"
    return "X"


def _paired_cpg_counts(read1, read2, m1, m2):
    """Return (methylated, unmethylated) 'Z'/'z' counts for a read pair.

    `m1` / `m2` are the XM tag strings of the two reads. When the mates
    overlap on the reference, the first `overlap` characters of the
    downstream mate's XM string are skipped (logic kept from cfDNAFE).
    """
    if not read1.is_reverse:  # read1 is forward, read2 is reverse
        if read2.reference_start < read1.reference_end:
            overlap = read1.reference_end - read2.reference_start
            tail = m2[overlap:]
            return m1.count("Z") + tail.count("Z"), m1.count("z") + tail.count("z")
    else:  # read1 is reverse, read2 is forward
        if read1.reference_start < read2.reference_end:
            overlap = read2.reference_end - read1.reference_start
            tail = m1[overlap:]
            return m2.count("Z") + tail.count("Z"), m2.count("z") + tail.count("z")
    # No overlap: every call on both mates counts.
    return m1.count("Z") + m2.count("Z"), m1.count("z") + m2.count("z")


def calc_uxm(
    bam_file: Path,
    mark_file: Path,
    output_file: Path,
    map_quality: int,
    min_cpg: int,
    methy_threshold: float,
    unmethy_threshold: float,
    pe_type: str = "PE"
):
    """
    Calculate UXM fragment-level methylation for a single BAM and marker file.
    Output is a .UXM.tsv file with region, U, X, M proportions.

    Args:
        bam_file: BAM file. If ``<bam_file>.bai`` is missing, the BAM is
            sorted onto itself and indexed first.
        mark_file: Marker BED file, read with pybedtools.
        output_file: Destination TSV (header ``region, U, X, M``).
        map_quality: Reads or pairs below this mapping quality are skipped.
        min_cpg: Minimum number of 'Z' + 'z' calls for a fragment to count.
        methy_threshold: Ratio at or above which a fragment is 'M'.
        unmethy_threshold: Ratio at or below which a fragment is 'U'.
        pe_type: "PE" (pairs via ``read_pair_generator``) or "SE".

    Raises:
        typer.Exit: With code 1, after logging, on any failure, including an
            invalid ``pe_type``.
    """
    try:
        # Validate once up front, before any file is touched.
        if pe_type not in ("PE", "SE"):
            logger.error("type must be SE or PE")
            raise typer.Exit(1)

        bai = str(bam_file) + ".bai"
        if not os.path.exists(bai):
            # NOTE(review): this rewrites the input BAM in place, and only
            # `<bam>.bai` is recognised as an existing index. Confirm that is
            # intended.
            pysam.sort("-o", str(bam_file), str(bam_file))
            pysam.index(str(bam_file))
            logger.warning(f"Index file {bai} did not exist. Sorted and indexed BAM.")

        if pe_type == "PE":
            from krewlyzer.helpers import read_pair_generator

        res = []
        input_file = pysam.AlignmentFile(str(bam_file))
        try:
            marks = pybedtools.BedTool(str(mark_file))
            for mark in marks:
                region = f"{mark.chrom}:{mark.start}-{mark.end}"
                try:
                    # Probe only: regions the BAM cannot serve get a zero row.
                    input_file.fetch(mark.chrom, mark.start, mark.end)
                except ValueError:
                    res.append(f"{region}\t0\t0\t0")
                    continue

                counts = {"U": 0, "X": 0, "M": 0}
                if pe_type == "PE":
                    for read1, read2 in read_pair_generator(input_file, region):
                        if read1 is None or read2 is None:
                            continue
                        if read1.mapping_quality < map_quality or read2.mapping_quality < map_quality:
                            continue
                        try:
                            m1 = read1.get_tag("XM")
                            m2 = read2.get_tag("XM")
                        except KeyError:
                            continue
                        num_methylated, num_unmethylated = _paired_cpg_counts(read1, read2, m1, m2)
                        label = _classify_fragment(
                            num_methylated, num_unmethylated,
                            min_cpg, methy_threshold, unmethy_threshold)
                        if label is not None:
                            counts[label] += 1
                else:  # "SE"
                    for read in input_file.fetch(mark.chrom, mark.start, mark.end):
                        if read.mapping_quality < map_quality:
                            continue
                        try:
                            m = read.get_tag("XM")
                        except KeyError:
                            continue
                        label = _classify_fragment(
                            m.count("Z"), m.count("z"),
                            min_cpg, methy_threshold, unmethy_threshold)
                        if label is not None:
                            counts[label] += 1

                total = counts["M"] + counts["U"] + counts["X"]
                if total == 0:
                    res.append(f"{region}\t0\t0\t0")
                else:
                    # A numpy array is kept so the numbers are formatted
                    # exactly as before.
                    tmp_array = np.zeros(3)
                    tmp_array[0] = counts["U"] / total
                    tmp_array[1] = counts["X"] / total
                    tmp_array[2] = counts["M"] / total
                    res.append(f"{region}\t" + "\t".join(map(str, tmp_array)))
        finally:
            input_file.close()

        with open(output_file, 'w') as f:
            f.write('region\tU\tX\tM\n')
            f.writelines(row + '\n' for row in res)
        logger.info(f"UXM calculation complete for {bam_file}. Results in {output_file}.")
    except Exception as e:
        logger.error(f"Fatal error in calc_uxm: {e}")
        raise typer.Exit(1)
132
+
133
def _run_uxm_file(
    bam_file: Path,
    mark_input: Path,
    output: Path,
    map_quality: int,
    min_cpg: int,
    methy_threshold: float,
    unmethy_threshold: float,
    pe_type: str,
) -> str:
    """Worker: run calc_uxm for one BAM and return the path of its output TSV.

    Defined at module level because ProcessPoolExecutor pickles the callable
    it is given, and functions nested inside another function cannot be
    pickled.
    """
    sample_prefix = bam_file.stem.replace('.bam', '')
    output_file = output / f"{sample_prefix}.UXM.tsv"
    calc_uxm(
        bam_file,
        mark_input,
        output_file,
        map_quality,
        min_cpg,
        methy_threshold,
        unmethy_threshold,
        pe_type
    )
    return str(output_file)


def uxm(
    bam_path: Path = typer.Argument(..., help="Folder containing .bam files for UXM calculation."),
    mark_input: Optional[Path] = typer.Option(None, "--mark-input", "-m", help="Marker BED file (default: packaged atlas)", show_default=False),
    output: Path = typer.Option(..., "--output", "-o", help="Output folder for results"),
    map_quality: int = typer.Option(30, "--map-quality", "-q", help="Minimum mapping quality"),
    min_cpg: int = typer.Option(4, "--min-cpg", "-c", help="Minimum CpG count per fragment"),
    methy_threshold: float = typer.Option(0.75, "--methy-threshold", "-tM", help="Methylation threshold for M fragments"),
    unmethy_threshold: float = typer.Option(0.25, "--unmethy-threshold", "-tU", help="Unmethylation threshold for U fragments"),
    pe_type: str = typer.Option("SE", "--type", help="Fragment type: SE or PE (default: SE)"),
    threads: int = typer.Option(1, "--threads", "-t", help="Number of parallel processes (default: 1)")
):
    """
    Calculate fragment-level methylation (UXM) features for all BAM files in a folder.
    """
    # Input checks
    if not bam_path.exists():
        logger.error(f"Input BAM directory not found: {bam_path}")
        raise typer.Exit(1)
    if mark_input is not None and not mark_input.exists():
        logger.error(f"Marker BED file not found: {mark_input}")
        raise typer.Exit(1)
    try:
        output.mkdir(parents=True, exist_ok=True)
    except Exception as e:
        logger.error(f"Could not create output directory {output}: {e}")
        raise typer.Exit(1)
    # Fall back to the marker atlas expected inside the package directory.
    if mark_input is None:
        pkg_dir = Path(__file__).parent
        mark_input = pkg_dir / "data/MethMark/Atlas.U25.l4.hg19.bed"

    bam_files = list(bam_path.glob("*.bam"))
    # One task per BAM; a failure is logged for that file and does not stop
    # the remaining ones.
    with ProcessPoolExecutor(max_workers=threads) as executor:
        futures = {
            executor.submit(
                _run_uxm_file,
                bam_file,
                mark_input,
                output,
                map_quality,
                min_cpg,
                methy_threshold,
                unmethy_threshold,
                pe_type,
            ): bam_file
            for bam_file in bam_files
        }
        for future in as_completed(futures):
            bam_file = futures[future]
            try:
                result = future.result()
                logger.info(f"UXM calculated: {result}")
            except Exception as exc:
                logger.error(f"UXM calculation failed for {bam_file}: {exc}")
    logger.info(f"UXM features calculated for {len(bam_files)} files.")
krewlyzer/wps.py ADDED
@@ -0,0 +1,173 @@
1
+ import typer
2
+ from pathlib import Path
3
+ import logging
4
+ import pysam
5
+ import pybedtools
6
+ import numpy as np
7
+ from collections import defaultdict
8
+ import gzip
9
+ from rich.console import Console
10
+ from rich.logging import RichHandler
11
+
12
+ from .helpers import max_core, commonError
13
+
14
+ console = Console()
15
+ logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
16
+ logger = logging.getLogger("wps")
17
+
18
+
19
def _calc_wps(bedgz_input, tsv_input, output_file_pattern, empty=False, protect_input=120, min_size=120, max_size=180):
    """
    Calculate Windowed Protection Score (WPS) for a single .bed.gz file and transcript region file.
    Output is gzipped TSV per region.

    Args:
        bedgz_input: Fragment file opened with ``pysam.TabixFile``; fields 2
            and 3 of each record are read as fragment start and end.
        tsv_input: Whitespace-separated region file; the first five fields of
            each line are read as id, chrom, start, end, strand.
        output_file_pattern: ``%``-format pattern with one ``%s`` slot that
            receives the region id.
        empty: Keep output files for regions without any coverage.
        protect_input: Window size; half of it is applied on each side of a
            position.
        min_size: Shortest fragment length (end - start) that is used.
        max_size: Longest fragment length (end - start) that is used.

    Raises:
        typer.Exit: With code 1, after logging, on any fatal error.
    """
    try:
        # Imported here, outside the per-region try block, so a missing
        # dependency aborts the run instead of being logged once per region.
        import os
        from bx.intervals.intersection import Intersecter, Interval

        protection = protect_input // 2
        prefix = "chr"
        valid_chroms = set(map(str, list(range(1, 23)) + ["X"]))
        tbx = pysam.TabixFile(str(bedgz_input))
        try:
            with open(tsv_input, 'r') as infile:
                logger.info(f"input file: {bedgz_input}, {tsv_input}")
                for line in infile:
                    if not line.strip():
                        continue
                    parts = line.split()
                    if len(parts) < 5:
                        continue
                    cid, chrom, start, end, strand = parts[:5]
                    chrom = chrom.replace("chr", "")
                    if chrom not in valid_chroms:
                        continue
                    region_start, region_end = int(float(start)), int(float(end))
                    if region_start < 1:
                        continue

                    # position -> [coverage, number of fragment ends]
                    pos_range = defaultdict(lambda: [0, 0])
                    try:
                        filtered_reads = Intersecter()
                        # Clamp at 0: a negative start is rejected by fetch
                        # for regions close to the chromosome start.
                        fetch_start = max(0, region_start - protection)
                        for row in tbx.fetch(prefix + chrom, fetch_start, region_end + protection):
                            fields = row.split()
                            rstart = int(fields[1])
                            rend = int(fields[2])
                            lseq = rend - rstart
                            if lseq < min_size or lseq > max_size:
                                continue
                            filtered_reads.add_interval(Interval(rstart, rend))
                            # Coverage, restricted to positions inside the region.
                            for i in range(max(rstart, region_start), min(rend, region_end + 1)):
                                pos_range[i][0] += 1
                            if region_start <= rstart <= region_end:
                                pos_range[rstart][1] += 1
                            if region_start <= rend <= region_end:
                                pos_range[rend][1] += 1
                    except Exception as e:
                        logger.error(f"Error fetching region {chrom}:{region_start}-{region_end}: {e}")
                        continue

                    cov_sites = 0
                    out_lines = []
                    for pos in range(region_start, region_end + 1):
                        win_start, win_end = pos - protection, pos + protection
                        gcount, bcount = 0, 0
                        for read in filtered_reads.find(win_start, win_end):
                            # Fragments that do not span the whole window
                            # count against the score.
                            if (read.start > win_start) or (read.end < win_end):
                                bcount += 1
                            else:
                                gcount += 1
                        cov_count, start_count = pos_range[pos]
                        cov_sites += cov_count
                        out_lines.append(f"{chrom}\t{pos}\t{cov_count}\t{start_count}\t{gcount - bcount}\n")
                    if strand == "-":
                        out_lines.reverse()

                    filename = output_file_pattern % cid
                    if cov_sites == 0 and not empty:
                        # Nothing to keep. Same end state as writing and then
                        # deleting the file, including removal of a stale one.
                        try:
                            os.remove(filename)
                        except FileNotFoundError:
                            pass
                        continue
                    with gzip.open(filename, 'wt') as outfile:
                        outfile.writelines(out_lines)
        finally:
            tbx.close()
        logger.info(f"WPS calculation complete. Results written to pattern: {output_file_pattern}")
    except Exception as e:
        logger.error(f"Fatal error in _calc_wps: {e}")
        raise typer.Exit(1)
93
+
94
+
95
def _wps_task(bedgz_file, tsv_input, output, empty, protect_input, min_size, max_size):
    """Worker: run _calc_wps for one file.

    Returns None on success, or the formatted traceback on failure.

    Defined at module level because ProcessPoolExecutor pickles the callable
    it is given, and functions nested inside another function cannot be
    pickled.
    """
    import traceback
    try:
        # Literal '%' in the directory or sample name must be doubled, since
        # the pattern is later filled in with the `%` operator.
        base = str(output / bedgz_file.stem.replace('.bed', '')).replace('%', '%%')
        output_file_pattern = base + ".%s.WPS.tsv.gz"
        _calc_wps(
            bedgz_input=str(bedgz_file),
            tsv_input=str(tsv_input),
            output_file_pattern=output_file_pattern,
            empty=empty,
            protect_input=protect_input,
            min_size=min_size,
            max_size=max_size
        )
        return None
    except Exception:
        return traceback.format_exc()


def wps(
    bedgz_path: Path = typer.Argument(..., help="Folder containing .bed.gz files (should be the output directory from motif.py)"),
    tsv_input: Path = typer.Option(None, "--tsv-input", "-t", help="Path to transcript/region file (TSV format)"),
    output: Path = typer.Option(..., "--output", "-o", help="Output folder for results"),
    wpstype: str = typer.Option('L', "--wpstype", "-w", help="WPS type: 'L' for long (default), 'S' for short"),
    empty: bool = typer.Option(False, "--empty", help="Keep files of empty blocks (default: False)"),
    threads: int = typer.Option(1, "--threads", "-p", help="Number of threads (default: 1)")
):
    """
    Calculate Windowed Protection Score (WPS) features for all .bed.gz files in a folder.
    """
    # Input checks
    if not bedgz_path.exists():
        logger.error(f"Input directory not found: {bedgz_path}")
        raise typer.Exit(1)
    if tsv_input is not None and not tsv_input.exists():
        logger.error(f"Transcript region file not found: {tsv_input}")
        raise typer.Exit(1)
    try:
        output.mkdir(parents=True, exist_ok=True)
    except Exception as e:
        logger.error(f"Could not create output directory {output}: {e}")
        raise typer.Exit(1)
    try:
        output.touch()
    except Exception as e:
        logger.error(f"Output directory {output} is not writable: {e}")
        raise typer.Exit(1)
    try:
        bedgz_files = list(bedgz_path.glob("*.bed.gz"))
        if not bedgz_files:
            logger.error("No .bed.gz files found in the specified folder.")
            raise typer.Exit(1)
        if tsv_input is None:
            # Default to package data transcriptAnno-hg19-1kb.tsv
            # NOTE(review): this resolves next to the package directory
            # (parent.parent), while ocf.py and uxm.py look inside it
            # (parent / "data"). Confirm which location is correct.
            tsv_input = Path(__file__).parent.parent / "data" / "TranscriptAnno" / "transcriptAnno-hg19-1kb.tsv"
            logger.info(f"No tsv_input specified. Using default: {tsv_input}")
        if not tsv_input.exists():
            logger.error(f"Transcript/region file does not exist: {tsv_input}")
            raise typer.Exit(1)
        # Any value other than 'L' selects the short-fragment settings.
        if wpstype == 'L':
            protect_input, min_size, max_size = 120, 120, 180
        else:
            protect_input, min_size, max_size = 16, 35, 80
        logger.info(f"Calculating WPS for {len(bedgz_files)} files...")
        from concurrent.futures import ProcessPoolExecutor, as_completed
        n_procs = max_core(threads) if threads else 1
        logger.info(f"Calculating WPS for {len(bedgz_files)} files using {n_procs} processes...")
        with ProcessPoolExecutor(max_workers=n_procs) as executor:
            futures = {
                executor.submit(
                    _wps_task, bedgz_file, tsv_input, output,
                    empty, protect_input, min_size, max_size,
                ): bedgz_file
                for bedgz_file in bedgz_files
            }
            for future in as_completed(futures):
                exc = future.result()
                if exc:
                    logger.error(f"WPS calculation failed for {futures[future]}:\n{exc}")
        logger.info(f"WPS features calculated for {len(bedgz_files)} files.")
    except typer.Exit:
        # Already logged where it was raised; do not report it a second time
        # as a fatal error.
        raise
    except Exception as e:
        logger.error(f"Fatal error in wps CLI: {e}")
        raise typer.Exit(1)
krewlyzer/wrapper.py ADDED
@@ -0,0 +1,125 @@
1
+ import typer
2
+ from pathlib import Path
3
+ import logging
4
+ from rich.console import Console
5
+ from rich.logging import RichHandler
6
+ from .motif import motif
7
+ from .fsc import fsc
8
+ from .fsr import fsr
9
+ from .fsd import fsd
10
+ from .wps import wps
11
+ from .ocf import ocf
12
+ from .uxm import uxm
13
+
14
+ console = Console()
15
+ logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
16
+ logger = logging.getLogger("krewlyzer-wrapper")
17
+
18
+
19
def _run_step(label: str, step, **kwargs) -> None:
    """Run one pipeline step; log "<label> failed: ..." and exit with code 1 on any error."""
    try:
        step(**kwargs)
    except Exception as e:
        logger.error(f"{label} failed: {e}")
        raise typer.Exit(1)


def run_all(
    bam_file: Path = typer.Argument(..., help="Input BAM file (sorted, indexed)"),
    reference: Path = typer.Option(..., "--reference", "-g", help="Reference genome FASTA file for motif extraction"),
    output: Path = typer.Option(..., "--output", "-o", help="Output directory for all results"),
    threads: int = typer.Option(1, "--threads", "-t", help="Number of parallel processes for each step"),
    pe_type: str = typer.Option("SE", "--type", help="Fragment type for UXM: SE or PE (default: SE)")
):
    """
    Run all feature extraction commands (motif, fsc, fsr, fsd, wps, ocf, uxm) for a single BAM file.
    """
    # Input checks
    if not bam_file.exists() or not bam_file.is_file():
        logger.error(f"Input BAM file not found: {bam_file}")
        raise typer.Exit(1)
    if not reference.exists() or not reference.is_file():
        logger.error(f"Reference FASTA file not found: {reference}")
        raise typer.Exit(1)
    try:
        output.mkdir(parents=True, exist_ok=True)
    except Exception as e:
        logger.error(f"Could not create output directory {output}: {e}")
        raise typer.Exit(1)

    # The commands below are Typer command functions called as plain Python
    # functions. Any parameter left out keeps its `typer.Option(...)` default
    # object instead of the real default value, so optional parameters have
    # to be passed explicitly.
    # NOTE(review): motif, fsc, fsr and fsd are defined elsewhere; confirm
    # that they have no further optional parameters that need the same
    # treatment.

    # 1. Motif extraction
    motif_output = output / "motif"
    _run_step(
        "Motif extraction", motif,
        bam_path=bam_file,
        reference=reference,
        output=motif_output,
        minlen=65,
        maxlen=400,
        k=3,
        verbose=True,
        threads=threads,
    )
    # 2. FSC
    _run_step(
        "FSC calculation", fsc,
        bedgz_path=motif_output,
        output=output / "fsc",
        threads=threads,
    )
    # 3. FSR
    _run_step(
        "FSR calculation", fsr,
        bedgz_path=motif_output,
        output=output / "fsr",
        threads=threads,
    )
    # 4. FSD
    _run_step(
        "FSD calculation", fsd,
        bedgz_path=motif_output,
        output=output / "fsd",
        arms_file=None,
        threads=threads,
    )
    # 5. WPS (values mirror the CLI defaults of `wps`)
    _run_step(
        "WPS calculation", wps,
        bedgz_path=motif_output,
        tsv_input=None,
        output=output / "wps",
        wpstype='L',
        empty=False,
        threads=threads,
    )
    # 6. OCF (values mirror the CLI defaults of `ocf`)
    _run_step(
        "OCF calculation", ocf,
        bedgz_path=motif_output,
        ocr_input=None,
        output=output / "ocf",
        threads=threads,
    )
    # 7. UXM (values mirror the CLI defaults of `uxm`)
    # `uxm` works on a folder, so every *.bam next to `bam_file` is
    # processed, not only `bam_file` itself.
    _run_step(
        "UXM calculation", uxm,
        bam_path=bam_file.parent,
        mark_input=None,
        output=output / "uxm",
        map_quality=30,
        min_cpg=4,
        methy_threshold=0.75,
        unmethy_threshold=0.25,
        pe_type=pe_type,
        threads=threads,
    )
    logger.info(f"All feature extraction complete. Results saved to {output}")
@@ -0,0 +1,15 @@
1
+ Metadata-Version: 2.4
2
+ Name: krewlyzer
3
+ Version: 0.1.0
4
+ Summary: Feature extraction tools for circulating tumor DNA from GRCh37 aligned BAM files
5
+ Author-email: Ronak Shah <shahr2@mskcc.org>
6
+ Project-URL: Homepage, https://github.com/msk-access/krewlyzer
7
+ Requires-Python: >=3.8
8
+ License-File: LICENSE
9
+ Requires-Dist: typer>=0.12.3
10
+ Requires-Dist: pysam>=0.20.0
11
+ Requires-Dist: pandas>=2.0.0
12
+ Requires-Dist: biopython>=1.81
13
+ Requires-Dist: pybedtools>=0.9.0
14
+ Requires-Dist: scikit-misc>=0.1.4
15
+ Dynamic: license-file
@@ -0,0 +1,17 @@
1
+ krewlyzer/__init__.py,sha256=qpkv4AhtoRCJsN5NGxl1bc5RJo8_l-EduGL1emFww0Y,67
2
+ krewlyzer/cli.py,sha256=MuMUiV79mcotsf73DRDtW3C0wwGhE3jcVvpfjJ3mtA0,1232
3
+ krewlyzer/fsc.py,sha256=NfRiRFxZVVnbGMyWfrYIucz9Em4EhNnWigNdcsekPZk,21688
4
+ krewlyzer/fsd.py,sha256=PVs_aTiitVSHtiEcmbi5ocFe0Xj8g7xmIkcgmC-vXEg,6353
5
+ krewlyzer/fsr.py,sha256=JZg2D3AYCK2Y26SZ7mWo6U-i4zyGrKOolg5m6_DKLvw,8018
6
+ krewlyzer/helpers.py,sha256=-FDOMIh8ANs6lC60vTq_tLrjU4OUdeuRwu1oVpBwyP8,5887
7
+ krewlyzer/motif.py,sha256=yg0RQ2cWSmQyaEqa5VX38DECQ1sWubdi6kDose7q7bg,12490
8
+ krewlyzer/ocf.py,sha256=4XMKjpRE1LlepFzXJcsOIAW3oP9MxQe45PAn3qpdVeA,5816
9
+ krewlyzer/uxm.py,sha256=7ktSFAhn7pqFYwG3UmKP49Sge9Hie0HlP-70X4dcQpc,8483
10
+ krewlyzer/wps.py,sha256=-XDVCbHYyz1uGZUNTkSgBmae8qbK6GZJFzyWFMvjQ2o,7982
11
+ krewlyzer/wrapper.py,sha256=Gpfo-lGY_2hwpcwXN5exVjvhoec8X4wtUrF3YjgHj1A,3833
12
+ krewlyzer-0.1.0.dist-info/licenses/LICENSE,sha256=DuJF49YfFt6g7la7cekWI06XA7ImodNiTEOrEBsOkpk,32365
13
+ krewlyzer-0.1.0.dist-info/METADATA,sha256=RLBb0adv1Ob3XPlfbgpVQnVnS_-srP3RRKtaIy3sDr4,502
14
+ krewlyzer-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
+ krewlyzer-0.1.0.dist-info/entry_points.txt,sha256=x9Wngsqelv0MxiUvgDDx3hVNR-MhDbASxwKpNeBiX8I,48
16
+ krewlyzer-0.1.0.dist-info/top_level.txt,sha256=bFO6hK-X3pxPGZ7ewoOVx80u9p_Npyy9NW8XophctdY,10
17
+ krewlyzer-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ krewlyzer = krewlyzer.cli:app