krewlyzer 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
krewlyzer/fsr.py ADDED
@@ -0,0 +1,171 @@
1
+ import typer
2
+ from pathlib import Path
3
+ from typing import Optional
4
+ import logging
5
+ import sys
6
+
7
+ import pysam
8
+ import pybedtools
9
+ import numpy as np
10
+ from rich.console import Console
11
+ from rich.logging import RichHandler
12
+
13
+ console = Console()
14
+ logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
15
+ logger = logging.getLogger("fsr")
16
+
17
+ from .helpers import gc_correct
18
+
19
def _calc_fsr(bedgz_input, bin_input, windows, continue_n, output_file):
    """
    Internal: Calculate fragment size ratio (FSR) for a single .bed.gz file.

    For each genomic bin in *bin_input*, fragments are counted by length and
    classified as short (65-149 bp), intermediate (151-259 bp) or long
    (261-399 bp); per-bin ratios are then averaged over *continue_n*
    consecutive bins per chromosome and written to *output_file* as
    tab-separated "region  short  intermediate  long" rows.

    Parameters
    ----------
    bedgz_input : str -- Tabix-indexed .bed.gz fragment file (chrom, start, end, gc).
    bin_input : str -- BED file of fixed-size genomic bins.
    windows : int -- bin width in bp (used only to compute region coordinates).
    continue_n : int -- number of consecutive bins merged into one output region.
    output_file : str -- path of the FSR table to write.

    Raises
    ------
    typer.Exit -- on any I/O or parsing failure (after logging the error).
    """
    try:
        logger.info(f"input file: {bedgz_input}, {bin_input}")
        try:
            inputbed = pysam.Tabixfile(filename=bedgz_input, mode="r")
        except Exception as e:
            logger.error(f"Could not open {bedgz_input} as Tabix file: {e}")
            raise typer.Exit(1)
        try:
            bins = pybedtools.BedTool(bin_input)
        except Exception as e:
            logger.error(f"Could not load bins from {bin_input}: {e}")
            raise typer.Exit(1)
        length = len(bins)
        shorts_data, intermediates_data, longs_data, bingc = [], [], [], []
        chrom = []
        logger.info(f"output file: {output_file}")
        for idx in range(length):
            interval = bins[idx]  # renamed from `bin` to avoid shadowing the builtin
            try:
                chrom.append(interval.chrom)
                inputbed.fetch(interval.chrom, interval.start, interval.end)
            except ValueError:
                # Contig absent from the tabix index: record an empty bin.
                bingc.append(np.nan)
                shorts_data.append(0)
                intermediates_data.append(0)
                longs_data.append(0)
            except Exception as e:
                logger.error(f"Error fetching bin {interval}: {e}")
                raise typer.Exit(1)
            else:
                bin_data = []
                gc = []
                try:
                    for read in inputbed.fetch(interval.chrom, interval.start, interval.end):
                        # Split each record once instead of once per field access.
                        fields = read.split("\t")
                        frag_len = int(fields[2]) - int(fields[1])
                        bin_data.append(frag_len)
                        if 65 <= frag_len <= 400:
                            gc.append(float(fields[3]))
                    count = np.bincount(bin_data, minlength=401)
                except Exception as e:
                    logger.error(f"Error processing reads in bin {interval}: {e}")
                    raise typer.Exit(1)
                bingc.append(np.mean(gc) if gc else np.nan)
                # NOTE(review): lengths 150, 260 and 400 fall into no size class
                # (slice upper bounds are exclusive), so shorts+intermediates+longs
                # may be < totals -- confirm the intended boundaries before changing.
                shorts = sum(count[65:150])
                intermediates = sum(count[151:260])
                longs = sum(count[261:400])
                totals = sum(count[65:400])
                if totals == 0:
                    shorts_data.append(0)
                    intermediates_data.append(0)
                    longs_data.append(0)
                else:
                    shorts_data.append(shorts / totals)
                    intermediates_data.append(intermediates / totals)
                    longs_data.append(longs / totals)
        start = 0
        step = 0
        try:
            with open(output_file, 'w') as fsrfile:
                # Fixed header typo: "itermediate-ratio" -> "intermediate-ratio".
                fsrfile.write("region\tshort-ratio\tintermediate-ratio\tlong-ratio\n")
                while step < length:
                    num = chrom.count(chrom[step])
                    continues_bin = num // continue_n
                    last_bin = num % continue_n
                    for _ in range(continues_bin):
                        bin_start = start * windows
                        bin_end = (start + continue_n) * windows - 1
                        combine_shorts = shorts_data[step: step + continue_n]
                        combine_intermediates = intermediates_data[step: step + continue_n]
                        combine_longs = longs_data[step: step + continue_n]
                        tmp_array = np.zeros(3)
                        tmp_array[0] = np.mean(combine_shorts)
                        tmp_array[1] = np.mean(combine_intermediates)
                        tmp_array[2] = np.mean(combine_longs)
                        region = f"{chrom[step]}:{bin_start}-{bin_end}"
                        fsrfile.write(f"{region}\t" + "\t".join(map(str, tmp_array)) + "\n")
                        step += continue_n
                        start += continue_n
                    if last_bin != 0:
                        # Skip the trailing partial window and restart region
                        # coordinates at the next chromosome.
                        step += last_bin
                        start = 0
        except Exception as e:
            logger.error(f"Error writing FSR output file: {e}")
            raise typer.Exit(1)
        logger.info(f"FSR calculation complete. Results written to {output_file}")
    except Exception as e:
        logger.error(f"Fatal error in _calc_fsr: {e}")
        raise typer.Exit(1)
115
+
116
def fsr(
    bedgz_path: Path = typer.Argument(..., help="Folder containing .bed.gz files (should be the output directory from motif.py)"),
    bin_input: Optional[Path] = typer.Option(None, "--bin-input", "-b", help="Path to bin file (default: data/ChormosomeBins/hg19_window_100kb.bed)"),
    windows: int = typer.Option(100000, "--windows", "-w", help="Window size (default: 100000)"),
    continue_n: int = typer.Option(50, "--continue-n", "-c", help="Consecutive window number (default: 50)"),
    output: Path = typer.Option(..., "--output", "-o", help="Output folder for results"),
    threads: int = typer.Option(1, "--threads", "-t", help="Number of parallel processes (default: 1)")
):
    """
    Calculate fragment size ratio (FSR) features for all .bed.gz files in a folder.

    The input folder should be the output directory produced by motif.py,
    containing the .bed.gz files. One ``<sample>.FSR.txt`` file is written to
    the output directory per input file.
    """
    # --- input validation -------------------------------------------------
    if not bedgz_path.exists():
        logger.error(f"Input directory not found: {bedgz_path}")
        raise typer.Exit(1)
    if bin_input and not bin_input.exists():
        logger.error(f"Bin input file not found: {bin_input}")
        raise typer.Exit(1)
    try:
        # Single mkdir (the original created the directory twice).
        output.mkdir(parents=True, exist_ok=True)
    except Exception as e:
        logger.error(f"Could not create output directory {output}: {e}")
        raise typer.Exit(1)
    bedgz_files = [f for f in bedgz_path.iterdir() if f.suffixes == ['.bed', '.gz']]
    if not bedgz_files:
        logger.error("No .bed.gz files found in the specified folder.")
        raise typer.Exit(1)
    if bin_input is None:
        bin_input = Path(__file__).parent / "data" / "ChormosomeBins" / "hg19_window_100kb.bed"
        logger.info(f"No bin_input specified. Using default: {bin_input}")
        if not bin_input.exists():
            logger.error(f"Bin input file does not exist: {bin_input}")
            raise typer.Exit(1)
    logger.info(f"Calculating FSR for {len(bedgz_files)} files...")
    from concurrent.futures import ProcessPoolExecutor, as_completed
    logger.info(f"Starting parallel FSR calculation using {threads} processes...")
    # Submit the module-level worker directly: a function defined inside this
    # function cannot be pickled by ProcessPoolExecutor under the "spawn"
    # start method, which previously made every task fail on macOS/Windows.
    with ProcessPoolExecutor(max_workers=threads) as executor:
        futures = {}
        for bedgz_file in bedgz_files:
            # "sample.bed.gz" -> stem "sample.bed" -> "sample.FSR.txt"
            out_file = output / (bedgz_file.stem.replace('.bed', '') + '.FSR.txt')
            fut = executor.submit(
                _calc_fsr, str(bedgz_file), str(bin_input), windows, continue_n, str(out_file)
            )
            futures[fut] = (bedgz_file, out_file)
        for future in as_completed(futures):
            bedgz_file, out_file = futures[future]
            try:
                future.result()
                logger.info(f"FSR calculated: {out_file}")
            except Exception as exc:
                logger.error(f"FSR calculation failed for {bedgz_file}: {exc}")
    logger.info(f"FSR features calculated for {len(bedgz_files)} files.")
krewlyzer/helpers.py ADDED
@@ -0,0 +1,187 @@
1
+ import pysam
2
+ import itertools
3
+ import os
4
+ import numpy as np
5
+ import pandas as pd
6
+ import math
7
+ from collections import defaultdict
8
+ from rich.logging import RichHandler
9
+ import logging
10
+ from skmisc.loess import loess
11
+ import numpy as np
12
+
13
+ logging.basicConfig(level="INFO", handlers=[RichHandler()], format="%(message)s")
14
+ logger = logging.getLogger("krewlyzer-helpers")
15
+
16
def gc_correct(coverage, bias):
    """
    Perform GC bias correction on coverage values using LOESS regression.

    Bins whose GC ``bias`` is NaN are excluded from the fit and receive a
    corrected coverage of 0. A LOESS curve (span=0.75) is fitted to the valid
    (bias, coverage) pairs, resampled on a 0.001-spaced grid, refitted, and
    the predicted trend is subtracted from the observed coverage and
    re-centred on the median. Negative corrected values are clamped to 0.

    Logs errors and raises commonError if fitting fails.

    :param coverage: sequence of per-bin coverage values.
    :param bias: sequence of per-bin GC fractions (NaN marks invalid bins).
    :return: list of corrected coverage values, same length as ``coverage``.
    :raises commonError: if no valid bins exist or LOESS fitting fails.
    """
    covl = len(coverage)
    valid = [True for _ in range(covl)]
    temp_cov = []
    temp_bias = []
    # Partition bins into valid (finite bias) and invalid (NaN bias).
    for i in range(covl):
        if np.isnan(bias[i]):
            valid[i] = False
        else:
            temp_cov.append(coverage[i])
            temp_bias.append(bias[i])
    if not temp_cov or not temp_bias:
        logger.error("No valid coverage/bias values for GC correction.")
        raise commonError("No valid coverage/bias values for GC correction.")
    med = np.median(temp_cov)
    correct_cov = []
    try:
        # Fine grid over the observed bias range, used to resample the trend.
        i = np.arange(np.min(temp_bias), np.max(temp_bias), 0.001)
        coverage_trend = loess(temp_bias, temp_cov, span=0.75)
        coverage_trend.fit()
        # Second LOESS fitted to the smoothed grid predictions.
        coverage_model = loess(i, coverage_trend.predict(i, stderror=True).values)
        coverage_model.fit()
        coverage_pred = coverage_model.predict(temp_bias, stderror=True)
        pred = np.array(coverage_pred.values)
        # Subtract the GC trend and re-centre on the median coverage
        # (list - ndarray broadcasts via numpy).
        coverage_corrected = temp_cov - pred + med
    except Exception as e:
        logger.error(f"GC correction failed: {e}")
        raise commonError(f"GC correction failed: {e}")
    # Stitch corrected values back into the original bin order; invalid and
    # negative bins become 0.
    i, j = 0, 0
    while i < covl:
        if valid[i]:
            if coverage_corrected[j] < 0:
                correct_cov.append(0)
            else:
                correct_cov.append(coverage_corrected[j])
            j += 1
        else:
            correct_cov.append(0)
        i += 1
    return correct_cov
60
+
61
class commonError(Exception):
    """Package-level error raised for recoverable processing failures.

    The message is logged at ERROR level when the exception is created.
    """

    def __init__(self, message):
        # Call Exception.__init__ so str(exc) and exc.args carry the message
        # (previously they were empty because the base class was never
        # initialised).
        super().__init__(message)
        logger.error(f"commonError: {message}")
        self.message = message
65
+
66
def maxCore(nCore=None):
    """Clamp a requested worker count to at most 16 processes.

    Returns ``nCore`` unchanged (including ``None`` and 0) unless it exceeds
    16, in which case a warning is logged and 16 is returned.
    """
    if not (nCore and nCore > 16):
        return nCore
    logger.warning("Requested nCore > 16; capping to 16.")
    return 16
72
+
73
+ # Alias for CLI import consistency
74
+ max_core = maxCore
75
+
76
def rmEndString(x, y):
    """Strip any of the suffixes in *y* from the end of string *x*.

    Fixed: the previous ``str.replace`` implementation removed EVERY
    occurrence of the pattern, not just a trailing one (e.g.
    ``rmEndString("a.bam.b.bam", [".bam"])`` wrongly returned ``"a.b"``).
    Now only a suffix at the very end of ``x`` is removed.

    :param x: string to trim.
    :param y: iterable of candidate suffixes.
    :return: ``x`` with matching suffixes removed from its end.
    """
    for item in y:
        if x.endswith(item):
            # Remove only the trailing occurrence.
            x = x[: len(x) - len(item)]
    return x
81
+
82
def isSoftClipped(cigar):
    """Return True if any CIGAR operation is a soft clip, hard clip or pad.

    cigar information:
    S BAM_CSOFT_CLIP 4
    H BAM_CHARD_CLIP 5
    P BAM_CPAD       6
    """
    return any(op in (4, 5, 6) for op, _count in cigar)
93
+
94
def GCcontent(seq):
    """Return the GC fraction of *seq*, counting A/C/G/T case-insensitively.

    Ambiguous bases (e.g. N) are excluded from the denominator. Returns 0
    for an empty or non-ACGT sequence, and 0 (with a logged error) if the
    input cannot be processed at all.
    """
    try:
        upper = seq.upper()
        gc = upper.count("G") + upper.count("C")
        at = upper.count("A") + upper.count("T")
        denom = gc + at
        return gc / denom if denom > 0 else 0
    except Exception as e:
        logger.error(f"GCcontent calculation failed: {e}")
        return 0
105
+
106
def read_pair_generator(bam, region_string=None):
    """
    Generate read pairs in a BAM file or within a region string.

    Reads are added to read_dict until a pair is found.
    Reference: https://www.biostars.org/p/306041/

    Filters applied per read (both mates must survive to be yielded):
    unmapped / QC-fail / duplicate, unpaired or not properly paired,
    secondary or supplementary alignments, unmapped mate, mate on a
    different contig (``rnext != tid``), zero template length, and
    soft/hard-clipped or padded alignments (see isSoftClipped).

    Yields ``(read1, read2)`` tuples. On any fetch/iteration error the
    generator logs the error and stops silently rather than raising.
    """
    read_dict = defaultdict(lambda: [None, None])
    try:
        for read in bam.fetch(region=region_string):
            if read.is_unmapped or read.is_qcfail or read.is_duplicate:
                continue
            if not read.is_paired or not read.is_proper_pair:
                continue
            if read.is_secondary or read.is_supplementary:
                continue
            if read.mate_is_unmapped:
                continue
            if read.rnext != read.tid:
                continue
            if read.template_length == 0:
                continue
            if isSoftClipped(read.cigar):
                continue
            qname = read.query_name
            if qname not in read_dict:
                # First mate seen for this name: park it until the partner arrives.
                if read.is_read1:
                    read_dict[qname][0] = read
                else:
                    read_dict[qname][1] = read
            else:
                # Partner already stored: yield as (read1, read2) and free the slot.
                if read.is_read1:
                    yield read, read_dict[qname][1]
                else:
                    yield read_dict[qname][0], read
                del read_dict[qname]
    except Exception as e:
        logger.error(f"Error during BAM read pair generation: {e}")
        return
144
+
145
def reverse_seq(seq):
    """Complement an uppercase DNA string base-by-base.

    NOTE: despite the name, the output is NOT reversed -- each character is
    complemented in place (A<->T, C<->G). Any other character (including
    lowercase bases and N) passes through unchanged, matching the original
    per-character mapping.
    """
    return seq.translate(str.maketrans("ATCG", "TAGC"))
159
+
160
def get_End_motif(Emotif, seq1, seq2):
    """Tally the two fragment-end k-mers into the end-motif count dict.

    Fragments with an N/n anywhere in either end sequence are skipped
    entirely. ``seq2`` is complemented (via reverse_seq) before lookup;
    k-mers absent from ``Emotif`` are silently ignored. The (mutated)
    dict is returned.
    """
    combined = seq1 + seq2
    if 'N' in combined or 'n' in combined:
        return Emotif
    for motif_key in (seq1, reverse_seq(seq2)):
        if motif_key in Emotif:
            Emotif[motif_key] += 1
    return Emotif
169
+
170
def calc_MDS(inputEndMotifFile, outputfile):
    """Compute the Motif Diversity Score (MDS) from a motif-frequency table.

    Reads a two-column (motif, frequency) tab-separated table, computes the
    Shannon entropy of the frequencies normalised by log2(4^k) (k inferred
    from the row count), and appends ``<input path>\\t<MDS>`` to *outputfile*.
    """
    table = pd.read_table(inputEndMotifFile, header=None, names=['bases', 'frequency'])
    # Infer k from the table size (4^k rows for k-mers).
    k_mer = math.log(len(table), 4)
    freqs = table['frequency'].to_numpy()
    # Normalised Shannon entropy; log2(4^k) == max possible entropy.
    mds_score = np.sum(-freqs * np.log2(freqs) / np.log2(4 ** k_mer))
    with open(outputfile, 'a') as handle:
        handle.write(inputEndMotifFile + '\t' + str(mds_score) + '\n')
177
+
178
def get_Breakpoint_motif(Bpmotif, seq1, seq2):
    """Tally the two breakpoint-flanking k-mers into the count dict.

    Sequences containing N/n anywhere are skipped (the dict is returned
    unchanged). ``seq2`` is complemented via reverse_seq before lookup,
    and k-mers not present as keys are ignored. The (mutated) dict is
    returned.
    """
    # seq1 and seq2 must not include N
    joined = seq1 + seq2
    if 'N' in joined or 'n' in joined:
        return Bpmotif
    for key in (seq1, reverse_seq(seq2)):
        if key in Bpmotif:
            Bpmotif[key] += 1
    return Bpmotif
krewlyzer/motif.py ADDED
@@ -0,0 +1,275 @@
1
+ # motif.py: Extracts motif-based features from BAM files
2
+ import typer
3
+ from pathlib import Path
4
+ from typing import Optional
5
+ import os
6
+ import pysam
7
+ import numpy as np
8
+ import pandas as pd
9
+ import math
10
+ import pybedtools
11
+ from collections import defaultdict
12
+ from .helpers import (
13
+ reverse_seq,
14
+ get_End_motif,
15
+ get_Breakpoint_motif,
16
+ GCcontent,
17
+ read_pair_generator,
18
+ maxCore,
19
+ rmEndString,
20
+ calc_MDS
21
+ )
22
+ from rich.progress import Progress
23
+ from rich.console import Console
24
+ from rich.logging import RichHandler
25
+ import logging
26
+
27
+ console = Console()
28
+ logging.basicConfig(level="INFO", handlers=[RichHandler(console=console)], format="%(message)s")
29
+ logger = logging.getLogger("motif")
30
+
31
+
32
def motif(
    bam_path: Path = typer.Argument(..., help="Path to input BAM file or directory of BAM files (GRCh37 aligned)"),
    genome_reference: Path = typer.Option(..., '-g', help="Path to genome reference file (GRCh37/hg19)"),
    output: Path = typer.Option(..., '-o', help="Output directory"),
    blacklist: Optional[Path] = typer.Option(None, '-b', help="Path to blacklist regions file"),
    map_quality: int = typer.Option(20, '-m', help="Minimum mapping quality"),
    min_length: int = typer.Option(65, '--minlen', help="Minimum fragment length"),
    max_length: int = typer.Option(400, '--maxlen', help="Maximum fragment length"),
    kmer: int = typer.Option(3, '-k', help="K-mer size for motif extraction"),
    chromosomes: Optional[str] = typer.Option(None, '--chromosomes', help="Comma-separated list of chromosomes to process"),
    verbose: bool = typer.Option(False, '--verbose', help="Enable verbose logging"),
    threads: int = typer.Option(1, '--threads', help="Number of parallel processes (default: 1)")
):
    """
    Extract motif-based features from BAM files.

    Extracts end motif, breakpoint motif, and Motif-Diversity Score (MDS)
    from one or more BAM files. If a directory is provided, all BAM files in
    it are processed in parallel using multiple processes. Output files are
    written to the output directory, with EDM, BPM, and MDS subfolders.
    """
    # --- input validation -------------------------------------------------
    if not bam_path.exists():
        logger.error(f"Input BAM file or directory not found: {bam_path}")
        raise typer.Exit(1)
    if not genome_reference.exists() or not genome_reference.is_file():
        logger.error(f"Reference genome file not found: {genome_reference}")
        raise typer.Exit(1)
    try:
        output.mkdir(parents=True, exist_ok=True)
    except Exception as e:
        logger.error(f"Could not create output directory {output}: {e}")
        raise typer.Exit(1)
    import concurrent.futures
    if verbose:
        logger.setLevel(logging.DEBUG)
    logger.info(f"Reference genome: {genome_reference}")
    logger.info(f"Output directory: {output}")
    # Normalise optional arguments once, up front.
    chrom_list = chromosomes.split(',') if chromosomes else None
    blacklist_str = str(blacklist) if blacklist else None
    if bam_path.is_dir():
        bam_files = sorted([f for f in bam_path.iterdir() if f.suffix == '.bam'])
        if not bam_files:
            logger.error(f"No BAM files found in directory: {bam_path}")
            raise typer.Exit(1)
        logger.info(f"Processing {len(bam_files)} BAM files in parallel using {threads} processes...")
        # Submit the module-level worker directly: a function defined inside
        # this function cannot be pickled by ProcessPoolExecutor under the
        # "spawn" start method, which previously made every task fail on
        # macOS/Windows.
        with concurrent.futures.ProcessPoolExecutor(max_workers=threads) as executor:
            futures = {
                executor.submit(
                    motif_process,
                    str(bam_file),
                    blacklist_str,
                    str(output / (bam_file.stem + '.bed')),
                    str(genome_reference),
                    chrom_list,
                    map_quality,
                    kmer,
                    fragFilter=True,
                    minLen=min_length,
                    maxLen=max_length,
                ): bam_file
                for bam_file in bam_files
            }
            for future in concurrent.futures.as_completed(futures):
                bam_file = futures[future]
                try:
                    future.result()
                    logger.info(f"Motif extraction complete for: {bam_file}")
                except Exception as exc:
                    logger.error(f"Motif extraction failed for {bam_file}: {exc}")
        logger.info("All BAM files processed.")
    else:
        logger.info(f"Processing BAM: {bam_path}")
        motif_process(
            str(bam_path),
            blacklist_str,
            str(output / (bam_path.stem + '.bed')),
            str(genome_reference),
            chrom_list,
            map_quality,
            kmer,
            fragFilter=True,
            minLen=min_length,
            maxLen=max_length
        )
    logger.info("End motif, Breakpoint motif, and MDS extraction complete.")
116
+
117
+
118
def motif_process(
    bamInput,
    blacklistInput,
    bedOutput,
    genome_reference,
    CHR,
    mapQuality,
    k_mer,
    fragFilter=False,
    minLen=None,
    maxLen=None
):
    """
    Main motif feature extraction process with rich logging and consistent CLI output.

    Iterates proper read pairs from *bamInput*, writes a GC-annotated fragment
    BED (coordinates written as start+1/end+1), tallies fragment end-motif and
    breakpoint-motif k-mers, and writes EDM/BPM frequency tables plus a
    Motif Diversity Score into subfolders beside *bedOutput*.

    Parameters
    ----------
    bamInput : str -- coordinate-sorted, indexed BAM file.
    blacklistInput : str or None -- BED of regions whose fragments are dropped.
    bedOutput : str -- path of the fragment BED to produce.
    genome_reference : str -- indexed FASTA matching the BAM's reference.
    CHR : list[str] or None -- chromosomes to keep (None = all BAM references).
    mapQuality : int -- minimum mapping quality required of both mates.
    k_mer : int -- motif length.
    fragFilter : bool -- apply minLen/maxLen fragment-length filtering.
    minLen, maxLen : int or None -- inclusive fragment-length bounds.

    Raises
    ------
    typer.Exit -- on any unrecoverable I/O failure (after logging).
    """
    # Fix: `itertools` is used below but was never imported in this module,
    # which raised NameError at runtime.
    import itertools
    from rich.table import Table
    from rich.panel import Panel
    bedOutput_path = os.path.abspath(bedOutput)
    EDM_output_path = os.path.join(os.path.dirname(bedOutput_path), 'EDM')
    BPM_output_path = os.path.join(os.path.dirname(bedOutput_path), 'BPM')
    MDS_output_path = os.path.join(os.path.dirname(bedOutput_path), 'MDS')
    try:
        os.makedirs(EDM_output_path, exist_ok=True)
        os.makedirs(BPM_output_path, exist_ok=True)
        os.makedirs(MDS_output_path, exist_ok=True)
    except Exception as e:
        logger.error(f"Failed to create output directories: {e}")
        raise typer.Exit(1)
    bases = ['A', 'C', 'T', 'G']
    # One counter per possible k-mer, initialised to zero.
    End_motif = {''.join(i): 0 for i in itertools.product(bases, repeat=k_mer)}
    Breakpoint_motif = {''.join(i): 0 for i in itertools.product(bases, repeat=k_mer)}
    try:
        bamfile = pysam.AlignmentFile(bamInput, 'rb')
    except Exception as e:
        logger.error(f"Failed to open BAM file: {e}")
        raise typer.Exit(1)
    try:
        genome = pysam.FastaFile(genome_reference)
    except Exception as e:
        logger.error(f"Failed to open genome FASTA: {e}")
        raise typer.Exit(1)
    temp_bed = bedOutput + '.tmp'
    try:
        bedWrite = open(temp_bed, 'w')
    except Exception as e:
        logger.error(f"Failed to open temp BED for writing: {e}")
        raise typer.Exit(1)
    chroms = CHR if CHR else list(bamfile.references)
    logger.info("Extracting motif features from BAM file...")
    # Rough pair total, used only to size the progress bar.
    total_pairs = bamfile.mapped // 2 if bamfile.mapped else 1000000
    motif_errors = 0
    with Progress(console=console, transient=True) as progress:
        task = progress.add_task("Processing fragments", total=total_pairs)
        for idx, pair in enumerate(read_pair_generator(bamfile)):
            try:
                read1, read2 = pair
                if read1.mapping_quality < mapQuality or read2.mapping_quality < mapQuality or read1.reference_name not in chroms:
                    continue
                read1Start = read1.reference_start
                read1End = read1.reference_end
                read2Start = read2.reference_start
                read2End = read2.reference_end
                # Orient the fragment: 5' end comes from the forward-strand
                # mate, 3' end from the reverse-strand mate.
                if not read1.is_reverse:
                    rstart = read1Start
                    rend = read2End
                    forward_end5 = read1.query_sequence[:k_mer].upper()
                    forward_end3 = read2.query_sequence[-k_mer:].upper()
                else:
                    rstart = read2Start
                    rend = read1End
                    forward_end5 = read2.query_sequence[:k_mer].upper()
                    forward_end3 = read1.query_sequence[-k_mer:].upper()
                if (rstart < 0) or (rend < 0) or (rstart >= rend):
                    continue
                if fragFilter:
                    readLen = rend - rstart
                    if (minLen and readLen < minLen) or (maxLen and readLen > maxLen):
                        continue
                gc = GCcontent(genome.fetch(read1.reference_name, rstart, rend))
                bedWrite.write(f"{read1.reference_name}\t{rstart+1}\t{rend+1}\t{gc}\n")
                # NOTE(review): both arguments are forward_end3, so the 5' end
                # motif (forward_end5) is never tallied here -- confirm whether
                # (forward_end5, forward_end3) was intended before changing.
                End_motif = get_End_motif(End_motif, forward_end3, forward_end3)
                pos = math.ceil(k_mer / 2)
                try:
                    # Breakpoint motifs span the fragment boundary: reference
                    # bases outside the fragment + read bases inside it.
                    if k_mer % 2 == 0:
                        ref_seq1 = genome.fetch(read1.reference_name, rstart - pos, rstart).upper()
                        ref_seq2 = genome.fetch(read2.reference_name, rend, rend + pos).upper()
                        Breakpoint_motif = get_Breakpoint_motif(Breakpoint_motif, ref_seq1 + forward_end5[:pos], forward_end3[-pos:] + ref_seq2)
                    else:
                        ref_seq1 = genome.fetch(read1.reference_name, rstart - pos + 1, rstart).upper()
                        ref_seq2 = genome.fetch(read2.reference_name, rend, rend + pos - 1).upper()
                        Breakpoint_motif = get_Breakpoint_motif(Breakpoint_motif, ref_seq1 + forward_end5[:pos], forward_end3[-pos:] + ref_seq2)
                except Exception as e:
                    motif_errors += 1
                    logger.warning(f"Motif extraction failed for fragment at {read1.reference_name}:{rstart}-{rend}: {e}")
                    continue
                if idx % 10000 == 0:
                    progress.update(task, advance=10000)
            except Exception as e:
                motif_errors += 1
                logger.error(f"Unexpected error during fragment processing: {e}")
                continue
        progress.update(task, completed=total_pairs)
    bedWrite.close()
    logger.info("Filtering and sorting fragments with blacklist (if provided)...")
    try:
        bedData = pybedtools.BedTool(temp_bed)
        if blacklistInput:
            blacklist_regions = pybedtools.BedTool(blacklistInput)
            # A=True drops a fragment entirely on any blacklist overlap.
            bedData = bedData.subtract(blacklist_regions, A=True)
        bedData.sort(output=bedOutput)
        os.remove(temp_bed)
    except Exception as e:
        logger.error(f"Error during BED filtering/sorting: {e}")
        raise typer.Exit(1)
    # Write EndMotif frequencies (normalised counts).
    edm_file = os.path.join(EDM_output_path, Path(bedOutput).stem + '.EndMotif')
    logger.info(f"Writing End Motif frequencies to {edm_file}")
    try:
        with open(edm_file, 'w') as f:
            total = sum(End_motif.values())
            for k, v in End_motif.items():
                f.write(f"{k}\t{v/total if total else 0}\n")
    except Exception as e:
        logger.error(f"Failed to write End Motif output: {e}")
        raise typer.Exit(1)
    # Write BreakPointMotif frequencies.
    bpm_file = os.path.join(BPM_output_path, Path(bedOutput).stem + '.BreakPointMotif')
    logger.info(f"Writing Breakpoint Motif frequencies to {bpm_file}")
    try:
        with open(bpm_file, 'w') as f:
            total = sum(Breakpoint_motif.values())
            for k, v in Breakpoint_motif.items():
                f.write(f"{k}\t{v/total if total else 0}\n")
    except Exception as e:
        logger.error(f"Failed to write Breakpoint Motif output: {e}")
        raise typer.Exit(1)
    # Write MDS (Motif Diversity Score): normalised Shannon entropy of the
    # end-motif frequency table (epsilon guards log2(0)).
    mds_file = os.path.join(MDS_output_path, Path(bedOutput).stem + '.MDS')
    logger.info(f"Writing Motif Diversity Score to {mds_file}")
    try:
        df = pd.read_csv(edm_file, sep='\t', header=None, names=['motif', 'frequency'])
        freq = df['frequency'].values
        mds = -np.sum(freq * np.log2(freq + 1e-12)) / np.log2(len(freq))
        with open(mds_file, 'w') as f:
            f.write(f"{mds}\n")
    except Exception as e:
        logger.error(f"Failed to write MDS output: {e}")
        raise typer.Exit(1)
    # Print summary
    summary_table = Table(title="Motif Extraction Summary", show_header=True, header_style="bold magenta")
    summary_table.add_column("Output Type", style="bold")
    summary_table.add_column("File Path")
    summary_table.add_row("End Motif (EDM)", edm_file)
    summary_table.add_row("Breakpoint Motif (BPM)", bpm_file)
    summary_table.add_row("Motif Diversity Score (MDS)", mds_file)
    console.print(Panel(summary_table, title="[green]Extraction Complete", subtitle=f"Motif errors: {motif_errors}", expand=False))
    logger.info("Motif feature extraction complete.")