supremo-lite 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,414 @@
1
+ """
2
+ In-silico saturation mutagenesis functionality for supremo_lite.
3
+
4
+ This module provides functions for generating saturation mutagenesis sequences,
5
+ where each position in a sequence is systematically mutated.
6
+ """
7
+
8
+ import pandas as pd
9
+ import numpy as np
10
+ import warnings
11
+ from typing import Optional, Union
12
+ from .core import nt_to_1h, TORCH_AVAILABLE
13
+ from .sequence_utils import encode_seq
14
+ from .chromosome_utils import match_chromosomes_with_report, apply_chromosome_mapping
15
+
16
+ try:
17
+ import torch
18
+ except ImportError:
19
+ pass # Already handled in core
20
+
21
+
22
def _read_bed_file(bed_regions: Union[str, pd.DataFrame]) -> pd.DataFrame:
    """
    Read a BED file, or validate a BED-style DataFrame, into a clean table.

    Args:
        bed_regions: Path to a tab-separated BED file (only the first three
            columns are read and text after '#' is treated as a comment), or a
            DataFrame that contains at least the columns chrom, start, end.

    Returns:
        DataFrame with columns: chrom, start, end. Coordinates are int64.
        Rows whose coordinates are non-numeric / non-integer, have
        start >= end, or are negative are removed (a warning is issued for
        each category that removes rows).

    Raises:
        ValueError: If the file cannot be read (this includes a missing path,
            because every read error is re-raised as ValueError), if the
            DataFrame lacks a required column, if bed_regions has an
            unsupported type, or if no valid region remains after filtering.
    """
    required_cols = ["chrom", "start", "end"]

    if isinstance(bed_regions, str):
        try:
            bed_df = pd.read_csv(
                bed_regions,
                sep="\t",
                header=None,
                comment="#",
                usecols=[0, 1, 2],  # Only read first 3 columns
                names=required_cols,
            )
        except Exception as e:
            # Keep the historical ValueError type, but chain the cause so the
            # underlying problem (e.g. FileNotFoundError) is still visible.
            raise ValueError(f"Error reading BED file '{bed_regions}': {e}") from e

    elif isinstance(bed_regions, pd.DataFrame):
        missing = [col for col in required_cols if col not in bed_regions.columns]
        if missing:
            available_cols = list(bed_regions.columns)
            raise ValueError(
                f"BED DataFrame must contain columns {required_cols}. "
                f"Found columns: {available_cols}"
            )
        # Work on a copy restricted to the required columns so the caller's
        # DataFrame is never modified.
        bed_df = bed_regions[required_cols].copy()

    else:
        raise ValueError(
            f"bed_regions must be a file path (str) or DataFrame, got {type(bed_regions)}"
        )

    # Coerce coordinates to numbers; anything unparsable becomes NaN.
    try:
        bed_df["start"] = pd.to_numeric(bed_df["start"], errors="coerce")
        bed_df["end"] = pd.to_numeric(bed_df["end"], errors="coerce")
    except Exception as e:
        raise ValueError(f"Invalid BED coordinates: {e}") from e

    # NaN compares False in the range checks below, so rows with unparsable
    # coordinates (header lines, 'track'/'browser' lines, typos) must be
    # removed explicitly; otherwise they leak through as float NaN.
    starts, ends = bed_df["start"], bed_df["end"]
    malformed = starts.isna() | ends.isna() | (starts % 1 != 0) | (ends % 1 != 0)
    if malformed.any():
        n_malformed = malformed.sum()
        warnings.warn(
            f"Found {n_malformed} BED regions with non-numeric or non-integer coordinates. "
            f"These will be removed."
        )
        bed_df = bed_df[~malformed].reset_index(drop=True)

    # Remaining values are whole numbers; store them as integers so they can
    # be used directly as slice bounds.
    bed_df["start"] = bed_df["start"].astype("int64")
    bed_df["end"] = bed_df["end"].astype("int64")

    # Check for invalid coordinates
    invalid_coords = bed_df["start"] >= bed_df["end"]
    if invalid_coords.any():
        n_invalid = invalid_coords.sum()
        warnings.warn(
            f"Found {n_invalid} BED regions with invalid coordinates (start >= end). These will be removed."
        )
        bed_df = bed_df[~invalid_coords].reset_index(drop=True)

    # Check for negative coordinates
    negative_coords = (bed_df["start"] < 0) | (bed_df["end"] < 0)
    if negative_coords.any():
        n_negative = negative_coords.sum()
        warnings.warn(
            f"Found {n_negative} BED regions with negative coordinates. These will be removed."
        )
        bed_df = bed_df[~negative_coords].reset_index(drop=True)

    if len(bed_df) == 0:
        raise ValueError("No valid BED regions found after filtering")

    return bed_df
99
+
100
+
101
def get_sm_sequences(chrom, start, end, reference_fasta, encoder=None):
    """
    Generate sequences with all alternate nucleotides at every position (saturation mutagenesis).

    Args:
        chrom: Chromosome name
        start: Start position (0-based)
        end: End position (0-based, exclusive)
        reference_fasta: Reference genome object; indexed as
            reference_fasta[chrom][start:end]. The slice may be a string or an
            object exposing the string as a ``seq`` attribute (pyfaidx-like).
        encoder: Optional custom encoding function. If provided, should accept a single
            sequence string and return encoded array with shape (4, L). Default: None

    Returns:
        Tuple of (reference one-hot, alt one-hot tensor, metadata DataFrame).
        The alt tensor stacks one copy of the reference encoding per
        substitution along a new leading axis. The metadata columns are
        chrom, window_start, window_end, variant_pos0, ref, alt, where
        variant_pos0 is the index of the substituted base within the
        extracted sequence (not a genomic coordinate). When the extracted
        sequence is empty, an empty batch and empty metadata are returned.
    """
    # Get the reference sequence
    ref_seq = reference_fasta[chrom][start:end]
    if hasattr(ref_seq, "seq"):  # Handle pyfaidx-like objects
        ref_seq = ref_seq.seq

    ref_1h = encode_seq(ref_seq, encoder)

    # The backend is fixed by the encoder output, so decide once.
    use_torch = TORCH_AVAILABLE and isinstance(ref_1h, torch.Tensor)

    alt_seqs = []
    metadata = []

    # For each position, substitute with each alternate base. Comparing on the
    # upper-cased base means soft-masked (lower-case) positions still get
    # exactly the bases that differ from the reference.
    for i, ref_nt in enumerate(ref_seq):
        for alt in sorted({"A", "C", "G", "T"} - {ref_nt.upper()}):
            if use_torch:
                alt_1h = ref_1h.clone()
                alt_1h[:, i] = torch.tensor(nt_to_1h[alt], dtype=alt_1h.dtype)
            else:
                alt_1h = ref_1h.copy()
                alt_1h[:, i] = nt_to_1h[alt]

            alt_seqs.append(alt_1h)
            metadata.append([chrom, start, end, i, ref_nt, alt])

    # Stack the alternate sequences. Stacking an empty list raises, so an
    # empty interval yields an explicit zero-length batch instead.
    if alt_seqs:
        alt_seqs_stacked = torch.stack(alt_seqs) if use_torch else np.stack(alt_seqs)
    else:
        empty_shape = (0, *ref_1h.shape)
        if use_torch:
            alt_seqs_stacked = torch.empty(empty_shape, dtype=ref_1h.dtype)
        else:
            alt_seqs_stacked = np.empty(empty_shape, dtype=ref_1h.dtype)

    # Create a DataFrame for the metadata
    metadata_df = pd.DataFrame(
        metadata, columns=["chrom", "window_start", "window_end", "variant_pos0", "ref", "alt"]
    )

    return ref_1h, alt_seqs_stacked, metadata_df
153
+
154
+
155
def _iter_sm_substitutions(seq, seq_1h, positions):
    """Yield (pos, ref_nt, alt, alt_1h) for every single-base substitution at `positions`."""
    use_torch = TORCH_AVAILABLE and isinstance(seq_1h, torch.Tensor)
    for i in positions:
        ref_nt = seq[i]
        # Upper-case for the comparison so soft-masked bases are handled too.
        for alt in sorted({"A", "C", "G", "T"} - {ref_nt.upper()}):
            # Create a clone and substitute the base
            if use_torch:
                alt_1h = seq_1h.clone()
                alt_1h[:, i] = torch.tensor(nt_to_1h[alt], dtype=alt_1h.dtype)
            else:
                alt_1h = seq_1h.copy()
                alt_1h[:, i] = nt_to_1h[alt]
            yield i, ref_nt, alt, alt_1h


def get_sm_subsequences(
    chrom,
    seq_len,
    reference_fasta,
    anchor=None,
    anchor_radius=None,
    bed_regions=None,
    encoder=None,
    auto_map_chromosomes=False,
):
    """
    Generate sequences with all alternate nucleotides at positions in specified regions
    (saturation mutagenesis).

    Supports two mutually exclusive approaches for defining mutation intervals:
    1. Anchor-based: Use anchor + anchor_radius to define a single centered region
    2. BED-based: Use bed_regions to define one or more arbitrary genomic regions

    In both cases, sequences of length seq_len are generated, centered on the mutation interval(s).

    Args:
        chrom: Chromosome name
        seq_len: Total sequence length for each window
        reference_fasta: Reference genome object
        anchor: Anchor position (0-based). Required when using anchor_radius.
                Must be provided together with anchor_radius.
                Mutually exclusive with bed_regions.
        anchor_radius: Number of bases around the anchor to mutate. The mutated
                      positions are the half-open range
                      [anchor - anchor_radius, anchor + anchor_radius), i.e.
                      2 * anchor_radius positions.
                      Required when using anchor. Must be provided together with anchor.
                      Mutually exclusive with bed_regions.
        bed_regions: BED file path or DataFrame defining mutation intervals.
                    BED format: chrom, start, end (0-based, half-open intervals).
                    Each BED region defines positions to mutate, centered in a seq_len window.
                    Windows that would extend past a chromosome end are shifted to stay
                    inside the chromosome; regions whose window cannot reach seq_len
                    are skipped with a warning.
                    Mutually exclusive with anchor + anchor_radius.
        encoder: Optional custom encoding function. If provided, should accept a single
                sequence string and return encoded array with shape (4, L). Default: None
        auto_map_chromosomes: Automatically map chromosome names between reference and BED file
                             when they don't match exactly (e.g., 'chr1' <-> '1', 'chrM' <-> 'MT').
                             Only applies when bed_regions is provided. Default: False.

    Returns:
        Tuple of (reference one-hot, alt one-hot tensor, metadata DataFrame).
        Metadata columns are chrom, window_start, window_end, variant_pos0, ref, alt;
        variant_pos0 is the index of the substituted base *within its window*.
        In BED mode the returned reference is the encoding of the first window that
        was encoded (an all-zero (4, seq_len) placeholder if none was), so it does
        not describe later windows; use window_start / window_end in the metadata
        to tell windows apart.

    Raises:
        ValueError: If invalid parameter combinations are provided
        AssertionError: In anchor mode, if the extracted sequence is not seq_len long
            or anchor_radius exceeds the distance from the window start to the anchor
        ChromosomeMismatchError: If auto_map_chromosomes=False and chromosome names in BED file
                                and reference don't match exactly (only when bed_regions is provided)

    Examples:
        # Approach 1: Anchor-based (single region)
        ref, alts, meta = get_sm_subsequences(
            chrom='chr1',
            seq_len=200,
            reference_fasta=ref,
            anchor=1050,
            anchor_radius=10  # Mutate positions 1040-1059 in a 200bp window
        )

        # Approach 2: BED-based (multiple regions)
        ref, alts, meta = get_sm_subsequences(
            chrom='chr1',
            seq_len=200,
            reference_fasta=ref,
            bed_regions='regions.bed'  # Each region centered in 200bp window
        )
    """
    # Validate parameter combinations
    has_anchor = anchor is not None
    has_anchor_radius = anchor_radius is not None
    has_bed = bed_regions is not None

    if (has_anchor or has_anchor_radius) and has_bed:
        raise ValueError(
            "Cannot use both (anchor + anchor_radius) and bed_regions. "
            "These are mutually exclusive approaches."
        )
    if has_anchor != has_anchor_radius:
        raise ValueError(
            "anchor and anchor_radius must be provided together. "
            "Both are required when using the anchor-based approach."
        )
    if not (has_anchor or has_bed):
        raise ValueError(
            "Must provide either (anchor + anchor_radius) or bed_regions."
        )

    alt_seqs = []
    metadata = []

    if has_anchor:
        # APPROACH 1: Anchor-based (single region)
        # Calculate sequence boundaries centered on anchor
        start = anchor - seq_len // 2
        end = start + seq_len

        # Get the reference sequence
        ref_seq = reference_fasta[chrom][start:end]
        if hasattr(ref_seq, "seq"):  # Handle pyfaidx-like objects
            ref_seq = ref_seq.seq

        # Raised explicitly (rather than via `assert`) so the check still runs
        # under `python -O`; the exception type and message are unchanged.
        if len(ref_seq) != seq_len:
            raise AssertionError(
                f"Expected sequence length {seq_len}, got {len(ref_seq)}"
            )

        ref_1h = encode_seq(ref_seq, encoder)

        # Position of the anchor inside the window
        anchor_offset = anchor - start
        if not (anchor_radius <= anchor_offset):
            raise AssertionError("Anchor radius exceeds start of sequence")

        # Window-relative positions to mutate: [offset - radius, offset + radius)
        mut_positions = range(
            anchor_offset - anchor_radius, anchor_offset + anchor_radius
        )
        for i, ref_nt, alt, alt_1h in _iter_sm_substitutions(
            ref_seq, ref_1h, mut_positions
        ):
            alt_seqs.append(alt_1h)
            metadata.append([chrom, start, end, i, ref_nt, alt])

    else:
        # APPROACH 2: BED-based (multiple regions)
        # Each BED region gets its own seq_len window centered on it
        ref_1h = None  # Will be set for first region

        # Parse BED file/DataFrame
        bed_df = _read_bed_file(bed_regions)

        # Apply chromosome name matching
        mapping, _unmatched = match_chromosomes_with_report(
            {chrom},
            set(bed_df["chrom"].unique()),
            verbose=False,
            auto_map_chromosomes=auto_map_chromosomes,
        )
        if mapping:
            bed_df = apply_chromosome_mapping(bed_df, mapping)

        # Filter to current chromosome
        chrom_bed_regions = bed_df[bed_df["chrom"] == chrom]

        if len(chrom_bed_regions) == 0:
            warnings.warn(
                f"No BED regions found for chromosome {chrom}. No mutagenesis will be performed."
            )
        else:
            # The chromosome record and its length do not change per region.
            chrom_obj = reference_fasta[chrom]
            chrom_len = len(chrom_obj) if hasattr(chrom_obj, '__len__') else len(chrom_obj.seq)

            for region_start, region_end in zip(
                chrom_bed_regions["start"], chrom_bed_regions["end"]
            ):
                region_center = (region_start + region_end) // 2

                # Calculate sequence window centered on this BED region
                window_start = region_center - seq_len // 2
                window_end = window_start + seq_len

                # Adjust window to stay within chromosome bounds
                if window_start < 0:
                    window_start = 0
                    window_end = min(seq_len, chrom_len)
                elif window_end > chrom_len:
                    window_end = chrom_len
                    window_start = max(0, chrom_len - seq_len)

                # Get the reference sequence for this window
                region_seq = chrom_obj[window_start:window_end]
                if hasattr(region_seq, "seq"):
                    region_seq = region_seq.seq

                if len(region_seq) != seq_len:
                    warnings.warn(
                        f"Region {chrom}:{region_start}-{region_end} produces sequence of length "
                        f"{len(region_seq)} instead of {seq_len} (chromosome length: {chrom_len}). "
                        f"Skipping this region."
                    )
                    continue

                region_1h = encode_seq(region_seq, encoder)

                # Set ref_1h for the first valid region (for return value)
                if ref_1h is None:
                    ref_1h = region_1h

                # Calculate which positions to mutate (BED region relative to window)
                mut_start_rel = max(0, region_start - window_start)
                mut_end_rel = min(seq_len, region_end - window_start)

                # Check if BED region overlaps with the extracted window
                if mut_start_rel >= mut_end_rel:
                    warnings.warn(
                        f"BED region {chrom}:{region_start}-{region_end} is outside chromosome bounds "
                        f"(length: {chrom_len}). Skipping this region."
                    )
                    continue

                # Mutate positions within this BED region
                for i, ref_nt, alt, alt_1h in _iter_sm_substitutions(
                    region_seq, region_1h, range(mut_start_rel, mut_end_rel)
                ):
                    alt_seqs.append(alt_1h)
                    metadata.append([chrom, window_start, window_end, i, ref_nt, alt])

        # If no regions were processed, create empty ref_1h
        if ref_1h is None:
            if TORCH_AVAILABLE:
                ref_1h = torch.zeros((4, seq_len), dtype=torch.float32)
            else:
                ref_1h = np.zeros((4, seq_len), dtype=np.float32)

    # Stack the alternate sequences
    use_torch = TORCH_AVAILABLE and isinstance(ref_1h, torch.Tensor)
    if alt_seqs:
        alt_seqs_stacked = torch.stack(alt_seqs) if use_torch else np.stack(alt_seqs)
    elif use_torch:
        # No mutations generated (e.g., due to BED filtering)
        alt_seqs_stacked = torch.empty((0, 4, seq_len), dtype=ref_1h.dtype)
    else:
        alt_seqs_stacked = np.empty((0, 4, seq_len), dtype=ref_1h.dtype)

    # Create a DataFrame for the metadata
    metadata_df = pd.DataFrame(
        metadata, columns=["chrom", "window_start", "window_end", "variant_pos0", "ref", "alt"]
    )

    return ref_1h, alt_seqs_stacked, metadata_df