geney 1.3.79__py2.py3-none-any.whl → 1.4.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- geney/Gene.py +9 -10
- geney/Oncosplice.py +400 -0
- geney/SpliceSimulator.py +407 -0
- geney/Transcript.py +54 -56
- geney/__init__.py +47 -19
- geney/_config_setup.py +16 -0
- geney/_graphic_utils.py +269 -0
- geney/_gtex_utils.py +68 -0
- geney/_immune_utils.py +125 -0
- geney/{oncosplice.py → _oncosplice.py} +199 -156
- geney/_splicing_utils.py +693 -0
- geney/_survival_utils.py +143 -0
- geney/_tcga_utils.py +405 -0
- geney/_tis_utils.py +172 -0
- geney/immune_utils.py +1 -1
- geney/pipelines.py +66 -0
- geney/power_utils.py +1 -1
- geney/utils/Fasta_segment.py +260 -0
- geney/utils/SeqMats.py +423 -0
- geney/utils/TranscriptLibrary.py +55 -0
- geney/utils/__init__.py +20 -0
- geney/utils/mutation_utils.py +104 -0
- geney/utils/pangolin_utils.py +173 -0
- geney/utils/spliceai_utils.py +123 -0
- geney/utils/splicing_utils.py +525 -0
- geney/utils/utils.py +89 -0
- {geney-1.3.79.dist-info → geney-1.4.1.dist-info}/METADATA +1 -1
- geney-1.4.1.dist-info/RECORD +51 -0
- {geney-1.3.79.dist-info → geney-1.4.1.dist-info}/WHEEL +1 -1
- geney-1.3.79.dist-info/RECORD +0 -31
- {geney-1.3.79.dist-info → geney-1.4.1.dist-info}/top_level.txt +0 -0
geney/utils/SeqMats.py
ADDED
|
@@ -0,0 +1,423 @@
|
|
|
1
|
+
__all__ = ['SeqMat', 'format_mut_id']
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from collections import defaultdict
|
|
5
|
+
from typing import Optional, Union, List, Tuple
|
|
6
|
+
import numpy as np
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
def format_mut_id(text):
    """Parse a ``gene:chrom:pos:ref:alt`` mutation id.

    Args:
        text: Mutation identifier such as ``"TP53:17:7579472:G:A"``. The
            first two colon-separated fields are accepted but not returned.

    Returns:
        A dict with keys ``'pos'`` (int), ``'ref'`` and ``'alt'`` (str), or
        ``None`` (after printing ``"No match"``) when ``text`` does not fit
        the expected layout.
    """
    import re

    # Alleles may only contain A/C/G/T/N or '-'; the position must be digits.
    pattern = r'^[^:]+:[^:]+:(\d+):([ACGTN\-]+):([ACGTN\-]+)$'
    parsed = re.match(pattern, text)

    if parsed is None:
        print("No match")
        return None

    raw_position, ref_allele, alt_allele = parsed.groups()
    return {'pos': int(raw_position), 'ref': ref_allele, 'alt': alt_allele}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass(slots=True)
|
|
29
|
+
class SeqMat:
|
|
30
|
+
"""Represents a genomic sequence matrix used for training."""
|
|
31
|
+
# Metadata fields (uncomment and/or extend as needed)
|
|
32
|
+
name: str = field(default="Unnamed Sequence", metadata={"description": "Name of the sequence"})
|
|
33
|
+
version: str = field(default="1.0", metadata={"description": "Version of the dataset"})
|
|
34
|
+
source: str = field(default="Unknown", metadata={"description": "Source of the sequence data"})
|
|
35
|
+
notes: dict = field(default_factory=dict, metadata={"description": "User-defined metadata dictionary"})
|
|
36
|
+
|
|
37
|
+
seq_array: np.ndarray = field(init=False, repr=False)
|
|
38
|
+
insertion_counters: dict = field(default_factory=lambda: defaultdict(int), init=False, repr=False)
|
|
39
|
+
rev: bool = field(default=False, init=False, repr=False)
|
|
40
|
+
|
|
41
|
+
predicted_splicing: pd.DataFrame = field(init=False, repr=False)
|
|
42
|
+
|
|
43
|
+
def __init__(
|
|
44
|
+
self,
|
|
45
|
+
nucleotides: str,
|
|
46
|
+
index: np.ndarray,
|
|
47
|
+
conservation: Optional[np.ndarray] = None,
|
|
48
|
+
reference_nucleotides: Optional[np.ndarray] = None,
|
|
49
|
+
notes: Optional[dict] = None,
|
|
50
|
+
source: Optional[str] = None,
|
|
51
|
+
rev: Optional[bool] = False,
|
|
52
|
+
name: Optional[str] = 'wild_type',
|
|
53
|
+
version: Optional[str] = 'none'
|
|
54
|
+
|
|
55
|
+
) -> None:
|
|
56
|
+
self.predicted_splicing = None
|
|
57
|
+
nucleotides = np.array(list(nucleotides))
|
|
58
|
+
L = nucleotides.shape[0]
|
|
59
|
+
if index.shape[0] != L:
|
|
60
|
+
raise ValueError("Indices array length must match nucleotide sequence length.")
|
|
61
|
+
if conservation is not None and conservation.shape[0] != L:
|
|
62
|
+
raise ValueError("Conservation vector length must match sequence length.")
|
|
63
|
+
if reference_nucleotides is not None and reference_nucleotides.shape[0] != L:
|
|
64
|
+
raise ValueError("Reference nucleotide vector length must match sequence length.")
|
|
65
|
+
|
|
66
|
+
dtype = np.dtype([
|
|
67
|
+
("nt", "S1"),
|
|
68
|
+
("index", np.float64),
|
|
69
|
+
("ref", "S1"),
|
|
70
|
+
("cons", np.float32),
|
|
71
|
+
("valid_mask", bool),
|
|
72
|
+
])
|
|
73
|
+
|
|
74
|
+
self.seq_array = np.empty(L, dtype=dtype)
|
|
75
|
+
self.seq_array["nt"] = nucleotides
|
|
76
|
+
# Use provided reference nucleotides if available.
|
|
77
|
+
self.seq_array["ref"] = nucleotides if reference_nucleotides is None else reference_nucleotides
|
|
78
|
+
self.seq_array["index"] = index
|
|
79
|
+
self.seq_array["cons"] = np.nan if conservation is None else conservation
|
|
80
|
+
self.seq_array["valid_mask"] = self.seq_array["nt"] != b"-"
|
|
81
|
+
self.insertion_counters = defaultdict(int)
|
|
82
|
+
|
|
83
|
+
self.source = source if source is not None else "Unknown"
|
|
84
|
+
self.notes = notes if notes is not None else {}
|
|
85
|
+
self.name = name
|
|
86
|
+
self.rev = rev
|
|
87
|
+
self.version = version
|
|
88
|
+
|
|
89
|
+
def __len__(self) -> int:
|
|
90
|
+
return int(self.seq_array["valid_mask"].sum())
|
|
91
|
+
|
|
92
|
+
def __repr__(self):
|
|
93
|
+
return f"<SeqMat: {self.seq}>"
|
|
94
|
+
|
|
95
|
+
def __str__(self):
|
|
96
|
+
return self.seq
|
|
97
|
+
|
|
98
|
+
def get_metadata(self) -> dict:
|
|
99
|
+
"""Retrieve all metadata as a dictionary."""
|
|
100
|
+
return {
|
|
101
|
+
"name": self.name,
|
|
102
|
+
"source": self.source,
|
|
103
|
+
"version": self.version,
|
|
104
|
+
"notes": self.notes
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
@property
|
|
108
|
+
def seq(self) -> str:
|
|
109
|
+
return self.seq_array["nt"][self.seq_array["valid_mask"]].tobytes().decode()
|
|
110
|
+
|
|
111
|
+
@property
|
|
112
|
+
def index(self) -> np.ndarray:
|
|
113
|
+
return self.seq_array["index"][self.seq_array["valid_mask"]]
|
|
114
|
+
|
|
115
|
+
@property
|
|
116
|
+
def conservation(self) -> np.ndarray:
|
|
117
|
+
return self.seq_array["cons"][self.seq_array["valid_mask"]]
|
|
118
|
+
|
|
119
|
+
@property
|
|
120
|
+
def max_index(self) -> float:
|
|
121
|
+
return self.seq_array["index"].max()
|
|
122
|
+
|
|
123
|
+
@property
|
|
124
|
+
def min_index(self) -> float:
|
|
125
|
+
return self.seq_array["index"].min()
|
|
126
|
+
|
|
127
|
+
@property
|
|
128
|
+
def start(self) -> float:
|
|
129
|
+
return self.min_index
|
|
130
|
+
|
|
131
|
+
@property
|
|
132
|
+
def end(self) -> float:
|
|
133
|
+
return self.max_index
|
|
134
|
+
|
|
135
|
+
@property
|
|
136
|
+
def mutated_positions(self) -> np.ndarray:
|
|
137
|
+
return (self.seq_array["ref"] != self.seq_array["nt"])[self.seq_array["valid_mask"]].astype(int)
|
|
138
|
+
|
|
139
|
+
def clone(self, start: Optional[int] = None, end: Optional[int] = None) -> "SeqMat":
|
|
140
|
+
cloned = SeqMat.__new__(SeqMat)
|
|
141
|
+
if start is not None and end is not None:
|
|
142
|
+
cloned.seq_array = self.seq_array[(self.seq_array["index"] >= start) & (self.seq_array["index"] <= end)]
|
|
143
|
+
else:
|
|
144
|
+
cloned.seq_array = self.seq_array.copy()
|
|
145
|
+
cloned.insertion_counters = defaultdict(int)
|
|
146
|
+
cloned.name = self.name
|
|
147
|
+
cloned.source = self.source
|
|
148
|
+
cloned.version = self.version
|
|
149
|
+
cloned.notes = self.notes.copy()
|
|
150
|
+
cloned.rev = self.rev
|
|
151
|
+
return cloned
|
|
152
|
+
|
|
153
|
+
def apply_mutation(self, pos: int, ref: str, alt: str, only_snps: bool = False):
|
|
154
|
+
"""
|
|
155
|
+
Applies a mutation (SNP, substitution, insertion, or deletion) to the sequence.
|
|
156
|
+
|
|
157
|
+
Parameters:
|
|
158
|
+
pos (int): The reference position where the mutation should occur.
|
|
159
|
+
ref (str): The reference allele (use '-' for insertions).
|
|
160
|
+
alt (str): The alternate allele (use '-' for deletions).
|
|
161
|
+
only_snps (bool): If True, only SNP substitutions are allowed; indels are ignored.
|
|
162
|
+
|
|
163
|
+
Returns:
|
|
164
|
+
SeqMat: The mutated sequence matrix.
|
|
165
|
+
|
|
166
|
+
The method normalizes the mutation (dropping any shared prefix) and then applies:
|
|
167
|
+
- A SNP/substitution if both alleles are non-gap.
|
|
168
|
+
- An insertion if ref is '-' (after normalization).
|
|
169
|
+
- A deletion if alt is '-' (after normalization).
|
|
170
|
+
|
|
171
|
+
For insertions, new rows are added with fractional indices computed from an insertion counter.
|
|
172
|
+
For deletions, the corresponding rows are removed.
|
|
173
|
+
"""
|
|
174
|
+
return_to_rc = False
|
|
175
|
+
if self.rev:
|
|
176
|
+
return_to_rc = True
|
|
177
|
+
self.reverse_complement()
|
|
178
|
+
|
|
179
|
+
# Normalize shared prefix (similar to left-alignment in VCFs)
|
|
180
|
+
if ref and alt and ref[0] == alt[0]:
|
|
181
|
+
pos += 1
|
|
182
|
+
ref = ref[1:] or "-"
|
|
183
|
+
alt = alt[1:] or "-"
|
|
184
|
+
|
|
185
|
+
# Case 1: SNP or multi-base substitution
|
|
186
|
+
if ref != "-" and alt != "-":
|
|
187
|
+
if len(ref) == len(alt):
|
|
188
|
+
# print('Inserting SNP')
|
|
189
|
+
pos_idx = np.searchsorted(self.seq_array["index"], pos)
|
|
190
|
+
end_idx = pos_idx + len(ref)
|
|
191
|
+
if end_idx > len(self.seq_array):
|
|
192
|
+
raise ValueError(f"Substitution range exceeds sequence length at position {pos}.")
|
|
193
|
+
segment = self.seq_array["ref"][pos_idx:end_idx].tobytes().decode()
|
|
194
|
+
if segment != ref:
|
|
195
|
+
raise ValueError(f"Reference mismatch at position {pos}: expected '{ref}', found '{segment}'")
|
|
196
|
+
for i, nt in enumerate(alt):
|
|
197
|
+
self.seq_array["nt"][pos_idx + i] = nt.encode()
|
|
198
|
+
else:
|
|
199
|
+
raise ValueError("Substitution mutations must have alleles of equal length.")
|
|
200
|
+
|
|
201
|
+
# Case 2: Insertion (ref is '-' means nothing was present, and we need to add bases)
|
|
202
|
+
elif ref == "-" and alt != "-":
|
|
203
|
+
if only_snps:
|
|
204
|
+
return self # Skip if indels are not allowed.
|
|
205
|
+
pos_idx = np.searchsorted(self.seq_array["index"], pos)
|
|
206
|
+
insertion_count = self.insertion_counters[pos]
|
|
207
|
+
eps = 1e-6
|
|
208
|
+
new_rows = []
|
|
209
|
+
for i, nt in enumerate(alt):
|
|
210
|
+
new_index = pos + (insertion_count + i + 1) * eps
|
|
211
|
+
new_row = (nt.encode(), new_index, b"-", np.float32(np.nan), True)
|
|
212
|
+
new_rows.append(new_row)
|
|
213
|
+
rows = list(self.seq_array)
|
|
214
|
+
rows.extend(new_rows)
|
|
215
|
+
new_seq_array = np.array(rows, dtype=self.seq_array.dtype)
|
|
216
|
+
new_seq_array.sort(order="index")
|
|
217
|
+
self.seq_array = new_seq_array
|
|
218
|
+
self.insertion_counters[pos] += len(alt)
|
|
219
|
+
|
|
220
|
+
# Case 3: Deletion (alt is '-' means bases are to be removed)
|
|
221
|
+
elif alt == "-" and ref != "-":
|
|
222
|
+
if only_snps:
|
|
223
|
+
return self # Skip if indels are not allowed.
|
|
224
|
+
pos_idx = np.searchsorted(self.seq_array["index"], pos)
|
|
225
|
+
end_idx = pos_idx + len(ref)
|
|
226
|
+
if end_idx > len(self.seq_array):
|
|
227
|
+
raise ValueError(f"Deletion range exceeds sequence length at position {pos}.")
|
|
228
|
+
segment = self.seq_array["ref"][pos_idx:end_idx].tobytes().decode()
|
|
229
|
+
if segment != ref:
|
|
230
|
+
raise ValueError(
|
|
231
|
+
f"Reference mismatch for deletion at position {pos}: expected '{ref}', found '{segment}'")
|
|
232
|
+
self.seq_array = np.delete(self.seq_array, np.s_[pos_idx:end_idx])
|
|
233
|
+
else:
|
|
234
|
+
raise ValueError("Unsupported mutation type. Provide valid ref and alt values.")
|
|
235
|
+
|
|
236
|
+
self.seq_array["valid_mask"] = self.seq_array["nt"] != b"-"
|
|
237
|
+
if return_to_rc:
|
|
238
|
+
self.reverse_complement()
|
|
239
|
+
|
|
240
|
+
return self
|
|
241
|
+
|
|
242
|
+
def __getitem__(self, key: Union[int, slice]) -> np.ndarray:
|
|
243
|
+
if isinstance(key, int):
|
|
244
|
+
pos_idx = np.where(self.seq_array["index"] == key)[0]
|
|
245
|
+
if pos_idx.size == 0:
|
|
246
|
+
raise IndexError(f"Position {key} not found in sequence.")
|
|
247
|
+
return self.seq_array[pos_idx[0]]
|
|
248
|
+
elif isinstance(key, slice):
|
|
249
|
+
start, stop = key.start, key.stop
|
|
250
|
+
if start is None:
|
|
251
|
+
start = self.seq_array["index"].min()
|
|
252
|
+
if stop is None:
|
|
253
|
+
stop = self.seq_array["index"].max()
|
|
254
|
+
return self.seq_array[(self.seq_array["index"] >= start) & (self.seq_array["index"] <= stop)]
|
|
255
|
+
else:
|
|
256
|
+
raise TypeError("Indexing must be an integer or a slice.")
|
|
257
|
+
|
|
258
|
+
def complement(self) -> "SeqMat":
|
|
259
|
+
comp_dict = {b"A": b"T", b"T": b"A", b"C": b"G", b"G": b"C", b"-": b"-", b"N": b"N"}
|
|
260
|
+
comp_seq = np.array([comp_dict[nt] for nt in self.seq_array["nt"]], dtype="S1")
|
|
261
|
+
new_instance = self.clone()
|
|
262
|
+
new_instance.seq_array["nt"] = comp_seq
|
|
263
|
+
return new_instance
|
|
264
|
+
|
|
265
|
+
def reverse_complement(self) -> "SeqMat":
|
|
266
|
+
rev_comp_seq = self.complement().seq_array[::-1]
|
|
267
|
+
self.seq_array = rev_comp_seq.copy()
|
|
268
|
+
self.rev = not self.rev
|
|
269
|
+
return self
|
|
270
|
+
|
|
271
|
+
# def splice_out(self, introns: List[Tuple[int, int]]) -> "SeqMat":
|
|
272
|
+
# """
|
|
273
|
+
# Splices out regions from the sequence corresponding to the given intron boundaries.
|
|
274
|
+
#
|
|
275
|
+
# Args:
|
|
276
|
+
# introns (List[Tuple[int, int]]): List of (start, end) intron boundaries to remove.
|
|
277
|
+
# Coordinates should match the 'index' field.
|
|
278
|
+
#
|
|
279
|
+
# Returns:
|
|
280
|
+
# SeqMat: A new instance with the intron regions removed.
|
|
281
|
+
# """
|
|
282
|
+
# mask = np.ones(len(self.seq_array), dtype=bool)
|
|
283
|
+
#
|
|
284
|
+
# for start, end in introns:
|
|
285
|
+
# mask &= ~((self.seq_array["index"] >= start) & (self.seq_array["index"] <= end))
|
|
286
|
+
#
|
|
287
|
+
# new_instance = self.clone()
|
|
288
|
+
# new_instance.seq_array = self.seq_array[mask].copy()
|
|
289
|
+
# return new_instance
|
|
290
|
+
|
|
291
|
+
def cut_out(self, introns: List[Tuple[int, int]]) -> "SeqMat":
|
|
292
|
+
"""
|
|
293
|
+
Splices out regions from the sequence corresponding to the given intron boundaries.
|
|
294
|
+
|
|
295
|
+
Handles reverse-complemented sequences by interpreting introns in reverse as well.
|
|
296
|
+
|
|
297
|
+
Args:
|
|
298
|
+
introns (List[Tuple[int, int]]): List of (start, end) intron boundaries.
|
|
299
|
+
These are always genomic (absolute) coordinates,
|
|
300
|
+
regardless of strand direction.
|
|
301
|
+
|
|
302
|
+
Returns:
|
|
303
|
+
SeqMat: A new instance with the intron regions removed.
|
|
304
|
+
"""
|
|
305
|
+
# In reverse orientation, flip intron direction for comparison
|
|
306
|
+
if self.rev:
|
|
307
|
+
introns = [(end, start) if start > end else (start, end) for (start, end) in introns]
|
|
308
|
+
|
|
309
|
+
mask = np.ones(len(self.seq_array), dtype=bool)
|
|
310
|
+
|
|
311
|
+
for start, end in introns:
|
|
312
|
+
lo, hi = min(start, end) + 1, max(start, end) - 1
|
|
313
|
+
mask &= ~((self.seq_array["index"] >= lo) & (self.seq_array["index"] <= hi))
|
|
314
|
+
|
|
315
|
+
new_instance = self.clone()
|
|
316
|
+
new_instance.seq_array = self.seq_array[mask].copy()
|
|
317
|
+
return new_instance
|
|
318
|
+
|
|
319
|
+
def open_reading_frame(self, tis: int) -> "SeqMat":
|
|
320
|
+
"""
|
|
321
|
+
Extracts the open reading frame starting from the translation initiation site (TIS)
|
|
322
|
+
until the first in-frame stop codon.
|
|
323
|
+
|
|
324
|
+
Args:
|
|
325
|
+
tis (int): Genomic position of the translation initiation site (start codon).
|
|
326
|
+
|
|
327
|
+
Returns:
|
|
328
|
+
SeqMat: A new SeqMat instance containing the ORF (from TIS to stop codon inclusive).
|
|
329
|
+
"""
|
|
330
|
+
if tis not in self.seq_array["index"]:
|
|
331
|
+
print(f"Warning: TIS position {tis} not found, returning default.")
|
|
332
|
+
return self.clone(start=0, end=3)
|
|
333
|
+
|
|
334
|
+
# Extract nucleotide sequence and indices starting from TIS
|
|
335
|
+
mask = self.seq_array["index"] >= tis if not self.rev else self.seq_array["index"] <= tis
|
|
336
|
+
coding_part = self.seq_array[mask]
|
|
337
|
+
coding_seq = coding_part["nt"].tobytes().decode()
|
|
338
|
+
|
|
339
|
+
# Read codons in-frame
|
|
340
|
+
for i in range(0, len(coding_seq) - 2, 3):
|
|
341
|
+
codon = coding_seq[i:i + 3]
|
|
342
|
+
if codon in {"TAA", "TAG", "TGA"}:
|
|
343
|
+
# Determine index range for this ORF
|
|
344
|
+
start = coding_part["index"][0]
|
|
345
|
+
stop = coding_part["index"][i + 2]
|
|
346
|
+
lo, hi = sorted((start, stop))
|
|
347
|
+
return self.clone(start=lo, end=hi)
|
|
348
|
+
|
|
349
|
+
raise ValueError("No in-frame stop codon found after the TIS.")
|
|
350
|
+
|
|
351
|
+
def predict_splicing(self, position: int, engine='spliceai', context=7500, inplace=False): #, reference_donors=None, reference_acceptors=None) -> pd.DataFrame:
|
|
352
|
+
"""
|
|
353
|
+
Predict splicing probabilities at a given position using the specified engine.
|
|
354
|
+
|
|
355
|
+
Args:
|
|
356
|
+
position (int): The genomic position to predict splicing probabilities for.
|
|
357
|
+
engine (str): The prediction engine to use. Supported: 'spliceai', 'pangolin'.
|
|
358
|
+
context (int): The length of the target central region (default: 7500).
|
|
359
|
+
format (str): Output format for the splicing engine results.
|
|
360
|
+
|
|
361
|
+
Returns:
|
|
362
|
+
pd.DataFrame: A DataFrame containing:
|
|
363
|
+
- position: The genomic position
|
|
364
|
+
- donor_prob: Probability of being a donor splice site
|
|
365
|
+
- acceptor_prob: Probability of being an acceptor splice site
|
|
366
|
+
- nucleotides: The nucleotide sequence at that position
|
|
367
|
+
|
|
368
|
+
Raises:
|
|
369
|
+
ValueError: If an unsupported engine is provided.
|
|
370
|
+
IndexError: If the position is not found in the sequence.
|
|
371
|
+
"""
|
|
372
|
+
# Retrieve extended context (includes flanks) around the position.
|
|
373
|
+
# seq, indices = self.get_context(position, context=context, padding='N')
|
|
374
|
+
target = self.clone(position - context, position + context)
|
|
375
|
+
# print(len(target.seq))
|
|
376
|
+
seq, indices = target.seq, target.index
|
|
377
|
+
# print(len(seq))
|
|
378
|
+
# rel_pos = np.where(indices == position)[0][0]
|
|
379
|
+
# print(rel_pos)
|
|
380
|
+
rel_pos = np.abs(indices - position).argmin()
|
|
381
|
+
# print(rel_pos, len(seq))
|
|
382
|
+
left_missing, right_missing = max(0, context - rel_pos), max(0, context - (len(seq) - rel_pos))
|
|
383
|
+
# print(left_missing, right_missing)
|
|
384
|
+
if left_missing > 0 or right_missing > 0:
|
|
385
|
+
step = -1 if self.rev else 1
|
|
386
|
+
|
|
387
|
+
if left_missing > 0:
|
|
388
|
+
left_pad = np.arange(indices[0] - step * left_missing, indices[0], step)
|
|
389
|
+
else:
|
|
390
|
+
left_pad = np.array([], dtype=indices.dtype)
|
|
391
|
+
|
|
392
|
+
if right_missing > 0:
|
|
393
|
+
right_pad = np.arange(indices[-1] + step, indices[-1] + step * (right_missing + 1), step)
|
|
394
|
+
else:
|
|
395
|
+
right_pad = np.array([], dtype=indices.dtype)
|
|
396
|
+
|
|
397
|
+
seq = 'N' * left_missing + seq + 'N' * right_missing
|
|
398
|
+
indices = np.concatenate([left_pad, indices, right_pad])
|
|
399
|
+
|
|
400
|
+
# Run the splicing prediction engine (function assumed to be defined externally)
|
|
401
|
+
from .splicing_utils import run_splicing_engine
|
|
402
|
+
donor_probs, acceptor_probs = run_splicing_engine(seq, engine)
|
|
403
|
+
# Trim off the fixed flanks before returning results.
|
|
404
|
+
seq = seq[5000:-5000]
|
|
405
|
+
indices = indices[5000:-5000]
|
|
406
|
+
df = pd.DataFrame({
|
|
407
|
+
'position': indices,
|
|
408
|
+
'donor_prob': donor_probs,
|
|
409
|
+
'acceptor_prob': acceptor_probs,
|
|
410
|
+
'nucleotides': list(seq)
|
|
411
|
+
}).set_index('position').round(3)
|
|
412
|
+
# if reference_donors is not None:
|
|
413
|
+
# df['ref_donor'] = df.index.isin(reference_donors).astype(int)
|
|
414
|
+
# if reference_acceptors is not None:
|
|
415
|
+
# df['ref_acceptor'] = df.index.isin(reference_acceptors).astype(int)
|
|
416
|
+
|
|
417
|
+
df.attrs['name'] = self.name
|
|
418
|
+
if inplace:
|
|
419
|
+
self.predicted_splicing = df
|
|
420
|
+
return self
|
|
421
|
+
else:
|
|
422
|
+
return df
|
|
423
|
+
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
__all__ = ['TranscriptLibrary']
|
|
2
|
+
|
|
3
|
+
from .splicing_utils import adjoin_splicing_outcomes
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class TranscriptLibrary:
    """Collection of transcript variants derived from one reference transcript.

    Holds an unmodified copy (``ref``), a copy carrying every mutation
    (``event``) and, when more than one mutation is supplied, one copy per
    individual mutation (``mut1``, ``mut2``, ...). Each variant is reachable
    both as an attribute and through ``library[name]``.
    """

    def __init__(self, reference_transcript, mutations):
        """Clone the reference and apply ``mutations`` to the clones.

        Args:
            reference_transcript: Object providing ``clone()`` whose clones
                expose ``pre_mrna.apply_mutation(pos, ref, alt)``.
            mutations: Sized iterable of ``(pos, ref, alt)`` triples.
        """
        self.ref = reference_transcript.clone()
        self.event = reference_transcript.clone()
        self._transcripts = {'ref': self.ref, 'event': self.event}

        for number, (pos, ref, alt) in enumerate(mutations, start=1):
            # 'event' accumulates every mutation.
            self.event.pre_mrna.apply_mutation(pos, ref, alt)
            if len(mutations) <= 1:
                continue
            # With several mutations, also keep each one on its own clone.
            label = f'mut{number}'
            single = reference_transcript.clone()
            single.pre_mrna.apply_mutation(pos, ref, alt)
            self._transcripts[label] = single
            setattr(self, label, single)

    def predict_splicing(self, pos, engine='spliceai', inplace=False):
        """Run splicing prediction on every variant and combine the results.

        Each variant's ``pre_mrna.predict_splicing`` is called with
        ``inplace=True``; whatever those calls return is kept in
        ``splicing_predictions``. The per-variant ``predicted_splicing``
        tables are then passed, together with ``ref``, to
        ``adjoin_splicing_outcomes`` and stored as ``splicing_results``.

        Returns:
            ``self`` when ``inplace`` is true, otherwise ``splicing_results``.
        """
        predictions = {}
        for label, transcript in self._transcripts.items():
            predictions[label] = transcript.pre_mrna.predict_splicing(pos, engine=engine, inplace=True)
        self.splicing_predictions = predictions

        tables = {
            label: transcript.pre_mrna.predicted_splicing
            for label, transcript in self._transcripts.items()
        }
        self.splicing_results = adjoin_splicing_outcomes(tables, self.ref)

        return self if inplace else self.splicing_results

    def get_event_columns(self, event_name, sites=('donors', 'acceptors')):
        """
        Select, for each requested site, the ``<event_name>_prob``,
        ``ref_prob`` and ``annotated`` columns of ``splicing_results``
        (e.g. event_name 'event', 'mut1', etc.).

        Raises:
            ValueError: If ``predict_splicing`` has not been run yet.
        """
        if not hasattr(self, 'splicing_results'):
            raise ValueError("You must run predict_splicing() first.")

        metrics = (f'{event_name}_prob', 'ref_prob', 'annotated')
        wanted = []
        for site in sites:
            for metric in metrics:
                wanted.append((site, metric))
        return self.splicing_results.loc[:, wanted]

    def __getitem__(self, key):
        """Return the variant stored under ``key`` ('ref', 'event', 'mutN')."""
        return self._transcripts[key]

    def __iter__(self):
        """Iterate over ``(name, transcript)`` pairs in insertion order."""
        return iter(self._transcripts.items())
|
geney/utils/__init__.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import importlib
|
|
2
|
+
import os
|
|
3
|
+
import pathlib
|
|
4
|
+
|
|
5
|
+
__all__ = []  # Collects every name re-exported at the ``utils`` package level

# Directory containing this package's utility modules
_package_dir = pathlib.Path(__file__).parent

# os.listdir returns entries in arbitrary order; sorting keeps the import
# order - and therefore which module wins if two of them export the same
# name - identical on every filesystem.
for file in sorted(os.listdir(_package_dir)):
    if file.endswith(".py") and file != "__init__.py":
        module_name = file[:-3]  # strip '.py'
        module_path = f"{__name__}.{module_name}"
        module = importlib.import_module(module_path)

        # If the module defines __all__, expose those names at utils level
        if hasattr(module, "__all__"):
            for name in module.__all__:
                globals()[name] = getattr(module, name)
                # Avoid listing a name twice when several modules export it.
                if name not in __all__:
                    __all__.append(name)
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
__all__ = ['MutationalEvent', 'Mutation']
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from typing import List, Optional
|
|
5
|
+
import pandas as pd
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
class Mutation:
    """A single variant described by gene, chromosome, position and alleles.

    ``'-'`` as ``ref`` denotes an insertion and ``'-'`` as ``alt`` a deletion.
    """

    def __init__(self, gene: str, chrom: str, pos: int, ref: str, alt: str):
        """Store the fields; ``pos`` is coerced with ``int`` so digit strings work."""
        self.gene = gene
        self.chrom = chrom
        self.pos = int(pos)
        self.ref = ref
        self.alt = alt
        self.mut_type = self._infer_type()

    def _infer_type(self):
        """Return ``'snp'`` for a single-base substitution, otherwise ``'indel'``."""
        if self.ref == '-' or self.alt == '-':
            return 'indel'
        if len(self.ref) == len(self.alt) == 1:
            return 'snp'
        return 'indel'

    @staticmethod
    def _span_of(mutation) -> int:
        """Number of positions a mutation covers: the longer allele, at least 1."""
        ref_len = len(mutation.ref) if mutation.ref != '-' else 0
        alt_len = len(mutation.alt) if mutation.alt != '-' else 0
        return max(ref_len, alt_len, 1)

    def overlaps_with(self, other: 'Mutation') -> bool:
        """Return True when the two mutations cover at least one common position.

        Each mutation occupies ``[pos, pos + span)`` using its *own* span, so
        the result is the same whichever of the two the method is called on.
        """
        self_end = self.pos + self._span_of(self)
        other_end = other.pos + self._span_of(other)
        return not (self_end <= other.pos or other_end <= self.pos)

    def to_dict(self):
        """Return the mutation as a plain dict (``mut_type`` under key ``'type'``)."""
        return {
            'gene': self.gene,
            'chrom': self.chrom,
            'pos': self.pos,
            'ref': self.ref,
            'alt': self.alt,
            'type': self.mut_type
        }

    def __repr__(self):
        return f"{self.gene}:{self.chrom}:{self.pos}:{self.ref}:{self.alt}"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class MutationalEvent:
    """One or more mutations of a single gene parsed from a mutation id string.

    The id is a ``|``- or ``,``-separated list of ``gene:chrom:pos:ref:alt``
    entries. Iterating the event yields ``(pos, ref, alt)`` tuples.
    """

    def __init__(self, mut_id: str):
        """Parse ``mut_id`` and check that every entry names the same gene.

        Raises:
            ValueError: If an entry is malformed or the entries do not all
                share one gene.
        """
        self.raw = mut_id
        self.mutations: List[Mutation] = self._parse_mutations(mut_id)
        self.gene = self._verify_same_gene()

    def __len__(self):
        """Number of mutations in the event."""
        return len(self.mutations)

    def _parse_mutations(self, mut_id: str) -> List[Mutation]:
        """Split the id on ``|`` / ``,`` and build one ``Mutation`` per entry."""
        parsed = []
        for entry in re.split(r'[|,]', mut_id):
            fields = re.match(r'^([^:]+):([^:]+):(\d+):([ACGTN\-]+):([ACGTN\-]+)$', entry)
            if fields is None:
                raise ValueError(f"Invalid format for mutation: {entry}")
            gene, chrom, pos, ref, alt = fields.groups()
            parsed.append(Mutation(gene, chrom, pos, ref, alt))
        return parsed

    def _verify_same_gene(self) -> Optional[str]:
        """Return the single gene shared by all mutations, else raise ValueError."""
        genes = set()
        for mutation in self.mutations:
            genes.add(mutation.gene)
        if len(genes) == 1:
            return genes.pop()
        raise ValueError(f"Multiple genes found in event: {genes}")

    def compatible(self) -> bool:
        """Return True when no mutation overlaps any other mutation.

        Every ordered pair is checked, i.e. both ``a.overlaps_with(b)`` and
        ``b.overlaps_with(a)``.
        """
        clash = any(
            first.overlaps_with(second)
            for i, first in enumerate(self.mutations)
            for j, second in enumerate(self.mutations)
            if i != j
        )
        return not clash

    def to_dataframe(self) -> pd.DataFrame:
        """Return one row per mutation, built from ``Mutation.to_dict``."""
        records = [mutation.to_dict() for mutation in self.mutations]
        return pd.DataFrame(records)

    def __repr__(self):
        described = [f"{m.pos}:{m.ref}>{m.alt}" for m in self.mutations]
        muts = ', '.join(described)
        return f"MutationalEvent({self.gene} -> [{muts}])"

    @property
    def positions(self):
        """Positions of the mutations, in input order."""
        return [mutation.pos for mutation in self.mutations]

    @property
    def position(self):
        """Mean of the mutation positions, truncated to an int."""
        return int(np.mean(self.positions))

    @property
    def types(self):
        """``mut_type`` of each mutation, in input order."""
        return [mutation.mut_type for mutation in self.mutations]

    def mutation_args(self):
        """
        Returns a list of (pos, ref, alt) tuples, one per mutation, for use with `apply_mutation`.
        """
        triples = []
        for mutation in self.mutations:
            triples.append((mutation.pos, mutation.ref, mutation.alt))
        return triples

    def __iter__(self):
        """Iterate over the ``(pos, ref, alt)`` tuples of ``mutation_args``."""
        return iter(self.mutation_args())
|