geney 1.4.16__py2.py3-none-any.whl → 1.4.18__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- geney/Transcript.py +1 -1
- geney/_oncosplice.py +1 -1
- geney/_splicing_utils.py +1 -1
- geney/utils/SeqMats.py +162 -360
- geney/utils/SeqMatsOld.py +441 -0
- {geney-1.4.16.dist-info → geney-1.4.18.dist-info}/METADATA +1 -1
- {geney-1.4.16.dist-info → geney-1.4.18.dist-info}/RECORD +9 -8
- {geney-1.4.16.dist-info → geney-1.4.18.dist-info}/WHEEL +0 -0
- {geney-1.4.16.dist-info → geney-1.4.18.dist-info}/top_level.txt +0 -0
geney/Transcript.py
CHANGED
|
@@ -5,7 +5,7 @@ import copy
|
|
|
5
5
|
from Bio.Seq import Seq # Assuming Biopython is used
|
|
6
6
|
from . import config
|
|
7
7
|
from .utils.utils import unload_pickle
|
|
8
|
-
from .utils.
|
|
8
|
+
from .utils.SeqMatsOld import SeqMat #, MutSeqMat
|
|
9
9
|
from .utils.Fasta_segment import Fasta_segment
|
|
10
10
|
|
|
11
11
|
class Transcript:
|
geney/_oncosplice.py
CHANGED
|
@@ -4,7 +4,7 @@ from datetime import datetime
|
|
|
4
4
|
from tqdm import tqdm
|
|
5
5
|
import pandas as pd
|
|
6
6
|
import numpy as np
|
|
7
|
-
from geney.utils.
|
|
7
|
+
from geney.utils.SeqMatsOld import MutSeqMat
|
|
8
8
|
from ._splicing_utils import find_transcript_missplicing_seqs, develop_aberrant_splicing, Missplicing
|
|
9
9
|
from .Gene import Gene
|
|
10
10
|
|
geney/_splicing_utils.py
CHANGED
geney/utils/SeqMats.py
CHANGED
|
@@ -1,11 +1,16 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
__all__ = ['SeqMat', 'format_mut_id']
|
|
2
4
|
|
|
5
|
+
|
|
3
6
|
from dataclasses import dataclass, field
|
|
7
|
+
from typing import List, Tuple, Union, Optional
|
|
4
8
|
from collections import defaultdict
|
|
5
|
-
from typing import Optional, Union, List, Tuple
|
|
6
9
|
import numpy as np
|
|
7
10
|
import pandas as pd
|
|
8
11
|
|
|
12
|
+
|
|
13
|
+
|
|
9
14
|
def format_mut_id(text):
|
|
10
15
|
import re
|
|
11
16
|
# text = "TP53:17:7579472:G:A"
|
|
@@ -25,417 +30,214 @@ def format_mut_id(text):
|
|
|
25
30
|
return None
|
|
26
31
|
|
|
27
32
|
|
|
33
|
+
|
|
28
34
|
@dataclass(slots=True)
|
|
29
35
|
class SeqMat:
|
|
30
36
|
"""Represents a genomic sequence matrix used for training."""
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
notes: dict = field(default_factory=dict, metadata={"description": "User-defined metadata dictionary"})
|
|
37
|
+
name: str = field(default="Unnamed Sequence")
|
|
38
|
+
version: str = field(default="1.0")
|
|
39
|
+
source: str = field(default="Unknown")
|
|
40
|
+
notes: dict = field(default_factory=dict)
|
|
36
41
|
|
|
37
42
|
seq_array: np.ndarray = field(init=False, repr=False)
|
|
38
43
|
insertion_counters: dict = field(default_factory=lambda: defaultdict(int), init=False, repr=False)
|
|
39
44
|
rev: bool = field(default=False, init=False, repr=False)
|
|
40
|
-
|
|
41
45
|
predicted_splicing: pd.DataFrame = field(init=False, repr=False)
|
|
42
46
|
_pos_to_idx: dict = field(default_factory=dict, init=False, repr=False)
|
|
43
47
|
|
|
44
48
|
def __init__(
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
49
|
+
self,
|
|
50
|
+
nucleotides: str,
|
|
51
|
+
index: np.ndarray,
|
|
52
|
+
conservation: Optional[np.ndarray] = None,
|
|
53
|
+
reference_nucleotides: Optional[np.ndarray] = None,
|
|
54
|
+
notes: Optional[dict] = None,
|
|
55
|
+
source: Optional[str] = None,
|
|
56
|
+
rev: Optional[bool] = False,
|
|
57
|
+
name: Optional[str] = 'wild_type',
|
|
58
|
+
version: Optional[str] = 'none'
|
|
56
59
|
) -> None:
|
|
60
|
+
# Initialize metadata
|
|
61
|
+
self.name = name
|
|
62
|
+
self.version = version
|
|
63
|
+
self.source = source or "Unknown"
|
|
64
|
+
self.notes = notes or {}
|
|
65
|
+
self.rev = rev
|
|
57
66
|
self.predicted_splicing = None
|
|
58
|
-
|
|
59
|
-
|
|
67
|
+
|
|
68
|
+
# Build structured array
|
|
69
|
+
nts = np.array(list(nucleotides), dtype='S1')
|
|
70
|
+
L = len(nts)
|
|
60
71
|
if index.shape[0] != L:
|
|
61
|
-
raise ValueError("Indices
|
|
72
|
+
raise ValueError("Indices length must match sequence length.")
|
|
62
73
|
if conservation is not None and conservation.shape[0] != L:
|
|
63
|
-
raise ValueError("Conservation
|
|
64
|
-
if reference_nucleotides is not None and reference_nucleotides
|
|
65
|
-
raise ValueError("Reference
|
|
74
|
+
raise ValueError("Conservation length must match sequence length.")
|
|
75
|
+
if reference_nucleotides is not None and len(reference_nucleotides) != L:
|
|
76
|
+
raise ValueError("Reference nucleotides length must match sequence length.")
|
|
66
77
|
|
|
67
78
|
dtype = np.dtype([
|
|
68
|
-
(
|
|
69
|
-
(
|
|
70
|
-
(
|
|
71
|
-
(
|
|
72
|
-
(
|
|
79
|
+
('nt', 'S1'),
|
|
80
|
+
('index', np.float64),
|
|
81
|
+
('ref', 'S1'),
|
|
82
|
+
('cons', np.float32),
|
|
83
|
+
('valid_mask', bool)
|
|
73
84
|
])
|
|
74
|
-
|
|
75
85
|
self.seq_array = np.empty(L, dtype=dtype)
|
|
76
|
-
self.seq_array[
|
|
77
|
-
|
|
78
|
-
self.seq_array[
|
|
79
|
-
self.seq_array[
|
|
80
|
-
self.seq_array[
|
|
81
|
-
|
|
86
|
+
self.seq_array['nt'] = nts
|
|
87
|
+
self.seq_array['ref'] = nts if reference_nucleotides is None else np.array(reference_nucleotides, dtype='S1')
|
|
88
|
+
self.seq_array['index'] = index
|
|
89
|
+
self.seq_array['cons'] = (np.zeros(L, dtype='f4') if conservation is None else conservation)
|
|
90
|
+
self.seq_array['valid_mask'] = self.seq_array['nt'] != b'-'
|
|
91
|
+
|
|
92
|
+
# Initialize helpers
|
|
82
93
|
self.insertion_counters = defaultdict(int)
|
|
83
|
-
self.
|
|
94
|
+
self._build_index_map()
|
|
84
95
|
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
self.
|
|
88
|
-
self.rev = rev
|
|
89
|
-
self.version = version
|
|
96
|
+
def _build_index_map(self):
|
|
97
|
+
"""Rebuild position-to-index lookup."""
|
|
98
|
+
self._pos_to_idx = {float(pos): i for i, pos in enumerate(self.seq_array['index'])}
|
|
90
99
|
|
|
91
100
|
def __len__(self) -> int:
|
|
92
|
-
return int(self.seq_array[
|
|
93
|
-
|
|
94
|
-
def __repr__(self):
|
|
95
|
-
return f"<SeqMat: {self.seq}>"
|
|
96
|
-
|
|
97
|
-
def __str__(self):
|
|
98
|
-
return self.seq
|
|
99
|
-
|
|
100
|
-
def get_metadata(self) -> dict:
|
|
101
|
-
"""Retrieve all metadata as a dictionary."""
|
|
102
|
-
return {
|
|
103
|
-
"name": self.name,
|
|
104
|
-
"source": self.source,
|
|
105
|
-
"version": self.version,
|
|
106
|
-
"notes": self.notes
|
|
107
|
-
}
|
|
101
|
+
return int(self.seq_array['valid_mask'].sum())
|
|
108
102
|
|
|
109
103
|
@property
|
|
110
104
|
def seq(self) -> str:
|
|
111
|
-
return self.seq_array[
|
|
105
|
+
return self.seq_array['nt'][self.seq_array['valid_mask']].tobytes().decode()
|
|
112
106
|
|
|
113
107
|
@property
|
|
114
108
|
def index(self) -> np.ndarray:
|
|
115
|
-
return self.seq_array[
|
|
109
|
+
return self.seq_array['index'][self.seq_array['valid_mask']]
|
|
116
110
|
|
|
117
111
|
@property
|
|
118
112
|
def conservation(self) -> np.ndarray:
|
|
119
|
-
return self.seq_array[
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
@property
|
|
134
|
-
def end(self) -> float:
|
|
135
|
-
return self.max_index
|
|
136
|
-
|
|
137
|
-
@property
|
|
138
|
-
def mutated_positions(self) -> np.ndarray:
|
|
139
|
-
return (self.seq_array["ref"] != self.seq_array["nt"])[self.seq_array["valid_mask"]].astype(int)
|
|
140
|
-
|
|
141
|
-
def clone(self, start: Optional[int] = None, end: Optional[int] = None) -> "SeqMat":
|
|
142
|
-
cloned = SeqMat.__new__(SeqMat)
|
|
113
|
+
return self.seq_array['cons'][self.seq_array['valid_mask']]
|
|
114
|
+
|
|
115
|
+
def clone(self, start: Optional[float] = None, end: Optional[float] = None) -> SeqMat:
|
|
116
|
+
new = SeqMat.__new__(SeqMat)
|
|
117
|
+
# copy metadata
|
|
118
|
+
new.name = self.name
|
|
119
|
+
new.version = self.version
|
|
120
|
+
new.source = self.source
|
|
121
|
+
new.notes = self.notes.copy()
|
|
122
|
+
new.rev = self.rev
|
|
123
|
+
new.predicted_splicing = None
|
|
124
|
+
new.insertion_counters = defaultdict(int)
|
|
125
|
+
|
|
126
|
+
# slice or full copy
|
|
143
127
|
if start is not None and end is not None:
|
|
144
|
-
|
|
128
|
+
mask = (self.seq_array['index'] >= start) & (self.seq_array['index'] <= end)
|
|
129
|
+
new.seq_array = self.seq_array[mask].copy()
|
|
145
130
|
else:
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
cloned._pos_to_idx = {pos: i for i, pos in enumerate(cloned.seq_array["index"])}
|
|
155
|
-
|
|
156
|
-
return cloned
|
|
157
|
-
|
|
158
|
-
def apply_mutation(self, pos: int, ref: str, alt: str, only_snps: bool = False):
|
|
159
|
-
"""
|
|
160
|
-
Applies a mutation (SNP, substitution, insertion, or deletion) to the sequence.
|
|
161
|
-
|
|
162
|
-
Parameters:
|
|
163
|
-
pos (int): The reference position where the mutation should occur.
|
|
164
|
-
ref (str): The reference allele (use '-' for insertions).
|
|
165
|
-
alt (str): The alternate allele (use '-' for deletions).
|
|
166
|
-
only_snps (bool): If True, only SNP substitutions are allowed; indels are ignored.
|
|
167
|
-
|
|
168
|
-
Returns:
|
|
169
|
-
SeqMat: The mutated sequence matrix.
|
|
170
|
-
|
|
171
|
-
The method normalizes the mutation (dropping any shared prefix) and then applies:
|
|
172
|
-
- A SNP/substitution if both alleles are non-gap.
|
|
173
|
-
- An insertion if ref is '-' (after normalization).
|
|
174
|
-
- A deletion if alt is '-' (after normalization).
|
|
175
|
-
|
|
176
|
-
For insertions, new rows are added with fractional indices computed from an insertion counter.
|
|
177
|
-
For deletions, the corresponding rows are removed.
|
|
178
|
-
"""
|
|
179
|
-
return_to_rc = False
|
|
131
|
+
new.seq_array = self.seq_array.copy()
|
|
132
|
+
|
|
133
|
+
new._build_index_map()
|
|
134
|
+
return new
|
|
135
|
+
|
|
136
|
+
def apply_mutation(self, pos: float, ref: str, alt: str, only_snps: bool = False) -> SeqMat:
|
|
137
|
+
"""Apply a single mutation to this SeqMat."""
|
|
138
|
+
# reverse-complement context
|
|
180
139
|
if self.rev:
|
|
181
|
-
return_to_rc = True
|
|
182
140
|
self.reverse_complement()
|
|
183
141
|
|
|
184
|
-
#
|
|
142
|
+
# left-normalize
|
|
185
143
|
while ref and alt and ref[0] == alt[0]:
|
|
186
144
|
pos += 1
|
|
187
|
-
ref = ref[1:] or
|
|
188
|
-
alt = alt[1:] or
|
|
145
|
+
ref = ref[1:] or '-'
|
|
146
|
+
alt = alt[1:] or '-'
|
|
189
147
|
|
|
190
|
-
#
|
|
191
|
-
if ref !=
|
|
148
|
+
# substitution
|
|
149
|
+
if ref != '-' and alt != '-':
|
|
192
150
|
if len(ref) != len(alt):
|
|
193
|
-
raise ValueError("Substitution
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
if
|
|
199
|
-
raise
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
if
|
|
203
|
-
raise ValueError(f"
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
ref_segment = self.seq_array["ref"][pos_idx:end_idx]
|
|
210
|
-
# expected_segment = np.frombuffer(ref.encode(), dtype='S1')
|
|
211
|
-
if not np.all(ref_segment == np.frombuffer(ref.encode(), dtype='S1')):
|
|
212
|
-
actual_str = ref_segment.tobytes().decode()
|
|
213
|
-
raise ValueError(f"Reference mismatch at position {pos}: expected '{ref}', found '{actual_str}'")
|
|
214
|
-
self.seq_array["nt"][pos_idx:end_idx] = np.frombuffer(alt.encode(), dtype='S1')
|
|
215
|
-
|
|
216
|
-
# for i, nt in enumerate(alt):
|
|
217
|
-
# self.seq_array["nt"][pos_idx + i] = nt.encode()
|
|
218
|
-
|
|
219
|
-
# Case 2: Insertion (ref is '-' means nothing was present, and we need to add bases)
|
|
220
|
-
elif ref == "-" and alt != "-":
|
|
151
|
+
raise ValueError("Substitution requires equal-length alleles.")
|
|
152
|
+
idx = self._pos_to_idx.get(pos)
|
|
153
|
+
if idx is None:
|
|
154
|
+
raise KeyError(f"Position {pos} not found.")
|
|
155
|
+
end = idx + len(ref)
|
|
156
|
+
if end > len(self.seq_array):
|
|
157
|
+
raise IndexError(f"Out of bounds at {pos}.")
|
|
158
|
+
# verify reference
|
|
159
|
+
ref_seg = self.seq_array['ref'][idx:end]
|
|
160
|
+
if not np.array_equal(ref_seg, np.frombuffer(ref.encode(), dtype='S1')):
|
|
161
|
+
raise ValueError(f"Ref mismatch at {pos}.")
|
|
162
|
+
# assign alt
|
|
163
|
+
self.seq_array['nt'][idx:end] = np.frombuffer(alt.encode(), dtype='S1')
|
|
164
|
+
|
|
165
|
+
# insertion
|
|
166
|
+
elif ref == '-' and alt != '-':
|
|
221
167
|
if only_snps:
|
|
222
|
-
return self
|
|
223
|
-
|
|
224
|
-
|
|
168
|
+
return self
|
|
169
|
+
idx = self._pos_to_idx.get(pos)
|
|
170
|
+
if idx is None:
|
|
171
|
+
raise KeyError(f"Position {pos} not found.")
|
|
172
|
+
cnt = self.insertion_counters[pos]
|
|
225
173
|
eps = 1e-6
|
|
226
174
|
new_rows = []
|
|
227
175
|
for i, nt in enumerate(alt):
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
new_seq_array.sort(order="index")
|
|
235
|
-
self.seq_array = new_seq_array
|
|
176
|
+
new_rows.append((nt.encode(),
|
|
177
|
+
pos + (cnt + i + 1)*eps,
|
|
178
|
+
b'-',
|
|
179
|
+
np.nan,
|
|
180
|
+
True))
|
|
181
|
+
self._insert_rows(idx, new_rows)
|
|
236
182
|
self.insertion_counters[pos] += len(alt)
|
|
237
183
|
|
|
238
|
-
#
|
|
239
|
-
elif alt ==
|
|
184
|
+
# deletion
|
|
185
|
+
elif alt == '-' and ref != '-':
|
|
240
186
|
if only_snps:
|
|
241
|
-
return self
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
self.seq_array = np.delete(self.seq_array, np.s_[
|
|
187
|
+
return self
|
|
188
|
+
idx = self._pos_to_idx.get(pos)
|
|
189
|
+
if idx is None:
|
|
190
|
+
raise KeyError(f"Position {pos} not found.")
|
|
191
|
+
end = idx + len(ref)
|
|
192
|
+
# verify
|
|
193
|
+
ref_seg = self.seq_array['ref'][idx:end]
|
|
194
|
+
if not np.array_equal(ref_seg, np.frombuffer(ref.encode(), dtype='S1')):
|
|
195
|
+
raise ValueError(f"Ref mismatch at {pos}.")
|
|
196
|
+
self.seq_array = np.delete(self.seq_array, np.s_[idx:end])
|
|
197
|
+
|
|
251
198
|
else:
|
|
252
|
-
raise ValueError("Unsupported mutation type.
|
|
199
|
+
raise ValueError("Unsupported mutation type.")
|
|
253
200
|
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
201
|
+
# update mask & index map
|
|
202
|
+
self.seq_array['valid_mask'] = self.seq_array['nt'] != b'-'
|
|
203
|
+
self._build_index_map()
|
|
257
204
|
|
|
205
|
+
# restore orientation
|
|
206
|
+
if self.rev:
|
|
207
|
+
self.reverse_complement()
|
|
258
208
|
return self
|
|
259
209
|
|
|
210
|
+
def _insert_rows(self, idx: int, rows: List[tuple]):
|
|
211
|
+
"""Helper to insert new rows efficiently and resort."""
|
|
212
|
+
arr = self.seq_array.tolist()
|
|
213
|
+
arr[idx:idx] = rows
|
|
214
|
+
new = np.array(arr, dtype=self.seq_array.dtype)
|
|
215
|
+
new.sort(order='index')
|
|
216
|
+
self.seq_array = new
|
|
217
|
+
|
|
218
|
+
def complement(self) -> SeqMat:
|
|
219
|
+
comp = {b'A':b'T', b'T':b'A', b'C':b'G', b'G':b'C', b'-':b'-'}
|
|
220
|
+
nts = np.array([comp[x] for x in self.seq_array['nt']], dtype='S1')
|
|
221
|
+
new = self.clone()
|
|
222
|
+
new.seq_array['nt'] = nts
|
|
223
|
+
return new
|
|
224
|
+
|
|
225
|
+
def reverse_complement(self) -> SeqMat:
|
|
226
|
+
new = self.complement().clone()
|
|
227
|
+
new.seq_array = new.seq_array[::-1].copy()
|
|
228
|
+
new.rev = not self.rev
|
|
229
|
+
return new
|
|
230
|
+
|
|
260
231
|
def __getitem__(self, key: Union[int, slice]) -> np.ndarray:
|
|
232
|
+
idx = None
|
|
261
233
|
if isinstance(key, int):
|
|
262
|
-
|
|
263
|
-
if
|
|
264
|
-
raise
|
|
265
|
-
return self.seq_array[
|
|
266
|
-
|
|
267
|
-
start
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
return self.seq_array[(self.seq_array["index"] >= start) & (self.seq_array["index"] <= stop)]
|
|
273
|
-
else:
|
|
274
|
-
raise TypeError("Indexing must be an integer or a slice.")
|
|
275
|
-
|
|
276
|
-
def complement(self) -> "SeqMat":
|
|
277
|
-
comp_dict = {b"A": b"T", b"T": b"A", b"C": b"G", b"G": b"C", b"-": b"-", b"N": b"N"}
|
|
278
|
-
comp_seq = np.array([comp_dict[nt] for nt in self.seq_array["nt"]], dtype="S1")
|
|
279
|
-
new_instance = self.clone()
|
|
280
|
-
new_instance.seq_array["nt"] = comp_seq
|
|
281
|
-
return new_instance
|
|
282
|
-
|
|
283
|
-
def reverse_complement(self) -> "SeqMat":
|
|
284
|
-
rev_comp_seq = self.complement().seq_array[::-1]
|
|
285
|
-
self.seq_array = rev_comp_seq.copy()
|
|
286
|
-
self.rev = not self.rev
|
|
287
|
-
return self
|
|
288
|
-
|
|
289
|
-
# def splice_out(self, introns: List[Tuple[int, int]]) -> "SeqMat":
|
|
290
|
-
# """
|
|
291
|
-
# Splices out regions from the sequence corresponding to the given intron boundaries.
|
|
292
|
-
#
|
|
293
|
-
# Args:
|
|
294
|
-
# introns (List[Tuple[int, int]]): List of (start, end) intron boundaries to remove.
|
|
295
|
-
# Coordinates should match the 'index' field.
|
|
296
|
-
#
|
|
297
|
-
# Returns:
|
|
298
|
-
# SeqMat: A new instance with the intron regions removed.
|
|
299
|
-
# """
|
|
300
|
-
# mask = np.ones(len(self.seq_array), dtype=bool)
|
|
301
|
-
#
|
|
302
|
-
# for start, end in introns:
|
|
303
|
-
# mask &= ~((self.seq_array["index"] >= start) & (self.seq_array["index"] <= end))
|
|
304
|
-
#
|
|
305
|
-
# new_instance = self.clone()
|
|
306
|
-
# new_instance.seq_array = self.seq_array[mask].copy()
|
|
307
|
-
# return new_instance
|
|
308
|
-
|
|
309
|
-
def cut_out(self, introns: List[Tuple[int, int]]) -> "SeqMat":
|
|
310
|
-
"""
|
|
311
|
-
Splices out regions from the sequence corresponding to the given intron boundaries.
|
|
312
|
-
|
|
313
|
-
Handles reverse-complemented sequences by interpreting introns in reverse as well.
|
|
314
|
-
|
|
315
|
-
Args:
|
|
316
|
-
introns (List[Tuple[int, int]]): List of (start, end) intron boundaries.
|
|
317
|
-
These are always genomic (absolute) coordinates,
|
|
318
|
-
regardless of strand direction.
|
|
319
|
-
|
|
320
|
-
Returns:
|
|
321
|
-
SeqMat: A new instance with the intron regions removed.
|
|
322
|
-
"""
|
|
323
|
-
# In reverse orientation, flip intron direction for comparison
|
|
324
|
-
if self.rev:
|
|
325
|
-
introns = [(end, start) if start > end else (start, end) for (start, end) in introns]
|
|
326
|
-
|
|
327
|
-
mask = np.ones(len(self.seq_array), dtype=bool)
|
|
328
|
-
|
|
329
|
-
for start, end in introns:
|
|
330
|
-
lo, hi = min(start, end) + 1, max(start, end) - 1
|
|
331
|
-
mask &= ~((self.seq_array["index"] >= lo) & (self.seq_array["index"] <= hi))
|
|
332
|
-
|
|
333
|
-
new_instance = self.clone()
|
|
334
|
-
new_instance.seq_array = self.seq_array[mask].copy()
|
|
335
|
-
return new_instance
|
|
336
|
-
|
|
337
|
-
def open_reading_frame(self, tis: int) -> "SeqMat":
|
|
338
|
-
"""
|
|
339
|
-
Extracts the open reading frame starting from the translation initiation site (TIS)
|
|
340
|
-
until the first in-frame stop codon.
|
|
341
|
-
|
|
342
|
-
Args:
|
|
343
|
-
tis (int): Genomic position of the translation initiation site (start codon).
|
|
344
|
-
|
|
345
|
-
Returns:
|
|
346
|
-
SeqMat: A new SeqMat instance containing the ORF (from TIS to stop codon inclusive).
|
|
347
|
-
"""
|
|
348
|
-
if tis not in self.seq_array["index"]:
|
|
349
|
-
print(f"Warning: TIS position {tis} not found, returning default.")
|
|
350
|
-
return self.clone(start=0, end=3)
|
|
351
|
-
|
|
352
|
-
# Extract nucleotide sequence and indices starting from TIS
|
|
353
|
-
mask = self.seq_array["index"] >= tis if not self.rev else self.seq_array["index"] <= tis
|
|
354
|
-
coding_part = self.seq_array[mask]
|
|
355
|
-
coding_seq = coding_part["nt"].tobytes().decode()
|
|
356
|
-
|
|
357
|
-
# Read codons in-frame
|
|
358
|
-
for i in range(0, len(coding_seq) - 2, 3):
|
|
359
|
-
codon = coding_seq[i:i + 3]
|
|
360
|
-
if codon in {"TAA", "TAG", "TGA"}:
|
|
361
|
-
# Determine index range for this ORF
|
|
362
|
-
start = coding_part["index"][0]
|
|
363
|
-
stop = coding_part["index"][i + 2]
|
|
364
|
-
lo, hi = sorted((start, stop))
|
|
365
|
-
return self.clone(start=lo, end=hi)
|
|
366
|
-
|
|
367
|
-
raise ValueError("No in-frame stop codon found after the TIS.")
|
|
368
|
-
|
|
369
|
-
def predict_splicing(self, position: int, engine='spliceai', context=7500, inplace=False): #, reference_donors=None, reference_acceptors=None) -> pd.DataFrame:
|
|
370
|
-
"""
|
|
371
|
-
Predict splicing probabilities at a given position using the specified engine.
|
|
372
|
-
|
|
373
|
-
Args:
|
|
374
|
-
position (int): The genomic position to predict splicing probabilities for.
|
|
375
|
-
engine (str): The prediction engine to use. Supported: 'spliceai', 'pangolin'.
|
|
376
|
-
context (int): The length of the target central region (default: 7500).
|
|
377
|
-
format (str): Output format for the splicing engine results.
|
|
378
|
-
|
|
379
|
-
Returns:
|
|
380
|
-
pd.DataFrame: A DataFrame containing:
|
|
381
|
-
- position: The genomic position
|
|
382
|
-
- donor_prob: Probability of being a donor splice site
|
|
383
|
-
- acceptor_prob: Probability of being an acceptor splice site
|
|
384
|
-
- nucleotides: The nucleotide sequence at that position
|
|
385
|
-
|
|
386
|
-
Raises:
|
|
387
|
-
ValueError: If an unsupported engine is provided.
|
|
388
|
-
IndexError: If the position is not found in the sequence.
|
|
389
|
-
"""
|
|
390
|
-
# Retrieve extended context (includes flanks) around the position.
|
|
391
|
-
# seq, indices = self.get_context(position, context=context, padding='N')
|
|
392
|
-
target = self.clone(position - context, position + context)
|
|
393
|
-
# print(len(target.seq))
|
|
394
|
-
seq, indices = target.seq, target.index
|
|
395
|
-
# print(len(seq))
|
|
396
|
-
# rel_pos = np.where(indices == position)[0][0]
|
|
397
|
-
# print(rel_pos)
|
|
398
|
-
rel_pos = np.abs(indices - position).argmin()
|
|
399
|
-
# print(rel_pos, len(seq))
|
|
400
|
-
left_missing, right_missing = max(0, context - rel_pos), max(0, context - (len(seq) - rel_pos))
|
|
401
|
-
# print(left_missing, right_missing)
|
|
402
|
-
if left_missing > 0 or right_missing > 0:
|
|
403
|
-
step = -1 if self.rev else 1
|
|
404
|
-
|
|
405
|
-
if left_missing > 0:
|
|
406
|
-
left_pad = np.arange(indices[0] - step * left_missing, indices[0], step)
|
|
407
|
-
else:
|
|
408
|
-
left_pad = np.array([], dtype=indices.dtype)
|
|
409
|
-
|
|
410
|
-
if right_missing > 0:
|
|
411
|
-
right_pad = np.arange(indices[-1] + step, indices[-1] + step * (right_missing + 1), step)
|
|
412
|
-
else:
|
|
413
|
-
right_pad = np.array([], dtype=indices.dtype)
|
|
414
|
-
|
|
415
|
-
seq = 'N' * left_missing + seq + 'N' * right_missing
|
|
416
|
-
indices = np.concatenate([left_pad, indices, right_pad])
|
|
417
|
-
|
|
418
|
-
# Run the splicing prediction engine (function assumed to be defined externally)
|
|
419
|
-
from .splicing_utils import run_splicing_engine
|
|
420
|
-
donor_probs, acceptor_probs = run_splicing_engine(seq, engine)
|
|
421
|
-
# Trim off the fixed flanks before returning results.
|
|
422
|
-
seq = seq[5000:-5000]
|
|
423
|
-
indices = indices[5000:-5000]
|
|
424
|
-
df = pd.DataFrame({
|
|
425
|
-
'position': indices,
|
|
426
|
-
'donor_prob': donor_probs,
|
|
427
|
-
'acceptor_prob': acceptor_probs,
|
|
428
|
-
'nucleotides': list(seq)
|
|
429
|
-
}).set_index('position').round(3)
|
|
430
|
-
# if reference_donors is not None:
|
|
431
|
-
# df['ref_donor'] = df.index.isin(reference_donors).astype(int)
|
|
432
|
-
# if reference_acceptors is not None:
|
|
433
|
-
# df['ref_acceptor'] = df.index.isin(reference_acceptors).astype(int)
|
|
434
|
-
|
|
435
|
-
df.attrs['name'] = self.name
|
|
436
|
-
if inplace:
|
|
437
|
-
self.predicted_splicing = df
|
|
438
|
-
return self
|
|
439
|
-
else:
|
|
440
|
-
return df
|
|
441
|
-
|
|
234
|
+
idx = self._pos_to_idx.get(float(key))
|
|
235
|
+
if idx is None:
|
|
236
|
+
raise KeyError(f"Position {key} not found.")
|
|
237
|
+
return self.seq_array[idx]
|
|
238
|
+
if isinstance(key, slice):
|
|
239
|
+
start = key.start or self.min_index
|
|
240
|
+
stop = key.stop or self.max_index
|
|
241
|
+
mask = (self.seq_array['index'] >= start) & (self.seq_array['index'] <= stop)
|
|
242
|
+
return self.seq_array[mask]
|
|
243
|
+
raise TypeError("Invalid index type.")
|
|
@@ -0,0 +1,441 @@
|
|
|
1
|
+
__all__ = ['SeqMat', 'format_mut_id']
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from collections import defaultdict
|
|
5
|
+
from typing import Optional, Union, List, Tuple
|
|
6
|
+
import numpy as np
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
def format_mut_id(text):
|
|
10
|
+
import re
|
|
11
|
+
# text = "TP53:17:7579472:G:A"
|
|
12
|
+
|
|
13
|
+
pattern = r'^[^:]+:[^:]+:(\d+):([ACGTN\-]+):([ACGTN\-]+)$'
|
|
14
|
+
match = re.match(pattern, text)
|
|
15
|
+
|
|
16
|
+
if match:
|
|
17
|
+
position = int(match.group(1))
|
|
18
|
+
ref = match.group(2)
|
|
19
|
+
alt = match.group(3)
|
|
20
|
+
return {'pos': position, 'ref': ref, 'alt': alt}
|
|
21
|
+
|
|
22
|
+
# print(f"Position: {position}, Ref: {ref}, Alt: {alt}")
|
|
23
|
+
else:
|
|
24
|
+
print("No match")
|
|
25
|
+
return None
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass(slots=True)
|
|
29
|
+
class SeqMat:
|
|
30
|
+
"""Represents a genomic sequence matrix used for training."""
|
|
31
|
+
# Metadata fields (uncomment and/or extend as needed)
|
|
32
|
+
name: str = field(default="Unnamed Sequence", metadata={"description": "Name of the sequence"})
|
|
33
|
+
version: str = field(default="1.0", metadata={"description": "Version of the dataset"})
|
|
34
|
+
source: str = field(default="Unknown", metadata={"description": "Source of the sequence data"})
|
|
35
|
+
notes: dict = field(default_factory=dict, metadata={"description": "User-defined metadata dictionary"})
|
|
36
|
+
|
|
37
|
+
seq_array: np.ndarray = field(init=False, repr=False)
|
|
38
|
+
insertion_counters: dict = field(default_factory=lambda: defaultdict(int), init=False, repr=False)
|
|
39
|
+
rev: bool = field(default=False, init=False, repr=False)
|
|
40
|
+
|
|
41
|
+
predicted_splicing: pd.DataFrame = field(init=False, repr=False)
|
|
42
|
+
_pos_to_idx: dict = field(default_factory=dict, init=False, repr=False)
|
|
43
|
+
|
|
44
|
+
def __init__(
|
|
45
|
+
self,
|
|
46
|
+
nucleotides: str,
|
|
47
|
+
index: np.ndarray,
|
|
48
|
+
conservation: Optional[np.ndarray] = None,
|
|
49
|
+
reference_nucleotides: Optional[np.ndarray] = None,
|
|
50
|
+
notes: Optional[dict] = None,
|
|
51
|
+
source: Optional[str] = None,
|
|
52
|
+
rev: Optional[bool] = False,
|
|
53
|
+
name: Optional[str] = 'wild_type',
|
|
54
|
+
version: Optional[str] = 'none'
|
|
55
|
+
|
|
56
|
+
) -> None:
|
|
57
|
+
self.predicted_splicing = None
|
|
58
|
+
nucleotides = np.array(list(nucleotides))
|
|
59
|
+
L = nucleotides.shape[0]
|
|
60
|
+
if index.shape[0] != L:
|
|
61
|
+
raise ValueError("Indices array length must match nucleotide sequence length.")
|
|
62
|
+
if conservation is not None and conservation.shape[0] != L:
|
|
63
|
+
raise ValueError("Conservation vector length must match sequence length.")
|
|
64
|
+
if reference_nucleotides is not None and reference_nucleotides.shape[0] != L:
|
|
65
|
+
raise ValueError("Reference nucleotide vector length must match sequence length.")
|
|
66
|
+
|
|
67
|
+
dtype = np.dtype([
|
|
68
|
+
("nt", "S1"),
|
|
69
|
+
("index", np.float64),
|
|
70
|
+
("ref", "S1"),
|
|
71
|
+
("cons", np.float32),
|
|
72
|
+
("valid_mask", bool),
|
|
73
|
+
])
|
|
74
|
+
|
|
75
|
+
self.seq_array = np.empty(L, dtype=dtype)
|
|
76
|
+
self.seq_array["nt"] = nucleotides
|
|
77
|
+
# Use provided reference nucleotides if available.
|
|
78
|
+
self.seq_array["ref"] = nucleotides if reference_nucleotides is None else reference_nucleotides
|
|
79
|
+
self.seq_array["index"] = index
|
|
80
|
+
self.seq_array["cons"] = np.nan if conservation is None else conservation
|
|
81
|
+
self.seq_array["valid_mask"] = self.seq_array["nt"] != b"-"
|
|
82
|
+
self.insertion_counters = defaultdict(int)
|
|
83
|
+
self._pos_to_idx = {pos: i for i, pos in enumerate(self.seq_array["index"])}
|
|
84
|
+
|
|
85
|
+
self.source = source if source is not None else "Unknown"
|
|
86
|
+
self.notes = notes if notes is not None else {}
|
|
87
|
+
self.name = name
|
|
88
|
+
self.rev = rev
|
|
89
|
+
self.version = version
|
|
90
|
+
|
|
91
|
+
def __len__(self) -> int:
|
|
92
|
+
return int(self.seq_array["valid_mask"].sum())
|
|
93
|
+
|
|
94
|
+
def __repr__(self):
|
|
95
|
+
return f"<SeqMat: {self.seq}>"
|
|
96
|
+
|
|
97
|
+
def __str__(self):
|
|
98
|
+
return self.seq
|
|
99
|
+
|
|
100
|
+
def get_metadata(self) -> dict:
|
|
101
|
+
"""Retrieve all metadata as a dictionary."""
|
|
102
|
+
return {
|
|
103
|
+
"name": self.name,
|
|
104
|
+
"source": self.source,
|
|
105
|
+
"version": self.version,
|
|
106
|
+
"notes": self.notes
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
@property
|
|
110
|
+
def seq(self) -> str:
|
|
111
|
+
return self.seq_array["nt"][self.seq_array["valid_mask"]].tobytes().decode()
|
|
112
|
+
|
|
113
|
+
@property
|
|
114
|
+
def index(self) -> np.ndarray:
|
|
115
|
+
return self.seq_array["index"][self.seq_array["valid_mask"]]
|
|
116
|
+
|
|
117
|
+
@property
|
|
118
|
+
def conservation(self) -> np.ndarray:
|
|
119
|
+
return self.seq_array["cons"][self.seq_array["valid_mask"]]
|
|
120
|
+
|
|
121
|
+
@property
|
|
122
|
+
def max_index(self) -> float:
|
|
123
|
+
return self.seq_array["index"].max()
|
|
124
|
+
|
|
125
|
+
@property
|
|
126
|
+
def min_index(self) -> float:
|
|
127
|
+
return self.seq_array["index"].min()
|
|
128
|
+
|
|
129
|
+
@property
|
|
130
|
+
def start(self) -> float:
|
|
131
|
+
return self.min_index
|
|
132
|
+
|
|
133
|
+
@property
|
|
134
|
+
def end(self) -> float:
|
|
135
|
+
return self.max_index
|
|
136
|
+
|
|
137
|
+
@property
|
|
138
|
+
def mutated_positions(self) -> np.ndarray:
|
|
139
|
+
return (self.seq_array["ref"] != self.seq_array["nt"])[self.seq_array["valid_mask"]].astype(int)
|
|
140
|
+
|
|
141
|
+
def clone(self, start: Optional[int] = None, end: Optional[int] = None) -> "SeqMat":
|
|
142
|
+
cloned = SeqMat.__new__(SeqMat)
|
|
143
|
+
if start is not None and end is not None:
|
|
144
|
+
cloned.seq_array = self.seq_array[(self.seq_array["index"] >= start) & (self.seq_array["index"] <= end)]
|
|
145
|
+
else:
|
|
146
|
+
cloned.seq_array = self.seq_array.copy()
|
|
147
|
+
cloned.insertion_counters = defaultdict(int)
|
|
148
|
+
cloned.name = self.name
|
|
149
|
+
cloned.source = self.source
|
|
150
|
+
cloned.version = self.version
|
|
151
|
+
cloned.notes = self.notes.copy()
|
|
152
|
+
cloned.rev = self.rev
|
|
153
|
+
|
|
154
|
+
cloned._pos_to_idx = {pos: i for i, pos in enumerate(cloned.seq_array["index"])}
|
|
155
|
+
|
|
156
|
+
return cloned
|
|
157
|
+
|
|
158
|
+
def apply_mutation(self, pos: int, ref: str, alt: str, only_snps: bool = False):
    """
    Apply a mutation (substitution, insertion, or deletion) to the sequence in place.

    Parameters:
        pos (int): The reference position where the mutation should occur.
        ref (str): The reference allele (use '-' for insertions).
        alt (str): The alternate allele (use '-' for deletions).
        only_snps (bool): If True, insertions and deletions are skipped and the
            sequence is returned unchanged. Equal-length substitutions of any
            length are still applied.

    Returns:
        SeqMat: ``self``, after the mutation has been applied.

    Raises:
        ValueError: If the position is not present in the index, the reference
            allele does not match the stored ``ref`` bases, the affected range
            runs past the end of the array, a substitution has alleles of
            different length, or the alleles describe no change at all
            (identical alleles, or both '-').

    The alleles are first normalised by dropping any shared prefix (advancing
    ``pos`` once per dropped base). Insertions add rows with fractional indices
    ``pos + k * 1e-6`` taken from a per-position insertion counter; deletions
    remove the corresponding rows.

    If the sequence is currently reverse-complemented it is flipped to forward
    orientation for the edit and flipped back before returning, including on
    early return and on error.
    """
    flipped_for_edit = bool(self.rev)
    if flipped_for_edit:
        self.reverse_complement()

    def _locate(index, position):
        # searchsorted only yields an insertion point, so confirm the row found
        # there really carries the requested position.
        row = int(np.searchsorted(index, position))
        if row >= len(index) or index[row] != position:
            raise ValueError(f"Position {position} not found in index")
        return row

    try:
        # Normalize shared prefix. Stop as soon as either allele has collapsed to
        # the '-' placeholder; otherwise identical alleles ("A" -> "A") would
        # compare '-' with '-' forever.
        while ref and alt and ref != "-" and alt != "-" and ref[0] == alt[0]:
            pos += 1
            ref = ref[1:] or "-"
            alt = alt[1:] or "-"

        # Case 1: SNP or multi-base substitution
        if ref != "-" and alt != "-":
            if len(ref) != len(alt):
                raise ValueError("Substitution mutations must have alleles of equal length.")

            pos_idx = _locate(self.seq_array["index"], pos)
            end_idx = pos_idx + len(ref)
            if end_idx > len(self.seq_array):
                raise ValueError(f"Substitution range exceeds sequence length at position {pos}.")

            ref_segment = self.seq_array["ref"][pos_idx:end_idx]
            if not np.all(ref_segment == np.frombuffer(ref.encode(), dtype='S1')):
                actual_str = ref_segment.tobytes().decode()
                raise ValueError(f"Reference mismatch at position {pos}: expected '{ref}', found '{actual_str}'")
            self.seq_array["nt"][pos_idx:end_idx] = np.frombuffer(alt.encode(), dtype='S1')

        # Case 2: Insertion (ref is '-' means nothing was present, and we need to add bases)
        elif ref == "-" and alt != "-":
            if only_snps:
                return self  # Skip if indels are not allowed.
            # NOTE(review): the new rows sort immediately after `pos`. For a
            # VCF-style input such as (100, "A", "AT") normalisation has already
            # advanced pos to 101, so the bases land after 101 - confirm that this
            # is the intended coordinate convention.
            already_inserted = self.insertion_counters[pos]
            eps = 1e-6
            new_rows = np.array(
                [
                    (nt.encode(), pos + (already_inserted + i + 1) * eps, b"-", np.float32(np.nan), True)
                    for i, nt in enumerate(alt)
                ],
                dtype=self.seq_array.dtype,
            )
            # Append and re-sort by index instead of round-tripping every
            # existing row through a Python list.
            merged = np.concatenate([self.seq_array, new_rows])
            merged.sort(order="index")
            self.seq_array = merged
            self.insertion_counters[pos] += len(alt)

        # Case 3: Deletion (alt is '-' means bases are to be removed)
        elif alt == "-" and ref != "-":
            if only_snps:
                return self  # Skip if indels are not allowed.
            pos_idx = _locate(self.seq_array["index"], pos)
            end_idx = pos_idx + len(ref)
            if end_idx > len(self.seq_array):
                raise ValueError(f"Deletion range exceeds sequence length at position {pos}.")
            segment = self.seq_array["ref"][pos_idx:end_idx].tobytes().decode()
            if segment != ref:
                raise ValueError(
                    f"Reference mismatch for deletion at position {pos}: expected '{ref}', found '{segment}'")
            self.seq_array = np.delete(self.seq_array, np.s_[pos_idx:end_idx])
        else:
            raise ValueError("Unsupported mutation type. Provide valid ref and alt values.")

        self.seq_array["valid_mask"] = self.seq_array["nt"] != b"-"
    finally:
        # Always hand the object back in the orientation the caller gave us.
        if flipped_for_edit:
            self.reverse_complement()

    return self
|
|
259
|
+
|
|
260
|
+
def __getitem__(self, key: Union[int, slice]) -> np.ndarray:
    """
    Look up rows by position (``index`` value), not by array offset.

    Args:
        key: An integer position (Python ``int`` or NumPy integer scalar), or a
            slice whose ``start``/``stop`` are positions. Both slice bounds are
            inclusive, a missing bound is unbounded, and ``step`` is ignored.

    Returns:
        np.ndarray: The first row whose index equals ``key`` for an integer key,
        or all rows with ``start <= index <= stop`` for a slice.

    Raises:
        IndexError: If an integer position is not present.
        TypeError: If ``key`` is neither an integer nor a slice.
    """
    positions = self.seq_array["index"]

    # np.integer covers scalars such as np.int64, which are not instances of int.
    if isinstance(key, (int, np.integer)):
        hits = np.where(positions == key)[0]
        if hits.size == 0:
            raise IndexError(f"Position {key} not found in sequence.")
        return self.seq_array[hits[0]]

    if isinstance(key, slice):
        # A missing bound adds no constraint. This selects the same rows as
        # substituting min()/max(), but also works on an empty array.
        keep = np.ones(len(positions), dtype=bool)
        if key.start is not None:
            keep &= positions >= key.start
        if key.stop is not None:
            keep &= positions <= key.stop
        return self.seq_array[keep]

    raise TypeError("Indexing must be an integer or a slice.")
|
|
275
|
+
|
|
276
|
+
def complement(self) -> "SeqMat":
    """
    Return a clone whose ``nt`` column holds the complementary bases.

    A<->T and C<->G are swapped; '-' and 'N' map to themselves. Row order and
    every other column are left as produced by ``clone()``.

    Returns:
        SeqMat: A new instance; ``self`` is not modified.

    Raises:
        KeyError: If ``nt`` contains any symbol other than A, C, G, T, N or '-'.
    """
    bases = b"ATCG-N"
    partners = b"TAGC-N"

    raw = self.seq_array["nt"].tobytes()
    # Keep the strictness of a dict lookup: reject symbols without a known
    # complement instead of passing them through untouched.
    unsupported = set(raw) - set(bases)
    if unsupported:
        raise KeyError(bytes([min(unsupported)]))

    # One translate pass over the raw bytes replaces a per-base Python-level
    # lookup, which matters because sequences can be long and this runs on
    # every reverse_complement().
    comp_seq = np.frombuffer(raw.translate(bytes.maketrans(bases, partners)), dtype="S1")

    new_instance = self.clone()
    new_instance.seq_array["nt"] = comp_seq
    return new_instance
|
|
282
|
+
|
|
283
|
+
def reverse_complement(self) -> "SeqMat":
    """
    Reverse-complement this sequence in place and toggle ``rev``.

    Unlike ``complement()``, which returns a new object, this replaces
    ``self.seq_array`` with a reversed copy of the complemented rows.

    Returns:
        SeqMat: ``self``.
    """
    complemented = self.complement()
    flipped_rows = complemented.seq_array[::-1]
    self.seq_array = flipped_rows.copy()
    self.rev = not self.rev
    return self
|
|
288
|
+
|
|
289
|
+
# def splice_out(self, introns: List[Tuple[int, int]]) -> "SeqMat":
|
|
290
|
+
# """
|
|
291
|
+
# Splices out regions from the sequence corresponding to the given intron boundaries.
|
|
292
|
+
#
|
|
293
|
+
# Args:
|
|
294
|
+
# introns (List[Tuple[int, int]]): List of (start, end) intron boundaries to remove.
|
|
295
|
+
# Coordinates should match the 'index' field.
|
|
296
|
+
#
|
|
297
|
+
# Returns:
|
|
298
|
+
# SeqMat: A new instance with the intron regions removed.
|
|
299
|
+
# """
|
|
300
|
+
# mask = np.ones(len(self.seq_array), dtype=bool)
|
|
301
|
+
#
|
|
302
|
+
# for start, end in introns:
|
|
303
|
+
# mask &= ~((self.seq_array["index"] >= start) & (self.seq_array["index"] <= end))
|
|
304
|
+
#
|
|
305
|
+
# new_instance = self.clone()
|
|
306
|
+
# new_instance.seq_array = self.seq_array[mask].copy()
|
|
307
|
+
# return new_instance
|
|
308
|
+
|
|
309
|
+
def cut_out(self, introns: List[Tuple[int, int]]) -> "SeqMat":
    """
    Return a clone with the rows strictly between each pair of boundaries removed.

    Each pair may be given in either order, so the same coordinates work for
    forward and reverse-complemented sequences. The two boundary positions
    themselves are kept: only rows with
    ``min(pair) + 1 <= index <= max(pair) - 1`` are dropped.

    Args:
        introns (List[Tuple[int, int]]): (start, end) boundary pairs expressed
            in the same coordinates as the ``index`` field.

    Returns:
        SeqMat: A new instance without the rows inside the given regions.
    """
    positions = self.seq_array["index"]
    keep = np.ones(len(self.seq_array), dtype=bool)

    for first, second in introns:
        # min/max makes the pair orientation-agnostic, so no strand-specific
        # reordering is needed beforehand.
        lo = min(first, second) + 1
        hi = max(first, second) - 1
        inside = (positions >= lo) & (positions <= hi)
        keep &= ~inside

    new_instance = self.clone()
    new_instance.seq_array = self.seq_array[keep].copy()
    return new_instance
|
|
336
|
+
|
|
337
|
+
def open_reading_frame(self, tis: int) -> "SeqMat":
    """
    Extract the open reading frame that starts at the translation initiation
    site (TIS) and ends at the first in-frame stop codon (TAA, TAG or TGA).

    Args:
        tis (int): Position of the start codon, in ``index`` coordinates.

    Returns:
        SeqMat: A clone covering TIS through the last base of the stop codon.
        If ``tis`` is not present in the index, a warning is printed and
        ``self.clone(start=0, end=3)`` is returned instead.

    Raises:
        ValueError: If no in-frame stop codon follows the TIS.
    """
    table = self.seq_array
    positions = table["index"]

    if tis not in positions:
        print(f"Warning: TIS position {tis} not found, returning default.")
        return self.clone(start=0, end=3)

    # Keep the TIS row and everything after it in array order: on the reverse
    # strand those are the rows at or below the TIS position.
    if self.rev:
        downstream = table[positions <= tis]
    else:
        downstream = table[positions >= tis]
    bases = downstream["nt"].tobytes().decode()

    stop_codons = ("TAA", "TAG", "TGA")
    for offset in range(0, len(bases) - 2, 3):
        if bases[offset:offset + 3] not in stop_codons:
            continue
        # clone() filters by position range, so order the two ends first.
        first_pos = downstream["index"][0]
        last_pos = downstream["index"][offset + 2]
        lo, hi = sorted((first_pos, last_pos))
        return self.clone(start=lo, end=hi)

    raise ValueError("No in-frame stop codon found after the TIS.")
|
|
368
|
+
|
|
369
|
+
def predict_splicing(self, position: int, engine='spliceai', context=7500, inplace=False):
    """
    Predict splice-site probabilities in a window centred on ``position``.

    The window spans ``position - context`` to ``position + context``. Where the
    sequence does not reach that far, the window is padded with 'N' (and with
    extrapolated positions) so that it always has ``context`` bases on each side
    of the centre before being handed to the engine.

    Args:
        position (int): The position to centre the prediction window on.
        engine (str): Name of the prediction engine, passed through to
            ``run_splicing_engine``.
        context (int): Number of bases requested on each side of ``position``
            (default: 7500).
        inplace (bool): If True, store the result on ``self.predicted_splicing``
            and return ``self``; otherwise return the DataFrame.

    Returns:
        pd.DataFrame or SeqMat: A DataFrame indexed by ``position`` with columns
        ``donor_prob``, ``acceptor_prob`` (both rounded to 3 decimals) and
        ``nucleotides``, or ``self`` when ``inplace`` is True.

    Raises:
        IndexError: If no part of the sequence falls inside the requested window.
    """
    # Fixed number of bases trimmed from each end after prediction.
    # NOTE(review): presumably this matches the flank the engine consumes, so the
    # returned probabilities line up with the trimmed window - confirm against
    # run_splicing_engine.
    flank = 5000

    target = self.clone(position - context, position + context)
    seq, indices = target.seq, target.index
    if len(indices) == 0:
        raise IndexError(f"Position {position} not found in sequence.")

    # Centre on the row closest to the requested position.
    rel_pos = int(np.abs(indices - position).argmin())

    # Bases available left of the centre: rel_pos. Bases available right of it:
    # len(seq) - rel_pos - 1 (the centre itself is not part of either side).
    left_missing = max(0, context - rel_pos)
    right_missing = max(0, context - (len(seq) - rel_pos - 1))

    if left_missing > 0 or right_missing > 0:
        # Rows run in descending position order on the reverse strand.
        step = -1 if self.rev else 1

        if left_missing > 0:
            left_pad = np.arange(indices[0] - step * left_missing, indices[0], step)
        else:
            left_pad = np.array([], dtype=indices.dtype)

        if right_missing > 0:
            right_pad = np.arange(indices[-1] + step, indices[-1] + step * (right_missing + 1), step)
        else:
            right_pad = np.array([], dtype=indices.dtype)

        seq = 'N' * left_missing + seq + 'N' * right_missing
        indices = np.concatenate([left_pad, indices, right_pad])

    # Imported lazily, at call time rather than module import time.
    from .splicing_utils import run_splicing_engine
    donor_probs, acceptor_probs = run_splicing_engine(seq, engine)

    # Trim off the fixed flanks before returning results.
    seq = seq[flank:-flank]
    indices = indices[flank:-flank]
    df = pd.DataFrame({
        'position': indices,
        'donor_prob': donor_probs,
        'acceptor_prob': acceptor_probs,
        'nucleotides': list(seq)
    }).set_index('position').round(3)

    df.attrs['name'] = self.name
    if inplace:
        self.predicted_splicing = df
        return self
    return df
|
|
441
|
+
|
|
@@ -3,15 +3,15 @@ geney/Gene.py,sha256=6x1sEZV50Il4oydegW6iHIF12EZTGexniG3YUD-3DfM,7036
|
|
|
3
3
|
geney/Oncosplice.py,sha256=ETAvMl_Oq6mEJQHPNwdDO5csX6Ahuped_om10KifCyM,17739
|
|
4
4
|
geney/SeqMats.py,sha256=9-eJnfU2w3LGc0XvVvFEO_QrBneTkC6xkZKDfTcEw5o,19282
|
|
5
5
|
geney/SpliceSimulator.py,sha256=iF6feVeSnsKFmn3WV60CgWLI0_rSLgpq5fVFL1IOv_4,18491
|
|
6
|
-
geney/Transcript.py,sha256=
|
|
6
|
+
geney/Transcript.py,sha256=_DhKQ-UnyFDPb4Cu-8sQPWvLd-kKj4ZEJq6KBntFVGE,14467
|
|
7
7
|
geney/__init__.py,sha256=YLWXJS53yeryp6nVhCgFg3_Du9Guj9y3iSrdfx61q5Y,3017
|
|
8
8
|
geney/_config_setup.py,sha256=nblcGU3HIt8YjdrAoGfbEVKRxwJKv0PikJ5-7AL6axQ,723
|
|
9
9
|
geney/_graphic_utils.py,sha256=oMsBpB9YeEn96gGpKh4MmtagJffWZbk-xPrIwHvkFhA,11016
|
|
10
10
|
geney/_gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
|
|
11
11
|
geney/_immune_utils.py,sha256=b-8dRcCti7xsU7RG3op18lkSnAD8dp_BymGaR-hbNcI,5272
|
|
12
12
|
geney/_mutation_utils.py,sha256=dHssUsnii_mf-wuRoMmF13UlD7k3ml_VwQMItTYnXpU,1132
|
|
13
|
-
geney/_oncosplice.py,sha256=
|
|
14
|
-
geney/_splicing_utils.py,sha256=
|
|
13
|
+
geney/_oncosplice.py,sha256=qrIqo3HAZAnzhtIGgv7EnwCE5YkdTFSwWoiSZLBzCpg,35530
|
|
14
|
+
geney/_splicing_utils.py,sha256=7j5YC9CrWWFfct8hDdXyxYIqRtCPI4TqxA1cyAMhyy8,31476
|
|
15
15
|
geney/_survival_utils.py,sha256=KnAzEviMuXh6SnVXId9PgsFLSbgkduTvYoIthxN7FPA,6886
|
|
16
16
|
geney/_tcga_utils.py,sha256=uJhVnTbTysj0XrEw_YeDKRSLexsqgBLYQdhl7_hnr64,17611
|
|
17
17
|
geney/_tis_utils.py,sha256=la0CZroaKe5RgAyFd4Bf_DqQncklWgAY2823xVst98o,7813
|
|
@@ -37,7 +37,8 @@ geney/translation_initiation/tis_utils.py,sha256=AF3siFjuQH-Rs44EV-80zHdbxRMvN4w
|
|
|
37
37
|
geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
|
|
38
38
|
geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
|
|
39
39
|
geney/utils/Fasta_segment.py,sha256=weB5NJ65P0XiyAJCiCHx4T9sHC1pWLpuQeOy0B85gyg,11364
|
|
40
|
-
geney/utils/SeqMats.py,sha256=
|
|
40
|
+
geney/utils/SeqMats.py,sha256=2tJYPGy-cCCaANRbiYkki5yNxnhgzysDQGWjRYRSnlQ,8767
|
|
41
|
+
geney/utils/SeqMatsOld.py,sha256=syRU5DAuTh3xUfGW_qP9wlcBO5pHsG_y5PlrfXTIxUY,18502
|
|
41
42
|
geney/utils/TranscriptLibrary.py,sha256=ma_ZVPgglxXDDneEvdqxxeqxG8eSFL-zgLUXyC6BqY8,2070
|
|
42
43
|
geney/utils/__init__.py,sha256=-nJ-DMx1JzP-ZCe_QuQCeM0ZYIT_16jxoXDhUaO_4Oc,714
|
|
43
44
|
geney/utils/mutation_utils.py,sha256=r-pHr56gEa5kh_DPX8MjFY3ZfYaOtyo4CUfJ5ZHlXPw,3243
|
|
@@ -45,7 +46,7 @@ geney/utils/pangolin_utils.py,sha256=JQSPbWxdzqGFYfWQktkfLMaMSGR28eGQhNzO7MLMe5M
|
|
|
45
46
|
geney/utils/spliceai_utils.py,sha256=VtrIbjyQxk_3lw86eWjftRYyal9OzxArJ0GV5u_ymTg,2721
|
|
46
47
|
geney/utils/splicing_utils.py,sha256=vPCGnCPR1ooEZEHR79yFHLmRQXEJHXEQjjxpBR-YWOs,20635
|
|
47
48
|
geney/utils/utils.py,sha256=m51Vd0cEbrcIHo6_8BAuI9YSPcKRs22e5LfVd2Qj6Is,2181
|
|
48
|
-
geney-1.4.
|
|
49
|
-
geney-1.4.
|
|
50
|
-
geney-1.4.
|
|
51
|
-
geney-1.4.
|
|
49
|
+
geney-1.4.18.dist-info/METADATA,sha256=WLdB6CQyOBWNDYLU05Gyj6o7kaNzA5qr7ROD0mzTgm4,990
|
|
50
|
+
geney-1.4.18.dist-info/WHEEL,sha256=AHX6tWk3qWuce7vKLrj7lnulVHEdWoltgauo8bgCXgU,109
|
|
51
|
+
geney-1.4.18.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
|
|
52
|
+
geney-1.4.18.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|