geney 1.4.17__py2.py3-none-any.whl → 1.4.19__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of geney might be problematic. Click here for more details.
geney/utils/SeqMats.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
__all__ = ['SeqMat', 'format_mut_id']
|
|
2
4
|
|
|
3
5
|
|
|
4
|
-
from __future__ import annotations
|
|
5
6
|
from dataclasses import dataclass, field
|
|
6
7
|
from typing import List, Tuple, Union, Optional
|
|
7
8
|
from collections import defaultdict
|
|
@@ -29,7 +30,6 @@ def format_mut_id(text):
|
|
|
29
30
|
return None
|
|
30
31
|
|
|
31
32
|
|
|
32
|
-
|
|
33
33
|
@dataclass(slots=True)
|
|
34
34
|
class SeqMat:
|
|
35
35
|
"""Represents a genomic sequence matrix used for training."""
|
|
@@ -42,21 +42,20 @@ class SeqMat:
|
|
|
42
42
|
insertion_counters: dict = field(default_factory=lambda: defaultdict(int), init=False, repr=False)
|
|
43
43
|
rev: bool = field(default=False, init=False, repr=False)
|
|
44
44
|
predicted_splicing: pd.DataFrame = field(init=False, repr=False)
|
|
45
|
-
_pos_to_idx: dict = field(default_factory=dict, init=False, repr=False)
|
|
46
45
|
|
|
47
46
|
def __init__(
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
47
|
+
self,
|
|
48
|
+
nucleotides: str,
|
|
49
|
+
index: np.ndarray,
|
|
50
|
+
conservation: Optional[np.ndarray] = None,
|
|
51
|
+
reference_nucleotides: Optional[np.ndarray] = None,
|
|
52
|
+
notes: Optional[dict] = None,
|
|
53
|
+
source: Optional[str] = None,
|
|
54
|
+
rev: Optional[bool] = False,
|
|
55
|
+
name: Optional[str] = 'wild_type',
|
|
56
|
+
version: Optional[str] = 'none'
|
|
58
57
|
) -> None:
|
|
59
|
-
#
|
|
58
|
+
# Metadata
|
|
60
59
|
self.name = name
|
|
61
60
|
self.version = version
|
|
62
61
|
self.source = source or "Unknown"
|
|
@@ -85,19 +84,10 @@ class SeqMat:
|
|
|
85
84
|
self.seq_array['nt'] = nts
|
|
86
85
|
self.seq_array['ref'] = nts if reference_nucleotides is None else np.array(reference_nucleotides, dtype='S1')
|
|
87
86
|
self.seq_array['index'] = index
|
|
88
|
-
self.seq_array['cons'] =
|
|
87
|
+
self.seq_array['cons'] = np.zeros(L, dtype='f4') if conservation is None else conservation
|
|
89
88
|
self.seq_array['valid_mask'] = self.seq_array['nt'] != b'-'
|
|
90
89
|
|
|
91
|
-
# Initialize helpers
|
|
92
90
|
self.insertion_counters = defaultdict(int)
|
|
93
|
-
self._build_index_map()
|
|
94
|
-
|
|
95
|
-
def _build_index_map(self):
|
|
96
|
-
"""Rebuild position-to-index lookup."""
|
|
97
|
-
self._pos_to_idx = {float(pos): i for i, pos in enumerate(self.seq_array['index'])}
|
|
98
|
-
|
|
99
|
-
def __len__(self) -> int:
|
|
100
|
-
return int(self.seq_array['valid_mask'].sum())
|
|
101
91
|
|
|
102
92
|
@property
|
|
103
93
|
def seq(self) -> str:
|
|
@@ -113,7 +103,6 @@ class SeqMat:
|
|
|
113
103
|
|
|
114
104
|
def clone(self, start: Optional[float] = None, end: Optional[float] = None) -> SeqMat:
|
|
115
105
|
new = SeqMat.__new__(SeqMat)
|
|
116
|
-
# copy metadata
|
|
117
106
|
new.name = self.name
|
|
118
107
|
new.version = self.version
|
|
119
108
|
new.source = self.source
|
|
@@ -122,100 +111,96 @@ class SeqMat:
|
|
|
122
111
|
new.predicted_splicing = None
|
|
123
112
|
new.insertion_counters = defaultdict(int)
|
|
124
113
|
|
|
125
|
-
# slice or full copy
|
|
126
114
|
if start is not None and end is not None:
|
|
127
115
|
mask = (self.seq_array['index'] >= start) & (self.seq_array['index'] <= end)
|
|
128
116
|
new.seq_array = self.seq_array[mask].copy()
|
|
129
117
|
else:
|
|
130
118
|
new.seq_array = self.seq_array.copy()
|
|
131
119
|
|
|
132
|
-
new.
|
|
120
|
+
new.seq_array['valid_mask'] = new.seq_array['nt'] != b'-'
|
|
133
121
|
return new
|
|
134
122
|
|
|
135
|
-
def
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
123
|
+
def apply_mutations(
|
|
124
|
+
self,
|
|
125
|
+
mutations: Union[Tuple[float, str, str], List[Tuple[float, str, str]]],
|
|
126
|
+
only_snps: bool = False
|
|
127
|
+
) -> SeqMat:
|
|
128
|
+
"""
|
|
129
|
+
Apply one or a batch of mutations (pos, ref, alt) efficiently:
|
|
130
|
+
- Supports a single tuple or a list of tuples
|
|
131
|
+
- Assumes mutations sorted by position for vectorized searchsorted
|
|
132
|
+
"""
|
|
133
|
+
# Normalize to list
|
|
134
|
+
if isinstance(mutations, tuple) and len(mutations) == 3:
|
|
135
|
+
mutations = [mutations]
|
|
136
|
+
elif not isinstance(mutations, list):
|
|
137
|
+
raise TypeError("mutations must be a tuple or list of tuples")
|
|
138
|
+
|
|
139
|
+
# Left-normalize and bucket
|
|
140
|
+
subs, ins, dels = [], [], []
|
|
141
|
+
for pos, ref, alt in mutations:
|
|
142
|
+
while ref and alt and ref[0] == alt[0]:
|
|
143
|
+
pos += 1
|
|
144
|
+
ref = ref[1:] or '-'
|
|
145
|
+
alt = alt[1:] or '-'
|
|
146
|
+
if ref != '-' and alt != '-':
|
|
147
|
+
subs.append((pos, ref, alt))
|
|
148
|
+
elif ref == '-' and alt != '-' and not only_snps:
|
|
149
|
+
ins.append((pos, alt))
|
|
150
|
+
elif alt == '-' and ref != '-' and not only_snps:
|
|
151
|
+
dels.append((pos, ref))
|
|
152
|
+
else:
|
|
153
|
+
raise ValueError(f"Unsupported mutation {pos}:{ref}:{alt}.")
|
|
154
|
+
|
|
155
|
+
# Ensure seq_array indices sorted
|
|
156
|
+
coords = self.seq_array['index']
|
|
157
|
+
|
|
158
|
+
# 1) Bulk substitutions
|
|
159
|
+
if subs:
|
|
160
|
+
subs.sort(key=lambda x: x[0])
|
|
161
|
+
positions = np.array([p for p, _, _ in subs], dtype=coords.dtype)
|
|
162
|
+
idxs = np.searchsorted(coords, positions)
|
|
163
|
+
for (pos, ref, alt), idx in zip(subs, idxs):
|
|
164
|
+
length = len(ref)
|
|
165
|
+
if not np.all(self.seq_array['ref'][idx:idx + length] == np.frombuffer(ref.encode(), dtype='S1')):
|
|
166
|
+
actual = self.seq_array['ref'][idx:idx + length].tobytes().decode()
|
|
167
|
+
raise ValueError(f"Ref mismatch at {pos}: expected {ref}, found {actual}")
|
|
168
|
+
self.seq_array['nt'][idx:idx + length] = np.frombuffer(alt.encode(), dtype='S1')
|
|
169
|
+
|
|
170
|
+
# 2) Bulk insertions
|
|
171
|
+
if ins:
|
|
172
|
+
ins.sort(key=lambda x: x[0])
|
|
173
|
+
positions = np.array([p for p, _ in ins], dtype=coords.dtype)
|
|
174
|
+
idxs = np.searchsorted(coords, positions)
|
|
173
175
|
new_rows = []
|
|
174
|
-
for
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
self.
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
else:
|
|
198
|
-
raise ValueError("Unsupported mutation type.")
|
|
199
|
-
|
|
200
|
-
# update mask & index map
|
|
176
|
+
for (pos, alt), idx in zip(ins, idxs):
|
|
177
|
+
cnt = self.insertion_counters[pos]
|
|
178
|
+
eps = 1e-6
|
|
179
|
+
for i, nt in enumerate(alt):
|
|
180
|
+
new_idx = pos + (cnt + i + 1) * eps
|
|
181
|
+
new_rows.append((nt.encode(), new_idx, b'-', np.nan, True))
|
|
182
|
+
self.insertion_counters[pos] += len(alt)
|
|
183
|
+
merged = np.concatenate([self.seq_array, np.array(new_rows, dtype=self.seq_array.dtype)])
|
|
184
|
+
merged.sort(order='index')
|
|
185
|
+
self.seq_array = merged
|
|
186
|
+
|
|
187
|
+
# 3) Bulk deletions
|
|
188
|
+
if dels:
|
|
189
|
+
dels.sort(key=lambda x: x[0])
|
|
190
|
+
positions = np.array([p for p, _ in dels], dtype=coords.dtype)
|
|
191
|
+
idxs = np.searchsorted(self.seq_array['index'], positions)
|
|
192
|
+
mask = np.ones(len(self.seq_array), dtype=bool)
|
|
193
|
+
for (pos, ref), idx in zip(dels, idxs):
|
|
194
|
+
length = len(ref)
|
|
195
|
+
mask[idx:idx + length] = False
|
|
196
|
+
self.seq_array = self.seq_array[mask]
|
|
197
|
+
|
|
198
|
+
# Finalize valid mask
|
|
201
199
|
self.seq_array['valid_mask'] = self.seq_array['nt'] != b'-'
|
|
202
|
-
self._build_index_map()
|
|
203
|
-
|
|
204
|
-
# restore orientation
|
|
205
|
-
if self.rev:
|
|
206
|
-
self.reverse_complement()
|
|
207
200
|
return self
|
|
208
201
|
|
|
209
|
-
def _insert_rows(self, idx: int, rows: List[tuple]):
|
|
210
|
-
"""Helper to insert new rows efficiently and resort."""
|
|
211
|
-
arr = self.seq_array.tolist()
|
|
212
|
-
arr[idx:idx] = rows
|
|
213
|
-
new = np.array(arr, dtype=self.seq_array.dtype)
|
|
214
|
-
new.sort(order='index')
|
|
215
|
-
self.seq_array = new
|
|
216
|
-
|
|
217
202
|
def complement(self) -> SeqMat:
|
|
218
|
-
comp = {b'A':b'T', b'T':b'A', b'C':b'G', b'G':b'C', b'-':b'-'}
|
|
203
|
+
comp = {b'A': b'T', b'T': b'A', b'C': b'G', b'G': b'C', b'-': b'-'}
|
|
219
204
|
nts = np.array([comp[x] for x in self.seq_array['nt']], dtype='S1')
|
|
220
205
|
new = self.clone()
|
|
221
206
|
new.seq_array['nt'] = nts
|
|
@@ -228,15 +213,15 @@ class SeqMat:
|
|
|
228
213
|
return new
|
|
229
214
|
|
|
230
215
|
def __getitem__(self, key: Union[int, slice]) -> np.ndarray:
|
|
231
|
-
|
|
216
|
+
coords = self.seq_array['index']
|
|
232
217
|
if isinstance(key, int):
|
|
233
|
-
idx =
|
|
234
|
-
if idx
|
|
218
|
+
idx = np.searchsorted(coords, key)
|
|
219
|
+
if idx >= len(coords) or coords[idx] != key:
|
|
235
220
|
raise KeyError(f"Position {key} not found.")
|
|
236
221
|
return self.seq_array[idx]
|
|
237
222
|
if isinstance(key, slice):
|
|
238
|
-
start = key.start or
|
|
239
|
-
stop = key.stop or
|
|
240
|
-
mask = (
|
|
223
|
+
start = key.start or coords.min()
|
|
224
|
+
stop = key.stop or coords.max()
|
|
225
|
+
mask = (coords >= start) & (coords <= stop)
|
|
241
226
|
return self.seq_array[mask]
|
|
242
227
|
raise TypeError("Invalid index type.")
|
|
@@ -37,7 +37,7 @@ geney/translation_initiation/tis_utils.py,sha256=AF3siFjuQH-Rs44EV-80zHdbxRMvN4w
|
|
|
37
37
|
geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
|
|
38
38
|
geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
|
|
39
39
|
geney/utils/Fasta_segment.py,sha256=weB5NJ65P0XiyAJCiCHx4T9sHC1pWLpuQeOy0B85gyg,11364
|
|
40
|
-
geney/utils/SeqMats.py,sha256
|
|
40
|
+
geney/utils/SeqMats.py,sha256=q858gWPsSoS4HUr6FD1CHYuUh5AE5u9KePHYT7FQw7g,8777
|
|
41
41
|
geney/utils/SeqMatsOld.py,sha256=syRU5DAuTh3xUfGW_qP9wlcBO5pHsG_y5PlrfXTIxUY,18502
|
|
42
42
|
geney/utils/TranscriptLibrary.py,sha256=ma_ZVPgglxXDDneEvdqxxeqxG8eSFL-zgLUXyC6BqY8,2070
|
|
43
43
|
geney/utils/__init__.py,sha256=-nJ-DMx1JzP-ZCe_QuQCeM0ZYIT_16jxoXDhUaO_4Oc,714
|
|
@@ -46,7 +46,7 @@ geney/utils/pangolin_utils.py,sha256=JQSPbWxdzqGFYfWQktkfLMaMSGR28eGQhNzO7MLMe5M
|
|
|
46
46
|
geney/utils/spliceai_utils.py,sha256=VtrIbjyQxk_3lw86eWjftRYyal9OzxArJ0GV5u_ymTg,2721
|
|
47
47
|
geney/utils/splicing_utils.py,sha256=vPCGnCPR1ooEZEHR79yFHLmRQXEJHXEQjjxpBR-YWOs,20635
|
|
48
48
|
geney/utils/utils.py,sha256=m51Vd0cEbrcIHo6_8BAuI9YSPcKRs22e5LfVd2Qj6Is,2181
|
|
49
|
-
geney-1.4.
|
|
50
|
-
geney-1.4.
|
|
51
|
-
geney-1.4.
|
|
52
|
-
geney-1.4.
|
|
49
|
+
geney-1.4.19.dist-info/METADATA,sha256=BIX1e0_0ypaE0e5prBztdgdzp0S1UutUqJufy-qmpZs,990
|
|
50
|
+
geney-1.4.19.dist-info/WHEEL,sha256=AHX6tWk3qWuce7vKLrj7lnulVHEdWoltgauo8bgCXgU,109
|
|
51
|
+
geney-1.4.19.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
|
|
52
|
+
geney-1.4.19.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|