geney 1.4.18__py2.py3-none-any.whl → 1.4.19__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of geney might be problematic. Click here for more details.
geney/utils/SeqMats.py
CHANGED
|
@@ -30,7 +30,6 @@ def format_mut_id(text):
|
|
|
30
30
|
return None
|
|
31
31
|
|
|
32
32
|
|
|
33
|
-
|
|
34
33
|
@dataclass(slots=True)
|
|
35
34
|
class SeqMat:
|
|
36
35
|
"""Represents a genomic sequence matrix used for training."""
|
|
@@ -43,21 +42,20 @@ class SeqMat:
|
|
|
43
42
|
insertion_counters: dict = field(default_factory=lambda: defaultdict(int), init=False, repr=False)
|
|
44
43
|
rev: bool = field(default=False, init=False, repr=False)
|
|
45
44
|
predicted_splicing: pd.DataFrame = field(init=False, repr=False)
|
|
46
|
-
_pos_to_idx: dict = field(default_factory=dict, init=False, repr=False)
|
|
47
45
|
|
|
48
46
|
def __init__(
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
47
|
+
self,
|
|
48
|
+
nucleotides: str,
|
|
49
|
+
index: np.ndarray,
|
|
50
|
+
conservation: Optional[np.ndarray] = None,
|
|
51
|
+
reference_nucleotides: Optional[np.ndarray] = None,
|
|
52
|
+
notes: Optional[dict] = None,
|
|
53
|
+
source: Optional[str] = None,
|
|
54
|
+
rev: Optional[bool] = False,
|
|
55
|
+
name: Optional[str] = 'wild_type',
|
|
56
|
+
version: Optional[str] = 'none'
|
|
59
57
|
) -> None:
|
|
60
|
-
#
|
|
58
|
+
# Metadata
|
|
61
59
|
self.name = name
|
|
62
60
|
self.version = version
|
|
63
61
|
self.source = source or "Unknown"
|
|
@@ -86,19 +84,10 @@ class SeqMat:
|
|
|
86
84
|
self.seq_array['nt'] = nts
|
|
87
85
|
self.seq_array['ref'] = nts if reference_nucleotides is None else np.array(reference_nucleotides, dtype='S1')
|
|
88
86
|
self.seq_array['index'] = index
|
|
89
|
-
self.seq_array['cons'] =
|
|
87
|
+
self.seq_array['cons'] = np.zeros(L, dtype='f4') if conservation is None else conservation
|
|
90
88
|
self.seq_array['valid_mask'] = self.seq_array['nt'] != b'-'
|
|
91
89
|
|
|
92
|
-
# Initialize helpers
|
|
93
90
|
self.insertion_counters = defaultdict(int)
|
|
94
|
-
self._build_index_map()
|
|
95
|
-
|
|
96
|
-
def _build_index_map(self):
|
|
97
|
-
"""Rebuild position-to-index lookup."""
|
|
98
|
-
self._pos_to_idx = {float(pos): i for i, pos in enumerate(self.seq_array['index'])}
|
|
99
|
-
|
|
100
|
-
def __len__(self) -> int:
|
|
101
|
-
return int(self.seq_array['valid_mask'].sum())
|
|
102
91
|
|
|
103
92
|
@property
|
|
104
93
|
def seq(self) -> str:
|
|
@@ -114,7 +103,6 @@ class SeqMat:
|
|
|
114
103
|
|
|
115
104
|
def clone(self, start: Optional[float] = None, end: Optional[float] = None) -> SeqMat:
|
|
116
105
|
new = SeqMat.__new__(SeqMat)
|
|
117
|
-
# copy metadata
|
|
118
106
|
new.name = self.name
|
|
119
107
|
new.version = self.version
|
|
120
108
|
new.source = self.source
|
|
@@ -123,100 +111,96 @@ class SeqMat:
|
|
|
123
111
|
new.predicted_splicing = None
|
|
124
112
|
new.insertion_counters = defaultdict(int)
|
|
125
113
|
|
|
126
|
-
# slice or full copy
|
|
127
114
|
if start is not None and end is not None:
|
|
128
115
|
mask = (self.seq_array['index'] >= start) & (self.seq_array['index'] <= end)
|
|
129
116
|
new.seq_array = self.seq_array[mask].copy()
|
|
130
117
|
else:
|
|
131
118
|
new.seq_array = self.seq_array.copy()
|
|
132
119
|
|
|
133
|
-
new.
|
|
120
|
+
new.seq_array['valid_mask'] = new.seq_array['nt'] != b'-'
|
|
134
121
|
return new
|
|
135
122
|
|
|
136
|
-
def
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
123
|
+
def apply_mutations(
|
|
124
|
+
self,
|
|
125
|
+
mutations: Union[Tuple[float, str, str], List[Tuple[float, str, str]]],
|
|
126
|
+
only_snps: bool = False
|
|
127
|
+
) -> SeqMat:
|
|
128
|
+
"""
|
|
129
|
+
Apply one or a batch of mutations (pos, ref, alt) efficiently:
|
|
130
|
+
- Supports a single tuple or a list of tuples
|
|
131
|
+
- Assumes mutations sorted by position for vectorized searchsorted
|
|
132
|
+
"""
|
|
133
|
+
# Normalize to list
|
|
134
|
+
if isinstance(mutations, tuple) and len(mutations) == 3:
|
|
135
|
+
mutations = [mutations]
|
|
136
|
+
elif not isinstance(mutations, list):
|
|
137
|
+
raise TypeError("mutations must be a tuple or list of tuples")
|
|
138
|
+
|
|
139
|
+
# Left-normalize and bucket
|
|
140
|
+
subs, ins, dels = [], [], []
|
|
141
|
+
for pos, ref, alt in mutations:
|
|
142
|
+
while ref and alt and ref[0] == alt[0]:
|
|
143
|
+
pos += 1
|
|
144
|
+
ref = ref[1:] or '-'
|
|
145
|
+
alt = alt[1:] or '-'
|
|
146
|
+
if ref != '-' and alt != '-':
|
|
147
|
+
subs.append((pos, ref, alt))
|
|
148
|
+
elif ref == '-' and alt != '-' and not only_snps:
|
|
149
|
+
ins.append((pos, alt))
|
|
150
|
+
elif alt == '-' and ref != '-' and not only_snps:
|
|
151
|
+
dels.append((pos, ref))
|
|
152
|
+
else:
|
|
153
|
+
raise ValueError(f"Unsupported mutation {pos}:{ref}:{alt}.")
|
|
154
|
+
|
|
155
|
+
# Ensure seq_array indices sorted
|
|
156
|
+
coords = self.seq_array['index']
|
|
157
|
+
|
|
158
|
+
# 1) Bulk substitutions
|
|
159
|
+
if subs:
|
|
160
|
+
subs.sort(key=lambda x: x[0])
|
|
161
|
+
positions = np.array([p for p, _, _ in subs], dtype=coords.dtype)
|
|
162
|
+
idxs = np.searchsorted(coords, positions)
|
|
163
|
+
for (pos, ref, alt), idx in zip(subs, idxs):
|
|
164
|
+
length = len(ref)
|
|
165
|
+
if not np.all(self.seq_array['ref'][idx:idx + length] == np.frombuffer(ref.encode(), dtype='S1')):
|
|
166
|
+
actual = self.seq_array['ref'][idx:idx + length].tobytes().decode()
|
|
167
|
+
raise ValueError(f"Ref mismatch at {pos}: expected {ref}, found {actual}")
|
|
168
|
+
self.seq_array['nt'][idx:idx + length] = np.frombuffer(alt.encode(), dtype='S1')
|
|
169
|
+
|
|
170
|
+
# 2) Bulk insertions
|
|
171
|
+
if ins:
|
|
172
|
+
ins.sort(key=lambda x: x[0])
|
|
173
|
+
positions = np.array([p for p, _ in ins], dtype=coords.dtype)
|
|
174
|
+
idxs = np.searchsorted(coords, positions)
|
|
174
175
|
new_rows = []
|
|
175
|
-
for
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
self.
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
else:
|
|
199
|
-
raise ValueError("Unsupported mutation type.")
|
|
200
|
-
|
|
201
|
-
# update mask & index map
|
|
176
|
+
for (pos, alt), idx in zip(ins, idxs):
|
|
177
|
+
cnt = self.insertion_counters[pos]
|
|
178
|
+
eps = 1e-6
|
|
179
|
+
for i, nt in enumerate(alt):
|
|
180
|
+
new_idx = pos + (cnt + i + 1) * eps
|
|
181
|
+
new_rows.append((nt.encode(), new_idx, b'-', np.nan, True))
|
|
182
|
+
self.insertion_counters[pos] += len(alt)
|
|
183
|
+
merged = np.concatenate([self.seq_array, np.array(new_rows, dtype=self.seq_array.dtype)])
|
|
184
|
+
merged.sort(order='index')
|
|
185
|
+
self.seq_array = merged
|
|
186
|
+
|
|
187
|
+
# 3) Bulk deletions
|
|
188
|
+
if dels:
|
|
189
|
+
dels.sort(key=lambda x: x[0])
|
|
190
|
+
positions = np.array([p for p, _ in dels], dtype=coords.dtype)
|
|
191
|
+
idxs = np.searchsorted(self.seq_array['index'], positions)
|
|
192
|
+
mask = np.ones(len(self.seq_array), dtype=bool)
|
|
193
|
+
for (pos, ref), idx in zip(dels, idxs):
|
|
194
|
+
length = len(ref)
|
|
195
|
+
mask[idx:idx + length] = False
|
|
196
|
+
self.seq_array = self.seq_array[mask]
|
|
197
|
+
|
|
198
|
+
# Finalize valid mask
|
|
202
199
|
self.seq_array['valid_mask'] = self.seq_array['nt'] != b'-'
|
|
203
|
-
self._build_index_map()
|
|
204
|
-
|
|
205
|
-
# restore orientation
|
|
206
|
-
if self.rev:
|
|
207
|
-
self.reverse_complement()
|
|
208
200
|
return self
|
|
209
201
|
|
|
210
|
-
def _insert_rows(self, idx: int, rows: List[tuple]):
|
|
211
|
-
"""Helper to insert new rows efficiently and resort."""
|
|
212
|
-
arr = self.seq_array.tolist()
|
|
213
|
-
arr[idx:idx] = rows
|
|
214
|
-
new = np.array(arr, dtype=self.seq_array.dtype)
|
|
215
|
-
new.sort(order='index')
|
|
216
|
-
self.seq_array = new
|
|
217
|
-
|
|
218
202
|
def complement(self) -> SeqMat:
|
|
219
|
-
comp = {b'A':b'T', b'T':b'A', b'C':b'G', b'G':b'C', b'-':b'-'}
|
|
203
|
+
comp = {b'A': b'T', b'T': b'A', b'C': b'G', b'G': b'C', b'-': b'-'}
|
|
220
204
|
nts = np.array([comp[x] for x in self.seq_array['nt']], dtype='S1')
|
|
221
205
|
new = self.clone()
|
|
222
206
|
new.seq_array['nt'] = nts
|
|
@@ -229,15 +213,15 @@ class SeqMat:
|
|
|
229
213
|
return new
|
|
230
214
|
|
|
231
215
|
def __getitem__(self, key: Union[int, slice]) -> np.ndarray:
|
|
232
|
-
|
|
216
|
+
coords = self.seq_array['index']
|
|
233
217
|
if isinstance(key, int):
|
|
234
|
-
idx =
|
|
235
|
-
if idx
|
|
218
|
+
idx = np.searchsorted(coords, key)
|
|
219
|
+
if idx >= len(coords) or coords[idx] != key:
|
|
236
220
|
raise KeyError(f"Position {key} not found.")
|
|
237
221
|
return self.seq_array[idx]
|
|
238
222
|
if isinstance(key, slice):
|
|
239
|
-
start = key.start or
|
|
240
|
-
stop = key.stop or
|
|
241
|
-
mask = (
|
|
223
|
+
start = key.start or coords.min()
|
|
224
|
+
stop = key.stop or coords.max()
|
|
225
|
+
mask = (coords >= start) & (coords <= stop)
|
|
242
226
|
return self.seq_array[mask]
|
|
243
227
|
raise TypeError("Invalid index type.")
|
|
@@ -37,7 +37,7 @@ geney/translation_initiation/tis_utils.py,sha256=AF3siFjuQH-Rs44EV-80zHdbxRMvN4w
|
|
|
37
37
|
geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
|
|
38
38
|
geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
|
|
39
39
|
geney/utils/Fasta_segment.py,sha256=weB5NJ65P0XiyAJCiCHx4T9sHC1pWLpuQeOy0B85gyg,11364
|
|
40
|
-
geney/utils/SeqMats.py,sha256=
|
|
40
|
+
geney/utils/SeqMats.py,sha256=q858gWPsSoS4HUr6FD1CHYuUh5AE5u9KePHYT7FQw7g,8777
|
|
41
41
|
geney/utils/SeqMatsOld.py,sha256=syRU5DAuTh3xUfGW_qP9wlcBO5pHsG_y5PlrfXTIxUY,18502
|
|
42
42
|
geney/utils/TranscriptLibrary.py,sha256=ma_ZVPgglxXDDneEvdqxxeqxG8eSFL-zgLUXyC6BqY8,2070
|
|
43
43
|
geney/utils/__init__.py,sha256=-nJ-DMx1JzP-ZCe_QuQCeM0ZYIT_16jxoXDhUaO_4Oc,714
|
|
@@ -46,7 +46,7 @@ geney/utils/pangolin_utils.py,sha256=JQSPbWxdzqGFYfWQktkfLMaMSGR28eGQhNzO7MLMe5M
|
|
|
46
46
|
geney/utils/spliceai_utils.py,sha256=VtrIbjyQxk_3lw86eWjftRYyal9OzxArJ0GV5u_ymTg,2721
|
|
47
47
|
geney/utils/splicing_utils.py,sha256=vPCGnCPR1ooEZEHR79yFHLmRQXEJHXEQjjxpBR-YWOs,20635
|
|
48
48
|
geney/utils/utils.py,sha256=m51Vd0cEbrcIHo6_8BAuI9YSPcKRs22e5LfVd2Qj6Is,2181
|
|
49
|
-
geney-1.4.
|
|
50
|
-
geney-1.4.
|
|
51
|
-
geney-1.4.
|
|
52
|
-
geney-1.4.
|
|
49
|
+
geney-1.4.19.dist-info/METADATA,sha256=BIX1e0_0ypaE0e5prBztdgdzp0S1UutUqJufy-qmpZs,990
|
|
50
|
+
geney-1.4.19.dist-info/WHEEL,sha256=AHX6tWk3qWuce7vKLrj7lnulVHEdWoltgauo8bgCXgU,109
|
|
51
|
+
geney-1.4.19.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
|
|
52
|
+
geney-1.4.19.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|