geney 1.4.21__py2.py3-none-any.whl → 1.4.23__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of geney has been flagged as possibly problematic.
geney/utils/SeqMats.py
CHANGED
@@ -89,6 +89,25 @@ class SeqMat:
 
         self.insertion_counters = defaultdict(int)
 
+
+    def __len__(self) -> int:
+        return int(self.seq_array["valid_mask"].sum())
+
+    def __repr__(self):
+        return f"<SeqMat: {self.seq}>"
+
+    def __str__(self):
+        return self.seq
+
+    def get_metadata(self) -> dict:
+        """Retrieve all metadata as a dictionary."""
+        return {
+            "name": self.name,
+            "source": self.source,
+            "version": self.version,
+            "notes": self.notes
+        }
+
     @property
     def seq(self) -> str:
         return self.seq_array['nt'][self.seq_array['valid_mask']].tobytes().decode()
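For orientation, a minimal usage sketch of the new dunder methods and get_metadata(). It assumes `sm` is a SeqMat instance constructed elsewhere (the constructor is not part of this diff), and the printed values are illustrative only:

# Hedged sketch; `sm` is an already-built SeqMat, not shown in this diff.
n_valid = len(sm)            # count of valid (non-gap) positions via valid_mask
print(repr(sm))              # e.g. "<SeqMat: ACGT...>"
print(str(sm))               # the current nucleotide sequence
meta = sm.get_metadata()     # {"name": ..., "source": ..., "version": ..., "notes": ...}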
@@ -101,6 +120,26 @@ class SeqMat:
     def conservation(self) -> np.ndarray:
         return self.seq_array['cons'][self.seq_array['valid_mask']]
 
+    @property
+    def max_index(self) -> float:
+        return self.seq_array["index"].max()
+
+    @property
+    def min_index(self) -> float:
+        return self.seq_array["index"].min()
+
+    @property
+    def start(self) -> float:
+        return self.min_index
+
+    @property
+    def end(self) -> float:
+        return self.max_index
+
+    @property
+    def mutated_positions(self) -> np.ndarray:
+        return (self.seq_array["ref"] != self.seq_array["nt"])[self.seq_array["valid_mask"]].astype(int)
+
     def clone(self, start: Optional[float] = None, end: Optional[float] = None) -> SeqMat:
         new = SeqMat.__new__(SeqMat)
         new.name = self.name
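A brief sketch of the new coordinate properties, again assuming an existing SeqMat instance `sm`; `start` and `end` are simply aliases for `min_index` and `max_index`, and `mutated_positions` flags valid positions where `nt` differs from `ref`:

# Hedged sketch; `sm` is assumed to exist.
span = (sm.start, sm.end)                  # same as (sm.min_index, sm.max_index)
n_mutated = sm.mutated_positions.sum()     # number of currently mutated, valid positions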
@@ -120,37 +159,66 @@ class SeqMat:
         new.seq_array['valid_mask'] = new.seq_array['nt'] != b'-'
         return new
 
+    # def apply_mutations(
+    #         self,
+    #         mutations: Union[Tuple[float, str, str], List[Tuple[float, str, str]]],
+    #         only_snps: bool = False
+    # ) -> SeqMat:
+
     def apply_mutations(
             self,
-            mutations: Union[Tuple[float, str, str], List[Tuple[float, str, str]]],
+            mutations: Union[Tuple[float, str, str], List[Tuple[float, str, str]]] = None,
+            *,
+            pos: Optional[float] = None,
+            ref: Optional[str] = None,
+            alt: Optional[str] = None,
             only_snps: bool = False
-    ) -> SeqMat:
+    ) -> SeqMat:
         """
         Apply one or a batch of mutations (pos, ref, alt) efficiently:
         - Supports a single tuple or a list of tuples
         - Assumes mutations sorted by position for vectorized searchsorted
         """
         # Normalize to list
-        if isinstance(mutations, tuple) and len(mutations) == 3:
+        # if isinstance(mutations, tuple) and len(mutations) == 3:
+        #     mutations = [mutations]
+        # elif not isinstance(mutations, list):
+        #     raise TypeError("mutations must be a tuple or list of tuples")
+        # Input normalization
+        if mutations is None:
+            if pos is None or ref is None or alt is None:
+                raise ValueError("Either `mutations` or `pos, ref, alt` must be provided")
+            mutations = [(pos, ref, alt)]
+        elif isinstance(mutations, tuple) and len(mutations) == 3:
             mutations = [mutations]
         elif not isinstance(mutations, list):
-            raise TypeError("mutations must be a tuple or list of tuples")
-
-        # Left-normalize and bucket
+            raise TypeError("`mutations` must be a tuple or list of tuples")
+
+        # # Left-normalize and bucket
+        # subs, ins, dels = [], [], []
+        # for pos, ref, alt in mutations:
+        #     while ref and alt and ref[0] == alt[0]:
+        #         pos += 1
+        #         ref = ref[1:] or '-'
+        #         alt = alt[1:] or '-'
+        #     if ref != '-' and alt != '-':
+        #         subs.append((pos, ref, alt))
+        #     elif ref == '-' and alt != '-' and not only_snps:
+        #         ins.append((pos, alt))
+        #     elif alt == '-' and ref != '-' and not only_snps:
+        #         dels.append((pos, ref))
+        #     else:
+        #         raise ValueError(f"Unsupported mutation {pos}:{ref}:{alt}.")
+        # Bucket mutations
         subs, ins, dels = [], [], []
-        for pos, ref, alt in mutations:
-            while ref and alt and ref[0] == alt[0]:
-                pos += 1
-                ref = ref[1:] or '-'
-                alt = alt[1:] or '-'
-            if ref != '-' and alt != '-':
-                subs.append((pos, ref, alt))
-            elif ref == '-' and alt != '-' and not only_snps:
-                ins.append((pos, alt))
-            elif alt == '-' and ref != '-' and not only_snps:
-                dels.append((pos, ref))
-            else:
-                raise ValueError(f"Unsupported mutation {pos}:{ref}:{alt}.")
+        for p, r, a in mutations:
+            # left-normalize
+            while r and a and r[0] == a[0]:
+                p += 1; r = r[1:] or '-'; a = a[1:] or '-'
+            if r != '-' and a != '-': subs.append((p, r, a))
+            elif r == '-' and a != '-' and not only_snps: ins.append((p, a))
+            elif a == '-' and r != '-' and not only_snps: dels.append((p, r))
+            else: raise ValueError(f"Unsupported mutation {p}:{r}:{a}")
 
         # Ensure seq_array indices sorted
         coords = self.seq_array['index']
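The refactored apply_mutations keeps the original tuple/list interface and adds a keyword-only form for a single variant. A hedged sketch, with `sm` and all coordinates purely illustrative:

# Hedged sketch; `sm` and the positions below are hypothetical.
snv = sm.apply_mutations(pos=12345, ref='A', alt='G')                 # new keyword-only form
batch = sm.apply_mutations([(12345, 'A', 'G'), (12400, 'AC', 'A')])   # original tuple/list form
# Passing neither `mutations` nor all of pos/ref/alt raises ValueError.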
@@ -225,3 +293,137 @@ class SeqMat:
             mask = (coords >= start) & (coords <= stop)
             return self.seq_array[mask]
         raise TypeError("Invalid index type.")
+
+    def cut_out(self, introns: List[Tuple[int, int]]) -> "SeqMat":
+        """
+        Splices out regions from the sequence corresponding to the given intron boundaries.
+
+        Handles reverse-complemented sequences by interpreting introns in reverse as well.
+
+        Args:
+            introns (List[Tuple[int, int]]): List of (start, end) intron boundaries.
+                These are always genomic (absolute) coordinates,
+                regardless of strand direction.
+
+        Returns:
+            SeqMat: A new instance with the intron regions removed.
+        """
+        # In reverse orientation, flip intron direction for comparison
+        if self.rev:
+            introns = [(end, start) if start > end else (start, end) for (start, end) in introns]
+
+        mask = np.ones(len(self.seq_array), dtype=bool)
+
+        for start, end in introns:
+            lo, hi = min(start, end) + 1, max(start, end) - 1
+            mask &= ~((self.seq_array["index"] >= lo) & (self.seq_array["index"] <= hi))
+
+        new_instance = self.clone()
+        new_instance.seq_array = self.seq_array[mask].copy()
+        return new_instance
+
+    def open_reading_frame(self, tis: int) -> "SeqMat":
+        """
+        Extracts the open reading frame starting from the translation initiation site (TIS)
+        until the first in-frame stop codon.
+
+        Args:
+            tis (int): Genomic position of the translation initiation site (start codon).
+
+        Returns:
+            SeqMat: A new SeqMat instance containing the ORF (from TIS to stop codon inclusive).
+        """
+        if tis not in self.seq_array["index"]:
+            print(f"Warning: TIS position {tis} not found, returning default.")
+            return self.clone(start=0, end=3)
+
+        # Extract nucleotide sequence and indices starting from TIS
+        mask = self.seq_array["index"] >= tis if not self.rev else self.seq_array["index"] <= tis
+        coding_part = self.seq_array[mask]
+        coding_seq = coding_part["nt"].tobytes().decode()
+
+        # Read codons in-frame
+        for i in range(0, len(coding_seq) - 2, 3):
+            codon = coding_seq[i:i + 3]
+            if codon in {"TAA", "TAG", "TGA"}:
+                # Determine index range for this ORF
+                start = coding_part["index"][0]
+                stop = coding_part["index"][i + 2]
+                lo, hi = sorted((start, stop))
+                return self.clone(start=lo, end=hi)
+
+        raise ValueError("No in-frame stop codon found after the TIS.")
+
+    def predict_splicing(self, position: int, engine='spliceai', context=7500, inplace=False):  #, reference_donors=None, reference_acceptors=None) -> pd.DataFrame:
+        """
+        Predict splicing probabilities at a given position using the specified engine.
+
+        Args:
+            position (int): The genomic position to predict splicing probabilities for.
+            engine (str): The prediction engine to use. Supported: 'spliceai', 'pangolin'.
+            context (int): The length of the target central region (default: 7500).
+            format (str): Output format for the splicing engine results.
+
+        Returns:
+            pd.DataFrame: A DataFrame containing:
+                - position: The genomic position
+                - donor_prob: Probability of being a donor splice site
+                - acceptor_prob: Probability of being an acceptor splice site
+                - nucleotides: The nucleotide sequence at that position
+
+        Raises:
+            ValueError: If an unsupported engine is provided.
+            IndexError: If the position is not found in the sequence.
+        """
+        # Retrieve extended context (includes flanks) around the position.
+        # seq, indices = self.get_context(position, context=context, padding='N')
+        target = self.clone(position - context, position + context)
+        # print(len(target.seq))
+        seq, indices = target.seq, target.index
+        # print(len(seq))
+        # rel_pos = np.where(indices == position)[0][0]
+        # print(rel_pos)
+        rel_pos = np.abs(indices - position).argmin()
+        # print(rel_pos, len(seq))
+        left_missing, right_missing = max(0, context - rel_pos), max(0, context - (len(seq) - rel_pos))
+        # print(left_missing, right_missing)
+        if left_missing > 0 or right_missing > 0:
+            step = -1 if self.rev else 1
+
+            if left_missing > 0:
+                left_pad = np.arange(indices[0] - step * left_missing, indices[0], step)
+            else:
+                left_pad = np.array([], dtype=indices.dtype)
+
+            if right_missing > 0:
+                right_pad = np.arange(indices[-1] + step, indices[-1] + step * (right_missing + 1), step)
+            else:
+                right_pad = np.array([], dtype=indices.dtype)
+
+            seq = 'N' * left_missing + seq + 'N' * right_missing
+            indices = np.concatenate([left_pad, indices, right_pad])
+
+        # Run the splicing prediction engine (function assumed to be defined externally)
+        from .splicing_utils import run_splicing_engine
+        donor_probs, acceptor_probs = run_splicing_engine(seq, engine)
+        # Trim off the fixed flanks before returning results.
+        seq = seq[5000:-5000]
+        indices = indices[5000:-5000]
+        df = pd.DataFrame({
+            'position': indices,
+            'donor_prob': donor_probs,
+            'acceptor_prob': acceptor_probs,
+            'nucleotides': list(seq)
+        }).set_index('position').round(3)
+        # if reference_donors is not None:
+        #     df['ref_donor'] = df.index.isin(reference_donors).astype(int)
+        # if reference_acceptors is not None:
+        #     df['ref_acceptor'] = df.index.isin(reference_acceptors).astype(int)
+
+        df.attrs['name'] = self.name
+        if inplace:
+            self.predicted_splicing = df
+            return self
+        else:
+            return df
+
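To illustrate how the three new methods compose, a minimal sketch: `sm`, the intron boundaries, the TIS coordinate, and the query position are all hypothetical, and predict_splicing additionally requires the run_splicing_engine backend from geney.utils.splicing_utils (and its model dependencies) to be importable:

# Hedged sketch; `sm` and all coordinates are hypothetical.
mature = sm.cut_out([(1000, 1200), (1500, 1800)])   # remove positions strictly inside each boundary pair
orf = mature.open_reading_frame(850)                # clone from the TIS to the first in-frame stop codon
probs = sm.predict_splicing(1500, engine='spliceai')  # DataFrame of donor/acceptor probabilities per position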
geney-1.4.21.dist-info/RECORD → geney-1.4.23.dist-info/RECORD
CHANGED
@@ -37,7 +37,7 @@ geney/translation_initiation/tis_utils.py,sha256=AF3siFjuQH-Rs44EV-80zHdbxRMvN4w
 geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
 geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
 geney/utils/Fasta_segment.py,sha256=weB5NJ65P0XiyAJCiCHx4T9sHC1pWLpuQeOy0B85gyg,11364
-geney/utils/SeqMats.py,sha256=
+geney/utils/SeqMats.py,sha256=4NJMXNDw6XQAaKVpNuEIft1Xa1sSxPSFwnIeMzjI3eE,17058
 geney/utils/SeqMatsOld.py,sha256=syRU5DAuTh3xUfGW_qP9wlcBO5pHsG_y5PlrfXTIxUY,18502
 geney/utils/TranscriptLibrary.py,sha256=ma_ZVPgglxXDDneEvdqxxeqxG8eSFL-zgLUXyC6BqY8,2070
 geney/utils/__init__.py,sha256=-nJ-DMx1JzP-ZCe_QuQCeM0ZYIT_16jxoXDhUaO_4Oc,714
@@ -46,7 +46,7 @@ geney/utils/pangolin_utils.py,sha256=JQSPbWxdzqGFYfWQktkfLMaMSGR28eGQhNzO7MLMe5M
 geney/utils/spliceai_utils.py,sha256=VtrIbjyQxk_3lw86eWjftRYyal9OzxArJ0GV5u_ymTg,2721
 geney/utils/splicing_utils.py,sha256=vPCGnCPR1ooEZEHR79yFHLmRQXEJHXEQjjxpBR-YWOs,20635
 geney/utils/utils.py,sha256=m51Vd0cEbrcIHo6_8BAuI9YSPcKRs22e5LfVd2Qj6Is,2181
-geney-1.4.
-geney-1.4.
-geney-1.4.
-geney-1.4.
+geney-1.4.23.dist-info/METADATA,sha256=cRWawSdfae-X2F-k7AFedHrQd_OhD2zBLjkA2zzNLrs,990
+geney-1.4.23.dist-info/WHEEL,sha256=AHX6tWk3qWuce7vKLrj7lnulVHEdWoltgauo8bgCXgU,109
+geney-1.4.23.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
+geney-1.4.23.dist-info/RECORD,,