geney 1.2.69__py2.py3-none-any.whl → 1.3.2__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
geney/Gene.py ADDED
@@ -0,0 +1,177 @@
1
+ import copy
2
+ from typing import Any, Dict, List, Tuple, Optional, Iterator, Union, TYPE_CHECKING
3
+ from collections import Counter
4
+ from . import unload_pickle, config
5
+ from .Transcript import Transcript
6
+
7
class Gene:
    """
    A gene together with its transcript annotations and basic metadata.

    Attributes:
        gene_name (str): The name of the gene.
        gene_id (str): The unique identifier for the gene.
        rev: Strand flag, stored exactly as supplied (presumably True for the
            reverse strand -- confirm against the gene files).
        chrm (str): The chromosome on which the gene resides.
        organism (str): The organism build (e.g. 'hg38').
        transcripts (dict): Transcript annotation dicts keyed by transcript ID.
    """

    def __init__(self, gene_name, gene_id, rev, chrm, transcripts, organism='hg38'):
        """
        Build a Gene from already-loaded values (use `from_file` to load from disk).

        Args:
            gene_name (str): Name of the gene.
            gene_id (str): Identifier of the gene.
            rev: Strand flag, stored as given.
            chrm (str): Chromosome name.
            transcripts (dict or None): Transcript annotations keyed by
                transcript ID; None is replaced by an empty dict.
            organism (str): Organism reference build (default 'hg38').
        """
        self.gene_name = gene_name
        self.gene_id = gene_id
        self.rev = rev
        self.chrm = chrm
        self.organism = organism
        self.transcripts = transcripts if transcripts is not None else {}

    def __repr__(self) -> str:
        """
        Official string representation of the Gene object.
        """
        return f"Gene({self.gene_name})"

    def __str__(self) -> str:
        """
        Unofficial, user-friendly string representation of the Gene object.
        """
        return f"Gene: {self.gene_name}, ID: {self.gene_id}, Chr: {self.chrm}, Transcripts: {len(self.transcripts)}"

    def __len__(self) -> int:
        """
        Returns the number of transcripts associated with this gene.

        Returns:
            int: The count of transcripts.
        """
        return len(self.transcripts)

    def __copy__(self):
        """
        Returns a shallow copy of the Gene object.

        The copy shares the attribute values (including the `transcripts`
        dict) with the original.
        """
        # Calling copy.copy(self) here would dispatch back to this very
        # method and recurse forever, so the clone is built by hand.
        cls = self.__class__
        clone = cls.__new__(cls)
        clone.__dict__.update(self.__dict__)
        return clone

    def __deepcopy__(self, memo):
        """
        Returns a deep copy of the Gene object.

        Args:
            memo (dict): The memo dictionary supplied by `copy.deepcopy`.
        """
        # Same recursion hazard as __copy__: copy.deepcopy(self, memo) would
        # call this method again. Register the clone in `memo` first so that
        # reference cycles resolve to it.
        cls = self.__class__
        clone = cls.__new__(cls)
        memo[id(self)] = clone
        for attr, value in self.__dict__.items():
            setattr(clone, attr, copy.deepcopy(value, memo))
        return clone

    def __iter__(self):
        """
        Allow iteration over the gene's transcripts, yielding Transcript objects.

        A new Transcript is constructed from each annotation dict on every
        iteration.
        """
        for annotations in self.transcripts.values():
            yield Transcript(annotations, organism=self.organism)

    @classmethod
    def from_file(cls, gene_name, organism='hg38'):
        """
        Load a gene from the pickled gene files of the configured organism.

        Files are searched in ``config[organism]['MRNA_PATH'] / 'protein_coding'``
        using the pattern ``*_{gene_name}.pkl``; the first match returned by
        the glob is loaded.

        Args:
            gene_name (str): Name of the gene to look up.
            organism (str): Organism reference build (default 'hg38').

        Returns:
            Gene: A Gene populated from the loaded data.

        Raises:
            FileNotFoundError: If no file matches the gene name.
        """
        # Find gene data files in the configured organism MRNA path
        gene_files = list((config[organism]['MRNA_PATH'] / 'protein_coding').glob(f'*_{gene_name}.pkl'))
        if not gene_files:
            raise FileNotFoundError(f"No files available for gene '{gene_name}'.")

        # Load gene data from the first matching file.
        # NOTE(review): `unload_pickle` presumably unpickles the file; pickle
        # data must come from a trusted source -- confirm the files are not
        # user-supplied.
        data = unload_pickle(gene_files[0])

        return cls(
            gene_name=data.get('gene_name'),
            gene_id=data.get('gene_id'),
            rev=data.get('rev'),
            chrm=data.get('chrm'),
            transcripts=data.get('transcripts', {}),
            organism=organism
        )

    def splice_sites(self) -> Tuple['Counter', 'Counter']:
        """
        Aggregates splice sites (acceptors and donors) from all transcripts.

        Returns:
            tuple(Counter, Counter): A tuple of two Counters for acceptors and donors.
        """
        acceptors: List[Any] = []
        donors: List[Any] = []

        # Collect acceptor and donor sites from each transcript
        for annotations in self.transcripts.values():
            acceptors.extend(annotations.get('acceptors', []))
            donors.extend(annotations.get('donors', []))

        return Counter(acceptors), Counter(donors)

    def transcript(self, tid: Optional[str] = None):
        """
        Retrieve a Transcript object by ID, or the primary transcript if no ID is given.

        Args:
            tid (str, optional): Transcript ID. If None, returns primary transcript.

        Returns:
            Transcript: The Transcript object with the given ID or the primary transcript.

        Raises:
            AttributeError: If the requested transcript does not exist (this
                includes the case where no ID is given and the gene has no
                primary transcript).
        """
        if tid is None:
            tid = self.primary_transcript

        if tid not in self.transcripts:
            raise AttributeError(f"Transcript '{tid}' not found in gene '{self.gene_name}'.")

        return Transcript(self.transcripts[tid], organism=self.organism)

    @property
    def primary_transcript(self) -> Optional[str]:
        """
        Returns the primary transcript ID for this gene.

        The first transcript flagged with a truthy 'primary_transcript' entry
        is chosen; otherwise the first transcript whose 'transcript_biotype'
        is 'protein_coding'; otherwise None. The result is cached on the
        instance after the first access.

        Returns:
            str or None: The primary transcript ID or None if not available.
        """
        # If already calculated, return it
        if hasattr(self, '_primary_transcript'):
            return self._primary_transcript

        flagged = [tid for tid, ann in self.transcripts.items() if ann.get('primary_transcript')]
        coding = [tid for tid, ann in self.transcripts.items()
                  if ann.get('transcript_biotype') == 'protein_coding']

        # Explicitly flagged transcripts win over the protein-coding fallback.
        if flagged:
            self._primary_transcript = flagged[0]
        elif coding:
            self._primary_transcript = coding[0]
        else:
            self._primary_transcript = None
        return self._primary_transcript
176
+
177
+
geney/SeqMats.py ADDED
@@ -0,0 +1,492 @@
1
+ import re
2
+ import numpy as np
3
+
4
# Default alphabet: maps each character to its complement. The position of a
# character in this dict is its numeric code in the sequence row.
ALPHABET = {'N': 'N', 'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', '-': '-'}


class SeqMat:
    """
    A sequence stored as an integer matrix with one column per position.

    Rows (see the ROW_* constants): encoded characters, indices, "superinds"
    and a mutation mark. `from_mutid` in this file gives inserted characters
    the index of their anchor position and superinds 1, 2, ...; the `indices`
    property folds superinds into the index as tenths.
    """
    ROW_SEQ = 0
    ROW_INDS = 1
    ROW_SUPERINDS = 2
    ROW_MUTATED = 3

    # Numeric code of the gap character '-' in the default alphabet.
    GAP_VALUE = 5

    # Lazily skips whole codons from the start until the first in-frame stop.
    _STOP_CODON_PATTERN = re.compile(r"(?:[NACGT]{3})*?(TAA|TAG|TGA)")

    # Codon-to-amino acid mapping table (standard genetic code)
    _CODON_TABLE = {
        'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
        'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
        'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
        'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
        'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
        'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
        'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
        'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
        'TAT': 'Y', 'TAC': 'Y', 'TAA': '*', 'TAG': '*',
        'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
        'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
        'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
        'TGT': 'C', 'TGC': 'C', 'TGA': '*', 'TGG': 'W',
        'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
        'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
        'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'
    }

    def __init__(self, seqmat, alphabet=None):
        """
        Wrap an existing sequence matrix.

        Args:
            seqmat (np.ndarray): Matrix with the rows described on the class.
                It is stored by reference, not copied.
            alphabet (dict, optional): Character -> complement mapping; the
                key order defines the numeric codes. Defaults to ALPHABET.
        """
        self.seqmat = seqmat
        self.alphabet = alphabet or dict(ALPHABET)

        self.char_to_value = {c: i for i, c in enumerate(self.alphabet.keys())}
        self.value_to_char = {i: c for i, c in enumerate(self.alphabet.keys())}
        self.value_complements = {self.char_to_value[c1]: self.char_to_value[c2] for c1, c2 in self.alphabet.items()}

    def __repr__(self):
        return f"<SeqMat: {self.seq}>"

    def __str__(self):
        return self.seq

    def __len__(self):
        """Number of columns in the matrix (gap columns included)."""
        return self.seqmat.shape[1]

    def __getitem__(self, key):
        """
        Select by index value (row ROW_INDS), not by column offset.

        A slice is inclusive of its stop and both of its bounds must be
        indices present in the sequence; a single key returns a one-column
        SeqMat.

        Raises:
            IndexError: If a requested index is not present.
        """
        if isinstance(key, slice):
            pos1, pos2 = self._rel_index(key.start), self._rel_index(key.stop)
            return SeqMat(self.seqmat[:, pos1:pos2 + 1], alphabet=self.alphabet)
        pos = self._rel_index(key)
        return SeqMat(self.seqmat[:, pos:pos + 1], alphabet=self.alphabet)

    def __contains__(self, other):
        """
        Checks if another SeqMat object is entirely contained within this SeqMat object.

        Containment is decided on the index row only: every index of `other`
        must occur among the indices of `self`.

        Args:
            other (SeqMat): Another SeqMat object to check for containment.

        Returns:
            bool: True if `other` is contained in `self`, False otherwise.

        Raises:
            TypeError: If `other` is not a SeqMat.
        """
        if not isinstance(other, SeqMat):
            raise TypeError("Can only check containment with another SeqMat object.")

        other_indices = other.seqmat[other.ROW_INDS, :]
        self_indices = self.seqmat[self.ROW_INDS, :]
        return bool(np.all(np.isin(other_indices, self_indices)))

    def __eq__(self, other):
        """
        Implements the == operator to compare two SeqMat objects.

        Args:
            other (SeqMat): The other SeqMat object to compare.

        Returns:
            bool: True if `other` is a SeqMat with an equal matrix, False otherwise.
        """
        if not isinstance(other, SeqMat):
            return False
        return bool(np.array_equal(self.seqmat, other.seqmat))

    @classmethod
    def empty(cls, alphabet=None):
        """
        Creates an empty SeqMat object.

        Args:
            alphabet (dict): Optional alphabet dictionary (defaults to ALPHABET).

        Returns:
            SeqMat: An empty SeqMat object (4 rows, 0 columns).
        """
        empty_seqmat = np.zeros((4, 0), dtype=np.int32)  # 4 rows, 0 columns (no data)
        return cls(empty_seqmat, alphabet=alphabet)

    def __add__(self, other):
        """
        Implements the + operator. Joins two SeqMat objects or applies mutations.

        If every index of `other` already occurs in `self`, `other` is applied
        as a mutation. Otherwise the two matrices are concatenated, which
        requires the combined indices to be monotonic.

        Args:
            other (SeqMat): Another SeqMat object to join or mutate.

        Returns:
            SeqMat: A new SeqMat object with the resulting sequence.

        Raises:
            TypeError: If `other` is not a SeqMat.
            ValueError: If the concatenated indices are not monotonic.
        """
        if not isinstance(other, SeqMat):
            raise TypeError("Can only add another SeqMat object.")

        if other in self:
            return self.mutate(other)

        combined_seqmat = np.hstack((self.seqmat, other.seqmat))
        if not self._is_monotonic(combined_seqmat[self.ROW_INDS]):
            raise ValueError("Resulting sequence indices are not monotonic.")

        return SeqMat(combined_seqmat, alphabet=self.alphabet)

    def __iadd__(self, other):
        """
        Implements the += operator. Joins two SeqMat objects or applies mutations in place.

        Args:
            other (SeqMat): Another SeqMat object to join or mutate.

        Returns:
            SeqMat: The mutated or joined SeqMat object.

        Raises:
            TypeError: If `other` is not a SeqMat.
            ValueError: If the concatenated indices are not monotonic; in that
                case `self` is left unchanged.
        """
        if not isinstance(other, SeqMat):
            raise TypeError("Can only add another SeqMat object.")

        if other in self:
            self.seqmat = self.mutate(other).seqmat
            return self

        # Validate before assigning so a failed += does not corrupt self.
        combined_seqmat = np.hstack((self.seqmat, other.seqmat))
        if not self._is_monotonic(combined_seqmat[self.ROW_INDS]):
            raise ValueError("Resulting sequence indices are not monotonic.")

        self.seqmat = combined_seqmat
        return self

    def get_context(self, pos, context=500, padding=None):
        """
        Returns a SeqMat object representing the region around `pos` with the given context.
        If padding is provided and the requested context extends beyond the sequence boundaries,
        the result is padded with the specified nucleotide in the sequence row and -1 in the other rows.

        Args:
            pos (int): The position of interest in the original coordinate space.
            context (int): The number of nucleotides to include on each side of pos (default 500).
            padding (str or None): The nucleotide to use for padding. If None, no padding is applied and
                the returned region may be shorter than requested. A character
                missing from the alphabet falls back to 'N'.

        Returns:
            SeqMat: A new SeqMat object containing the context region (padded if requested).

        Raises:
            IndexError: If `pos` is not present in the sequence.
        """
        pos = self._rel_index(pos)

        desired_length = 2 * context + 1
        start = pos - context
        end = pos + context + 1

        # Clip the requested window to the columns that actually exist.
        window = self.seqmat[:, max(start, 0):min(len(self), end)]
        extracted_length = window.shape[1]

        if padding is None or extracted_length == desired_length:
            return SeqMat(window, alphabet=self.alphabet)

        # Number of padding columns needed before the first real column; the
        # remainder of the padding ends up on the right.
        pad_left = max(-start, 0)
        pad_value = self.char_to_value.get(padding, self.char_to_value['N'])

        padded = np.full((self.seqmat.shape[0], desired_length), -1, dtype=self.seqmat.dtype)
        padded[self.ROW_SEQ, :] = pad_value
        padded[:, pad_left:pad_left + extracted_length] = window
        return SeqMat(padded, alphabet=self.alphabet)

    def _rel_index(self, pos):
        """
        Translate an index value into the column offset of its first occurrence.

        Raises:
            IndexError: If `pos` is not found.
        """
        if pos in self.indices:
            return np.where(self.seqmat[self.ROW_INDS, :] == pos)[0][0]
        raise IndexError(f"Position {pos} not found in sequence.")

    def _is_same_strand(self, other):
        """
        Checks if two SeqMat objects are on the same strand.

        "Same strand" here means that the index rows of both objects are
        non-decreasing, or that both are non-increasing.

        Args:
            other (SeqMat): The other SeqMat object to compare.

        Returns:
            bool: True if both are on the same strand, False otherwise.
        """
        self_steps = np.diff(self.seqmat[self.ROW_INDS, :])
        other_steps = np.diff(other.seqmat[self.ROW_INDS, :])

        self_increasing = np.all(self_steps >= 0)
        self_decreasing = np.all(self_steps <= 0)
        other_increasing = np.all(other_steps >= 0)
        other_decreasing = np.all(other_steps <= 0)

        return (self_increasing and other_increasing) or (self_decreasing and other_decreasing)

    def reverse_complement(self, inplace=True):
        """
        Reverse the column order and complement the sequence row.

        Args:
            inplace (bool): If True (default) update and return `self`;
                otherwise return a new SeqMat and leave `self` untouched.

        Returns:
            SeqMat: The reverse-complemented object.
        """
        seqmat = self.seqmat[:, ::-1].copy()
        # Lookup table indexed by numeric code; unlike np.vectorize this also
        # works for a zero-length sequence.
        complement_lut = np.array([self.value_complements[i] for i in range(len(self.value_complements))])
        seqmat[self.ROW_SEQ, :] = complement_lut[seqmat[self.ROW_SEQ, :].astype(np.intp)]

        if inplace:
            self.seqmat = seqmat
            return self

        return SeqMat(seqmat, alphabet=self.alphabet)

    @classmethod
    def from_seq(cls, seq_dict, alphabet=None):
        """
        Create a SeqMat object from a dictionary containing sequence information.

        Args:
            seq_dict (dict): Must contain 'seq' (string). Optional 'indices'
                (defaults to 0..len-1) and 'superinds' (defaults to zeros).
            alphabet (dict, optional): Alphabet used to encode the characters
                and passed on to the new object. Defaults to ALPHABET.

        Returns:
            An instance of `cls` with the mutation-mark row set to zero.

        Raises:
            AssertionError: If 'seq' and 'indices' differ in length.
            ValueError: If the indices are not monotonic.
            KeyError: If 'seq' contains a character missing from the alphabet.
        """
        seq = list(seq_dict["seq"])
        inds = seq_dict.get("indices", np.arange(len(seq), dtype=np.int32))
        superinds = seq_dict.get("superinds", np.zeros(len(seq), dtype=np.int32))
        mutmark = np.zeros_like(superinds)

        # Raised explicitly (rather than via `assert`) so the check survives
        # `python -O`; the exception type is kept for existing callers.
        if len(seq) != len(inds):
            raise AssertionError(f"Sequence length {len(seq)} must match indices length {len(inds)}")
        if not cls._is_monotonic(inds):
            raise ValueError(f"Sequence indices must be monotonic, got {inds}")

        # Create character-to-value mapping
        char_to_value = {c: i for i, c in enumerate((alphabet or ALPHABET).keys())}
        seq_values = [char_to_value[nt] for nt in seq]

        # Stack sequence matrix
        seqmat = np.vstack([seq_values, inds, superinds, mutmark]).astype(np.int32)
        return cls(seqmat, alphabet=alphabet)

    @staticmethod
    def _is_monotonic(inds):
        """
        Return True if `inds` is non-decreasing or non-increasing.

        Empty and single-element inputs count as monotonic.
        """
        steps = np.diff(np.asarray(inds))
        return bool(np.all(steps >= 0) or np.all(steps <= 0))

    @property
    def seq(self):
        """The decoded sequence with gap characters ('-') removed."""
        return self.rawseq.replace('-', '')

    @property
    def rawseq(self):
        """The decoded sequence row, gap characters included."""
        return ''.join([self.value_to_char[int(ind)] for ind in self.seqmat[self.ROW_SEQ, :]])

    @property
    def indices(self):
        """
        Indices of the non-gap columns, with superinds added as tenths
        (index + superind / 10), returned as a float array.
        """
        keep = self.seqmat[self.ROW_SEQ, :] != self.GAP_VALUE
        return self.seqmat[self.ROW_INDS, keep] + (self.seqmat[self.ROW_SUPERINDS, keep] / 10)

    def mutate(self, mut, inplace=False):
        """
        Apply mutations to the sequence matrix.

        Columns of `mut` with superind 0 replace the columns of this sequence
        that carry the same index; columns with superind > 0 are inserted next
        to the column matching the first inserted index.

        Args:
            mut (SeqMat): A SeqMat object containing mutations.
            inplace (bool): If True, store the result on `self` and return
                `self`; otherwise return a new SeqMat.

        Returns:
            SeqMat: The mutated object. If some index of `mut` is absent from
            this sequence, `self` is returned unchanged.

        Raises:
            ValueError: If `mut` and this sequence are not on the same strand.
        """
        ### NEEDS some work to make sure that mutations can continue being added without issue...

        # Ensure strand compatibility
        if not self._is_same_strand(mut):
            raise ValueError("Mutation and sequence are not on the same strand.")

        ref_seqmat = self.seqmat.copy()
        mut_seqmat = mut.seqmat

        # A mutation that is not fully covered by the reference is ignored.
        if not np.all(np.isin(mut_seqmat[self.ROW_INDS, :], ref_seqmat[self.ROW_INDS, :])):
            return self

        # Handle replacements
        replacements = mut_seqmat[:, np.flatnonzero(mut_seqmat[self.ROW_SUPERINDS, :] == 0)]
        targets = np.flatnonzero(np.isin(ref_seqmat[self.ROW_INDS, :], replacements[self.ROW_INDS, :]))
        ref_seqmat[:, targets] = replacements[:, :]

        # Handle insertions
        insertions = np.flatnonzero(mut_seqmat[self.ROW_SUPERINDS, :] > 0)
        if insertions.size > 0:
            ins_seqmat = mut_seqmat[:, insertions]
            # On a descending index row the insert goes before the anchor
            # column instead of after it.
            correction = 1 if self.seqmat[self.ROW_INDS, 0] > self.seqmat[self.ROW_INDS, -1] else 0
            ins_loc = np.where(ref_seqmat[self.ROW_INDS, :] == ins_seqmat[self.ROW_INDS, 0])[0][0] + 1 - correction
            ref_seqmat = np.insert(ref_seqmat, ins_loc, ins_seqmat.T, axis=1)

        if inplace:
            self.seqmat = ref_seqmat
            return self

        return SeqMat(ref_seqmat, alphabet=self.alphabet)

    def orf_seqmat(self, tis_index):
        """
        Return the open reading frame that starts at index `tis_index`.

        Gap columns are dropped; the result runs through the first in-frame
        stop codon (TAA, TAG or TGA), or to the end of the sequence when no
        in-frame stop exists.

        Raises:
            IndexError: If `tis_index` is not present in the sequence.
        """
        downstream = self.seqmat[:, self._rel_index(tis_index):]
        downstream = downstream[:, downstream[self.ROW_SEQ, :] != self.GAP_VALUE]
        orf = SeqMat(downstream, alphabet=self.alphabet)
        raw_seq = orf.seq
        match = self._STOP_CODON_PATTERN.match(raw_seq)
        end_index = match.end() if match else len(raw_seq)
        return SeqMat(orf.seqmat[:, :end_index], alphabet=self.alphabet)

    def translate(self, tis_index):
        """
        Translates the open reading frame starting at `tis_index` into amino acids.
        Ensures the sequence length is divisible by 3 by trimming excess nucleotides.

        Args:
            tis_index: Index of the first nucleotide of the reading frame.

        Returns:
            str: Translated amino acid sequence; stop codons appear as '*'
            and codons missing from the table as 'X'.
        """
        sequence = self.orf_seqmat(tis_index).seq.upper()

        # Trim sequence to ensure divisibility by 3
        trimmed_length = len(sequence) - (len(sequence) % 3)
        sequence = sequence[:trimmed_length]

        # Translate sequence in chunks of 3
        return ''.join(self._CODON_TABLE.get(sequence[i:i + 3], 'X') for i in range(0, len(sequence), 3))

    def to_dict(self):
        """
        Export the sequence as a dict with 'seq', 'indices' and 'superinds'.

        NOTE(review): 'indices' comes from the `indices` property (gap columns
        removed, superinds folded in as tenths) while 'seq' and 'superinds'
        cover every column, so the lengths differ when gaps are present.
        """
        return {'seq': self.rawseq, 'indices': self.indices, 'superinds': self.seqmat[self.ROW_SUPERINDS, :]}
410
+
411
class DnaSeqMat(SeqMat):
    # Subclass of SeqMat that adds no attributes or methods of its own.
    # NOTE(review): presumably a marker type for DNA sequences -- confirm.
    pass
413
+
414
+
415
class RnaSeqMat(SeqMat):
    # Subclass of SeqMat that adds no attributes or methods of its own.
    # NOTE(review): presumably a marker type for RNA sequences -- confirm.
    pass
417
+
418
+
419
class AASeqMat(SeqMat):
    # Subclass of SeqMat that adds no attributes or methods of its own.
    # NOTE(review): presumably a marker type for amino-acid sequences; the
    # inherited default alphabet is nucleotide-based -- confirm intended use.
    pass
421
+
422
+
423
class MutSeqMat(SeqMat):
    """
    A subclass of SeqMat designed specifically for mutation sequences.

    On construction the indices are validated (neighbouring indices may
    differ by at most 1), the last row of the matrix is set to 1 and
    `position` is set to the smallest index.
    """

    def __init__(self, seqmat, alphabet=None):
        """
        Args:
            seqmat (np.ndarray): Mutation matrix; its last row is overwritten
                with 1 in place.
            alphabet (dict, optional): Forwarded to SeqMat.

        Raises:
            ValueError: If the indices are not consecutive, or (from `min`)
                if the matrix has no columns.
        """
        super().__init__(seqmat, alphabet)

        # Validate the mutation-specific conditions
        self._validate_mutation_indices()
        self.seqmat[-1, :] = 1
        self.position = min(self.seqmat[self.ROW_INDS, :])

    def _validate_mutation_indices(self):
        """
        Validates that the mutation indices are consecutive (increasing or decreasing).

        Repeated indices (step 0) are accepted as well.
        """
        indices = self.seqmat[self.ROW_INDS, :]
        if not (np.all(abs(np.diff(indices)) <= 1)):
            raise ValueError(f"Mutation indices must be consecutive. Got: {indices}")

    @classmethod
    def from_mutid(cls, mid):
        """
        Build a mutation from an ID of the form 'gene:chrom:pos:ref:alt'.

        A ref or alt made up only of '-' characters is treated as a single
        '-'. The resulting kinds are:
            * SNP (ref and alt one character each): alt at `pos`, superind 0.
            * Deletion (alt is '-'): one '-' per ref character at consecutive
              indices starting at `pos`.
            * Insertion (ref is '-'): alt characters all at index `pos` with
              superinds 1, 2, ...
            * Indel (anything else): '-' for every ref character, followed by
              the alt characters anchored on the last ref index.

        Args:
            mid (str): The mutation ID.

        Returns:
            An instance of `cls`.

        Raises:
            ValueError: If the ID does not have five ':'-separated fields, the
                position is not an integer, an allele is empty, or both
                alleles are gaps.
        """
        gene, chrom, i, r, a = mid.split(':')
        if not r or not a:
            raise ValueError(f"Mutation ID '{mid}' has an empty ref or alt allele.")

        # Collapse an all-gap allele to a single '-'.
        if len(a) > 1 and set(a) == {'-'}:
            a = '-'
        if len(r) > 1 and set(r) == {'-'}:
            r = '-'

        i = int(i)

        if a == '-' and r == '-':
            # Previously this fell through every branch and failed with an
            # UnboundLocalError.
            raise ValueError(f"Mutation ID '{mid}' has neither a ref nor an alt allele.")

        if a == '-':
            # Deletion
            temp = {'seq': '-' * len(r), 'indices': np.arange(i, i + len(r), dtype=np.int32), 'superinds': [0] * len(r)}
        elif r == '-':
            # Insertion
            temp = {'seq': a, 'indices': np.full(len(a), int(i)), 'superinds': np.arange(1, len(a) + 1)}
        elif len(a) == len(r) == 1:
            # Single-nucleotide substitution
            temp = {'seq': a, 'indices': [i], 'superinds': [0]}
        else:
            # Indel: delete the ref span, then insert alt after its last index.
            ind1 = np.concatenate(
                [np.arange(i, i + len(r), dtype=np.int32), np.full(len(a), len(r) + i - 1, dtype=np.int32)])
            ind2 = np.concatenate([np.zeros(len(r), dtype=np.int32), np.arange(1, len(a) + 1, dtype=np.int32)])
            temp = {'seq': '-' * len(r) + a, 'indices': ind1, 'superinds': ind2}

        return cls.from_seq(temp)
482
+
483
+
484
+ # def _validate_superinds(self):
485
+ # """
486
+ # Validates that the superinds row has a maximum value of 10.
487
+ # """
488
+ # superinds = self.seqmat[self.ROW_SUPERINDS, :]
489
+ # if np.max(superinds) > 10:
490
+ # raise ValueError(f"Superinds row must have a maximum value of 10. Got: {superinds}")
491
+
492
+
geney/Transcript.py ADDED
@@ -0,0 +1,379 @@
1
+ from __future__ import annotations
2
+ from typing import Any, Optional, Union
3
+ import numpy as np
4
+ import copy
5
+ from Bio.Seq import Seq # Assuming Biopython is used
6
+ from . import unload_pickle, config
7
+ from .SeqMats import SeqMat, MutSeqMat
8
+ from .Fasta_segment import Fasta_segment
9
+
10
+ class Transcript:
11
+ """
12
+ Represents a transcript with associated genomic information such as exons, introns, and sequences.
13
+
14
+ A Transcript object is expected to contain attributes loaded from a dictionary `d` representing
15
+ annotations and metadata. This includes (at least):
16
+ - transcript_start
17
+ - transcript_end
18
+ - rev (boolean indicating if the transcript is on the reverse strand)
19
+ - chrm (chromosome)
20
+ - donors
21
+ - acceptors
22
+ - cons_vector
23
+ - cons_seq
24
+ - transcript_seq
25
+ - transcript_biotype
26
+ - primary_transcript
27
+ - transcript_id
28
+ - TIS, TTS (if protein-coding)
29
+ """
30
+
31
def __init__(self, d: dict[str, Any], organism: str = 'hg38'):
    """
    Initialize a Transcript object from a dictionary of attributes and metadata.

    Every key of `d` becomes an attribute of the instance. 'acceptors',
    'donors' and 'cons_vector' are converted to NumPy arrays.

    Args:
        d (dict): Dictionary containing transcript attributes and data.
        organism (str): Genome build or organism reference (e.g., 'hg38').

    Raises:
        AssertionError: If required attributes are missing.
    """
    # Convert certain attributes to NumPy arrays for consistent processing
    array_fields = {'acceptors', 'donors', 'cons_vector'}
    coordinate_fields = {'acceptors', 'donors'}
    for k, v in d.items():
        if k in array_fields and v is not None:
            v = np.array(v)
            # An empty array defaults to float64, which would promote exon
            # coordinates to floats when concatenated with the transcript
            # boundaries; keep empty splice-site arrays integral.
            if k in coordinate_fields and v.size == 0:
                v = v.astype(int)
        setattr(self, k, v)

    self.organism: str = organism

    # Required attributes to form a valid transcript object
    required_attrs = ['transcript_start', 'transcript_end', 'rev', 'chrm']
    missing = [attr for attr in required_attrs if not hasattr(self, attr)]
    if missing:
        raise AssertionError(f"Transcript is missing required attributes: {missing}")

    # Default fallback values for optional attributes (integer dtype for the
    # same reason as above)
    if not hasattr(self, 'donors') or self.donors is None:
        self.donors = np.array([], dtype=int)
    if not hasattr(self, 'acceptors') or self.acceptors is None:
        self.acceptors = np.array([], dtype=int)
    if not hasattr(self, 'cons_available'):
        self.cons_available = False

    # A transcript counts as protein-coding when both TIS and TTS were supplied
    self.protein_coding: bool = hasattr(self, 'TIS') and hasattr(self, 'TTS')

    # Calculate transcript boundaries
    self.transcript_upper = max(self.transcript_start, self.transcript_end)
    self.transcript_lower = min(self.transcript_start, self.transcript_end)

    # Generate pre-mRNA sequence data
    self.generate_pre_mrna()

    # If consensus data is available and ends with '*', drop the trailing
    # element from both cons_vector and cons_seq
    if self.cons_available and hasattr(self, 'cons_seq') and hasattr(self, 'cons_vector'):
        if self.cons_seq.endswith('*') and len(self.cons_seq) == len(self.cons_vector):
            self.cons_vector = self.cons_vector[:-1]
            self.cons_seq = self.cons_seq[:-1]
80
+
81
+ def __repr__(self) -> str:
82
+ """Official string representation."""
83
+ return f"Transcript({getattr(self, 'transcript_id', 'unknown_id')})"
84
+
85
+ def __str__(self) -> str:
86
+ """
87
+ Unofficial, user-friendly string representation of the transcript.
88
+
89
+ Returns:
90
+ str: A summary of the transcript including ID, type, and primary status.
91
+ """
92
+ transcript_biotype = getattr(self, 'transcript_biotype', 'unknown').replace('_', ' ').title()
93
+ primary = getattr(self, 'primary_transcript', False)
94
+ return f"Transcript {getattr(self, 'transcript_id', 'unknown_id')}, " \
95
+ f"Type: {transcript_biotype}, Primary: {primary}"
96
+
97
+ def __len__(self) -> int:
98
+ """
99
+ Length of the transcript sequence.
100
+
101
+ Returns:
102
+ int: Length of the transcript sequence.
103
+ """
104
+ return len(getattr(self, 'transcript_seq', ''))
105
+
106
+ def __eq__(self, other: object) -> bool:
107
+ """
108
+ Check equality of two transcripts based on their transcript sequences.
109
+
110
+ Args:
111
+ other (object): Another transcript-like object.
112
+
113
+ Returns:
114
+ bool: True if sequences match, False otherwise.
115
+ """
116
+ if not isinstance(other, Transcript):
117
+ return NotImplemented
118
+ return self.transcript_seq == other.transcript_seq
119
+
120
+ def __contains__(self, subvalue: Any) -> bool:
121
+ """
122
+ Check if a given subsequence (e.g., another SeqMat) is contained within the pre_mRNA.
123
+
124
+ Args:
125
+ subvalue (Any): The substring (or sub-SeqMat) to search for in the mature mRNA.
126
+
127
+ Returns:
128
+ bool: True if subvalue's indices are all present in the pre_mRNA, False otherwise.
129
+
130
+ Notes:
131
+ This assumes `subvalue` has a `seqmat` attribute and that `subvalue.seqmat[1, :]` represents indices.
132
+ """
133
+ if not hasattr(subvalue, 'seqmat'):
134
+ return False
135
+ return np.all(np.isin(subvalue.seqmat[1, :], self.pre_mrna.seqmat[1, :]))
136
+
137
+ @property
138
+ def exons(self) -> list[tuple[int, int]]:
139
+ """
140
+ Return a list of exon boundary tuples (acceptor, donor).
141
+
142
+ Returns:
143
+ list of (int, int): List of exon boundaries.
144
+ """
145
+ exon_starts = np.concatenate(([self.transcript_start], self.acceptors))
146
+ exon_ends = np.concatenate((self.donors, [self.transcript_end]))
147
+ return list(zip(exon_starts, exon_ends))
148
+
149
+ @property
150
+ def exons_pos(self) -> list[tuple[int, int]]:
151
+ """
152
+ Return exons with positions adjusted for strand orientation.
153
+
154
+ Returns:
155
+ list of (int, int): Exons adjusted for strand orientation.
156
+ """
157
+ exon_positions = self.exons
158
+ if self.rev:
159
+ # Reverse order and swap coordinates for reverse strand
160
+ exon_positions = [(end, start) for start, end in exon_positions[::-1]]
161
+ return exon_positions
162
+
163
+ @property
164
+ def introns(self) -> list[tuple[int, int]]:
165
+ """
166
+ Return a list of intron boundaries derived from donors and acceptors.
167
+
168
+ Returns:
169
+ list of (int, int): Intron boundaries.
170
+ """
171
+ valid_donors = self.donors[self.donors != self.transcript_end]
172
+ valid_acceptors = self.acceptors[self.acceptors != self.transcript_start]
173
+ return list(zip(valid_donors, valid_acceptors))
174
+
175
+ @property
176
+ def introns_pos(self) -> list[tuple[int, int]]:
177
+ """
178
+ Return introns with positions adjusted for strand orientation.
179
+
180
+ Returns:
181
+ list of (int, int): Introns adjusted for strand orientation.
182
+ """
183
+ intron_positions = self.introns
184
+ if self.rev:
185
+ intron_positions = [(end, start) for start, end in intron_positions[::-1]]
186
+ return intron_positions
187
+
188
def _fix_and_check_introns(self) -> Transcript:
    """
    Deduplicate and order the splice sites, then validate the exon structure.

    ``self.acceptors`` and ``self.donors`` are replaced by their unique
    values, ascending when ``self.rev`` is falsy and descending when it is
    truthy. Three consistency checks are then run in order.

    Raises:
        ValueError: If the acceptor and donor counts differ, if an exon's
            coordinates are out of order, or if a splice site falls outside
            the transcript boundaries.

    Returns:
        Transcript: The current Transcript object (for chaining).
    """
    # np.unique already returns its result sorted ascending, so a separate
    # np.sort pass is unnecessary; the reverse strand only needs a flip.
    unique_acceptors = np.unique(self.acceptors)
    unique_donors = np.unique(self.donors)

    if self.rev:
        unique_acceptors = unique_acceptors[::-1]
        unique_donors = unique_donors[::-1]

    self.acceptors = unique_acceptors
    self.donors = unique_donors

    # Validation checks
    if self.__exon_intron_matchup_flag():
        raise ValueError("Unequal number of acceptors and donors.")

    if self.__exon_intron_order_flag():
        raise ValueError("Exon/intron order out of position.")

    if self.__transcript_boundary_flag():
        raise ValueError("Transcript boundaries must straddle acceptors and donors.")

    return self
220
+
221
def __exon_intron_matchup_flag(self) -> bool:
    """Return True when the number of acceptors differs from the number of donors."""
    acceptor_count = len(self.acceptors)
    donor_count = len(self.donors)
    return acceptor_count != donor_count
224
+
225
def __exon_intron_order_flag(self) -> bool:
    """Return True when any pair in ``self.exons_pos`` has a first coordinate greater than its second."""
    for first, second in self.exons_pos:
        if first > second:
            return True
    return False
228
+
229
def __transcript_boundary_flag(self) -> bool:
    """
    Return a truthy value when a splice site lies outside the transcript range.

    The smallest and largest values across acceptors and donors are compared
    with ``self.transcript_lower`` and ``self.transcript_upper``. With no
    acceptors and no donors there is nothing to violate, so the result is
    False.
    """
    if len(self.acceptors) == 0 and len(self.donors) == 0:
        return False
    splice_sites = np.concatenate((self.acceptors, self.donors))
    lowest_site = np.min(splice_sites)
    highest_site = np.max(splice_sites)
    return (self.transcript_lower > lowest_site) or (self.transcript_upper < highest_site)
236
+
237
@property
def exonic_indices(self) -> np.ndarray:
    """
    Every position covered by an exon, as one flat array.

    For each ``(a, b)`` pair in ``self.exons_pos`` the inclusive range
    ``a..b`` is generated, and the ranges are joined in exon order.

    Returns:
        np.ndarray: Concatenated exon positions.
    """
    per_exon_ranges = []
    for first, last in self.exons_pos:
        # +1 because the exon's last coordinate is included.
        per_exon_ranges.append(np.arange(first, last + 1))
    return np.concatenate(per_exon_ranges)
246
+
247
def pull_pre_mrna_pos(self) -> dict[str, Any]:
    """
    Read the genomic segment spanning this transcript from the chromosome FASTA.

    The FASTA path is ``chr<chrm>.fasta`` under the organism's
    ``CHROM_SOURCE`` directory from ``config``. The requested window runs
    from one position below ``self.transcript_lower`` to one position above
    ``self.transcript_upper``.

    Returns:
        dict: Whatever ``Fasta_segment.read_segment_endpoints`` returns
        (presumably the sequence and its indices; confirm against
        Fasta_segment).
    """
    reader = Fasta_segment()
    chromosome_fasta = config[self.organism]['CHROM_SOURCE'] / f'chr{self.chrm}.fasta'
    window_start = self.transcript_lower - 1
    window_end = self.transcript_upper + 1
    return reader.read_segment_endpoints(chromosome_fasta, window_start, window_end)
260
+
261
def generate_pre_mrna(self) -> Transcript:
    """
    Build the pre-mRNA for this transcript and store it on ``self.pre_mrna``.

    The segment returned by ``pull_pre_mrna_pos`` is wrapped with
    ``SeqMat.from_seq``. For a reverse-strand transcript
    (``self.rev`` truthy) ``reverse_complement()`` is called on it before it
    is stored.

    Returns:
        Transcript: The current Transcript object (for chaining).
    """
    raw_segment = self.pull_pre_mrna_pos()
    sequence = SeqMat.from_seq(raw_segment)
    if self.rev:
        # The return value is ignored: the call is relied on to modify
        # `sequence` itself.
        sequence.reverse_complement()
    self.pre_mrna = sequence
    return self
273
+
274
def mutate(self, mutation: MutSeqMat, inplace: bool = False) -> Union[Transcript, SeqMat]:
    """
    Apply a mutation to the pre-mRNA sequence of this Transcript.

    If the transcript is on the reverse strand (``self.rev`` is truthy), a
    reverse-complemented copy of the mutation is applied. The caller's
    ``mutation`` object is never modified, so the same mutation can safely
    be applied to several transcripts.

    Args:
        mutation (MutSeqMat): The mutation to apply. It is passed to
            ``self.pre_mrna.mutate`` and must support
            ``reverse_complement()``.
        inplace (bool): If True, replace this Transcript's ``pre_mrna`` and
            return ``self``. If False, leave this Transcript untouched and
            return a deep copy carrying the mutated ``pre_mrna``.

    Returns:
        Transcript: ``self`` when ``inplace`` is True, otherwise a new
        Transcript (deep copy of this one) with the mutated pre-mRNA. The
        ``SeqMat`` member of the return annotation is kept only for
        interface stability; both branches return a Transcript.
    """
    if self.rev:
        # reverse_complement() is used elsewhere in this class as an
        # in-place operation, so work on a private copy. Otherwise a
        # mutation shared between transcripts would flip orientation on
        # every call.
        mutation = copy.deepcopy(mutation)
        mutation.reverse_complement()

    mutated_seqmat = self.pre_mrna.mutate(mutation).seqmat

    # Either update this object directly or update a detached deep copy.
    target = self if inplace else copy.deepcopy(self)
    target.pre_mrna = SeqMat(mutated_seqmat)
    return target
307
+
308
def generate_mature_mrna(self, inplace: bool = True) -> Union[Transcript, SeqMat]:
    """
    Assemble the mature mRNA by joining the exon slices of the pre-mRNA.

    The splice sites are normalised and validated first via
    ``_fix_and_check_introns`` (which may raise ``ValueError`` and updates
    ``self.acceptors`` / ``self.donors`` regardless of ``inplace``). Each
    ``(start, end)`` pair in ``self.exons`` is then used to slice
    ``self.pre_mrna``, and the slices are accumulated in exon order.

    Args:
        inplace (bool): If True, store the result on ``self.mature_mrna``
            and return ``self``; otherwise return the assembled SeqMat.

    Returns:
        Transcript or SeqMat: ``self`` when ``inplace`` is True, else the
        assembled mature mRNA.
    """
    self._fix_and_check_introns()

    spliced = SeqMat.empty()
    unspliced = self.pre_mrna

    for first, last in self.exons:
        # Append this exon's slice to the growing transcript.
        spliced += unspliced[first:last]

    if not inplace:
        return spliced
    self.mature_mrna = spliced
    return self
331
+
332
@property
def orf(self, tis=None):
    """
    Open reading frame of the mature mRNA.

    When ``self.protein_coding`` is falsy a message is printed and the
    transcript itself is returned. Otherwise the ORF is obtained from
    ``self.mature_mrna.orf_seqmat`` starting at ``tis``, which falls back to
    ``self.TIS`` when not given. Accessed as a property, ``tis`` is always
    None, so ``self.TIS`` is used.

    Returns:
        The value of ``self.mature_mrna.orf_seqmat(tis)`` for coding
        transcripts, else ``self``.
    """
    if self.protein_coding:
        start_site = self.TIS if tis is None else tis
        return self.mature_mrna.orf_seqmat(start_site)

    print("Cannot create protein without set TIS and TTS values.")
    return self
347
+
348
def clone(self) -> Transcript:
    """
    Produce an independent deep copy of this Transcript.

    Returns:
        Transcript: A new object from ``copy.deepcopy``, sharing no mutable
        state with the original.
    """
    duplicate = copy.deepcopy(self)
    return duplicate
356
+
357
def generate_protein(self, inplace: bool = True, domains: Optional[np.ndarray] = None) -> Union[
        Transcript, tuple[str, np.ndarray]]:
    """
    Translate the ORF into a protein sequence.

    Stop symbols (``*``) are stripped from the translation. The conservation
    vector is the existing ``self.cons_vector`` when the attribute is
    present, otherwise an array of ones with one entry per residue.

    Args:
        inplace (bool): If True, store ``protein`` and ``cons_vector`` on
            ``self`` and return ``self``. If False, return them as a tuple
            and leave ``self`` unmodified.
        domains (np.ndarray, optional): Accepted for interface
            compatibility; not used by this method.

    Returns:
        Transcript or (protein: str, cons_vector: np.ndarray): ``self`` when
        ``inplace`` is True, else the protein and conservation vector. For a
        transcript that is not protein coding, a message is printed and
        ``self`` or ``("", empty array)`` is returned respectively.
    """
    if not self.protein_coding:
        print("No protein can be generated without TIS/TTS.")
        return self if inplace else ("", np.array([]))

    # Translate the ORF to protein, dropping stop symbols.
    protein = str(Seq(self.orf.seq).translate()).replace('*', '')

    # Use the existing cons_vector or default to an array of ones.
    cons_vector = self.cons_vector if hasattr(self, 'cons_vector') else np.ones(len(protein))

    if not inplace:
        # Match the non-coding early return: hand back values, do not
        # touch self.
        return protein, cons_vector

    self.cons_vector = cons_vector
    self.protein = protein
    return self
@@ -0,0 +1,38 @@
1
+ from .Gene import *
2
+ import numpy as np
3
+
4
class Allele(SeqMat):
    """
    A SeqMat describing one alternate allele.

    NOTE(review): ``SeqMat`` has to reach this module through the star
    import above; confirm that it is actually exported there.

    Attributes:
        position: The smallest value in ``pos1``.
    """

    def __init__(self, alt, pos1, pos2, rev):
        """
        Build the allele and orient it for the strand.

        Args:
            alt: Alternate sequence, forwarded to ``SeqMat.__init__``.
            pos1: Primary indices, forwarded to ``SeqMat.__init__``; their
                minimum becomes ``self.position``.
            pos2: Secondary indices, forwarded to ``SeqMat.__init__``.
            rev: When truthy, ``reverse_complement()`` is called on the
                freshly built allele.
        """
        super().__init__(alt, pos1, pos2)
        lowest_index = min(pos1)
        self.position = lowest_index
        if rev:
            self.reverse_complement()
10
+
11
+ # def _continuous(self, ind):
12
+ # return True
13
+
14
+
15
+
16
def get_mutation(mut_id, rev=False):
    """
    Build an ``Allele`` from a colon-separated mutation identifier.

    ``mut_id`` must split on ``:`` into exactly five fields. The first two
    are ignored here; the third is the integer position, the fourth the
    reference allele and the fifth the alternate allele. ``'-'`` marks an
    absent allele.

    Args:
        mut_id (str): Mutation identifier with five ``:``-separated fields.
        rev (bool): Forwarded to ``Allele``.

    Returns:
        Allele for a single-base substitution, a deletion, an insertion or
        a general ref-to-alt replacement; None when both alleles are ``'-'``.
    """
    _, _, position, ref, alt = mut_id.split(':')
    position = int(position)

    has_ref = ref != '-'
    has_alt = alt != '-'

    # Single-base substitution: one index at the position itself.
    if has_alt and has_ref and len(alt) == len(ref) == 1:
        return Allele(alt, [position], [0], rev)

    # Deletion: one '-' per deleted reference base.
    if has_ref and not has_alt:
        deleted = len(ref)
        return Allele('-' * deleted, np.arange(position, position + deleted, dtype=np.int32), [0] * deleted, rev)

    # Insertion: every inserted base shares the anchor position and is
    # told apart by a 1-based secondary index.
    if has_alt and not has_ref:
        inserted = len(alt)
        return Allele(alt, np.full(inserted, position), np.arange(1, inserted + 1), rev)

    # General replacement: delete the reference span, then insert the
    # alternate bases after the last reference position.
    if has_alt and has_ref:
        ref_len, alt_len = len(ref), len(alt)
        ind1 = np.concatenate(
            [np.arange(position, position + ref_len, dtype=np.int32),
             np.full(alt_len, ref_len + position - 1, dtype=np.int32)])
        ind2 = np.concatenate(
            [np.zeros(ref_len, dtype=np.int32),
             np.arange(1, alt_len + 1, dtype=np.int32)])
        return Allele('-' * ref_len + alt, ind1, ind2, rev)

    return None
36
+
37
+
38
+
geney/oncosplice.py CHANGED
@@ -1,19 +1,22 @@
1
- import copy
2
-
3
1
  from Bio import pairwise2
4
2
  import re
5
- import numpy as np
3
+ import hashlib
4
+ from tqdm import tqdm
6
5
  import pandas as pd
7
- from .splicing_utils import find_transcript_missplicing, develop_aberrant_splicing, Missplicing
8
- from .seqmat_utils import *
9
- from .mutation_utils import *
6
+ import numpy as np
7
+ from .SeqMats import SeqMat, MutSeqMat
8
+ from .splicing_utils import find_transcript_missplicing_seqs, develop_aberrant_splicing
10
9
  from .tis_utils import find_tis
11
10
 
11
def short_hash_of_list(numbers, length=5):
    """
    Derive a short, deterministic identifier from a value's ``repr``.

    Args:
        numbers: Any object; its ``repr`` is UTF-8 encoded and hashed.
        length (int): Number of leading hex characters to keep.

    Returns:
        str: The first ``length`` characters of the SHA-256 hex digest.
    """
    digest = hashlib.sha256(repr(numbers).encode('utf-8'))
    return digest.hexdigest()[:length]
15
+
12
16
def find_continuous_gaps(sequence):
    """Return the (start, end) span of every maximal run of '-' in an alignment string."""
    gap_spans = []
    for gap in re.finditer(r'-+', sequence):
        gap_spans.append(gap.span())
    return gap_spans
15
19
 
16
-
17
20
  def get_logical_alignment(ref_prot, var_prot):
18
21
  """
19
22
  Aligns two protein sequences and finds the optimal alignment with the least number of gaps.
@@ -272,43 +275,22 @@ def summarize_missplicing_event(pes, pir, es, ne, ir):
272
275
  event.append('NE')
273
276
  if len(event) >= 1:
274
277
  return ','.join(event)
275
- # elif len(event) == 1:
276
- # return event[0]
277
278
  else:
278
279
  return '-'
279
280
 
280
281
 
281
282
  # Annotating
282
- def OncospliceAnnotator(reference_transcript, variant_transcript, mut):
283
+ def OncospliceAnnotator(reference_transcript, variant_transcript, mut, ref_attributes=[], var_attributes=[]):
283
284
  affected_exon, affected_intron, distance_from_5, distance_from_3 = find_splice_site_proximity(np.floor(mut.indices[0]),
284
285
  reference_transcript)
285
286
 
286
287
  report = {}
287
-
288
288
  report['primary_transcript'] = reference_transcript.primary_transcript
289
289
  report['transcript_id'] = reference_transcript.transcript_id
290
- # report['mut_id'] = mut.mut_id
291
- # report['cons_available'] = int(reference_transcript.cons_available)
292
- # report['protein_coding'] = reference_transcript.transcript_biotype
293
-
294
- # report['reference_mrna'] = reference_transcript.transcript_seq
295
- # report['reference_cds_start'] = reference_transcript.TIS
296
- # report['reference_pre_mrna'] = reference_transcript.pre_mrna
297
- # report[
298
- # 'reference_orf'] = reference_transcript.orf # pre_mrna[reference_transcript.transcript_indices.index(reference_transcript.TIS):reference_transcript.transcript_indices.index(reference_transcript.TTS)]
299
290
  report['reference_protein'] = reference_transcript.protein
300
- # report['reference_protein_length'] = len(reference_transcript.protein)
301
-
302
- # report['variant_mrna'] = variant_transcript.transcript_seq
303
- # report['variant_cds_start'] = variant_transcript.TIS
304
- # report[
305
- # 'variant_pre_mrna'] = variant_transcript.pre_mrna # pre_mrna[variant_transcript.transcript_indices.index(variant_transcript.TIS):variant_transcript.transcript_indices.index(variant_transcript.TTS)]
306
- # report['variant_orf'] = variant_transcript.orf
307
291
  report['variant_protein'] = variant_transcript.protein
308
292
  report['variant_protein_length'] = len(variant_transcript.protein)
309
-
310
293
  descriptions = define_missplicing_events(reference_transcript, variant_transcript)
311
- # print(descriptions)
312
294
  report['exon_changes'] = '|'.join([v for v in descriptions if v])
313
295
  report['splicing_codes'] = summarize_missplicing_event(*descriptions)
314
296
  report['affected_exon'] = affected_exon
@@ -318,60 +300,79 @@ def OncospliceAnnotator(reference_transcript, variant_transcript, mut):
318
300
  return report
319
301
 
320
302
 
321
- def oncosplice(mut_id, splicing_threshold=0.5, protein_coding=True, cons_required=False, primary_transcript=False, window_length=13, organism='hg38', engine='spliceai', domains=None):
322
- gene = Gene(mut_id.split(':')[0], organism=organism)
323
- reference_gene_proteins = {tid: transcript.generate_pre_mrna().generate_mature_mrna().generate_protein() for tid, transcript in gene.run_transcripts(protein_coding=True)}
324
- mutations = [get_mutation(m, rev=gene.rev) for m in mut_id.split('|')]
303
+ def oncosplice(mut_id, splicing_threshold=0.5, protein_coding=True, cons_required=False, primary_transcript=False,
304
+ window_length=13, organism='hg38', engine='spliceai', domains=None):
305
+ gene = Gene.from_file(mut_id.split(':')[0], organism=organism)
306
+ reference_gene_proteins = {
307
+ transcript.generate_pre_mrna().generate_mature_mrna().generate_protein().protein: transcript.transcript_id for
308
+ transcript in gene if transcript.transcript_biotype == 'protein_coding'}
325
309
 
310
+ mutations = [MutSeqMat.from_mutid(m) for m in mut_id.split('|')]
326
311
  results = []
327
- for tid, transcript in gene.run_transcripts(protein_coding=protein_coding, primary_transcript=primary_transcript):
328
- if cons_required and not transcript.cons_available:
312
+ for reference_transcript in tqdm(gene):
313
+ if (cons_required and not reference_transcript.cons_available) or (
314
+ protein_coding and not reference_transcript.transcript_biotype == 'protein_coding'):
329
315
  continue
330
316
 
331
- if all(mutation not in transcript for mutation in mutations):
317
+ current_mutations = [m for m in mutations if m in reference_transcript]
318
+ if len(current_mutations) == 0:
332
319
  continue
333
320
 
334
- transcript.generate_pre_mrna()
335
- transcript.cons_vector = transform_conservation_vector(transcript.cons_vector, window=window_length)
336
- transcript.generate_mature_mrna().generate_protein(inplace=True, domains=domains)
337
- ref_protein, cons_vector = transcript.protein, transcript.cons_vector
338
- reference_transcript = copy.deepcopy(transcript)
321
+ center = np.mean([m.indices[0] for m in current_mutations]) // 1
339
322
 
340
- assert len(ref_protein) == len(cons_vector), f"Protein ({len(ref_protein)}) and conservation vector ({len(cons_vector)}) must be same length. {ref_protein}, \n>{cons_vector}\n>{transcript.cons_seq}"
323
+ mutated_transcript = reference_transcript.clone()
324
+ for mutation in current_mutations:
325
+ mutated_transcript.mutate(mutation, inplace=True)
341
326
 
342
- missplicing = Missplicing(find_transcript_missplicing(transcript, mutations, engine=engine, threshold=splicing_threshold), threshold=splicing_threshold)
343
- for mutation in mutations:
344
- transcript.pre_mrna += mutation
327
+ reference_transcript.generate_mature_mrna().generate_protein()
328
+ reference_transcript.cons_vector = transform_conservation_vector(reference_transcript.cons_vector,
329
+ window=window_length)
345
330
 
346
- for i, new_boundaries in enumerate(develop_aberrant_splicing(transcript, missplicing.aberrant_splicing)):
347
- transcript.acceptors = new_boundaries['acceptors']
348
- transcript.donors = new_boundaries['donors']
349
- transcript.generate_mature_mrna().generate_protein()
331
+ assert len(reference_transcript.protein) == len(
332
+ reference_transcript.cons_vector), f"Protein ({len(reference_transcript.protein)}) and conservation vector ({len(reference_transcript.cons_vector)}) must be same length."
350
333
 
351
- alignment = get_logical_alignment(reference_transcript.protein, transcript.protein)
334
+ missplicing = find_transcript_missplicing_seqs(
335
+ reference_transcript.pre_mrna.get_context(center, context=7500, padding='N'),
336
+ mutated_transcript.pre_mrna.get_context(center, context=7500, padding='N'), reference_transcript.donors,
337
+ reference_transcript.acceptors, threshold=splicing_threshold, engine=engine)
338
+ alternative_splicing_paths = develop_aberrant_splicing(reference_transcript, missplicing.aberrant_splicing)
339
+
340
+ for i, new_boundaries in enumerate(alternative_splicing_paths):
341
+ mutated_transcript.acceptors = new_boundaries['acceptors']
342
+ mutated_transcript.donors = new_boundaries['donors']
343
+ mutated_transcript.generate_mature_mrna().generate_protein()
344
+
345
+ alignment = get_logical_alignment(reference_transcript.protein, mutated_transcript.protein)
352
346
  deleted, inserted = find_indels_with_mismatches_as_deletions(alignment.seqA, alignment.seqB)
353
- modified_positions = find_modified_positions(len(ref_protein), deleted, inserted)
354
- temp_cons = np.convolve(cons_vector * modified_positions, np.ones(window_length)) / window_length
347
+ modified_positions = find_modified_positions(len(reference_transcript.protein), deleted, inserted)
348
+ temp_cons = np.convolve(reference_transcript.cons_vector * modified_positions,
349
+ np.ones(window_length)) / window_length
355
350
  affected_cons_scores = max(temp_cons)
356
351
  percentile = (
357
- sorted(cons_vector).index(next(x for x in sorted(cons_vector) if x >= affected_cons_scores)) / len(
358
- cons_vector))
352
+ sorted(reference_transcript.cons_vector).index(
353
+ next(x for x in sorted(reference_transcript.cons_vector) if x >= affected_cons_scores)) / len(
354
+ reference_transcript.cons_vector))
359
355
 
360
- report = OncospliceAnnotator(reference_transcript, transcript, mutation)
356
+ report = OncospliceAnnotator(reference_transcript, mutated_transcript, current_mutations[0])
361
357
  report['mut_id'] = mut_id
358
+ report['engine'] = engine
362
359
  report['oncosplice_score'] = affected_cons_scores
363
360
  report['percentile'] = percentile
364
- report['isoform_id'] = i
361
+ report['isoform_id'] = short_hash_of_list(mutated_transcript.exons)
365
362
  report['isoform_prevalence'] = new_boundaries['path_weight']
366
363
  report['full_missplicing'] = missplicing.aberrant_splicing
367
364
  report['missplicing'] = max(missplicing)
368
- report['reference_resemblance'] = reference_gene_proteins.get(transcript.protein, None)
365
+ report['reference_resemblance'] = reference_gene_proteins.get(mutated_transcript.protein, None)
369
366
  results.append(report)
370
367
 
371
368
  if len(results) == 0:
372
369
  return None
373
370
 
374
- return pd.DataFrame(results)
371
+ return pd.DataFrame(results)[
372
+ ['mut_id', 'transcript_id', 'isoform_id', 'primary_transcript', 'missplicing', 'full_missplicing',
373
+ 'exon_changes', 'splicing_codes', 'affected_exon', 'affected_intron', 'mutation_distance_from_5',
374
+ 'mutation_distance_from_3', 'engine', 'reference_resemblance', 'oncosplice_score', 'percentile',
375
+ 'isoform_prevalence', 'reference_protein', 'variant_protein']]
375
376
 
376
377
 
377
378
  import asyncio
geney/splicing_utils.py CHANGED
@@ -146,7 +146,7 @@ def find_ss_changes(ref_dct, mut_dct, known_splice_sites, threshold=0.5):
146
146
 
147
147
 
148
148
  def find_transcript_missplicing_mutid(mut_id):
149
- from geney.seqmat_utils import Gene
149
+ from geney.Gene import Gene
150
150
  transcript = Gene(mut_id.split(':')[0]).transcript().generate_mature_mrna()
151
151
  out = find_transcript_missplicing(transcript, [get_mutation(mut_id, rev=transcript.rev)], context=5000, window=2500, threshold=0.5, engine='spliceai', just_ss=True)
152
152
  best_delta = 0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geney
3
- Version: 1.2.69
3
+ Version: 1.3.2
4
4
  Summary: A Python package for gene expression modeling.
5
5
  Home-page: https://github.com/nicolaslynn/geney
6
6
  Author: Nicolas Lynn
@@ -29,6 +29,8 @@ Requires-Dist: notebook
29
29
  Requires-Dist: matplotlib
30
30
  Requires-Dist: gffutils
31
31
  Requires-Dist: pyfastx
32
+ Requires-Dist: tensorflow
33
+ Requires-Dist: keras
32
34
 
33
35
  UNKNOWN
34
36
 
@@ -1,17 +1,21 @@
1
1
  geney/Fasta_segment.py,sha256=0zCdzPUbDeM9Rz642woH5Q94pwI46O0fE3H8w0XWebc,11255
2
+ geney/Gene.py,sha256=JGWtfA6-d1W3I9YRASwaF8vaZ6CCuY0KEawQNdloIqY,6259
3
+ geney/SeqMats.py,sha256=jkXmXAs0OpnFeyCfiJcKKpHHSi9JpKgiOIwsu63e1CQ,18557
4
+ geney/Transcript.py,sha256=eRZXVVxDVBbv0l385bnAOBFRBSzBwppXcbBq8KXkwlo,14443
2
5
  geney/__init__.py,sha256=eBdDl42N6UhcYeZDjOnv199Z88fI5_8Y6xW8447OKXM,755
6
+ geney/_mutation_utils.py,sha256=dHssUsnii_mf-wuRoMmF13UlD7k3ml_VwQMItTYnXpU,1132
3
7
  geney/config_setup.py,sha256=klm_k7Ca_703DpeGBcGoDqz1XwHQhNXENPKjj_xfSQw,608
4
8
  geney/data_setup.py,sha256=2RHmuvcGUQbEglXQEZr0C2QPDTQYRZOEm0EcmyfQJgU,12229
5
9
  geney/graphic_utils.py,sha256=oMsBpB9YeEn96gGpKh4MmtagJffWZbk-xPrIwHvkFhA,11016
6
10
  geney/gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
7
11
  geney/immune_utils.py,sha256=ZRni5ttrhpYBnmNr0d0ZatIbNPYs4nmQuoUO00SpsS4,5271
8
12
  geney/mutation_utils.py,sha256=C_kv2MB_L8LlhX3W2ooXjJ3uDoJ8zX1WeDtZKoBZJkI,1547
9
- geney/oncosplice.py,sha256=eWgY2Lcj894UBFnIVhbxiVz5oqASHg-Ot1wFbjlJbI8,21857
13
+ geney/oncosplice.py,sha256=FdvuROk2G7wwLoB5lLzYia8Smw9hHZeVs-J2MUoAwlU,22106
10
14
  geney/pangolin_utils.py,sha256=i5j5vEMCWOTIa1mRP2377BAhlUFZjHBzTQBips4lA_4,2934
11
15
  geney/power_utils.py,sha256=MehZFUdkJ2EFUot709yPEDxSkXmH5XevMebX2HD768A,7330
12
16
  geney/seqmat_utils.py,sha256=wzb3PX5it5bpIFQvcxyzlxfhoJTbHHbsjg0rzh05iVs,19753
13
17
  geney/spliceai_utils.py,sha256=PFIhTK8Ihrj-cv5tgRN0UFPYEmC4uxtqXSP9bBLnZRM,3077
14
- geney/splicing_utils.py,sha256=0DFnBJaEqPQ8_0VPFzsXu_ZxaTlw56e0bcAGVLyIH1Q,19223
18
+ geney/splicing_utils.py,sha256=4xYXy_dIbqdbVfxsEj_OCuM-MsQ24gi4fIv0vQjAYcQ,19215
15
19
  geney/survival_utils.py,sha256=KnAzEviMuXh6SnVXId9PgsFLSbgkduTvYoIthxN7FPA,6886
16
20
  geney/tcga_utils.py,sha256=D_BNHm-D_K408dlcJm3hzH2c6QNFjQsKvUcOPiQRk7g,17612
17
21
  geney/tis_utils.py,sha256=2makfGfVlDFVIbxzXE85AY9jmAjcNmxyIAxjvkRA5LY,7396
@@ -20,7 +24,7 @@ geney/translation_initiation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
20
24
  geney/translation_initiation/tis_utils.py,sha256=AF3siFjuQH-Rs44EV-80zHdbxRMvN4woLFSHroWIETc,4448
21
25
  geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
22
26
  geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
23
- geney-1.2.69.dist-info/METADATA,sha256=6Qzw_HkVkPN0J_dFfc8lYuw5rJsIsjKlT_uvybAmltM,948
24
- geney-1.2.69.dist-info/WHEEL,sha256=fS9sRbCBHs7VFcwJLnLXN1MZRR0_TVTxvXKzOnaSFs8,110
25
- geney-1.2.69.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
26
- geney-1.2.69.dist-info/RECORD,,
27
+ geney-1.3.2.dist-info/METADATA,sha256=aGPdV-x5PcONjV5ylUg8rYhW0eo4Fm2HDOE8dzldpcg,994
28
+ geney-1.3.2.dist-info/WHEEL,sha256=fS9sRbCBHs7VFcwJLnLXN1MZRR0_TVTxvXKzOnaSFs8,110
29
+ geney-1.3.2.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
30
+ geney-1.3.2.dist-info/RECORD,,
File without changes