geney 1.2.69__py2.py3-none-any.whl → 1.3.2__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- geney/Gene.py +177 -0
- geney/SeqMats.py +492 -0
- geney/Transcript.py +379 -0
- geney/_mutation_utils.py +38 -0
- geney/oncosplice.py +59 -58
- geney/splicing_utils.py +1 -1
- {geney-1.2.69.dist-info → geney-1.3.2.dist-info}/METADATA +3 -1
- {geney-1.2.69.dist-info → geney-1.3.2.dist-info}/RECORD +10 -6
- {geney-1.2.69.dist-info → geney-1.3.2.dist-info}/WHEEL +0 -0
- {geney-1.2.69.dist-info → geney-1.3.2.dist-info}/top_level.txt +0 -0
geney/Gene.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
from typing import Any, Dict, List, Tuple, Optional, Iterator, Union, TYPE_CHECKING
|
|
3
|
+
from collections import Counter
|
|
4
|
+
from . import unload_pickle, config
|
|
5
|
+
from .Transcript import Transcript
|
|
6
|
+
|
|
7
|
+
class Gene:
    """
    A gene together with its transcript annotations and basic metadata.

    Attributes:
        gene_name (str): The name of the gene.
        gene_id (str): The unique identifier for the gene.
        rev: Strand flag exactly as stored in the annotation data
            (presumably True for the reverse strand -- confirm against the data builder).
        chrm (str): The chromosome on which the gene resides.
        organism (str): The organism build (e.g. 'hg38').
        transcripts (dict): Transcript annotation dictionaries keyed by transcript ID.
    """

    def __init__(self, gene_name, gene_id, rev, chrm, transcripts, organism='hg38'):
        """
        Build a Gene from already-loaded annotation values.

        No file access happens here; use :meth:`from_file` to load a gene from disk.

        Args:
            gene_name (str): Name of the gene.
            gene_id (str): Unique identifier of the gene.
            rev: Strand flag, stored unchanged.
            chrm (str): Chromosome name.
            transcripts (dict or None): Transcript annotations keyed by transcript ID.
                ``None`` is replaced by an empty dict.
            organism (str): Organism reference build (default 'hg38').
        """
        self.gene_name = gene_name
        self.gene_id = gene_id
        self.rev = rev
        self.chrm = chrm
        self.organism = organism
        self.transcripts = transcripts if transcripts is not None else {}

    def __repr__(self) -> str:
        """
        Official string representation of the Gene object.
        """
        return f"Gene({self.gene_name})"

    def __str__(self) -> str:
        """
        Unofficial, user-friendly string representation of the Gene object.
        """
        return f"Gene: {self.gene_name}, ID: {self.gene_id}, Chr: {self.chrm}, Transcripts: {len(self.transcripts)}"

    def __len__(self) -> int:
        """
        Returns the number of transcripts associated with this gene.

        Returns:
            int: The count of transcripts.
        """
        return len(self.transcripts)

    def __copy__(self):
        """
        Returns a shallow copy of the Gene object.

        The copy shares its attribute values (including the ``transcripts`` dict)
        with the original.
        """
        # Calling copy.copy(self) here would dispatch straight back to this method
        # and recurse forever, so the new instance is assembled by hand.
        clone = self.__class__.__new__(self.__class__)
        clone.__dict__.update(self.__dict__)
        return clone

    def __deepcopy__(self, memo):
        """
        Returns a deep copy of the Gene object.

        Args:
            memo (dict): The memo dictionary supplied by ``copy.deepcopy``.
        """
        # Same recursion hazard as __copy__: copy.deepcopy(self, memo) would call
        # this method again. Register the clone first so reference cycles resolve.
        clone = self.__class__.__new__(self.__class__)
        memo[id(self)] = clone
        for name, value in self.__dict__.items():
            setattr(clone, name, copy.deepcopy(value, memo))
        return clone

    def __iter__(self):
        """
        Allow iteration over the gene's transcripts, yielding Transcript objects.

        A new Transcript is constructed from the stored annotations on every iteration.
        """
        for annotations in self.transcripts.values():
            yield Transcript(annotations, organism=self.organism)

    @classmethod
    def from_file(cls, gene_name, organism='hg38'):
        """
        Load a gene from the pickled annotation files of the configured organism.

        Files are searched as ``*_{gene_name}.pkl`` inside the ``protein_coding``
        folder of ``config[organism]['MRNA_PATH']``; the first match is used.

        Args:
            gene_name (str): Gene name used to locate the file.
            organism (str): Organism reference build (default 'hg38').

        Returns:
            Gene: A gene populated from the file contents. Note that ``gene_name``
            on the result is taken from the file, not from the argument.

        Raises:
            FileNotFoundError: If no file matches ``gene_name``.
        """
        # Find gene data files in the configured organism MRNA path
        gene_files = list((config[organism]['MRNA_PATH'] / 'protein_coding').glob(f'*_{gene_name}.pkl'))
        if not gene_files:
            raise FileNotFoundError(f"No files available for gene '{gene_name}'.")

        # Load gene data from the first matching file.
        # NOTE(review): unload_pickle presumably unpickles the file -- only point
        # MRNA_PATH at trusted data.
        data = unload_pickle(gene_files[0])

        return cls(
            gene_name=data.get('gene_name'),
            gene_id=data.get('gene_id'),
            rev=data.get('rev'),
            chrm=data.get('chrm'),
            transcripts=data.get('transcripts', {}),
            organism=organism,
        )

    def splice_sites(self) -> Tuple['Counter', 'Counter']:
        """
        Aggregates splice sites (acceptors and donors) from all transcripts.

        Returns:
            tuple(Counter, Counter): A tuple of two Counters for acceptors and donors,
            each mapping a site to the number of transcripts listing it.
        """
        acceptors: Counter = Counter()
        donors: Counter = Counter()

        # Transcripts without the key simply contribute nothing.
        for annotations in self.transcripts.values():
            acceptors.update(annotations.get('acceptors', []))
            donors.update(annotations.get('donors', []))

        return acceptors, donors

    def transcript(self, tid: Optional[str] = None):
        """
        Retrieve a Transcript object by ID, or the primary transcript if no ID is given.

        Args:
            tid (str, optional): Transcript ID. If None, returns primary transcript.

        Returns:
            Transcript: The Transcript object with the given ID or the primary transcript.

        Raises:
            AttributeError: If the requested transcript does not exist (this includes
                the case where no ID is given and no primary transcript can be chosen).
        """
        if tid is None:
            tid = self.primary_transcript

        if tid not in self.transcripts:
            raise AttributeError(f"Transcript '{tid}' not found in gene '{self.gene_name}'.")

        return Transcript(self.transcripts[tid], organism=self.organism)

    @property
    def primary_transcript(self) -> Optional[str]:
        """
        Returns the primary transcript ID for this gene.

        Selection order: the first transcript flagged ``primary_transcript``,
        otherwise the first transcript whose ``transcript_biotype`` is
        ``'protein_coding'``, otherwise None. The result is cached on the
        instance after the first lookup.

        Returns:
            str or None: The primary transcript ID or None if not available.
        """
        # If already calculated, return it
        if hasattr(self, '_primary_transcript'):
            return self._primary_transcript

        chosen = next(
            (tid for tid, ann in self.transcripts.items() if ann.get('primary_transcript')),
            None,
        )
        if chosen is None:
            # Fallback: first protein-coding transcript
            chosen = next(
                (tid for tid, ann in self.transcripts.items()
                 if ann.get('transcript_biotype') == 'protein_coding'),
                None,
            )

        self._primary_transcript = chosen
        return chosen
|
|
176
|
+
|
|
177
|
+
|
geney/SeqMats.py
ADDED
|
@@ -0,0 +1,492 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import numpy as np
|
|
3
|
+
|
|
4
|
+
ALPHABET = {'N': 'N', 'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', '-': '-'}


class SeqMat:
    """
    A nucleotide sequence stored as an integer matrix with one column per position.

    Rows of ``seqmat``:
        ROW_SEQ: code of the character (its position in the alphabet's key order).
        ROW_INDS: coordinate of the column.
        ROW_SUPERINDS: 0 for ordinary columns; values > 0 mark inserted columns that
            share the coordinate of the column they were inserted next to.
        ROW_MUTATED: flag row; created as zeros by ``from_seq``.

    The alphabet maps every character to its complement. The character '-' marks a
    deleted position: it is kept in ``rawseq`` but removed from ``seq`` and ``indices``.
    """

    ROW_SEQ = 0
    ROW_INDS = 1
    ROW_SUPERINDS = 2
    ROW_MUTATED = 3

    GAP_CHAR = '-'

    # Lazily skips whole codons from the start until the first in-frame stop codon.
    _STOP_CODON_PATTERN = re.compile(r"(?:[NACGT]{3})*?(TAA|TAG|TGA)")

    # Codon-to-amino acid mapping table (standard genetic code)
    _CODON_TABLE = {
        'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
        'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
        'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
        'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
        'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
        'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
        'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
        'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
        'TAT': 'Y', 'TAC': 'Y', 'TAA': '*', 'TAG': '*',
        'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
        'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
        'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
        'TGT': 'C', 'TGC': 'C', 'TGA': '*', 'TGG': 'W',
        'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
        'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
        'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
    }

    # __eq__ compares mutable array contents, so instances are deliberately unhashable
    # (this is what defining __eq__ alone already implied; stated here explicitly).
    __hash__ = None

    def __init__(self, seqmat, alphabet=None):
        """
        Wrap an existing sequence matrix.

        Args:
            seqmat (np.ndarray): Matrix laid out as described on the class. It is
                stored as given (no copy is made).
            alphabet (dict, optional): Character -> complement mapping. Defaults to a
                copy of the module-level ``ALPHABET``.
        """
        self.seqmat = seqmat
        self.alphabet = alphabet or dict(ALPHABET)

        symbols = list(self.alphabet)
        self.char_to_value = {c: i for i, c in enumerate(symbols)}
        self.value_to_char = dict(enumerate(symbols))
        self.value_complements = {
            self.char_to_value[c1]: self.char_to_value[c2] for c1, c2 in self.alphabet.items()
        }

    def __repr__(self):
        return f"<SeqMat: {self.seq}>"

    def __str__(self):
        return self.seq

    def __len__(self):
        """Number of columns, including gap columns."""
        return self.seqmat.shape[1]

    def __getitem__(self, key):
        """
        Select by coordinate (not by column offset).

        A slice is inclusive on both ends: ``s[a:b]`` returns the columns from
        coordinate ``a`` through coordinate ``b``. An omitted bound extends to the
        corresponding end of the sequence. The slice step is ignored.

        Raises:
            IndexError: If a requested coordinate is not present.
        """
        if isinstance(key, slice):
            first = 0 if key.start is None else self._rel_index(key.start)
            last = len(self) - 1 if key.stop is None else self._rel_index(key.stop)
            return SeqMat(self.seqmat[:, first:last + 1], alphabet=self.alphabet)

        col = self._rel_index(key)
        return SeqMat(self.seqmat[:, col:col + 1], alphabet=self.alphabet)

    def __contains__(self, other):
        """
        Checks if another SeqMat object is entirely contained within this SeqMat object.

        Containment is decided on coordinates only: every value in ``other``'s
        ROW_INDS must occur in this object's ROW_INDS.

        Args:
            other (SeqMat): Another SeqMat object to check for containment.

        Returns:
            bool: True if `other` is contained in `self`, False otherwise.

        Raises:
            TypeError: If `other` is not a SeqMat.
        """
        if not isinstance(other, SeqMat):
            raise TypeError("Can only check containment with another SeqMat object.")

        other_indices = other.seqmat[other.ROW_INDS, :]
        self_indices = self.seqmat[self.ROW_INDS, :]
        return bool(np.all(np.isin(other_indices, self_indices)))

    def __eq__(self, other):
        """
        Implements the == operator to compare two SeqMat objects.

        Args:
            other (SeqMat): The other SeqMat object to compare.

        Returns:
            bool: True if `other` is a SeqMat with an identical matrix, False otherwise.
        """
        if not isinstance(other, SeqMat):
            return False
        return bool(np.array_equal(self.seqmat, other.seqmat))

    @classmethod
    def empty(cls, alphabet=None):
        """
        Creates an empty SeqMat object.

        Args:
            alphabet (dict): Optional alphabet dictionary (defaults to ``ALPHABET``).

        Returns:
            SeqMat: An empty SeqMat object (4 rows, 0 columns).
        """
        empty_seqmat = np.zeros((4, 0), dtype=np.int32)
        return cls(empty_seqmat, alphabet=alphabet)

    def _joined_with(self, other):
        """Return self and other stacked side by side, or raise if the coordinates stop being monotonic."""
        combined = np.hstack((self.seqmat, other.seqmat))
        if not self._is_monotonic(combined[self.ROW_INDS]):
            raise ValueError("Resulting sequence indices are not monotonic.")
        return combined

    def __add__(self, other):
        """
        Implements the + operator. Joins two SeqMat objects or applies mutations.

        If all of `other`'s coordinates already occur in this sequence, `other` is
        applied as a mutation. Otherwise the two are concatenated, provided the
        resulting coordinates are monotonically increasing or decreasing.

        Args:
            other (SeqMat): Another SeqMat object to join or mutate.

        Returns:
            SeqMat: A new SeqMat object with the resulting sequence.

        Raises:
            TypeError: If `other` is not a SeqMat.
            ValueError: If concatenation would produce non-monotonic coordinates.
        """
        if not isinstance(other, SeqMat):
            raise TypeError("Can only add another SeqMat object.")

        if other in self:
            return self.mutate(other)

        return SeqMat(self._joined_with(other), alphabet=self.alphabet)

    def __iadd__(self, other):
        """
        Implements the += operator. Joins two SeqMat objects or applies mutations in place.

        Args:
            other (SeqMat): Another SeqMat object to join or mutate.

        Returns:
            SeqMat: This object, after mutation or concatenation.

        Raises:
            TypeError: If `other` is not a SeqMat.
            ValueError: If concatenation would produce non-monotonic coordinates. In
                that case this object is left unchanged.
        """
        if not isinstance(other, SeqMat):
            raise TypeError("Can only add another SeqMat object.")

        if other in self:
            self.seqmat = self.mutate(other).seqmat
        else:
            # Validate before assigning so a failed += cannot corrupt this object.
            self.seqmat = self._joined_with(other)
        return self

    def get_context(self, pos, context=500, padding=None):
        """
        Returns a SeqMat object representing the region around `pos` with the given context.
        If padding is provided and the requested context extends beyond the sequence boundaries,
        the result is padded with the specified nucleotide in the sequence row and -1 in all other rows.

        Args:
            pos (int): The position of interest in the original coordinate space.
            context (int): The number of columns to include on each side of pos (default 500).
            padding (str or None): The nucleotide to use for padding. If None, no padding is applied and
                the returned region may be shorter than requested. A character that is not in
                the alphabet falls back to 'N'.

        Returns:
            SeqMat: A new SeqMat object containing the context region (padded if requested).

        Raises:
            IndexError: If `pos` is not present in the sequence.
        """
        centre = self._rel_index(pos)
        wanted = 2 * context + 1
        start = centre - context
        end = centre + context + 1

        # Clip the window to the columns that actually exist.
        lo = max(start, 0)
        hi = min(len(self), end)
        window = self.seqmat[:, lo:hi]
        width = window.shape[1]

        if padding is None or width == wanted:
            return SeqMat(window, alphabet=self.alphabet)

        # Only the left overhang is needed to position the window; whatever remains
        # on the right is padding as well.
        pad_left = lo - start
        pad_code = self.char_to_value.get(padding, self.char_to_value['N'])

        padded = np.full((self.seqmat.shape[0], wanted), -1, dtype=self.seqmat.dtype)
        padded[self.ROW_SEQ, :] = pad_code
        padded[:, pad_left:pad_left + width] = window
        return SeqMat(padded, alphabet=self.alphabet)

    def _rel_index(self, pos):
        """
        Translate a coordinate into a column offset.

        The coordinate must appear in ``indices`` (so deleted positions do not
        qualify); the first column whose ROW_INDS equals `pos` is returned.

        Raises:
            IndexError: If `pos` is not found.
        """
        if pos in self.indices:
            return np.where(self.seqmat[self.ROW_INDS, :] == pos)[0][0]
        raise IndexError(f"Position {pos} not found in sequence.")

    def _is_same_strand(self, other):
        """
        Checks if two SeqMat objects are on the same strand.

        Strand is inferred from coordinate direction: both objects must be
        non-decreasing, or both non-increasing.

        Args:
            other (SeqMat): The other SeqMat object to compare.

        Returns:
            bool: True if both are on the same strand, False otherwise.
        """
        own_steps = np.diff(self.seqmat[self.ROW_INDS, :])
        other_steps = np.diff(other.seqmat[self.ROW_INDS, :])

        both_increasing = np.all(own_steps >= 0) and np.all(other_steps >= 0)
        both_decreasing = np.all(own_steps <= 0) and np.all(other_steps <= 0)
        return bool(both_increasing or both_decreasing)

    def reverse_complement(self, inplace=True):
        """
        Reverse the column order and complement every character.

        Args:
            inplace (bool): If True (default) this object is modified and returned;
                otherwise a new SeqMat is returned and this object is left untouched.

        Returns:
            SeqMat: The reverse-complemented sequence.
        """
        flipped = self.seqmat[:, ::-1].copy()

        # A lookup table (instead of np.vectorize over dict.get) also works on an
        # empty sequence, where np.vectorize refuses to run.
        lookup = np.array(
            [self.value_complements[code] for code in range(len(self.alphabet))],
            dtype=flipped.dtype,
        )
        flipped[self.ROW_SEQ, :] = lookup[flipped[self.ROW_SEQ, :].astype(np.intp)]

        if inplace:
            self.seqmat = flipped
            return self

        return SeqMat(flipped, alphabet=self.alphabet)

    @classmethod
    def from_seq(cls, seq_dict, alphabet=None):
        """
        Create a SeqMat object from a dictionary containing sequence information.

        Args:
            seq_dict (dict): Must contain ``"seq"`` (the characters). Optional keys:
                ``"indices"`` (coordinates, default 0..n-1) and ``"superinds"``
                (default all zeros).
            alphabet (dict, optional): Character -> complement mapping used to encode
                the sequence. Defaults to ``ALPHABET``.

        Returns:
            An instance of ``cls`` with an int32 matrix and a zeroed ROW_MUTATED row.

        Raises:
            AssertionError: If the sequence and index lengths differ.
            ValueError: If the indices are not monotonic.
            KeyError: If the sequence contains a character that is not in the alphabet.
        """
        seq = seq_dict["seq"]
        inds = seq_dict.get("indices", np.arange(len(seq), dtype=np.int32))
        superinds = seq_dict.get("superinds", np.zeros(len(seq), dtype=np.int32))
        mutmark = np.zeros_like(superinds)

        # Raised explicitly (rather than via `assert`) so the check survives `python -O`.
        if len(seq) != len(inds):
            raise AssertionError(f"Sequence length {len(seq)} must match indices length {len(inds)}")
        if not cls._is_monotonic(inds):
            raise ValueError(f"Sequence indices must be monotonic, got {inds}")

        char_to_value = {c: i for i, c in enumerate(alphabet or ALPHABET)}
        seq_values = [char_to_value[nt] for nt in seq]

        seqmat = np.vstack([seq_values, inds, superinds, mutmark]).astype(np.int32)
        return cls(seqmat, alphabet=alphabet)

    @staticmethod
    def _is_monotonic(inds):
        """Return True if `inds` is non-decreasing or non-increasing (empty and single-element inputs qualify)."""
        steps = np.diff(np.asarray(inds))
        return bool(np.all(steps >= 0) or np.all(steps <= 0))

    @property
    def _gap_value(self):
        """Code of the gap character in this alphabet, or -1 (never a valid code) if the alphabet has none."""
        return self.char_to_value.get(self.GAP_CHAR, -1)

    @property
    def seq(self):
        """The sequence as a string with gap characters removed."""
        return self.rawseq.replace('-', '')

    @property
    def rawseq(self):
        """The sequence as a string, one character per column, gaps included."""
        return ''.join(self.value_to_char[int(code)] for code in self.seqmat[self.ROW_SEQ, :])

    @property
    def indices(self):
        """
        Coordinates of all non-gap columns, as floats.

        Each value is ``ROW_INDS + ROW_SUPERINDS / 10``, so inserted columns get a
        fractional coordinate after their anchor (e.g. 11.1, 11.2).
        """
        keep = self.seqmat[self.ROW_SEQ, :] != self._gap_value
        return self.seqmat[self.ROW_INDS, keep] + (self.seqmat[self.ROW_SUPERINDS, keep] / 10)

    def mutate(self, mut, inplace=False):
        """
        Apply mutations to the sequence matrix.

        Columns of `mut` with superinds == 0 replace the ordinary column at the same
        coordinate; columns with superinds > 0 are inserted next to the column
        holding the coordinate of the first inserted column.

        Args:
            mut (SeqMat): A SeqMat object containing mutations.
            inplace (bool): If True, this object is updated and returned; otherwise a
                new SeqMat is returned.

        Returns:
            SeqMat: The mutated sequence. If any coordinate of `mut` is absent from
            this sequence, nothing is applied and this object itself is returned.

        Raises:
            ValueError: If `mut` and this sequence run in opposite directions.
        """
        ### NEEDS some work to make sure that mutations can continue being added without issue...

        # Ensure strand compatibility
        if not self._is_same_strand(mut):
            raise ValueError("Mutation and sequence are not on the same strand.")

        ref_seqmat = self.seqmat.copy()
        mut_seqmat = mut.seqmat

        # A mutation that is not fully covered by this sequence is skipped.
        if not np.all(np.isin(mut_seqmat[self.ROW_INDS, :], ref_seqmat[self.ROW_INDS, :])):
            return self

        # Handle replacements. Only ordinary reference columns (superinds == 0) are
        # targeted, so columns inserted earlier at the same coordinate are preserved.
        replacements = mut_seqmat[:, mut_seqmat[self.ROW_SUPERINDS, :] == 0]
        is_target = (
            np.isin(ref_seqmat[self.ROW_INDS, :], replacements[self.ROW_INDS, :])
            & (ref_seqmat[self.ROW_SUPERINDS, :] == 0)
        )
        ref_seqmat[:, np.flatnonzero(is_target)] = replacements

        # Handle insertions
        inserted = np.flatnonzero(mut_seqmat[self.ROW_SUPERINDS, :] > 0)
        if inserted.size > 0:
            ins_seqmat = mut_seqmat[:, inserted]
            # On a descending sequence the inserted block goes before its anchor column.
            correction = 1 if self.seqmat[self.ROW_INDS, 0] > self.seqmat[self.ROW_INDS, -1] else 0
            anchor = np.where(ref_seqmat[self.ROW_INDS, :] == ins_seqmat[self.ROW_INDS, 0])[0][0]
            ins_loc = anchor + 1 - correction
            # With a scalar position np.insert expects the new columns along axis 0,
            # hence the transpose.
            ref_seqmat = np.insert(ref_seqmat, ins_loc, ins_seqmat.T, axis=1)

        if inplace:
            self.seqmat = ref_seqmat
            return self

        return SeqMat(ref_seqmat, alphabet=self.alphabet)

    def orf_seqmat(self, tis_index):
        """
        Return the open reading frame that starts at coordinate `tis_index`.

        Gap columns are dropped, then the sequence is cut after the first in-frame
        stop codon (TAA, TAG or TGA). If no stop codon is found the whole remaining
        sequence is returned.

        Raises:
            IndexError: If `tis_index` is not present in the sequence.
        """
        downstream = self.seqmat[:, self._rel_index(tis_index):]
        downstream = downstream[:, downstream[self.ROW_SEQ, :] != self._gap_value]
        coding = SeqMat(downstream, alphabet=self.alphabet)

        raw_seq = coding.seq
        match = self._STOP_CODON_PATTERN.match(raw_seq)
        stop_index = match.end() if match else len(raw_seq)
        return SeqMat(coding.seqmat[:, :stop_index], alphabet=self.alphabet)

    def translate(self, tis_index):
        """
        Translates the open reading frame starting at `tis_index` into amino acids.

        Trailing nucleotides that do not fill a whole codon are dropped. Codons
        missing from the table (e.g. containing 'N') become 'X'; a stop codon
        becomes '*'.

        Args:
            tis_index: Coordinate of the first nucleotide of the start codon.

        Returns:
            str: Translated amino acid sequence.
        """
        sequence = self.orf_seqmat(tis_index).seq.upper()

        # Trim sequence to ensure divisibility by 3
        usable = len(sequence) - (len(sequence) % 3)
        return ''.join(
            self._CODON_TABLE.get(sequence[i:i + 3], 'X') for i in range(0, usable, 3)
        )

    def to_dict(self):
        """
        Export the sequence as a plain dictionary.

        NOTE(review): 'seq' and 'superinds' cover every column, while 'indices'
        comes from the ``indices`` property and therefore skips gap columns and
        carries fractional values for insertions. The three entries can differ in
        length when gaps are present -- confirm what callers expect before relying
        on a from_seq round trip.
        """
        return {'seq': self.rawseq, 'indices': self.indices, 'superinds': self.seqmat[self.ROW_SUPERINDS, :]}
|
|
410
|
+
|
|
411
|
+
class DnaSeqMat(SeqMat):
    """SeqMat subclass used to label DNA sequences; adds no behaviour of its own."""
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
class RnaSeqMat(SeqMat):
    """SeqMat subclass used to label RNA sequences; adds no behaviour of its own."""
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
class AASeqMat(SeqMat):
    """SeqMat subclass used to label amino-acid sequences; adds no behaviour of its own."""
|
|
421
|
+
|
|
422
|
+
|
|
423
|
+
class MutSeqMat(SeqMat):
    """
    A subclass of SeqMat designed specifically for mutation sequences.

    Additional Conditions:
    1. Mutation indices must be consecutive (neighbouring coordinates differ by at most 1,
       increasing or decreasing).
    2. The superinds row is intended to have a maximum value of 10 (not currently enforced).

    Attributes:
        position: The smallest coordinate covered by the mutation.
    """

    def __init__(self, seqmat, alphabet=None):
        """
        Wrap a mutation matrix, validate it and flag every column as mutated.

        Args:
            seqmat (np.ndarray): Matrix in SeqMat layout. Its ROW_MUTATED row is set
                to 1 in place (the array is not copied).
            alphabet (dict, optional): Character -> complement mapping.

        Raises:
            ValueError: If the matrix has no columns or the coordinates are not consecutive.
        """
        super().__init__(seqmat, alphabet)

        # Reject an empty mutation up front, before the caller's array is touched.
        if self.seqmat.shape[1] == 0:
            raise ValueError("A mutation must cover at least one position.")

        # Validate the mutation-specific conditions
        self._validate_mutation_indices()
        self.seqmat[self.ROW_MUTATED, :] = 1
        self.position = self.seqmat[self.ROW_INDS, :].min()

    def _validate_mutation_indices(self):
        """
        Validates that the mutation indices are consecutive (increasing or decreasing).

        Raises:
            ValueError: If two neighbouring coordinates differ by more than 1.
        """
        indices = self.seqmat[self.ROW_INDS, :]
        if not np.all(np.abs(np.diff(indices)) <= 1):
            raise ValueError(f"Mutation indices must be consecutive. Got: {indices}")

    @staticmethod
    def _collapse_gap_run(allele):
        """Reduce an allele made only of '-' characters to a single '-'; return anything else unchanged."""
        return '-' if set(allele) == {'-'} else allele

    @classmethod
    def from_mutid(cls, mid):
        """
        Build a mutation from an ID of the form ``gene:chrom:position:ref:alt``.

        Supported shapes ('-' denotes an absent allele):
            * ref and alt both one base      -> substitution at ``position``
            * alt is '-'                     -> deletion of ``len(ref)`` positions
            * ref is '-'                     -> insertion of ``alt`` anchored at ``position``
            * anything else                  -> deletion of ref followed by insertion of alt,
                                                anchored at the last deleted position

        Args:
            mid (str): The mutation ID.

        Returns:
            An instance of ``cls`` describing the mutation.

        Raises:
            ValueError: If the ID does not have five ':'-separated fields, the position
                is not an integer, an allele is empty, or both alleles are '-'.
        """
        _gene, _chrom, position, ref, alt = mid.split(':')
        if not ref or not alt:
            raise ValueError(f"Mutation ID '{mid}' has an empty allele; use '-' for an absent allele.")

        ref = cls._collapse_gap_run(ref)
        alt = cls._collapse_gap_run(alt)
        position = int(position)

        if ref == '-' and alt == '-':
            # Previously this fell through every branch and failed on an unbound local.
            raise ValueError(f"Mutation ID '{mid}' has neither a reference nor an alternate allele.")

        if alt == '-':
            # Deletion: every reference position becomes a gap.
            temp = {
                'seq': '-' * len(ref),
                'indices': np.arange(position, position + len(ref), dtype=np.int32),
                'superinds': [0] * len(ref),
            }
        elif ref == '-':
            # Insertion: all new columns share the anchor coordinate and are ordered by superinds.
            temp = {
                'seq': alt,
                'indices': np.full(len(alt), position),
                'superinds': np.arange(1, len(alt) + 1),
            }
        elif len(ref) == len(alt) == 1:
            # Single-base substitution.
            temp = {'seq': alt, 'indices': [position], 'superinds': [0]}
        else:
            # Indel: delete the reference span, then insert alt after its last position.
            ind1 = np.concatenate([
                np.arange(position, position + len(ref), dtype=np.int32),
                np.full(len(alt), len(ref) + position - 1, dtype=np.int32),
            ])
            ind2 = np.concatenate([
                np.zeros(len(ref), dtype=np.int32),
                np.arange(1, len(alt) + 1, dtype=np.int32),
            ])
            temp = {'seq': '-' * len(ref) + alt, 'indices': ind1, 'superinds': ind2}

        return cls.from_seq(temp)
|
|
482
|
+
|
|
483
|
+
|
|
484
|
+
# def _validate_superinds(self):
|
|
485
|
+
# """
|
|
486
|
+
# Validates that the superinds row has a maximum value of 10.
|
|
487
|
+
# """
|
|
488
|
+
# superinds = self.seqmat[self.ROW_SUPERINDS, :]
|
|
489
|
+
# if np.max(superinds) > 10:
|
|
490
|
+
# raise ValueError(f"Superinds row must have a maximum value of 10. Got: {superinds}")
|
|
491
|
+
|
|
492
|
+
|
geney/Transcript.py
ADDED
|
@@ -0,0 +1,379 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import Any, Optional, Union
|
|
3
|
+
import numpy as np
|
|
4
|
+
import copy
|
|
5
|
+
from Bio.Seq import Seq # Assuming Biopython is used
|
|
6
|
+
from . import unload_pickle, config
|
|
7
|
+
from .SeqMats import SeqMat, MutSeqMat
|
|
8
|
+
from .Fasta_segment import Fasta_segment
|
|
9
|
+
|
|
10
|
+
class Transcript:
|
|
11
|
+
"""
|
|
12
|
+
Represents a transcript with associated genomic information such as exons, introns, and sequences.
|
|
13
|
+
|
|
14
|
+
A Transcript object is expected to contain attributes loaded from a dictionary `d` representing
|
|
15
|
+
annotations and metadata. This includes (at least):
|
|
16
|
+
- transcript_start
|
|
17
|
+
- transcript_end
|
|
18
|
+
- rev (boolean indicating if the transcript is on the reverse strand)
|
|
19
|
+
- chrm (chromosome)
|
|
20
|
+
- donors
|
|
21
|
+
- acceptors
|
|
22
|
+
- cons_vector
|
|
23
|
+
- cons_seq
|
|
24
|
+
- transcript_seq
|
|
25
|
+
- transcript_biotype
|
|
26
|
+
- primary_transcript
|
|
27
|
+
- transcript_id
|
|
28
|
+
- TIS, TTS (if protein-coding)
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
def __init__(self, d: dict[str, Any], organism: str = 'hg38'):
    """
    Populate the transcript from an annotation dictionary.

    Every key of `d` becomes an attribute of the instance; 'acceptors', 'donors'
    and 'cons_vector' are converted to NumPy arrays when they are not None.

    Args:
        d (dict): Transcript attributes and data. Must provide 'transcript_start',
            'transcript_end', 'rev' and 'chrm'.
        organism (str): Genome build or organism reference (e.g., 'hg38').

    Raises:
        AssertionError: If any required attribute is missing.
    """
    numeric_fields = {'acceptors', 'donors', 'cons_vector'}
    for name, value in d.items():
        needs_array = name in numeric_fields and value is not None
        setattr(self, name, np.array(value) if needs_array else value)

    self.organism: str = organism

    # A transcript cannot be placed on the genome without these.
    missing = [
        attr for attr in ('transcript_start', 'transcript_end', 'rev', 'chrm')
        if not hasattr(self, attr)
    ]
    if missing:
        raise AssertionError(f"Transcript is missing required attributes: {missing}")

    # Absent (or None) splice-site lists become empty arrays.
    for site_attr in ('donors', 'acceptors'):
        if getattr(self, site_attr, None) is None:
            setattr(self, site_attr, np.array([]))
    if not hasattr(self, 'cons_available'):
        self.cons_available = False

    # Treated as protein-coding when both TIS and TTS were supplied.
    self.protein_coding: bool = hasattr(self, 'TIS') and hasattr(self, 'TTS')

    # Strand-independent boundaries.
    bounds = (self.transcript_start, self.transcript_end)
    self.transcript_upper = max(bounds)
    self.transcript_lower = min(bounds)

    self.generate_pre_mrna()

    # Drop a trailing '*' from the conservation sequence together with its vector
    # entry, but only while the two still line up one-to-one.
    has_cons = self.cons_available and hasattr(self, 'cons_seq') and hasattr(self, 'cons_vector')
    if has_cons and self.cons_seq.endswith('*') and len(self.cons_seq) == len(self.cons_vector):
        self.cons_vector = self.cons_vector[:-1]
        self.cons_seq = self.cons_seq[:-1]
|
|
80
|
+
|
|
81
|
+
def __repr__(self) -> str:
    """Return the developer-facing representation ``Transcript(<transcript_id>)``.

    Returns:
        str: The representation; ``'unknown_id'`` is shown when the instance
        has no ``transcript_id`` attribute.
    """
    identifier = getattr(self, 'transcript_id', 'unknown_id')
    return f"Transcript({identifier})"
|
|
84
|
+
|
|
85
|
+
def __str__(self) -> str:
    """Return a one-line, human-readable summary of the transcript.

    Returns:
        str: Transcript ID, biotype (underscores replaced by spaces and
        title-cased) and primary-transcript flag. Missing attributes fall back
        to ``'unknown'``, ``False`` and ``'unknown_id'`` respectively.
    """
    raw_biotype = getattr(self, 'transcript_biotype', 'unknown')
    biotype_label = raw_biotype.replace('_', ' ').title()
    is_primary = getattr(self, 'primary_transcript', False)
    identifier = getattr(self, 'transcript_id', 'unknown_id')
    return f"Transcript {identifier}, Type: {biotype_label}, Primary: {is_primary}"
|
|
96
|
+
|
|
97
|
+
def __len__(self) -> int:
    """Return the length of ``transcript_seq``.

    Returns:
        int: ``len(self.transcript_seq)``, or 0 when the instance has no
        ``transcript_seq`` attribute.
    """
    sequence = getattr(self, 'transcript_seq', '')
    return len(sequence)
|
|
105
|
+
|
|
106
|
+
def __eq__(self, other: object) -> bool:
    """Compare two transcripts by their ``transcript_seq`` attributes.

    Args:
        other (object): The object to compare against.

    Returns:
        bool: True when both transcript sequences are equal, False otherwise.
        ``NotImplemented`` is returned for anything that is not a Transcript,
        which lets Python try the reflected comparison.
    """
    if isinstance(other, Transcript):
        return self.transcript_seq == other.transcript_seq
    return NotImplemented
|
|
119
|
+
|
|
120
|
+
def __contains__(self, subvalue: Any) -> bool:
    """Report whether every index of ``subvalue`` occurs in this transcript's pre-mRNA.

    Args:
        subvalue (Any): The object to look for. Only objects exposing a
            ``seqmat`` attribute are considered; row 1 of that matrix
            (``seqmat[1, :]``) is read as the index row.

    Returns:
        bool: True if all entries of ``subvalue.seqmat[1, :]`` are present in
        ``self.pre_mrna.seqmat[1, :]``. False if any entry is absent or if
        ``subvalue`` has no ``seqmat`` attribute. An empty index row is
        vacuously contained (``np.all`` of an empty array is True).
    """
    if not hasattr(subvalue, 'seqmat'):
        return False
    own_indices = self.pre_mrna.seqmat[1, :]
    # np.all yields numpy.bool_; convert so direct callers get the plain bool
    # promised by the annotation (the `in` operator coerces either way).
    return bool(np.all(np.isin(subvalue.seqmat[1, :], own_indices)))
|
|
136
|
+
|
|
137
|
+
@property
def exons(self) -> list[tuple[int, int]]:
    """Pair exon start and end coordinates.

    Starts are ``transcript_start`` followed by the acceptors; ends are the
    donors followed by ``transcript_end``. The two sequences are paired
    positionally with ``zip``, so surplus entries in the longer one are dropped.

    Returns:
        list of (int, int): One ``(start, end)`` tuple per exon, in stored order.
    """
    first_start = [self.transcript_start]
    starts = np.concatenate((first_start, self.acceptors))
    last_end = [self.transcript_end]
    ends = np.concatenate((self.donors, last_end))
    return list(zip(starts, ends))
|
|
148
|
+
|
|
149
|
+
@property
def exons_pos(self) -> list[tuple[int, int]]:
    """Return the exon boundaries oriented for the transcript's strand.

    Returns:
        list of (int, int): ``self.exons`` unchanged when ``self.rev`` is falsy;
        otherwise the exons in reverse order with each pair's two coordinates
        swapped.
    """
    boundaries = self.exons
    if not self.rev:
        return boundaries
    # Reverse strand: walk the exons backwards and flip each (start, end) pair.
    return [(end, start) for start, end in reversed(boundaries)]
|
|
162
|
+
|
|
163
|
+
@property
def introns(self) -> list[tuple[int, int]]:
    """Pair donors with acceptors to describe the introns.

    Donors equal to ``transcript_end`` and acceptors equal to
    ``transcript_start`` are excluded before the remaining values are paired
    positionally with ``zip``.

    Returns:
        list of (int, int): One ``(donor, acceptor)`` tuple per intron.
    """
    donors = self.donors
    internal_donors = donors[donors != self.transcript_end]
    acceptors = self.acceptors
    internal_acceptors = acceptors[acceptors != self.transcript_start]
    return list(zip(internal_donors, internal_acceptors))
|
|
174
|
+
|
|
175
|
+
@property
def introns_pos(self) -> list[tuple[int, int]]:
    """Return the intron boundaries oriented for the transcript's strand.

    Returns:
        list of (int, int): ``self.introns`` unchanged when ``self.rev`` is
        falsy; otherwise the introns in reverse order with each pair's two
        coordinates swapped.
    """
    boundaries = self.introns
    if not self.rev:
        return boundaries
    # Reverse strand: walk the introns backwards and flip each pair.
    return [(end, start) for start, end in reversed(boundaries)]
|
|
187
|
+
|
|
188
|
+
def _fix_and_check_introns(self) -> Transcript:
    """Normalise the splice-site arrays and validate the exon/intron structure.

    ``self.acceptors`` and ``self.donors`` are replaced by their unique values,
    ordered ascending, or descending when ``self.rev`` is truthy.

    Raises:
        ValueError: If the acceptor and donor counts differ, if an exon in
            ``exons_pos`` has start > end, or if a splice site lies outside the
            ``transcript_lower``..``transcript_upper`` range.

    Returns:
        Transcript: This object, to allow call chaining.
    """
    # np.unique already returns its result sorted ascending, so only the
    # reverse strand needs re-ordering.
    self.acceptors = np.unique(self.acceptors)
    self.donors = np.unique(self.donors)
    if self.rev:
        self.acceptors = self.acceptors[::-1]
        self.donors = self.donors[::-1]

    # Checks run in this order; the first failing one raises.
    checks = (
        (self.__exon_intron_matchup_flag, "Unequal number of acceptors and donors."),
        (self.__exon_intron_order_flag, "Exon/intron order out of position."),
        (self.__transcript_boundary_flag, "Transcript boundaries must straddle acceptors and donors."),
    )
    for is_violated, message in checks:
        if is_violated():
            raise ValueError(message)

    return self
|
|
220
|
+
|
|
221
|
+
def __exon_intron_matchup_flag(self) -> bool:
    """Return True when the numbers of acceptors and donors differ."""
    acceptor_count = len(self.acceptors)
    donor_count = len(self.donors)
    return acceptor_count != donor_count
|
|
224
|
+
|
|
225
|
+
def __exon_intron_order_flag(self) -> bool:
    """Return True when any exon in ``exons_pos`` has its start after its end."""
    for start, end in self.exons_pos:
        if start > end:
            return True
    return False
|
|
228
|
+
|
|
229
|
+
def __transcript_boundary_flag(self) -> bool:
    """Return a truthy value when a splice site lies outside the transcript range.

    With no acceptors and no donors there is nothing to check and False is
    returned. Otherwise the smallest and largest splice-site coordinates are
    compared against ``transcript_lower`` and ``transcript_upper``.
    """
    if len(self.acceptors) == 0 and len(self.donors) == 0:
        return False
    splice_sites = np.concatenate((self.acceptors, self.donors))
    lowest_site = np.min(splice_sites)
    highest_site = np.max(splice_sites)
    return (self.transcript_lower > lowest_site) or (self.transcript_upper < highest_site)
|
|
236
|
+
|
|
237
|
+
@property
def exonic_indices(self) -> np.ndarray:
    """Return every coordinate covered by an exon.

    Returns:
        np.ndarray: For each ``(a, b)`` pair in ``exons_pos``, the inclusive
        range ``a..b``, concatenated in ``exons_pos`` order.
    """
    per_exon_ranges = []
    for first, last in self.exons_pos:
        # + 1 because np.arange excludes its stop value.
        per_exon_ranges.append(np.arange(first, last + 1))
    return np.concatenate(per_exon_ranges)
|
|
246
|
+
|
|
247
|
+
def pull_pre_mrna_pos(self) -> dict[str, Any]:
    """Read the genomic segment spanning this transcript from its chromosome FASTA.

    The file is ``chr<chrm>.fasta`` under the ``CHROM_SOURCE`` directory
    configured for ``self.organism``; the requested endpoints are
    ``transcript_lower - 1`` and ``transcript_upper + 1``.

    Returns:
        dict: Whatever ``Fasta_segment.read_segment_endpoints`` returns;
        presumably a mapping with 'seq' and 'indices' keys (not verifiable
        from this method alone).
    """
    reader = Fasta_segment()
    chromosome_file = config[self.organism]['CHROM_SOURCE'] / f'chr{self.chrm}.fasta'
    segment_start = self.transcript_lower - 1
    segment_end = self.transcript_upper + 1
    return reader.read_segment_endpoints(chromosome_file, segment_start, segment_end)
|
|
260
|
+
|
|
261
|
+
def generate_pre_mrna(self) -> Transcript:
    """Build the pre-mRNA sequence and store it on ``self.pre_mrna``.

    The segment returned by ``pull_pre_mrna_pos`` is wrapped with
    ``SeqMat.from_seq`` and, when ``self.rev`` is truthy, reverse-complemented.

    Returns:
        Transcript: This object, to allow call chaining.
    """
    sequence = SeqMat.from_seq(self.pull_pre_mrna_pos())
    if self.rev:
        # Return value is ignored: reverse_complement() is relied on to modify
        # the SeqMat in place.
        sequence.reverse_complement()
    self.pre_mrna = sequence
    return self
|
|
273
|
+
|
|
274
|
+
def mutate(self, mutation: MutSeqMat, inplace: bool = False) -> Union[Transcript, SeqMat]:
    """Apply a mutation to the pre-mRNA sequence of this Transcript.

    If the transcript is on the reverse strand (``self.rev`` is truthy), a
    reverse-complemented copy of the mutation is applied so that it matches the
    orientation of ``self.pre_mrna``. The caller's ``mutation`` object is left
    untouched, so the same mutation can safely be applied to several transcripts.

    Args:
        mutation (MutSeqMat): The mutation to apply; passed to
            ``self.pre_mrna.mutate``.
        inplace (bool): If True, replace this Transcript's ``pre_mrna`` and
            return ``self``. If False, leave this Transcript unchanged and
            return a deep copy carrying the mutated ``pre_mrna``.

    Returns:
        Transcript: ``self`` when ``inplace`` is True, otherwise a new
        Transcript (deep copy) whose ``pre_mrna`` holds the mutated sequence.
    """
    if self.rev:
        # NOTE(review): reverse_complement() is used as an in-place operation
        # elsewhere in this class (its return value is ignored), so calling it
        # directly on the argument would flip the caller's object every time
        # the mutation is reused. Work on a private copy instead.
        mutation = copy.deepcopy(mutation)
        mutation.reverse_complement()

    mutated_seqmat = self.pre_mrna.mutate(mutation).seqmat

    # Either update this object or an independent clone of it.
    target = self if inplace else copy.deepcopy(self)
    target.pre_mrna = SeqMat(mutated_seqmat)
    return target
|
|
307
|
+
|
|
308
|
+
def generate_mature_mrna(self, inplace: bool = True) -> Union[Transcript, SeqMat]:
    """Assemble the mature mRNA by joining the exon slices of the pre-mRNA.

    The splice sites are first normalised and validated through
    ``_fix_and_check_introns`` (which may raise ``ValueError``). Each
    ``(start, end)`` pair from ``self.exons`` is then used to slice
    ``self.pre_mrna`` and the slices are accumulated in order.

    Args:
        inplace (bool): If True, store the result on ``self.mature_mrna`` and
            return ``self``; otherwise return the assembled SeqMat.

    Returns:
        Transcript or SeqMat: ``self`` when ``inplace`` is True, else the
        mature mRNA SeqMat.
    """
    self._fix_and_check_introns()

    spliced = SeqMat.empty()
    template = self.pre_mrna
    for start, end in self.exons:
        spliced += template[start:end]

    if not inplace:
        return spliced
    self.mature_mrna = spliced
    return self
|
|
331
|
+
|
|
332
|
+
@property
def orf(self, tis=None):
    """Return the open reading frame derived from the mature mRNA.

    Because this is a property, attribute access always calls it with ``tis``
    left at None, so ``self.TIS`` is the start position that gets used.

    Returns:
        The result of ``self.mature_mrna.orf_seqmat(tis)`` when the transcript
        is protein-coding. Otherwise a message is printed and ``self`` is
        returned.
    """
    if self.protein_coding:
        start = self.TIS if tis is None else tis
        return self.mature_mrna.orf_seqmat(start)

    print("Cannot create protein without set TIS and TTS values.")
    return self
|
|
347
|
+
|
|
348
|
+
def clone(self) -> Transcript:
    """Create an independent copy of this Transcript.

    Returns:
        Transcript: A deep copy (via ``copy.deepcopy``); changes to the copy do
        not affect this instance.
    """
    duplicate = copy.deepcopy(self)
    return duplicate
|
|
356
|
+
|
|
357
|
+
def generate_protein(self, inplace: bool = True, domains: Optional[np.ndarray] = None) -> Union[
        Transcript, tuple[str, np.ndarray]]:
    """Translate the ORF into a protein sequence and pair it with a conservation vector.

    Args:
        inplace (bool): If True, store the protein and conservation vector on
            ``self.protein`` / ``self.cons_vector`` and return ``self``. If
            False, leave this object unchanged and return the two values.
        domains (np.ndarray, optional): Currently unused; accepted for
            interface compatibility.

    Returns:
        Transcript or (str, np.ndarray): ``self`` when ``inplace`` is True,
        otherwise ``(protein, cons_vector)``. For transcripts that are not
        protein-coding a message is printed and ``self`` or
        ``("", np.array([]))`` is returned.
    """
    if not self.protein_coding:
        print("No protein can be generated without TIS/TTS.")
        return self if inplace else ("", np.array([]))

    # Translate the ORF; every '*' (stop) character is removed from the result.
    protein = str(Seq(self.orf.seq).translate()).replace('*', '')

    # Use the existing conservation vector; if it is missing or None, fall
    # back to a vector of ones matching the protein length.
    cons_vector = getattr(self, 'cons_vector', None)
    if cons_vector is None:
        cons_vector = np.ones(len(protein))

    if not inplace:
        # Honour the documented contract: hand back the values without
        # touching this object.
        return protein, cons_vector

    self.cons_vector = cons_vector
    self.protein = protein
    return self
|
geney/_mutation_utils.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from .Gene import *
|
|
2
|
+
import numpy as np
|
|
3
|
+
|
|
4
|
+
class Allele(SeqMat):
    """A SeqMat describing an alternate allele, tagged with its lowest position."""

    def __init__(self, alt, pos1, pos2, rev):
        """Build the allele and orient it for the requested strand.

        Args:
            alt: Alternate sequence, forwarded to ``SeqMat.__init__``.
            pos1: Sequence of primary positions; its minimum becomes
                ``self.position``.
            pos2: Sequence of secondary positions, forwarded to
                ``SeqMat.__init__``.
            rev: If truthy, the allele is reverse-complemented after
                construction.
        """
        super().__init__(alt, pos1, pos2)
        self.position = min(pos1)
        if not rev:
            return
        self.reverse_complement()
|
|
10
|
+
|
|
11
|
+
# def _continuous(self, ind):
|
|
12
|
+
# return True
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def get_mutation(mut_id, rev=False):
    """Build an ``Allele`` from a colon-separated mutation identifier.

    The identifier must have exactly five ``:``-separated fields; the last
    three are read as position, reference allele and alternate allele, with
    ``'-'`` standing for "no bases".

    Args:
        mut_id (str): Mutation identifier with five ``:``-separated fields.
        rev (bool): Forwarded to ``Allele`` as its ``rev`` argument.

    Returns:
        Allele or None: The allele for a substitution, deletion, insertion or
        combined deletion/insertion; None when both reference and alternate
        are ``'-'``.

    Raises:
        ValueError: If ``mut_id`` does not have exactly five fields or the
            position is not an integer.
    """
    _first, _second, raw_position, ref, alt = mut_id.split(':')
    position = int(raw_position)
    ref_absent = ref == '-'
    alt_absent = alt == '-'

    if ref_absent and alt_absent:
        # Nothing to describe; fall through to None.
        return None

    if alt_absent:
        # Deletion: one '-' per reference base.
        deleted = np.arange(position, position + len(ref), dtype=np.int32)
        return Allele('-' * len(ref), deleted, [0] * len(ref), rev)

    if ref_absent:
        # Insertion: every inserted base shares the anchor position and is
        # numbered 1..len(alt) in the secondary index.
        return Allele(alt, np.full(len(alt), position), np.arange(1, len(alt) + 1), rev)

    if len(alt) == len(ref) == 1:
        # Single-base substitution.
        return Allele(alt, [position], [0], rev)

    # Multi-base replacement: delete the reference bases, then insert the
    # alternate bases after the last deleted position.
    primary = np.concatenate(
        [np.arange(position, position + len(ref), dtype=np.int32),
         np.full(len(alt), len(ref) + position - 1, dtype=np.int32)])
    secondary = np.concatenate(
        [np.zeros(len(ref), dtype=np.int32), np.arange(1, len(alt) + 1, dtype=np.int32)])
    return Allele('-' * len(ref) + alt, primary, secondary, rev)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
|
geney/oncosplice.py
CHANGED
|
@@ -1,19 +1,22 @@
|
|
|
1
|
-
import copy
|
|
2
|
-
|
|
3
1
|
from Bio import pairwise2
|
|
4
2
|
import re
|
|
5
|
-
import
|
|
3
|
+
import hashlib
|
|
4
|
+
from tqdm import tqdm
|
|
6
5
|
import pandas as pd
|
|
7
|
-
|
|
8
|
-
from .
|
|
9
|
-
from .
|
|
6
|
+
import numpy as np
|
|
7
|
+
from .SeqMats import SeqMat, MutSeqMat
|
|
8
|
+
from .splicing_utils import find_transcript_missplicing_seqs, develop_aberrant_splicing
|
|
10
9
|
from .tis_utils import find_tis
|
|
11
10
|
|
|
11
|
+
def short_hash_of_list(numbers, length=5):
    """Return the first ``length`` hex characters of the SHA-256 of ``repr(numbers)``.

    Args:
        numbers: Any object; its ``repr`` (UTF-8 encoded) is what gets hashed.
        length (int): Number of leading hex digits to keep.

    Returns:
        str: The truncated hexadecimal digest.
    """
    digest = hashlib.sha256(repr(numbers).encode('utf-8')).hexdigest()
    return digest[:length]
|
|
15
|
+
|
|
12
16
|
def find_continuous_gaps(sequence):
    """Locate each run of consecutive '-' characters in an aligned sequence.

    Returns:
        list of (int, int): ``(start, end)`` for every run, with ``end``
        exclusive, in left-to-right order.
    """
    gaps = []
    for run in re.finditer(r'-+', sequence):
        gaps.append((run.start(), run.end()))
    return gaps
|
|
15
19
|
|
|
16
|
-
|
|
17
20
|
def get_logical_alignment(ref_prot, var_prot):
|
|
18
21
|
"""
|
|
19
22
|
Aligns two protein sequences and finds the optimal alignment with the least number of gaps.
|
|
@@ -272,43 +275,22 @@ def summarize_missplicing_event(pes, pir, es, ne, ir):
|
|
|
272
275
|
event.append('NE')
|
|
273
276
|
if len(event) >= 1:
|
|
274
277
|
return ','.join(event)
|
|
275
|
-
# elif len(event) == 1:
|
|
276
|
-
# return event[0]
|
|
277
278
|
else:
|
|
278
279
|
return '-'
|
|
279
280
|
|
|
280
281
|
|
|
281
282
|
# Annotating
|
|
282
|
-
def OncospliceAnnotator(reference_transcript, variant_transcript, mut):
|
|
283
|
+
def OncospliceAnnotator(reference_transcript, variant_transcript, mut, ref_attributes=[], var_attributes=[]):
|
|
283
284
|
affected_exon, affected_intron, distance_from_5, distance_from_3 = find_splice_site_proximity(np.floor(mut.indices[0]),
|
|
284
285
|
reference_transcript)
|
|
285
286
|
|
|
286
287
|
report = {}
|
|
287
|
-
|
|
288
288
|
report['primary_transcript'] = reference_transcript.primary_transcript
|
|
289
289
|
report['transcript_id'] = reference_transcript.transcript_id
|
|
290
|
-
# report['mut_id'] = mut.mut_id
|
|
291
|
-
# report['cons_available'] = int(reference_transcript.cons_available)
|
|
292
|
-
# report['protein_coding'] = reference_transcript.transcript_biotype
|
|
293
|
-
|
|
294
|
-
# report['reference_mrna'] = reference_transcript.transcript_seq
|
|
295
|
-
# report['reference_cds_start'] = reference_transcript.TIS
|
|
296
|
-
# report['reference_pre_mrna'] = reference_transcript.pre_mrna
|
|
297
|
-
# report[
|
|
298
|
-
# 'reference_orf'] = reference_transcript.orf # pre_mrna[reference_transcript.transcript_indices.index(reference_transcript.TIS):reference_transcript.transcript_indices.index(reference_transcript.TTS)]
|
|
299
290
|
report['reference_protein'] = reference_transcript.protein
|
|
300
|
-
# report['reference_protein_length'] = len(reference_transcript.protein)
|
|
301
|
-
|
|
302
|
-
# report['variant_mrna'] = variant_transcript.transcript_seq
|
|
303
|
-
# report['variant_cds_start'] = variant_transcript.TIS
|
|
304
|
-
# report[
|
|
305
|
-
# 'variant_pre_mrna'] = variant_transcript.pre_mrna # pre_mrna[variant_transcript.transcript_indices.index(variant_transcript.TIS):variant_transcript.transcript_indices.index(variant_transcript.TTS)]
|
|
306
|
-
# report['variant_orf'] = variant_transcript.orf
|
|
307
291
|
report['variant_protein'] = variant_transcript.protein
|
|
308
292
|
report['variant_protein_length'] = len(variant_transcript.protein)
|
|
309
|
-
|
|
310
293
|
descriptions = define_missplicing_events(reference_transcript, variant_transcript)
|
|
311
|
-
# print(descriptions)
|
|
312
294
|
report['exon_changes'] = '|'.join([v for v in descriptions if v])
|
|
313
295
|
report['splicing_codes'] = summarize_missplicing_event(*descriptions)
|
|
314
296
|
report['affected_exon'] = affected_exon
|
|
@@ -318,60 +300,79 @@ def OncospliceAnnotator(reference_transcript, variant_transcript, mut):
|
|
|
318
300
|
return report
|
|
319
301
|
|
|
320
302
|
|
|
321
|
-
def oncosplice(mut_id, splicing_threshold=0.5, protein_coding=True, cons_required=False, primary_transcript=False,
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
303
|
+
def oncosplice(mut_id, splicing_threshold=0.5, protein_coding=True, cons_required=False, primary_transcript=False,
|
|
304
|
+
window_length=13, organism='hg38', engine='spliceai', domains=None):
|
|
305
|
+
gene = Gene.from_file(mut_id.split(':')[0], organism=organism)
|
|
306
|
+
reference_gene_proteins = {
|
|
307
|
+
transcript.generate_pre_mrna().generate_mature_mrna().generate_protein().protein: transcript.transcript_id for
|
|
308
|
+
transcript in gene if transcript.transcript_biotype == 'protein_coding'}
|
|
325
309
|
|
|
310
|
+
mutations = [MutSeqMat.from_mutid(m) for m in mut_id.split('|')]
|
|
326
311
|
results = []
|
|
327
|
-
for
|
|
328
|
-
if cons_required and not
|
|
312
|
+
for reference_transcript in tqdm(gene):
|
|
313
|
+
if (cons_required and not reference_transcript.cons_available) or (
|
|
314
|
+
protein_coding and not reference_transcript.transcript_biotype == 'protein_coding'):
|
|
329
315
|
continue
|
|
330
316
|
|
|
331
|
-
|
|
317
|
+
current_mutations = [m for m in mutations if m in reference_transcript]
|
|
318
|
+
if len(current_mutations) == 0:
|
|
332
319
|
continue
|
|
333
320
|
|
|
334
|
-
|
|
335
|
-
transcript.cons_vector = transform_conservation_vector(transcript.cons_vector, window=window_length)
|
|
336
|
-
transcript.generate_mature_mrna().generate_protein(inplace=True, domains=domains)
|
|
337
|
-
ref_protein, cons_vector = transcript.protein, transcript.cons_vector
|
|
338
|
-
reference_transcript = copy.deepcopy(transcript)
|
|
321
|
+
center = np.mean([m.indices[0] for m in current_mutations]) // 1
|
|
339
322
|
|
|
340
|
-
|
|
323
|
+
mutated_transcript = reference_transcript.clone()
|
|
324
|
+
for mutation in current_mutations:
|
|
325
|
+
mutated_transcript.mutate(mutation, inplace=True)
|
|
341
326
|
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
327
|
+
reference_transcript.generate_mature_mrna().generate_protein()
|
|
328
|
+
reference_transcript.cons_vector = transform_conservation_vector(reference_transcript.cons_vector,
|
|
329
|
+
window=window_length)
|
|
345
330
|
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
transcript.donors = new_boundaries['donors']
|
|
349
|
-
transcript.generate_mature_mrna().generate_protein()
|
|
331
|
+
assert len(reference_transcript.protein) == len(
|
|
332
|
+
reference_transcript.cons_vector), f"Protein ({len(reference_transcript.protein)}) and conservation vector ({len(reference_transcript.cons_vector)}) must be same length."
|
|
350
333
|
|
|
351
|
-
|
|
334
|
+
missplicing = find_transcript_missplicing_seqs(
|
|
335
|
+
reference_transcript.pre_mrna.get_context(center, context=7500, padding='N'),
|
|
336
|
+
mutated_transcript.pre_mrna.get_context(center, context=7500, padding='N'), reference_transcript.donors,
|
|
337
|
+
reference_transcript.acceptors, threshold=splicing_threshold, engine=engine)
|
|
338
|
+
alternative_splicing_paths = develop_aberrant_splicing(reference_transcript, missplicing.aberrant_splicing)
|
|
339
|
+
|
|
340
|
+
for i, new_boundaries in enumerate(alternative_splicing_paths):
|
|
341
|
+
mutated_transcript.acceptors = new_boundaries['acceptors']
|
|
342
|
+
mutated_transcript.donors = new_boundaries['donors']
|
|
343
|
+
mutated_transcript.generate_mature_mrna().generate_protein()
|
|
344
|
+
|
|
345
|
+
alignment = get_logical_alignment(reference_transcript.protein, mutated_transcript.protein)
|
|
352
346
|
deleted, inserted = find_indels_with_mismatches_as_deletions(alignment.seqA, alignment.seqB)
|
|
353
|
-
modified_positions = find_modified_positions(len(
|
|
354
|
-
temp_cons = np.convolve(cons_vector * modified_positions,
|
|
347
|
+
modified_positions = find_modified_positions(len(reference_transcript.protein), deleted, inserted)
|
|
348
|
+
temp_cons = np.convolve(reference_transcript.cons_vector * modified_positions,
|
|
349
|
+
np.ones(window_length)) / window_length
|
|
355
350
|
affected_cons_scores = max(temp_cons)
|
|
356
351
|
percentile = (
|
|
357
|
-
|
|
358
|
-
|
|
352
|
+
sorted(reference_transcript.cons_vector).index(
|
|
353
|
+
next(x for x in sorted(reference_transcript.cons_vector) if x >= affected_cons_scores)) / len(
|
|
354
|
+
reference_transcript.cons_vector))
|
|
359
355
|
|
|
360
|
-
report = OncospliceAnnotator(reference_transcript,
|
|
356
|
+
report = OncospliceAnnotator(reference_transcript, mutated_transcript, current_mutations[0])
|
|
361
357
|
report['mut_id'] = mut_id
|
|
358
|
+
report['engine'] = engine
|
|
362
359
|
report['oncosplice_score'] = affected_cons_scores
|
|
363
360
|
report['percentile'] = percentile
|
|
364
|
-
report['isoform_id'] =
|
|
361
|
+
report['isoform_id'] = short_hash_of_list(mutated_transcript.exons)
|
|
365
362
|
report['isoform_prevalence'] = new_boundaries['path_weight']
|
|
366
363
|
report['full_missplicing'] = missplicing.aberrant_splicing
|
|
367
364
|
report['missplicing'] = max(missplicing)
|
|
368
|
-
report['reference_resemblance'] = reference_gene_proteins.get(
|
|
365
|
+
report['reference_resemblance'] = reference_gene_proteins.get(mutated_transcript.protein, None)
|
|
369
366
|
results.append(report)
|
|
370
367
|
|
|
371
368
|
if len(results) == 0:
|
|
372
369
|
return None
|
|
373
370
|
|
|
374
|
-
return pd.DataFrame(results)
|
|
371
|
+
return pd.DataFrame(results)[
|
|
372
|
+
['mut_id', 'transcript_id', 'isoform_id', 'primary_transcript', 'missplicing', 'full_missplicing',
|
|
373
|
+
'exon_changes', 'splicing_codes', 'affected_exon', 'affected_intron', 'mutation_distance_from_5',
|
|
374
|
+
'mutation_distance_from_3', 'engine', 'reference_resemblance', 'oncosplice_score', 'percentile',
|
|
375
|
+
'isoform_prevalence', 'reference_protein', 'variant_protein']]
|
|
375
376
|
|
|
376
377
|
|
|
377
378
|
import asyncio
|
geney/splicing_utils.py
CHANGED
|
@@ -146,7 +146,7 @@ def find_ss_changes(ref_dct, mut_dct, known_splice_sites, threshold=0.5):
|
|
|
146
146
|
|
|
147
147
|
|
|
148
148
|
def find_transcript_missplicing_mutid(mut_id):
|
|
149
|
-
from geney.
|
|
149
|
+
from geney.Gene import Gene
|
|
150
150
|
transcript = Gene(mut_id.split(':')[0]).transcript().generate_mature_mrna()
|
|
151
151
|
out = find_transcript_missplicing(transcript, [get_mutation(mut_id, rev=transcript.rev)], context=5000, window=2500, threshold=0.5, engine='spliceai', just_ss=True)
|
|
152
152
|
best_delta = 0
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: geney
|
|
3
|
-
Version: 1.2
|
|
3
|
+
Version: 1.3.2
|
|
4
4
|
Summary: A Python package for gene expression modeling.
|
|
5
5
|
Home-page: https://github.com/nicolaslynn/geney
|
|
6
6
|
Author: Nicolas Lynn
|
|
@@ -29,6 +29,8 @@ Requires-Dist: notebook
|
|
|
29
29
|
Requires-Dist: matplotlib
|
|
30
30
|
Requires-Dist: gffutils
|
|
31
31
|
Requires-Dist: pyfastx
|
|
32
|
+
Requires-Dist: tensorflow
|
|
33
|
+
Requires-Dist: keras
|
|
32
34
|
|
|
33
35
|
UNKNOWN
|
|
34
36
|
|
|
@@ -1,17 +1,21 @@
|
|
|
1
1
|
geney/Fasta_segment.py,sha256=0zCdzPUbDeM9Rz642woH5Q94pwI46O0fE3H8w0XWebc,11255
|
|
2
|
+
geney/Gene.py,sha256=JGWtfA6-d1W3I9YRASwaF8vaZ6CCuY0KEawQNdloIqY,6259
|
|
3
|
+
geney/SeqMats.py,sha256=jkXmXAs0OpnFeyCfiJcKKpHHSi9JpKgiOIwsu63e1CQ,18557
|
|
4
|
+
geney/Transcript.py,sha256=eRZXVVxDVBbv0l385bnAOBFRBSzBwppXcbBq8KXkwlo,14443
|
|
2
5
|
geney/__init__.py,sha256=eBdDl42N6UhcYeZDjOnv199Z88fI5_8Y6xW8447OKXM,755
|
|
6
|
+
geney/_mutation_utils.py,sha256=dHssUsnii_mf-wuRoMmF13UlD7k3ml_VwQMItTYnXpU,1132
|
|
3
7
|
geney/config_setup.py,sha256=klm_k7Ca_703DpeGBcGoDqz1XwHQhNXENPKjj_xfSQw,608
|
|
4
8
|
geney/data_setup.py,sha256=2RHmuvcGUQbEglXQEZr0C2QPDTQYRZOEm0EcmyfQJgU,12229
|
|
5
9
|
geney/graphic_utils.py,sha256=oMsBpB9YeEn96gGpKh4MmtagJffWZbk-xPrIwHvkFhA,11016
|
|
6
10
|
geney/gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
|
|
7
11
|
geney/immune_utils.py,sha256=ZRni5ttrhpYBnmNr0d0ZatIbNPYs4nmQuoUO00SpsS4,5271
|
|
8
12
|
geney/mutation_utils.py,sha256=C_kv2MB_L8LlhX3W2ooXjJ3uDoJ8zX1WeDtZKoBZJkI,1547
|
|
9
|
-
geney/oncosplice.py,sha256=
|
|
13
|
+
geney/oncosplice.py,sha256=FdvuROk2G7wwLoB5lLzYia8Smw9hHZeVs-J2MUoAwlU,22106
|
|
10
14
|
geney/pangolin_utils.py,sha256=i5j5vEMCWOTIa1mRP2377BAhlUFZjHBzTQBips4lA_4,2934
|
|
11
15
|
geney/power_utils.py,sha256=MehZFUdkJ2EFUot709yPEDxSkXmH5XevMebX2HD768A,7330
|
|
12
16
|
geney/seqmat_utils.py,sha256=wzb3PX5it5bpIFQvcxyzlxfhoJTbHHbsjg0rzh05iVs,19753
|
|
13
17
|
geney/spliceai_utils.py,sha256=PFIhTK8Ihrj-cv5tgRN0UFPYEmC4uxtqXSP9bBLnZRM,3077
|
|
14
|
-
geney/splicing_utils.py,sha256=
|
|
18
|
+
geney/splicing_utils.py,sha256=4xYXy_dIbqdbVfxsEj_OCuM-MsQ24gi4fIv0vQjAYcQ,19215
|
|
15
19
|
geney/survival_utils.py,sha256=KnAzEviMuXh6SnVXId9PgsFLSbgkduTvYoIthxN7FPA,6886
|
|
16
20
|
geney/tcga_utils.py,sha256=D_BNHm-D_K408dlcJm3hzH2c6QNFjQsKvUcOPiQRk7g,17612
|
|
17
21
|
geney/tis_utils.py,sha256=2makfGfVlDFVIbxzXE85AY9jmAjcNmxyIAxjvkRA5LY,7396
|
|
@@ -20,7 +24,7 @@ geney/translation_initiation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
|
|
|
20
24
|
geney/translation_initiation/tis_utils.py,sha256=AF3siFjuQH-Rs44EV-80zHdbxRMvN4woLFSHroWIETc,4448
|
|
21
25
|
geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
|
|
22
26
|
geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
|
|
23
|
-
geney-1.2.
|
|
24
|
-
geney-1.2.
|
|
25
|
-
geney-1.2.
|
|
26
|
-
geney-1.2.
|
|
27
|
+
geney-1.3.2.dist-info/METADATA,sha256=aGPdV-x5PcONjV5ylUg8rYhW0eo4Fm2HDOE8dzldpcg,994
|
|
28
|
+
geney-1.3.2.dist-info/WHEEL,sha256=fS9sRbCBHs7VFcwJLnLXN1MZRR0_TVTxvXKzOnaSFs8,110
|
|
29
|
+
geney-1.3.2.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
|
|
30
|
+
geney-1.3.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|