geney 1.1.12__tar.gz → 1.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. geney-1.3.0/PKG-INFO +19 -0
  2. geney-1.3.0/geney/Gene.py +177 -0
  3. geney-1.3.0/geney/SeqMats.py +492 -0
  4. geney-1.3.0/geney/Transcript.py +379 -0
  5. {geney-1.1.12 → geney-1.3.0}/geney/__init__.py +13 -1
  6. geney-1.3.0/geney/_mutation_utils.py +38 -0
  7. geney-1.3.0/geney/config_setup.py +15 -0
  8. {geney-1.1.12 → geney-1.3.0}/geney/data_setup.py +75 -38
  9. geney-1.3.0/geney/graphic_utils.py +269 -0
  10. {geney-1.1.12 → geney-1.3.0}/geney/immune_utils.py +12 -14
  11. geney-1.3.0/geney/oncosplice.py +484 -0
  12. geney-1.3.0/geney/pangolin_utils.py +82 -0
  13. geney-1.3.0/geney/spliceai_utils.py +76 -0
  14. geney-1.3.0/geney/splicing_utils.py +466 -0
  15. {geney-1.1.12 → geney-1.3.0}/geney/survival_utils.py +25 -8
  16. {geney-1.1.12 → geney-1.3.0}/geney/tcga_utils.py +70 -37
  17. geney-1.3.0/geney/tis_utils.py +163 -0
  18. {geney-1.1.12 → geney-1.3.0}/geney/translation_initiation/tis_utils.py +2 -0
  19. geney-1.3.0/geney/utils.py +80 -0
  20. geney-1.3.0/geney.egg-info/PKG-INFO +19 -0
  21. {geney-1.1.12 → geney-1.3.0}/geney.egg-info/SOURCES.txt +11 -3
  22. geney-1.3.0/geney.egg-info/requires.txt +17 -0
  23. {geney-1.1.12 → geney-1.3.0}/setup.py +1 -1
  24. geney-1.3.0/tests/test_oncosplice.py +25 -0
  25. geney-1.1.12/PKG-INFO +0 -34
  26. geney-1.1.12/geney/config_setup.py +0 -14
  27. geney-1.1.12/geney/oncosplice.py +0 -2690
  28. geney-1.1.12/geney/performance_utils.py +0 -138
  29. geney-1.1.12/geney/power_utils.py +0 -180
  30. geney-1.1.12/geney/utils.py +0 -75
  31. geney-1.1.12/geney.egg-info/PKG-INFO +0 -34
  32. geney-1.1.12/geney.egg-info/requires.txt +0 -19
  33. {geney-1.1.12 → geney-1.3.0}/MANIFEST.in +0 -0
  34. {geney-1.1.12 → geney-1.3.0}/geney/Fasta_segment.py +0 -0
  35. {geney-1.1.12 → geney-1.3.0}/geney/gtex_utils.py +0 -0
  36. {geney-1.1.12 → geney-1.3.0}/geney/translation_initiation/__init__.py +0 -0
  37. {geney-1.1.12 → geney-1.3.0}/geney/translation_initiation/resources/kozak_pssm.json +0 -0
  38. {geney-1.1.12 → geney-1.3.0}/geney/translation_initiation/resources/tis_regressor_model.joblib +0 -0
  39. {geney-1.1.12 → geney-1.3.0}/geney.egg-info/dependency_links.txt +0 -0
  40. {geney-1.1.12 → geney-1.3.0}/geney.egg-info/top_level.txt +0 -0
  41. {geney-1.1.12 → geney-1.3.0}/setup.cfg +0 -0
geney-1.3.0/PKG-INFO ADDED
@@ -0,0 +1,19 @@
1
+ Metadata-Version: 2.1
2
+ Name: geney
3
+ Version: 1.3.0
4
+ Summary: A Python package for gene expression modeling.
5
+ Home-page: https://github.com/nicolaslynn/geney
6
+ Author: Nicolas Lynn
7
+ Author-email: nicolasalynn@gmail.com
8
+ License: Free for non-commercial use
9
+ Platform: UNKNOWN
10
+ Classifier: Development Status :: 1 - Planning
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: License :: Free for non-commercial use
13
+ Classifier: Operating System :: POSIX :: Linux
14
+ Classifier: Operating System :: MacOS
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Requires-Python: >3.9
17
+
18
+ UNKNOWN
19
+
@@ -0,0 +1,177 @@
1
+ import copy
2
+ from typing import Any, Dict, List, Tuple, Optional, Iterator, Union, TYPE_CHECKING
3
+ from collections import Counter
4
+ from . import unload_pickle, config
5
+ from .Transcript import Transcript
6
+
7
class Gene:
    """
    A gene together with its transcript annotations and basic metadata.

    Attributes:
        gene_name (str): The name of the gene.
        gene_id (str): The unique identifier for the gene.
        rev: Strand flag stored with the gene record (presumably True for the
            reverse strand -- TODO confirm against the data files).
        chrm (str): The chromosome on which the gene resides.
        organism (str): The organism build (e.g. 'hg38').
        transcripts (dict): Transcript annotation dicts keyed by transcript ID.
    """

    def __init__(self, gene_name, gene_id, rev, chrm, transcripts, organism='hg38'):
        """
        Build a Gene from already-loaded values (use `from_file` to load from disk).

        Args:
            gene_name (str): Name of the gene.
            gene_id (str): Unique identifier of the gene.
            rev: Strand flag, stored as given.
            chrm (str): Chromosome name.
            transcripts (dict or None): Transcript annotations keyed by
                transcript ID. ``None`` is replaced by an empty dict.
            organism (str): Organism reference build (default 'hg38').
        """
        self.gene_name = gene_name
        self.gene_id = gene_id
        self.rev = rev
        self.chrm = chrm
        self.organism = organism
        self.transcripts = transcripts if transcripts is not None else {}

    def __repr__(self) -> str:
        """Official string representation of the Gene object."""
        return f"Gene({self.gene_name})"

    def __str__(self) -> str:
        """User-friendly summary of the Gene object."""
        return f"Gene: {self.gene_name}, ID: {self.gene_id}, Chr: {self.chrm}, Transcripts: {len(self.transcripts)}"

    def __len__(self) -> int:
        """Return the number of transcripts associated with this gene."""
        return len(self.transcripts)

    def __copy__(self):
        """
        Return a shallow copy: a new Gene sharing the same attribute objects.

        The copy is built by hand because calling ``copy.copy(self)`` from
        inside ``__copy__`` dispatches straight back here and never terminates.
        """
        cls = type(self)
        duplicate = cls.__new__(cls)
        duplicate.__dict__.update(self.__dict__)
        return duplicate

    def __deepcopy__(self, memo):
        """
        Return a deep copy in which every attribute is recursively copied.

        Args:
            memo (dict): The memo dictionary supplied by ``copy.deepcopy``.
        """
        cls = type(self)
        duplicate = cls.__new__(cls)
        # Register the new object first so self-references resolve to the copy.
        memo[id(self)] = duplicate
        for attr, value in self.__dict__.items():
            setattr(duplicate, attr, copy.deepcopy(value, memo))
        return duplicate

    def __iter__(self) -> Iterator['Transcript']:
        """Iterate over the gene's transcripts, yielding Transcript objects."""
        for annotations in self.transcripts.values():
            yield Transcript(annotations, organism=self.organism)

    @classmethod
    def from_file(cls, gene_name, organism='hg38'):
        """
        Load a gene from the pickled annotation files of the configured organism.

        Args:
            gene_name (str): Gene name used to locate ``*_<gene_name>.pkl``.
            organism (str): Organism reference build (default 'hg38').

        Returns:
            Gene: A Gene populated from the first matching file.

        Raises:
            FileNotFoundError: If no file matches the gene name.
        """
        # Look for gene data files in the configured organism MRNA path.
        gene_files = list((config[organism]['MRNA_PATH'] / 'protein_coding').glob(f'*_{gene_name}.pkl'))
        if not gene_files:
            raise FileNotFoundError(f"No files available for gene '{gene_name}'.")

        # NOTE(review): glob order is not guaranteed, so with several matches the
        # chosen file may vary between runs. The file is unpickled, so it must
        # come from a trusted location.
        data = unload_pickle(gene_files[0])

        return cls(
            gene_name=data.get('gene_name'),
            gene_id=data.get('gene_id'),
            rev=data.get('rev'),
            chrm=data.get('chrm'),
            transcripts=data.get('transcripts', {}),
            organism=organism,
        )

    def splice_sites(self) -> Tuple[Counter, Counter]:
        """
        Aggregate splice sites (acceptors and donors) from all transcripts.

        Returns:
            tuple(Counter, Counter): Occurrence counts of acceptor sites and of
            donor sites across every transcript of the gene.
        """
        acceptors: Counter = Counter()
        donors: Counter = Counter()

        for annotations in self.transcripts.values():
            acceptors.update(annotations.get('acceptors', []))
            donors.update(annotations.get('donors', []))

        return acceptors, donors

    def transcript(self, tid: Optional[str] = None):
        """
        Retrieve a Transcript object by ID, or the primary transcript if no ID is given.

        Args:
            tid (str, optional): Transcript ID. If None, the primary transcript is used.

        Returns:
            Transcript: The Transcript built from the stored annotations.

        Raises:
            AttributeError: If the requested transcript does not exist (this
                includes the case where no primary transcript can be chosen).
        """
        if tid is None:
            tid = self.primary_transcript

        if tid not in self.transcripts:
            raise AttributeError(f"Transcript '{tid}' not found in gene '{self.gene_name}'.")

        return Transcript(self.transcripts[tid], organism=self.organism)

    @property
    def primary_transcript(self) -> Optional[str]:
        """
        Return the primary transcript ID for this gene.

        The first transcript flagged ``primary_transcript`` wins; otherwise the
        first transcript whose ``transcript_biotype`` is 'protein_coding' is
        used; otherwise the result is None. The answer is cached on the
        instance after the first lookup.

        Returns:
            str or None: The primary transcript ID or None if not available.
        """
        if hasattr(self, '_primary_transcript'):
            return self._primary_transcript

        flagged = [tid for tid, ann in self.transcripts.items() if ann.get('primary_transcript')]
        if flagged:
            self._primary_transcript = flagged[0]
            return self._primary_transcript

        coding = [tid for tid, ann in self.transcripts.items()
                  if ann.get('transcript_biotype') == 'protein_coding']
        self._primary_transcript = coding[0] if coding else None
        return self._primary_transcript
176
+
177
+
@@ -0,0 +1,492 @@
1
+ import re
2
+ import numpy as np
3
+
4
ALPHABET = {'N': 'N', 'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', '-': '-'}


class SeqMat:
    """
    Integer-matrix representation of a sequence.

    ``seqmat`` is a 2-D array with one column per position and four rows:

    * ``ROW_SEQ``       -- symbol codes (position of the symbol in the alphabet),
    * ``ROW_INDS``      -- coordinate of each column,
    * ``ROW_SUPERINDS`` -- 0 for ordinary positions, > 0 for inserted columns
      that share the coordinate of the position they follow,
    * ``ROW_MUTATED``   -- mutation marker (all zeros when built by `from_seq`).

    The alphabet maps every symbol to its complement; '-' denotes a gap
    (deleted position) and is skipped by `seq` and `indices`.
    """

    ROW_SEQ = 0
    ROW_INDS = 1
    ROW_SUPERINDS = 2
    ROW_MUTATED = 3

    _GAP_CHAR = '-'

    # First in-frame stop codon, anchored at the start of the string.
    _STOP_CODON_PATTERN = re.compile(r"(?:[NACGT]{3})*?(TAA|TAG|TGA)")

    # Standard genetic code.
    _CODON_TABLE = {
        'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
        'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
        'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
        'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
        'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
        'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
        'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
        'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
        'TAT': 'Y', 'TAC': 'Y', 'TAA': '*', 'TAG': '*',
        'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
        'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
        'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
        'TGT': 'C', 'TGC': 'C', 'TGA': '*', 'TGG': 'W',
        'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
        'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
        'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
    }

    # __eq__ is overridden, so instances are deliberately unhashable.
    __hash__ = None

    def __init__(self, seqmat, alphabet=None):
        """
        Args:
            seqmat (np.ndarray): The (4, n) sequence matrix described on the class.
            alphabet (dict, optional): Symbol -> complement mapping. Defaults
                to a copy of the module-level ``ALPHABET``.
        """
        self.seqmat = seqmat
        self.alphabet = alphabet or dict(ALPHABET)

        symbols = list(self.alphabet)
        self.char_to_value = {c: i for i, c in enumerate(symbols)}
        self.value_to_char = dict(enumerate(symbols))
        self.value_complements = {self.char_to_value[c1]: self.char_to_value[c2]
                                  for c1, c2 in self.alphabet.items()}

    def __repr__(self):
        return f"<SeqMat: {self.seq}>"

    def __str__(self):
        return self.seq

    def __len__(self):
        """Number of columns, including gap columns."""
        return self.seqmat.shape[1]

    def __getitem__(self, key):
        """
        Select by coordinate (not by column number).

        A slice is inclusive at both ends; an open end falls back to the first
        or last column. A single key returns a one-column SeqMat.

        Raises:
            IndexError: If a requested coordinate is not present.
        """
        if isinstance(key, slice):
            first = 0 if key.start is None else self._rel_index(key.start)
            last = len(self) - 1 if key.stop is None else self._rel_index(key.stop)
            return SeqMat(self.seqmat[:, first:last + 1], alphabet=self.alphabet)

        col = self._rel_index(key)
        return SeqMat(self.seqmat[:, col:col + 1], alphabet=self.alphabet)

    def __contains__(self, other):
        """
        Check whether every coordinate of `other` also occurs in this SeqMat.

        Args:
            other (SeqMat): Another SeqMat object to check for containment.

        Returns:
            bool: True if all coordinates of `other` are found in `self`.

        Raises:
            TypeError: If `other` is not a SeqMat.
        """
        if not isinstance(other, SeqMat):
            raise TypeError("Can only check containment with another SeqMat object.")

        return bool(np.all(np.isin(other.seqmat[other.ROW_INDS, :],
                                   self.seqmat[self.ROW_INDS, :])))

    def __eq__(self, other):
        """
        Two SeqMat objects are equal when their matrices are element-wise equal.

        Returns:
            bool: False for non-SeqMat operands.
        """
        if not isinstance(other, SeqMat):
            return False
        return bool(np.array_equal(self.seqmat, other.seqmat))

    @classmethod
    def empty(cls, alphabet=None):
        """
        Create a SeqMat with four rows and no columns.

        Args:
            alphabet (dict): Optional alphabet dictionary.

        Returns:
            SeqMat: An empty SeqMat object.
        """
        return cls(np.zeros((4, 0), dtype=np.int32), alphabet=alphabet)

    def _joined_with(self, other):
        """Return self and other stacked column-wise, or raise if not monotonic."""
        combined = np.hstack((self.seqmat, other.seqmat))
        if not self._is_monotonic(combined[self.ROW_INDS]):
            raise ValueError("Resulting sequence indices are not monotonic.")
        return combined

    def __add__(self, other):
        """
        Implements the + operator: apply `other` as a mutation or append it.

        If every coordinate of `other` already exists in `self`, `other` is
        applied as a mutation. Otherwise the two matrices are concatenated,
        which requires the combined coordinates to stay monotonic.

        Args:
            other (SeqMat): Another SeqMat object to join or mutate.

        Returns:
            SeqMat: A new SeqMat object with the resulting sequence.

        Raises:
            TypeError: If `other` is not a SeqMat.
            ValueError: If the concatenated coordinates are not monotonic.
        """
        if not isinstance(other, SeqMat):
            raise TypeError("Can only add another SeqMat object.")

        if other in self:
            return self.mutate(other)

        return SeqMat(self._joined_with(other), alphabet=self.alphabet)

    def __iadd__(self, other):
        """
        Implements the += operator; same rules as `+`, but updates `self`.

        The matrix is only replaced once the result is known to be valid, so a
        failed concatenation leaves the object unchanged.

        Args:
            other (SeqMat): Another SeqMat object to join or mutate.

        Returns:
            SeqMat: `self`.

        Raises:
            TypeError: If `other` is not a SeqMat.
            ValueError: If the concatenated coordinates are not monotonic.
        """
        if not isinstance(other, SeqMat):
            raise TypeError("Can only add another SeqMat object.")

        if other in self:
            self.seqmat = self.mutate(other).seqmat
        else:
            self.seqmat = self._joined_with(other)
        return self

    def get_context(self, pos, context=500, padding=None):
        """
        Return the region of `context` columns on each side of `pos`.

        Without padding the region is clipped at the sequence boundaries and
        may be shorter than ``2 * context + 1``. With padding, missing columns
        are filled with the padding symbol in the sequence row and -1 in every
        other row.

        Args:
            pos: The coordinate of interest.
            context (int): Number of columns to include on each side (default 500).
            padding (str or None): Symbol used for padding; symbols missing
                from the alphabet fall back to 'N'. None disables padding.

        Returns:
            SeqMat: A new SeqMat object containing the context region.

        Raises:
            IndexError: If `pos` is not present.
        """
        center = self._rel_index(pos)

        desired_length = 2 * context + 1
        start = center - context
        end = center + context + 1

        # Clip the requested window to the columns that actually exist.
        window = self.seqmat[:, max(start, 0):min(len(self), end)]
        extracted_length = window.shape[1]

        if padding is None or extracted_length >= desired_length:
            return SeqMat(window, alphabet=self.alphabet)

        pad_left = max(-start, 0)  # columns missing before the first real one
        pad_value = self.char_to_value.get(padding, self.char_to_value['N'])

        padded = np.full((self.seqmat.shape[0], desired_length), -1, dtype=self.seqmat.dtype)
        padded[self.ROW_SEQ, :] = pad_value
        padded[:, pad_left:pad_left + extracted_length] = window
        return SeqMat(padded, alphabet=self.alphabet)

    def _non_gap_mask(self, seqmat=None):
        """Boolean mask of the columns of `seqmat` (default: own matrix) that are not gaps."""
        mat = self.seqmat if seqmat is None else seqmat
        gap_value = self.char_to_value.get(self._GAP_CHAR)
        if gap_value is None:
            # The alphabet has no gap symbol, so no column can be a gap.
            return np.ones(mat.shape[1], dtype=bool)
        return mat[self.ROW_SEQ, :] != gap_value

    def _rel_index(self, pos):
        """
        Translate a coordinate (as reported by `indices`) into a column number.

        Fractional coordinates address inserted columns. Gap columns are never
        matched.

        Raises:
            IndexError: If `pos` is not present.
        """
        full_index = self.seqmat[self.ROW_INDS, :] + self.seqmat[self.ROW_SUPERINDS, :] / 10
        hits = np.where(self._non_gap_mask() & (full_index == pos))[0]
        if hits.size == 0:
            raise IndexError(f"Position {pos} not found in sequence.")
        return hits[0]

    def _is_same_strand(self, other):
        """
        Check that both coordinate rows run in the same direction.

        Args:
            other (SeqMat): The other SeqMat object to compare.

        Returns:
            bool: True if both are non-decreasing or both are non-increasing.
        """
        own_steps = np.diff(self.seqmat[self.ROW_INDS, :])
        other_steps = np.diff(other.seqmat[self.ROW_INDS, :])

        both_increasing = np.all(own_steps >= 0) and np.all(other_steps >= 0)
        both_decreasing = np.all(own_steps <= 0) and np.all(other_steps <= 0)
        return both_increasing or both_decreasing

    def reverse_complement(self, inplace=True):
        """
        Reverse the column order and complement every symbol.

        Args:
            inplace (bool): If True (default) update and return `self`;
                otherwise return a new SeqMat and leave `self` untouched.
        """
        flipped = self.seqmat[:, ::-1].copy()
        # A lookup table handles empty matrices, which np.vectorize rejects.
        lookup = np.array([self.value_complements[v] for v in range(len(self.value_complements))])
        flipped[self.ROW_SEQ, :] = lookup[flipped[self.ROW_SEQ, :].astype(np.intp)]

        if inplace:
            self.seqmat = flipped
            return self

        return SeqMat(flipped, alphabet=self.alphabet)

    @classmethod
    def from_seq(cls, seq_dict, alphabet=None):
        """
        Create a SeqMat object from a dictionary containing sequence information.

        Args:
            seq_dict (dict): Must contain ``"seq"``. ``"indices"`` defaults to
                0..n-1 and ``"superinds"`` to all zeros.
            alphabet (dict, optional): Alphabet used to encode the symbols and
                carried by the returned object.

        Raises:
            ValueError: If the lengths differ or the indices are not monotonic.
            KeyError: If the sequence holds a symbol missing from the alphabet.
        """
        seq = seq_dict["seq"]
        inds = seq_dict.get("indices", np.arange(len(seq), dtype=np.int32))
        superinds = seq_dict.get("superinds", np.zeros(len(seq), dtype=np.int32))
        mutmark = np.zeros_like(superinds)

        if len(seq) != len(inds):
            raise ValueError(f"Sequence length {len(seq)} must match indices length {len(inds)}")
        if not cls._is_monotonic(inds):
            raise ValueError(f"Sequence indices must be monotonic, got {inds}")

        char_to_value = {c: i for i, c in enumerate(alphabet or ALPHABET)}
        seq_values = [char_to_value[nt] for nt in seq]

        seqmat = np.vstack([seq_values, inds, superinds, mutmark]).astype(np.int32)
        return cls(seqmat, alphabet=alphabet)

    @staticmethod
    def _is_monotonic(inds):
        """True if `inds` never changes direction; fewer than two values always pass."""
        values = np.asarray(inds)
        if values.size < 2:
            return True
        steps = np.diff(values)
        if values[0] > values[-1]:
            return bool(np.all(steps <= 0))
        return bool(np.all(steps >= 0))

    @property
    def seq(self):
        """The sequence as a string with gap symbols removed."""
        return self.rawseq.replace('-', '')

    @property
    def rawseq(self):
        """The sequence as a string, one symbol per column (gaps included)."""
        return ''.join([self.value_to_char[int(ind)] for ind in self.seqmat[self.ROW_SEQ, :]])

    @property
    def indices(self):
        """Coordinates of the non-gap columns; inserted columns add superind / 10."""
        keep = self._non_gap_mask()
        return self.seqmat[self.ROW_INDS, keep] + (self.seqmat[self.ROW_SUPERINDS, keep] / 10)

    def mutate(self, mut, inplace=False):
        """
        Apply mutations to the sequence matrix.

        Columns of `mut` with superind 0 overwrite the ordinary column with
        the same coordinate; columns with superind > 0 are inserted next to
        the column whose coordinate matches the first inserted column.

        Args:
            mut (SeqMat): A SeqMat object containing mutations.
            inplace (bool): If True, store the result in `self` and return it.

        Returns:
            SeqMat: The mutated sequence. If some coordinate of `mut` is
            missing from this sequence nothing is applied and `self` itself is
            returned, whatever `inplace` says.

        Raises:
            ValueError: If `mut` and `self` run in opposite directions.
        """
        if not self._is_same_strand(mut):
            raise ValueError("Mutation and sequence are not on the same strand.")

        ref_seqmat = self.seqmat.copy()
        mut_seqmat = mut.seqmat

        # A mutation that is not fully covered by the sequence is skipped.
        if not np.all(np.isin(mut_seqmat[self.ROW_INDS, :], ref_seqmat[self.ROW_INDS, :])):
            return self

        # Replacements: only ordinary (superind 0) reference columns are
        # targets, so earlier insertions sharing the coordinate are not
        # overwritten by broadcasting.
        replacements = mut_seqmat[:, np.where(mut_seqmat[self.ROW_SUPERINDS, :] == 0)[0]]
        targets = np.where(
            np.isin(ref_seqmat[self.ROW_INDS, :], replacements[self.ROW_INDS, :])
            & (ref_seqmat[self.ROW_SUPERINDS, :] == 0)
        )[0]
        ref_seqmat[:, targets] = replacements[:, :]

        # Insertions
        insertions = np.where(mut_seqmat[self.ROW_SUPERINDS, :] > 0)[0]
        if insertions.size > 0:
            ins_seqmat = mut_seqmat[:, insertions]
            # On a decreasing coordinate axis the insertion goes before the anchor column.
            correction = 1 if self.seqmat[self.ROW_INDS, 0] > self.seqmat[self.ROW_INDS, -1] else 0
            ins_loc = np.where(ref_seqmat[self.ROW_INDS, :] == ins_seqmat[self.ROW_INDS, 0])[0][0] + 1 - correction
            # With a scalar position np.insert moves the first axis of the
            # values onto `axis`, hence the transpose.
            ref_seqmat = np.insert(ref_seqmat, ins_loc, ins_seqmat.T, axis=1)

        if inplace:
            self.seqmat = ref_seqmat
            return self

        return SeqMat(ref_seqmat, alphabet=self.alphabet)

    def orf_seqmat(self, tis_index):
        """
        Return the open reading frame starting at `tis_index`.

        Gap columns are dropped; the result runs up to and including the first
        in-frame stop codon, or to the end of the sequence if there is none.

        Raises:
            IndexError: If `tis_index` is not present.
        """
        tail = self.seqmat[:, self._rel_index(tis_index):]
        tail = tail[:, self._non_gap_mask(tail)]
        downstream = SeqMat(tail, alphabet=self.alphabet)

        coding = downstream.seq
        match = self._STOP_CODON_PATTERN.match(coding)
        stop_index = match.end() if match else len(coding)
        return SeqMat(downstream.seqmat[:, :stop_index], alphabet=self.alphabet)

    def translate(self, tis_index):
        """
        Translate the open reading frame starting at `tis_index`.

        Trailing nucleotides that do not form a complete codon are dropped;
        codons missing from the standard table translate to 'X'.

        Args:
            tis_index: Coordinate of the first nucleotide of the start codon.

        Returns:
            str: Translated amino acid sequence (stop codons appear as '*').
        """
        sequence = self.orf_seqmat(tis_index).seq.upper()

        # Trim sequence to ensure divisibility by 3
        usable = len(sequence) - (len(sequence) % 3)

        return ''.join(self._CODON_TABLE.get(sequence[i:i + 3], 'X') for i in range(0, usable, 3))

    def to_dict(self):
        """
        Return the sequence, indices and superinds as a dictionary.

        NOTE(review): 'seq' and 'superinds' cover every column while 'indices'
        skips gap columns, so the lengths differ once a deletion has been
        applied -- confirm what callers expect before changing this.
        """
        return {'seq': self.rawseq, 'indices': self.indices, 'superinds': self.seqmat[self.ROW_SUPERINDS, :]}
410
+
411
class DnaSeqMat(SeqMat):
    """Marker subclass of SeqMat that adds no behavior (by its name, meant for DNA sequences)."""
413
+
414
+
415
class RnaSeqMat(SeqMat):
    """Marker subclass of SeqMat that adds no behavior (by its name, meant for RNA sequences)."""
417
+
418
+
419
class AASeqMat(SeqMat):
    """Marker subclass of SeqMat that adds no behavior (by its name, meant for amino-acid sequences)."""
421
+
422
+
423
class MutSeqMat(SeqMat):
    """
    A subclass of SeqMat designed specifically for mutation sequences.

    Additional conditions:
        1. Neighbouring mutation coordinates differ by at most 1 (increasing
           or decreasing); repeated coordinates are allowed for insertions.
        2. At least one column is required.

    On construction every column is flagged in the mutation-marker row and
    ``position`` is set to the smallest coordinate.

    NOTE(review): an upper bound of 10 on the superinds row was planned but
    its validation is disabled, so it is not enforced.
    """

    def __init__(self, seqmat, alphabet=None):
        """
        Args:
            seqmat (np.ndarray): Sequence matrix of the mutation; its last row
                is overwritten with 1.
            alphabet (dict, optional): Symbol -> complement mapping.

        Raises:
            ValueError: If the matrix has no columns or the coordinates are
                not consecutive.
        """
        super().__init__(seqmat, alphabet)

        if self.seqmat.shape[1] == 0:
            raise ValueError("A mutation must cover at least one position.")

        self._validate_mutation_indices()
        self.seqmat[-1, :] = 1  # mark every column as mutated
        self.position = min(self.seqmat[self.ROW_INDS, :])

    def _validate_mutation_indices(self):
        """
        Validate that neighbouring mutation coordinates differ by at most 1.

        Raises:
            ValueError: If two neighbouring coordinates are further apart.
        """
        indices = self.seqmat[self.ROW_INDS, :]
        if not np.all(np.abs(np.diff(indices)) <= 1):
            raise ValueError(f"Mutation indices must be consecutive. Got: {indices}")

    @staticmethod
    def _collapse_gap_run(allele):
        """Reduce an allele made only of '-' characters to a single '-'."""
        return '-' if allele and set(allele) == {'-'} else allele

    @classmethod
    def from_mutid(cls, mid):
        """
        Build a mutation from an ID of the form ``gene:chrom:position:ref:alt``.

        '-' (or a run of '-') stands for an absent allele:

        * single-base ref and alt   -> substitution at ``position``;
        * alt absent                -> deletion of ``len(ref)`` positions
          starting at ``position``;
        * ref absent                -> insertion of alt after ``position``;
        * anything else             -> ref is deleted and alt is inserted
          after the last deleted position.

        Args:
            mid (str): The mutation ID.

        Returns:
            MutSeqMat: The mutation as a sequence matrix.

        Raises:
            ValueError: If the ID does not have five fields, the position is
                not an integer, an allele is empty, or both alleles are absent.
        """
        _gene, _chrom, pos, ref, alt = mid.split(':')
        if not ref or not alt:
            raise ValueError(f"Mutation ID '{mid}' has an empty allele.")

        ref = cls._collapse_gap_run(ref)
        alt = cls._collapse_gap_run(alt)
        pos = int(pos)

        if ref != '-' and alt != '-' and len(ref) == len(alt) == 1:
            spec = {'seq': alt, 'indices': [pos], 'superinds': [0]}

        elif alt == '-' and ref != '-':
            spec = {'seq': '-' * len(ref),
                    'indices': np.arange(pos, pos + len(ref), dtype=np.int32),
                    'superinds': [0] * len(ref)}

        elif ref == '-' and alt != '-':
            spec = {'seq': alt,
                    'indices': np.full(len(alt), pos),
                    'superinds': np.arange(1, len(alt) + 1)}

        elif ref != '-' and alt != '-':
            # Delete the reference bases, then hang the alternative bases as
            # insertions on the last deleted coordinate.
            inds = np.concatenate(
                [np.arange(pos, pos + len(ref), dtype=np.int32),
                 np.full(len(alt), len(ref) + pos - 1, dtype=np.int32)])
            superinds = np.concatenate(
                [np.zeros(len(ref), dtype=np.int32), np.arange(1, len(alt) + 1, dtype=np.int32)])
            spec = {'seq': '-' * len(ref) + alt, 'indices': inds, 'superinds': superinds}

        else:
            raise ValueError(f"Mutation ID '{mid}' has neither a reference nor an alternative allele.")

        return cls.from_seq(spec)
482
+
483
+
484
+ # def _validate_superinds(self):
485
+ # """
486
+ # Validates that the superinds row has a maximum value of 10.
487
+ # """
488
+ # superinds = self.seqmat[self.ROW_SUPERINDS, :]
489
+ # if np.max(superinds) > 10:
490
+ # raise ValueError(f"Superinds row must have a maximum value of 10. Got: {superinds}")
491
+
492
+