geney 1.1.12__tar.gz → 1.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- geney-1.3.0/PKG-INFO +19 -0
- geney-1.3.0/geney/Gene.py +177 -0
- geney-1.3.0/geney/SeqMats.py +492 -0
- geney-1.3.0/geney/Transcript.py +379 -0
- {geney-1.1.12 → geney-1.3.0}/geney/__init__.py +13 -1
- geney-1.3.0/geney/_mutation_utils.py +38 -0
- geney-1.3.0/geney/config_setup.py +15 -0
- {geney-1.1.12 → geney-1.3.0}/geney/data_setup.py +75 -38
- geney-1.3.0/geney/graphic_utils.py +269 -0
- {geney-1.1.12 → geney-1.3.0}/geney/immune_utils.py +12 -14
- geney-1.3.0/geney/oncosplice.py +484 -0
- geney-1.3.0/geney/pangolin_utils.py +82 -0
- geney-1.3.0/geney/spliceai_utils.py +76 -0
- geney-1.3.0/geney/splicing_utils.py +466 -0
- {geney-1.1.12 → geney-1.3.0}/geney/survival_utils.py +25 -8
- {geney-1.1.12 → geney-1.3.0}/geney/tcga_utils.py +70 -37
- geney-1.3.0/geney/tis_utils.py +163 -0
- {geney-1.1.12 → geney-1.3.0}/geney/translation_initiation/tis_utils.py +2 -0
- geney-1.3.0/geney/utils.py +80 -0
- geney-1.3.0/geney.egg-info/PKG-INFO +19 -0
- {geney-1.1.12 → geney-1.3.0}/geney.egg-info/SOURCES.txt +11 -3
- geney-1.3.0/geney.egg-info/requires.txt +17 -0
- {geney-1.1.12 → geney-1.3.0}/setup.py +1 -1
- geney-1.3.0/tests/test_oncosplice.py +25 -0
- geney-1.1.12/PKG-INFO +0 -34
- geney-1.1.12/geney/config_setup.py +0 -14
- geney-1.1.12/geney/oncosplice.py +0 -2690
- geney-1.1.12/geney/performance_utils.py +0 -138
- geney-1.1.12/geney/power_utils.py +0 -180
- geney-1.1.12/geney/utils.py +0 -75
- geney-1.1.12/geney.egg-info/PKG-INFO +0 -34
- geney-1.1.12/geney.egg-info/requires.txt +0 -19
- {geney-1.1.12 → geney-1.3.0}/MANIFEST.in +0 -0
- {geney-1.1.12 → geney-1.3.0}/geney/Fasta_segment.py +0 -0
- {geney-1.1.12 → geney-1.3.0}/geney/gtex_utils.py +0 -0
- {geney-1.1.12 → geney-1.3.0}/geney/translation_initiation/__init__.py +0 -0
- {geney-1.1.12 → geney-1.3.0}/geney/translation_initiation/resources/kozak_pssm.json +0 -0
- {geney-1.1.12 → geney-1.3.0}/geney/translation_initiation/resources/tis_regressor_model.joblib +0 -0
- {geney-1.1.12 → geney-1.3.0}/geney.egg-info/dependency_links.txt +0 -0
- {geney-1.1.12 → geney-1.3.0}/geney.egg-info/top_level.txt +0 -0
- {geney-1.1.12 → geney-1.3.0}/setup.cfg +0 -0
geney-1.3.0/PKG-INFO
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: geney
|
|
3
|
+
Version: 1.3.0
|
|
4
|
+
Summary: A Python package for gene expression modeling.
|
|
5
|
+
Home-page: https://github.com/nicolaslynn/geney
|
|
6
|
+
Author: Nicolas Lynn
|
|
7
|
+
Author-email: nicolasalynn@gmail.com
|
|
8
|
+
License: Free for non-commercial use
|
|
9
|
+
Platform: UNKNOWN
|
|
10
|
+
Classifier: Development Status :: 1 - Planning
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: License :: Free for non-commercial use
|
|
13
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
14
|
+
Classifier: Operating System :: MacOS
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Requires-Python: >3.9
|
|
17
|
+
|
|
18
|
+
UNKNOWN
|
|
19
|
+
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
from typing import Any, Dict, List, Tuple, Optional, Iterator, Union, TYPE_CHECKING
|
|
3
|
+
from collections import Counter
|
|
4
|
+
from . import unload_pickle, config
|
|
5
|
+
from .Transcript import Transcript
|
|
6
|
+
|
|
7
|
+
class Gene:
    """
    A gene together with its transcript annotations and basic metadata.

    Attributes:
        gene_name (str): The name of the gene.
        gene_id (str): The unique identifier for the gene.
        rev: Strand flag, stored exactly as supplied
            (presumably True for the reverse strand -- TODO confirm).
        chrm (str): The chromosome on which the gene resides.
        organism (str): The organism build (e.g. 'hg38').
        transcripts (dict): Transcript annotation dicts keyed by transcript ID.
    """

    def __init__(self, gene_name, gene_id, rev, chrm, transcripts, organism='hg38'):
        """
        Build a Gene from already-loaded values.

        Use :meth:`from_file` to load a gene from the configured data directory.

        Args:
            gene_name (str): Name of the gene.
            gene_id (str): Identifier of the gene.
            rev: Strand flag (stored as given).
            chrm (str): Chromosome name.
            transcripts (dict or None): Transcript annotations keyed by
                transcript ID; ``None`` is replaced by an empty dict.
            organism (str): Organism reference build (default 'hg38').
        """
        self.gene_name = gene_name
        self.gene_id = gene_id
        self.rev = rev
        self.chrm = chrm
        self.organism = organism
        self.transcripts = transcripts if transcripts is not None else {}

    def __repr__(self) -> str:
        """Official string representation of the Gene object."""
        return f"Gene({self.gene_name})"

    def __str__(self) -> str:
        """User-friendly one-line summary of the Gene object."""
        return f"Gene: {self.gene_name}, ID: {self.gene_id}, Chr: {self.chrm}, Transcripts: {len(self.transcripts)}"

    def __len__(self) -> int:
        """Return the number of transcripts associated with this gene."""
        return len(self.transcripts)

    def __copy__(self):
        """
        Return a shallow copy: a new Gene sharing the same attribute objects.

        The copy is built by hand because calling ``copy.copy(self)`` from
        inside ``__copy__`` dispatches back to this method and never ends.
        """
        cls = type(self)
        clone = cls.__new__(cls)
        clone.__dict__.update(self.__dict__)
        return clone

    def __deepcopy__(self, memo):
        """
        Return a deep copy: a new Gene whose attributes are recursively copied.

        Args:
            memo (dict): The memo dictionary supplied by ``copy.deepcopy``.
        """
        cls = type(self)
        clone = cls.__new__(cls)
        # Register the clone before recursing so reference cycles resolve to it.
        memo[id(self)] = clone
        for attr, value in self.__dict__.items():
            setattr(clone, attr, copy.deepcopy(value, memo))
        return clone

    def __iter__(self):
        """Iterate over the gene's transcripts, yielding Transcript objects."""
        for annotations in self.transcripts.values():
            yield Transcript(annotations, organism=self.organism)

    @classmethod
    def from_file(cls, gene_name, organism='hg38'):
        """
        Load a gene from the pickled annotation files of ``organism``.

        Looks for ``*_{gene_name}.pkl`` inside the ``protein_coding`` folder of
        the configured ``MRNA_PATH`` and loads the first match.

        Args:
            gene_name (str): Gene name used to locate the file.
            organism (str): Organism reference build (default 'hg38').

        Returns:
            Gene: A Gene populated from the stored data.

        Raises:
            FileNotFoundError: If no file for the specified gene is found.
        """
        search_dir = config[organism]['MRNA_PATH'] / 'protein_coding'
        gene_files = list(search_dir.glob(f'*_{gene_name}.pkl'))
        if not gene_files:
            raise FileNotFoundError(f"No files available for gene '{gene_name}'.")

        # NOTE(review): the files are pickles; only load from a trusted data directory.
        data = unload_pickle(gene_files[0])

        return cls(
            gene_name=data.get('gene_name'),
            gene_id=data.get('gene_id'),
            rev=data.get('rev'),
            chrm=data.get('chrm'),
            transcripts=data.get('transcripts', {}),
            organism=organism,
        )

    def splice_sites(self) -> Tuple[Counter, Counter]:
        """
        Aggregate splice sites (acceptors and donors) from all transcripts.

        Returns:
            tuple(Counter, Counter): Occurrence counts of acceptor sites and
            of donor sites across every transcript annotation.
        """
        acceptors: Counter = Counter()
        donors: Counter = Counter()
        for annotations in self.transcripts.values():
            acceptors.update(annotations.get('acceptors', []))
            donors.update(annotations.get('donors', []))
        return acceptors, donors

    def transcript(self, tid: Optional[str] = None):
        """
        Retrieve a Transcript object by ID, or the primary transcript if no ID is given.

        Args:
            tid (str, optional): Transcript ID. If None, the primary transcript is used.

        Returns:
            Transcript: The Transcript built from the stored annotations.

        Raises:
            AttributeError: If the requested transcript does not exist (this
                includes the case where no primary transcript can be chosen).
        """
        if tid is None:
            tid = self.primary_transcript

        if tid not in self.transcripts:
            raise AttributeError(f"Transcript '{tid}' not found in gene '{self.gene_name}'.")

        return Transcript(self.transcripts[tid], organism=self.organism)

    @property
    def primary_transcript(self) -> Optional[str]:
        """
        Return the primary transcript ID for this gene.

        The first transcript flagged ``primary_transcript`` wins; otherwise the
        first transcript whose ``transcript_biotype`` is ``'protein_coding'``;
        otherwise None. The result is cached on the instance after the first call.

        Returns:
            str or None: The primary transcript ID or None if not available.
        """
        if hasattr(self, '_primary_transcript'):
            return self._primary_transcript

        chosen = next(
            (tid for tid, ann in self.transcripts.items() if ann.get('primary_transcript')),
            None,
        )
        if chosen is None:
            chosen = next(
                (tid for tid, ann in self.transcripts.items()
                 if ann.get('transcript_biotype') == 'protein_coding'),
                None,
            )

        self._primary_transcript = chosen
        return chosen
|
|
176
|
+
|
|
177
|
+
|
|
@@ -0,0 +1,492 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import numpy as np
|
|
3
|
+
|
|
4
|
+
# Maps each sequence symbol to its complement symbol.
# Key order is significant: integer codes for the sequence row are assigned by
# enumerating the keys (N=0, A=1, T=2, C=3, G=4, '-'=5).
ALPHABET = {'N': 'N', 'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', '-': '-'}
|
|
5
|
+
|
|
6
|
+
class SeqMat:
    """
    A sequence stored as a 4-row integer matrix, one column per symbol.

    Rows (see the ``ROW_*`` constants):
        ROW_SEQ:       integer code of the symbol (position of the symbol in the alphabet keys).
        ROW_INDS:      coordinate of the column.
        ROW_SUPERINDS: 0 for ordinary columns, 1..k for inserted columns sharing one coordinate.
        ROW_MUTATED:   mutation marker row.

    The alphabet is a dict mapping each symbol to its complement; the order of
    its keys defines the integer encoding.
    """

    ROW_SEQ = 0
    ROW_INDS = 1
    ROW_SUPERINDS = 2
    ROW_MUTATED = 3

    # Default symbol -> complement mapping; key order fixes the integer codes.
    _DEFAULT_ALPHABET = {'N': 'N', 'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', '-': '-'}
    _GAP_CHAR = '-'

    # Lazily skips whole codons from the start, then requires an in-frame stop codon.
    _STOP_CODON_RE = re.compile(r"(?:[NACGT]{3})*?(TAA|TAG|TGA)")

    # Standard genetic code (DNA codons -> one-letter amino acids, '*' = stop).
    _CODON_TABLE = {
        'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
        'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
        'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
        'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
        'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
        'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
        'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
        'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
        'TAT': 'Y', 'TAC': 'Y', 'TAA': '*', 'TAG': '*',
        'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
        'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
        'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
        'TGT': 'C', 'TGC': 'C', 'TGA': '*', 'TGG': 'W',
        'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
        'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
        'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
    }

    def __init__(self, seqmat, alphabet=None):
        """
        Wrap an existing sequence matrix.

        Args:
            seqmat (np.ndarray): The 4-row matrix described on the class.
            alphabet (dict, optional): Symbol -> complement mapping. Defaults to
                N/A/T/C/G/'-' with the usual DNA complements.
        """
        self.seqmat = seqmat
        self.alphabet = alphabet or dict(self._DEFAULT_ALPHABET)

        symbols = list(self.alphabet)
        self.char_to_value = {c: i for i, c in enumerate(symbols)}
        self.value_to_char = dict(enumerate(symbols))
        self.value_complements = {
            self.char_to_value[c1]: self.char_to_value[c2] for c1, c2 in self.alphabet.items()
        }

    def __repr__(self):
        return f"<SeqMat: {self.seq}>"

    def __str__(self):
        return self.seq

    def __len__(self):
        """Return the number of columns (gap columns included)."""
        return self.seqmat.shape[1]

    @property
    def _gap_value(self):
        """Integer code of the gap symbol, or -1 (never matches) if the alphabet has none."""
        return self.char_to_value.get(self._GAP_CHAR, -1)

    def __getitem__(self, key):
        """
        Select columns by coordinate (not by array offset).

        A slice is inclusive of its stop coordinate; an omitted start/stop
        means "from the first column" / "through the last column". The slice
        step is ignored. A non-slice key selects the single matching column.

        Raises:
            IndexError: If a requested coordinate is not present.
        """
        if isinstance(key, slice):
            first = 0 if key.start is None else self._rel_index(key.start)
            last = len(self) - 1 if key.stop is None else self._rel_index(key.stop)
            return SeqMat(self.seqmat[:, first:last + 1], alphabet=self.alphabet)

        col = self._rel_index(key)
        return SeqMat(self.seqmat[:, col:col + 1], alphabet=self.alphabet)

    def __contains__(self, other):
        """
        Check whether every coordinate of another SeqMat also occurs in this one.

        Args:
            other (SeqMat): Another SeqMat object to check for containment.

        Returns:
            bool: True if all of `other`'s coordinates are present in `self`.

        Raises:
            TypeError: If `other` is not a SeqMat.
        """
        if not isinstance(other, SeqMat):
            raise TypeError("Can only check containment with another SeqMat object.")

        other_indices = other.seqmat[other.ROW_INDS, :]
        self_indices = self.seqmat[self.ROW_INDS, :]
        return bool(np.all(np.isin(other_indices, self_indices)))

    def __eq__(self, other):
        """
        Compare two SeqMat objects by their full sequence matrices.

        Returns:
            bool: True if `other` is a SeqMat with an identical matrix.
        """
        if not isinstance(other, SeqMat):
            return False
        return bool(np.array_equal(self.seqmat, other.seqmat))

    # Defining __eq__ makes instances unhashable; stated explicitly because the object is mutable.
    __hash__ = None

    @classmethod
    def empty(cls, alphabet=None):
        """
        Create an empty SeqMat (4 rows, 0 columns).

        Args:
            alphabet (dict): Optional alphabet dictionary.

        Returns:
            SeqMat: An empty SeqMat object.
        """
        empty_seqmat = np.zeros((4, 0), dtype=np.int32)
        return cls(empty_seqmat, alphabet=alphabet)

    def __add__(self, other):
        """
        Implement ``+``: apply `other` as a mutation, or concatenate it.

        If all of `other`'s coordinates already exist in `self`, `other` is
        applied via :meth:`mutate`. Otherwise the matrices are concatenated,
        provided the resulting coordinates are monotonic.

        Args:
            other (SeqMat): Another SeqMat object to join or mutate.

        Returns:
            SeqMat: A new SeqMat object with the resulting sequence.

        Raises:
            TypeError: If `other` is not a SeqMat.
            ValueError: If concatenation would yield non-monotonic coordinates.
        """
        if not isinstance(other, SeqMat):
            raise TypeError("Can only add another SeqMat object.")

        if other in self:
            return self.mutate(other)

        combined = np.hstack((self.seqmat, other.seqmat))
        if not self._is_monotonic(combined[self.ROW_INDS]):
            raise ValueError("Resulting sequence indices are not monotonic.")
        return SeqMat(combined, alphabet=self.alphabet)

    def __iadd__(self, other):
        """
        Implement ``+=``: like ``+`` but updates this object in place.

        The object is left untouched if the operation fails.

        Args:
            other (SeqMat): Another SeqMat object to join or mutate.

        Returns:
            SeqMat: This object.

        Raises:
            TypeError: If `other` is not a SeqMat.
            ValueError: If concatenation would yield non-monotonic coordinates.
        """
        if not isinstance(other, SeqMat):
            raise TypeError("Can only add another SeqMat object.")

        if other in self:
            self.seqmat = self.mutate(other).seqmat
            return self

        # Validate before assigning so a failed += cannot corrupt self.seqmat.
        combined = np.hstack((self.seqmat, other.seqmat))
        if not self._is_monotonic(combined[self.ROW_INDS]):
            raise ValueError("Resulting sequence indices are not monotonic.")
        self.seqmat = combined
        return self

    def get_context(self, pos, context=500, padding=None):
        """
        Return the region of `context` columns on each side of coordinate `pos`.

        Without padding the region is clipped at the sequence boundaries and
        may be shorter than ``2 * context + 1``. With padding, missing columns
        are filled with the padding symbol in the sequence row and -1 in all
        other rows.

        Args:
            pos: The coordinate of interest.
            context (int): Number of columns to include on each side (default 500).
            padding (str or None): Symbol used for padding; symbols not in the
                alphabet fall back to 'N'. None disables padding.

        Returns:
            SeqMat: A new SeqMat object containing the context region.

        Raises:
            IndexError: If `pos` is not present in the sequence.
        """
        center = self._rel_index(pos)
        desired_length = 2 * context + 1
        start = center - context
        end = center + context + 1

        window = self.seqmat[:, max(start, 0):min(len(self), end)]
        extracted_length = window.shape[1]

        if padding is None or extracted_length == desired_length:
            return SeqMat(window, alphabet=self.alphabet)

        # The clipped window can only be shorter than requested: pad it out.
        pad_left = max(-start, 0)
        pad_value = self.char_to_value.get(padding, self.char_to_value['N'])

        padded = np.full((self.seqmat.shape[0], desired_length), -1, dtype=self.seqmat.dtype)
        padded[self.ROW_SEQ, :] = pad_value
        padded[:, pad_left:pad_left + extracted_length] = window
        return SeqMat(padded, alphabet=self.alphabet)

    def _rel_index(self, pos):
        """
        Translate a coordinate into a column offset of the matrix.

        Raises:
            IndexError: If `pos` is not among the non-gap coordinates.
        """
        if pos in self.indices:
            hits = np.where(self.seqmat[self.ROW_INDS, :] == pos)[0]
            if hits.size:
                return hits[0]
        raise IndexError(f"Position {pos} not found in sequence.")

    def _is_same_strand(self, other):
        """
        Check whether two SeqMat objects run in the same coordinate direction.

        Args:
            other (SeqMat): The other SeqMat object to compare.

        Returns:
            bool: True if both coordinate rows are non-decreasing, or both are
            non-increasing.
        """
        self_steps = np.diff(self.seqmat[self.ROW_INDS, :])
        other_steps = np.diff(other.seqmat[self.ROW_INDS, :])

        both_increasing = np.all(self_steps >= 0) and np.all(other_steps >= 0)
        both_decreasing = np.all(self_steps <= 0) and np.all(other_steps <= 0)
        return both_increasing or both_decreasing

    def reverse_complement(self, inplace=True):
        """
        Reverse the column order and complement every symbol.

        Args:
            inplace (bool): If True (default) update this object and return it;
                otherwise return a new SeqMat and leave this one unchanged.

        Returns:
            SeqMat: The reverse-complemented sequence.
        """
        seqmat = self.seqmat[:, ::-1].copy()
        # otypes lets np.vectorize cope with zero-column matrices.
        complement = np.vectorize(self.value_complements.get, otypes=[seqmat.dtype])
        seqmat[self.ROW_SEQ, :] = complement(seqmat[self.ROW_SEQ])

        if inplace:
            self.seqmat = seqmat
            return self

        return SeqMat(seqmat, alphabet=self.alphabet)

    @classmethod
    def from_seq(cls, seq_dict, alphabet=None):
        """
        Create a SeqMat from a dictionary describing a sequence.

        Args:
            seq_dict (dict): Must contain ``"seq"`` (string of symbols). May
                contain ``"indices"`` (defaults to 0..len-1) and
                ``"superinds"`` (defaults to all zeros).
            alphabet (dict, optional): Symbol -> complement mapping used both
                to encode the sequence and by the returned object.

        Returns:
            SeqMat: A new instance of ``cls``.

        Raises:
            AssertionError: If sequence and indices differ in length.
            ValueError: If the indices are not monotonic.
            KeyError: If the sequence contains a symbol not in the alphabet.
        """
        seq = np.array(list(seq_dict["seq"]))
        inds = seq_dict.get("indices", np.arange(len(seq), dtype=np.int32))
        superinds = seq_dict.get("superinds", np.zeros(len(seq), dtype=np.int32))
        mutmark = np.zeros_like(superinds)

        assert len(seq) == len(inds), f"Sequence length {len(seq)} must match indices length {len(inds)}"
        if not cls._is_monotonic(inds):
            raise ValueError(f"Sequence indices must be monotonic, got {inds}")

        symbols = alphabet or cls._DEFAULT_ALPHABET
        char_to_value = {c: i for i, c in enumerate(symbols)}
        seq_values = [char_to_value[nt] for nt in seq]

        seqmat = np.vstack([seq_values, inds, superinds, mutmark]).astype(np.int32)
        # Only forward the alphabet when one was given, so subclasses keep their own default.
        return cls(seqmat) if alphabet is None else cls(seqmat, alphabet=alphabet)

    @staticmethod
    def _is_monotonic(inds):
        """Return True if `inds` is non-decreasing or non-increasing (empty/single: True)."""
        if len(inds) < 2:
            return True
        pairs = zip(inds, inds[1:])
        if inds[0] > inds[-1]:
            return all(x >= y for x, y in pairs)
        return all(x <= y for x, y in pairs)

    @property
    def seq(self):
        """The sequence as a string with gap symbols removed."""
        return self.rawseq.replace('-', '')

    @property
    def rawseq(self):
        """The sequence as a string, gap symbols included."""
        return ''.join([self.value_to_char[int(code)] for code in self.seqmat[self.ROW_SEQ, :]])

    @property
    def indices(self):
        """
        Coordinates of the non-gap columns as floats.

        Inserted columns are distinguished by adding ``superind / 10`` to
        their coordinate.
        """
        keep = self.seqmat[self.ROW_SEQ, :] != self._gap_value
        return self.seqmat[self.ROW_INDS, keep] + (self.seqmat[self.ROW_SUPERINDS, keep] / 10)

    def mutate(self, mut, inplace=False):
        """
        Apply a mutation SeqMat to this sequence.

        Columns of `mut` with superind 0 overwrite the columns of this
        sequence that share their coordinates; columns with superind > 0 are
        inserted next to the column holding their coordinate.

        Args:
            mut (SeqMat): A SeqMat object describing the mutation.
            inplace (bool): If True, update this object and return it;
                otherwise return a new SeqMat.

        Returns:
            SeqMat: The mutated sequence. If some of the mutation's coordinates
            are absent from this sequence, `self` is returned unchanged.

        Raises:
            ValueError: If `mut` and this sequence run in opposite directions.
        """
        if not self._is_same_strand(mut):
            raise ValueError("Mutation and sequence are not on the same strand.")

        ref_seqmat = self.seqmat.copy()
        mut_seqmat = mut.seqmat

        # A mutation that is not fully covered by the reference is skipped.
        if not np.all(np.isin(mut_seqmat[self.ROW_INDS, :], ref_seqmat[self.ROW_INDS, :])):
            return self

        # Replacements: overwrite the reference columns that share coordinates.
        replacements = mut_seqmat[:, np.where(mut_seqmat[self.ROW_SUPERINDS, :] == 0)[0]]
        targets = np.where(np.isin(ref_seqmat[self.ROW_INDS, :], replacements[self.ROW_INDS, :]))[0]
        ref_seqmat[:, targets] = replacements[:, :]

        # Insertions: splice the new columns in next to their anchor coordinate.
        insertions = np.where(mut_seqmat[self.ROW_SUPERINDS, :] > 0)[0]
        if insertions.size > 0:
            ins_seqmat = mut_seqmat[:, insertions]
            # For descending coordinates the insertion goes before the anchor column.
            correction = 1 if self.seqmat[self.ROW_INDS, 0] > self.seqmat[self.ROW_INDS, -1] else 0
            ins_loc = np.where(ref_seqmat[self.ROW_INDS, :] == ins_seqmat[self.ROW_INDS, 0])[0][0] + 1 - correction
            ref_seqmat = np.insert(ref_seqmat, ins_loc, ins_seqmat.T, axis=1)

        if inplace:
            self.seqmat = ref_seqmat
            return self

        return SeqMat(ref_seqmat, alphabet=self.alphabet)

    def orf_seqmat(self, tis_index):
        """
        Return the open reading frame starting at coordinate `tis_index`.

        Gap columns are dropped; the result runs through the first in-frame
        stop codon (TAA/TAG/TGA), or to the end of the sequence if none exists.

        Raises:
            IndexError: If `tis_index` is not present in the sequence.
        """
        downstream = self.seqmat[:, self._rel_index(tis_index):]
        downstream = downstream[:, downstream[self.ROW_SEQ, :] != self._gap_value]
        ungapped = SeqMat(downstream, alphabet=self.alphabet)

        raw_seq = ungapped.seq
        match = self._STOP_CODON_RE.match(raw_seq)
        end_index = match.end() if match else len(raw_seq)
        return SeqMat(ungapped.seqmat[:, :end_index], alphabet=self.alphabet)

    def translate(self, tis_index):
        """
        Translate the open reading frame starting at `tis_index` into amino acids.

        Trailing nucleotides that do not fill a codon are ignored; codons that
        are not in the standard table translate to 'X'.

        Args:
            tis_index: Coordinate of the first nucleotide of the start codon.

        Returns:
            str: Translated amino acid sequence ('*' marks a stop codon).
        """
        sequence = self.orf_seqmat(tis_index).seq.upper()
        usable = len(sequence) - (len(sequence) % 3)
        return ''.join(
            self._CODON_TABLE.get(sequence[i:i + 3], 'X') for i in range(0, usable, 3)
        )

    def to_dict(self):
        """
        Return the sequence as a dict with keys 'seq', 'indices' and 'superinds'.

        NOTE(review): 'seq' and 'superinds' include gap columns while
        'indices' omits them, so the lengths differ when gaps are present.
        """
        return {'seq': self.rawseq, 'indices': self.indices, 'superinds': self.seqmat[self.ROW_SUPERINDS, :]}
|
|
410
|
+
|
|
411
|
+
class DnaSeqMat(SeqMat):
    """Marker subclass of SeqMat (by its name, intended for DNA sequences); defines no members of its own."""
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
class RnaSeqMat(SeqMat):
    """Marker subclass of SeqMat (by its name, intended for RNA sequences); defines no members of its own."""
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
class AASeqMat(SeqMat):
    """Marker subclass of SeqMat (by its name, intended for amino-acid sequences); defines no members of its own."""
|
|
421
|
+
|
|
422
|
+
|
|
423
|
+
class MutSeqMat(SeqMat):
    """
    A SeqMat describing a single mutation.

    On construction the coordinates are checked to be consecutive (adjacent
    columns differ by at most 1), the whole mutation-marker row is set to 1,
    and ``position`` is set to the smallest coordinate.
    """

    def __init__(self, seqmat, alphabet=None):
        """
        Args:
            seqmat (np.ndarray): Sequence matrix of the mutation. Its
                mutation-marker row is overwritten in place.
            alphabet (dict, optional): Symbol -> complement mapping.

        Raises:
            ValueError: If the matrix has no columns or its coordinates are
                not consecutive.
        """
        super().__init__(seqmat, alphabet)

        self._validate_mutation_indices()
        self.seqmat[self.ROW_MUTATED, :] = 1

        coords = self.seqmat[self.ROW_INDS, :]
        if len(coords) == 0:
            raise ValueError("A mutation must span at least one position.")
        self.position = min(coords)

    def _validate_mutation_indices(self):
        """
        Validate that the mutation coordinates are consecutive (increasing or decreasing).

        Raises:
            ValueError: If two adjacent coordinates differ by more than 1.
        """
        indices = self.seqmat[self.ROW_INDS, :]
        if not np.all(abs(np.diff(indices)) <= 1):
            raise ValueError(f"Mutation indices must be consecutive. Got: {indices}")

    @classmethod
    def from_mutid(cls, mid):
        """
        Build a mutation from an ID of the form ``gene:chrom:pos:ref:alt``.

        ``'-'`` (or a run of dashes) stands for an empty allele. Four shapes
        are supported:

        * single-nucleotide substitution (ref and alt are one symbol each);
        * deletion (alt is '-'): gap symbols over the reference span;
        * insertion (ref is '-'): alt symbols anchored at ``pos`` with
          superinds 1..len(alt);
        * deletion-insertion: gaps over the reference span followed by the alt
          symbols anchored at the last deleted coordinate.

        Args:
            mid (str): The mutation ID.

        Returns:
            MutSeqMat: The mutation as an instance of ``cls``.

        Raises:
            ValueError: If the ID does not have five ':'-separated fields, the
                position is not an integer, an allele is empty, or both
                alleles are '-'.
        """
        gene, chrom, i, r, a = mid.split(':')
        if not r or not a:
            raise ValueError(f"Mutation ID '{mid}' has an empty ref or alt allele.")

        # Collapse runs of dashes ('---') to a single '-'.
        if len(a) > 1 and set(a) == {'-'}:
            a = '-'
        if len(r) > 1 and set(r) == {'-'}:
            r = '-'

        i = int(i)

        if a == '-' and r == '-':
            raise ValueError(f"Mutation ID '{mid}' has neither a ref nor an alt allele.")

        if a == '-':
            # Deletion: one gap per deleted reference position.
            temp = {'seq': '-' * len(r), 'indices': np.arange(i, i + len(r), dtype=np.int32), 'superinds': [0] * len(r)}
        elif r == '-':
            # Insertion: every inserted symbol shares the anchor coordinate.
            temp = {'seq': a, 'indices': np.full(len(a), int(i)), 'superinds': np.arange(1, len(a) + 1)}
        elif len(a) == len(r) == 1:
            # Single-nucleotide substitution.
            temp = {'seq': a, 'indices': [i], 'superinds': [0]}
        else:
            # Deletion-insertion: delete the ref span, then insert alt after its last position.
            ind1 = np.concatenate(
                [np.arange(i, i + len(r), dtype=np.int32), np.full(len(a), len(r) + i - 1, dtype=np.int32)])
            ind2 = np.concatenate([np.zeros(len(r), dtype=np.int32), np.arange(1, len(a) + 1, dtype=np.int32)])
            temp = {'seq': '-' * len(r) + a, 'indices': ind1, 'superinds': ind2}

        return cls.from_seq(temp)
|
|
482
|
+
|
|
483
|
+
|
|
484
|
+
# def _validate_superinds(self):
|
|
485
|
+
# """
|
|
486
|
+
# Validates that the superinds row has a maximum value of 10.
|
|
487
|
+
# """
|
|
488
|
+
# superinds = self.seqmat[self.ROW_SUPERINDS, :]
|
|
489
|
+
# if np.max(superinds) > 10:
|
|
490
|
+
# raise ValueError(f"Superinds row must have a maximum value of 10. Got: {superinds}")
|
|
491
|
+
|
|
492
|
+
|