geney 1.3.79__py2.py3-none-any.whl → 1.4.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
geney/utils/SeqMats.py ADDED
@@ -0,0 +1,423 @@
1
+ __all__ = ['SeqMat', 'format_mut_id']
2
+
3
+ from dataclasses import dataclass, field
4
+ from collections import defaultdict
5
+ from typing import Optional, Union, List, Tuple
6
+ import numpy as np
7
+ import pandas as pd
8
+
9
def format_mut_id(text):
    """Parse a ``gene:chrom:pos:ref:alt`` mutation identifier.

    Example input: ``"TP53:17:7579472:G:A"``.

    Args:
        text: Mutation identifier made of five colon-separated fields. The
            third field must be all digits; the last two may only contain
            the characters ``A``, ``C``, ``G``, ``T``, ``N`` and ``-``.

    Returns:
        A dict with keys ``'pos'`` (int), ``'ref'`` (str) and ``'alt'`` (str),
        or ``None`` when the identifier does not match. A failed match also
        prints ``"No match"`` to stdout.
    """
    import re

    parsed = re.match(r'^[^:]+:[^:]+:(\d+):([ACGTN\-]+):([ACGTN\-]+)$', text)
    if not parsed:
        print("No match")
        return None

    # The gene and chromosome fields are matched but not captured.
    pos_text, ref_allele, alt_allele = parsed.groups()
    return {'pos': int(pos_text), 'ref': ref_allele, 'alt': alt_allele}
26
+
27
+
28
+ @dataclass(slots=True)
29
+ class SeqMat:
30
+ """Represents a genomic sequence matrix used for training."""
31
+ # Metadata fields (uncomment and/or extend as needed)
32
+ name: str = field(default="Unnamed Sequence", metadata={"description": "Name of the sequence"})
33
+ version: str = field(default="1.0", metadata={"description": "Version of the dataset"})
34
+ source: str = field(default="Unknown", metadata={"description": "Source of the sequence data"})
35
+ notes: dict = field(default_factory=dict, metadata={"description": "User-defined metadata dictionary"})
36
+
37
+ seq_array: np.ndarray = field(init=False, repr=False)
38
+ insertion_counters: dict = field(default_factory=lambda: defaultdict(int), init=False, repr=False)
39
+ rev: bool = field(default=False, init=False, repr=False)
40
+
41
+ predicted_splicing: pd.DataFrame = field(init=False, repr=False)
42
+
43
+ def __init__(
44
+ self,
45
+ nucleotides: str,
46
+ index: np.ndarray,
47
+ conservation: Optional[np.ndarray] = None,
48
+ reference_nucleotides: Optional[np.ndarray] = None,
49
+ notes: Optional[dict] = None,
50
+ source: Optional[str] = None,
51
+ rev: Optional[bool] = False,
52
+ name: Optional[str] = 'wild_type',
53
+ version: Optional[str] = 'none'
54
+
55
+ ) -> None:
56
+ self.predicted_splicing = None
57
+ nucleotides = np.array(list(nucleotides))
58
+ L = nucleotides.shape[0]
59
+ if index.shape[0] != L:
60
+ raise ValueError("Indices array length must match nucleotide sequence length.")
61
+ if conservation is not None and conservation.shape[0] != L:
62
+ raise ValueError("Conservation vector length must match sequence length.")
63
+ if reference_nucleotides is not None and reference_nucleotides.shape[0] != L:
64
+ raise ValueError("Reference nucleotide vector length must match sequence length.")
65
+
66
+ dtype = np.dtype([
67
+ ("nt", "S1"),
68
+ ("index", np.float64),
69
+ ("ref", "S1"),
70
+ ("cons", np.float32),
71
+ ("valid_mask", bool),
72
+ ])
73
+
74
+ self.seq_array = np.empty(L, dtype=dtype)
75
+ self.seq_array["nt"] = nucleotides
76
+ # Use provided reference nucleotides if available.
77
+ self.seq_array["ref"] = nucleotides if reference_nucleotides is None else reference_nucleotides
78
+ self.seq_array["index"] = index
79
+ self.seq_array["cons"] = np.nan if conservation is None else conservation
80
+ self.seq_array["valid_mask"] = self.seq_array["nt"] != b"-"
81
+ self.insertion_counters = defaultdict(int)
82
+
83
+ self.source = source if source is not None else "Unknown"
84
+ self.notes = notes if notes is not None else {}
85
+ self.name = name
86
+ self.rev = rev
87
+ self.version = version
88
+
89
+ def __len__(self) -> int:
90
+ return int(self.seq_array["valid_mask"].sum())
91
+
92
+ def __repr__(self):
93
+ return f"<SeqMat: {self.seq}>"
94
+
95
+ def __str__(self):
96
+ return self.seq
97
+
98
+ def get_metadata(self) -> dict:
99
+ """Retrieve all metadata as a dictionary."""
100
+ return {
101
+ "name": self.name,
102
+ "source": self.source,
103
+ "version": self.version,
104
+ "notes": self.notes
105
+ }
106
+
107
+ @property
108
+ def seq(self) -> str:
109
+ return self.seq_array["nt"][self.seq_array["valid_mask"]].tobytes().decode()
110
+
111
+ @property
112
+ def index(self) -> np.ndarray:
113
+ return self.seq_array["index"][self.seq_array["valid_mask"]]
114
+
115
+ @property
116
+ def conservation(self) -> np.ndarray:
117
+ return self.seq_array["cons"][self.seq_array["valid_mask"]]
118
+
119
+ @property
120
+ def max_index(self) -> float:
121
+ return self.seq_array["index"].max()
122
+
123
+ @property
124
+ def min_index(self) -> float:
125
+ return self.seq_array["index"].min()
126
+
127
+ @property
128
+ def start(self) -> float:
129
+ return self.min_index
130
+
131
+ @property
132
+ def end(self) -> float:
133
+ return self.max_index
134
+
135
+ @property
136
+ def mutated_positions(self) -> np.ndarray:
137
+ return (self.seq_array["ref"] != self.seq_array["nt"])[self.seq_array["valid_mask"]].astype(int)
138
+
139
+ def clone(self, start: Optional[int] = None, end: Optional[int] = None) -> "SeqMat":
140
+ cloned = SeqMat.__new__(SeqMat)
141
+ if start is not None and end is not None:
142
+ cloned.seq_array = self.seq_array[(self.seq_array["index"] >= start) & (self.seq_array["index"] <= end)]
143
+ else:
144
+ cloned.seq_array = self.seq_array.copy()
145
+ cloned.insertion_counters = defaultdict(int)
146
+ cloned.name = self.name
147
+ cloned.source = self.source
148
+ cloned.version = self.version
149
+ cloned.notes = self.notes.copy()
150
+ cloned.rev = self.rev
151
+ return cloned
152
+
153
+ def apply_mutation(self, pos: int, ref: str, alt: str, only_snps: bool = False):
154
+ """
155
+ Applies a mutation (SNP, substitution, insertion, or deletion) to the sequence.
156
+
157
+ Parameters:
158
+ pos (int): The reference position where the mutation should occur.
159
+ ref (str): The reference allele (use '-' for insertions).
160
+ alt (str): The alternate allele (use '-' for deletions).
161
+ only_snps (bool): If True, only SNP substitutions are allowed; indels are ignored.
162
+
163
+ Returns:
164
+ SeqMat: The mutated sequence matrix.
165
+
166
+ The method normalizes the mutation (dropping any shared prefix) and then applies:
167
+ - A SNP/substitution if both alleles are non-gap.
168
+ - An insertion if ref is '-' (after normalization).
169
+ - A deletion if alt is '-' (after normalization).
170
+
171
+ For insertions, new rows are added with fractional indices computed from an insertion counter.
172
+ For deletions, the corresponding rows are removed.
173
+ """
174
+ return_to_rc = False
175
+ if self.rev:
176
+ return_to_rc = True
177
+ self.reverse_complement()
178
+
179
+ # Normalize shared prefix (similar to left-alignment in VCFs)
180
+ if ref and alt and ref[0] == alt[0]:
181
+ pos += 1
182
+ ref = ref[1:] or "-"
183
+ alt = alt[1:] or "-"
184
+
185
+ # Case 1: SNP or multi-base substitution
186
+ if ref != "-" and alt != "-":
187
+ if len(ref) == len(alt):
188
+ # print('Inserting SNP')
189
+ pos_idx = np.searchsorted(self.seq_array["index"], pos)
190
+ end_idx = pos_idx + len(ref)
191
+ if end_idx > len(self.seq_array):
192
+ raise ValueError(f"Substitution range exceeds sequence length at position {pos}.")
193
+ segment = self.seq_array["ref"][pos_idx:end_idx].tobytes().decode()
194
+ if segment != ref:
195
+ raise ValueError(f"Reference mismatch at position {pos}: expected '{ref}', found '{segment}'")
196
+ for i, nt in enumerate(alt):
197
+ self.seq_array["nt"][pos_idx + i] = nt.encode()
198
+ else:
199
+ raise ValueError("Substitution mutations must have alleles of equal length.")
200
+
201
+ # Case 2: Insertion (ref is '-' means nothing was present, and we need to add bases)
202
+ elif ref == "-" and alt != "-":
203
+ if only_snps:
204
+ return self # Skip if indels are not allowed.
205
+ pos_idx = np.searchsorted(self.seq_array["index"], pos)
206
+ insertion_count = self.insertion_counters[pos]
207
+ eps = 1e-6
208
+ new_rows = []
209
+ for i, nt in enumerate(alt):
210
+ new_index = pos + (insertion_count + i + 1) * eps
211
+ new_row = (nt.encode(), new_index, b"-", np.float32(np.nan), True)
212
+ new_rows.append(new_row)
213
+ rows = list(self.seq_array)
214
+ rows.extend(new_rows)
215
+ new_seq_array = np.array(rows, dtype=self.seq_array.dtype)
216
+ new_seq_array.sort(order="index")
217
+ self.seq_array = new_seq_array
218
+ self.insertion_counters[pos] += len(alt)
219
+
220
+ # Case 3: Deletion (alt is '-' means bases are to be removed)
221
+ elif alt == "-" and ref != "-":
222
+ if only_snps:
223
+ return self # Skip if indels are not allowed.
224
+ pos_idx = np.searchsorted(self.seq_array["index"], pos)
225
+ end_idx = pos_idx + len(ref)
226
+ if end_idx > len(self.seq_array):
227
+ raise ValueError(f"Deletion range exceeds sequence length at position {pos}.")
228
+ segment = self.seq_array["ref"][pos_idx:end_idx].tobytes().decode()
229
+ if segment != ref:
230
+ raise ValueError(
231
+ f"Reference mismatch for deletion at position {pos}: expected '{ref}', found '{segment}'")
232
+ self.seq_array = np.delete(self.seq_array, np.s_[pos_idx:end_idx])
233
+ else:
234
+ raise ValueError("Unsupported mutation type. Provide valid ref and alt values.")
235
+
236
+ self.seq_array["valid_mask"] = self.seq_array["nt"] != b"-"
237
+ if return_to_rc:
238
+ self.reverse_complement()
239
+
240
+ return self
241
+
242
+ def __getitem__(self, key: Union[int, slice]) -> np.ndarray:
243
+ if isinstance(key, int):
244
+ pos_idx = np.where(self.seq_array["index"] == key)[0]
245
+ if pos_idx.size == 0:
246
+ raise IndexError(f"Position {key} not found in sequence.")
247
+ return self.seq_array[pos_idx[0]]
248
+ elif isinstance(key, slice):
249
+ start, stop = key.start, key.stop
250
+ if start is None:
251
+ start = self.seq_array["index"].min()
252
+ if stop is None:
253
+ stop = self.seq_array["index"].max()
254
+ return self.seq_array[(self.seq_array["index"] >= start) & (self.seq_array["index"] <= stop)]
255
+ else:
256
+ raise TypeError("Indexing must be an integer or a slice.")
257
+
258
+ def complement(self) -> "SeqMat":
259
+ comp_dict = {b"A": b"T", b"T": b"A", b"C": b"G", b"G": b"C", b"-": b"-", b"N": b"N"}
260
+ comp_seq = np.array([comp_dict[nt] for nt in self.seq_array["nt"]], dtype="S1")
261
+ new_instance = self.clone()
262
+ new_instance.seq_array["nt"] = comp_seq
263
+ return new_instance
264
+
265
+ def reverse_complement(self) -> "SeqMat":
266
+ rev_comp_seq = self.complement().seq_array[::-1]
267
+ self.seq_array = rev_comp_seq.copy()
268
+ self.rev = not self.rev
269
+ return self
270
+
271
+ # def splice_out(self, introns: List[Tuple[int, int]]) -> "SeqMat":
272
+ # """
273
+ # Splices out regions from the sequence corresponding to the given intron boundaries.
274
+ #
275
+ # Args:
276
+ # introns (List[Tuple[int, int]]): List of (start, end) intron boundaries to remove.
277
+ # Coordinates should match the 'index' field.
278
+ #
279
+ # Returns:
280
+ # SeqMat: A new instance with the intron regions removed.
281
+ # """
282
+ # mask = np.ones(len(self.seq_array), dtype=bool)
283
+ #
284
+ # for start, end in introns:
285
+ # mask &= ~((self.seq_array["index"] >= start) & (self.seq_array["index"] <= end))
286
+ #
287
+ # new_instance = self.clone()
288
+ # new_instance.seq_array = self.seq_array[mask].copy()
289
+ # return new_instance
290
+
291
+ def cut_out(self, introns: List[Tuple[int, int]]) -> "SeqMat":
292
+ """
293
+ Splices out regions from the sequence corresponding to the given intron boundaries.
294
+
295
+ Handles reverse-complemented sequences by interpreting introns in reverse as well.
296
+
297
+ Args:
298
+ introns (List[Tuple[int, int]]): List of (start, end) intron boundaries.
299
+ These are always genomic (absolute) coordinates,
300
+ regardless of strand direction.
301
+
302
+ Returns:
303
+ SeqMat: A new instance with the intron regions removed.
304
+ """
305
+ # In reverse orientation, flip intron direction for comparison
306
+ if self.rev:
307
+ introns = [(end, start) if start > end else (start, end) for (start, end) in introns]
308
+
309
+ mask = np.ones(len(self.seq_array), dtype=bool)
310
+
311
+ for start, end in introns:
312
+ lo, hi = min(start, end) + 1, max(start, end) - 1
313
+ mask &= ~((self.seq_array["index"] >= lo) & (self.seq_array["index"] <= hi))
314
+
315
+ new_instance = self.clone()
316
+ new_instance.seq_array = self.seq_array[mask].copy()
317
+ return new_instance
318
+
319
+ def open_reading_frame(self, tis: int) -> "SeqMat":
320
+ """
321
+ Extracts the open reading frame starting from the translation initiation site (TIS)
322
+ until the first in-frame stop codon.
323
+
324
+ Args:
325
+ tis (int): Genomic position of the translation initiation site (start codon).
326
+
327
+ Returns:
328
+ SeqMat: A new SeqMat instance containing the ORF (from TIS to stop codon inclusive).
329
+ """
330
+ if tis not in self.seq_array["index"]:
331
+ print(f"Warning: TIS position {tis} not found, returning default.")
332
+ return self.clone(start=0, end=3)
333
+
334
+ # Extract nucleotide sequence and indices starting from TIS
335
+ mask = self.seq_array["index"] >= tis if not self.rev else self.seq_array["index"] <= tis
336
+ coding_part = self.seq_array[mask]
337
+ coding_seq = coding_part["nt"].tobytes().decode()
338
+
339
+ # Read codons in-frame
340
+ for i in range(0, len(coding_seq) - 2, 3):
341
+ codon = coding_seq[i:i + 3]
342
+ if codon in {"TAA", "TAG", "TGA"}:
343
+ # Determine index range for this ORF
344
+ start = coding_part["index"][0]
345
+ stop = coding_part["index"][i + 2]
346
+ lo, hi = sorted((start, stop))
347
+ return self.clone(start=lo, end=hi)
348
+
349
+ raise ValueError("No in-frame stop codon found after the TIS.")
350
+
351
+ def predict_splicing(self, position: int, engine='spliceai', context=7500, inplace=False): #, reference_donors=None, reference_acceptors=None) -> pd.DataFrame:
352
+ """
353
+ Predict splicing probabilities at a given position using the specified engine.
354
+
355
+ Args:
356
+ position (int): The genomic position to predict splicing probabilities for.
357
+ engine (str): The prediction engine to use. Supported: 'spliceai', 'pangolin'.
358
+ context (int): The length of the target central region (default: 7500).
359
+ format (str): Output format for the splicing engine results.
360
+
361
+ Returns:
362
+ pd.DataFrame: A DataFrame containing:
363
+ - position: The genomic position
364
+ - donor_prob: Probability of being a donor splice site
365
+ - acceptor_prob: Probability of being an acceptor splice site
366
+ - nucleotides: The nucleotide sequence at that position
367
+
368
+ Raises:
369
+ ValueError: If an unsupported engine is provided.
370
+ IndexError: If the position is not found in the sequence.
371
+ """
372
+ # Retrieve extended context (includes flanks) around the position.
373
+ # seq, indices = self.get_context(position, context=context, padding='N')
374
+ target = self.clone(position - context, position + context)
375
+ # print(len(target.seq))
376
+ seq, indices = target.seq, target.index
377
+ # print(len(seq))
378
+ # rel_pos = np.where(indices == position)[0][0]
379
+ # print(rel_pos)
380
+ rel_pos = np.abs(indices - position).argmin()
381
+ # print(rel_pos, len(seq))
382
+ left_missing, right_missing = max(0, context - rel_pos), max(0, context - (len(seq) - rel_pos))
383
+ # print(left_missing, right_missing)
384
+ if left_missing > 0 or right_missing > 0:
385
+ step = -1 if self.rev else 1
386
+
387
+ if left_missing > 0:
388
+ left_pad = np.arange(indices[0] - step * left_missing, indices[0], step)
389
+ else:
390
+ left_pad = np.array([], dtype=indices.dtype)
391
+
392
+ if right_missing > 0:
393
+ right_pad = np.arange(indices[-1] + step, indices[-1] + step * (right_missing + 1), step)
394
+ else:
395
+ right_pad = np.array([], dtype=indices.dtype)
396
+
397
+ seq = 'N' * left_missing + seq + 'N' * right_missing
398
+ indices = np.concatenate([left_pad, indices, right_pad])
399
+
400
+ # Run the splicing prediction engine (function assumed to be defined externally)
401
+ from .splicing_utils import run_splicing_engine
402
+ donor_probs, acceptor_probs = run_splicing_engine(seq, engine)
403
+ # Trim off the fixed flanks before returning results.
404
+ seq = seq[5000:-5000]
405
+ indices = indices[5000:-5000]
406
+ df = pd.DataFrame({
407
+ 'position': indices,
408
+ 'donor_prob': donor_probs,
409
+ 'acceptor_prob': acceptor_probs,
410
+ 'nucleotides': list(seq)
411
+ }).set_index('position').round(3)
412
+ # if reference_donors is not None:
413
+ # df['ref_donor'] = df.index.isin(reference_donors).astype(int)
414
+ # if reference_acceptors is not None:
415
+ # df['ref_acceptor'] = df.index.isin(reference_acceptors).astype(int)
416
+
417
+ df.attrs['name'] = self.name
418
+ if inplace:
419
+ self.predicted_splicing = df
420
+ return self
421
+ else:
422
+ return df
423
+
@@ -0,0 +1,55 @@
1
+ __all__ = ['TranscriptLibrary']
2
+
3
+ from .splicing_utils import adjoin_splicing_outcomes
4
+
5
+
6
class TranscriptLibrary:
    """Collection of transcript variants derived from one reference transcript.

    Holds an untouched copy (``ref``), a copy with every mutation applied
    (``event``) and, when more than one mutation is supplied, one copy per
    single mutation (``mut1``, ``mut2``, ...). Each variant is reachable both
    as an attribute and via ``library[key]``.
    """

    def __init__(self, reference_transcript, mutations):
        """Clone the reference and apply the mutations to the clones.

        Args:
            reference_transcript: Object providing ``clone()``; its clones must
                expose ``pre_mrna.apply_mutation(pos, ref, alt)``.
            mutations: Sized iterable of ``(pos, ref, alt)`` tuples.
        """
        wild_type = reference_transcript.clone()
        combined = reference_transcript.clone()
        self.ref = wild_type
        self.event = combined
        self._transcripts = {'ref': wild_type, 'event': combined}

        for number, (pos, ref, alt) in enumerate(mutations, start=1):
            # Every mutation accumulates on the combined 'event' transcript.
            combined.pre_mrna.apply_mutation(pos, ref, alt)
            if len(mutations) <= 1:
                # A lone mutation would just duplicate 'event'.
                continue
            label = f'mut{number}'
            single = reference_transcript.clone()
            single.pre_mrna.apply_mutation(pos, ref, alt)
            self._transcripts[label] = single
            setattr(self, label, single)

    def predict_splicing(self, pos, engine='spliceai', inplace=False):
        """Run splicing prediction on every transcript and combine the results.

        Args:
            pos: Position forwarded to each transcript's
                ``pre_mrna.predict_splicing``.
            engine: Engine name forwarded unchanged.
            inplace: If True return ``self``; otherwise return the combined
                result of ``adjoin_splicing_outcomes``.
        """
        predictions = {}
        for label, transcript in self._transcripts.items():
            predictions[label] = transcript.pre_mrna.predict_splicing(pos, engine=engine, inplace=True)
        self.splicing_predictions = predictions

        per_transcript = {
            label: transcript.pre_mrna.predicted_splicing
            for label, transcript in self._transcripts.items()
        }
        self.splicing_results = adjoin_splicing_outcomes(per_transcript, self.ref)

        return self if inplace else self.splicing_results

    def get_event_columns(self, event_name, sites=('donors', 'acceptors')):
        """
        Select the ``(site, metric)`` columns of ``splicing_results`` for one
        transcript label (e.g., 'event', 'mut1', etc.).

        Raises:
            ValueError: If ``predict_splicing()`` has not been run yet.
        """
        if not hasattr(self, 'splicing_results'):
            raise ValueError("You must run predict_splicing() first.")

        wanted = []
        for site in sites:
            for metric in (f'{event_name}_prob', 'ref_prob', 'annotated'):
                wanted.append((site, metric))
        return self.splicing_results.loc[:, wanted]

    def __getitem__(self, key):
        """Return the transcript stored under ``key`` ('ref', 'event', 'mutN')."""
        return self._transcripts[key]

    def __iter__(self):
        """Iterate over ``(label, transcript)`` pairs."""
        return iter(self._transcripts.items())
@@ -0,0 +1,20 @@
1
+ import importlib
2
+ import os
3
+ import pathlib
4
+
5
__all__ = []  # Filled below with every name re-exported at package level.

# Directory containing this package's utility modules.
_package_dir = pathlib.Path(__file__).parent

# Import each sibling ``*.py`` module and lift the names it declares in its
# own ``__all__`` into this package's namespace.
for file in os.listdir(_package_dir):
    if file == "__init__.py" or not file.endswith(".py"):
        continue

    module_name = file[:-len(".py")]
    module_path = f"{__name__}.{module_name}"
    module = importlib.import_module(module_path)

    # Modules without ``__all__`` contribute nothing.
    for name in getattr(module, "__all__", ()):
        globals()[name] = getattr(module, name)
        __all__.append(name)
@@ -0,0 +1,104 @@
1
+ __all__ = ['MutationalEvent', 'Mutation']
2
+
3
+ import re
4
+ from typing import List, Optional
5
+ import pandas as pd
6
+ import numpy as np
7
+
8
class Mutation:
    """A single variant described by gene, chromosome, position and alleles."""

    def __init__(self, gene: str, chrom: str, pos: int, ref: str, alt: str):
        """Store the fields; ``pos`` is coerced to int, ``mut_type`` is inferred.

        ``ref`` / ``alt`` use '-' to denote an empty allele.
        """
        self.gene = gene
        self.chrom = chrom
        self.pos = int(pos)
        self.ref = ref
        self.alt = alt
        self.mut_type = self._infer_type()

    def _infer_type(self):
        """Return 'snp' for a single-base substitution, otherwise 'indel'."""
        if self.ref == '-' or self.alt == '-':
            return 'indel'
        elif len(self.ref) == len(self.alt) == 1:
            return 'snp'
        else:
            return 'indel'

    def _span(self) -> int:
        """Number of positions covered: the longer allele, at least 1."""
        ref_len = len(self.ref) if self.ref != '-' else 0
        alt_len = len(self.alt) if self.alt != '-' else 0
        return max(ref_len, alt_len, 1)

    def overlaps_with(self, other: 'Mutation') -> bool:
        """Return True if the half-open intervals ``[pos, pos + span)`` intersect.

        Each mutation contributes its OWN span, so the result is symmetric:
        ``a.overlaps_with(b) == b.overlaps_with(a)``. Only positions are
        compared; gene and chromosome are not checked.
        """
        return not (
            self.pos + self._span() <= other.pos
            or other.pos + other._span() <= self.pos
        )

    def to_dict(self):
        """Return the mutation's fields as a plain dict."""
        return {
            'gene': self.gene,
            'chrom': self.chrom,
            'pos': self.pos,
            'ref': self.ref,
            'alt': self.alt,
            'type': self.mut_type
        }

    def __repr__(self):
        return f"{self.gene}:{self.chrom}:{self.pos}:{self.ref}:{self.alt}"
43
+
44
+
45
class MutationalEvent:
    """One or more mutations of a single gene parsed from an identifier string.

    The identifier is a list of ``gene:chrom:pos:ref:alt`` entries separated
    by ``|`` or ``,``.
    """

    def __init__(self, mut_id: str):
        """Parse ``mut_id`` and check that all entries name the same gene.

        Raises:
            ValueError: If an entry is malformed or more than one gene appears.
        """
        self.raw = mut_id
        self.mutations: List[Mutation] = self._parse_mutations(mut_id)
        self.gene = self._verify_same_gene()

    def __len__(self):
        """Number of mutations in the event."""
        return len(self.mutations)

    def _parse_mutations(self, mut_id: str) -> List[Mutation]:
        """Split the identifier and build one ``Mutation`` per entry."""
        parsed = []
        for entry in re.split(r'[|,]', mut_id):
            fields = re.match(r'^([^:]+):([^:]+):(\d+):([ACGTN\-]+):([ACGTN\-]+)$', entry)
            if fields is None:
                raise ValueError(f"Invalid format for mutation: {entry}")
            gene, chrom, pos, ref, alt = fields.groups()
            parsed.append(Mutation(gene, chrom, pos, ref, alt))
        return parsed

    def _verify_same_gene(self) -> Optional[str]:
        """Return the single gene shared by all mutations, else raise ValueError."""
        genes = {m.gene for m in self.mutations}
        if len(genes) == 1:
            return genes.pop()
        raise ValueError(f"Multiple genes found in event: {genes}")

    def compatible(self) -> bool:
        """Return True when no ordered pair of distinct mutations overlaps."""
        clash = any(
            first.overlaps_with(second)
            for i, first in enumerate(self.mutations)
            for j, second in enumerate(self.mutations)
            if i != j
        )
        return not clash

    def to_dataframe(self) -> pd.DataFrame:
        """Return one row per mutation, built from ``Mutation.to_dict()``."""
        records = [m.to_dict() for m in self.mutations]
        return pd.DataFrame(records)

    def __repr__(self):
        described = [f"{m.pos}:{m.ref}>{m.alt}" for m in self.mutations]
        muts = ', '.join(described)
        return f"MutationalEvent({self.gene} -> [{muts}])"

    @property
    def positions(self):
        """Positions of all mutations, in input order."""
        return [m.pos for m in self.mutations]

    @property
    def position(self):
        """Mean of the mutation positions, truncated to int."""
        centre = np.mean(self.positions)
        return int(centre)

    @property
    def types(self):
        """Inferred type ('snp' or 'indel') of each mutation, in input order."""
        return [m.mut_type for m in self.mutations]

    def mutation_args(self):
        """
        Returns a list of (pos, ref, alt) tuples, one per mutation, for use with `apply_mutation`.
        """
        args = []
        for m in self.mutations:
            args.append((m.pos, m.ref, m.alt))
        return args

    def __iter__(self):
        """Iterate over the ``(pos, ref, alt)`` tuples of ``mutation_args()``."""
        return iter(self.mutation_args())