geney-1.3.79-py2.py3-none-any.whl → geney-1.4.0-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of geney might be problematic. Click here for more details.

geney/Gene.py CHANGED
@@ -1,8 +1,9 @@
1
1
  import copy
2
- import random
2
+ # import random
3
+ from . import config
3
4
  from typing import Any, Dict, List, Tuple, Optional, Iterator, Union, TYPE_CHECKING
4
5
  from collections import Counter
5
- from . import unload_pickle, config
6
+ from .utils import unload_pickle
6
7
  from .Transcript import Transcript
7
8
 
8
9
  class Gene:
@@ -17,7 +18,7 @@ class Gene:
17
18
  chrm (str): The chromosome on which the gene resides.
18
19
  """
19
20
 
20
- def __init__(self, gene_name, gene_id, rev, chrm, transcripts, organism='hg38'):
21
+ def __init__(self, gene_name, gene_id, rev, chrm, transcripts={}, organism='hg38'):
21
22
  """
22
23
  Initialize a Gene instance by loading gene information from stored pickled files.
23
24
 
@@ -30,8 +31,6 @@ class Gene:
30
31
  FileNotFoundError: If no files for the specified gene are found.
31
32
  AssertionError: If required attributes are missing after loading.
32
33
  """
33
-
34
- def __init__(self, gene_name, gene_id, rev, chrm, transcripts, organism='hg38'):
35
34
  self.gene_name = gene_name
36
35
  self.gene_id = gene_id
37
36
  self.rev = rev
@@ -145,12 +144,12 @@ class Gene:
145
144
  if tid is None:
146
145
  tid = self.primary_transcript
147
146
 
148
- if tid is None:
149
- tid = random.choice(list(self.transcripts.keys()))
150
- return None #Transcript()
147
+ # if tid is None:
148
+ # tid = random.choice(list(self.transcripts.keys()))
149
+ # return None #Transcript()
151
150
 
152
- if tid not in self.transcripts:
153
- return None
151
+ # if tid not in self.transcripts:
152
+ # return None
154
153
  # raise AttributeError(f"Transcript '{tid}' not found in gene '{self.gene_name}'.")
155
154
 
156
155
  return Transcript(self.transcripts[tid], organism=self.organism)
geney/Oncosplice.py ADDED
@@ -0,0 +1,400 @@
1
+ import re
2
+ import pandas as pd
3
+ import numpy as np
4
+ from Bio import pairwise2
5
+ import matplotlib.pyplot as plt
6
+ from matplotlib.patches import Rectangle
7
+ import seaborn as sns # Optional: uncomment if you wish to set a seaborn theme
8
+
9
class Oncosplice:
    """Score how much a variant protein disturbs conserved regions of a reference protein.

    The reference and variant sequences are globally aligned, the insertions and
    deletions (mismatches count as deletions) are located, and the transformed
    conservation values under those events are smoothed. The maximum of the
    smoothed signal is the score; its rank within the transformed conservation
    vector is the percentile.
    """

    def __init__(self, reference_protein: str, variant_protein: str, conservation_vector: np.ndarray,
                 window_length: int = 13):
        """
        Store the inputs, transform the conservation vector and run the analysis.

        Args:
            reference_protein (str): Reference protein sequence.
            variant_protein (str): Variant protein sequence.
            conservation_vector (np.ndarray): 1D array of conservation scores for the reference protein.
            window_length (int, optional): Window length for smoothing calculations. Defaults to 13.
        """
        self.reference_protein = reference_protein
        self.variant_protein = variant_protein
        # Note: the attribute holds the *transformed* vector, not the raw scores passed in.
        self.conservation_vector = self.transform_conservation_vector(
            conservation_vector, window=window_length)
        self.window_length = window_length

        # Filled in by run_analysis().
        self.alignment = None
        self.deletions = None
        self.insertions = None
        self.modified_positions = None
        self.smoothed_conservation = None
        self.score = None
        self.percentile = None

        self.run_analysis()

    def run_analysis(self) -> None:
        """
        Run the alignment and conservation analysis and populate the result attributes.

        Sets ``alignment``, ``deletions``, ``insertions``, ``modified_positions``,
        ``smoothed_conservation``, ``score`` and ``percentile``.
        """
        self.alignment = self.get_logical_alignment(self.reference_protein, self.variant_protein)
        self.deletions, self.insertions = self.find_indels_with_mismatches_as_deletions(
            self.alignment.seqA, self.alignment.seqB)
        self.modified_positions = self.find_modified_positions(
            len(self.reference_protein), self.deletions, self.insertions)

        # Moving average of the conservation values that sit under a modification.
        self.smoothed_conservation = self._centered_convolve(
            self.conservation_vector * self.modified_positions,
            np.ones(self.window_length)) / self.window_length

        peak = max(self.smoothed_conservation)
        # searchsorted-based rank: never raises StopIteration when the peak is
        # (by rounding) above every element of the conservation vector.
        self.percentile = self._percentile_rank(self.conservation_vector, peak)
        self.score = peak

    @staticmethod
    def _centered_convolve(signal: np.ndarray, kernel: np.ndarray) -> np.ndarray:
        """
        Convolve ``signal`` with ``kernel`` and return exactly ``len(signal)`` samples.

        Equals ``np.convolve(signal, kernel, mode='same')`` when the signal is at
        least as long as the kernel. When the kernel is longer, ``mode='same'``
        would return ``len(kernel)`` samples; this helper still returns the
        centred ``len(signal)`` samples.
        """
        full = np.convolve(signal, kernel, mode='full')
        offset = (len(kernel) - 1) // 2
        return full[offset:offset + len(signal)]

    @staticmethod
    def _percentile_rank(values: np.ndarray, threshold: float) -> float:
        """
        Return the fraction of ``values`` that are strictly below ``threshold``.

        This is the index of the first sorted element ``>= threshold`` divided by
        the number of elements; it is 1.0 when no element reaches the threshold.

        Raises:
            ValueError: If ``values`` is empty.
        """
        ordered = np.sort(np.asarray(values, dtype=float))
        if ordered.size == 0:
            raise ValueError("Cannot compute a percentile from an empty vector.")
        return float(np.searchsorted(ordered, threshold, side='left')) / ordered.size

    @staticmethod
    def find_continuous_gaps(sequence: str) -> list[tuple[int, int]]:
        """
        Return the ``(start, end)`` span (end exclusive) of every run of ``-`` in ``sequence``.
        """
        return [gap.span() for gap in re.finditer(r'-+', sequence)]

    @staticmethod
    def build_position_mapper(sequence: str) -> dict[int, int]:
        """
        Map each alignment index to a residue count in the ungapped sequence.

        A residue column maps to the number of residues up to and including it
        (so the first residue maps to 1); a gap column maps to the number of
        residues seen before it.
        """
        # NOTE(review): residue columns are therefore 1-based while the arrays
        # indexed with these values elsewhere are 0-based -- confirm this is intended.
        mapper = {}
        residues_seen = 0
        for index, symbol in enumerate(sequence):
            if symbol != '-':
                residues_seen += 1
            mapper[index] = residues_seen
        return mapper

    def get_logical_alignment(self, ref_prot: str, var_prot: str):
        """
        Globally align two protein sequences and return the alignment with the fewest gap columns.

        If the variant is empty, the first character of the reference is used in its place.

        Returns:
            Alignment object (with attributes seqA and seqB) from pairwise2.
        """
        if var_prot == '':
            print("Variant protein is empty; using first character of reference as heuristic...")
            var_prot = ref_prot[0]

        alignments = pairwise2.align.globalms(ref_prot, var_prot, 1, -1, -3, 0, penalize_end_gaps=(True, True))
        if not alignments:
            # NOTE(review): an empty result is only reported here; the indexing
            # below then raises IndexError, exactly as before.
            print("No alignment found for:", ref_prot, var_prot)

        if len(alignments) > 1:
            def total_gap_length(candidate) -> int:
                spans = self.find_continuous_gaps(candidate.seqA) + self.find_continuous_gaps(candidate.seqB)
                return sum(end - start for start, end in spans)

            # min() keeps the first alignment among ties.
            return min(alignments, key=total_gap_length)

        return alignments[0]

    def find_indels_with_mismatches_as_deletions(self, seqA: str, seqB: str) -> tuple[dict[int, str], dict[int, str]]:
        """
        Identify insertions and deletions in two aligned sequences, treating mismatches as deletions.

        Args:
            seqA (str): Aligned reference sequence (may contain ``-``).
            seqB (str): Aligned variant sequence (may contain ``-``).

        Returns:
            tuple: ``(deletions, insertions)``. Each dictionary maps a position from
            ``build_position_mapper`` to the affected residues.

        Raises:
            ValueError: If the two aligned sequences differ in length.
        """
        if len(seqA) != len(seqB):
            raise ValueError("Sequences must be of the same length")

        ref_index = self.build_position_mapper(seqA)
        var_index = self.build_position_mapper(seqB)

        # A mismatch (two different residues, neither a gap) is masked as a gap
        # in the variant so that it is reported as a deletion below.
        masked_var = ''.join(
            '-' if (a != b and a != '-' and b != '-') else b
            for a, b in zip(seqA, seqB))

        insertions = {
            var_index[start]: masked_var[start:end].replace('-', '')
            for start, end in self.find_continuous_gaps(seqA) if seqB[start:end].strip('-')
        }
        deletions = {
            ref_index[start]: seqA[start:end].replace('-', '')
            for start, end in self.find_continuous_gaps(masked_var) if seqA[start:end].strip('-')
        }
        return deletions, insertions

    @staticmethod
    def parabolic_window(window_size: int) -> np.ndarray:
        """
        Return a parabolic window of ``window_size`` samples, 0.1 at the edges and 1.0 at the centre.
        """
        x = np.linspace(-1, 1, window_size)
        return 0.9 * (1 - x ** 2) + 0.1

    @staticmethod
    def transform_conservation_vector(conservation_vector: np.ndarray, window: int = 13,
                                      factor: float = 4) -> np.ndarray:
        """
        Smooth a 1D conservation vector with a parabolic window and apply ``exp(-factor * x)``.

        The result always has the length of ``conservation_vector``, including
        when the vector is shorter than ``window``.
        """
        kernel = Oncosplice.parabolic_window(window)
        smoothed = Oncosplice._centered_convolve(conservation_vector, kernel) / np.sum(kernel)
        return np.exp(-smoothed * factor)

    @staticmethod
    def find_modified_positions(sequence_length: int, deletions: dict[int, str], insertions: dict[int, str],
                                reach_limit: int = 16) -> np.ndarray:
        """
        Return a 0/1 float mask of positions that lie within a deletion or near an insertion.

        An insertion marks ``min(len(insertion) // 2, reach_limit)`` positions on
        each side of its position, so a single-residue insertion marks nothing.
        """
        modified = np.zeros(sequence_length, dtype=float)
        for pos, deleted in deletions.items():
            modified[pos:pos + len(deleted)] = 1

        for pos, inserted in insertions.items():
            reach = min(len(inserted) // 2, reach_limit)
            modified[max(0, pos - reach):min(sequence_length, pos + reach)] = 1

        return modified

    @staticmethod
    def moving_average_conv(vector: np.ndarray, window_size: int, factor: float = 1) -> np.ndarray:
        """
        Compute the moving average of ``vector`` over ``window_size`` samples.

        Raises:
            TypeError: If ``vector`` is not a list, tuple or numpy array.
            ValueError: If ``window_size`` is not a positive integer, exceeds the
                vector length, or ``factor`` is zero.
        """
        if not isinstance(vector, (list, tuple, np.ndarray)):
            raise TypeError("Input vector must be a list, tuple, or numpy array.")
        if not isinstance(window_size, int) or window_size <= 0:
            raise ValueError("window_size must be a positive integer.")
        if len(vector) < window_size:
            raise ValueError("window_size must not exceed the length of the vector.")
        if factor == 0:
            raise ValueError("factor must be non-zero.")
        # NOTE(review): `factor` is validated but does not enter the result -- confirm intent.
        return np.convolve(vector, np.ones(window_size), mode='same') / window_size

    @staticmethod
    def calculate_penalty(domains: dict[int, str], cons_scores: np.ndarray, window: int,
                          is_insertion: bool = False) -> np.ndarray:
        """
        Calculate a per-position penalty for mutations based on conservation scores.

        Args:
            domains: Mapping of position to the inserted or deleted residues.
            cons_scores: Conservation score per position.
            window: Reference length for the weight and the cap on insertion reach.
            is_insertion: Treat ``domains`` as insertions (penalise both sides of
                the position) instead of deletions (penalise the deleted span).

        Returns:
            np.ndarray: Array of ``len(cons_scores)`` penalties, zero where nothing is affected.
        """
        penalty = np.zeros(len(cons_scores))
        for pos, seq in domains.items():
            mutation_length = len(seq)
            weight = max(1.0, mutation_length / window)
            if is_insertion:
                reach = min(window // 2, mutation_length // 2)
                # Clamp at 0: a negative start would be read as an index from the
                # end of the array and silently drop the penalty.
                lo, hi = max(0, pos - reach), pos + reach
            else:
                lo, hi = pos, pos + mutation_length
            penalty[lo:hi] = weight * cons_scores[lo:hi]
        return penalty

    def oncosplice_score(self) -> tuple[float, float]:
        """
        Return the computed Oncosplice score and its percentile.
        """
        return self.score, self.percentile

    # ----------------- Visualization Methods -----------------

    def plot_alignment(self) -> None:
        """
        Draw the aligned reference and variant sequences, one character per column.

        Columns where the two sequences differ (mismatch or gap) are drawn in red.
        """
        aligned_ref = self.alignment.seqA
        aligned_var = self.alignment.seqB
        n = len(aligned_ref)

        fig, ax = plt.subplots(figsize=(max(12, n * 0.5), 3))
        ax.axis("off")

        x_start, x_end = 0.01, 0.99
        char_step = (x_end - x_start) / n
        y_ref, y_var = 0.65, 0.35

        ax.text(0.0, y_ref, "Reference:", fontsize=12, fontfamily="monospace", ha="right", va="center")
        ax.text(0.0, y_var, "Variant: ", fontsize=12, fontfamily="monospace", ha="right", va="center")

        for column, (char_ref, char_var) in enumerate(zip(aligned_ref, aligned_var)):
            x = x_start + column * char_step
            color = "black" if char_ref == char_var else "red"
            ax.text(x, y_ref, char_ref, fontsize=12, fontfamily="monospace",
                    ha="center", va="center", color=color)
            ax.text(x, y_var, char_var, fontsize=12, fontfamily="monospace",
                    ha="center", va="center", color=color)

        plt.title("Protein Sequence Alignment (differences in red)")
        plt.show()

    def plot_indels(self) -> None:
        """
        Plot a step signal along the reference protein: 1 over deletions, 2 around insertions.
        """
        length = len(self.reference_protein)
        positions = np.arange(length)
        indel_signal = np.zeros(length)
        for pos, deleted in self.deletions.items():
            indel_signal[pos:pos + len(deleted)] = 1
        for pos, inserted in self.insertions.items():
            reach = min(len(inserted) // 2, 16)
            indel_signal[max(0, pos - reach):min(length, pos + reach)] = 2

        plt.figure(figsize=(10, 2))
        plt.step(positions, indel_signal, where="post", marker="o")
        plt.xlabel("Position")
        plt.ylabel("Indel Signal\n(1 = Deletion, 2 = Insertion)")
        plt.title("Insertions and Deletions Along Protein")
        plt.ylim(-0.5, 2.5)
        plt.show()

    def plot_combined_analysis(self, gene: str = '', domain_annotations: list[tuple[int, int, str]] = None) -> None:
        """
        Draw one figure combining conservation curves, mutation markers and optional domains.

        The figure shows:
            - Two conservation curves obtained by re-transforming the stored vector with windows 76 and 6.
            - The stored conservation vector, shifted and scaled, as a scatter on a twin y-axis.
            - Vertical markers for deletions (red), insertions (blue) and missense changes (magenta).
            - Protein domain annotations (if provided) in a thin axis above the main plot.

        Args:
            gene (str): Gene name for the x-axis label.
            domain_annotations (list of tuples): Each tuple is (start, end, label) for a protein domain.
        """
        n_positions = len(self.conservation_vector)

        fig, ax = plt.subplots(figsize=(15, 5))
        ax.set_xlabel(f'AA Position - {gene}', weight='bold')
        ax.set_xlim(0, n_positions)
        ax.set_ylim(0, 1.2)
        ax.set_ylabel('Relative Importance', weight='bold')
        ax.tick_params(axis='y')
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)

        # Conservation curves at two window sizes, each scaled to a maximum of 1.
        cons_low = Oncosplice.transform_conservation_vector(self.conservation_vector, window=76)
        cons_high = Oncosplice.transform_conservation_vector(self.conservation_vector, window=6)
        cons_low = cons_low / np.max(cons_low)
        cons_high = cons_high / np.max(cons_high)
        positions = np.arange(n_positions)

        ax.plot(positions, cons_low, c='blue', label='Estimated Functional Residues (low-res)')
        ax.plot(positions, cons_high, c='black', label='Estimated Functional Domains (high-res)')

        # Scatter of the stored vector on a twin y-axis.
        ax2 = ax.twinx()
        c = np.array(self.conservation_vector)
        # NOTE(review): this only shifts the minimum to zero when min(c) is
        # negative -- confirm whether `c - min(c)` was intended.
        c = c + abs(min(c))
        c = c / np.max(c)
        ax2.scatter(positions, c, color='green', label='Rate4Site Scores', alpha=0.4)
        ax2.set_ylabel('Rate4Site Normalized', color='green', weight='bold')
        ax2.tick_params(axis='y', labelcolor='green')
        ax2.spines['right'].set_visible(True)
        ax2.spines['top'].set_visible(False)

        # Classify every alignment column into a mutation event.
        ref_seq = self.alignment.seqA
        var_seq = self.alignment.seqB
        mapper = Oncosplice.build_position_mapper(ref_seq)
        deletion_positions = set()
        insertion_positions = set()
        missense_positions = set()

        for i, (r, v) in enumerate(zip(ref_seq, var_seq)):
            if r != '-' and v == '-':
                deletion_positions.add(mapper[i])
            elif r == '-' and v != '-':
                # Anchor an insertion on the reference position just before it.
                insertion_positions.add(mapper[i - 1] if i > 0 else 0)
            elif r != '-' and v != '-' and r != v:
                missense_positions.add(mapper[i])

        # One vertical line per event; only the first line of each kind carries a legend label.
        marker_groups = (
            (sorted(deletion_positions), 'red', 'Deletion'),
            (sorted(insertion_positions), 'blue', 'Insertion'),
            (sorted(missense_positions), 'magenta', 'Missense'),
        )
        for event_positions, color, label in marker_groups:
            for rank, pos in enumerate(event_positions):
                ax.axvline(x=pos, color=color, linestyle='--', alpha=0.7,
                           label=label if rank == 0 else "")

        ax.legend(loc='upper left')
        ax2.legend(loc='upper right')

        # Optional strip of domain annotations above the main axes.
        if domain_annotations is not None:
            domain_ax = fig.add_axes([0.125, 0.9, 0.775, 0.06])
            domain_ax.set_xlim(0, n_positions)
            domain_ax.set_xticks([])
            domain_ax.set_yticks([])
            for spine in domain_ax.spines.values():
                spine.set_visible(False)
            # Base rectangle for the entire protein.
            domain_ax.add_patch(Rectangle((0, 0), n_positions, 0.9,
                                          facecolor='lightgray', edgecolor='none'))
            for start, end, label in domain_annotations:
                domain_ax.add_patch(Rectangle((start, 0), end - start, 0.9,
                                              facecolor='orange', edgecolor='none', alpha=0.5))
                domain_ax.text((start + end) / 2, 1.2, label, ha='center', va='center', color='black', size=8)

        plt.title("Combined Conservation and Mutation Analysis")
        plt.show()

    def get_analysis_series(self) -> pd.Series:
        """
        Return a pandas Series summarizing the Oncosplice analysis.

        The output includes:
            - The reference protein sequence,
            - The variant protein sequence,
            - Their respective lengths,
            - The computed Oncosplice score,
            - The percentile,
            - Counts of deletions and insertions,
            - The total count of modified positions.

        Returns:
            pd.Series: A series containing the summary of the analysis.
        """
        return pd.Series({
            'reference_protein': self.reference_protein,
            'variant_protein': self.variant_protein,
            'reference_length': len(self.reference_protein),
            'variant_length': len(self.variant_protein),
            'oncosplice_score': self.score,
            'percentile': self.percentile,
            'number_of_deletions': len(self.deletions),
            'number_of_insertions': len(self.insertions),
            'modified_positions_count': int(np.sum(self.modified_positions)),
        })