geney 1.2.22__py2.py3-none-any.whl → 1.2.24__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of geney might be problematic. Click here for more details.

geney/__init__.py CHANGED
@@ -1,6 +1,18 @@
1
1
  from .config_setup import get_config
2
- config_setup = get_config()
3
- print("Hello Geney.")
2
+ config = get_config()
3
+ from .Fasta_segment import Fasta_segment
4
+ from .utils import *
5
+
6
# Example identifier for a single mutation: five ':'-separated fields.
# NOTE(review): presumably gene:chromosome:position:ref:alt -- confirm against the parser.
mut_id = 'KRAS:12:25227343:G:T'
# Example identifier combining two mutation ids, joined with '|'.
epistasis_id = 'KRAS:12:25227343:G:T|KRAS:12:25227344:A:T'
8
+
9
def available_genes(organism='hg38'):
    """Yield one gene name per file in the organism's protein-coding mRNA directory.

    The directory listed is ``config[organism]['MRNA_PATH'] / 'protein_coding'``.
    For each entry the text after the last ``'_'`` is taken and a trailing
    ``'.pkl'`` extension, if present, is removed.

    Parameters:
        organism (str): Key into the module-level ``config`` mapping.

    Yields:
        str: The gene name derived from each file name.
    """
    import os

    suffix = '.pkl'
    for file in os.listdir(config[organism]['MRNA_PATH'] / 'protein_coding'):
        gene = file.split('_')[-1]
        # str.strip('.pkl') would remove any of the characters '.', 'p', 'k', 'l'
        # from BOTH ends of the name, which can eat real characters of a gene
        # name; only the literal extension must be dropped.
        if gene.endswith(suffix):
            gene = gene[:-len(suffix)]
        yield gene
14
+
15
+
4
16
  # import os
5
17
  # import json
6
18
  # from pathlib import Path
geney/data_setup.py CHANGED
@@ -234,7 +234,7 @@ def main():
234
234
  config_data = unload_json(config_file)
235
235
  overwrite = 'y'
236
236
  if args.organism in config_data:
237
- overwrite = input("Organism {args.organism} already configured... Overwrite? (y/n)")
237
+ overwrite = input(f"Organism {args.organism} already configured... Overwrite? (y/n)")
238
238
 
239
239
  if overwrite == 'y':
240
240
  config_data[args.organism] = config_paths
geney/graphic_utils.py ADDED
@@ -0,0 +1,270 @@
1
+
2
+ import matplotlib.pyplot as plt
3
+ from matplotlib.patches import Rectangle
4
+ import seaborn as sns
5
+ from collections import namedtuple
6
+ from geney.utils import find_files_by_gene_name, reverse_complement, unload_pickle, contains, unload_json, dump_json #, is_monotonic
7
+
8
+
9
+ ### Graphical Stuff
10
def create_figure_story(epistasis, to_file=None):
    """Plot a conservation figure for every conserved isoform affected by *epistasis*.

    Each row of ``oncosplice(epistasis, annotate=True)`` with
    ``cons_available == 1`` is plotted through ``plot_conservation``. The
    longest entry of ``row.deletions`` decides the highlighted region.

    Parameters:
        epistasis (str): Identifier whose text before the first ':' is used as the gene.
        to_file: Forwarded to ``plot_conservation``. The same value is used for
            every row, so a single path is overwritten by each later figure.
    """
    g = epistasis.split(':')[0]
    # NOTE(review): `oncosplice` is not imported anywhere in this module; this
    # call raises NameError until the correct import is added.
    out = oncosplice(epistasis, annotate=True)
    out = out[out.cons_available == 1]

    for _, row in out.iterrows():
        # Find the key and length of the longest deletion (first one wins on ties).
        max_length = 0
        pos = 0
        for i, k in row.deletions.items():
            if len(k) > max_length:
                pos = i
                max_length = len(k)

        # Only deletions longer than 5 are drawn as a region.
        del_reg = [pos, pos + max_length] if max_length > 5 else None
        mutation_loc = None if row.oncosplice_score == 0 else pos

        # `gene_name` is a required parameter of plot_conservation and was
        # previously omitted, which made every call fail with TypeError.
        # NOTE(review): plot_conservation hands it to unload_pickle; confirm
        # whether that expects the bare gene name or a path to the gene file.
        plot_conservation(gene_name=g,
                          tid=row.transcript_id,
                          gene=f'{g}, {row.transcript_id}.{row.isoform}',
                          mutation_loc=mutation_loc,
                          target_region=del_reg, mut_name='Epistasis',
                          domain_annotations=get_annotations(row.transcript_id, 300),
                          to_file=to_file)


def plot_conservation(gene_name, tid, gene='', mutation_loc=None, target_region=None, mut_name='Mutation', domain_annotations=None, to_file=None):
    """
    Plots conservation vectors with protein domain visualization and Rate4Site scores.

    Parameters:
    gene_name: Passed unchanged to ``unload_pickle`` to load the conservation data.
    tid (str): Transcript identifier.
    gene (str): Gene name, used in the x-axis label.
    mutation_loc (int): Position of the mutation.
    target_region (tuple): Start and end positions of the target region.
    mut_name (str): Name of the mutation.
    domain_annotations (list): List of tuples for domain annotations (start, end, label).
        ``None`` (the default) means no annotations.
    to_file: When given, the figure is saved there and closed instead of being shown.

    Raises:
    ValueError: If the conservation vector is missing or empty.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    if domain_annotations is None:
        domain_annotations = []

    # Access conservation data
    # NOTE(review): the lookup uses the literal key 'tid' and the `tid`
    # argument is otherwise unused -- this looks like it should be `[tid]`;
    # confirm against the pickle layout before changing.
    _, cons_vec = unload_pickle(gene_name)['tid']['cons_vector']

    # len() works for lists and arrays alike; `not cons_vec` is ambiguous
    # (and raises) for multi-element numpy arrays.
    if cons_vec is None or len(cons_vec) == 0:
        raise ValueError("The conservation vector is empty.")

    sns.set_theme(style="white")
    fig, ax = plt.subplots(figsize=(max(15, len(cons_vec)/10), 3))  # Dynamic figure size

    # Plotting the conservation vectors in the main plot
    plot_conservation_vectors(ax, cons_vec)

    # Setting up primary axis for the main plot
    setup_primary_axis(ax, gene, len(cons_vec))

    # Create a separate axes for protein domain visualization
    domain_ax = create_domain_axes(fig, len(cons_vec))

    # Draw protein domains
    plot_protein_domains(domain_ax, domain_annotations, len(cons_vec))

    # Plotting Rate4Site scores on secondary y-axis
    plot_rate4site_scores(ax, cons_vec)

    # Plotting mutation location and target region, if provided
    plot_mutation_and_target_region(ax, mutation_loc, target_region, mut_name)

    if to_file is not None:
        # 'tight' keeps the domain strip and its labels, which sit above the
        # main axes, inside the saved image.
        fig.savefig(to_file, bbox_inches='tight')
        plt.close(fig)
    else:
        plt.show()
82
+
83
def plot_conservation_vectors(ax, cons_vec):
    """Draw two max-normalised transforms of *cons_vec* on *ax*.

    The vector is passed through ``transform_conservation_vector`` with a
    window of 76 (blue curve) and then 6 (black curve); each result is divided
    in place by its own maximum before being plotted against its index.
    """
    curves = (
        (76, 'b', 'Estimated Functional Residues'),  # larger window
        (6, 'k', 'Estimated Functional Domains'),    # smaller window
    )
    for window, colour, legend in curves:
        smoothed = transform_conservation_vector(cons_vec, window)
        smoothed /= max(smoothed)
        positions = list(range(len(smoothed)))
        ax.plot(positions, smoothed, c=colour, label=legend)
92
+
93
def setup_primary_axis(ax, gene, length):
    """Apply labels, limits and spine visibility to the main plot axis.

    Parameters:
        ax: Axis to configure.
        gene: Text appended to the x-axis label.
        length: Upper x-limit (the lower limit is 0).
    """
    # Ranges: x spans the sequence, y is fixed to [0, 1].
    ax.set_xlim(0, length)
    ax.set_ylim(0, 1)

    # Bold axis titles.
    ax.set_xlabel(f'AA Position - {gene}', weight='bold')
    ax.set_ylabel('Relative Importance', weight='bold')
    ax.tick_params(axis='y')

    # Hide the right and top frame lines.
    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)
102
+
103
def create_domain_axes(fig, length):
    """Add and return a thin, frameless axis for the protein-domain strip.

    The axis is placed at figure coordinates [0.125, 0.95, 0.775, 0.06], its
    x-range is set to [0, length], and all ticks and spines are removed.
    """
    left, bottom, width, height = 0.125, 0.95, 0.775, 0.06
    strip = fig.add_axes([left, bottom, width, height])
    strip.set_xlim(0, length)

    # No ticks and no frame: only the patches drawn later should be visible.
    strip.set_xticks([])
    strip.set_yticks([])
    for frame_line in strip.spines.values():
        frame_line.set_visible(False)

    return strip
113
+
114
def plot_protein_domains(ax, domain_annotations, length):
    """Draw a gray full-length bar and one labelled orange box per domain.

    Parameters:
        ax: Axis receiving the patches and labels.
        domain_annotations: Iterable of (start, end, label) triples.
        length: Width of the gray background bar.
    """
    bar_height = 0.9
    backbone = Rectangle((0, 0), length, bar_height, facecolor='lightgray', edgecolor='none')
    ax.add_patch(backbone)

    for start, end, label in domain_annotations:
        box = Rectangle((start, 0), end - start, bar_height,
                        facecolor='orange', edgecolor='none', alpha=0.5)
        ax.add_patch(box)
        # The label is centred over the box at y=2.1 in data coordinates.
        midpoint = (start + end) / 2
        ax.text(midpoint, 2.1, label, ha='center', va='center', color='black', size=8)
121
+
122
def plot_rate4site_scores(ax, cons_vec):
    """Plots Rate4Site scores on a secondary y-axis.

    The scores are shifted by the absolute value of their minimum and then
    divided by the resulting maximum before being drawn as a green scatter on
    a twin of *ax*.

    Parameters:
        ax: Primary axis; the scatter is drawn on ``ax.twinx()``.
        cons_vec: Non-empty sequence of numeric scores.
    """
    # numpy is not imported at module level in this file, so bring it into
    # scope here; without it the original body failed with NameError.
    import numpy as np

    ax2 = ax.twinx()
    c = np.asarray(cons_vec, dtype=float)
    c = c + abs(c.min())

    # Guard the division: an all-zero shifted vector would otherwise become
    # NaN and make set_ylim fail.
    peak = c.max()
    if peak > 0:
        c = c / peak
        upper = c.max() * 1.1
    else:
        upper = 1.1

    ax2.set_ylim(c.min(), upper)
    ax2.scatter(list(range(len(c))), c, color='green', label='Rate4Site Scores', alpha=0.4)
    ax2.set_ylabel('Rate4Site Normalized', color='green', weight='bold')
    ax2.tick_params(axis='y', labelcolor='green')
    ax2.spines['right'].set_visible(True)
    ax2.spines['top'].set_visible(False)
134
+
135
def plot_mutation_and_target_region(ax, mutation_loc, target_region, mut_name):
    """Mark the mutation position and shade the target region when given.

    Parameters:
        ax: Axis to annotate.
        mutation_loc: x-position of the dashed red line, or None to skip it.
        target_region: Indexable (start, end) pair to shade, or None to skip it.
        mut_name: Label written above the mutation line.
    """
    label_y = 1.04  # both labels sit just above the plotted range

    if mutation_loc is not None:
        ax.axvline(x=mutation_loc, ymax=1, color='r', linestyle='--', alpha=0.7)
        ax.text(mutation_loc, label_y, mut_name, color='r', weight='bold', ha='center')

    if target_region is None:
        return

    region_start = target_region[0]
    region_width = target_region[1] - target_region[0]
    shaded = Rectangle((region_start, 0), region_width, 1, alpha=0.25, facecolor='gray')
    ax.add_patch(shaded)
    center_loc = region_start + 0.5 * region_width
    ax.text(center_loc, label_y, 'Deleted Region', ha='center', va='center', color='gray', weight='bold')
145
+
146
+
147
def merge_overlapping_regions(df):
    """
    Collapse overlapping rows of *df* into combined regions.

    Rows are visited in ascending 'start' order. A row whose start is not
    greater than the end of the region being built is folded into it;
    otherwise it opens a new region. Underscores in names become spaces, the
    names of a region are joined with ', ', and the joined label is wrapped
    with ``split_text(label, 35)``.

    Parameters:
    df (pd.DataFrame): DataFrame with columns 'start', 'end', 'name'

    Returns:
    List: Region namedtuples (start, end, combined_name); empty for an empty df.
    """
    if df.empty:
        return []

    Region = namedtuple('Region', ['start', 'end', 'combined_name'])

    # Each accumulator entry is a mutable [start, end, names] triple.
    groups = []
    for _, row in df.sort_values(by='start').iterrows():
        start, end = row['start'], row['end']
        label = row['name'].replace('_', ' ')
        if groups and start <= groups[-1][1]:
            open_group = groups[-1]
            open_group[1] = max(open_group[1], end)
            open_group[2].append(label)
        else:
            groups.append([start, end, [label]])

    return [
        Region(first, last, split_text(', '.join(labels), 35))
        for first, last, labels in groups
    ]
181
+
182
+
183
def split_text(text, width):
    """
    Splits a text into lines with a maximum specified width.

    The text is cut into consecutive chunks of at most *width* characters and
    the chunks are joined with newlines. Chunks never span an existing newline
    because '.' does not match it.

    Parameters:
    text (str): The text to be split.
    width (int): Maximum width of each line.

    Returns:
    str: The text split into lines of specified width.
    """
    # `re` is not imported at module level in this file; import it here so the
    # function no longer fails with NameError.
    import re

    lines = re.findall(r'.{1,' + str(width) + r'}', text)
    return '\n'.join(lines)
196
+
197
def get_annotations(target_gene, w=500, annotations=None):
    """Return merged domain regions for one transcript from an annotation table.

    Rows are kept when their 'Transcript stable ID' equals *target_gene* and
    their 'length' is below *w*; duplicates of 'Interpro Short Description'
    are dropped (first kept) and the rest is merged with
    ``merge_overlapping_regions``.

    Parameters:
        target_gene: Value matched against the 'Transcript stable ID' column.
        w (int): Exclusive upper bound on the 'length' column.
        annotations: Table with the columns named above (DataFrame-like).
            No table was ever wired into this function -- the previous body
            indexed a local empty dict and always raised KeyError -- so the
            default ``None`` means "no annotation data".

    Returns:
        list: Merged regions, or an empty list when no annotation data is given.
    """
    # NOTE(review): decide where the real annotation table should be loaded
    # from and pass it in (or load it here) once that is known.
    if annotations is None or len(annotations) == 0:
        return []

    # Compare against the identifier itself; the old code compared the column
    # with `PROTEIN_ANNOTATIONS[target_gene]`, i.e. a lookup rather than the id.
    keep = (annotations['Transcript stable ID'] == target_gene) & (annotations['length'] < w)
    temp = annotations[keep].drop_duplicates(subset=['Interpro Short Description'], keep='first')
    return merge_overlapping_regions(temp)
201
+
202
+
203
+ # def plot_conservation(tid, gene='', mutation_loc=None, target_region=None, mut_name='Mutation', domain_annotations=[], to_file=None):
204
+ # _, cons_vec = access_conservation_data(tid)
205
+ #
206
+ # sns.set_theme(style="white")
207
+ # fig, ax = plt.subplots(figsize=(15, 3)) # Adjusted figure size for better layout
208
+ #
209
+ # # Plotting the conservation vectors in the main plot
210
+ # temp = transform_conservation_vector(cons_vec, 76)
211
+ # temp /= max(temp)
212
+ # ax.plot(list(range(len(temp))), temp, c='b', label='Estimated Functional Residues')
213
+ # temp = transform_conservation_vector(cons_vec, 6)
214
+ # temp /= max(temp)
215
+ # ax.plot(list(range(len(temp))), temp, c='k', label='Estimated Functional Domains')
216
+ #
217
+ # # Setting up primary axis for the main plot
218
+ # ax.set_xlabel(f'AA Position - {gene}', weight='bold')
219
+ # ax.set_xlim(0, len(cons_vec))
220
+ # ax.set_ylim(0, 1) # Set y-limit to end at 1
221
+ # ax.set_ylabel('Relative Importance', weight='bold')
222
+ # ax.tick_params(axis='y')
223
+ # ax.spines['right'].set_visible(False)
224
+ # ax.spines['top'].set_visible(False)
225
+ #
226
+ # # Create a separate axes for protein domain visualization above the main plot
227
+ # domain_ax_height = 0.06 # Adjust for thinner protein diagram
228
+ # domain_ax = fig.add_axes([0.125, 0.95, 0.775, domain_ax_height]) # Position higher above the main plot
229
+ # domain_ax.set_xlim(0, len(cons_vec))
230
+ # domain_ax.set_xticks([])
231
+ # domain_ax.set_yticks([])
232
+ # domain_ax.spines['top'].set_visible(False)
233
+ # domain_ax.spines['right'].set_visible(False)
234
+ # domain_ax.spines['left'].set_visible(False)
235
+ # domain_ax.spines['bottom'].set_visible(False)
236
+ #
237
+ # # Draw the full-length protein as a base rectangle
238
+ # domain_ax.add_patch(Rectangle((0, 0), len(cons_vec), 0.9, facecolor='lightgray', edgecolor='none'))
239
+ #
240
+ # # Overlay domain annotations
241
+ # for domain in domain_annotations:
242
+ # start, end, label = domain
243
+ # domain_ax.add_patch(Rectangle((start, 0), end - start, 0.9, facecolor='orange', edgecolor='none', alpha=0.5))
244
+ # domain_ax.text((start + end) / 2, 2.1, label, ha='center', va='center', color='black', size=8)
245
+ #
246
+ # # Plotting Rate4Site scores on secondary y-axis
247
+ # ax2 = ax.twinx()
248
+ # c = np.array(cons_vec)
249
+ # c = c + abs(min(c))
250
+ # c = c/max(c)
251
+ # ax2.set_ylim(min(c), max(c)*1.1)
252
+ # ax2.scatter(list(range(len(c))), c, color='green', label='Rate4Site Scores', alpha=0.4)
253
+ # ax2.set_ylabel('Rate4Site Normalized', color='green', weight='bold')
254
+ # ax2.tick_params(axis='y', labelcolor='green')
255
+ # ax2.spines['right'].set_visible(True)
256
+ # ax2.spines['top'].set_visible(False)
257
+ #
258
+ # # Plotting mutation location and target region
259
+ # if mutation_loc is not None:
260
+ # ax.axvline(x=mutation_loc, ymax=1,color='r', linestyle='--', alpha=0.7)
261
+ # ax.text(mutation_loc, 1.04, mut_name, color='r', weight='bold', ha='center')
262
+ #
263
+ # if target_region is not None:
264
+ # ax.add_patch(Rectangle((target_region[0], 0), target_region[1] - target_region[0], 1, alpha=0.25, facecolor='gray'))
265
+ # center_loc = target_region[0] + 0.5 * (target_region[1] - target_region[0])
266
+ # ax.text(center_loc, 1.04, 'Deleted Region', ha='center', va='center', color='gray', weight='bold')
267
+ #
268
+ # plt.show()
269
+ #
270
+
@@ -0,0 +1,56 @@
1
+ from .seqmat_utils import *
2
+ import numpy as np
3
+
4
class Allele(SeqMat):
    """A SeqMat built from an alternate sequence and two index sequences."""

    def __init__(self, alt, pos1, pos2, rev):
        """Initialise the underlying SeqMat and record the allele position.

        Parameters:
            alt: Forwarded to ``SeqMat.__init__`` as the first argument.
            pos1: Forwarded to ``SeqMat.__init__``; its minimum becomes ``self.position``.
            pos2: Forwarded to ``SeqMat.__init__`` as the third argument.
            rev: When truthy, ``self.reverse_complement()`` is called after construction.
        """
        super().__init__(alt, pos1, pos2)
        # Smallest entry of pos1; raises ValueError if pos1 is empty.
        self.position = min(pos1)
        if rev:
            # presumably inherited from SeqMat -- defined outside this file
            self.reverse_complement()
10
+
11
+ # def _continuous(self, ind):
12
+ # return True
13
+
14
+
15
class SNP(Allele):
    """Allele subclass for single-nucleotide changes."""

    def __init__(self, alt, pos1, pos2, rev=False):
        """Build the allele, forwarding every argument to ``Allele.__init__``.

        ``Allele.__init__`` requires ``rev``; it was previously not passed,
        so constructing an SNP always raised TypeError. It now defaults to
        False, which keeps existing three-argument calls valid.
        """
        super().__init__(alt, pos1, pos2, rev)
19
+
20
class INDEL(Allele):
    """Allele subclass placeholder for combined insertion/deletion changes."""

    def __init__(self, alt, pos):
        # NOTE(review): Allele.__init__ takes (alt, pos1, pos2, rev); only two
        # arguments are passed here, so this call raises TypeError. The second
        # index sequence and `rev` still need to be supplied.
        super().__init__(alt, pos)
        pass
24
+
25
class INS(Allele):
    """Allele subclass for pure insertions."""

    def __init__(self, alt, pos, rev=False):
        """Build an insertion of *alt* anchored at *pos*.

        ``super().__init__()`` was previously called with no arguments, which
        always raised TypeError. The index sequences are now built as in the
        insertion branch of ``get_mutation`` in this module: every inserted
        character shares the anchor position and is numbered 1..len(alt).

        Parameters:
            alt (str): Inserted sequence.
            pos: Anchor position (converted with ``int``).
            rev (bool): Forwarded to ``Allele.__init__``.
        """
        n = len(alt)
        super().__init__(alt, np.full(n, int(pos)), np.arange(1, n + 1), rev)
29
+
30
class DEL(Allele):
    """Allele subclass placeholder for pure deletions."""

    def __init__(self, alt, pos):
        # NOTE(review): Allele.__init__ takes (alt, pos1, pos2, rev) but is
        # called with no arguments, so this raises TypeError; `alt` and `pos`
        # are currently unused.
        super().__init__()
        pass
34
+
35
def get_mutation(mut_id, rev=False):
    """Build an ``Allele`` from a ':'-separated mutation identifier.

    The identifier must have exactly five fields; the third is the integer
    position, the fourth the reference string and the fifth the alternate
    string. '-' stands for "nothing" on either side.

    Parameters:
        mut_id (str): Identifier such as 'KRAS:12:25227343:G:T'.
        rev (bool): Forwarded to ``Allele`` as its ``rev`` argument.

    Returns:
        Allele: Built from the alternate sequence and two index sequences.

    Raises:
        ValueError: If the identifier does not have five fields, the position
            is not an integer, or both reference and alternate are '-'.
    """
    _, _, i, r, a = mut_id.split(':')
    i = int(i)

    # Previously this combination matched no branch and the function silently
    # returned None; fail loudly instead.
    if r == '-' and a == '-':
        raise ValueError(f"Mutation {mut_id!r} has neither a reference nor an alternate allele.")

    # Single-base substitution: one position, second index 0.
    if len(a) == len(r) == 1 and a != '-' and r != '-':
        return Allele(a, [i], [0], rev)

    # Deletion: one '-' per reference base over consecutive positions.
    if a == '-':
        return Allele('-' * len(r), np.arange(i, i + len(r), dtype=np.int32), [0] * len(r), rev)

    # Insertion: every inserted base shares position i, numbered 1..len(a).
    if r == '-':
        return Allele(a, np.full(len(a), i), np.arange(1, len(a) + 1), rev)

    # Replacement: the reference span is written as '-' and the alternate
    # bases are attached to the last reference position, numbered 1..len(a).
    ind1 = np.concatenate(
        [np.arange(i, i + len(r), dtype=np.int32), np.full(len(a), len(r) + i - 1, dtype=np.int32)])
    ind2 = np.concatenate([np.zeros(len(r), dtype=np.int32), np.arange(1, len(a) + 1, dtype=np.int32)])
    return Allele('-' * len(r) + a, ind1, ind2, rev)
54
+
55
+
56
+