geney 1.3.78__py2.py3-none-any.whl → 1.4.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of geney might be problematic. Click here for more details.

@@ -0,0 +1,269 @@
1
+ import matplotlib.pyplot as plt
2
+ from matplotlib.patches import Rectangle
3
+ import seaborn as sns
4
+ from collections import namedtuple
5
+ from geney.utils import unload_pickle, contains, unload_json, dump_json
6
+
7
+
8
+ ### Graphical Stuff
9
def create_figure_story(epistasis, to_file=None):
    """
    Plot one conservation figure per annotated oncosplice result row.

    Parameters:
    epistasis (str): Mutation identifier; the text before the first ':' is
        used as the gene name.
    to_file: Accepted for backward compatibility but currently unused,
        because plot_conservation has no file-output parameter.
    """
    gene_name = epistasis.split(':')[0]
    results = oncosplice(epistasis, annotate=True)
    # Only rows flagged as having conservation data are plotted.
    results = results[results.cons_available == 1]

    for _, row in results.iterrows():
        # Find the longest entry in row.deletions (first one wins on ties).
        # presumably keys are positions and values the deleted sequence -- TODO confirm
        pos = 0
        max_length = 0
        for start, deleted in row.deletions.items():
            if len(deleted) > max_length:
                pos, max_length = start, len(deleted)

        # Deletions of five residues or fewer are not highlighted as a region.
        del_reg = [pos, pos + max_length] if max_length > 5 else None
        mutation_loc = None if row.oncosplice_score == 0 else pos

        # plot_conservation requires gene_name and does not accept to_file;
        # the previous call omitted the former and passed the latter, which
        # raised TypeError on every invocation.
        plot_conservation(gene_name=gene_name,
                          tid=row.transcript_id,
                          gene=f'{gene_name}, {row.transcript_id}.{row.isoform}',
                          mutation_loc=mutation_loc,
                          target_region=del_reg,
                          mut_name='Epistasis',
                          domain_annotations=get_annotations(row.transcript_id, 300))
38
+
39
+
40
+
41
def plot_conservation(gene_name, tid, gene='', mutation_loc=None, target_region=None, mut_name='Mutation', domain_annotations=None):
    """
    Plots conservation vectors with protein domain visualization and Rate4Site scores.

    Parameters:
    gene_name (str): Identifier handed to unload_pickle to load the gene record.
    tid (str): Transcript identifier used to select the transcript entry.
    gene (str): Label shown on the x-axis.
    mutation_loc (int): Position of the mutation, or None to skip the marker.
    target_region (tuple): Start and end positions of the target region, or None.
    mut_name (str): Name of the mutation.
    domain_annotations (list): List of tuples for domain annotations (start, end, label).

    Raises:
    ValueError: If the conservation vector is missing or empty.
    """
    # Avoid a shared mutable default argument.
    if domain_annotations is None:
        domain_annotations = []

    # Access conservation data.
    # NOTE(review): the previous code indexed the literal string 'tid' and
    # never used the tid argument; confirm the record is keyed by transcript id.
    _, cons_vec = unload_pickle(gene_name)[tid]['cons_vector']

    # len() works for both lists and numpy arrays; bare truthiness does not.
    if cons_vec is None or len(cons_vec) == 0:
        raise ValueError("The conservation vector is empty.")

    n_positions = len(cons_vec)

    sns.set_theme(style="white")
    # Widen the figure for long vectors, never narrower than 15 inches.
    fig, ax = plt.subplots(figsize=(max(15, n_positions / 10), 3))

    # Conservation traces on the main axis
    plot_conservation_vectors(ax, cons_vec)
    setup_primary_axis(ax, gene, n_positions)

    # Separate strip above the main plot for protein domains
    domain_ax = create_domain_axes(fig, n_positions)
    plot_protein_domains(domain_ax, domain_annotations, n_positions)

    # Rate4Site scores on a secondary y-axis
    plot_rate4site_scores(ax, cons_vec)

    # Mutation marker and target region, if provided
    plot_mutation_and_target_region(ax, mutation_loc, target_region, mut_name)

    plt.show()
81
+
82
def plot_conservation_vectors(ax, cons_vec):
    """Plot two transformed, max-normalised conservation traces on ``ax``."""
    # (window argument, line colour, legend label) for each trace, in draw order
    trace_specs = (
        (76, 'b', 'Estimated Functional Residues'),
        (6, 'k', 'Estimated Functional Domains'),
    )
    for window, colour, legend_label in trace_specs:
        transformed = transform_conservation_vector(cons_vec, window)
        transformed /= max(transformed)  # scale so the peak equals 1
        positions = list(range(len(transformed)))
        ax.plot(positions, transformed, c=colour, label=legend_label)
91
+
92
def setup_primary_axis(ax, gene, length):
    """Apply limits, bold axis labels and spine visibility to the main axis."""
    # Axis ranges: x spans 0..length, y spans 0..1
    ax.set_xlim(0, length)
    ax.set_ylim(0, 1)

    # Bold axis labels
    ax.set_xlabel(f'AA Position - {gene}', weight='bold')
    ax.set_ylabel('Relative Importance', weight='bold')
    ax.tick_params(axis='y')

    # Hide the top and right borders
    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)
101
+
102
def create_domain_axes(fig, length):
    """Add and return a frameless, tick-free axis for the protein domain strip."""
    strip_height = 0.06
    # add_axes takes [left, bottom, width, height]
    strip = fig.add_axes([0.125, 0.95, 0.775, strip_height])
    strip.set(xlim=(0, length), xticks=[], yticks=[])
    for border in strip.spines.values():
        border.set_visible(False)
    return strip
112
+
113
def plot_protein_domains(ax, domain_annotations, length):
    """Draw a full-length grey bar and overlay each (start, end, label) domain."""
    bar_height = 0.9

    # Base bar covering positions 0..length
    backbone = Rectangle((0, 0), length, bar_height, facecolor='lightgray', edgecolor='none')
    ax.add_patch(backbone)

    for start, end, label in domain_annotations:
        domain_box = Rectangle((start, 0), end - start, bar_height,
                               facecolor='orange', edgecolor='none', alpha=0.5)
        ax.add_patch(domain_box)
        # Label is centred horizontally on the domain at y=2.1
        midpoint = (start + end) / 2
        ax.text(midpoint, 2.1, label, ha='center', va='center', color='black', size=8)
120
+
121
def plot_rate4site_scores(ax, cons_vec):
    """
    Plots Rate4Site scores on a secondary y-axis.

    The scores are shifted by the absolute value of their minimum and then
    divided by the resulting maximum before being drawn as a green scatter.

    Parameters:
    ax: Primary matplotlib axis; a twin y-axis is created from it.
    cons_vec: Sequence of numeric scores, one per position.
    """
    # Local import: numpy is not imported at the top of this file, so the
    # previous reference to `np` raised NameError.
    import numpy as np

    ax2 = ax.twinx()
    c = np.array(cons_vec)
    c = c + abs(c.min())
    peak = c.max()
    if peak != 0:
        # Skip the division for an all-zero vector to avoid producing NaNs,
        # which matplotlib rejects as axis limits.
        c = c / peak
    ax2.set_ylim(c.min(), c.max() * 1.1)
    ax2.scatter(list(range(len(c))), c, color='green', label='Rate4Site Scores', alpha=0.4)
    ax2.set_ylabel('Rate4Site Normalized', color='green', weight='bold')
    ax2.tick_params(axis='y', labelcolor='green')
    ax2.spines['right'].set_visible(True)
    ax2.spines['top'].set_visible(False)
133
+
134
def plot_mutation_and_target_region(ax, mutation_loc, target_region, mut_name):
    """Mark the mutation with a dashed red line and shade the target region."""
    label_y = 1.04  # y position shared by both text labels

    if mutation_loc is not None:
        ax.axvline(x=mutation_loc, ymax=1, color='r', linestyle='--', alpha=0.7)
        ax.text(mutation_loc, label_y, mut_name, color='r', weight='bold', ha='center')

    if target_region is None:
        return

    region_start = target_region[0]
    region_width = target_region[1] - target_region[0]
    shaded = Rectangle((region_start, 0), region_width, 1, alpha=0.25, facecolor='gray')
    ax.add_patch(shaded)
    # Caption is centred over the shaded span
    center_loc = region_start + 0.5 * region_width
    ax.text(center_loc, label_y, 'Deleted Region', ha='center', va='center', color='gray', weight='bold')
144
+
145
+
146
def merge_overlapping_regions(df):
    """
    Collapse overlapping or touching regions into single regions.

    Rows are processed in ascending 'start' order. A row joins the currently
    open region when its start is <= that region's end; otherwise the open
    region is closed and a new one begins. Underscores in names are replaced
    by spaces, names of merged rows are joined with ', ', and the joined label
    is wrapped with split_text(..., 35).

    Parameters:
    df (pd.DataFrame): DataFrame with columns 'start', 'end', 'name'

    Returns:
    List: Region namedtuples (start, end, combined_name); empty for an empty df.
    """
    if df.empty:
        return []

    Region = namedtuple('Region', ['start', 'end', 'combined_name'])

    # Each entry is [start, end, list_of_names]; the last entry is the open one.
    spans = []
    for _, row in df.sort_values(by='start').iterrows():
        start, end = row['start'], row['end']
        name = row['name'].replace('_', ' ')
        if spans and start <= spans[-1][1]:
            open_span = spans[-1]
            open_span[1] = max(open_span[1], end)
            open_span[2].append(name)
        else:
            spans.append([start, end, [name]])

    return [
        Region(span_start, span_end, split_text(', '.join(names), 35))
        for span_start, span_end, names in spans
    ]
180
+
181
+
182
def split_text(text, width):
    """
    Splits a text into lines with a maximum specified width.

    The text is cut into consecutive chunks of up to ``width`` characters and
    the chunks are joined with newlines. Chunks never span an existing
    newline, because '.' in the pattern does not match '\\n'.

    Parameters:
    text (str): The text to be split.
    width (int): Maximum width of each line; must be at least 1.

    Returns:
    str: The text split into lines of specified width.

    Raises:
    ValueError: If width is smaller than 1.
    """
    # Local import: `re` is not imported at the top of this file, so the
    # previous module-level reference raised NameError.
    import re

    if width < 1:
        # '.{1,0}' is not a valid pattern; fail with a clear message instead.
        raise ValueError("width must be a positive integer")

    lines = re.findall('.{1,' + str(width) + '}', text)
    return '\n'.join(lines)
195
+
196
def get_annotations(target_gene, w=500, annotations=None):
    """
    Return merged domain annotations for one transcript.

    Parameters:
    target_gene (str): Value matched against the 'Transcript stable ID' column.
    w (int): Only rows whose 'length' is below this value are kept.
    annotations (pd.DataFrame): Annotation table. The code reads the columns
        'Transcript stable ID', 'length' and 'Interpro Short Description';
        merge_overlapping_regions additionally reads 'start', 'end', 'name'.
        When omitted, no annotation source is available and an empty list
        is returned.

    Returns:
    list: Result of merge_overlapping_regions on the selected rows, or [].
    """
    # NOTE(review): the previous body indexed an empty placeholder dict and
    # always raised KeyError; it also compared against
    # PROTEIN_ANNOTATIONS[target_gene] rather than target_gene itself.
    # Confirm where the real annotation table should be loaded from.
    if annotations is None:
        return []

    matches_transcript = annotations['Transcript stable ID'] == target_gene
    short_enough = annotations['length'] < w
    selected = annotations[matches_transcript & short_enough].drop_duplicates(
        subset=['Interpro Short Description'], keep='first')
    return merge_overlapping_regions(selected)
200
+
201
+
202
+ # def plot_conservation(tid, gene='', mutation_loc=None, target_region=None, mut_name='Mutation', domain_annotations=[], to_file=None):
203
+ # _, cons_vec = access_conservation_data(tid)
204
+ #
205
+ # sns.set_theme(style="white")
206
+ # fig, ax = plt.subplots(figsize=(15, 3)) # Adjusted figure size for better layout
207
+ #
208
+ # # Plotting the conservation vectors in the main plot
209
+ # temp = transform_conservation_vector(cons_vec, 76)
210
+ # temp /= max(temp)
211
+ # ax.plot(list(range(len(temp))), temp, c='b', label='Estimated Functional Residues')
212
+ # temp = transform_conservation_vector(cons_vec, 6)
213
+ # temp /= max(temp)
214
+ # ax.plot(list(range(len(temp))), temp, c='k', label='Estimated Functional Domains')
215
+ #
216
+ # # Setting up primary axis for the main plot
217
+ # ax.set_xlabel(f'AA Position - {gene}', weight='bold')
218
+ # ax.set_xlim(0, len(cons_vec))
219
+ # ax.set_ylim(0, 1) # Set y-limit to end at 1
220
+ # ax.set_ylabel('Relative Importance', weight='bold')
221
+ # ax.tick_params(axis='y')
222
+ # ax.spines['right'].set_visible(False)
223
+ # ax.spines['top'].set_visible(False)
224
+ #
225
+ # # Create a separate axes for protein domain visualization above the main plot
226
+ # domain_ax_height = 0.06 # Adjust for thinner protein diagram
227
+ # domain_ax = fig.add_axes([0.125, 0.95, 0.775, domain_ax_height]) # Position higher above the main plot
228
+ # domain_ax.set_xlim(0, len(cons_vec))
229
+ # domain_ax.set_xticks([])
230
+ # domain_ax.set_yticks([])
231
+ # domain_ax.spines['top'].set_visible(False)
232
+ # domain_ax.spines['right'].set_visible(False)
233
+ # domain_ax.spines['left'].set_visible(False)
234
+ # domain_ax.spines['bottom'].set_visible(False)
235
+ #
236
+ # # Draw the full-length protein as a base rectangle
237
+ # domain_ax.add_patch(Rectangle((0, 0), len(cons_vec), 0.9, facecolor='lightgray', edgecolor='none'))
238
+ #
239
+ # # Overlay domain annotations
240
+ # for domain in domain_annotations:
241
+ # start, end, label = domain
242
+ # domain_ax.add_patch(Rectangle((start, 0), end - start, 0.9, facecolor='orange', edgecolor='none', alpha=0.5))
243
+ # domain_ax.text((start + end) / 2, 2.1, label, ha='center', va='center', color='black', size=8)
244
+ #
245
+ # # Plotting Rate4Site scores on secondary y-axis
246
+ # ax2 = ax.twinx()
247
+ # c = np.array(cons_vec)
248
+ # c = c + abs(min(c))
249
+ # c = c/max(c)
250
+ # ax2.set_ylim(min(c), max(c)*1.1)
251
+ # ax2.scatter(list(range(len(c))), c, color='green', label='Rate4Site Scores', alpha=0.4)
252
+ # ax2.set_ylabel('Rate4Site Normalized', color='green', weight='bold')
253
+ # ax2.tick_params(axis='y', labelcolor='green')
254
+ # ax2.spines['right'].set_visible(True)
255
+ # ax2.spines['top'].set_visible(False)
256
+ #
257
+ # # Plotting mutation location and target region
258
+ # if mutation_loc is not None:
259
+ # ax.axvline(x=mutation_loc, ymax=1,color='r', linestyle='--', alpha=0.7)
260
+ # ax.text(mutation_loc, 1.04, mut_name, color='r', weight='bold', ha='center')
261
+ #
262
+ # if target_region is not None:
263
+ # ax.add_patch(Rectangle((target_region[0], 0), target_region[1] - target_region[0], 1, alpha=0.25, facecolor='gray'))
264
+ # center_loc = target_region[0] + 0.5 * (target_region[1] - target_region[0])
265
+ # ax.text(center_loc, 1.04, 'Deleted Region', ha='center', va='center', color='gray', weight='bold')
266
+ #
267
+ # plt.show()
268
+ #
269
+
geney/_gtex_utils.py ADDED
@@ -0,0 +1,68 @@
1
+ import pandas as pd
2
+ from tqdm import tqdm
3
+
4
# Let pandas print up to 999 rows before truncating
pd.options.display.max_rows = 999

# Build a SAMPID -> SMTS lookup from the sample attributes table
metadata = pd.read_csv('GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt', delimiter='\t')
metadata_tissue_mapper = (
    metadata[['SAMPID', 'SMTS']]
    .drop_duplicates()
    .set_index('SAMPID')
    .to_dict()['SMTS']
)

# Empty placeholder frame; nothing below reads it
combined_df = pd.DataFrame()

# Per-chunk results, concatenated after the loop
tpm_mean = []

# Stream the transcript TPM table 1000 rows at a time
for chunk in tqdm(pd.read_csv('GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct',
                              header=2, chunksize=1000, delimiter='\t')):
    # Index by transcript/gene and relabel the remaining columns through the lookup
    chunk = chunk.set_index(['transcript_id', 'gene_id']).rename(columns=metadata_tissue_mapper)
    # Average columns that share a label, then restore the original orientation
    tpm_mean.append(chunk.T.groupby(by=chunk.columns).mean().T)

# Stack the per-chunk means into one frame
tpm_mean = pd.concat(tpm_mean)
26
+
27
+
28
# Tissue name -> project code; a tissue with several projects maps to a list.
cancer_projects = {
    "Adrenal Gland": "ACC",
    "Bladder": "BLCA",
    "Brain": ["GBM", "LGG"],  # two projects
    "Breast": "BRCA",
    "Colon": "COAD",
    "Esophagus": "ESCA",
    "Kidney": ["KICH", "KIRC", "KIRP"],  # three projects
    "Liver": "LIHC",
    "Lung": "LUNG",
    "Ovary": "OV",
    "Pancreas": "PAAD",
    "Prostate": "PRAD",
    "Skin": "SKCM",
    "Stomach": "STAD",
    "Testis": "TGCT",
    "Uterus": "UCS",
}

# Reverse lookup (project code -> tissue name), derived from the table above
# so the two mappings cannot drift apart.
tissue_projects = {
    project: tissue
    for tissue, projects in cancer_projects.items()
    for project in ([projects] if isinstance(projects, str) else projects)
}
68
+
geney/_immune_utils.py ADDED
@@ -0,0 +1,125 @@
1
+ import subprocess
2
+ import logging
3
+ import tempfile
4
+ from geney import _config_setup
5
+ import re
6
+ from io import StringIO
7
+ import pandas as pd
8
+
9
+
10
class NetChop(object):
    """
    Wrapper around netChop tool. Assumes netChop is in your PATH.
    """

    def predict_epitopes(self, sequences, threshold=0.5, min_len=8):
        """
        Run netchop on the sequences and return the fragments between cut sites.

        Parameters
        -----------
        sequences : list of string
            Amino acid sequences to predict cleavage for
        threshold : float
            A position counts as a cut site when its score is above this value
        min_len : int
            Only fragments strictly longer than this are kept

        Returns
        -----------
        list of list of string

        The i'th list corresponds to the i'th sequence and holds the
        fragments of that sequence that passed the length filter.
        """
        # NOTE(review): `config_setup` is not bound by the imports visible in
        # this file (only `_config_setup` is imported) -- confirm the intended name.
        with tempfile.NamedTemporaryFile(dir=config_setup['NETCHOP'], suffix=".fsa", mode="w") as fasta:
            # One record per sequence, using the list index as its header
            for index, sequence in enumerate(sequences):
                fasta.write("> %d\n" % index)
                fasta.write(sequence)
                fasta.write("\n")
            fasta.flush()
            try:
                raw_output = subprocess.check_output(["netchop", str(fasta.name)])
            except subprocess.CalledProcessError as e:
                logging.error("Error calling netChop: %s:\n%s" % (e, e.output))
                raise

        score_lists = self.parse_netchop(raw_output)

        # Sanity checks: one score list per sequence, one score per residue
        assert len(score_lists) == len(sequences), \
            "Expected %d results but got %d" % (
                len(sequences), len(score_lists))
        assert [len(x) for x in score_lists] == [len(x) for x in sequences]

        kept_fragments = []
        for scores, seq in zip(score_lists, sequences):
            cut_flags = [s > threshold for s in scores]
            fragments = self.chop_protein(seq, cut_flags)
            kept_fragments.append([frag for frag in fragments if len(frag) > min_len])
        return kept_fragments

    @staticmethod
    def parse_netchop(netchop_output):
        """
        Parse netChop stdout (bytes) into one list of float scores per table.

        A table starts at a line containing 'pos', 'AA' and 'score', followed
        by a dashed line; rows are read until the next dashed line, taking the
        fourth whitespace-separated field of each row as the score.
        """
        lines = iter(netchop_output.decode().split("\n"))
        tables = []
        for line in lines:
            is_header = "pos" in line and 'AA' in line and 'score' in line
            if not is_header:
                continue
            current = []
            tables.append(current)
            if "----" not in next(lines):
                raise ValueError("Dashes expected")
            row = next(lines)
            while '-------' not in row:
                current.append(float(row.split()[3]))
                row = next(lines)
        return tables

    def chop_protein(self, seq, pos):
        """
        Split ``seq`` after every position whose marker in ``pos`` equals 1.

        The residue at a marked position ends its fragment; any residues left
        after the last marked position form the final fragment.
        """
        fragments = []
        start = 0
        for i, marker in enumerate(pos):
            if marker == 1:
                fragments.append(seq[start:i + 1])
                start = i + 1
        # Trailing residues after the last cut
        if start < len(seq):
            fragments.append(seq[start:])
        return fragments

    def generate_cut_sequences(self, char_sequence, cut_probabilities):
        """
        Enumerate substrings whose interior cut probabilities sum to less than 1.

        For every substring char_sequence[i:j], the interior is
        cut_probabilities[i + 1:j - 1]. The substring is kept when the
        interior sum is below 1, with abundance
        cut_probabilities[i] * cut_probabilities[j - 1] - interior sum.

        :param char_sequence: A string representing the sequence of characters.
        :param cut_probabilities: A list of probabilities for each position in the sequence.
        :return: A DataFrame built from one {'seq', 'abundance'} record per kept substring.
        :raises ValueError: If the two inputs differ in length.
        """
        n = len(char_sequence)
        if n != len(cut_probabilities):
            raise ValueError("Character sequence and cut probabilities must have the same length.")

        records = []
        for i in range(n):
            for j in range(i + 1, n + 1):
                interior = sum(cut_probabilities[i + 1:j - 1])
                if not interior < 1:
                    continue
                records.append({
                    'seq': char_sequence[i:j],
                    'abundance': cut_probabilities[i] * cut_probabilities[j - 1] - interior,
                })
        return pd.DataFrame(records)
102
+
103
+
104
def run_mhc(sequences, temp_dir='/tamir2/nicolaslynn/temp'):
    """
    Run netMHCpan on a list of peptides and return its result table.

    The peptides are written one per line to a temporary ``.pep`` file, which
    is passed to ``netMHCpan -p -BA``. The second and third dash-delimited
    sections of the program's stdout are joined, runs of spaces are turned
    into commas, and the result is parsed as CSV.

    Parameters
    -----------
    sequences : list of string
        Peptide sequences, one per output line of the input file
    temp_dir : str
        Directory in which the temporary input file is created. Defaults to
        the previously hard-coded location.

    Returns
    -----------
    pandas.DataFrame
        Parsed table with the 'Unnamed: 0' column dropped.

    Raises
    -----------
    subprocess.CalledProcessError
        If netMHCpan exits with a non-zero status (logged, then re-raised).
    """
    with tempfile.NamedTemporaryFile(dir=temp_dir, suffix=".pep", mode="w") as input_fd:
        for sequence in sequences:
            input_fd.write(sequence)
            input_fd.write("\n")
        # Make sure the data is on disk before the external tool reads it
        input_fd.flush()
        try:
            out = subprocess.check_output(
                ["netMHCpan", "-p", "-BA", str(input_fd.name)])
        except subprocess.CalledProcessError as e:
            # The message previously named netChop although netMHCpan is the tool invoked here
            logging.error("Error calling netMHCpan: %s:\n%s" % (e, e.output))
            raise

    out = out.decode('utf-8')
    out = out.split(
        '\n---------------------------------------------------------------------------------------------------------------------------\n')
    # Section 1 and section 2 are joined to form the CSV text
    # presumably header row and data rows -- TODO confirm against netMHCpan output
    out = out[1] + '\n' + out[2]
    out = re.sub(r'[ ]+', ',', out)
    # Drop the comma produced by leading spaces at the start of each line
    out = out.replace('\n,', '\n')
    return pd.read_csv(StringIO(out)).drop(columns=['Unnamed: 0'])
123
+
124
+
125
+