geney 1.3.79__py2.py3-none-any.whl → 1.4.0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of geney might be problematic. Click here for more details.
- geney/Gene.py +9 -10
- geney/Oncosplice.py +400 -0
- geney/SpliceSimulator.py +407 -0
- geney/Transcript.py +54 -56
- geney/__init__.py +47 -19
- geney/_config_setup.py +16 -0
- geney/_graphic_utils.py +269 -0
- geney/_gtex_utils.py +68 -0
- geney/_immune_utils.py +125 -0
- geney/{oncosplice.py → _oncosplice.py} +199 -156
- geney/_splicing_utils.py +693 -0
- geney/_survival_utils.py +143 -0
- geney/_tcga_utils.py +405 -0
- geney/_tis_utils.py +172 -0
- geney/immune_utils.py +1 -1
- geney/pipelines.py +66 -0
- geney/power_utils.py +1 -1
- geney/utils/Fasta_segment.py +260 -0
- geney/utils/SeqMats.py +423 -0
- geney/utils/TranscriptLibrary.py +55 -0
- geney/utils/__init__.py +20 -0
- geney/utils/mutation_utils.py +104 -0
- geney/utils/pangolin_utils.py +173 -0
- geney/utils/spliceai_utils.py +123 -0
- geney/utils/splicing_utils.py +525 -0
- geney/utils/utils.py +89 -0
- {geney-1.3.79.dist-info → geney-1.4.0.dist-info}/METADATA +1 -1
- geney-1.4.0.dist-info/RECORD +51 -0
- {geney-1.3.79.dist-info → geney-1.4.0.dist-info}/WHEEL +1 -1
- geney-1.3.79.dist-info/RECORD +0 -31
- {geney-1.3.79.dist-info → geney-1.4.0.dist-info}/top_level.txt +0 -0
geney/_graphic_utils.py
ADDED
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
import matplotlib.pyplot as plt
|
|
2
|
+
from matplotlib.patches import Rectangle
|
|
3
|
+
import seaborn as sns
|
|
4
|
+
from collections import namedtuple
|
|
5
|
+
from geney.utils import unload_pickle, contains, unload_json, dump_json
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
### Graphical Stuff
|
|
9
|
+
def create_figure_story(epistasis, to_file=None):
    """
    Draw one conservation figure per annotated oncosplice result row.

    Parameters:
    epistasis (str): Mutation identifier; the text before the first ':' is used as the gene label.
    to_file (str, optional): Path handed to plot_conservation for saving. The same
        path is used for every row, so a later figure overwrites an earlier one.
    """
    # NOTE(review): `oncosplice` is not imported by this module's visible import
    # block; it has to be brought into scope before this function can run.
    g = epistasis.split(':')[0]
    out = oncosplice(epistasis, annotate=True)
    out = out[out.cons_available == 1]

    for _, row in out.iterrows():
        # Find the longest entry in row.deletions and the key it is stored under.
        pos, max_length = 0, 0
        for start, deleted in row.deletions.items():
            if len(deleted) > max_length:
                pos, max_length = start, len(deleted)

        # Only deletions longer than 5 are highlighted as a region.
        del_reg = [pos, pos + max_length] if max_length > 5 else None
        mutation_loc = None if row.oncosplice_score == 0 else pos

        # gene_name is a required argument of plot_conservation and was
        # previously omitted, which made every call fail with TypeError.
        # NOTE(review): the gene label is passed here; confirm that this is the
        # value unload_pickle() expects.
        plot_conservation(gene_name=g,
                          tid=row.transcript_id,
                          gene=f'{g}, {row.transcript_id}.{row.isoform}',
                          mutation_loc=mutation_loc,
                          target_region=del_reg, mut_name='Epistasis',
                          domain_annotations=get_annotations(row.transcript_id, 300),
                          to_file=to_file)


def plot_conservation(gene_name, tid, gene='', mutation_loc=None, target_region=None, mut_name='Mutation',
                      domain_annotations=None, to_file=None):
    """
    Plots conservation vectors with protein domain visualization and Rate4Site scores.

    Parameters:
    gene_name: Value passed to unload_pickle() to load the conservation data.
    tid (str): Transcript identifier.
    gene (str): Gene name shown in the x-axis label.
    mutation_loc (int): Position of the mutation.
    target_region (tuple): Start and end positions of the target region.
    mut_name (str): Name of the mutation.
    domain_annotations (list): List of tuples for domain annotations (start, end, label).
        Defaults to no annotations.
    to_file (str, optional): When given, the figure is saved to this path before it is shown.

    Raises:
    ValueError: If the conservation vector is missing or empty.
    """
    # A fresh list per call instead of a shared mutable default.
    if domain_annotations is None:
        domain_annotations = []

    # Access conservation data
    # NOTE(review): the lookup uses the literal key 'tid', not the `tid`
    # argument (which is otherwise unused) -- confirm which one is intended.
    _, cons_vec = unload_pickle(gene_name)['tid']['cons_vector']

    # len() works for lists and arrays alike; truth-testing an array is ambiguous.
    if cons_vec is None or len(cons_vec) == 0:
        raise ValueError("The conservation vector is empty.")

    sns.set_theme(style="white")
    fig, ax = plt.subplots(figsize=(max(15, len(cons_vec)/10), 3))  # Dynamic figure size

    # Plotting the conservation vectors in the main plot
    plot_conservation_vectors(ax, cons_vec)

    # Setting up primary axis for the main plot
    setup_primary_axis(ax, gene, len(cons_vec))

    # Create a separate axes for protein domain visualization
    domain_ax = create_domain_axes(fig, len(cons_vec))

    # Draw protein domains
    plot_protein_domains(domain_ax, domain_annotations, len(cons_vec))

    # Plotting Rate4Site scores on secondary y-axis
    plot_rate4site_scores(ax, cons_vec)

    # Plotting mutation location and target region, if provided
    plot_mutation_and_target_region(ax, mutation_loc, target_region, mut_name)

    # Save before showing: the figure may no longer be available after show().
    if to_file is not None:
        fig.savefig(to_file, bbox_inches='tight')

    plt.show()
|
|
81
|
+
|
|
82
|
+
def plot_conservation_vectors(ax, cons_vec):
    """Plot two transformed conservation curves, each scaled by its own maximum."""
    # (window, line colour, legend label) for each curve, drawn in this order:
    # the larger window first, then the smaller one.
    curve_specs = (
        (76, 'b', 'Estimated Functional Residues'),
        (6, 'k', 'Estimated Functional Domains'),
    )
    for window, colour, legend in curve_specs:
        curve = transform_conservation_vector(cons_vec, window)
        curve /= max(curve)
        ax.plot(list(range(len(curve))), curve, c=colour, label=legend)
|
|
91
|
+
|
|
92
|
+
def setup_primary_axis(ax, gene, length):
    """Set labels, limits and visible spines of the main plot axis."""
    # x spans the sequence positions, y is fixed to [0, 1].
    ax.set_xlim(0, length)
    ax.set_ylim(0, 1)

    label_style = {'weight': 'bold'}
    ax.set_xlabel(f'AA Position - {gene}', **label_style)
    ax.set_ylabel('Relative Importance', **label_style)
    ax.tick_params(axis='y')

    # Hide the right and top spines of this axis.
    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)
|
|
101
|
+
|
|
102
|
+
def create_domain_axes(fig, length):
    """Add and return a thin, frameless axis used to draw protein domains."""
    # Axes rectangle in figure coordinates: [left, bottom, width, height].
    left, bottom, width, height = 0.125, 0.95, 0.775, 0.06
    strip = fig.add_axes([left, bottom, width, height])
    strip.set_xlim(0, length)

    # No ticks and no frame.
    strip.set_xticks([])
    strip.set_yticks([])
    for edge in strip.spines.values():
        edge.set_visible(False)
    return strip
|
|
112
|
+
|
|
113
|
+
def plot_protein_domains(ax, domain_annotations, length):
    """Draw a gray full-length bar and one orange, labelled box per domain."""
    bar_height = 0.9

    # Full-length bar underneath the domain boxes.
    backbone = Rectangle((0, 0), length, bar_height, facecolor='lightgray', edgecolor='none')
    ax.add_patch(backbone)

    for start, end, label in domain_annotations:
        box = Rectangle((start, 0), end - start, bar_height,
                        facecolor='orange', edgecolor='none', alpha=0.5)
        ax.add_patch(box)
        # The label is centred on the domain at y=2.1, above the bar.
        midpoint = (start + end) / 2
        ax.text(midpoint, 2.1, label, ha='center', va='center', color='black', size=8)
|
|
120
|
+
|
|
121
|
+
def plot_rate4site_scores(ax, cons_vec):
    """
    Plots Rate4Site scores on a secondary y-axis.

    Parameters:
    ax: Main plot axis; a twin axis sharing its x-axis is created from it.
    cons_vec: Sequence of numeric scores, one per position.
    """
    # numpy is not imported at module level, so the original `np` reference
    # raised NameError; import it here to keep the function self-contained.
    import numpy as np

    ax2 = ax.twinx()
    c = np.array(cons_vec)
    # Shift by |min| and then divide by the new maximum.
    c = c + abs(c.min())
    c = c / c.max()
    ax2.set_ylim(c.min(), c.max() * 1.1)  # 10% headroom above the largest point
    ax2.scatter(list(range(len(c))), c, color='green', label='Rate4Site Scores', alpha=0.4)
    ax2.set_ylabel('Rate4Site Normalized', color='green', weight='bold')
    ax2.tick_params(axis='y', labelcolor='green')
    ax2.spines['right'].set_visible(True)
    ax2.spines['top'].set_visible(False)
|
|
133
|
+
|
|
134
|
+
def plot_mutation_and_target_region(ax, mutation_loc, target_region, mut_name):
    """Mark the mutation with a dashed line and shade the target region, when given."""
    label_y = 1.04  # y position shared by both text labels

    if mutation_loc is not None:
        ax.axvline(x=mutation_loc, ymax=1, color='r', linestyle='--', alpha=0.7)
        ax.text(mutation_loc, label_y, mut_name, color='r', weight='bold', ha='center')

    if target_region is None:
        return

    region_start = target_region[0]
    region_width = target_region[1] - target_region[0]
    ax.add_patch(Rectangle((region_start, 0), region_width, 1, alpha=0.25, facecolor='gray'))
    # The caption is centred horizontally on the shaded region.
    center_loc = region_start + 0.5 * region_width
    ax.text(center_loc, label_y, 'Deleted Region', ha='center', va='center', color='gray', weight='bold')
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def merge_overlapping_regions(df):
    """
    Collapse overlapping rows of a region table into single regions.

    Parameters:
    df (pd.DataFrame): DataFrame with columns 'start', 'end', 'name'

    Returns:
    List: Region namedtuples (start, end, combined_name). The combined name is
    the comma-joined member names (underscores replaced by spaces), passed
    through split_text with width 35.
    """
    if df.empty:
        return []

    Region = namedtuple('Region', ['start', 'end', 'combined_name'])

    # Each group is a mutable [start, end, names] triple; rows are visited in
    # ascending 'start' order so only the most recent group can overlap.
    groups = []
    for _, record in df.sort_values(by='start').iterrows():
        begin, finish, label = record['start'], record['end'], record['name'].replace('_', ' ')
        if groups and begin <= groups[-1][1]:
            latest = groups[-1]
            latest[1] = max(latest[1], finish)
            latest[2] = latest[2] + [label]
        else:
            groups.append([begin, finish, [label]])

    return [Region(begin, finish, split_text(', '.join(labels), 35))
            for begin, finish, labels in groups]
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def split_text(text, width):
    """
    Splits a text into lines with a maximum specified width.

    The text is cut every `width` characters regardless of word boundaries.
    Newline characters already in `text` are not matched by the pattern and
    are therefore dropped from the result.

    Parameters:
    text (str): The text to be split.
    width (int): Maximum width of each line.

    Returns:
    str: The text split into lines of specified width.
    """
    # `re` is not imported at module level, so the original raised NameError;
    # import it locally to keep the function self-contained.
    import re

    lines = re.findall('.{1,' + str(width) + '}', text)
    return '\n'.join(lines)
|
|
195
|
+
|
|
196
|
+
def get_annotations(target_gene, w=500):
    """
    Return merged domain annotations for one transcript.

    Parameters:
    target_gene (str): Identifier compared against the 'Transcript stable ID' column.
    w (int): Only annotations whose `length` is below this value are kept.

    Returns:
    list: Output of merge_overlapping_regions, or an empty list when no
    annotation table is loaded.
    """
    import warnings

    # NOTE(review): placeholder -- the annotation table is never loaded here.
    # Indexing the empty dict raised KeyError on every call.
    PROTEIN_ANNOTATIONS = {}
    if len(PROTEIN_ANNOTATIONS) == 0:
        warnings.warn("get_annotations: no protein annotation table is loaded; returning no annotations.")
        return []

    # The ID column is compared with the identifier itself; the original
    # compared it with PROTEIN_ANNOTATIONS[target_gene], i.e. a column lookup
    # by identifier. NOTE(review): confirm against the real table schema.
    matches = (PROTEIN_ANNOTATIONS['Transcript stable ID'] == target_gene) & (PROTEIN_ANNOTATIONS.length < w)
    temp = PROTEIN_ANNOTATIONS[matches].drop_duplicates(subset=['Interpro Short Description'], keep='first')
    return merge_overlapping_regions(temp)
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
# def plot_conservation(tid, gene='', mutation_loc=None, target_region=None, mut_name='Mutation', domain_annotations=[], to_file=None):
|
|
203
|
+
# _, cons_vec = access_conservation_data(tid)
|
|
204
|
+
#
|
|
205
|
+
# sns.set_theme(style="white")
|
|
206
|
+
# fig, ax = plt.subplots(figsize=(15, 3)) # Adjusted figure size for better layout
|
|
207
|
+
#
|
|
208
|
+
# # Plotting the conservation vectors in the main plot
|
|
209
|
+
# temp = transform_conservation_vector(cons_vec, 76)
|
|
210
|
+
# temp /= max(temp)
|
|
211
|
+
# ax.plot(list(range(len(temp))), temp, c='b', label='Estimated Functional Residues')
|
|
212
|
+
# temp = transform_conservation_vector(cons_vec, 6)
|
|
213
|
+
# temp /= max(temp)
|
|
214
|
+
# ax.plot(list(range(len(temp))), temp, c='k', label='Estimated Functional Domains')
|
|
215
|
+
#
|
|
216
|
+
# # Setting up primary axis for the main plot
|
|
217
|
+
# ax.set_xlabel(f'AA Position - {gene}', weight='bold')
|
|
218
|
+
# ax.set_xlim(0, len(cons_vec))
|
|
219
|
+
# ax.set_ylim(0, 1) # Set y-limit to end at 1
|
|
220
|
+
# ax.set_ylabel('Relative Importance', weight='bold')
|
|
221
|
+
# ax.tick_params(axis='y')
|
|
222
|
+
# ax.spines['right'].set_visible(False)
|
|
223
|
+
# ax.spines['top'].set_visible(False)
|
|
224
|
+
#
|
|
225
|
+
# # Create a separate axes for protein domain visualization above the main plot
|
|
226
|
+
# domain_ax_height = 0.06 # Adjust for thinner protein diagram
|
|
227
|
+
# domain_ax = fig.add_axes([0.125, 0.95, 0.775, domain_ax_height]) # Position higher above the main plot
|
|
228
|
+
# domain_ax.set_xlim(0, len(cons_vec))
|
|
229
|
+
# domain_ax.set_xticks([])
|
|
230
|
+
# domain_ax.set_yticks([])
|
|
231
|
+
# domain_ax.spines['top'].set_visible(False)
|
|
232
|
+
# domain_ax.spines['right'].set_visible(False)
|
|
233
|
+
# domain_ax.spines['left'].set_visible(False)
|
|
234
|
+
# domain_ax.spines['bottom'].set_visible(False)
|
|
235
|
+
#
|
|
236
|
+
# # Draw the full-length protein as a base rectangle
|
|
237
|
+
# domain_ax.add_patch(Rectangle((0, 0), len(cons_vec), 0.9, facecolor='lightgray', edgecolor='none'))
|
|
238
|
+
#
|
|
239
|
+
# # Overlay domain annotations
|
|
240
|
+
# for domain in domain_annotations:
|
|
241
|
+
# start, end, label = domain
|
|
242
|
+
# domain_ax.add_patch(Rectangle((start, 0), end - start, 0.9, facecolor='orange', edgecolor='none', alpha=0.5))
|
|
243
|
+
# domain_ax.text((start + end) / 2, 2.1, label, ha='center', va='center', color='black', size=8)
|
|
244
|
+
#
|
|
245
|
+
# # Plotting Rate4Site scores on secondary y-axis
|
|
246
|
+
# ax2 = ax.twinx()
|
|
247
|
+
# c = np.array(cons_vec)
|
|
248
|
+
# c = c + abs(min(c))
|
|
249
|
+
# c = c/max(c)
|
|
250
|
+
# ax2.set_ylim(min(c), max(c)*1.1)
|
|
251
|
+
# ax2.scatter(list(range(len(c))), c, color='green', label='Rate4Site Scores', alpha=0.4)
|
|
252
|
+
# ax2.set_ylabel('Rate4Site Normalized', color='green', weight='bold')
|
|
253
|
+
# ax2.tick_params(axis='y', labelcolor='green')
|
|
254
|
+
# ax2.spines['right'].set_visible(True)
|
|
255
|
+
# ax2.spines['top'].set_visible(False)
|
|
256
|
+
#
|
|
257
|
+
# # Plotting mutation location and target region
|
|
258
|
+
# if mutation_loc is not None:
|
|
259
|
+
# ax.axvline(x=mutation_loc, ymax=1,color='r', linestyle='--', alpha=0.7)
|
|
260
|
+
# ax.text(mutation_loc, 1.04, mut_name, color='r', weight='bold', ha='center')
|
|
261
|
+
#
|
|
262
|
+
# if target_region is not None:
|
|
263
|
+
# ax.add_patch(Rectangle((target_region[0], 0), target_region[1] - target_region[0], 1, alpha=0.25, facecolor='gray'))
|
|
264
|
+
# center_loc = target_region[0] + 0.5 * (target_region[1] - target_region[0])
|
|
265
|
+
# ax.text(center_loc, 1.04, 'Deleted Region', ha='center', va='center', color='gray', weight='bold')
|
|
266
|
+
#
|
|
267
|
+
# plt.show()
|
|
268
|
+
#
|
|
269
|
+
|
geney/_gtex_utils.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from tqdm import tqdm
|
|
3
|
+
|
|
4
|
+
# Let pandas print up to 999 rows.
pd.options.display.max_rows = 999

# Sample attributes: build a SAMPID -> SMTS lookup from the annotation table.
metadata = pd.read_csv('GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt', sep='\t')
metadata_tissue_mapper = (
    metadata[['SAMPID', 'SMTS']]
    .drop_duplicates()
    .set_index('SAMPID')['SMTS']
    .to_dict()
)

# Empty frame kept as a module-level name; nothing below writes to it.
combined_df = pd.DataFrame()

# Per-chunk averaged tables are collected here and concatenated at the end.
tpm_mean = []
# Stream the transcript TPM table 1000 rows at a time.
for chunk in tqdm(pd.read_csv('GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct',
                              sep='\t', header=2, chunksize=1000)):
    # Index by transcript/gene and relabel each sample column via the mapper.
    chunk = chunk.set_index(['transcript_id', 'gene_id']).rename(columns=metadata_tissue_mapper)
    # Average all columns that now share a label.
    tpm_mean.append(chunk.T.groupby(by=chunk.columns).mean().T)

# Stack the per-chunk averages into a single table.
tpm_mean = pd.concat(tpm_mean)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# Tissue name -> project code. A tissue that maps to several projects holds a list.
cancer_projects = dict([
    ("Adrenal Gland", "ACC"),
    ("Bladder", "BLCA"),
    ("Brain", ["GBM", "LGG"]),  # Note: Brain maps to two projects
    ("Breast", "BRCA"),
    ("Colon", "COAD"),
    ("Esophagus", "ESCA"),
    ("Kidney", ["KICH", "KIRC", "KIRP"]),  # Note: Kidney maps to three projects
    ("Liver", "LIHC"),
    ("Lung", "LUNG"),
    ("Ovary", "OV"),
    ("Pancreas", "PAAD"),
    ("Prostate", "PRAD"),
    ("Skin", "SKCM"),
    ("Stomach", "STAD"),
    ("Testis", "TGCT"),
    ("Uterus", "UCS"),
])

# Project code -> tissue name: the inverse of cancer_projects, with list
# entries expanded to one key per project (same order as the forward map).
tissue_projects = {
    project: tissue
    for tissue, projects in cancer_projects.items()
    for project in ([projects] if isinstance(projects, str) else projects)
}
|
|
68
|
+
|
geney/_immune_utils.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
import logging
|
|
3
|
+
import tempfile
|
|
4
|
+
from geney import _config_setup
|
|
5
|
+
import re
|
|
6
|
+
from io import StringIO
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class NetChop(object):
    """
    Wrapper around netChop tool. Assumes netChop is in your PATH.
    """

    def predict_epitopes(self, sequences, threshold=0.5, min_len=8):
        """
        Run netChop on each sequence and return the fragments obtained by
        cutting after every position whose score exceeds `threshold`.

        Parameters
        -----------
        sequences : list of string
            Amino acid sequences to predict cleavage for
        threshold : float
            A position counts as a cut site when its score is strictly greater
            than this value.
        min_len : int
            Only fragments strictly longer than this are kept.

        Returns
        -----------
        list of list of string

        The i'th list corresponds to the i'th sequence and holds the retained
        fragments of that sequence.
        """
        # NOTE(review): `config_setup` is not defined by this module's visible
        # imports (only `_config_setup` is imported) -- confirm where the
        # NETCHOP directory setting is supposed to come from.
        with tempfile.NamedTemporaryFile(dir=config_setup['NETCHOP'], suffix=".fsa", mode="w") as input_fd:
            for i, sequence in enumerate(sequences):
                input_fd.write("> %d\n" % i)
                input_fd.write(sequence)
                input_fd.write("\n")
            # Flush so the external process reads the complete file.
            input_fd.flush()
            try:
                output = subprocess.check_output(["netchop", str(input_fd.name)])
            except subprocess.CalledProcessError as e:
                logging.error("Error calling netChop: %s:\n%s" % (e, e.output))
                raise

        parsed = self.parse_netchop(output)

        # One score table per sequence, one score per residue.
        assert len(parsed) == len(sequences), \
            "Expected %d results but got %d" % (
                len(sequences), len(parsed))
        assert [len(x) for x in parsed] == [len(x) for x in sequences]

        filtered_proteosomes = []
        for scores, seq in zip(parsed, sequences):
            proteosome = self.chop_protein(seq, [s > threshold for s in scores])
            filtered_proteosomes.append([e for e in proteosome if len(e) > min_len])
        return filtered_proteosomes

    @staticmethod
    def parse_netchop(netchop_output):
        """
        Parse netChop stdout.

        Parameters
        -----------
        netchop_output : bytes
            Raw output of the netChop process.

        Returns
        -----------
        list of list of float
            One list per score table found, holding the fourth
            whitespace-separated field of every row of that table.

        Raises
        -----------
        ValueError
            If a table header is not followed by a dashed line, or the output
            ends before a table's closing dashed line.
        """
        line_iterator = iter(netchop_output.decode().split("\n"))
        scores = []
        try:
            for line in line_iterator:
                # A table starts at the header line naming its columns.
                if not ("pos" in line and 'AA' in line and 'score' in line):
                    continue
                table = []
                scores.append(table)
                if "----" not in next(line_iterator):
                    raise ValueError("Dashes expected")
                line = next(line_iterator)
                # Rows run until the closing dashed line.
                while '-------' not in line:
                    table.append(float(line.split()[3]))
                    line = next(line_iterator)
        except StopIteration:
            # Previously a bare StopIteration escaped on truncated output.
            raise ValueError("netChop output ended inside a score table") from None
        return scores

    def chop_protein(self, seq, pos):
        """
        Split `seq` after every index whose marker in `pos` equals 1 (or True).

        Returns the list of consecutive pieces; a trailing piece after the last
        cut is included when it is non-empty.
        """
        subsequences = []
        start = 0
        for i, marker in enumerate(pos):
            if marker == 1:
                subsequences.append(seq[start:i + 1])
                start = i + 1
        # Check if the last part needs to be added
        if start < len(seq):
            subsequences.append(seq[start:])
        return subsequences

    def generate_cut_sequences(self, char_sequence, cut_probabilities):
        """
        Enumerate substrings of `char_sequence` together with an abundance value.

        A substring char_sequence[i:j] is kept when the probabilities strictly
        between its two end positions sum to less than 1. Its abundance is
        cut_probabilities[i] * cut_probabilities[j - 1] minus that interior sum.

        :param char_sequence: A string representing the sequence of characters.
        :param cut_probabilities: A list of probabilities for each position in the sequence.
        :return: A pandas DataFrame with one row per kept substring and the
                 columns 'seq' and 'abundance'.
        :raises ValueError: If the two inputs differ in length.
        """
        if len(char_sequence) != len(cut_probabilities):
            raise ValueError("Character sequence and cut probabilities must have the same length.")

        cut_sequences = []
        # Generate all possible cuts
        for i in range(len(char_sequence)):
            for j in range(i + 1, len(char_sequence) + 1):
                # Sum of the cut probabilities strictly between the two ends,
                # computed once and reused for both the filter and the value.
                interior = sum(cut_probabilities[i + 1:j - 1])
                if interior < 1:
                    abundance_value = cut_probabilities[i] * cut_probabilities[j - 1] - interior
                    cut_sequences.append({'seq': char_sequence[i:j], 'abundance': abundance_value})
        return pd.DataFrame(cut_sequences)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def run_mhc(sequences, temp_dir=None):
    """
    Run netMHCpan on a list of peptides and return its output table.

    Parameters
    -----------
    sequences : list of string
        Peptides, written one per line to a temporary ".pep" file.
    temp_dir : str, optional
        Directory for the temporary input file. When omitted, the original
        hard-coded directory is used if it exists on this machine; otherwise
        the system default temporary directory is used.

    Returns
    -----------
    pandas.DataFrame
        Table parsed from netMHCpan's stdout.

    Raises
    -----------
    subprocess.CalledProcessError
        If netMHCpan exits with a non-zero status (logged, then re-raised).
    """
    import os

    # The directory was previously hard-coded, so the call failed on any
    # machine where that path does not exist.
    legacy_dir = '/tamir2/nicolaslynn/temp'
    if temp_dir is None and os.path.isdir(legacy_dir):
        temp_dir = legacy_dir

    with tempfile.NamedTemporaryFile(dir=temp_dir, suffix=".pep", mode="w") as input_fd:
        for sequence in sequences:
            input_fd.write(sequence)
            input_fd.write("\n")
        # Flush so the external process reads the complete file.
        input_fd.flush()
        try:
            out = subprocess.check_output(
                ["netMHCpan", "-p", "-BA", str(input_fd.name)])
        except subprocess.CalledProcessError as e:
            # The message previously named netChop although netMHCpan is the tool called.
            logging.error("Error calling netMHCpan: %s:\n%s" % (e, e.output))
            raise

    out = out.decode('utf-8')
    # The output is cut at its dashed separator lines; sections 1 and 2 are
    # kept (presumably the column header and the result rows -- TODO confirm).
    out = out.split(
        '\n---------------------------------------------------------------------------------------------------------------------------\n')
    out = out[1] + '\n' + out[2]
    # Turn the space-aligned columns into CSV and drop the comma left at each line start.
    out = re.sub(r'[ ]+', ',', out)
    out = out.replace('\n,', '\n')
    return pd.read_csv(StringIO(out)).drop(columns=['Unnamed: 0'])
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
|