geney 1.2.20__py2.py3-none-any.whl → 1.2.22__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of geney might be problematic. Click here for more details.

Files changed (39) hide show
  1. geney/oncosplice.py +1 -1
  2. {geney-1.2.20.dist-info → geney-1.2.22.dist-info}/METADATA +1 -1
  3. geney-1.2.22.dist-info/RECORD +19 -0
  4. geney/Gene.py +0 -258
  5. geney/analyzers/__init__.py +0 -0
  6. geney/analyzers/benchmark_clinvar.py +0 -158
  7. geney/analyzers/characterize_epistasis.py +0 -15
  8. geney/analyzers/compare_sets.py +0 -91
  9. geney/analyzers/group_comparison.py +0 -81
  10. geney/analyzers/survival.py +0 -144
  11. geney/analyzers/tcga_annotations.py +0 -194
  12. geney/analyzers/visualize_protein_conservation.py +0 -398
  13. geney/benchmark_clinvar.py +0 -158
  14. geney/compare_sets.py +0 -91
  15. geney/data_parsers/__init__.py +0 -0
  16. geney/data_parsers/gtex.py +0 -68
  17. geney/gtex.py +0 -68
  18. geney/immunotherapy/__init__.py +0 -0
  19. geney/immunotherapy/netchop.py +0 -78
  20. geney/mutations/__init__.py +0 -0
  21. geney/mutations/variant_utils.py +0 -125
  22. geney/netchop.py +0 -79
  23. geney/oncosplice/__init__.py +0 -0
  24. geney/oncosplice_mouse.py +0 -277
  25. geney/oncosplice_pipeline.py +0 -1588
  26. geney/performance_utils.py +0 -138
  27. geney/pipelines/__init__.py +0 -0
  28. geney/pipelines/dask_utils.py +0 -153
  29. geney/splicing/__init__.py +0 -2
  30. geney/splicing/spliceai_utils.py +0 -253
  31. geney/splicing/splicing_isoform_utils.py +0 -0
  32. geney/splicing/splicing_utils.py +0 -366
  33. geney/survival.py +0 -124
  34. geney/tcga_annotations.py +0 -352
  35. geney/translation_termination/__init__.py +0 -0
  36. geney/translation_termination/tts_utils.py +0 -0
  37. geney-1.2.20.dist-info/RECORD +0 -52
  38. {geney-1.2.20.dist-info → geney-1.2.22.dist-info}/WHEEL +0 -0
  39. {geney-1.2.20.dist-info → geney-1.2.22.dist-info}/top_level.txt +0 -0
geney/oncosplice_mouse.py DELETED
@@ -1,277 +0,0 @@
1
- from geney.oncosplice import *
2
- from copy import deepcopy
3
- import pandas as pd
4
- import numpy as np
5
- from geney.Fasta_segment import Fasta_segment
6
- import torch
7
-
8
# Filesystem locations used by the mouse oncosplice pipeline.
# Values are plain strings; wrap them in pathlib.Path before joining with "/".
config_setup = {
    "BASE": "/tamir2/nicolaslynntest/experimental/oncosplice_mouse",
    "ONCOSPLICE": "/tamir2/nicolaslynntest/experimental/oncosplice_mouse/oncosplice",
    "CHROM_SOURCE": "/tamir2/nicolaslynntest/experimental/oncosplice_mouse/chromosomes",
    "MRNA_PATH": "/tamir2/nicolaslynntest/experimental/oncosplice_mouse/annotations",
    "MISSPLICING_PATH": "/tamir2/nicolaslynntest/experimental/oncosplice_mouse/missplicing",
}
13
-
14
-
15
- from pkg_resources import resource_filename
16
- from pangolin.model import *
17
-
18
-
19
# Row lookup table for one-hot encoding: row 0 is all zeros, rows 1-4 each
# have a single 1 in columns 0-3 respectively.
IN_MAP = np.asarray([
    [0, 0, 0, 0],
    [1, 0, 0, 0],
    [0, 1, 0, 0],
    [0, 0, 1, 0],
    [0, 0, 0, 1],
])
# Maps a model number to the output row read from that model's prediction.
INDEX_MAP = {0: 1, 1: 2, 2: 4, 3: 5, 4: 7, 5: 8, 6: 10, 7: 11}
model_nums = [1, 3, 5, 7]

# Load five weight files per model number, in model_nums order, so that
# models[5 * k:5 * k + 5] holds the networks for model_nums[k].
models = []
for i in model_nums:
    for j in range(1, 6):
        weights_path = resource_filename("pangolin", "models/final.%s.%s.3" % (j, i))
        model = Pangolin(L, W, AR)
        if torch.cuda.is_available():
            model.cuda()
            weights = torch.load(weights_path)
        else:
            # No GPU available: remap the stored tensors onto the CPU.
            weights = torch.load(weights_path, map_location=torch.device('cpu'))
        model.load_state_dict(weights)
        model.eval()
        models.append(model)
40
-
41
def one_hot_encode(seq, strand='+'):
    """One-hot encode a nucleotide string through the module-level IN_MAP table.

    The sequence is upper-cased and A, C, G, T, N are mapped to the integer
    codes 1, 2, 3, 4, 0, which are then used as row indices into IN_MAP.

    :param seq: nucleotide string; characters that are not A/C/G/T/N (in any
        case) or digits make the integer conversion raise ValueError
    :param strand: '+' encodes the sequence as given; '-' encodes its reverse
        complement
    :return: array with one IN_MAP row per input character
    :raises ValueError: if strand is neither '+' nor '-'
    """
    if strand not in ('+', '-'):
        # Previously an unknown strand left `seq` as a str and failed later
        # with an unrelated AttributeError on `.astype`.
        raise ValueError(f"strand must be '+' or '-', got {strand!r}")

    # Single pass instead of five chained str.replace calls.
    digits = seq.upper().translate(str.maketrans('ACGTN', '12340'))
    if strand == '-':
        digits = digits[::-1]

    codes = np.asarray(list(map(int, digits)))
    if strand == '-':
        # Complement on the integer codes: 1<->4 (A<->T), 2<->3 (C<->G), 0 stays 0.
        codes = (5 - codes) % 5
    return IN_MAP[codes.astype('int8')]
50
-
51
-
52
def run_pangolin_seq(seq):
    """Score a nucleotide sequence with every loaded Pangolin model.

    The sequence is one-hot encoded on the '+' strand, transposed, given a
    leading batch axis and moved to the GPU when one is available. For each
    entry of model_nums the five corresponding networks in `models` are run
    and the output row selected by INDEX_MAP is collected.

    :param seq: nucleotide string
    :return: element-wise mean over ALL collected score vectors (every model
        number and every replicate together, not one mean per model number)
    """
    encoded = one_hot_encode(seq, '+').T
    batch = torch.from_numpy(np.expand_dims(encoded, axis=0)).float()

    if torch.cuda.is_available():
        batch = batch.to(torch.device("cuda"))

    collected = []
    for group, model_num in enumerate(model_nums):
        channel = INDEX_MAP[model_num]
        first = 5 * group
        for net in models[first:first + 5]:
            with torch.no_grad():
                prediction = net(batch)
            collected.append(prediction[0][channel, :].cpu().numpy())
    return np.mean(collected, axis=0)
67
-
68
- # Missplicing Detection
69
def find_ss_changes(ref_dct, mut_dct, known_splice_sites, threshold=0.5):
    """Compare reference and mutated splice-site probabilities position by position.

    :param ref_dct: mapping of position -> probability for the reference sequence
    :param mut_dct: mapping of position -> probability for the mutated sequence
    :param known_splice_sites: iterable of positions that are annotated splice sites
    :param threshold: minimum absolute change (mutated - reference) to report
    :return: tuple ``(discovered_pos, deleted_pos)``. ``discovered_pos`` holds the
        positions that are NOT known splice sites whose probability rose by at
        least ``threshold``; ``deleted_pos`` holds the known splice sites whose
        probability dropped by at least ``threshold``. Each value is a dict with
        the rounded ``'delta'`` and the rounded ``'absolute'`` mutated probability.
        A position missing from either mapping is treated as probability 0.
    """
    # Hash the known sites once; `in` on a list/array is a linear scan per key.
    known = frozenset(known_splice_sites)

    deltas = {pos: mut_dct.get(pos, 0) - ref_dct.get(pos, 0)
              for pos in ref_dct.keys() | mut_dct.keys()}

    def _entry(pos, delta):
        # .get() keeps this safe for positions absent from mut_dct (possible for
        # discovered positions when threshold <= 0; previously a KeyError).
        return {'delta': round(float(delta), 3),
                'absolute': round(float(mut_dct.get(pos, 0)), 3)}

    discovered_pos = {pos: _entry(pos, delta) for pos, delta in deltas.items()
                      if delta >= threshold and pos not in known}

    deleted_pos = {pos: _entry(pos, delta) for pos, delta in deltas.items()
                   if -delta >= threshold and pos in known}

    return discovered_pos, deleted_pos
89
-
90
-
91
def run_pangolin_comparison(mutations, transcript_data, sai_mrg_context=5000, min_coverage=2500, sai_threshold=0.5):
    """Predict splice-site changes caused by `mutations` within one transcript.

    Steps, as performed below:
      1. Read the reference segment spanning all variant start/end positions
         plus ``sai_mrg_context + min_coverage`` on each side.
      2. Replace everything outside the transcript bounds with 'N' (index -1).
      3. Apply every variant with ``generate_mut_variant`` to get the mutated
         sequence and its index list.
      4. Score both sequences with ``run_pangolin_seq`` and compare them with
         ``find_ss_changes`` for acceptors and for donors.
      5. Add annotated sites whose position no longer exists in the mutated
         index list as fully lost sites.

    :param mutations: iterable of variants exposing ``positions``, ``variants``
        (items with ``start`` and ``ref``) and ``chrom``
    :param transcript_data: object exposing ``transcript_lower``,
        ``transcript_upper``, ``rev``, ``donors`` and ``acceptors``
    :param sai_mrg_context: number of indices trimmed from each end of the index
        lists before they are paired with the model output
    :param min_coverage: extra flank read on each side in addition to the context
    :param sai_threshold: threshold forwarded to ``find_ss_changes``
    :return: dict with keys ``'missed_acceptors'``, ``'missed_donors'``,
        ``'discovered_acceptors'`` and ``'discovered_donors'``; each maps a
        position (int when integral, float otherwise) to a dict with
        ``'delta'`` and ``'absolute'``
    """
    # Local import: config_setup stores plain strings, and "str / str" raises
    # TypeError, so the directory must be wrapped in Path before joining.
    from pathlib import Path

    # Work on a copy so the caller's mutations.positions is not extended in place.
    positions = list(mutations.positions)
    positions.extend(m.start + len(m.ref) for m in mutations.variants)

    seq_start_pos = min(positions) - sai_mrg_context - min_coverage
    seq_end_pos = max(positions) + sai_mrg_context + min_coverage

    fasta_obj = Fasta_segment()
    ref_seq, ref_indices = fasta_obj.read_segment_endpoints(
        Path(config_setup['CHROM_SOURCE']) / f'chr{mutations.chrom}.fasta',
        seq_start_pos,
        seq_end_pos)

    transcript_start, transcript_end, rev = transcript_data.transcript_lower, transcript_data.transcript_upper, transcript_data.rev

    # Mask everything outside the transcript bounds while keeping the length.
    start_pad = ref_indices.index(transcript_start) if transcript_start in ref_indices else 0
    end_cutoff = ref_indices.index(transcript_end) if transcript_end in ref_indices else len(ref_indices)
    end_pad = len(ref_indices) - end_cutoff
    ref_seq = 'N' * start_pad + ref_seq[start_pad:end_cutoff] + 'N' * end_pad
    ref_indices = [-1] * start_pad + ref_indices[start_pad:end_cutoff] + [-1] * end_pad
    mut_seq, mut_indices = ref_seq, ref_indices

    for mut in mutations:
        mut_seq, mut_indices = generate_mut_variant(seq=mut_seq, indices=mut_indices, mut=mut)

    # Only the indices are trimmed; the length asserts below check that the
    # model output lines up with the trimmed index lists.
    ref_indices = ref_indices[sai_mrg_context:-sai_mrg_context]
    mut_indices = mut_indices[sai_mrg_context:-sai_mrg_context]

    visible_donors = np.intersect1d(transcript_data.donors, ref_indices)
    visible_acceptors = np.intersect1d(transcript_data.acceptors, ref_indices)

    if rev:
        ref_seq = reverse_complement(ref_seq)
        mut_seq = reverse_complement(mut_seq)
        ref_indices = ref_indices[::-1]
        mut_indices = mut_indices[::-1]

    ref_seq_probs = run_pangolin_seq(ref_seq)
    mut_seq_probs = run_pangolin_seq(mut_seq)

    assert len(ref_indices) == len(ref_seq_probs), 'Reference pos not the same'
    assert len(mut_indices) == len(mut_seq_probs), 'Mut pos not the same'

    # Build the position -> score maps once; the same scores are used for both
    # the acceptor and the donor comparison.
    ref_probs = dict(zip(ref_indices, ref_seq_probs))
    mut_probs = dict(zip(mut_indices, mut_seq_probs))

    iap, dap = find_ss_changes(ref_probs, mut_probs, visible_acceptors, threshold=sai_threshold)
    idp, ddp = find_ss_changes(ref_probs, mut_probs, visible_donors, threshold=sai_threshold)

    # Annotated sites whose position is absent from the mutated index list are
    # reported as lost with absolute probability 0.
    surviving = set(mut_indices)
    lost_acceptors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_probs[p]), 3)}
                      for p in visible_acceptors if p not in surviving and p not in dap}
    lost_donors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_probs[p]), 3)}
                   for p in visible_donors if p not in surviving and p not in ddp}
    dap.update(lost_acceptors)
    ddp.update(lost_donors)

    missplicing = {'missed_acceptors': dap, 'missed_donors': ddp, 'discovered_acceptors': iap, 'discovered_donors': idp}
    # Normalise keys: go through float first, then back to int when integral.
    missplicing = {outk: {float(k): v for k, v in outv.items()} for outk, outv in missplicing.items()}
    return {outk: {int(k) if k.is_integer() else k: v for k, v in outv.items()} for outk, outv in missplicing.items()}
159
-
160
-
161
class PredictPangolin:
    """Missplicing events predicted for one mutation on one transcript.

    ``missplicing`` is a dict of event type -> {position: {'delta', 'absolute'}}
    as returned by ``run_pangolin_comparison``. It is always computed with a
    detection threshold of 0.1; ``threshold`` is applied afterwards by the
    filtering helpers.

    Note: the class defines ``__eq__`` without ``__hash__``, so instances are
    unhashable.
    """

    def __init__(self, mutation, gene_data,
                 threshold=0.5, context=5000, coverage=2500):
        """
        :param mutation: mutation object forwarded to ``run_pangolin_comparison``
        :param gene_data: transcript object; must expose ``transcript_id``
        :param threshold: reporting threshold used by the filtering helpers
        :param context: forwarded as ``sai_mrg_context``
        :param coverage: forwarded as ``min_coverage``
        """
        # Local import: config_setup stores plain strings and "str / str"
        # raises TypeError, so the directory is wrapped in Path first.
        from pathlib import Path

        self.modification = mutation
        self.threshold = threshold
        self.transcript_id = gene_data.transcript_id
        self.spliceai_db = Path(config_setup['MISSPLICING_PATH']) / 'spliceai_epistatic'
        self.missplicing = run_pangolin_comparison(self.modification, transcript_data=gene_data, sai_mrg_context=context, min_coverage=coverage, sai_threshold=0.1)

    def __repr__(self):
        return f'Missplicing({self.modification.mut_id}) --> {self.missplicing}'

    def __str__(self):
        # __str__ must return a str; returning the dict itself raises TypeError.
        return str(self.aberrant_splicing)

    def __bool__(self):
        """True when at least one event type has entries after thresholding."""
        return any(details for details in self.aberrant_splicing.values())

    def __eq__(self, alt_splicing):
        """Equal when ``check_splicing_difference`` reports no difference."""
        flag, _ = check_splicing_difference(self.missplicing, alt_splicing, self.threshold)
        return not flag

    def __iter__(self):
        """Iterate over the absolute deltas of all events, followed by a 0."""
        penetrances = [abs(d_in['delta']) for d in self.missplicing.values() for d_in in d.values()] + [0]
        return iter(penetrances)

    @property
    def aberrant_splicing(self):
        """The full event dict if any event reaches ``threshold``, else ``{}``."""
        return self.apply_sai_threshold(self.missplicing, self.threshold)

    def apply_sai_threshold(self, splicing_dict=None, threshold=None):
        """All-or-nothing filter.

        :param splicing_dict: event dict to test; defaults to ``self.missplicing``
        :param threshold: minimum absolute delta; defaults to ``self.threshold``
        :return: ``splicing_dict`` unchanged if any event's absolute delta is at
            least ``threshold``, otherwise an empty dict
        """
        # `is None` so that an explicit 0 threshold or empty dict is honoured.
        if splicing_dict is None:
            splicing_dict = self.missplicing
        if threshold is None:
            threshold = self.threshold

        reached = any(abs(d['delta']) >= threshold
                      for details in splicing_dict.values()
                      for d in details.values())
        return splicing_dict if reached else {}

    def apply_sai_threshold_primary(self, splicing_dict=None, threshold=None):
        """Per-event filter.

        :param splicing_dict: event dict to filter; defaults to ``self.missplicing``
        :param threshold: minimum absolute delta; defaults to ``self.threshold``
        :return: new dict with the same event-type keys, keeping only the
            positions whose absolute delta is at least ``threshold``
        """
        if splicing_dict is None:
            splicing_dict = self.missplicing
        if threshold is None:
            threshold = self.threshold

        return {event: {pos: d for pos, d in details.items() if abs(d['delta']) >= threshold}
                for event, details in splicing_dict.items()}

    def get_max_missplicing_delta(self):
        """Return the largest absolute delta over all events (0 when there are none)."""
        return max((abs(d['delta'])
                    for details in self.missplicing.values()
                    for d in details.values()),
                   default=0)
225
-
226
-
227
def oncosplice(mut_id, sai_threshold=0.5, protein_coding=True, primary_transcript=False, per_transcript_missplicing=False, window_length=13, save_spliceai_results=False, force_spliceai=False):
    """Build one report row per predicted aberrant isoform of each affected transcript.

    :param mut_id: mutation identifier passed to ``Variations`` and ``Gene``
    :param sai_threshold: missplicing threshold used for prediction and filtering
    :param protein_coding: forwarded to ``Gene.run_transcripts``
    :param primary_transcript: forwarded to ``Gene.run_transcripts``
    :param per_transcript_missplicing: currently unused; kept for interface
        compatibility
    :param window_length: smoothing window; transcripts whose reference protein
        is shorter than this are skipped
    :param save_spliceai_results: forwarded to ``PredictSpliceAI`` as ``save_results``
    :param force_spliceai: forwarded to ``PredictSpliceAI`` as ``force``
    :return: ``pandas.DataFrame`` with one row per isoform; empty when the gene
        cannot be loaded (``FileNotFoundError``) or no transcript qualifies
    """
    mutation = Variations(mut_id)
    try:
        reference_gene = Gene(mutation.gene)
    except FileNotFoundError:
        return pd.DataFrame()

    reference_gene_proteines = {g.protein: g.transcript_id for g in reference_gene.run_transcripts()}
    mutated_gene = Gene(mutation.gene, mut_id)

    results = []
    for variant in mutated_gene.run_transcripts(protein_coding=protein_coding, primary_transcript=primary_transcript):
        reference = reference_gene.transcript(variant.transcript_id)
        if mutation not in reference or reference.protein == '' or len(reference.protein) < window_length:
            continue

        cons_vector = transform_conservation_vector(reference.cons_vector, window=window_length)
        # Sorted once per transcript; it was previously re-sorted twice for
        # every isoform inside the loop below.
        sorted_cons = np.sort(cons_vector)

        missplicing_obj = PredictSpliceAI(mutation, reference, threshold=sai_threshold, force=force_spliceai, save_results=save_spliceai_results)
        missplicing = missplicing_obj.apply_sai_threshold_primary(threshold=sai_threshold)

        for i, new_boundaries in enumerate(develop_aberrant_splicing(variant, missplicing)):
            variant_isoform = deepcopy(variant)
            variant_isoform.reset_acceptors(acceptors=new_boundaries['acceptors']).reset_donors(donors=new_boundaries['donors']).organize().generate_protein()
            alignment = get_logical_alignment(reference.protein, variant_isoform.protein)
            deleted, inserted = find_indels_with_mismatches_as_deletions(alignment.seqA, alignment.seqB)
            modified_positions = find_modified_positions(len(reference.protein), deleted, inserted)
            temp_cons = np.convolve(cons_vector * modified_positions, np.ones(window_length)) / window_length
            affected_cons_scores = max(temp_cons)

            # Fraction of conservation values strictly below the affected score.
            # searchsorted(side='left') is the index of the first value >= score
            # and yields len(sorted_cons) (percentile 1.0) when the score exceeds
            # every value, where next(...) used to raise StopIteration.
            rank = np.searchsorted(sorted_cons, affected_cons_scores, side='left')
            percentile = float(rank) / len(cons_vector)

            report = OncospliceAnnotator(reference, variant_isoform, mutation)
            report['original_cons'] = reference.cons_vector
            report['oncosplice_score'] = affected_cons_scores
            report['percentile'] = percentile
            report['modified_positions'] = modified_positions
            report['cons_vector'] = cons_vector
            report['isoform_id'] = i
            report['isoform_prevalence'] = new_boundaries['path_weight']
            report['full_missplicing'] = missplicing
            report['missplicing'] = max(missplicing_obj)
            report['reference_resemblance'] = reference_gene_proteines.get(variant_isoform.protein, None)
            results.append(report)

    return pd.DataFrame(results)
276
-
277
-