geney 1.1.14__py2.py3-none-any.whl → 1.1.15__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of geney might be problematic. Click here for more details.

geney/immune_utils.py CHANGED
@@ -2,6 +2,8 @@ import subprocess
2
2
  import logging
3
3
  import tempfile
4
4
  from geney import config_setup
5
+ import re
6
+ from io import StringIO
5
7
  import pandas as pd
6
8
 
7
9
 
@@ -37,17 +39,17 @@ class NetChop(object):
37
39
  logging.error("Error calling netChop: %s:\n%s" % (e, e.output))
38
40
  raise
39
41
  parsed = self.parse_netchop(output)
40
- return parsed
42
+ # return parsed
41
43
  #
42
- # assert len(parsed) == len(sequences), \
43
- # "Expected %d results but got %d" % (
44
- # len(sequences), len(parsed))
45
- # assert [len(x) for x in parsed] == [len(x) for x in sequences]
46
- # filtered_proteosomes = []
47
- # for scores, seq in list(zip(parsed, sequences)):
48
- # proteosome = self.chop_protein(seq, [s > threshold for s in scores])
49
- # filtered_proteosomes.append([e for e in proteosome if len(e) > min_len])
50
- # return filtered_proteosomes
44
+ assert len(parsed) == len(sequences), \
45
+ "Expected %d results but got %d" % (
46
+ len(sequences), len(parsed))
47
+ assert [len(x) for x in parsed] == [len(x) for x in sequences]
48
+ filtered_proteosomes = []
49
+ for scores, seq in list(zip(parsed, sequences)):
50
+ proteosome = self.chop_protein(seq, [s > threshold for s in scores])
51
+ filtered_proteosomes.append([e for e in proteosome if len(e) > min_len])
52
+ return filtered_proteosomes
51
53
  @staticmethod
52
54
  def parse_netchop(netchop_output):
53
55
  """
@@ -99,10 +101,6 @@ class NetChop(object):
99
101
  return pd.DataFrame(cut_sequences)
100
102
 
101
103
 
102
- import re
103
- import StringIO
104
- import pandas as pd
105
-
106
104
  def run_mhc(sequences):
107
105
  with tempfile.NamedTemporaryFile(dir='/tamir2/nicolaslynn/temp', suffix=".pep", mode="w") as input_fd:
108
106
  for (i, sequence) in enumerate(sequences):
geney/oncosplice.py CHANGED
@@ -530,8 +530,8 @@ class Transcript:
530
530
  for i, j in self.exons_pos:
531
531
  rel_start, rel_end = pre_indices_pos.index(i), pre_indices_pos.index(j)
532
532
  mature_mrna_pos += pre_seq_pos[rel_start:rel_end + 1]
533
- pre_indices_pos.extend(pre_indices_pos[rel_start:rel_end + 1])
534
- return mature_mrna_pos, pre_indices_pos
533
+ mature_indices_pos.extend(pre_indices_pos[rel_start:rel_end + 1])
534
+ return mature_mrna_pos, mature_indices_pos
535
535
 
536
536
  def generate_mature_mrna(self, inplace=True):
537
537
  if inplace:
@@ -0,0 +1,277 @@
1
+ from geney.oncosplice import *
2
+ from copy import deepcopy
3
+ import pandas as pd
4
+ import numpy as np
5
+ from geney.Fasta_segment import Fasta_segment
6
+ import torch
7
+
8
+ config_setup = { "BASE": "/tamir2/nicolaslynntest/experimental/oncosplice_mouse",
9
+ "ONCOSPLICE": "/tamir2/nicolaslynntest/experimental/oncosplice_mouse/oncosplice",
10
+ "CHROM_SOURCE": "/tamir2/nicolaslynntest/experimental/oncosplice_mouse/chromosomes",
11
+ "MRNA_PATH": "/tamir2/nicolaslynntest/experimental/oncosplice_mouse/annotations",
12
+ "MISSPLICING_PATH": "/tamir2/nicolaslynntest/experimental/oncosplice_mouse/missplicing"}
13
+
14
+
15
+ from pkg_resources import resource_filename
16
+ from pangolin.model import *
17
+
18
+
19
+ IN_MAP = np.asarray([[0, 0, 0, 0],
20
+ [1, 0, 0, 0],
21
+ [0, 1, 0, 0],
22
+ [0, 0, 1, 0],
23
+ [0, 0, 0, 1]])
24
+ INDEX_MAP = {0:1, 1:2, 2:4, 3:5, 4:7, 5:8, 6:10, 7:11}
25
+ model_nums = [1, 3, 5, 7]
26
+
27
+ models = []
28
+ for i in model_nums:
29
+ for j in range(1, 6):
30
+ model = Pangolin(L, W, AR)
31
+ if torch.cuda.is_available():
32
+ model.cuda()
33
+ weights = torch.load(resource_filename("pangolin","models/final.%s.%s.3" % (j, i)))
34
+ else:
35
+ weights = torch.load(resource_filename("pangolin","models/final.%s.%s.3" % (j, i)),
36
+ map_location=torch.device('cpu'))
37
+ model.load_state_dict(weights)
38
+ model.eval()
39
+ models.append(model)
40
+
41
+ def one_hot_encode(seq, strand='+'):
42
+ seq = seq.upper().replace('A', '1').replace('C', '2')
43
+ seq = seq.replace('G', '3').replace('T', '4').replace('N', '0')
44
+ if strand == '+':
45
+ seq = np.asarray(list(map(int, list(seq))))
46
+ elif strand == '-':
47
+ seq = np.asarray(list(map(int, list(seq[::-1]))))
48
+ seq = (5 - seq) % 5 # Reverse complement
49
+ return IN_MAP[seq.astype('int8')]
50
+
51
+
52
+ def run_pangolin_seq(seq):
53
+ seq = one_hot_encode(seq, '+').T
54
+ seq = torch.from_numpy(np.expand_dims(seq, axis=0)).float()
55
+
56
+ if torch.cuda.is_available():
57
+ seq = seq.to(torch.device("cuda"))
58
+
59
+ score = []
60
+ for j, model_num in enumerate(model_nums):
61
+ # score = []
62
+ # Average across 5 models
63
+ for model in models[5*j:5*j+5]:
64
+ with torch.no_grad():
65
+ score.append(model(seq)[0][INDEX_MAP[model_num],:].cpu().numpy())
66
+ return np.mean(score, axis=0)
67
+
68
+ # Missplicing Detection
69
+ def find_ss_changes(ref_dct, mut_dct, known_splice_sites, threshold=0.5):
70
+ '''
71
+ :param ref_dct: the spliceai probabilities for each nucleotide (by genomic position) as a dictionary for the reference sequence
72
+ :param mut_dct: the spliceai probabilities for each nucleotide (by genomic position) as a dictionary for the mutated sequence
73
+ :param known_splice_sites: the indices (by genomic position) that serve as known splice sites
74
+ :param threshold: the threshold for detection (difference between reference and mutated probabilities)
75
+ :return: two dictionaries; discovered_pos is a dictionary containing all the positions that meet the threshold for discovery
76
+ and deleted_pos is a dictionary containing all the known splice-site positions whose probability drops by at least the threshold
77
+ '''
78
+
79
+ new_dict = {v: mut_dct.get(v, 0) - ref_dct.get(v, 0) for v in
80
+ list(set(list(ref_dct.keys()) + list(mut_dct.keys())))}
81
+
82
+ discovered_pos = {k: {'delta': round(float(v), 3), 'absolute': round(float(mut_dct[k]), 3)} for k, v in
83
+ new_dict.items() if v >= threshold and k not in known_splice_sites} # if (k not in known_splice_sites and v >= threshold) or (v > 0.45)}
84
+
85
+ deleted_pos = {k: {'delta': round(float(v), 3), 'absolute': round(float(mut_dct.get(k, 0)), 3)} for k, v in
86
+ new_dict.items() if -v >= threshold and k in known_splice_sites} #if k in known_splice_sites and v <= -threshold}
87
+
88
+ return discovered_pos, deleted_pos
89
+
90
+
91
+ def run_pangolin_comparison(mutations, transcript_data, sai_mrg_context=5000, min_coverage=2500, sai_threshold=0.5):
92
+ positions = mutations.positions
93
+ end_positions = [m.start + len(m.ref) for m in mutations.variants]
94
+ positions.extend(end_positions)
95
+
96
+ seq_start_pos = min(positions) - sai_mrg_context - min_coverage
97
+ seq_end_pos = max(positions) + sai_mrg_context + min_coverage
98
+
99
+ fasta_obj = Fasta_segment()
100
+ ref_seq, ref_indices = fasta_obj.read_segment_endpoints(
101
+ config_setup['CHROM_SOURCE'] / f'chr{mutations.chrom}.fasta',
102
+ seq_start_pos,
103
+ seq_end_pos)
104
+
105
+ transcript_start, transcript_end, rev = transcript_data.transcript_lower, transcript_data.transcript_upper, transcript_data.rev
106
+
107
+ start_pad = ref_indices.index(transcript_start) if transcript_start in ref_indices else 0
108
+ end_cutoff = ref_indices.index(transcript_end) if transcript_end in ref_indices else len(ref_indices)
109
+ end_pad = len(ref_indices) - end_cutoff
110
+ ref_seq = 'N' * start_pad + ref_seq[start_pad:end_cutoff] + 'N' * end_pad
111
+ ref_indices = [-1] * start_pad + ref_indices[start_pad:end_cutoff] + [-1] * end_pad
112
+ mut_seq, mut_indices = ref_seq, ref_indices
113
+
114
+ for mut in mutations:
115
+ mut_seq, mut_indices = generate_mut_variant(seq=mut_seq, indices=mut_indices, mut=mut)
116
+
117
+ ref_indices = ref_indices[sai_mrg_context:-sai_mrg_context]
118
+ mut_indices = mut_indices[sai_mrg_context:-sai_mrg_context]
119
+
120
+ visible_donors = np.intersect1d(transcript_data.donors, ref_indices)
121
+ visible_acceptors = np.intersect1d(transcript_data.acceptors, ref_indices)
122
+
123
+ if rev:
124
+ ref_seq = reverse_complement(ref_seq)
125
+ mut_seq = reverse_complement(mut_seq)
126
+ ref_indices = ref_indices[::-1]
127
+ mut_indices = mut_indices[::-1]
128
+
129
+ ref_seq_probs = run_pangolin_seq(ref_seq)
130
+ mut_seq_probs = run_pangolin_seq(mut_seq)
131
+
132
+
133
+ assert len(ref_indices) == len(ref_seq_probs), 'Reference pos not the same'
134
+ assert len(mut_indices) == len(mut_seq_probs), 'Mut pos not the same'
135
+
136
+ iap, dap = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_probs))},
137
+ {p: v for p, v in list(zip(mut_indices, mut_seq_probs))},
138
+ visible_acceptors,
139
+ threshold=sai_threshold)
140
+
141
+
142
+ idp, ddp = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_probs))},
143
+ {p: v for p, v in list(zip(mut_indices, mut_seq_probs))},
144
+ visible_donors,
145
+ threshold=sai_threshold)
146
+
147
+
148
+ ref_acceptors = {a: b for a, b in list(zip(ref_indices, ref_seq_probs))}
149
+ ref_donors = {a: b for a, b in list(zip(ref_indices, ref_seq_probs))}
150
+
151
+ lost_acceptors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_acceptors[p]), 3)} for p in visible_acceptors if p not in mut_indices and p not in dap}
152
+ lost_donors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_donors[p]), 3)} for p in visible_donors if p not in mut_indices and p not in ddp}
153
+ dap.update(lost_acceptors)
154
+ ddp.update(lost_donors)
155
+
156
+ missplicing = {'missed_acceptors': dap, 'missed_donors': ddp, 'discovered_acceptors': iap, 'discovered_donors': idp}
157
+ missplicing = {outk: {float(k): v for k, v in outv.items()} for outk, outv in missplicing.items()}
158
+ return {outk: {int(k) if k.is_integer() else k: v for k, v in outv.items()} for outk, outv in missplicing.items()}
159
+
160
+
161
+ class PredictPangolin:
162
+ def __init__(self, mutation, gene_data,
163
+ threshold=0.5, context=5000, coverage=2500):
164
+ self.modification = mutation
165
+ self.threshold = threshold
166
+ self.transcript_id = gene_data.transcript_id
167
+ self.spliceai_db = config_setup['MISSPLICING_PATH'] / f'spliceai_epistatic'
168
+ self.missplicing = {}
169
+ self.missplicing = run_pangolin_comparison(self.modification, transcript_data=gene_data, sai_mrg_context=context, min_coverage=coverage, sai_threshold=0.1)
170
+
171
+
172
+ def __repr__(self):
173
+ return f'Missplicing({self.modification.mut_id}) --> {self.missplicing}'
174
+
175
+ def __str__(self):
176
+ return self.aberrant_splicing
177
+ def __bool__(self):
178
+ for event, details in self.aberrant_splicing.items():
179
+ if details:
180
+ return True
181
+ return False
182
+
183
+ def __eq__(self, alt_splicing):
184
+ flag, _ = check_splicing_difference(self.missplicing, alt_splicing, self.threshold)
185
+ return not flag
186
+
187
+ def __iter__(self):
188
+ penetrances = [abs(d_in['delta']) for d in self.missplicing.values() for d_in in d.values()] + [0]
189
+ return iter(penetrances)
190
+
191
+ @property
192
+ def aberrant_splicing(self):
193
+ return self.apply_sai_threshold(self.missplicing, self.threshold)
194
+
195
+ def apply_sai_threshold(self, splicing_dict=None, threshold=None):
196
+ splicing_dict = self.missplicing if not splicing_dict else splicing_dict
197
+ threshold = self.threshold if not threshold else threshold
198
+ new_dict = {}
199
+ for event, details in splicing_dict.items():
200
+ for e, d in details.items():
201
+ if abs(d['delta']) >= threshold:
202
+ return splicing_dict
203
+ return new_dict
204
+
205
+
206
+ def apply_sai_threshold_primary(self, splicing_dict=None, threshold=None):
207
+ splicing_dict = self.missplicing if not splicing_dict else splicing_dict
208
+ threshold = self.threshold if not threshold else threshold
209
+ new_dict = {}
210
+ for event, details in splicing_dict.items():
211
+ new_dict_in = {}
212
+ for e, d in details.items():
213
+ if abs(d['delta']) >= threshold:
214
+ new_dict_in[e] = d
215
+ new_dict[event] = new_dict_in
216
+ return new_dict
217
+
218
+ def get_max_missplicing_delta(self):
219
+ max_delta = 0
220
+ for event, details in self.missplicing.items():
221
+ for e, d in details.items():
222
+ if abs(d['delta']) > max_delta:
223
+ max_delta = abs(d['delta'])
224
+ return max_delta
225
+
226
+
227
+ def oncosplice(mut_id, sai_threshold=0.5, protein_coding=True, primary_transcript=False, per_transcript_missplicing=False, window_length=13, save_spliceai_results=False, force_spliceai=False):
228
+ mutation = Variations(mut_id)
229
+ try:
230
+ reference_gene = Gene(mutation.gene)
231
+ except FileNotFoundError:
232
+ return pd.DataFrame()
233
+
234
+ reference_gene_proteines = {g.protein: g.transcript_id for g in reference_gene.run_transcripts()}
235
+ mutated_gene = Gene(mutation.gene, mut_id)
236
+
237
+
238
+ results = []
239
+ for variant in mutated_gene.run_transcripts(protein_coding=protein_coding, primary_transcript=primary_transcript):
240
+ reference = reference_gene.transcript(variant.transcript_id)
241
+ if mutation not in reference or reference.protein == '' or len(reference.protein) < window_length:
242
+ continue
243
+
244
+ cons_vector = transform_conservation_vector(reference.cons_vector, window=window_length)
245
+ # if per_transcript_missplicing:
246
+ missplicing_obj = PredictSpliceAI(mutation, reference, threshold=sai_threshold, force=force_spliceai, save_results=save_spliceai_results)
247
+ missplicing = missplicing_obj.apply_sai_threshold_primary(threshold=sai_threshold)
248
+ # print(missplicing)
249
+ for i, new_boundaries in enumerate(develop_aberrant_splicing(variant, missplicing)):
250
+ variant_isoform = deepcopy(variant)
251
+ variant_isoform.reset_acceptors(acceptors=new_boundaries['acceptors']).reset_donors(donors=new_boundaries['donors']).organize().generate_protein()
252
+ alignment = get_logical_alignment(reference.protein, variant_isoform.protein)
253
+ deleted, inserted = find_indels_with_mismatches_as_deletions(alignment.seqA, alignment.seqB)
254
+ modified_positions = find_modified_positions(len(reference.protein), deleted, inserted)
255
+ temp_cons = np.convolve(cons_vector * modified_positions, np.ones(window_length)) / window_length
256
+ affected_cons_scores = max(temp_cons)
257
+ percentile = (
258
+ sorted(cons_vector).index(next(x for x in sorted(cons_vector) if x >= affected_cons_scores)) / len(
259
+ cons_vector))
260
+
261
+ report = OncospliceAnnotator(reference, variant_isoform, mutation)
262
+ report['original_cons'] = reference.cons_vector
263
+ report['oncosplice_score'] = affected_cons_scores
264
+ report['percentile'] = percentile
265
+ report['modified_positions'] = modified_positions
266
+ report['cons_vector'] = cons_vector
267
+ report['isoform_id'] = i
268
+ report['isoform_prevalence'] = new_boundaries['path_weight']
269
+ report['full_missplicing'] = missplicing
270
+ report['missplicing'] = max(missplicing_obj)
271
+ report['reference_resemblance'] = reference_gene_proteines.get(variant_isoform.protein, None)
272
+ results.append(report)
273
+
274
+ report = pd.DataFrame(results)
275
+ return report
276
+
277
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geney
3
- Version: 1.1.14
3
+ Version: 1.1.15
4
4
  Summary: A Python package for gene expression modeling.
5
5
  Home-page: https://github.com/nicolaslynn/geney
6
6
  Author: Nicolas Lynn
@@ -27,9 +27,12 @@ Requires-Dist: joblib ==1.3.2
27
27
  Requires-Dist: gtfparse ==1.3.0
28
28
  Requires-Dist: sh ==2.0.6
29
29
  Requires-Dist: termplotlib ==0.3.9
30
+ Requires-Dist: torch
30
31
  Requires-Dist: lifelines
31
32
  Requires-Dist: notebook
32
33
  Requires-Dist: matplotlib
33
34
  Requires-Dist: dask[complete]
34
35
  Requires-Dist: dask-jobqueue
36
+ Requires-Dist: gffutils
37
+ Requires-Dist: pyfastx
35
38
 
@@ -7,9 +7,10 @@ geney/config_setup.py,sha256=SePeooA4RWAtR_KAT1-W1hkD3MT5tH6YMyp80t_RNPQ,385
7
7
  geney/data_setup.py,sha256=DZeksRPr2ZT7bszMo33W0r3OwmqHokVXtZ4gx5Lu_Mo,10725
8
8
  geney/gtex.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
9
9
  geney/gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
10
- geney/immune_utils.py,sha256=elxjQyB52lYXrrt3sX6vtYlr_pTFEeCFzmEMP2qlPwA,5300
10
+ geney/immune_utils.py,sha256=ZRni5ttrhpYBnmNr0d0ZatIbNPYs4nmQuoUO00SpsS4,5271
11
11
  geney/netchop.py,sha256=AMiy9YsdTmX4B3k3Y5Yh7EmoGAojM1O3AzhPKOiB--g,3050
12
- geney/oncosplice.py,sha256=hVSsQulgER5NZtmQB59LTLl5tOWPeWcpOpHquW_Z-DM,68965
12
+ geney/oncosplice.py,sha256=vHKRq5Zkc0qhsMAe8sZKbGjjK6-Wgk_Si0EDHUU_BOY,68971
13
+ geney/oncosplice_mouse.py,sha256=LYLOukI9qI1IBkyl1qVRFR5d1NAw7Orlj8Zth-4xCW8,12962
13
14
  geney/oncosplice_pipeline.py,sha256=hpGqFHOdn8i8tvvs1-t3-G9Ko18zInwoDXBJbbrfbC4,68036
14
15
  geney/performance_utils.py,sha256=FQt7rA4r-Wuq3kceCxsSuMfj3wU1tMG8QnbL59aBohs,4700
15
16
  geney/power_utils.py,sha256=6InuDm1jSrsgR-F_LmdMTbuQwty2OdYjwfGGaAPhaRI,7268
@@ -44,7 +45,7 @@ geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFW
44
45
  geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
45
46
  geney/translation_termination/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
46
47
  geney/translation_termination/tts_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
47
- geney-1.1.14.dist-info/METADATA,sha256=zIhA9HkRpvesCUHmRo9Aml2qSmXBEYa6XBsISeWTtt0,1131
48
- geney-1.1.14.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
49
- geney-1.1.14.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
50
- geney-1.1.14.dist-info/RECORD,,
48
+ geney-1.1.15.dist-info/METADATA,sha256=DMZ8ovJT_dpSe2rmM_m2LAc9nIZsOf4VUlLE__kscfY,1199
49
+ geney-1.1.15.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
50
+ geney-1.1.15.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
51
+ geney-1.1.15.dist-info/RECORD,,
File without changes