geney 1.2.55__py2.py3-none-any.whl → 1.2.57__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
geney/oncosplice.py CHANGED
@@ -9,7 +9,6 @@ from .seqmat_utils import *
9
9
  from .mutation_utils import *
10
10
  from .tis_utils import find_tis
11
11
 
12
- ### Scoring
13
12
  def find_continuous_gaps(sequence):
14
13
  """Find continuous gap sequences in an alignment."""
15
14
  return [(m.start(), m.end()) for m in re.finditer(r'-+', sequence)]
@@ -121,43 +120,6 @@ def transform_conservation_vector(conservation_vector, window=13, factor=4):
121
120
  return exp_factors
122
121
 
123
122
 
124
- # def find_modified_positions(sequence_length, deletions, insertions, reach_limit=16):
125
- # """
126
- # Identify unmodified positions in a sequence given deletions and insertions.
127
- #
128
- # :param sequence_length: Length of the sequence.
129
- # :param deletions: Dictionary of deletions.
130
- # :param insertions: Dictionary of insertions.
131
- # :param reach_limit: Limit for considering the effect of insertions/deletions.
132
- # :return: Array indicating unmodified positions.
133
- # """
134
- # unmodified_positions = np.zeros(sequence_length, dtype=float)
135
- #
136
- # for pos, insertion in insertions.items():
137
- # # if pos >= sequence_length:
138
- # # pos = sequence_length - 1
139
- # # add_factor = 1
140
- #
141
- # reach = min(len(insertion) // 2, reach_limit)
142
- # front_end, back_end = max(0, pos - reach), min(sequence_length - 1, pos + reach)
143
- # len_start, len_end = pos - front_end, back_end - pos
144
- # try:
145
- # gradient_front = np.linspace(0, 1, len_start, endpoint=False)
146
- # gradient_back = np.linspace(0, 1, len_end, endpoint=True)[::-1]
147
- # combined_gradient = np.concatenate([gradient_front, np.array([1]), gradient_back])
148
- # unmodified_positions[front_end:back_end + 1] = combined_gradient
149
- #
150
- # except ValueError as e:
151
- # print(
152
- # f"Error: {e} | Lengths: unmodified_positions_slice={back_end - front_end}.")
153
- # unmodified_positions[front_end:back_end] = np.zeros(back_end - front_end)
154
- #
155
- # for pos, deletion in deletions.items():
156
- # deletion_length = len(deletion)
157
- # unmodified_positions[pos:pos + deletion_length] = 1
158
- #
159
- # return unmodified_positions
160
-
161
123
  def find_modified_positions(sequence_length, deletions, insertions, reach_limit=16):
162
124
  """
163
125
  Identify unmodified positions in a sequence given deletions and insertions.
@@ -251,12 +213,7 @@ def moving_average_conv(vector, window_size, factor=1):
251
213
 
252
214
  return np.convolve(vector, np.ones(window_size), mode='same') / window_size
253
215
 
254
-
255
-
256
-
257
-
258
216
  def find_splice_site_proximity(pos, transcript):
259
-
260
217
  for i, (ex_start, ex_end) in enumerate(transcript.exons):
261
218
  if min(ex_start, ex_end) <= pos <= max(ex_start, ex_end):
262
219
  return i + 1, None, abs(pos - ex_start), abs(pos - ex_end)
@@ -323,7 +280,7 @@ def summarize_missplicing_event(pes, pir, es, ne, ir):
323
280
 
324
281
  # Annotating
325
282
  def OncospliceAnnotator(reference_transcript, variant_transcript, mut):
326
- affected_exon, affected_intron, distance_from_5, distance_from_3 = find_splice_site_proximity(mut.indices[0],
283
+ affected_exon, affected_intron, distance_from_5, distance_from_3 = find_splice_site_proximity(np.floor(mut.indices[0]),
327
284
  reference_transcript)
328
285
 
329
286
  report = {}
@@ -361,59 +318,60 @@ def OncospliceAnnotator(reference_transcript, variant_transcript, mut):
361
318
  return report
362
319
 
363
320
 
364
- # def oncosplice(mut_id, splicing_threshold=0.5, protein_coding=True, cons_required=False, primary_transcript=False, window_length=13, organism='hg38', engine='spliceai', domains=None):
365
- # gene = Gene(mut_id.split(':')[0], organism=organism)
366
- # reference_gene_proteins = {tid: transcript.generate_pre_mrna().generate_mature_mrna().generate_protein() for tid, transcript in gene.run_transcripts(protein_coding=True)}
367
- # mutations = [get_mutation(m, rev=gene.rev) for m in mut_id.split('|')]
368
- #
369
- # results = []
370
- # for tid, transcript in gene.run_transcripts(protein_coding=protein_coding, primary_transcript=primary_transcript):
371
- # if cons_required and not transcript.cons_available:
372
- # continue
373
- #
374
- # if all(mutation not in transcript for mutation in mutations):
375
- # # results.append({'transcript_id': transcript.transcript_id})
376
- # continue
377
- #
378
- # transcript.generate_pre_mrna()
379
- # transcript.cons_vector = transform_conservation_vector(transcript.cons_vector, window=window_length)
380
- # transcript.generate_mature_mrna().generate_protein(inplace=True, domains=domains)
381
- # ref_protein, cons_vector = transcript.protein, transcript.cons_vector
382
- # reference_transcript = copy.deepcopy(transcript)
383
- #
384
- # assert len(ref_protein) == len(cons_vector), f"Protein ({len(ref_protein)}) and conservation vector ({len(cons_vector)}) must be same length. {ref_protein}, \n>{cons_vector}\n>{transcript.cons_seq}"
385
- #
386
- # missplicing = Missplicing(find_transcript_missplicing(transcript, mutations, engine=engine, threshold=splicing_threshold), threshold=splicing_threshold)
387
- # for mutation in mutations:
388
- # transcript.pre_mrna += mutation
389
- #
390
- # for i, new_boundaries in enumerate(develop_aberrant_splicing(transcript, missplicing.aberrant_splicing)):
391
- # transcript.acceptors = new_boundaries['acceptors']
392
- # transcript.donors = new_boundaries['donors']
393
- # transcript.generate_mature_mrna().generate_protein()
394
- #
395
- # alignment = get_logical_alignment(reference_transcript.protein, transcript.protein)
396
- # deleted, inserted = find_indels_with_mismatches_as_deletions(alignment.seqA, alignment.seqB)
397
- # modified_positions = find_modified_positions(len(ref_protein), deleted, inserted)
398
- # temp_cons = np.convolve(cons_vector * modified_positions, np.ones(window_length)) / window_length
399
- # affected_cons_scores = max(temp_cons)
400
- # percentile = (
401
- # sorted(cons_vector).index(next(x for x in sorted(cons_vector) if x >= affected_cons_scores)) / len(
402
- # cons_vector))
403
- #
404
- # report = OncospliceAnnotator(reference_transcript, transcript, mutation)
405
- # report['mut_id'] = mut_id
406
- # report['oncosplice_score'] = affected_cons_scores
407
- # report['percentile'] = percentile
408
- # report['isoform_id'] = i
409
- # report['isoform_prevalence'] = new_boundaries['path_weight']
410
- # report['full_missplicing'] = missplicing.aberrant_splicing
411
- # report['missplicing'] = max(missplicing)
412
- # report['reference_resemblance'] = reference_gene_proteins.get(transcript.protein, None)
413
- # results.append(report)
414
- #
415
- # report = pd.DataFrame(results)
416
- # return report
321
+ def oncosplice(mut_id, splicing_threshold=0.5, protein_coding=True, cons_required=False, primary_transcript=False, window_length=13, organism='hg38', engine='spliceai', domains=None):
322
+ gene = Gene(mut_id.split(':')[0], organism=organism)
323
+ reference_gene_proteins = {tid: transcript.generate_pre_mrna().generate_mature_mrna().generate_protein() for tid, transcript in gene.run_transcripts(protein_coding=True)}
324
+ mutations = [get_mutation(m, rev=gene.rev) for m in mut_id.split('|')]
325
+
326
+ results = []
327
+ for tid, transcript in gene.run_transcripts(protein_coding=protein_coding, primary_transcript=primary_transcript):
328
+ if cons_required and not transcript.cons_available:
329
+ continue
330
+
331
+ if all(mutation not in transcript for mutation in mutations):
332
+ continue
333
+
334
+ transcript.generate_pre_mrna()
335
+ transcript.cons_vector = transform_conservation_vector(transcript.cons_vector, window=window_length)
336
+ transcript.generate_mature_mrna().generate_protein(inplace=True, domains=domains)
337
+ ref_protein, cons_vector = transcript.protein, transcript.cons_vector
338
+ reference_transcript = copy.deepcopy(transcript)
339
+
340
+ assert len(ref_protein) == len(cons_vector), f"Protein ({len(ref_protein)}) and conservation vector ({len(cons_vector)}) must be same length. {ref_protein}, \n>{cons_vector}\n>{transcript.cons_seq}"
341
+
342
+ missplicing = Missplicing(find_transcript_missplicing(transcript, mutations, engine=engine, threshold=splicing_threshold), threshold=splicing_threshold)
343
+ for mutation in mutations:
344
+ transcript.pre_mrna += mutation
345
+
346
+ for i, new_boundaries in enumerate(develop_aberrant_splicing(transcript, missplicing.aberrant_splicing)):
347
+ transcript.acceptors = new_boundaries['acceptors']
348
+ transcript.donors = new_boundaries['donors']
349
+ transcript.generate_mature_mrna().generate_protein()
350
+
351
+ alignment = get_logical_alignment(reference_transcript.protein, transcript.protein)
352
+ deleted, inserted = find_indels_with_mismatches_as_deletions(alignment.seqA, alignment.seqB)
353
+ modified_positions = find_modified_positions(len(ref_protein), deleted, inserted)
354
+ temp_cons = np.convolve(cons_vector * modified_positions, np.ones(window_length)) / window_length
355
+ affected_cons_scores = max(temp_cons)
356
+ percentile = (
357
+ sorted(cons_vector).index(next(x for x in sorted(cons_vector) if x >= affected_cons_scores)) / len(
358
+ cons_vector))
359
+
360
+ report = OncospliceAnnotator(reference_transcript, transcript, mutation)
361
+ report['mut_id'] = mut_id
362
+ report['oncosplice_score'] = affected_cons_scores
363
+ report['percentile'] = percentile
364
+ report['isoform_id'] = i
365
+ report['isoform_prevalence'] = new_boundaries['path_weight']
366
+ report['full_missplicing'] = missplicing.aberrant_splicing
367
+ report['missplicing'] = max(missplicing)
368
+ report['reference_resemblance'] = reference_gene_proteins.get(transcript.protein, None)
369
+ results.append(report)
370
+
371
+ if len(results) == 0:
372
+ return None
373
+
374
+ return pd.DataFrame(results)
417
375
 
418
376
 
419
377
  import asyncio
geney/pangolin_utils.py CHANGED
@@ -46,10 +46,12 @@ def pang_one_hot_encode(seq):
46
46
 
47
47
 
48
48
 
49
- def pangolin_predict_probs(true_seq, models):
49
+ def pangolin_predict_probs(true_seq, models, just_ss=False):
50
50
  # print(f"Running pangolin on: {true_seq}")
51
- model_nums = [0, 2, 4, 6]
52
- model_nums = [0, 1, 2, 3, 4, 5, 6]
51
+ if just_ss:
52
+ model_nums = [0, 2, 4, 6]
53
+ else:
54
+ model_nums = [0, 1, 2, 3, 4, 5, 6, 7]
53
55
  INDEX_MAP = {0: 1, 1: 2, 2: 4, 3: 5, 4: 7, 5: 8, 6: 10, 7: 11}
54
56
 
55
57
  seq = true_seq
geney/spliceai_utils.py CHANGED
@@ -12,8 +12,8 @@ if tf.config.list_physical_devices('GPU'):
12
12
  else:
13
13
  print("Running on CPU.")
14
14
 
15
- # tf.config.threading.set_intra_op_parallelism_threads(1)
16
- # tf.config.threading.set_inter_op_parallelism_threads(1)
15
+ tf.config.threading.set_intra_op_parallelism_threads(1)
16
+ tf.config.threading.set_inter_op_parallelism_threads(1)
17
17
 
18
18
  sai_paths = ('models/spliceai{}.h5'.format(x) for x in range(1, 6))
19
19
  sai_models = [load_model(resource_filename('spliceai', x)) for x in sai_paths]
geney/splicing_utils.py CHANGED
@@ -145,7 +145,7 @@ def find_ss_changes(ref_dct, mut_dct, known_splice_sites, threshold=0.5):
145
145
  return discovered_pos, deleted_pos
146
146
 
147
147
 
148
- def find_transcript_missplicing(transcript, mutations, context=5000, window=2500, threshold=0.5, engine='spliceai'):
148
+ def find_transcript_missplicing(transcript, mutations, context=5000, window=2500, threshold=0.5, engine='spliceai', just_ss=False):
149
149
  from functools import reduce
150
150
  ref = transcript.pre_mrna
151
151
  var = reduce(lambda acc, mutation: acc + mutation, mutations, ref)
@@ -182,8 +182,8 @@ def find_transcript_missplicing(transcript, mutations, context=5000, window=2500
182
182
 
183
183
  elif engine == 'pangolin':
184
184
  from .pangolin_utils import pangolin_predict_probs, pang_models
185
- ref_seq_donor_probs, ref_seq_acceptor_probs = pangolin_predict_probs(ref_seq, models=pang_models)
186
- mut_seq_donor_probs, mut_seq_acceptor_probs = pangolin_predict_probs(var_seq, models=pang_models)
185
+ ref_seq_donor_probs, ref_seq_acceptor_probs = pangolin_predict_probs(ref_seq, models=pang_models, just_ss=just_ss)
186
+ mut_seq_donor_probs, mut_seq_acceptor_probs = pangolin_predict_probs(var_seq, models=pang_models, just_ss=just_ss)
187
187
 
188
188
  else:
189
189
  raise ValueError(f"{engine} not implemented")
geney/tis_utils.py CHANGED
@@ -28,26 +28,13 @@ def find_tis(ref_seq, mut_seq, left_context=100, right_context=102):
28
28
  right_context=right_context,
29
29
  padding='$')
30
30
 
31
- # 3. If condition 2 is not met, we perform a TIS reaquisition. If condition 2 is met, then we return the reference TIS to be used in the mutated sequence
32
31
  if context_conserved:
33
- return tis_coords[0]
34
-
35
- # 4. Reaquisition of TIS follows:
36
- #### The logic:
37
- # a. We need to find all possible start codon candidates as relative indices
38
- # b. We need to find what proteins each alternative start codon would create
39
- # c. We need to make sure we are only looking at a region around a mutation
40
- # d. We need the titer score rank relative to all titer score reference ranks and relative to the reference score
32
+ return [(tis_coords[0], 1, 'canonical')]
41
33
 
42
34
  sc_table = pd.read_pickle(config['titer_path'] / 'titer_tis_scores.pickle')
43
- # target_transcript = sc_table[sc_table.transcript_id == ref_id]
44
- # if len(target_transcript) == 0:
45
- ### reaquire TIS score for ref
46
- # pass
47
-
48
35
  ref_seq_tis_context = ref_seq.asymmetric_subseq(tis_coords[0], left_context=left_context,
49
36
  right_context=right_context, padding='$')
50
- # target_ref_titer_score = target_transcript.tis_score
37
+
51
38
  ref_titer_score = retrieve_titer_score(ref_seq_tis_context)
52
39
  ref_titer_rank = percentileofscore(sc_table['tis_score'], ref_titer_score)
53
40
  ref_protein = ref_seq.translate(tis_coords[0])
@@ -56,7 +43,8 @@ def find_tis(ref_seq, mut_seq, left_context=100, right_context=102):
56
43
  candidate_positions = np.array(
57
44
  [p.align(ref_protein, mut_seq.translate(mut_seq.seqmat[1, i])).score if candidate_positions[i] == True else 0
58
45
  for i in range(len(ref_seq.seq))])
59
- candidate_positions = candidate_positions > sorted(candidate_positions)[-5]
46
+
47
+ candidate_positions = candidate_positions > sorted(candidate_positions)[-5] # implement correct logic
60
48
  candidate_positions = np.array([retrieve_titer_score(
61
49
  mut_seq.asymmetric_subseq(tis_coords[0], left_context=left_context, right_context=right_context,
62
50
  padding='$')) if candidate_positions[i] > 0 else False for i in
@@ -66,7 +54,7 @@ def find_tis(ref_seq, mut_seq, left_context=100, right_context=102):
66
54
  in range(len(ref_seq.seq))])
67
55
  best_position = np.where(candidate_positions == min(candidate_positions))[0][0]
68
56
  out = mut_seq.seqmat[1, best_position]
69
- return out
57
+ return out #output: [(genomic_coord1, probability, filter_tag), (genomic_coord2, probability, filter_tag)]
70
58
 
71
59
 
72
60
  def seq_matrix(seq_list):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geney
3
- Version: 1.2.55
3
+ Version: 1.2.57
4
4
  Summary: A Python package for gene expression modeling.
5
5
  Home-page: https://github.com/nicolaslynn/geney
6
6
  Author: Nicolas Lynn
@@ -6,21 +6,21 @@ geney/graphic_utils.py,sha256=oMsBpB9YeEn96gGpKh4MmtagJffWZbk-xPrIwHvkFhA,11016
6
6
  geney/gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
7
7
  geney/immune_utils.py,sha256=ZRni5ttrhpYBnmNr0d0ZatIbNPYs4nmQuoUO00SpsS4,5271
8
8
  geney/mutation_utils.py,sha256=C_kv2MB_L8LlhX3W2ooXjJ3uDoJ8zX1WeDtZKoBZJkI,1547
9
- geney/oncosplice.py,sha256=-_b0ZSxWa-bSYDoVMt605lJlx8-rXf0WsKsFrMoF6Vg,23707
10
- geney/pangolin_utils.py,sha256=NJEdY43L_2lielY1hZOjlak0baHqXTa1ITrvx8Tkg5o,2878
9
+ geney/oncosplice.py,sha256=eWgY2Lcj894UBFnIVhbxiVz5oqASHg-Ot1wFbjlJbI8,21857
10
+ geney/pangolin_utils.py,sha256=HvXfdLhHWTDXNmYtc8K3p64iTvDtsBq6-Jml5tpg7JI,2930
11
11
  geney/power_utils.py,sha256=MehZFUdkJ2EFUot709yPEDxSkXmH5XevMebX2HD768A,7330
12
12
  geney/seqmat_utils.py,sha256=2cRXT_Ox4IdzCM8x3H2HexxFZzjo5WHs0HZiUQv8fBM,18347
13
- geney/spliceai_utils.py,sha256=gIGPC8u3J15A7EQrk2Elho5PbF9MmUUNopGGH-eEV8s,1873
14
- geney/splicing_utils.py,sha256=t0vE5KTAdYOYJLa9wjaSJ1jqiHhsDxZs64OxrgR-Sqc,16811
13
+ geney/spliceai_utils.py,sha256=21_TaiLW3faRuPegMgsVvIf1G1a03penZSiydQ-hOTA,1869
14
+ geney/splicing_utils.py,sha256=34xdarFpTHsHZkhi7VrHby9DaIBZ2xCLqPMrTmasEgE,16860
15
15
  geney/survival_utils.py,sha256=KnAzEviMuXh6SnVXId9PgsFLSbgkduTvYoIthxN7FPA,6886
16
16
  geney/tcga_utils.py,sha256=D_BNHm-D_K408dlcJm3hzH2c6QNFjQsKvUcOPiQRk7g,17612
17
- geney/tis_utils.py,sha256=vA2ci4gNfwwQZlCjPpO5ehvL2NRVeM7lHI_VyfT-_10,8049
17
+ geney/tis_utils.py,sha256=2makfGfVlDFVIbxzXE85AY9jmAjcNmxyIAxjvkRA5LY,7396
18
18
  geney/utils.py,sha256=EsKvBM-Nz2a3_4ZAhF4Dxd4PwT7_6YYKpxEN4LLgg10,2174
19
19
  geney/translation_initiation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
20
  geney/translation_initiation/tis_utils.py,sha256=AF3siFjuQH-Rs44EV-80zHdbxRMvN4woLFSHroWIETc,4448
21
21
  geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
22
22
  geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
23
- geney-1.2.55.dist-info/METADATA,sha256=bMKlTktE8jhYNpbxWMnp6Z168gk4NafThjukv45vYI4,948
24
- geney-1.2.55.dist-info/WHEEL,sha256=fS9sRbCBHs7VFcwJLnLXN1MZRR0_TVTxvXKzOnaSFs8,110
25
- geney-1.2.55.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
26
- geney-1.2.55.dist-info/RECORD,,
23
+ geney-1.2.57.dist-info/METADATA,sha256=UFirGNGhFN_aJnqSO8WHagJCmEfKoHdfRZojLfKymsE,948
24
+ geney-1.2.57.dist-info/WHEEL,sha256=fS9sRbCBHs7VFcwJLnLXN1MZRR0_TVTxvXKzOnaSFs8,110
25
+ geney-1.2.57.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
26
+ geney-1.2.57.dist-info/RECORD,,
File without changes