geney 1.2.55__py2.py3-none-any.whl → 1.2.56__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of geney might be problematic. Click here for more details.

geney/oncosplice.py CHANGED
@@ -9,7 +9,6 @@ from .seqmat_utils import *
9
9
  from .mutation_utils import *
10
10
  from .tis_utils import find_tis
11
11
 
12
- ### Scoring
13
12
  def find_continuous_gaps(sequence):
14
13
  """Find continuous gap sequences in an alignment."""
15
14
  return [(m.start(), m.end()) for m in re.finditer(r'-+', sequence)]
@@ -121,43 +120,6 @@ def transform_conservation_vector(conservation_vector, window=13, factor=4):
121
120
  return exp_factors
122
121
 
123
122
 
124
- # def find_modified_positions(sequence_length, deletions, insertions, reach_limit=16):
125
- # """
126
- # Identify unmodified positions in a sequence given deletions and insertions.
127
- #
128
- # :param sequence_length: Length of the sequence.
129
- # :param deletions: Dictionary of deletions.
130
- # :param insertions: Dictionary of insertions.
131
- # :param reach_limit: Limit for considering the effect of insertions/deletions.
132
- # :return: Array indicating unmodified positions.
133
- # """
134
- # unmodified_positions = np.zeros(sequence_length, dtype=float)
135
- #
136
- # for pos, insertion in insertions.items():
137
- # # if pos >= sequence_length:
138
- # # pos = sequence_length - 1
139
- # # add_factor = 1
140
- #
141
- # reach = min(len(insertion) // 2, reach_limit)
142
- # front_end, back_end = max(0, pos - reach), min(sequence_length - 1, pos + reach)
143
- # len_start, len_end = pos - front_end, back_end - pos
144
- # try:
145
- # gradient_front = np.linspace(0, 1, len_start, endpoint=False)
146
- # gradient_back = np.linspace(0, 1, len_end, endpoint=True)[::-1]
147
- # combined_gradient = np.concatenate([gradient_front, np.array([1]), gradient_back])
148
- # unmodified_positions[front_end:back_end + 1] = combined_gradient
149
- #
150
- # except ValueError as e:
151
- # print(
152
- # f"Error: {e} | Lengths: unmodified_positions_slice={back_end - front_end}.")
153
- # unmodified_positions[front_end:back_end] = np.zeros(back_end - front_end)
154
- #
155
- # for pos, deletion in deletions.items():
156
- # deletion_length = len(deletion)
157
- # unmodified_positions[pos:pos + deletion_length] = 1
158
- #
159
- # return unmodified_positions
160
-
161
123
  def find_modified_positions(sequence_length, deletions, insertions, reach_limit=16):
162
124
  """
163
125
  Identify unmodified positions in a sequence given deletions and insertions.
@@ -251,12 +213,7 @@ def moving_average_conv(vector, window_size, factor=1):
251
213
 
252
214
  return np.convolve(vector, np.ones(window_size), mode='same') / window_size
253
215
 
254
-
255
-
256
-
257
-
258
216
  def find_splice_site_proximity(pos, transcript):
259
-
260
217
  for i, (ex_start, ex_end) in enumerate(transcript.exons):
261
218
  if min(ex_start, ex_end) <= pos <= max(ex_start, ex_end):
262
219
  return i + 1, None, abs(pos - ex_start), abs(pos - ex_end)
@@ -323,7 +280,7 @@ def summarize_missplicing_event(pes, pir, es, ne, ir):
323
280
 
324
281
  # Annotating
325
282
  def OncospliceAnnotator(reference_transcript, variant_transcript, mut):
326
- affected_exon, affected_intron, distance_from_5, distance_from_3 = find_splice_site_proximity(mut.indices[0],
283
+ affected_exon, affected_intron, distance_from_5, distance_from_3 = find_splice_site_proximity(np.floor(mut.indices[0]),
327
284
  reference_transcript)
328
285
 
329
286
  report = {}
@@ -361,59 +318,60 @@ def OncospliceAnnotator(reference_transcript, variant_transcript, mut):
361
318
  return report
362
319
 
363
320
 
364
- # def oncosplice(mut_id, splicing_threshold=0.5, protein_coding=True, cons_required=False, primary_transcript=False, window_length=13, organism='hg38', engine='spliceai', domains=None):
365
- # gene = Gene(mut_id.split(':')[0], organism=organism)
366
- # reference_gene_proteins = {tid: transcript.generate_pre_mrna().generate_mature_mrna().generate_protein() for tid, transcript in gene.run_transcripts(protein_coding=True)}
367
- # mutations = [get_mutation(m, rev=gene.rev) for m in mut_id.split('|')]
368
- #
369
- # results = []
370
- # for tid, transcript in gene.run_transcripts(protein_coding=protein_coding, primary_transcript=primary_transcript):
371
- # if cons_required and not transcript.cons_available:
372
- # continue
373
- #
374
- # if all(mutation not in transcript for mutation in mutations):
375
- # # results.append({'transcript_id': transcript.transcript_id})
376
- # continue
377
- #
378
- # transcript.generate_pre_mrna()
379
- # transcript.cons_vector = transform_conservation_vector(transcript.cons_vector, window=window_length)
380
- # transcript.generate_mature_mrna().generate_protein(inplace=True, domains=domains)
381
- # ref_protein, cons_vector = transcript.protein, transcript.cons_vector
382
- # reference_transcript = copy.deepcopy(transcript)
383
- #
384
- # assert len(ref_protein) == len(cons_vector), f"Protein ({len(ref_protein)}) and conservation vector ({len(cons_vector)}) must be same length. {ref_protein}, \n>{cons_vector}\n>{transcript.cons_seq}"
385
- #
386
- # missplicing = Missplicing(find_transcript_missplicing(transcript, mutations, engine=engine, threshold=splicing_threshold), threshold=splicing_threshold)
387
- # for mutation in mutations:
388
- # transcript.pre_mrna += mutation
389
- #
390
- # for i, new_boundaries in enumerate(develop_aberrant_splicing(transcript, missplicing.aberrant_splicing)):
391
- # transcript.acceptors = new_boundaries['acceptors']
392
- # transcript.donors = new_boundaries['donors']
393
- # transcript.generate_mature_mrna().generate_protein()
394
- #
395
- # alignment = get_logical_alignment(reference_transcript.protein, transcript.protein)
396
- # deleted, inserted = find_indels_with_mismatches_as_deletions(alignment.seqA, alignment.seqB)
397
- # modified_positions = find_modified_positions(len(ref_protein), deleted, inserted)
398
- # temp_cons = np.convolve(cons_vector * modified_positions, np.ones(window_length)) / window_length
399
- # affected_cons_scores = max(temp_cons)
400
- # percentile = (
401
- # sorted(cons_vector).index(next(x for x in sorted(cons_vector) if x >= affected_cons_scores)) / len(
402
- # cons_vector))
403
- #
404
- # report = OncospliceAnnotator(reference_transcript, transcript, mutation)
405
- # report['mut_id'] = mut_id
406
- # report['oncosplice_score'] = affected_cons_scores
407
- # report['percentile'] = percentile
408
- # report['isoform_id'] = i
409
- # report['isoform_prevalence'] = new_boundaries['path_weight']
410
- # report['full_missplicing'] = missplicing.aberrant_splicing
411
- # report['missplicing'] = max(missplicing)
412
- # report['reference_resemblance'] = reference_gene_proteins.get(transcript.protein, None)
413
- # results.append(report)
414
- #
415
- # report = pd.DataFrame(results)
416
- # return report
321
+ def oncosplice(mut_id, splicing_threshold=0.5, protein_coding=True, cons_required=False, primary_transcript=False, window_length=13, organism='hg38', engine='spliceai', domains=None):
322
+ gene = Gene(mut_id.split(':')[0], organism=organism)
323
+ reference_gene_proteins = {tid: transcript.generate_pre_mrna().generate_mature_mrna().generate_protein() for tid, transcript in gene.run_transcripts(protein_coding=True)}
324
+ mutations = [get_mutation(m, rev=gene.rev) for m in mut_id.split('|')]
325
+
326
+ results = []
327
+ for tid, transcript in gene.run_transcripts(protein_coding=protein_coding, primary_transcript=primary_transcript):
328
+ if cons_required and not transcript.cons_available:
329
+ continue
330
+
331
+ if all(mutation not in transcript for mutation in mutations):
332
+ continue
333
+
334
+ transcript.generate_pre_mrna()
335
+ transcript.cons_vector = transform_conservation_vector(transcript.cons_vector, window=window_length)
336
+ transcript.generate_mature_mrna().generate_protein(inplace=True, domains=domains)
337
+ ref_protein, cons_vector = transcript.protein, transcript.cons_vector
338
+ reference_transcript = copy.deepcopy(transcript)
339
+
340
+ assert len(ref_protein) == len(cons_vector), f"Protein ({len(ref_protein)}) and conservation vector ({len(cons_vector)}) must be same length. {ref_protein}, \n>{cons_vector}\n>{transcript.cons_seq}"
341
+
342
+ missplicing = Missplicing(find_transcript_missplicing(transcript, mutations, engine=engine, threshold=splicing_threshold), threshold=splicing_threshold)
343
+ for mutation in mutations:
344
+ transcript.pre_mrna += mutation
345
+
346
+ for i, new_boundaries in enumerate(develop_aberrant_splicing(transcript, missplicing.aberrant_splicing)):
347
+ transcript.acceptors = new_boundaries['acceptors']
348
+ transcript.donors = new_boundaries['donors']
349
+ transcript.generate_mature_mrna().generate_protein()
350
+
351
+ alignment = get_logical_alignment(reference_transcript.protein, transcript.protein)
352
+ deleted, inserted = find_indels_with_mismatches_as_deletions(alignment.seqA, alignment.seqB)
353
+ modified_positions = find_modified_positions(len(ref_protein), deleted, inserted)
354
+ temp_cons = np.convolve(cons_vector * modified_positions, np.ones(window_length)) / window_length
355
+ affected_cons_scores = max(temp_cons)
356
+ percentile = (
357
+ sorted(cons_vector).index(next(x for x in sorted(cons_vector) if x >= affected_cons_scores)) / len(
358
+ cons_vector))
359
+
360
+ report = OncospliceAnnotator(reference_transcript, transcript, mutation)
361
+ report['mut_id'] = mut_id
362
+ report['oncosplice_score'] = affected_cons_scores
363
+ report['percentile'] = percentile
364
+ report['isoform_id'] = i
365
+ report['isoform_prevalence'] = new_boundaries['path_weight']
366
+ report['full_missplicing'] = missplicing.aberrant_splicing
367
+ report['missplicing'] = max(missplicing)
368
+ report['reference_resemblance'] = reference_gene_proteins.get(transcript.protein, None)
369
+ results.append(report)
370
+
371
+ if len(results) == 0:
372
+ return None
373
+
374
+ return pd.DataFrame(results)
417
375
 
418
376
 
419
377
  import asyncio
geney/spliceai_utils.py CHANGED
@@ -12,8 +12,8 @@ if tf.config.list_physical_devices('GPU'):
12
12
  else:
13
13
  print("Running on CPU.")
14
14
 
15
- # tf.config.threading.set_intra_op_parallelism_threads(1)
16
- # tf.config.threading.set_inter_op_parallelism_threads(1)
15
+ tf.config.threading.set_intra_op_parallelism_threads(1)
16
+ tf.config.threading.set_inter_op_parallelism_threads(1)
17
17
 
18
18
  sai_paths = ('models/spliceai{}.h5'.format(x) for x in range(1, 6))
19
19
  sai_models = [load_model(resource_filename('spliceai', x)) for x in sai_paths]
geney/tis_utils.py CHANGED
@@ -28,26 +28,13 @@ def find_tis(ref_seq, mut_seq, left_context=100, right_context=102):
28
28
  right_context=right_context,
29
29
  padding='$')
30
30
 
31
- # 3. If condition 2 is not met, we perform a TIS reaquisition. If condition 2 is met, then we return the reference TIS to be used in the mutated sequence
32
31
  if context_conserved:
33
- return tis_coords[0]
34
-
35
- # 4. Reaquisition of TIS follows:
36
- #### The logic:
37
- # a. We need to find all possible start codon candidates as relative indices
38
- # b. We need to find what proteins each alternative start codon would create
39
- # c. We need to make sure we are only looking at a region around a mutation
40
- # d. We need the titer score rank relative to all titer score reference ranks and relative to the reference score
32
+ return [(tis_coords[0], 1, 'canonical')]
41
33
 
42
34
  sc_table = pd.read_pickle(config['titer_path'] / 'titer_tis_scores.pickle')
43
- # target_transcript = sc_table[sc_table.transcript_id == ref_id]
44
- # if len(target_transcript) == 0:
45
- ### reaquire TIS score for ref
46
- # pass
47
-
48
35
  ref_seq_tis_context = ref_seq.asymmetric_subseq(tis_coords[0], left_context=left_context,
49
36
  right_context=right_context, padding='$')
50
- # target_ref_titer_score = target_transcript.tis_score
37
+
51
38
  ref_titer_score = retrieve_titer_score(ref_seq_tis_context)
52
39
  ref_titer_rank = percentileofscore(sc_table['tis_score'], ref_titer_score)
53
40
  ref_protein = ref_seq.translate(tis_coords[0])
@@ -56,7 +43,8 @@ def find_tis(ref_seq, mut_seq, left_context=100, right_context=102):
56
43
  candidate_positions = np.array(
57
44
  [p.align(ref_protein, mut_seq.translate(mut_seq.seqmat[1, i])).score if candidate_positions[i] == True else 0
58
45
  for i in range(len(ref_seq.seq))])
59
- candidate_positions = candidate_positions > sorted(candidate_positions)[-5]
46
+
47
+ candidate_positions = candidate_positions > sorted(candidate_positions)[-5] # implement correct logic
60
48
  candidate_positions = np.array([retrieve_titer_score(
61
49
  mut_seq.asymmetric_subseq(tis_coords[0], left_context=left_context, right_context=right_context,
62
50
  padding='$')) if candidate_positions[i] > 0 else False for i in
@@ -66,7 +54,7 @@ def find_tis(ref_seq, mut_seq, left_context=100, right_context=102):
66
54
  in range(len(ref_seq.seq))])
67
55
  best_position = np.where(candidate_positions == min(candidate_positions))[0][0]
68
56
  out = mut_seq.seqmat[1, best_position]
69
- return out
57
+ return out #output: [(genomic_coord1, probability, filter_tag), (genomic_coord2, probability, filter_tag)]
70
58
 
71
59
 
72
60
  def seq_matrix(seq_list):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geney
3
- Version: 1.2.55
3
+ Version: 1.2.56
4
4
  Summary: A Python package for gene expression modeling.
5
5
  Home-page: https://github.com/nicolaslynn/geney
6
6
  Author: Nicolas Lynn
@@ -6,21 +6,21 @@ geney/graphic_utils.py,sha256=oMsBpB9YeEn96gGpKh4MmtagJffWZbk-xPrIwHvkFhA,11016
6
6
  geney/gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
7
7
  geney/immune_utils.py,sha256=ZRni5ttrhpYBnmNr0d0ZatIbNPYs4nmQuoUO00SpsS4,5271
8
8
  geney/mutation_utils.py,sha256=C_kv2MB_L8LlhX3W2ooXjJ3uDoJ8zX1WeDtZKoBZJkI,1547
9
- geney/oncosplice.py,sha256=-_b0ZSxWa-bSYDoVMt605lJlx8-rXf0WsKsFrMoF6Vg,23707
9
+ geney/oncosplice.py,sha256=eWgY2Lcj894UBFnIVhbxiVz5oqASHg-Ot1wFbjlJbI8,21857
10
10
  geney/pangolin_utils.py,sha256=NJEdY43L_2lielY1hZOjlak0baHqXTa1ITrvx8Tkg5o,2878
11
11
  geney/power_utils.py,sha256=MehZFUdkJ2EFUot709yPEDxSkXmH5XevMebX2HD768A,7330
12
12
  geney/seqmat_utils.py,sha256=2cRXT_Ox4IdzCM8x3H2HexxFZzjo5WHs0HZiUQv8fBM,18347
13
- geney/spliceai_utils.py,sha256=gIGPC8u3J15A7EQrk2Elho5PbF9MmUUNopGGH-eEV8s,1873
13
+ geney/spliceai_utils.py,sha256=21_TaiLW3faRuPegMgsVvIf1G1a03penZSiydQ-hOTA,1869
14
14
  geney/splicing_utils.py,sha256=t0vE5KTAdYOYJLa9wjaSJ1jqiHhsDxZs64OxrgR-Sqc,16811
15
15
  geney/survival_utils.py,sha256=KnAzEviMuXh6SnVXId9PgsFLSbgkduTvYoIthxN7FPA,6886
16
16
  geney/tcga_utils.py,sha256=D_BNHm-D_K408dlcJm3hzH2c6QNFjQsKvUcOPiQRk7g,17612
17
- geney/tis_utils.py,sha256=vA2ci4gNfwwQZlCjPpO5ehvL2NRVeM7lHI_VyfT-_10,8049
17
+ geney/tis_utils.py,sha256=2makfGfVlDFVIbxzXE85AY9jmAjcNmxyIAxjvkRA5LY,7396
18
18
  geney/utils.py,sha256=EsKvBM-Nz2a3_4ZAhF4Dxd4PwT7_6YYKpxEN4LLgg10,2174
19
19
  geney/translation_initiation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
20
  geney/translation_initiation/tis_utils.py,sha256=AF3siFjuQH-Rs44EV-80zHdbxRMvN4woLFSHroWIETc,4448
21
21
  geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
22
22
  geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
23
- geney-1.2.55.dist-info/METADATA,sha256=bMKlTktE8jhYNpbxWMnp6Z168gk4NafThjukv45vYI4,948
24
- geney-1.2.55.dist-info/WHEEL,sha256=fS9sRbCBHs7VFcwJLnLXN1MZRR0_TVTxvXKzOnaSFs8,110
25
- geney-1.2.55.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
26
- geney-1.2.55.dist-info/RECORD,,
23
+ geney-1.2.56.dist-info/METADATA,sha256=tHCFJyD9OKjk7GnQToKesLQZyzy0dtO9oBsr0Bjz6rI,948
24
+ geney-1.2.56.dist-info/WHEEL,sha256=fS9sRbCBHs7VFcwJLnLXN1MZRR0_TVTxvXKzOnaSFs8,110
25
+ geney-1.2.56.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
26
+ geney-1.2.56.dist-info/RECORD,,
File without changes