geney 1.2.32__py2.py3-none-any.whl → 1.2.34__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of geney might be problematic. Click here for more details.

geney/config_setup.py CHANGED
@@ -6,7 +6,8 @@ def get_config():
6
6
  config_file = os.path.join(os.path.expanduser('~'), '.oncosplice_setup_1_2', 'config.json')
7
7
  if Path(config_file).exists():
8
8
  config_setup = {k: {k_in: Path(p_in) for k_in, p_in in p.items()} for k, p in json.loads(open(config_file).read()).items()}
9
-
9
+ config_setup['hg38']['titer_path'] = Path('/tamir2/nicolaslynn/tools/titer')
10
+ config_setup['hg38']['yoram_path'] = Path('/tamir2/yoramzar/Projects/Cancer_mut/Utils')
10
11
  else:
11
12
  print("Database not set up.")
12
13
  config_setup = {}
geney/oncosplice.py CHANGED
@@ -7,7 +7,7 @@ import pandas as pd
7
7
  from .splicing_utils import find_transcript_missplicing, develop_aberrant_splicing, Missplicing
8
8
  from .seqmat_utils import *
9
9
  from .mutation_utils import *
10
-
10
+ from .tis_utils import find_tis
11
11
 
12
12
  ### Scoring
13
13
  def find_continuous_gaps(sequence):
@@ -416,58 +416,69 @@ def oncosplice(mut_id, splicing_threshold=0.5, protein_coding=True, primary_tran
416
416
 
417
417
 
418
418
  import asyncio
419
- async def oncosplice_prototype(mut_id, splicing_threshold=0.5, protein_coding=True, primary_transcript=False, window_length=13, organism='hg38', engine='spliceai'):
419
+
420
+
421
+ async def oncosplice_prototype(mut_id, splicing_threshold=0.5, protein_coding=True, primary_transcript=False,
422
+ window_length=13, organism='hg38', engine='spliceai'):
420
423
  import sys, os
421
- from pathlib import Path
422
- needed_path = Path('/tamir2/yoramzar/Projects/Cancer_mut/Utils')
423
- needed_file1 = needed_path / 'rest_api_utils.py'
424
- needed_file2 = needed_path / 'uniprot_utils.py'
424
+ needed_file1 = config[organism]['yoram_path'] / 'rest_api_utils.py'
425
+ needed_file2 = config[organism]['yoram_path'] / 'uniprot_utils.py'
425
426
 
426
- if sys.platform == 'linux' and (needed_file1.is_file() and os.access(needed_file1, os.X_OK)) and (needed_file2.is_file() and os.access(needed_file2, os.X_OK)):
427
- sys.path.append(str(needed_path))
427
+ if sys.platform == 'linux' and (needed_file1.is_file() and os.access(needed_file1, os.R_OK)) and (
428
+ needed_file2.is_file() and os.access(needed_file2, os.R_OK)):
429
+ sys.path.append(str(config[organism]['yoram_path']))
428
430
  import uniprot_utils as uput
429
431
 
430
432
  else:
431
- raise SystemError("Oncosplice Prototype can only be run on Power with access to the /tamir2/yoramzar/Projects/Cancer_mut/Utils folder.")
433
+ raise SystemError(
434
+ "Oncosplice Prototype can only be run on Power with access to the /tamir2/yoramzar/Projects/Cancer_mut/Utils folder.")
435
+
436
+ from .tis_utils import find_tis
432
437
 
433
438
  # Define async functions
434
439
  async def background_request(ensb_id, Uniprot_features=["Topological domain", "Transmembrane", "Domain"]):
435
440
  return uput.retrieve_protein_data_features_subset(uput.ensembl_id2uniprot_id(ensb_id), Uniprot_features)
436
441
 
442
+ def inspect_domain(row, modified_vector, conservation_vector):
443
+ v1, v2 = modified_vector[row.start:row.end], conservation_vector[row.start:row.end]
444
+ return pd.Series([f'{row.type}|{row.start}|{row.end}|{row.description}', sum(v1 * v2) / sum(v2)],
445
+ index=['domain_identifier', 'score'])
437
446
 
438
447
  gene = Gene(mut_id.split(':')[0], organism=organism)
439
- # request_thread = threading.Thread(target=background_request, args=(gene.transcript_ids, domains))
440
- # request_thread.start()
441
-
442
- mutation = get_mutation(mut_id, rev=gene.rev)
448
+ mutations = [get_mutation(mut_id, rev=gene.rev) for mut_id in mut_id.split('|')]
443
449
 
444
450
  results = []
445
451
  for tid, transcript in gene.run_transcripts(protein_coding=protein_coding, primary_transcript=primary_transcript):
446
452
  if not transcript.cons_available:
447
453
  continue
448
454
 
449
- if mutation not in transcript:
455
+ if all(mutation not in transcript for mutation in mutations):
450
456
  results.append({'transcript_id': transcript.transcript_id})
451
457
  continue
452
458
 
453
- task1 = asyncio.create_task(background_request(transcript.transcript_id))
459
+ task1 = asyncio.create_task(background_request(tid))
454
460
 
455
461
  transcript.generate_pre_mrna()
456
462
  transcript.cons_vector = transform_conservation_vector(transcript.cons_vector, window=window_length)
457
- transcript.generate_mature_mrna().generate_protein(inplace=True, domains=domains)
463
+ transcript.generate_mature_mrna().generate_protein(inplace=True)
458
464
  ref_protein, cons_vector = transcript.protein, transcript.cons_vector
459
465
  reference_transcript = copy.deepcopy(transcript)
460
466
 
461
- assert len(ref_protein) == len(cons_vector), f"Protein ({len(ref_protein)}) and conservation vector ({len(cons_vector)} must be same length."
467
+ assert len(ref_protein) == len(
468
+ cons_vector), f"Protein ({len(ref_protein)}) and conservation vector ({len(cons_vector)} must be same length."
462
469
 
463
- missplicing = Missplicing(find_transcript_missplicing(transcript, mutation, engine=engine), threshold=splicing_threshold)
464
- transcript.pre_mrna += mutation
465
- result1 = await task1
466
- print(result1)
470
+ missplicing = Missplicing(find_transcript_missplicing(transcript, mutations, engine=engine),
471
+ threshold=splicing_threshold)
472
+ for mutation in mutations:
473
+ transcript.pre_mrna += mutation
474
+
475
+ domains_df = await task1
467
476
  for i, new_boundaries in enumerate(develop_aberrant_splicing(transcript, missplicing.aberrant_splicing)):
468
477
  transcript.acceptors = new_boundaries['acceptors']
469
478
  transcript.donors = new_boundaries['donors']
470
- transcript.generate_mature_mrna().generate_protein()
479
+ transcript.generate_mature_mrna()
480
+ transcript.TIS = find_tis(ref_seq=reference_transcript, mut_seq=transcript)
481
+ transcript.generate_protein()
471
482
 
472
483
  alignment = get_logical_alignment(reference_transcript.protein, transcript.protein)
473
484
  deleted, inserted = find_indels_with_mismatches_as_deletions(alignment.seqA, alignment.seqB)
@@ -475,8 +486,11 @@ async def oncosplice_prototype(mut_id, splicing_threshold=0.5, protein_coding=Tr
475
486
  temp_cons = np.convolve(cons_vector * modified_positions, np.ones(window_length)) / window_length
476
487
  affected_cons_scores = max(temp_cons)
477
488
  percentile = (
478
- sorted(cons_vector).index(next(x for x in sorted(cons_vector) if x >= affected_cons_scores)) / len(
479
- cons_vector))
489
+ sorted(cons_vector).index(next(x for x in sorted(cons_vector) if x >= affected_cons_scores)) / len(
490
+ cons_vector))
491
+
492
+ out = domains_df.apply(lambda row: inspect_domain(row, va, vb), axis=1)
493
+ domains_affected = '+'.join([f'{a}:{b}' for a, b in list(zip(out.domain_identifier, out.score))])
480
494
 
481
495
  report = OncospliceAnnotator(reference_transcript, transcript, mutation)
482
496
  report['mut_id'] = mut_id
@@ -486,13 +500,13 @@ async def oncosplice_prototype(mut_id, splicing_threshold=0.5, protein_coding=Tr
486
500
  report['isoform_prevalence'] = new_boundaries['path_weight']
487
501
  report['full_missplicing'] = missplicing.aberrant_splicing
488
502
  report['missplicing'] = max(missplicing)
503
+ report['domains_affected'] = domains_affected
489
504
  # report['reference_resemblance'] = reference_gene_proteins.get(variant_isoform.protein, None)
490
- results.append(report)
505
+ results.append(pd.Series(report))
491
506
 
492
- report = pd.DataFrame(results)
507
+ report = pd.concat(results, axis=1).T
493
508
  return report
494
509
 
495
510
 
496
-
497
511
  if __name__ == '__main__':
498
512
  pass
geney/seqmat_utils.py CHANGED
@@ -203,7 +203,7 @@ class SeqMat:
203
203
  return SeqMat('ATG')
204
204
 
205
205
  def translate(self, tis_index):
206
- from Bio import Seq
206
+ from Bio.Seq import Seq
207
207
  return Seq(self.orf_seqmat(tis_index).seq).translate()
208
208
 
209
209
 
geney/tis_utils.py ADDED
@@ -0,0 +1,175 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ import os
4
+ from scipy.stats import percentileofscore
5
+ import shelve
6
+ from Bio.Align import PairwiseAligner
7
+ from geney import config
8
+
9
+ p = PairwiseAligner()
10
+
11
+
12
def find_tis(ref_seq, mut_seq, left_context=100, right_context=102, organism='hg38'):
    """Choose the translation initiation site (TIS) to use for a mutated transcript.

    If every reference start-codon index is still present in the mutated
    mature mRNA and the padded sequence context around the start codon is
    identical in both sequences, the reference TIS is returned unchanged.
    Otherwise a TIS is re-acquired: candidate start codons in the mutated
    mRNA are shortlisted by how well their translation aligns to the
    reference protein, and the shortlisted candidates are ranked by TITER
    score percentile.

    Args:
        ref_seq: Reference transcript; must expose ``mature_mrna`` and ``TIS``.
        mut_seq: Mutated transcript; must expose ``mature_mrna``.
        left_context: Number of positions taken upstream of a candidate when
            extracting its TITER context.
        right_context: Number of positions taken downstream of a candidate
            when extracting its TITER context.
        organism: Key of the organism section of ``config`` that holds
            ``titer_path``.

    Returns:
        The position (a value from row 1 of the mutated ``seqmat``) of the
        selected TIS.

    Raises:
        AssertionError: If the reference start-codon indices are not present
            in the reference mature mRNA.
    """
    tis_coords = ref_seq.mature_mrna.asymmetric_indices(ref_seq.TIS, left_context=0, right_context=3)
    ref_seq, mut_seq = ref_seq.mature_mrna, mut_seq.mature_mrna

    def _tis_context(seq, position):
        # Padded window around ``position`` in the layout TITER is fed.
        return seq.asymmetric_subseq(position, left_context=left_context,
                                     right_context=right_context, padding='$')

    # 1. Is the start codon (the indices) conserved in the mutated sequence?
    assert all(a in ref_seq.seqmat[1, :] for a in tis_coords), \
        "Start codon indices specified not found in the reference sequence."
    tis_conserved = all(a in mut_seq.seqmat[1, :] for a in tis_coords)

    # 2. If so, is the context around that start codon unchanged? Then the
    #    reference TIS can be reused as is.
    if tis_conserved and _tis_context(ref_seq, tis_coords[0]) == _tis_context(mut_seq, tis_coords[0]):
        return tis_coords[0]

    # 3. Otherwise re-acquire the TIS from the mutated sequence.
    # The config is keyed by organism first (see config_setup.get_config).
    sc_table = pd.read_pickle(config[organism]['titer_path'] / 'titer_tis_scores.pickle')
    ref_protein = ref_seq.translate(tis_coords[0])

    # All per-position work below indexes the mutated sequence, so it must be
    # sized by the mutated sequence (not the reference, whose length can differ).
    n_positions = len(mut_seq.seq)

    # a/b. Score each acceptable start codon by how well the protein it would
    #      produce aligns to the reference protein; other positions score 0.
    alignment_scores = np.array([
        p.align(ref_protein, mut_seq.translate(mut_seq.seqmat[1, i])).score
        if mut_seq.seq[i:i + 3] in TITER_acceptable_TISs else 0
        for i in range(n_positions)])

    # c. Keep only candidates scoring strictly above the fifth-highest score.
    shortlisted = alignment_scores > sorted(alignment_scores)[-5]

    # d. Rank each shortlisted candidate by the percentile of its own TITER
    #    score; positions that were not shortlisted keep the sentinel 100.
    percentiles = np.full(n_positions, 100.0)
    for i in np.flatnonzero(shortlisted):
        candidate_score = retrieve_titer_score(_tis_context(mut_seq, mut_seq.seqmat[1, i]))
        percentiles[i] = percentileofscore(sc_table.tis_score, candidate_score)

    # NOTE(review): the lowest percentile wins here, as in the original code.
    # percentileofscore grows with the score, so this picks the weakest
    # shortlisted candidate - confirm whether the maximum was intended.
    best_position = int(np.argmin(percentiles))
    return mut_seq.seqmat[1, best_position]
70
+
71
+
72
def seq_matrix(seq_list, length=203, tis_start=100, tis_end=102):
    """One-hot encode sequences into a ``(len(seq_list), length, 8)`` array.

    Bases outside the ``[tis_start, tis_end]`` window are written to
    channels 0-3 (A, T, C, G); bases inside the window are written to
    channels 4-7 in the same order. The padding character ``'$'`` and any
    other character leave an all-zero row.

    Args:
        seq_list: Iterable-sized collection of sequences (strings).
        length: Number of positions allocated per sequence.
        tis_start: First position (inclusive) of the codon window.
        tis_end: Last position (inclusive) of the codon window.

    Returns:
        A float ``numpy.ndarray`` of shape ``(len(seq_list), length, 8)``.

    Raises:
        IndexError: If an A/T/C/G base occurs at a position ``>= length``.
    """
    base_channel = {'A': 0, 'T': 1, 'C': 2, 'G': 3}
    tensor = np.zeros((len(seq_list), length, 8))
    for i, seq in enumerate(seq_list):
        for j, base in enumerate(seq):
            channel = base_channel.get(base)
            if channel is None:
                # '$' padding and unknown symbols stay all-zero.
                continue
            if tis_start <= j <= tis_end:
                # Codon-window bases use the second group of four channels.
                channel += 4
            tensor[i, j, channel] = 1
    return tensor
98
+
99
+
100
def build_titer_model(TITER_path=None):
    """Build the TITER ensemble and load its pretrained weights.

    One network is built per weights file ``bestmodel_{i}.hdf5`` (``i`` in
    0..31) found under ``TITER_path``. Each ensemble member gets its own
    freshly constructed layers; reusing the layer objects of a single
    template model would make every member share, and overwrite, the same
    weights.

    Args:
        TITER_path: Directory containing the weight files. When ``None`` the
            path is read from ``config`` at call time.

    Returns:
        A list of Keras models, one per weights file that exists. Missing
        files are reported with a printed warning and skipped, so the list
        can hold fewer than 32 models.
    """
    print('Building TITER model...')
    from tensorflow.keras.constraints import MaxNorm
    from tensorflow.keras.layers import Conv1D, MaxPool1D, LSTM, Dropout, Flatten, Dense, Activation
    from tensorflow.keras import Sequential, Input

    if TITER_path is None:
        # Resolved lazily so a missing key fails on use, not at definition.
        # NOTE(review): config_setup only shows 'titer_path' under the
        # organism section; confirm that 'titer_setup' is a valid key.
        TITER_path = config['titer_setup']

    def _new_model():
        # Fresh layer instances on every call, so weights are never shared.
        model = Sequential()
        model.add(Input(shape=(203, 8)))
        model.add(Conv1D(filters=128,
                         kernel_size=3,
                         padding='valid',
                         kernel_constraint=MaxNorm(3),
                         activation='relu'))
        model.add(MaxPool1D(3))
        model.add(Dropout(rate=0.21370950078747658))
        model.add(LSTM(units=256,
                       return_sequences=True))
        model.add(Dropout(rate=0.7238091317104384))
        model.add(Flatten())
        model.add(Dense(1))
        model.add(Activation('sigmoid'))

        model.compile(loss='binary_crossentropy',
                      optimizer='nadam',
                      metrics=['accuracy'])
        return model

    models = []

    # Load each available weights file into its own model instance.
    for i in range(32):
        weights_path = os.path.join(TITER_path, f"bestmodel_{i}.hdf5")

        if os.path.exists(weights_path):
            member = _new_model()
            member.load_weights(weights_path)
            models.append(member)
            print(f"Loaded model {i} with weights from {weights_path}")
        else:
            print(f"Warning: Weights file {weights_path} not found")

    return models
141
+
142
+
143
def calculate_titer_score(candidate_seq, titer_model=None):
    """Return the summed TITER ensemble prediction for one candidate sequence.

    Args:
        candidate_seq: Sequence context to score; it is encoded with
            ``seq_matrix`` before prediction.
        titer_model: Iterable of models exposing ``predict``. Defaults to the
            module-level ``TITER_MODEL``.

    Returns:
        The sum of the single prediction of every model in ``titer_model``.
    """
    if titer_model is None:
        titer_model = TITER_MODEL
    # Wrap in a list so the encoded input keeps a leading batch dimension.
    processed_seq = seq_matrix([candidate_seq])
    analyzed_score = np.zeros((1, 1))

    # Iterate over the models actually supplied: the ensemble can hold fewer
    # than 32 members when some weight files were missing at build time.
    for model in titer_model:
        analyzed_score += model.predict(processed_seq, verbose=0)
    return analyzed_score[0][0]
156
+
157
+
158
def retrieve_titer_score(sequence, filename='sequences_shelve.db'):
    """Return the TITER score for ``sequence``, using a shelve file as cache.

    Args:
        sequence: Sequence context used as the cache key.
        filename: Path of the shelve file that stores computed scores.

    Returns:
        The stored score when ``sequence`` is already in the shelf; otherwise
        the score computed by ``calculate_titer_score`` with ``TITER_MODEL``,
        which is written to the shelf before being returned.
    """
    with shelve.open(filename) as cache:
        # Cache hit: skip the model entirely.
        if sequence in cache:
            return cache[sequence]

        # Cache miss: compute once, persist, then hand back the fresh value.
        score = calculate_titer_score(sequence, TITER_MODEL)
        cache[sequence] = score
        return score
169
+
170
+
171
+ TITER_acceptable_TISs = ['ATG', 'CTG', 'ACG', 'TTG', 'GTG']
172
+ codon_tis_prior = {'ATG': 3.5287101354987644, 'CTG': 1.746859242328512, 'ACG': 1.3535552403706805,
173
+ 'TTG': 1.1364995562364615, 'GTG': 1.218573747658257}
174
+ stop_codons = ['TAA', 'TAG', 'TGA']
175
+ TITER_MODEL = build_titer_model()
@@ -120,3 +120,5 @@ def get_end_codon(seq, start_position):
120
120
 
121
121
  def calculate_titer_score(seq, pos):
122
122
  return 0
123
+
124
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geney
3
- Version: 1.2.32
3
+ Version: 1.2.34
4
4
  Summary: A Python package for gene expression modeling.
5
5
  Home-page: https://github.com/nicolaslynn/geney
6
6
  Author: Nicolas Lynn
@@ -13,7 +13,7 @@ Classifier: License :: Free for non-commercial use
13
13
  Classifier: Operating System :: POSIX :: Linux
14
14
  Classifier: Operating System :: MacOS
15
15
  Classifier: Programming Language :: Python :: 3.9
16
- Requires-Python: >3.9
16
+ Requires-Python: ==3.10
17
17
  Requires-Dist: numpy
18
18
  Requires-Dist: pandas
19
19
  Requires-Dist: networkx
@@ -1,25 +1,26 @@
1
1
  geney/Fasta_segment.py,sha256=0zCdzPUbDeM9Rz642woH5Q94pwI46O0fE3H8w0XWebc,11255
2
2
  geney/__init__.py,sha256=eBdDl42N6UhcYeZDjOnv199Z88fI5_8Y6xW8447OKXM,755
3
- geney/config_setup.py,sha256=VA6mhVGMRadwlpEx4m1wrssmDM8qpfKT21MAijIwjyQ,428
3
+ geney/config_setup.py,sha256=klm_k7Ca_703DpeGBcGoDqz1XwHQhNXENPKjj_xfSQw,608
4
4
  geney/data_setup.py,sha256=2RHmuvcGUQbEglXQEZr0C2QPDTQYRZOEm0EcmyfQJgU,12229
5
5
  geney/graphic_utils.py,sha256=tjm6IDQ1BdfSeuPYzjlqAUHFQoDYH9jXTzJjKFS4Hh4,11078
6
6
  geney/gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
7
7
  geney/immune_utils.py,sha256=ZRni5ttrhpYBnmNr0d0ZatIbNPYs4nmQuoUO00SpsS4,5271
8
8
  geney/mutation_utils.py,sha256=C_kv2MB_L8LlhX3W2ooXjJ3uDoJ8zX1WeDtZKoBZJkI,1547
9
- geney/oncosplice.py,sha256=7wf0_-Gkc_G9HhUXjORHk3buZ66JzVzSFVQ4EZOtUAE,21787
9
+ geney/oncosplice.py,sha256=QETLNIzc3T1CYausLD3W_jCSJveDkg2F6WnIMagVLT0,22536
10
10
  geney/pangolin_utils.py,sha256=ETTGpuaQgdZ1v8H0NP8sbTEfGWu0VXUFUS7wsURsTc4,2991
11
11
  geney/power_utils.py,sha256=MehZFUdkJ2EFUot709yPEDxSkXmH5XevMebX2HD768A,7330
12
- geney/seqmat_utils.py,sha256=TDWhE5oVTGJceaO6YmE7I_BEWRxWLT74_3rkmY1M0Fs,18368
12
+ geney/seqmat_utils.py,sha256=YV5DFLbfjXLIswPGvqK1-eEfwn9TUby0b2kewdGAKws,18372
13
13
  geney/spliceai_utils.py,sha256=gIGPC8u3J15A7EQrk2Elho5PbF9MmUUNopGGH-eEV8s,1873
14
14
  geney/splicing_utils.py,sha256=q47EdcsHrp4aLIPVWvkGBJSzS3l3DKiD9DNDsPpZdHk,16075
15
15
  geney/survival_utils.py,sha256=2CAkC2LsspicHIdrqsiPnjgvpr5KHDUfLFFqnRbPJqs,5762
16
16
  geney/tcga_utils.py,sha256=vXSMf1OxoF_AdE_rMguy_BoYaart_E1t4FFMx2DS1Ak,15585
17
+ geney/tis_utils.py,sha256=GlzyO_QvMFt5tM4kewQ1L2l1KAYrCixgw8ny_WsGsYQ,8040
17
18
  geney/utils.py,sha256=EsKvBM-Nz2a3_4ZAhF4Dxd4PwT7_6YYKpxEN4LLgg10,2174
18
19
  geney/translation_initiation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
- geney/translation_initiation/tis_utils.py,sha256=iXrWVijyPe-f8I9rEVGdxNnXBrOGPoKFjmvaOEnQYNE,4446
20
+ geney/translation_initiation/tis_utils.py,sha256=AF3siFjuQH-Rs44EV-80zHdbxRMvN4woLFSHroWIETc,4448
20
21
  geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
21
22
  geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
22
- geney-1.2.32.dist-info/METADATA,sha256=aHeSBHWq3b1li4G_CI2ClUEHJc5SfWHowqKrkZbQPGk,948
23
- geney-1.2.32.dist-info/WHEEL,sha256=fS9sRbCBHs7VFcwJLnLXN1MZRR0_TVTxvXKzOnaSFs8,110
24
- geney-1.2.32.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
25
- geney-1.2.32.dist-info/RECORD,,
23
+ geney-1.2.34.dist-info/METADATA,sha256=LfYqiCiEw25eyzdGGYy2OrJ7rGC05l1lnaF8eupWrTE,950
24
+ geney-1.2.34.dist-info/WHEEL,sha256=fS9sRbCBHs7VFcwJLnLXN1MZRR0_TVTxvXKzOnaSFs8,110
25
+ geney-1.2.34.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
26
+ geney-1.2.34.dist-info/RECORD,,
File without changes