geney 1.3.79__py2.py3-none-any.whl → 1.4.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
geney/_tis_utils.py ADDED
@@ -0,0 +1,172 @@
+ import numpy as np
+ import pandas as pd
+ import os
+ from scipy.stats import percentileofscore
+ import shelve
+ from Bio.Align import PairwiseAligner
+ from geney import config
+
+ p = PairwiseAligner()
+
+
+ def find_tis(reference_mrna, mutated_mrna, ref_tis_pos, left_context=100, right_context=102):
+     '''
+     mature_mrna: row 0 --> encoded nucleotides
+                  row 1 --> genomic indices
+                  row 2 --> super positions (in case of insertions or deletions);
+                            row1 + row2 = cohesive & monotonic genomic indices
+                  row 3 --> binary: mutated position or not
+     mature_mrna.seq
+     mature_mrna.indices
+     '''
+     tis_coords = reference_mrna.mature_mrna.asymmetric_indices(ref_tis_pos, left_context=0, right_context=3)
+     ref_seq, mut_seq = reference_mrna.mature_mrna, mutated_mrna.mature_mrna
+
+     # 1. Is the start codon (the indices) conserved in the mut sequence?
+     assert all(a in ref_seq.seqmat[1, :] for a in tis_coords), \
+         "Start codon indices specified not found in the reference sequence."
+     tis_conserved = all(a in mut_seq.seqmat[1, :] for a in tis_coords)
+
+     # 2. If condition 1 is passed, is the context around that start codon the same
+     #    in both the reference and the mutated sequence?
+     context_conserved = False
+     if tis_conserved:
+         context_conserved = ref_seq.asymmetric_subseq(
+             tis_coords[0], left_context=left_context, right_context=right_context,
+             padding='$') == mut_seq.asymmetric_subseq(
+             tis_coords[0], left_context=left_context, right_context=right_context,
+             padding='$')
+
+     if context_conserved:
+         return [(tis_coords[0], 1, 'canonical')]
+
+     sc_table = pd.read_pickle(config['titer_path'] / 'titer_tis_scores.pickle')
+     ref_seq_tis_context = ref_seq.asymmetric_subseq(tis_coords[0], left_context=left_context,
+                                                     right_context=right_context, padding='$')
+
+     ref_titer_score = retrieve_titer_score(ref_seq_tis_context)
+     ref_titer_rank = percentileofscore(sc_table['tis_score'], ref_titer_score)
+     ref_protein = ref_seq.translate(tis_coords[0])
+
+     # Flag every position whose codon is an acceptable TITER start codon
+     candidate_positions = np.array([mut_seq.seq[i:i + 3] in TITER_acceptable_TISs
+                                     for i in range(len(mut_seq.seq))])
+     # Score each candidate by aligning its translation against the reference protein
+     candidate_positions = np.array(
+         [p.align(ref_protein, mut_seq.translate(mut_seq.seqmat[1, i])).score if candidate_positions[i] else 0
+          for i in range(len(mut_seq.seq))])
+
+     candidate_positions = candidate_positions > sorted(candidate_positions)[-5]  # implement correct logic
+     # TITER-score each remaining candidate's own context
+     candidate_positions = np.array([retrieve_titer_score(
+         mut_seq.asymmetric_subseq(mut_seq.seqmat[1, i], left_context=left_context,
+                                   right_context=right_context, padding='$'))
+         if candidate_positions[i] > 0 else False
+         for i in range(len(mut_seq.seq))])
+     candidate_positions = np.array(
+         [percentileofscore(sc_table.tis_score, candidate_positions[i]) if candidate_positions[i] != False else 100
+          for i in range(len(mut_seq.seq))])
+     best_position = np.where(candidate_positions == min(candidate_positions))[0][0]
+     out = mut_seq.seqmat[1, best_position]
+     return out  # output: [(genomic_coord1, probability, filter_tag), (genomic_coord2, probability, filter_tag)]
+
+
+ def seq_matrix(seq_list):
+     # One-hot encode sequences into (N, 203, 8): channels 0-3 encode A/T/C/G
+     # outside the candidate codon (positions 100-102), channels 4-7 inside it.
+     # '$' padding stays all-zero.
+     tensor = np.zeros((len(seq_list), 203, 8))
+     channels = {'A': 0, 'T': 1, 'C': 2, 'G': 3}
+     for i, seq in enumerate(seq_list):
+         for j, s in enumerate(seq):
+             if s not in channels:
+                 continue
+             in_codon = 100 <= j <= 102
+             tensor[i][j][channels[s] + (4 if in_codon else 0)] = 1
+     return tensor
+
+
+ def build_titer_model(TITER_path=config['hg38']['titer_path']):
+     print('Building TITER model...')
+     from tensorflow.keras.constraints import MaxNorm
+     from tensorflow.keras.layers import Conv1D, MaxPool1D, LSTM, Dropout, Flatten, Dense, Activation
+     from tensorflow.keras import Sequential, Input
+
+     def _new_model():
+         # Build a fresh architecture for each ensemble member; reusing one
+         # model's layer objects (Sequential(model.layers)) would make every
+         # copy share the same weights.
+         model = Sequential()
+         model.add(Input(shape=(203, 8)))
+         model.add(Conv1D(filters=128,
+                          kernel_size=3,
+                          padding='valid',
+                          kernel_constraint=MaxNorm(3),
+                          activation='relu'))
+         model.add(MaxPool1D(3))
+         model.add(Dropout(rate=0.21370950078747658))
+         model.add(LSTM(units=256, return_sequences=True))
+         model.add(Dropout(rate=0.7238091317104384))
+         model.add(Flatten())
+         model.add(Dense(1))
+         model.add(Activation('sigmoid'))
+         model.compile(loss='binary_crossentropy',
+                       optimizer='nadam',
+                       metrics=['accuracy'])
+         return model
+
+     models = []
+
+     # Load the 32 sets of pretrained weights, one model instance per file
+     for i in range(32):
+         model_copy = _new_model()
+         weights_path = os.path.join(TITER_path, f"bestmodel_{i}.hdf5")
+
+         if os.path.exists(weights_path):
+             model_copy.load_weights(weights_path)  # Load weights into the new model instance
+             models.append(model_copy)
+         else:
+             print(f"Warning: Weights file {weights_path} not found")
+
+     return models
+
+
+ def calculate_titer_score(candidate_seq, titer_model=None):  # , prior):
+     if titer_model is None:
+         titer_model = TITER_MODEL
+     processed_seq = seq_matrix([candidate_seq])  # Wrap in list to keep dimensions consistent
+     # prior = np.array([prior]).reshape(1, 1)
+     analyzed_score = np.zeros((1, 1))
+
+     # Sum the predictions across the ensemble of models
+     for model in titer_model:
+         y_pred = model.predict(processed_seq, verbose=0)
+         analyzed_score += y_pred  # * prior
+     return analyzed_score[0][0]
+
+
+ def retrieve_titer_score(sequence, filename='sequences_shelve.db'):
+     # Open the shelf (acts like a dictionary, stored in a file)
+     with shelve.open(filename) as db:
+         # Check if the sequence is already in the shelf
+         if sequence in db:
+             return db[sequence]
+         else:
+             # If not, run the model, store the result, and return it
+             value = calculate_titer_score(sequence, TITER_MODEL)
+             db[sequence] = value
+             return value
+
+
+ TITER_acceptable_TISs = ['ATG', 'CTG', 'ACG', 'TTG', 'GTG']
+ codon_tis_prior = {'ATG': 3.5287101354987644, 'CTG': 1.746859242328512, 'ACG': 1.3535552403706805,
+                    'TTG': 1.1364995562364615, 'GTG': 1.218573747658257}
+ stop_codons = ['TAA', 'TAG', 'TGA']
+ TITER_MODEL = build_titer_model()
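
For orientation, a minimal usage sketch of the new TIS utilities (the 203-nt context below is a made-up placeholder; the module expects 100 nt upstream, the candidate codon at positions 100-102, and 100 nt downstream, with '$' used as padding where context is missing):

    from geney._tis_utils import retrieve_titer_score, seq_matrix

    context = 'A' * 100 + 'ATG' + 'G' * 100       # hypothetical 203-nt TIS context
    print(seq_matrix([context]).shape)            # (1, 203, 8) one-hot encoding
    score = retrieve_titer_score(context)         # cached in sequences_shelve.db after the first call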
geney/immune_utils.py CHANGED
@@ -1,7 +1,7 @@
  import subprocess
  import logging
  import tempfile
- from geney import config_setup
+ from geney import _config_setup
  import re
  from io import StringIO
  import pandas as pd
geney/pipelines.py ADDED
@@ -0,0 +1,66 @@
+ # Home of the frequently used pipelines that everyone needs
+ import pandas as pd
+ from datetime import datetime
+
+ from .Gene import Gene
+ from .utils.mutation_utils import MutationalEvent
+ from .SpliceSimulator import SpliceSimulator
+ from .Oncosplice import Oncosplice
+ from .utils.TranscriptLibrary import TranscriptLibrary
+
+
+ def max_splicing_delta(mut_id, transcript_id=None, splicing_engine='spliceai'):
+     m = MutationalEvent(mut_id)
+     assert m.compatible(), 'Mutations in event are incompatible'
+     reference_transcript = Gene.from_file(
+         m.gene).transcript(transcript_id).generate_pre_mrna().generate_mature_mrna().generate_protein()
+     tl = TranscriptLibrary(reference_transcript, m)
+     splicing_results = tl.predict_splicing(m.position, engine=splicing_engine, inplace=True).get_event_columns('event')
+     ss = SpliceSimulator(splicing_results, tl.event, feature='event', max_distance=100_000_000)
+     return ss.max_splicing_delta('event_prob')
+
+
+ def oncosplice_pipeline_single_transcript(mut_id, transcript_id=None, splicing_engine='spliceai'):
+     m = MutationalEvent(mut_id)
+     assert m.compatible(), 'Mutations in event are incompatible'
+     reference_transcript = Gene.from_file(
+         m.gene).transcript(transcript_id).generate_pre_mrna().generate_mature_mrna().generate_protein()
+     tl = TranscriptLibrary(reference_transcript, m)
+     splicing_results = tl.predict_splicing(m.position, engine=splicing_engine, inplace=True).get_event_columns('event')
+     ss = SpliceSimulator(splicing_results, tl.event, feature='event', max_distance=100_000_000)
+
+     base_report = pd.Series({'mut_id': mut_id,
+                              'gene': m.gene,
+                              'transcript_id': reference_transcript.transcript_id,
+                              'primary_transcript': reference_transcript.primary_transcript,
+                              'splicing_engine': splicing_engine,
+                              'time_of_execution': datetime.now().strftime("%Y-%m-%d %H:%M:%S")})
+
+     ss_metadata = ss.report(m.positions[0])
+     report = []
+     for variant_transcript, isoform_metadata in ss.get_viable_transcripts(metadata=True):
+         onco = Oncosplice(reference_transcript.protein, variant_transcript.protein, reference_transcript.cons_vector)
+         report.append(pd.concat([base_report, ss_metadata, isoform_metadata, onco.get_analysis_series()]))
+     return pd.DataFrame(report)
+
+
+ def oncosplice_pipeline_all_transcripts(mut_id, splicing_engine='spliceai'):
+     m = MutationalEvent(mut_id)
+     assert m.compatible(), 'Mutations in event are incompatible'
+     reports = []
+     for transcript_id in Gene.from_file(m.gene).transcripts.keys():
+         reports.append(oncosplice_pipeline_single_transcript(mut_id, transcript_id, splicing_engine=splicing_engine))
+     return pd.concat(reports, axis=0)  # stack the per-transcript report rows
+
+
+ def get_tcga_annotations(mut_ids):
+     pass
+
+
+ def generate_epitopes():
+     pass
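
A minimal usage sketch of the new pipeline entry points (the mutation identifier below is purely hypothetical; use whatever string format MutationalEvent accepts):

    from geney.pipelines import max_splicing_delta, oncosplice_pipeline_single_transcript

    mut_id = 'KRAS:12:25245350:C:T'                          # hypothetical identifier
    delta = max_splicing_delta(mut_id)                       # largest predicted splicing change
    report = oncosplice_pipeline_single_transcript(mut_id)   # per-isoform Oncosplice report (DataFrame)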
geney/power_utils.py CHANGED
@@ -5,7 +5,7 @@ from dask.distributed import Client, wait
  import os
  from tqdm import tqdm
  from pathlib import Path
- from geney import config_setup
+ from geney import _config_setup
  from geney.utils import contains, available_genes
  import warnings
  import gc
@@ -0,0 +1,260 @@
+ __all__ = ['Fasta_segment']
+
+ import os
+ import numpy as np
+
+ class Fasta_segment():
+     '''
+     Efficient reads of segments, starting from a given offset,
+     from a FASTA file.
+
+     ================================================================
+     This class can be used ONLY for FASTA files where ALL sequence
+     rows (excluding headers) are of the same length !!
+     ================================================================
+
+     In case of a FASTA with multiple sequences (multiple headers), the header name
+     of the corresponding sequence must be provided. The header name is the string
+     that follows the character ">".
+     For example, the header name of the header:
+
+     ">NC_000011.10 Homo sapiens chromosome 11, GRCh38.p13 Primary Assembly"
+
+     is "NC_000011.10".
+
+     In case of a FASTA with a single sequence (single header), the header name
+     is not required.
+
+     The class reads only the segment into memory and not the whole FASTA sequence.
+     This can be used, for example, to read a segment from a FASTA file
+     that contains a chromosome of the human genome.
+     '''
+     def __init__(self, files_info=None) -> None:
+         '''
+         files_info (optional input) is a dictionary:
+         For a FASTA with multiple headers, the key is a fasta file name and
+         the header name as follows: "<fasta file name>:<header name>".
+         For a FASTA with a single header the key is simply the file name (the header
+         can also be provided, in which case the key will include it as well; however, this is not required).
+         The value is a dictionary that contains basic statistics of the fasta file.
+         Keys of files_info[file] are:
+             'init_loc': offset of the first NT in file (0 indicates the first NT)
+             'l_size': number of NTs per line, excluding \n
+             'line_size': number of NTs per line, including \n
+             'file_size': total number of NTs in file, excluding \n (and excluding the header). This is
+                          available only for FASTA files with a single header.
+
+         For example: files_info = {'file1.fas': {'init_loc': 70, 'l_size': 80, 'line_size': 81, 'file_size': 240},
+                                    'file2.fas:NC_000011.10': {'init_loc': 70, 'l_size': 80, 'line_size': 81}}
+         '''
+         self.files_info = files_info if files_info else {}
+
+     def __repr__(self) -> str:
+         return f"Fasta_segment({self.files_info})"
+
+     def __str__(self) -> str:
+         return "Efficient reads of segments from FASTA files.\n"
+
+     def __len__(self) -> int:
+         return len(self.files_info)
+
+     def show_processed_files(self) -> list:
+         return list(self.files_info.keys())
+
+     def show_files_info(self) -> None:
+         print(self.str_files_info())
+
+     def str_files_info(self) -> str:
+         s = ''
+         for k in self.files_info.keys():
+             s += f"{k}:\n"
+             for ki, vi in self.files_info[k].items():
+                 s += f"\t{ki}: {vi}\n"
+
+         return s
+
+     def read_segment(self, file: str, offset: int, size: int) -> str:
+         '''
+         Reads a segment from a FASTA with a single sequence (single header).
+         file - FASTA file name
+         offset - 0-based offset of the segment start in file (from the first sequence NT).
+                  For example, a value of 1 indicates starting from the second NT in the file.
+         size - number of NTs in the segment to read.
+         '''
+         if not os.path.exists(file):
+             print(f"Error: file {file} does not exist !!")
+             return ""
+
+         with open(file, 'rt') as fp:
+             if file in self.files_info:
+                 l_size = self.files_info[file]['l_size']
+                 line_size = self.files_info[file]['line_size']
+                 init_loc = self.files_info[file]['init_loc']
+             else:
+                 init_loc, line_size, l_size = self.__compute_file_stats(fp)
+                 self.files_info[file] = {'init_loc': init_loc, 'l_size': l_size, 'line_size': line_size}
+
+             offset_num_lines, offset_in_line = offset // l_size, np.mod(offset, l_size)
+             # accounting for extra characters due to multiple \n that will be discarded
+             add_for_newline = ((offset + size - 1) // l_size) - offset_num_lines
+             fp.seek(init_loc + offset_num_lines * line_size + offset_in_line)
+             return fp.read(size + add_for_newline).replace('\n', '')
+
+     def read_segment_endpoints(self, file: str, start_loc: int, end_loc: int):
+         # Reuse this instance (and its cached file stats) rather than a fresh Fasta_segment
+         seq = self.read_segment(file, start_loc - 1, end_loc - start_loc + 1).upper()
+         indices = np.arange(start_loc, end_loc + 1)
+         assert len(seq) == len(indices), \
+             f'reference data not compatible; {len(seq)}, {len(indices)}, start: {start_loc}, end: {end_loc}, {file}'
+         return {"nucleotides": seq, "index": indices}
+
+     def __compute_file_stats(self, fp) -> tuple:
+         '''
+         Computes the stats that are needed to read a segment from
+         a FASTA file with a single sequence.
+         '''
+         # account for a header (if it exists) in the first line of the file
+         fp.seek(0, os.SEEK_SET)
+         if fp.read(1) == ">":
+             fp.readline()
+             init_loc = fp.tell()
+         else:
+             init_loc = 0
+
+         # init_loc is the location (0-based) of the first NT in file
+         fp.seek(init_loc)
+         fp.readline()  # advance handle to beginning of second line
+         line_size = fp.tell() - init_loc  # characters per line, including the \n
+         return init_loc, line_size, line_size - 1
+
+     def total_num_chars(self, file: str) -> int:
+         '''
+         Returns the total number of characters (NTs) in a FASTA
+         file (excluding the header and \n) without reading the whole file.
+
+         This function supports only FASTA files with a single header.
+         Support for FASTAs with multiple headers is TBD.
+         '''
+         if file in self.files_info:
+             return self.files_info[file]['file_size']
+         else:
+             return self.__compute_total_num_NTs(file)
+
+     def __compute_total_num_NTs(self, file: str) -> int:
+         '''
+         Computes the total number of characters (NTs) in a FASTA
+         file (excluding the header and \n) and updates self.files_info.
+         '''
+         with open(file, 'rt') as fp:
+             init_loc, line_size, l_size = self.__compute_file_stats(fp)
+
+             # check if the last character is a newline
+             last_loc = fp.seek(0, os.SEEK_END)
+             fp.seek(fp.tell() - 1, os.SEEK_SET)
+             last_char = fp.read()
+
+             delta = last_loc - init_loc
+             q, r = divmod(delta, line_size)
+             num_chars = delta - q  # remove counts of newlines
+             if q == 1 and r == 0 and (last_char != '\n'):
+                 num_chars += 1  # if the file has only one line and the last character is not a newline, add one
+
+         self.files_info[file] = {'init_loc': init_loc,
+                                  'l_size': l_size,
+                                  'line_size': line_size,
+                                  'file_size': num_chars}
+
+         return num_chars
+
+     def multiple_headers_read_segment(self, file: str, header_name: str, offset: int, size: int) -> str:
+         '''
+         Reads from a FASTA with multiple sequences (multiple headers).
+         The offset and size here are relative to the sequence with header name header_name.
+
+         For example, for header_name='name1', offset=3, size=4, the returned sequence is of size 4 NTs, starting
+         from offset 3 of the sequence that follows the header with header name header_name.
+         '''
+         init_loc = None
+         token = file + ':' + header_name
+         with open(file, 'rt') as fp:
+             if token in self.files_info:
+                 l_size = self.files_info[token]['l_size']
+                 line_size = self.files_info[token]['line_size']
+                 init_loc = self.files_info[token]['init_loc']
+             else:
+                 line = fp.readline()
+                 while line:
+                     if line[0] == '>':
+                         # header name
+                         name = line.split()[0][1:]
+                         if name == header_name:
+                             init_loc = fp.tell()  # file offset of the first NT of the corresponding sequence
+                             break
+
+                     line = fp.readline()
+
+                 if init_loc is None:
+                     print(f"Did not find header name {header_name} in file {file} !!")
+                     return ''
+                 else:
+                     fp.seek(init_loc)
+                     fp.readline()
+                     line_size = fp.tell() - init_loc
+                     l_size = line_size - 1
+                     self.files_info[token] = {'init_loc': init_loc, 'l_size': l_size, 'line_size': line_size}
+
+             offset_num_lines, offset_in_line = offset // l_size, np.mod(offset, l_size)
+             # accounting for extra characters due to multiple \n that will be discarded
+             add_for_newline = ((offset + size - 1) // l_size) - offset_num_lines
+             fp.seek(init_loc + offset_num_lines * line_size + offset_in_line)
+             return fp.read(size + add_for_newline).replace('\n', '')
+
+     def fasta_gen(self, file: str, segment_size: int, init_offset: int = 0, jump_size: int = 0) -> str:
+         '''
+         FASTA generator.
+         Returns an iterator for reading segments of segment_size NTs, separated by
+         jump_size NTs, starting from a (0-based) offset init_offset.
+
+         To read segments consecutively, set jump_size to 0 (the default value).
+         '''
+         try:
+             with open(file, 'rt') as fp:
+                 # handling the header
+                 fp.seek(0, os.SEEK_SET)
+                 if fp.read(1) == ">":
+                     fp.readline()
+                     init_loc = fp.tell()
+                 else:
+                     init_loc = 0
+
+                 # computing the number of characters (including \n) per line
+                 fp.seek(init_loc)
+                 fp.readline()
+                 line_size = fp.tell() - init_loc  # characters per line, including the \n
+                 ext_newlines = segment_size // line_size  # number of \n in a segment_size read with offset 0
+                 jump_newlines = jump_size // line_size  # same for jump_size
+
+                 # starting from init_offset (accounting for \n)
+                 fp.seek(init_loc + init_offset + (init_offset // line_size))
+
+                 while True:
+                     segment = fp.read(segment_size + ext_newlines).replace('\n', '')
+                     # might need another single read, as ext_newlines assumed reading from the beginning of a line
+                     if len(segment) < segment_size:
+                         segment += fp.read(1).replace('\n', '')
+
+                     if segment != '':
+                         yield segment
+                     else:
+                         break  # raise StopIteration
+
+                     # jump to the next segment (the jump excludes \n)
+                     if fp.read(jump_size + jump_newlines).count('\n') > jump_newlines:
+                         fp.seek(fp.tell() + 1)
+
+         except IOError as err:
+             print(f"Error: {file} not found !! ({err})")