geney 1.3.78__py2.py3-none-any.whl → 1.4.0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of geney might be problematic.
- geney/Gene.py +9 -10
- geney/Oncosplice.py +400 -0
- geney/SpliceSimulator.py +407 -0
- geney/Transcript.py +55 -57
- geney/__init__.py +47 -19
- geney/_config_setup.py +16 -0
- geney/_graphic_utils.py +269 -0
- geney/_gtex_utils.py +68 -0
- geney/_immune_utils.py +125 -0
- geney/{oncosplice.py → _oncosplice.py} +199 -156
- geney/_splicing_utils.py +693 -0
- geney/_survival_utils.py +143 -0
- geney/_tcga_utils.py +405 -0
- geney/_tis_utils.py +172 -0
- geney/immune_utils.py +1 -1
- geney/pipelines.py +66 -0
- geney/power_utils.py +1 -1
- geney/spliceai_utils.py +17 -17
- geney/utils/Fasta_segment.py +260 -0
- geney/utils/SeqMats.py +423 -0
- geney/utils/TranscriptLibrary.py +55 -0
- geney/utils/__init__.py +20 -0
- geney/utils/mutation_utils.py +104 -0
- geney/utils/pangolin_utils.py +173 -0
- geney/utils/spliceai_utils.py +123 -0
- geney/utils/splicing_utils.py +525 -0
- geney/utils/utils.py +89 -0
- {geney-1.3.78.dist-info → geney-1.4.0.dist-info}/METADATA +1 -1
- geney-1.4.0.dist-info/RECORD +51 -0
- {geney-1.3.78.dist-info → geney-1.4.0.dist-info}/WHEEL +1 -1
- geney-1.3.78.dist-info/RECORD +0 -31
- {geney-1.3.78.dist-info → geney-1.4.0.dist-info}/top_level.txt +0 -0
geney/_tis_utils.py
ADDED
@@ -0,0 +1,172 @@
+import numpy as np
+import pandas as pd
+import os
+from scipy.stats import percentileofscore
+import shelve
+from Bio.Align import PairwiseAligner
+from geney import config
+
+p = PairwiseAligner()
+
+
+def find_tis(reference_mrna, mutated_mrna, ref_tis_pos, left_context=100, right_context=102):
+    '''
+    mature_mrna: row 0 --> encoded nucleotides
+                 row 1 --> genomic indices
+                 row 2 --> super positions (in case of insertions or deletions);
+                           row1 + row2 = cohesive & monotonic genomic indices
+                 row 3 --> binary: mutated position or not
+    mature_mrna.seq
+    mature_mrna.indices
+    '''
+    ref_seq, mut_seq = reference_mrna.mature_mrna, mutated_mrna.mature_mrna
+    tis_coords = ref_seq.asymmetric_indices(ref_tis_pos, left_context=0, right_context=3)
+
+    # 1. Is the start codon (the indices) conserved in the mut sequence?
+    assert all(a in ref_seq.seqmat[1, :] for a in
+               tis_coords), "Start codon indices specified not found in the reference sequence."
+    tis_conserved = all(a in mut_seq.seqmat[1, :] for a in tis_coords)
+
+    # 2. If condition 1 is passed, is the context around that start codon the same in both the reference and the mutated?
+    context_conserved = False
+    if tis_conserved:
+        context_conserved = ref_seq.asymmetric_subseq(tis_coords[0], left_context=left_context,
+                                                      right_context=right_context,
+                                                      padding='$') == mut_seq.asymmetric_subseq(tis_coords[0],
+                                                                                                left_context=left_context,
+                                                                                                right_context=right_context,
+                                                                                                padding='$')
+
+    if context_conserved:
+        return [(tis_coords[0], 1, 'canonical')]
+
+    sc_table = pd.read_pickle(config['titer_path'] / 'titer_tis_scores.pickle')
+    ref_seq_tis_context = ref_seq.asymmetric_subseq(tis_coords[0], left_context=left_context,
+                                                    right_context=right_context, padding='$')
+
+    ref_titer_score = retrieve_titer_score(ref_seq_tis_context)
+    ref_titer_rank = percentileofscore(sc_table['tis_score'], ref_titer_score)
+    ref_protein = ref_seq.translate(tis_coords[0])
+
+    candidate_positions = np.array([mut_seq.seq[i:i + 3] in TITER_acceptable_TISs for i in range(len(mut_seq.seq))])
+    candidate_positions = np.array(
+        [p.align(ref_protein, mut_seq.translate(mut_seq.seqmat[1, i])).score if candidate_positions[i] else 0
+         for i in range(len(ref_seq.seq))])
+
+    candidate_positions = candidate_positions > sorted(candidate_positions)[-5]  # implement correct logic
+    candidate_positions = np.array([retrieve_titer_score(
+        mut_seq.asymmetric_subseq(tis_coords[0], left_context=left_context, right_context=right_context,
+                                  padding='$')) if candidate_positions[i] > 0 else False for i in
+        range(len(ref_seq.seq))])
+    candidate_positions = np.array(
+        [percentileofscore(sc_table.tis_score, candidate_positions[i]) if candidate_positions[i] != False else 100
+         for i in range(len(ref_seq.seq))])
+    best_position = np.where(candidate_positions == min(candidate_positions))[0][0]
+    out = mut_seq.seqmat[1, best_position]
+    return out  # output: [(genomic_coord1, probability, filter_tag), (genomic_coord2, probability, filter_tag)]
+
+
+def seq_matrix(seq_list):
+    # One-hot encode each 203-nt window into 8 channels: A/T/C/G outside the
+    # candidate codon (channels 0-3) and A/T/C/G inside the codon at
+    # positions 100-102 (channels 4-7); '$' padding stays all-zero.
+    tensor = np.zeros((len(seq_list), 203, 8))
+    for i, seq in enumerate(seq_list):
+        for j, s in enumerate(seq):
+            if s == '$':
+                continue
+            offset = 4 if 100 <= j <= 102 else 0
+            if s == 'A':
+                tensor[i][j][offset + 0] = 1
+            elif s == 'T':
+                tensor[i][j][offset + 1] = 1
+            elif s == 'C':
+                tensor[i][j][offset + 2] = 1
+            elif s == 'G':
+                tensor[i][j][offset + 3] = 1
+    return tensor
+
+
+def build_titer_model(TITER_path=config['hg38']['titer_path']):
+    print('Building TITER model...')
+    from tensorflow.keras.constraints import MaxNorm
+    from tensorflow.keras.layers import Conv1D, MaxPool1D, LSTM, Dropout, Flatten, Dense, Activation
+    from tensorflow.keras import Sequential, Input
+
+    model = Sequential()
+    model.add(Input(shape=(203, 8)))
+    model.add(Conv1D(filters=128,
+                     kernel_size=3,
+                     padding='valid',
+                     kernel_constraint=MaxNorm(3),
+                     activation='relu'))
+    model.add(MaxPool1D(3))
+    model.add(Dropout(rate=0.21370950078747658))
+    model.add(LSTM(units=256,
+                   return_sequences=True))
+    model.add(Dropout(rate=0.7238091317104384))
+    model.add(Flatten())
+    model.add(Dense(1))
+    model.add(Activation('sigmoid'))
+
+    model.compile(loss='binary_crossentropy',
+                  optimizer='nadam',
+                  metrics=['accuracy'])
+
+    models = []
+
+    # Load weights into multiple instances of the model
+    for i in range(32):
+        model_copy = Sequential(model.layers)  # Create a new model instance with the same architecture
+        weights_path = os.path.join(TITER_path, f"bestmodel_{i}.hdf5")
+
+        if os.path.exists(weights_path):
+            model_copy.load_weights(weights_path)  # Load weights into the new model instance
+            models.append(model_copy)
+            # print(f"Loaded model {i} with weights from {weights_path}")
+        else:
+            print(f"Warning: weights file {weights_path} not found")
+
+    return models
+
+
+def calculate_titer_score(candidate_seq, titer_model=None):  # , prior):
+    if titer_model is None:
+        titer_model = TITER_MODEL
+    processed_seq = seq_matrix([candidate_seq])  # Wrap in list to keep dimensions consistent
+    # prior = np.array([prior]).reshape(1, 1)
+    analyzed_score = np.zeros((1, 1))
+
+    # Iterate through the models (assuming 32 models) and sum their scores
+    for i in range(32):
+        y_pred = titer_model[i].predict(processed_seq, verbose=0)
+        analyzed_score += y_pred  # * prior
+    print(analyzed_score)
+    return analyzed_score[0][0]
+
+
+def retrieve_titer_score(sequence, filename='sequences_shelve.db'):
+    # Open the shelf (acts like a dictionary, stored in a file)
+    with shelve.open(filename) as db:
+        # Check if sequence is already in the shelf
+        if sequence in db:
+            return db[sequence]
+        else:
+            # If not, run the function, store the result, and return it
+            value = calculate_titer_score(sequence, TITER_MODEL)
+            db[sequence] = value
+            return value
+
+
+TITER_acceptable_TISs = ['ATG', 'CTG', 'ACG', 'TTG', 'GTG']
+codon_tis_prior = {'ATG': 3.5287101354987644, 'CTG': 1.746859242328512, 'ACG': 1.3535552403706805,
+                   'TTG': 1.1364995562364615, 'GTG': 1.218573747658257}
+stop_codons = ['TAA', 'TAG', 'TGA']
+TITER_MODEL = build_titer_model()
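For orientation, the TITER scoring window built by `asymmetric_subseq` is 203 nt long: 100 nt of upstream context, the candidate start codon at positions 100–102, and downstream context, with `'$'` padding where sequence runs out. A minimal sketch of the 8-channel encoding, assuming `geney._tis_utils` is importable (note that importing it also builds the 32-model TITER ensemble at module load, which requires the weight files to be present):

```python
from geney._tis_utils import seq_matrix

# Hypothetical 203-nt window: padded upstream context, ATG at positions 100-102.
window = '$' * 100 + 'ATG' + 'A' * 100
tensor = seq_matrix([window])

print(tensor.shape)    # (1, 203, 8)
print(tensor[0, 100])  # 'A' inside the codon  -> channel 4: [0. 0. 0. 0. 1. 0. 0. 0.]
print(tensor[0, 103])  # 'A' outside the codon -> channel 0: [1. 0. 0. 0. 0. 0. 0. 0.]
print(tensor[0, 0])    # '$' padding           -> all zeros
```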
geney/immune_utils.py
CHANGED
geney/pipelines.py
ADDED
@@ -0,0 +1,66 @@
+# Home of the frequently used pipelines that everyone needs
+import pandas as pd
+from datetime import datetime
+
+from .Gene import Gene
+from .utils.mutation_utils import MutationalEvent
+from .SpliceSimulator import SpliceSimulator
+from .Oncosplice import Oncosplice
+from .utils.TranscriptLibrary import TranscriptLibrary
+
+
+def max_splicing_delta(mut_id, transcript_id=None, splicing_engine='spliceai'):
+    m = MutationalEvent(mut_id)
+    assert m.compatible(), 'Mutations in event are incompatible'
+    reference_transcript = Gene.from_file(
+        m.gene).transcript(transcript_id).generate_pre_mrna().generate_mature_mrna().generate_protein()
+    tl = TranscriptLibrary(reference_transcript, m)
+    splicing_results = tl.predict_splicing(m.position, engine=splicing_engine, inplace=True).get_event_columns('event')
+    ss = SpliceSimulator(splicing_results, tl.event, feature='event', max_distance=100_000_000)
+    return ss.max_splicing_delta('event_prob')
+
+
+def oncosplice_pipeline_single_transcript(mut_id, transcript_id=None, splicing_engine='spliceai'):
+    m = MutationalEvent(mut_id)
+    assert m.compatible(), 'Mutations in event are incompatible'
+    reference_transcript = Gene.from_file(
+        m.gene).transcript(transcript_id).generate_pre_mrna().generate_mature_mrna().generate_protein()
+    tl = TranscriptLibrary(reference_transcript, m)
+    splicing_results = tl.predict_splicing(m.position, engine=splicing_engine, inplace=True).get_event_columns('event')
+    ss = SpliceSimulator(splicing_results, tl.event, feature='event', max_distance=100_000_000)
+
+    base_report = pd.Series({'mut_id': mut_id,
+                             'gene': m.gene,
+                             'transcript_id': reference_transcript.transcript_id,
+                             'primary_transcript': reference_transcript.primary_transcript,
+                             'splicing_engine': splicing_engine,
+                             'time_of_execution': datetime.now().strftime("%Y-%m-%d %H:%M:%S")})
+
+    ss_metadata = ss.report(m.positions[0])
+    report = []
+    for variant_transcript, isoform_metadata in ss.get_viable_transcripts(metadata=True):
+        onco = Oncosplice(reference_transcript.protein, variant_transcript.protein, reference_transcript.cons_vector)
+        report.append(pd.concat([base_report, ss_metadata, isoform_metadata, onco.get_analysis_series()]))
+    return pd.DataFrame(report)
+
+
+def oncosplice_pipeline_all_transcripts(mut_id, splicing_engine='spliceai'):
+    m = MutationalEvent(mut_id)
+    assert m.compatible(), 'Mutations in event are incompatible'
+    reports = []
+    for transcript_id in Gene.from_file(m.gene).transcripts.keys():
+        reports.append(oncosplice_pipeline_single_transcript(mut_id, transcript_id, splicing_engine=splicing_engine))
+    return pd.concat(reports, axis=1)
+
+
+def get_tcga_annotations(mut_ids):
+    pass
+
+
+def generate_epitopes():
+    pass
+
+
+
+
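A usage sketch for the single-transcript pipeline. The exact `mut_id` string format is defined by `MutationalEvent` and is not shown in this diff, so the identifier below is only a placeholder:

```python
from geney.pipelines import oncosplice_pipeline_single_transcript

# 'GENE:chr:pos:ref:alt' is a placeholder; see geney.utils.mutation_utils.MutationalEvent
# for the identifier format it actually parses.
report = oncosplice_pipeline_single_transcript(
    'GENE:chr:pos:ref:alt',
    transcript_id=None,          # falls through to the gene's default transcript
    splicing_engine='spliceai',  # passed to TranscriptLibrary.predict_splicing
)
print(report[['mut_id', 'transcript_id', 'splicing_engine']])
```

`oncosplice_pipeline_all_transcripts` simply repeats this per transcript of the gene and concatenates the per-transcript reports.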
geney/power_utils.py
CHANGED
geney/spliceai_utils.py
CHANGED
@@ -6,10 +6,10 @@ import tensorflow as tf
 import sys
 
 # Check if GPU is available
-if tf.config.list_physical_devices('GPU'):
-
-else:
-
+# if tf.config.list_physical_devices('GPU'):
+#     print("Running on GPU.")
+# else:
+#     print("Running on CPU.")
 
 # tf.config.threading.set_intra_op_parallelism_threads(1)
 # tf.config.threading.set_inter_op_parallelism_threads(1)
@@ -18,21 +18,21 @@ else:
 model_filenames = [f"models/spliceai{i}.h5" for i in range(1, 6)]
 
 # Load each model using the package resources.
-sai_models = [load_model(resources.files("spliceai").joinpath(filename))
-
+# sai_models = [load_model(resources.files("spliceai").joinpath(filename))
+#               for filename in model_filenames]
 
-
-
-#
-
-
-
-
-
-
-
+if sys.platform == 'darwin':
+    sai_paths = ('models/spliceai{}.h5'.format(x) for x in range(1, 6))
+    # sai_models = [load_model(resource_filename('spliceai', x)) for x in sai_paths]
+    sai_models = [load_model(resources.files('spliceai').joinpath(f)) for f in sai_paths]
+else:
+    sai_paths = ['/tamir2/nicolaslynn/home/miniconda3/lib/python3.10/site-packages/spliceai/models/spliceai1.h5',
+                 '/tamir2/nicolaslynn/home/miniconda3/lib/python3.10/site-packages/spliceai/models/spliceai2.h5',
+                 '/tamir2/nicolaslynn/home/miniconda3/lib/python3.10/site-packages/spliceai/models/spliceai3.h5',
+                 '/tamir2/nicolaslynn/home/miniconda3/lib/python3.10/site-packages/spliceai/models/spliceai4.h5',
+                 '/tamir2/nicolaslynn/home/miniconda3/lib/python3.10/site-packages/spliceai/models/spliceai5.h5']
 
-
+    sai_models = [load_model(f) for f in sai_paths]
 
 
 def one_hot_encode(seq):
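The `darwin` branch above locates the bundled SpliceAI weights with `importlib.resources` rather than a hard-coded site-packages path. A minimal sketch of that lookup, assuming the `spliceai` package is installed together with its `models/` directory:

```python
from importlib import resources
from tensorflow.keras.models import load_model

# Resolve a packaged weights file without hard-coding the install location.
weights_path = resources.files('spliceai').joinpath('models/spliceai1.h5')
model = load_model(weights_path)
print(model.input_shape)
```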
geney/utils/Fasta_segment.py
ADDED
@@ -0,0 +1,260 @@
+__all__ = ['Fasta_segment']
+
+import os
+import numpy as np
+
+class Fasta_segment():
+    '''
+    Efficient reads of segments starting from a given offset
+    from a FASTA file.
+
+    ================================================================
+    This class can be used ONLY for FASTA files where ALL sequence
+    rows (excluding headers) are of the same length !!
+    ================================================================
+
+    In case of a FASTA with multiple sequences (multiple headers), the header name
+    of the corresponding sequence must be provided. The header name is the string
+    that follows the character ">".
+    For example, the header name of the header:
+
+    ">NC_000011.10 Homo sapiens chromosome 11, GRCh38.p13 Primary Assembly"
+
+    is "NC_000011.10".
+
+    In case of a FASTA with a single sequence (single header), the header name
+    is not required.
+
+    The class reads only the segment into memory and not the whole FASTA sequence.
+    This can be used, for example, to read a segment from a FASTA file
+    that contains a chromosome of the human genome.
+    '''
+    def __init__(self, files_info=None) -> None:
+        '''
+        files_info (optional input) is a dictionary:
+        For a FASTA with multiple headers, the key is a fasta file name and
+        the header name as follows: "<fasta file name>:<header name>".
+        For a FASTA with a single header the key is simply the file name (the header
+        can also be provided, in which case the key will include it as well, however this is not required).
+        The value is a dictionary that contains basic statistics of the fasta file.
+        Keys of files_info[file] are:
+        'init_loc': offset of the first NT in file (0 indicates the first NT)
+        'l_size': number of NTs per line excluding \n,
+        'line_size': number of NTs per line including \n.
+        'file_size': total number of NTs in file, excluding \n (and excluding the header). This is
+                     available only for FASTA files with a single header.
+
+        For example: files_info = {'file1.fas': {'init_loc': 70, 'l_size': 80, 'line_size': 81, 'file_size': 240},
+                                   'file2.fas:NC_000011.10': {'init_loc': 70, 'l_size': 80, 'line_size': 81}}
+        '''
+        self.files_info = files_info if files_info else {}
+
+    def __repr__(self) -> str:
+        return "Fasta_segment([files_info])"
+
+    def __str__(self) -> str:
+        return "Efficient reads of segments from FASTA files.\n"
+
+    def __len__(self) -> int:
+        return len(self.files_info)
+
+    def show_processed_files(self) -> list:
+        return list(self.files_info.keys())
+
+    def show_files_info(self) -> None:
+        print(self.str_files_info())
+
+    def str_files_info(self) -> str:
+        s = ''
+        for k in self.files_info.keys():
+            s += f"{k}:\n"
+            for ki, vi in self.files_info[k].items():
+                s += f"\t{ki}: {vi}\n"
+
+        return s
+
+    def read_segment(self, file: str, offset: int, size: int) -> str:
+        '''
+        Reads a segment from a FASTA with a single sequence (single header).
+        file - FASTA file name
+        offset - 0-based offset of the start of the segment in file (from the first sequence NT).
+                 For example, a value of 1 indicates starting from the second NT in the file.
+        size - number of NTs in a segment to read.
+        '''
+        if not os.path.exists(file):
+            print(f"Error: file {file} does not exist !!")
+            return ""
+
+        with open(file, 'rt') as fp:
+            if file in self.files_info:
+                l_size = self.files_info[file]['l_size']
+                line_size = self.files_info[file]['line_size']
+                init_loc = self.files_info[file]['init_loc']
+            else:
+                init_loc, line_size, l_size = self.__compute_file_stats(fp)
+                self.files_info[file] = {'init_loc': init_loc, 'l_size': l_size, 'line_size': line_size}
+
+            offset_num_lines, offset_in_line = offset // l_size, np.mod(offset, l_size)
+            # accounting for extra characters due to multiple \n that will be discarded
+            add_for_newline = ((offset + size - 1) // l_size) - offset_num_lines
+            fp.seek(init_loc + offset_num_lines * line_size + offset_in_line)
+            return fp.read(size + add_for_newline).replace('\n', '')
+
+    def read_segment_endpoints(self, file: str, start_loc: int, end_loc: int):
+        seq = self.read_segment(file, start_loc - 1, end_loc - start_loc + 1).upper()
+        # indices = list(range(start_loc, end_loc + 1))
+        indices = np.arange(start_loc, end_loc + 1)
+        assert len(seq) == len(
+            indices), f'reference data not compatible; {len(seq)}, {len(indices)}, start: {start_loc}, end: {end_loc}, {file}'
+        return {"nucleotides": seq, "index": indices}
+
+    def __compute_file_stats(self, fp) -> tuple:
+        '''
+        Computes the stats that are needed to read a segment from
+        a FASTA file with a single sequence.
+        '''
+        # account for a header (if exists) in the first line of the file
+        fp.seek(0, os.SEEK_SET)
+        if fp.read(1) == ">":
+            fp.readline()
+            init_loc = fp.tell()
+        else:
+            init_loc = 0
+
+        # init_loc is the location (0-based) of the first NT in file
+        fp.seek(init_loc)
+        fp.readline()  # advance handle to beginning of second line
+        line_size = fp.tell() - init_loc  # number of NTs per line, including the \n
+        return init_loc, line_size, line_size - 1
+
+    def total_num_chars(self, file: str) -> int:
+        '''
+        Returns the total number of characters (NTs) in a FASTA
+        file (excluding the header and \n) without reading the whole file.
+
+        This function supports only FASTA files with a single header.
+        Support for FASTAs with multiple headers is TBD.
+        '''
+        if file in self.files_info:
+            return self.files_info[file]['file_size']
+        else:
+            return self.__compute_total_num_NTs(file)
+
+    def __compute_total_num_NTs(self, file: str) -> int:
+        '''
+        Computes the total number of characters (NTs) in a FASTA
+        file (excluding the header and \n) and updates self.files_info.
+        '''
+        with open(file, 'rt') as fp:
+            init_loc, line_size, l_size = self.__compute_file_stats(fp)
+
+            # check if last character is a newline
+            last_loc = fp.seek(0, os.SEEK_END)
+            fp.seek(fp.tell() - 1, os.SEEK_SET)
+            last_char = fp.read()
+
+            delta = last_loc - init_loc
+            q, r = divmod(delta, line_size)
+            num_chars = delta - q  # remove counts of new lines
+            if q == 1 and r == 0 and (last_char != '\n'):
+                num_chars += 1  # if only one line in file and last character is not newline add one
+
+        self.files_info[file] = {'init_loc': init_loc,
+                                 'l_size': l_size,
+                                 'line_size': line_size,
+                                 'file_size': num_chars}
+
+        return num_chars
+
+    def multiple_headers_read_segment(self, file: str, header_name: str, offset: int, size: int) -> str:
+        '''
+        Reads from a FASTA with multiple sequences (multiple headers).
+        The offset and size here are relative to the sequence with header name header_name.
+
+        For example, for header_name='name1', offset=3, size=4, the returned sequence is of size 4 NTs, starting
+        from offset 3 of the sequence that follows the header with header name header_name.
+        '''
+        init_loc = None
+        token = file + ':' + header_name
+        with open(file, 'rt') as fp:
+            if token in self.files_info:
+                l_size = self.files_info[token]['l_size']
+                line_size = self.files_info[token]['line_size']
+                init_loc = self.files_info[token]['init_loc']
+            else:
+                line = fp.readline()
+                while line:
+                    if line[0] == '>':
+                        # header name
+                        name = line.split()[0][1:]
+                        if name == header_name:
+                            init_loc = fp.tell()  # file offset of the first NT of the corresponding sequence
+                            break
+
+                    line = fp.readline()
+
+                if init_loc is None:
+                    print(f"Did not find header name {header_name} in file {file} !!")
+                    return ''
+                else:
+                    fp.seek(init_loc)
+                    fp.readline()
+                    line_size = fp.tell() - init_loc
+                    l_size = line_size - 1
+                    self.files_info[token] = {'init_loc': init_loc, 'l_size': l_size, 'line_size': line_size}
+
+            offset_num_lines, offset_in_line = offset // l_size, np.mod(offset, l_size)
+            # accounting for extra characters due to multiple \n that will be discarded
+            add_for_newline = ((offset + size - 1) // l_size) - offset_num_lines
+            fp.seek(init_loc + offset_num_lines * line_size + offset_in_line)
+            return fp.read(size + add_for_newline).replace('\n', '')
+
+    def fasta_gen(self, file: str, segment_size: int, init_offset: int = 0, jump_size: int = 0) -> str:
+        '''
+        FASTA generator.
+        Returns an iterator for reading segments of size segment_size NTs, separated by
+        jump_size NTs, starting from an offset (0-based) init_offset.
+
+        To read segments consecutively, set jump_size to 0 (which is the default value).
+        '''
+        try:
+            with open(file, 'rt') as fp:
+                # handling the header
+                fp.seek(0, os.SEEK_SET)
+                if fp.read(1) == ">":
+                    fp.readline()
+                    init_loc = fp.tell()
+                else:
+                    init_loc = 0
+
+                # computing number of characters (including \n) per line
+                fp.seek(init_loc)
+                fp.readline()
+                line_size = fp.tell() - init_loc  # number of NTs per line, including the \n
+                ext_newlines = segment_size // line_size  # number of \n in a segment_size read with offset 0
+                jump_newlines = jump_size // line_size  # same for jump_size
+
+                # starting from init_offset (accounting for \n)
+                fp.seek(init_loc + init_offset + (init_offset // line_size))
+
+                while True:
+                    segment = fp.read(segment_size + ext_newlines).replace('\n', '')
+                    # might need another single read as ext_newlines assumed reading from the beginning of the line
+                    if len(segment) < segment_size:
+                        segment += fp.read(1).replace('\n', '')
+
+                    if segment != '':
+                        yield segment
+                    else:
+                        break  # raise StopIteration
+
+                    # jump to next segment (the jump is excluding \n)
+                    if fp.read(jump_size + jump_newlines).count('\n') > jump_newlines:
+                        fp.seek(fp.tell() + 1)
+
+        except IOError as err:
+            print(f"Error: {file} not found !! ({err})")
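A short usage sketch for `Fasta_segment`, assuming a single-header FASTA (`chr11.fa` is a hypothetical file) whose sequence lines all share the same length, as the class requires:

```python
from geney.utils.Fasta_segment import Fasta_segment

fs = Fasta_segment()

# 0-based offset: read 10 NTs starting at the 101st nucleotide.
seq = fs.read_segment('chr11.fa', offset=100, size=10)

# 1-based inclusive endpoints: returns the sequence plus its coordinate index.
rec = fs.read_segment_endpoints('chr11.fa', 101, 110)
print(rec['nucleotides'], rec['index'])

# Stream the file in consecutive 80-nt segments.
for segment in fs.fasta_gen('chr11.fa', segment_size=80):
    print(segment)
```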