geney-1.2.20-py2.py3-none-any.whl → geney-1.2.22-py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of geney might be problematic.
- geney/oncosplice.py +1 -1
- {geney-1.2.20.dist-info → geney-1.2.22.dist-info}/METADATA +1 -1
- geney-1.2.22.dist-info/RECORD +19 -0
- geney/Gene.py +0 -258
- geney/analyzers/__init__.py +0 -0
- geney/analyzers/benchmark_clinvar.py +0 -158
- geney/analyzers/characterize_epistasis.py +0 -15
- geney/analyzers/compare_sets.py +0 -91
- geney/analyzers/group_comparison.py +0 -81
- geney/analyzers/survival.py +0 -144
- geney/analyzers/tcga_annotations.py +0 -194
- geney/analyzers/visualize_protein_conservation.py +0 -398
- geney/benchmark_clinvar.py +0 -158
- geney/compare_sets.py +0 -91
- geney/data_parsers/__init__.py +0 -0
- geney/data_parsers/gtex.py +0 -68
- geney/gtex.py +0 -68
- geney/immunotherapy/__init__.py +0 -0
- geney/immunotherapy/netchop.py +0 -78
- geney/mutations/__init__.py +0 -0
- geney/mutations/variant_utils.py +0 -125
- geney/netchop.py +0 -79
- geney/oncosplice/__init__.py +0 -0
- geney/oncosplice_mouse.py +0 -277
- geney/oncosplice_pipeline.py +0 -1588
- geney/performance_utils.py +0 -138
- geney/pipelines/__init__.py +0 -0
- geney/pipelines/dask_utils.py +0 -153
- geney/splicing/__init__.py +0 -2
- geney/splicing/spliceai_utils.py +0 -253
- geney/splicing/splicing_isoform_utils.py +0 -0
- geney/splicing/splicing_utils.py +0 -366
- geney/survival.py +0 -124
- geney/tcga_annotations.py +0 -352
- geney/translation_termination/__init__.py +0 -0
- geney/translation_termination/tts_utils.py +0 -0
- geney-1.2.20.dist-info/RECORD +0 -52
- {geney-1.2.20.dist-info → geney-1.2.22.dist-info}/WHEEL +0 -0
- {geney-1.2.20.dist-info → geney-1.2.22.dist-info}/top_level.txt +0 -0
geney/data_parsers/gtex.py
DELETED
@@ -1,68 +0,0 @@
-import pandas as pd
-from tqdm import tqdm
-
-# Set pandas display options (if necessary)
-pd.options.display.max_rows = 999
-
-# Read metadata
-metadata = pd.read_csv('GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt', delimiter='\t')
-metadata_tissue_mapper = metadata[['SAMPID', 'SMTS']].drop_duplicates().set_index('SAMPID').to_dict()['SMTS']
-
-# Initialize an empty DataFrame for combined results
-combined_df = pd.DataFrame()
-
-# Define chunk size
-tpm_mean = []
-# Process the main data file in chunks
-for chunk in tqdm(pd.read_csv('GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct', header=2, chunksize=1000,
-                              delimiter='\t')):
-    # Perform the same operations on the chunk
-    chunk = chunk.set_index(['transcript_id', 'gene_id']).rename(columns=metadata_tissue_mapper)
-    # Append the processed chunk to the combined DataFrame
-    tpm_mean.append(chunk.T.groupby(by=chunk.columns).mean().T)
-
-# Compute the mean TPM per tissue
-tpm_mean = pd.concat(tpm_mean)
-
-
-cancer_projects = {
-    "Adrenal Gland": "ACC",
-    "Bladder": "BLCA",
-    "Brain": ["GBM", "LGG"],  # Note: Brain maps to two projects
-    "Breast": "BRCA",
-    "Colon": "COAD",
-    "Esophagus": "ESCA",
-    "Kidney": ["KICH", "KIRC", "KIRP"],  # Note: Kidney maps to three projects
-    "Liver": "LIHC",
-    "Lung": "LUNG",
-    "Ovary": "OV",
-    "Pancreas": "PAAD",
-    "Prostate": "PRAD",
-    "Skin": "SKCM",
-    "Stomach": "STAD",
-    "Testis": "TGCT",
-    "Uterus": "UCS"
-}
-
-tissue_projects = {
-    "ACC": "Adrenal Gland",
-    "BLCA": "Bladder",
-    "GBM": "Brain",
-    "LGG": "Brain",
-    "BRCA": "Breast",
-    "COAD": "Colon",
-    "ESCA": "Esophagus",
-    "KICH": "Kidney",
-    "KIRC": "Kidney",
-    "KIRP": "Kidney",
-    "LIHC": "Liver",
-    "LUNG": "Lung",
-    "OV": "Ovary",
-    "PAAD": "Pancreas",
-    "PRAD": "Prostate",
-    "SKCM": "Skin",
-    "STAD": "Stomach",
-    "TGCT": "Testis",
-    "UCS": "Uterus"
-}
-
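The two lookup dictionaries above pair GTEx tissue names with TCGA project codes (an identical copy of this script, geney/gtex.py, is removed below). As a purely illustrative sketch, not taken from the package, this is how such a mapping could be applied to the per-tissue mean-TPM table the script builds; the DataFrame contents and the two-entry mapping subset are made up:

# Hypothetical usage sketch: relabel per-tissue mean-TPM columns with TCGA
# project codes, the way the deleted cancer_projects dictionary appears
# intended to be used. All values below are fabricated for illustration.
import pandas as pd

tpm_mean = pd.DataFrame(
    {"Breast": [10.2, 0.4], "Liver": [3.1, 7.8]},
    index=["ENST00000001", "ENST00000002"],
)

# Scalar subset of the deleted mapping; tissues such as Brain and Kidney map
# to several projects in the deleted dictionary and would need to be expanded
# before a plain column rename like this.
cancer_projects = {"Breast": "BRCA", "Liver": "LIHC"}

tpm_by_project = tpm_mean.rename(columns=cancer_projects)
print(list(tpm_by_project.columns))  # ['BRCA', 'LIHC']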
geney/gtex.py
DELETED
@@ -1,68 +0,0 @@
-import pandas as pd
-from tqdm import tqdm
-
-# Set pandas display options (if necessary)
-pd.options.display.max_rows = 999
-
-# Read metadata
-metadata = pd.read_csv('GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt', delimiter='\t')
-metadata_tissue_mapper = metadata[['SAMPID', 'SMTS']].drop_duplicates().set_index('SAMPID').to_dict()['SMTS']
-
-# Initialize an empty DataFrame for combined results
-combined_df = pd.DataFrame()
-
-# Define chunk size
-tpm_mean = []
-# Process the main data file in chunks
-for chunk in tqdm(pd.read_csv('GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct', header=2, chunksize=1000,
-                              delimiter='\t')):
-    # Perform the same operations on the chunk
-    chunk = chunk.set_index(['transcript_id', 'gene_id']).rename(columns=metadata_tissue_mapper)
-    # Append the processed chunk to the combined DataFrame
-    tpm_mean.append(chunk.T.groupby(by=chunk.columns).mean().T)
-
-# Compute the mean TPM per tissue
-tpm_mean = pd.concat(tpm_mean)
-
-
-cancer_projects = {
-    "Adrenal Gland": "ACC",
-    "Bladder": "BLCA",
-    "Brain": ["GBM", "LGG"],  # Note: Brain maps to two projects
-    "Breast": "BRCA",
-    "Colon": "COAD",
-    "Esophagus": "ESCA",
-    "Kidney": ["KICH", "KIRC", "KIRP"],  # Note: Kidney maps to three projects
-    "Liver": "LIHC",
-    "Lung": "LUNG",
-    "Ovary": "OV",
-    "Pancreas": "PAAD",
-    "Prostate": "PRAD",
-    "Skin": "SKCM",
-    "Stomach": "STAD",
-    "Testis": "TGCT",
-    "Uterus": "UCS"
-}
-
-tissue_projects = {
-    "ACC": "Adrenal Gland",
-    "BLCA": "Bladder",
-    "GBM": "Brain",
-    "LGG": "Brain",
-    "BRCA": "Breast",
-    "COAD": "Colon",
-    "ESCA": "Esophagus",
-    "KICH": "Kidney",
-    "KIRC": "Kidney",
-    "KIRP": "Kidney",
-    "LIHC": "Liver",
-    "LUNG": "Lung",
-    "OV": "Ovary",
-    "PAAD": "Pancreas",
-    "PRAD": "Prostate",
-    "SKCM": "Skin",
-    "STAD": "Stomach",
-    "TGCT": "Testis",
-    "UCS": "Uterus"
-}
-
geney/immunotherapy/__init__.py
DELETED
File without changes
geney/immunotherapy/netchop.py
DELETED
@@ -1,78 +0,0 @@
-
-import subprocess
-import logging
-import tempfile
-
-
-class NetChop(object):
-    """
-    Wrapper around netChop tool. Assumes netChop is in your PATH.
-    """
-
-    def predict_epitopes(self, sequences min_len=8):
-        """
-        Return netChop predictions for each position in each sequence.
-
-        Parameters
-        -----------
-        sequences : list of string
-            Amino acid sequences to predict cleavage for
-
-        Returns
-        -----------
-        list of list of float
-
-        The i'th list corresponds to the i'th sequence. Each list gives
-        the cleavage probability for each position in the sequence.
-        """
-        with tempfile.NamedTemporaryFile(suffix=".fsa", mode="w") as input_fd:
-            for (i, sequence) in enumerate(sequences):
-                input_fd.write("> %d\n" % i)
-                input_fd.write(sequence)
-                input_fd.write("\n")
-            input_fd.flush()
-            try:
-                output = subprocess.check_output(["netChop", input_fd.name])
-            except subprocess.CalledProcessError as e:
-                logging.error("Error calling netChop: %s:\n%s" % (e, e.output))
-                raise
-
-        parsed = self.parse_netchop(output)
-        assert len(parsed) == len(sequences), \
-            "Expected %d results but got %d" % (
-                len(sequences), len(parsed))
-        assert [len(x) for x in parsed] == [len(x) for x in sequences]
-        filtered_proteosomes = []
-        for scores, seq in list(zip(parsed, sequences)):
-            proteosome = self.chop_protein(seq, [s > threshold for s in scores])
-            filtered_proteosomes.append([e for e in proteosome if len(e) > min_len])
-        return proteosomes
-
-    @staticmethod
-    def parse_netchop(netchop_output):
-        """
-        Parse netChop stdout.
-        """
-        line_iterator = iter(netchop_output.decode().split("\n"))
-        scores = []
-        for line in line_iterator:
-            if "pos" in line and 'AA' in line and 'score' in line:
-                scores.append([])
-                if "----" not in next(line_iterator):
-                    raise ValueError("Dashes expected")
-                line = next(line_iterator)
-                while '-------' not in line:
-                    score = float(line.split()[3])
-                    scores[-1].append(score)
-                    line = next(line_iterator)
-        return scores
-
-    def chop_protein(self, seq, pos):
-        # Generate subsequences using list comprehension and slicing
-        start = 0
-        subsequences = [seq[start:(start := i+1)] for i, marker in enumerate(pos) if marker == 1]
-        # Check if the last part needs to be added
-        if start < len(seq):
-            subsequences.append(seq[start:])
-        return subsequences
-
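This removed copy appears unfinished: the predict_epitopes signature drops a comma and a threshold parameter, and the method returns the undefined name proteosomes; the geney/netchop.py version further down corrects these. The slicing trick in chop_protein is easier to follow in isolation; a minimal standalone sketch with hypothetical inputs:

# Standalone sketch of the chop_protein logic from the removed class
# (inputs are made up): cut a sequence after every position whose cleavage
# flag is set, using an assignment expression to carry the running start
# index through the comprehension.
def chop_protein(seq, pos):
    start = 0
    # Each flagged position i closes a fragment seq[start:i+1] and advances start.
    subsequences = [seq[start:(start := i + 1)] for i, marker in enumerate(pos) if marker == 1]
    if start < len(seq):  # keep any trailing, uncleaved tail
        subsequences.append(seq[start:])
    return subsequences

print(chop_protein("MKTAYIAK", [0, 0, 1, 0, 0, 1, 0, 0]))
# ['MKT', 'AYI', 'AK']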
geney/mutations/__init__.py
DELETED
File without changes
geney/mutations/variant_utils.py
DELETED
@@ -1,125 +0,0 @@
-
-from pathlib import Path
-END_CODONS = ['TAA', 'TAG', 'TGA']
-
-def is_monotonic(A):
-    x, y = [], []
-    x.extend(A)
-    y.extend(A)
-    x.sort()
-    y.sort(reverse=True)
-    if(x == A or y == A):
-        return True
-    return False
-
-
-class Mutation:
-    def __init__(self, mid):
-        self.mut_id = mid
-
-        gene, chrom, pos, ref, alt = mid.split(':')
-        self.gene = gene
-        self.chrom = chrom.strip('chr')
-        self.start = int(pos)
-
-        self.file_identifier = self.mut_id.replace(':', '_')
-        self.file_identifier_short = f'{self.start}_{ref}_{alt}'
-
-        self.ref = ref if ref != '-' else ''
-        self.alt = alt if alt != '-' else ''
-
-        if len(self.ref) == len(self.alt) == 1:
-            self.vartype = 'SNP'
-        elif len(self.ref) == len(self.alt) > 1:
-            self.vartype = 'SUB'
-        elif self.ref and not self.alt:
-            self.vartype = 'DEL'
-        elif self.alt and not self.ref:
-            self.vartype = 'INS'
-        else:
-            self.vartype = 'INDEL'
-
-    def __str__(self):
-        return self.mut_id
-
-    def __repr__(self):
-        return self.mut_id
-
-    def __lt__(self, other):
-        return self.start < other.start
-
-
-class Variations:
-    def __init__(self, epistatic_set):
-        self.variants = sorted([Mutation(m) for m in epistatic_set.split('|')])
-        self.mut_id = epistatic_set
-        self.start = self.variants[0].start
-        self.positions = [v.start for v in self.variants]
-        # self.ref = ','.join([m.ref for m in self.variants])
-        # self.alt = ','.join([m.alt for m in self.variants])
-        self.gene = self.variants[0].gene
-        self.chrom = self.variants[0].chrom.strip('chr')
-        self.file_identifier = f'{self.gene}_{self.chrom}' + '_' + '_'.join([v.file_identifier_short for v in self.variants])
-
-    def __str__(self):
-        return '|'.join([m.mut_id for m in self.variants])
-
-    def __repr__(self):
-        return '|'.join([m.mut_id for m in self.variants])
-
-    def __iter__(self):
-        self.current_index = 0
-        return self
-
-    def __next__(self):
-        if self.current_index < len(self.variants):
-            x = self.variants[self.current_index]
-            self.current_index += 1
-            return x
-        raise StopIteration
-
-    @property
-    def file_identifier_json(self):
-        return Path(self.file_identifier + '.json')
-
-
-def generate_mut_variant(seq: str, indices: list, mut: Mutation):
-    offset = 1 if not mut.ref else 0
-
-    check_indices = list(range(mut.start, mut.start + len(mut.ref) + offset))
-    check1 = all([m in indices for m in check_indices])
-    if not check1:
-        print(f"Mutation {mut} not within transcript bounds: {min(indices)} - {max(indices)}.")
-        return seq, indices, False, False
-
-    rel_start, rel_end = indices.index(mut.start)+offset, indices.index(mut.start)+offset+len(mut.ref)
-    acquired_seq = seq[rel_start:rel_end]
-    check2 = acquired_seq == mut.ref
-    if not check2:
-        print(f'Reference allele does not match genome_build allele. {acquired_seq}, {mut.ref}, {mut.start}')
-        consensus_allele = False
-    else:
-        consensus_allele = True
-    if len(mut.ref) == len(mut.alt) > 0:
-        temp_indices = list(range(mut.start, mut.start + len(mut.ref)))
-    else:
-        temp_indices = [indices[indices.index(mut.start)] + v / 1000 for v in list(range(1, len(mut.alt)+1))]
-
-
-    new_indices = indices[:rel_start] + temp_indices + indices[rel_end:]
-    new_seq = seq[:rel_start] + mut.alt + seq[rel_end:]
-
-    assert len(new_seq) == len(new_indices), f'Error in variant modification: {mut}, {len(new_seq)}, {len(new_indices)}'
-    assert is_monotonic(list(filter((-1).__ne__, new_indices))), f'Mut indices are not monotonic.'
-    return new_seq, new_indices, True, consensus_allele
-
-
-def find_new_tts(seq, indices, tis):
-    seq, indices = seq[indices.index(tis):], indices[indices.index(tis):]
-    pos_options = [i for i in list(range(0, len(seq), 3)) if seq[i:i + 3] in END_CODONS and i+3 <= len(seq)]
-    if len(pos_options) == 0:
-        return indices[0] #[len(seq) - (len(seq) % 3) - 1]
-    pos_options = pos_options[0]
-    assert pos_options % 3 == 0, f'{pos_options} not divisible by three.'
-    pos_options -= 1
-    return indices[pos_options]
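The deleted Mutation class parses identifiers of the form gene:chrom:pos:ref:alt (with '-' standing for an empty allele), and Variations joins several such identifiers with '|'. A small illustration of that parsing and of the variant-type classification, using a hypothetical identifier:

# Illustration of the mutation-ID format the removed Mutation class parses;
# the example identifier is hypothetical.
mid = "KRAS:chr12:25245350:C:T"

gene, chrom, pos, ref, alt = mid.split(':')
chrom = chrom.strip('chr')          # as in the removed code: trims 'c', 'h', 'r' from the ends
start = int(pos)
ref = ref if ref != '-' else ''
alt = alt if alt != '-' else ''

# Variant-type branches mirror the removed __init__.
if len(ref) == len(alt) == 1:
    vartype = 'SNP'
elif len(ref) == len(alt) > 1:
    vartype = 'SUB'
elif ref and not alt:
    vartype = 'DEL'
elif alt and not ref:
    vartype = 'INS'
else:
    vartype = 'INDEL'

print(gene, chrom, start, vartype)  # KRAS 12 25245350 SNP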
geney/netchop.py
DELETED
@@ -1,79 +0,0 @@
-
-import subprocess
-import logging
-import tempfile
-from geney import config_setup
-
-class NetChop(object):
-    """
-    Wrapper around netChop tool. Assumes netChop is in your PATH.
-    """
-    def predict_epitopes(self, sequences, threshold, min_len=8):
-        """
-        Return netChop predictions for each position in each sequence.
-
-        Parameters
-        -----------
-        sequences : list of string
-            Amino acid sequences to predict cleavage for
-
-        Returns
-        -----------
-        list of list of float
-
-        The i'th list corresponds to the i'th sequence. Each list gives
-        the cleavage probability for each position in the sequence.
-        """
-        with tempfile.NamedTemporaryFile(dir=config_setup['NETCHOP'], suffix=".fsa", mode="w") as input_fd:
-            # temp_file = config_setup['NETCHOP'] / 'netchop_input.fsa'
-            # with open(temp_file, 'w') as input_fd:
-            for (i, sequence) in enumerate(sequences):
-                _ = input_fd.write("> %d\n" % i)
-                _ = input_fd.write(sequence)
-                _ = input_fd.write("\n")
-            # input_fd.flush()
-            # print(str(temp_file))
-
-            try:
-                output = subprocess.check_output(["netchop", str(input_fd.name)])
-            except subprocess.CalledProcessError as e:
-                logging.error("Error calling netChop: %s:\n%s" % (e, e.output))
-                raise
-
-        parsed = self.parse_netchop(output)
-        assert len(parsed) == len(sequences), \
-            "Expected %d results but got %d" % (
-                len(sequences), len(parsed))
-        assert [len(x) for x in parsed] == [len(x) for x in sequences]
-        filtered_proteosomes = []
-        for scores, seq in list(zip(parsed, sequences)):
-            proteosome = self.chop_protein(seq, [s > threshold for s in scores])
-            filtered_proteosomes.append([e for e in proteosome if len(e) > min_len])
-        return filtered_proteosomes
-    @staticmethod
-    def parse_netchop(netchop_output):
-        """
-        Parse netChop stdout.
-        """
-        line_iterator = iter(netchop_output.decode().split("\n"))
-        scores = []
-        for line in line_iterator:
-            if "pos" in line and 'AA' in line and 'score' in line:
-                scores.append([])
-                if "----" not in next(line_iterator):
-                    raise ValueError("Dashes expected")
-                line = next(line_iterator)
-                while '-------' not in line:
-                    score = float(line.split()[3])
-                    scores[-1].append(score)
-                    line = next(line_iterator)
-        return scores
-    def chop_protein(self, seq, pos):
-        # Generate subsequences using list comprehension and slicing
-        start = 0
-        subsequences = [seq[start:(start := i+1)] for i, marker in enumerate(pos) if marker == 1]
-        # Check if the last part needs to be added
-        if start < len(seq):
-            subsequences.append(seq[start:])
-        return subsequences
-
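Assuming the removed NetChop class above is available (for example, pasted into an interpreter), its parse_netchop helper can be exercised without the external netchop binary. The text below is fabricated to match the table layout the parser expects (a header containing pos/AA/score, a dashed rule, one row per residue with the score in the fourth whitespace-separated column, and a closing dashed rule); it is not real netChop output:

# Runnable sketch of the parse_netchop logic on synthetic text.
fake_output = b"""
 pos  AA  C  score  Ident
--------------------------
   1   M  .  0.1000 seq0
   2   K  S  0.9000 seq0
--------------------------
"""

print(NetChop.parse_netchop(fake_output))  # [[0.1, 0.9]]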
geney/oncosplice/__init__.py
DELETED
File without changes