geney 1.2.20__py2.py3-none-any.whl → 1.2.21__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of geney might be problematic.

Files changed (38)
  1. {geney-1.2.20.dist-info → geney-1.2.21.dist-info}/METADATA +1 -1
  2. geney-1.2.21.dist-info/RECORD +19 -0
  3. geney/Gene.py +0 -258
  4. geney/analyzers/__init__.py +0 -0
  5. geney/analyzers/benchmark_clinvar.py +0 -158
  6. geney/analyzers/characterize_epistasis.py +0 -15
  7. geney/analyzers/compare_sets.py +0 -91
  8. geney/analyzers/group_comparison.py +0 -81
  9. geney/analyzers/survival.py +0 -144
  10. geney/analyzers/tcga_annotations.py +0 -194
  11. geney/analyzers/visualize_protein_conservation.py +0 -398
  12. geney/benchmark_clinvar.py +0 -158
  13. geney/compare_sets.py +0 -91
  14. geney/data_parsers/__init__.py +0 -0
  15. geney/data_parsers/gtex.py +0 -68
  16. geney/gtex.py +0 -68
  17. geney/immunotherapy/__init__.py +0 -0
  18. geney/immunotherapy/netchop.py +0 -78
  19. geney/mutations/__init__.py +0 -0
  20. geney/mutations/variant_utils.py +0 -125
  21. geney/netchop.py +0 -79
  22. geney/oncosplice/__init__.py +0 -0
  23. geney/oncosplice_mouse.py +0 -277
  24. geney/oncosplice_pipeline.py +0 -1588
  25. geney/performance_utils.py +0 -138
  26. geney/pipelines/__init__.py +0 -0
  27. geney/pipelines/dask_utils.py +0 -153
  28. geney/splicing/__init__.py +0 -2
  29. geney/splicing/spliceai_utils.py +0 -253
  30. geney/splicing/splicing_isoform_utils.py +0 -0
  31. geney/splicing/splicing_utils.py +0 -366
  32. geney/survival.py +0 -124
  33. geney/tcga_annotations.py +0 -352
  34. geney/translation_termination/__init__.py +0 -0
  35. geney/translation_termination/tts_utils.py +0 -0
  36. geney-1.2.20.dist-info/RECORD +0 -52
  37. {geney-1.2.20.dist-info → geney-1.2.21.dist-info}/WHEEL +0 -0
  38. {geney-1.2.20.dist-info → geney-1.2.21.dist-info}/top_level.txt +0 -0
geney/data_parsers/gtex.py DELETED
@@ -1,68 +0,0 @@
- import pandas as pd
- from tqdm import tqdm
-
- # Set pandas display options (if necessary)
- pd.options.display.max_rows = 999
-
- # Read the sample metadata and map each sample ID (SAMPID) to its tissue (SMTS)
- metadata = pd.read_csv('GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt', delimiter='\t')
- metadata_tissue_mapper = metadata[['SAMPID', 'SMTS']].drop_duplicates().set_index('SAMPID').to_dict()['SMTS']
-
- # Initialize an empty DataFrame for combined results
- combined_df = pd.DataFrame()
-
- # Collect the per-tissue mean computed for each chunk
- tpm_mean = []
- # Process the main data file in chunks
- for chunk in tqdm(pd.read_csv('GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct', header=2, chunksize=1000,
-                               delimiter='\t')):
-     # Rename sample columns to their tissue of origin
-     chunk = chunk.set_index(['transcript_id', 'gene_id']).rename(columns=metadata_tissue_mapper)
-     # Average columns that now share a tissue label and collect the result
-     tpm_mean.append(chunk.T.groupby(by=chunk.columns).mean().T)
-
- # Compute the mean TPM per tissue
- tpm_mean = pd.concat(tpm_mean)
-
-
- cancer_projects = {
-     "Adrenal Gland": "ACC",
-     "Bladder": "BLCA",
-     "Brain": ["GBM", "LGG"],  # Note: Brain maps to two projects
-     "Breast": "BRCA",
-     "Colon": "COAD",
-     "Esophagus": "ESCA",
-     "Kidney": ["KICH", "KIRC", "KIRP"],  # Note: Kidney maps to three projects
-     "Liver": "LIHC",
-     "Lung": "LUNG",
-     "Ovary": "OV",
-     "Pancreas": "PAAD",
-     "Prostate": "PRAD",
-     "Skin": "SKCM",
-     "Stomach": "STAD",
-     "Testis": "TGCT",
-     "Uterus": "UCS"
- }
-
- tissue_projects = {
-     "ACC": "Adrenal Gland",
-     "BLCA": "Bladder",
-     "GBM": "Brain",
-     "LGG": "Brain",
-     "BRCA": "Breast",
-     "COAD": "Colon",
-     "ESCA": "Esophagus",
-     "KICH": "Kidney",
-     "KIRC": "Kidney",
-     "KIRP": "Kidney",
-     "LIHC": "Liver",
-     "LUNG": "Lung",
-     "OV": "Ovary",
-     "PAAD": "Pancreas",
-     "PRAD": "Prostate",
-     "SKCM": "Skin",
-     "STAD": "Stomach",
-     "TGCT": "Testis",
-     "UCS": "Uterus"
- }
-
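Note: the chunked loop above averages TPM per tissue by renaming sample columns to their tissue of origin and then grouping columns that share a label. A minimal self-contained sketch of that column-groupby pattern, on toy data rather than the GTEx files:

import pandas as pd

# Toy expression matrix: samples S1/S2 come from Liver, S3 from Brain.
df = pd.DataFrame({'S1': [1.0, 4.0], 'S2': [3.0, 8.0], 'S3': [5.0, 2.0]},
                  index=['tx1', 'tx2'])
tissue_of_sample = {'S1': 'Liver', 'S2': 'Liver', 'S3': 'Brain'}

# Rename sample columns to tissue labels, then transpose, group the
# (now duplicated) labels, average, and transpose back -- the same
# df.T.groupby(by=df.columns).mean().T pattern used per chunk above.
df = df.rename(columns=tissue_of_sample)
tissue_means = df.T.groupby(by=df.columns).mean().T
print(tissue_means)  # one 'Brain' column and one 'Liver' column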
geney/gtex.py DELETED
@@ -1,68 +0,0 @@
- import pandas as pd
- from tqdm import tqdm
-
- # Set pandas display options (if necessary)
- pd.options.display.max_rows = 999
-
- # Read the sample metadata and map each sample ID (SAMPID) to its tissue (SMTS)
- metadata = pd.read_csv('GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt', delimiter='\t')
- metadata_tissue_mapper = metadata[['SAMPID', 'SMTS']].drop_duplicates().set_index('SAMPID').to_dict()['SMTS']
-
- # Initialize an empty DataFrame for combined results
- combined_df = pd.DataFrame()
-
- # Collect the per-tissue mean computed for each chunk
- tpm_mean = []
- # Process the main data file in chunks
- for chunk in tqdm(pd.read_csv('GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct', header=2, chunksize=1000,
-                               delimiter='\t')):
-     # Rename sample columns to their tissue of origin
-     chunk = chunk.set_index(['transcript_id', 'gene_id']).rename(columns=metadata_tissue_mapper)
-     # Average columns that now share a tissue label and collect the result
-     tpm_mean.append(chunk.T.groupby(by=chunk.columns).mean().T)
-
- # Compute the mean TPM per tissue
- tpm_mean = pd.concat(tpm_mean)
-
-
- cancer_projects = {
-     "Adrenal Gland": "ACC",
-     "Bladder": "BLCA",
-     "Brain": ["GBM", "LGG"],  # Note: Brain maps to two projects
-     "Breast": "BRCA",
-     "Colon": "COAD",
-     "Esophagus": "ESCA",
-     "Kidney": ["KICH", "KIRC", "KIRP"],  # Note: Kidney maps to three projects
-     "Liver": "LIHC",
-     "Lung": "LUNG",
-     "Ovary": "OV",
-     "Pancreas": "PAAD",
-     "Prostate": "PRAD",
-     "Skin": "SKCM",
-     "Stomach": "STAD",
-     "Testis": "TGCT",
-     "Uterus": "UCS"
- }
-
- tissue_projects = {
-     "ACC": "Adrenal Gland",
-     "BLCA": "Bladder",
-     "GBM": "Brain",
-     "LGG": "Brain",
-     "BRCA": "Breast",
-     "COAD": "Colon",
-     "ESCA": "Esophagus",
-     "KICH": "Kidney",
-     "KIRC": "Kidney",
-     "KIRP": "Kidney",
-     "LIHC": "Liver",
-     "LUNG": "Lung",
-     "OV": "Ovary",
-     "PAAD": "Pancreas",
-     "PRAD": "Prostate",
-     "SKCM": "Skin",
-     "STAD": "Stomach",
-     "TGCT": "Testis",
-     "UCS": "Uterus"
- }
-
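Note that cancer_projects is a one-to-many mapping: Brain and Kidney map to lists of TCGA project codes while every other tissue maps to a single string, so callers have to handle both shapes. A small hypothetical helper (not part of the package) that normalizes lookups to a list:

def projects_for_tissue(tissue, mapping=None):
    """Return the TCGA project codes for a tissue, always as a list."""
    # Hypothetical convenience wrapper over the cancer_projects dict above.
    mapping = mapping if mapping is not None else cancer_projects
    value = mapping.get(tissue, [])
    return value if isinstance(value, list) else [value]

assert projects_for_tissue('Breast') == ['BRCA']
assert projects_for_tissue('Kidney') == ['KICH', 'KIRC', 'KIRP']
assert projects_for_tissue('Unknown') == []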
geney/immunotherapy/__init__.py DELETED
File without changes
geney/immunotherapy/netchop.py DELETED
@@ -1,78 +0,0 @@
-
- import subprocess
- import logging
- import tempfile
-
-
- class NetChop(object):
-     """
-     Wrapper around the netChop tool. Assumes netChop is on your PATH.
-     """
-
-     def predict_epitopes(self, sequences, threshold, min_len=8):
-         """
-         Run netChop on each sequence and return the resulting fragments.
-
-         Parameters
-         ----------
-         sequences : list of string
-             Amino acid sequences to predict cleavage for
-         threshold : float
-             Cleavage-probability cutoff used to split each sequence
-         min_len : int
-             Keep only fragments longer than this
-
-         Returns
-         -------
-         list of list of string
-             The i'th list holds the fragments of the i'th sequence that
-             survive the length filter.
-         """
-         with tempfile.NamedTemporaryFile(suffix=".fsa", mode="w") as input_fd:
-             for (i, sequence) in enumerate(sequences):
-                 input_fd.write("> %d\n" % i)
-                 input_fd.write(sequence)
-                 input_fd.write("\n")
-             input_fd.flush()
-             try:
-                 output = subprocess.check_output(["netChop", input_fd.name])
-             except subprocess.CalledProcessError as e:
-                 logging.error("Error calling netChop: %s:\n%s" % (e, e.output))
-                 raise
-
-         parsed = self.parse_netchop(output)
-         assert len(parsed) == len(sequences), \
-             "Expected %d results but got %d" % (
-                 len(sequences), len(parsed))
-         assert [len(x) for x in parsed] == [len(x) for x in sequences]
-         filtered_proteosomes = []
-         for scores, seq in zip(parsed, sequences):
-             proteosome = self.chop_protein(seq, [s > threshold for s in scores])
-             filtered_proteosomes.append([e for e in proteosome if len(e) > min_len])
-         return filtered_proteosomes
-
-     @staticmethod
-     def parse_netchop(netchop_output):
-         """
-         Parse netChop stdout.
-         """
-         line_iterator = iter(netchop_output.decode().split("\n"))
-         scores = []
-         for line in line_iterator:
-             if "pos" in line and 'AA' in line and 'score' in line:
-                 scores.append([])
-                 if "----" not in next(line_iterator):
-                     raise ValueError("Dashes expected")
-                 line = next(line_iterator)
-                 while '-------' not in line:
-                     score = float(line.split()[3])
-                     scores[-1].append(score)
-                     line = next(line_iterator)
-         return scores
-
-     def chop_protein(self, seq, pos):
-         # Split after every position whose cleavage marker is set, carrying
-         # the slice start forward with an assignment expression
-         start = 0
-         subsequences = [seq[start:(start := i + 1)] for i, marker in enumerate(pos) if marker == 1]
-         # Append the trailing fragment if the last cut is not at the end
-         if start < len(seq):
-             subsequences.append(seq[start:])
-         return subsequences
-
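A minimal usage sketch for the class above, assuming the netChop binary is installed and on PATH; the sequence and threshold values here are illustrative, not taken from the package:

chopper = NetChop()
fragments = chopper.predict_epitopes(
    ['MTEYKLVVVGAGGVGKSALTIQLIQNHF'],  # toy amino-acid sequence
    threshold=0.5,                     # cleavage-probability cutoff
    min_len=8,                         # drop fragments of 8 residues or fewer
)
# fragments[0] lists the pieces of the first sequence left after cutting
# at every position scored above the threshold and length-filtering.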
geney/mutations/__init__.py DELETED
File without changes
geney/mutations/variant_utils.py DELETED
@@ -1,125 +0,0 @@
-
- from pathlib import Path
- END_CODONS = ['TAA', 'TAG', 'TGA']
-
- def is_monotonic(A):
-     # True if A is sorted in either ascending or descending order
-     x, y = [], []
-     x.extend(A)
-     y.extend(A)
-     x.sort()
-     y.sort(reverse=True)
-     if x == A or y == A:
-         return True
-     return False
-
-
- class Mutation:
-     def __init__(self, mid):
-         self.mut_id = mid
-
-         gene, chrom, pos, ref, alt = mid.split(':')
-         self.gene = gene
-         self.chrom = chrom.strip('chr')
-         self.start = int(pos)
-
-         self.file_identifier = self.mut_id.replace(':', '_')
-         self.file_identifier_short = f'{self.start}_{ref}_{alt}'
-
-         self.ref = ref if ref != '-' else ''
-         self.alt = alt if alt != '-' else ''
-
-         if len(self.ref) == len(self.alt) == 1:
-             self.vartype = 'SNP'
-         elif len(self.ref) == len(self.alt) > 1:
-             self.vartype = 'SUB'
-         elif self.ref and not self.alt:
-             self.vartype = 'DEL'
-         elif self.alt and not self.ref:
-             self.vartype = 'INS'
-         else:
-             self.vartype = 'INDEL'
-
-     def __str__(self):
-         return self.mut_id
-
-     def __repr__(self):
-         return self.mut_id
-
-     def __lt__(self, other):
-         return self.start < other.start
-
-
- class Variations:
-     def __init__(self, epistatic_set):
-         self.variants = sorted([Mutation(m) for m in epistatic_set.split('|')])
-         self.mut_id = epistatic_set
-         self.start = self.variants[0].start
-         self.positions = [v.start for v in self.variants]
-         # self.ref = ','.join([m.ref for m in self.variants])
-         # self.alt = ','.join([m.alt for m in self.variants])
-         self.gene = self.variants[0].gene
-         self.chrom = self.variants[0].chrom.strip('chr')
-         self.file_identifier = f'{self.gene}_{self.chrom}' + '_' + '_'.join([v.file_identifier_short for v in self.variants])
-
-     def __str__(self):
-         return '|'.join([m.mut_id for m in self.variants])
-
-     def __repr__(self):
-         return '|'.join([m.mut_id for m in self.variants])
-
-     def __iter__(self):
-         self.current_index = 0
-         return self
-
-     def __next__(self):
-         if self.current_index < len(self.variants):
-             x = self.variants[self.current_index]
-             self.current_index += 1
-             return x
-         raise StopIteration
-
-     @property
-     def file_identifier_json(self):
-         return Path(self.file_identifier + '.json')
-
-
- def generate_mut_variant(seq: str, indices: list, mut: Mutation):
-     # Insertions occupy no reference bases, so shift the splice point by one
-     offset = 1 if not mut.ref else 0
-
-     check_indices = list(range(mut.start, mut.start + len(mut.ref) + offset))
-     check1 = all([m in indices for m in check_indices])
-     if not check1:
-         print(f"Mutation {mut} not within transcript bounds: {min(indices)} - {max(indices)}.")
-         return seq, indices, False, False
-
-     rel_start, rel_end = indices.index(mut.start) + offset, indices.index(mut.start) + offset + len(mut.ref)
-     acquired_seq = seq[rel_start:rel_end]
-     check2 = acquired_seq == mut.ref
-     if not check2:
-         print(f'Reference allele does not match genome_build allele. {acquired_seq}, {mut.ref}, {mut.start}')
-         consensus_allele = False
-     else:
-         consensus_allele = True
-
-     if len(mut.ref) == len(mut.alt) > 0:
-         temp_indices = list(range(mut.start, mut.start + len(mut.ref)))
-     else:
-         # Inserted bases get fractional indices between reference positions
-         temp_indices = [indices[indices.index(mut.start)] + v / 1000 for v in list(range(1, len(mut.alt) + 1))]
-
-     new_indices = indices[:rel_start] + temp_indices + indices[rel_end:]
-     new_seq = seq[:rel_start] + mut.alt + seq[rel_end:]
-
-     assert len(new_seq) == len(new_indices), f'Error in variant modification: {mut}, {len(new_seq)}, {len(new_indices)}'
-     assert is_monotonic(list(filter((-1).__ne__, new_indices))), 'Mut indices are not monotonic.'
-     return new_seq, new_indices, True, consensus_allele
-
-
- def find_new_tts(seq, indices, tis):
-     # Scan downstream of the translation initiation site in codon steps
-     seq, indices = seq[indices.index(tis):], indices[indices.index(tis):]
-     pos_options = [i for i in list(range(0, len(seq), 3)) if seq[i:i + 3] in END_CODONS and i + 3 <= len(seq)]
-     if len(pos_options) == 0:
-         return indices[0]  # [len(seq) - (len(seq) % 3) - 1]
-     pos_options = pos_options[0]
-     assert pos_options % 3 == 0, f'{pos_options} not divisible by three.'
-     pos_options -= 1
-     return indices[pos_options]
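For reference, Mutation expects IDs of the form gene:chrom:pos:ref:alt, with '-' standing for an empty allele, and Variations joins several such IDs with '|'. A short sketch of the parsing behavior (the variant IDs are made up for illustration):

snv = Mutation('KRAS:chr12:25245350:C:T')
print(snv.vartype)   # 'SNP'
print(snv.chrom)     # '12'

ins = Mutation('KRAS:chr12:25245351:-:AA')
print(ins.vartype)   # 'INS'

pair = Variations('KRAS:chr12:25245400:A:G|KRAS:chr12:25245350:C:T')
print(pair.positions)             # [25245350, 25245400] -- variants are sorted by start
print(pair.file_identifier_json)  # KRAS_12_25245350_C_T_25245400_A_G.json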
geney/netchop.py DELETED
@@ -1,79 +0,0 @@
-
- import subprocess
- import logging
- import tempfile
- from geney import config_setup
-
- class NetChop(object):
-     """
-     Wrapper around the netChop tool. Assumes netchop is on your PATH.
-     """
-
-     def predict_epitopes(self, sequences, threshold, min_len=8):
-         """
-         Run netChop on each sequence and return the resulting fragments.
-
-         Parameters
-         ----------
-         sequences : list of string
-             Amino acid sequences to predict cleavage for
-         threshold : float
-             Cleavage-probability cutoff used to split each sequence
-         min_len : int
-             Keep only fragments longer than this
-
-         Returns
-         -------
-         list of list of string
-             The i'th list holds the fragments of the i'th sequence that
-             survive the length filter.
-         """
-         with tempfile.NamedTemporaryFile(dir=config_setup['NETCHOP'], suffix=".fsa", mode="w") as input_fd:
-             # temp_file = config_setup['NETCHOP'] / 'netchop_input.fsa'
-             # with open(temp_file, 'w') as input_fd:
-             for (i, sequence) in enumerate(sequences):
-                 _ = input_fd.write("> %d\n" % i)
-                 _ = input_fd.write(sequence)
-                 _ = input_fd.write("\n")
-             input_fd.flush()  # flush buffered writes so netchop sees the full file
-             # print(str(temp_file))
-
-             try:
-                 output = subprocess.check_output(["netchop", str(input_fd.name)])
-             except subprocess.CalledProcessError as e:
-                 logging.error("Error calling netChop: %s:\n%s" % (e, e.output))
-                 raise
-
-         parsed = self.parse_netchop(output)
-         assert len(parsed) == len(sequences), \
-             "Expected %d results but got %d" % (
-                 len(sequences), len(parsed))
-         assert [len(x) for x in parsed] == [len(x) for x in sequences]
-         filtered_proteosomes = []
-         for scores, seq in zip(parsed, sequences):
-             proteosome = self.chop_protein(seq, [s > threshold for s in scores])
-             filtered_proteosomes.append([e for e in proteosome if len(e) > min_len])
-         return filtered_proteosomes
-
-     @staticmethod
-     def parse_netchop(netchop_output):
-         """
-         Parse netChop stdout.
-         """
-         line_iterator = iter(netchop_output.decode().split("\n"))
-         scores = []
-         for line in line_iterator:
-             if "pos" in line and 'AA' in line and 'score' in line:
-                 scores.append([])
-                 if "----" not in next(line_iterator):
-                     raise ValueError("Dashes expected")
-                 line = next(line_iterator)
-                 while '-------' not in line:
-                     score = float(line.split()[3])
-                     scores[-1].append(score)
-                     line = next(line_iterator)
-         return scores
-
-     def chop_protein(self, seq, pos):
-         # Split after every position whose cleavage marker is set, carrying
-         # the slice start forward with an assignment expression
-         start = 0
-         subsequences = [seq[start:(start := i + 1)] for i, marker in enumerate(pos) if marker == 1]
-         # Append the trailing fragment if the last cut is not at the end
-         if start < len(seq):
-             subsequences.append(seq[start:])
-         return subsequences
-
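The chop_protein helper is pure string slicing and can be exercised without the netChop binary (assuming the module imports cleanly): it cuts after every position whose marker is set. A tiny worked example with toy markers:

nc = NetChop()
# Markers request cuts after positions 2 and 5 (0-based).
print(nc.chop_protein('ABCDEFG', [0, 0, 1, 0, 0, 1, 0]))
# -> ['ABC', 'DEF', 'G']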
geney/oncosplice/__init__.py DELETED
File without changes