geney 1.2.20-py2.py3-none-any.whl → 1.2.22-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of geney might be problematic; see the registry's advisory for more details.

Files changed (39)
  1. geney/oncosplice.py +1 -1
  2. {geney-1.2.20.dist-info → geney-1.2.22.dist-info}/METADATA +1 -1
  3. geney-1.2.22.dist-info/RECORD +19 -0
  4. geney/Gene.py +0 -258
  5. geney/analyzers/__init__.py +0 -0
  6. geney/analyzers/benchmark_clinvar.py +0 -158
  7. geney/analyzers/characterize_epistasis.py +0 -15
  8. geney/analyzers/compare_sets.py +0 -91
  9. geney/analyzers/group_comparison.py +0 -81
  10. geney/analyzers/survival.py +0 -144
  11. geney/analyzers/tcga_annotations.py +0 -194
  12. geney/analyzers/visualize_protein_conservation.py +0 -398
  13. geney/benchmark_clinvar.py +0 -158
  14. geney/compare_sets.py +0 -91
  15. geney/data_parsers/__init__.py +0 -0
  16. geney/data_parsers/gtex.py +0 -68
  17. geney/gtex.py +0 -68
  18. geney/immunotherapy/__init__.py +0 -0
  19. geney/immunotherapy/netchop.py +0 -78
  20. geney/mutations/__init__.py +0 -0
  21. geney/mutations/variant_utils.py +0 -125
  22. geney/netchop.py +0 -79
  23. geney/oncosplice/__init__.py +0 -0
  24. geney/oncosplice_mouse.py +0 -277
  25. geney/oncosplice_pipeline.py +0 -1588
  26. geney/performance_utils.py +0 -138
  27. geney/pipelines/__init__.py +0 -0
  28. geney/pipelines/dask_utils.py +0 -153
  29. geney/splicing/__init__.py +0 -2
  30. geney/splicing/spliceai_utils.py +0 -253
  31. geney/splicing/splicing_isoform_utils.py +0 -0
  32. geney/splicing/splicing_utils.py +0 -366
  33. geney/survival.py +0 -124
  34. geney/tcga_annotations.py +0 -352
  35. geney/translation_termination/__init__.py +0 -0
  36. geney/translation_termination/tts_utils.py +0 -0
  37. geney-1.2.20.dist-info/RECORD +0 -52
  38. {geney-1.2.20.dist-info → geney-1.2.22.dist-info}/WHEEL +0 -0
  39. {geney-1.2.20.dist-info → geney-1.2.22.dist-info}/top_level.txt +0 -0
geney/performance_utils.py
@@ -1,138 +0,0 @@
- import pandas as pd
- import numpy as np
- from sklearn.metrics import precision_score, recall_score, accuracy_score
- from sklearn.metrics import roc_auc_score, roc_curve
- import matplotlib.pyplot as plt
-
-
- # def plot_performance(true_values, predictions):
- #     clinsig_map = {'Benign': 0, 'Pathogenic': 1}
- #     true_values = [clinsig_map[t] for t in true_values]
- #     predictions = scale_predictions(predictions)
- #
- #     fpr, tpr, thresholds_roc = roc_curve(true_values, predictions)
- #
- #     # Calculate Precision-Recall curve
- #     precision, recall, thresholds_pr = precision_recall_curve(true_values, predictions)
- #
- #     # Plotting ROC curve
- #     plt.figure(figsize=(20, 5))
- #
- #     plt.subplot(1, 4, 1)
- #     plt.plot(fpr, tpr)
- #     plt.title('ROC Curve')
- #     plt.xlabel('False Positive Rate')
- #     plt.ylabel('True Positive Rate')
- #
- #     # Plotting Precision-Recall curve
- #     plt.subplot(1, 4, 2)
- #     plt.plot(recall, precision)
- #     plt.title('Precision-Recall Curve')
- #     plt.xlabel('Recall')
- #     plt.ylabel('Precision')
- #
- #     # Plotting Precision vs. Thresholds
- #     plt.subplot(1, 4, 3)
- #     plt.plot(thresholds_pr, precision[:-1]) # Precision and thresholds have off-by-one lengths
- #     plt.title('Precision vs. Threshold')
- #     plt.xlabel('Threshold')
- #     plt.ylabel('Precision')
- #
- #     # Plotting Sample Percentage Captured vs. Thresholds
- #     plt.subplot(1, 4, 4)
- #     # Assuming 'tpr' or another appropriate metric represents the cumulative percentage
- #     plt.plot(thresholds_roc, tpr) # Update 'tpr' with the correct metric if necessary
- #     plt.title('Cumulative Percentage vs. Threshold')
- #     plt.xlabel('Threshold')
- #     plt.ylabel('Cumulative Percentage of Population')
- #
- #     plt.tight_layout()
- #     plt.show()
- #
- #
- #
- # def plot_auc_curve(y_true, y_pred_proba):
- #     """
- #     Plots the AUC curve.
- #
- #     Args:
- #         y_true (array-like): True labels (0 or 1).
- #         y_pred_proba (array-like): Predicted probabilities for positive class.
- #
- #     Returns:
- #         None
- #     """
- #     fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
- #     auc_value = roc_auc_score(y_true, y_pred_proba)
- #
- #     plt.figure(figsize=(8, 6))
- #     plt.plot(fpr, tpr, label=f"AUC = {auc_value:.2f}")
- #     plt.plot([0, 1], [0, 1], 'k--')
- #     plt.xlabel("False Positive Rate")
- #     plt.ylabel("True Positive Rate")
- #     plt.title("Receiver Operating Characteristic (ROC) Curve")
- #     plt.legend()
- #     plt.show()
- #     return auc_value
- #
- #
- # def optimal_ppv(dataframe, feature_name, plot=False):
- #     """
- #     Calculates the optimal positive predictive value (PPV) for a given feature.
- #
- #     Args:
- #         dataframe (pd.DataFrame): Input dataframe.
- #         feature_name (str): Name of the feature column.
- #
- #     Returns:
- #         float: Optimal PPV.
- #     """
- #     # Assuming 'target' is the binary target column (0 or 1)
- #     threshold_values = pd.qcut(dataframe[feature_name], 100, duplicates='drop')
- #     ppv_values = []
- #
- #     for threshold in threshold_values:
- #         predictions = (dataframe[feature_name] >= threshold).astype(int)
- #         ppv = precision_score(dataframe['target'], predictions)
- #         ppv_values.append(ppv)
- #
- #     optimal_threshold = threshold_values[np.argmax(ppv_values)]
- #     optimal_ppv = max(ppv_values)
- #     if plot:
- #         plt.figure(figsize=(8, 6))
- #         plt.scatter(threshold_values, ppv_values)
- #         plt.xlabel("Threshold")
- #         plt.ylabel("Positive Predictive Value (PPV)")
- #         plt.title("Optimal Positive Predictive Value (PPV)")
- #         plt.show()
- #
- #     return optimal_ppv, optimal_threshold
- #
- #
- # def measure_prediction_quality(prediction_vector, quality_vector):
- #     """
- #     Measure the quality of the predictions using the quality_vector as the characteristic to check.
- #     """
- #     pass
- #
- #
- #
- # def create_ppv_vector(prediction_vector, true_value_vector):
- #     """
- #     Create a vector of positive predictive values (PPV) for the prediction_vector using the true_value_vector as the true values.
- #     """
- #     df = pd.DataFrame({'prediction': prediction_vector, 'true_value': true_value_vector})
- #     df.sort_values('prediction', ascending=True, inplace=True)
- #     df['bin'] = pd.qcut(df['prediction'], 100, labels=False, duplicates=True, retbins=True)
- #     for bin in df.bin.unique():
- #         temp_df = df[df.bin >= bin].
- #
- #
- # def group_retention(predictions, predictor):
- #     # first i need to get the ratio of values that are retained at particular values
- #     predictions.sort_values(predictor, inplace=True)
- #     _, thresholds = pd.qcut(predictions[predictor], 100, duplicates='drop')
- #     tracker = []
- #     for th in thresholds:
- #
- #
File without changes
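For context: the deleted geney/performance_utils.py was almost entirely commented-out scaffolding for ROC and precision-recall diagnostics. A minimal runnable sketch of that idea, using standard scikit-learn and matplotlib calls (the function and variable names below are illustrative, not part of geney):

import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, roc_auc_score, roc_curve

def plot_roc_and_pr(y_true, y_score):
    # ROC curve and AUC for binary labels (0/1) and continuous scores
    fpr, tpr, _ = roc_curve(y_true, y_score)
    auc_value = roc_auc_score(y_true, y_score)

    # Precision-recall curve; thresholds has one fewer element than precision/recall
    precision, recall, _ = precision_recall_curve(y_true, y_score)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
    ax1.plot(fpr, tpr, label=f"AUC = {auc_value:.2f}")
    ax1.plot([0, 1], [0, 1], "k--")
    ax1.set(xlabel="False Positive Rate", ylabel="True Positive Rate", title="ROC Curve")
    ax1.legend()
    ax2.plot(recall, precision)
    ax2.set(xlabel="Recall", ylabel="Precision", title="Precision-Recall Curve")
    plt.tight_layout()
    plt.show()
    return auc_value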
geney/pipelines/dask_utils.py
@@ -1,153 +0,0 @@
- from dask_jobqueue import PBSCluster
- from dask.distributed import Client, wait
- import os
- from tqdm import tqdm
- from pathlib import Path
- from geney.oncosplice import oncosplice
- from geney import config_setup
- from geney.utils import contains, available_genes
- import warnings
- import gc
- import pandas as pd
- import argparse
-
- tqdm.pandas()
- warnings.filterwarnings('ignore')
-
-
- def launch_dask_cluster(memory_size="3GB", num_workers=10, queue="tamirQ",
-                         walltime="24:00:00", dashboard_address=":23154",
-                         log_directory="dask-logs"):
-     """
-     Launch a Dask cluster using PBS.
-
-     Parameters:
-         memory_size (str): Memory for each worker.
-         num_workers (int): Number of workers to scale to.
-         queue (str): Queue name for PBS.
-         walltime (str): Walltime for PBS.
-         dashboard_address (str): Address for the Dask dashboard.
-         log_directory (str): Directory for Dask logs.
-
-     Returns:
-         tuple: A tuple containing the Dask client and cluster objects.
-     """
-     try:
-         dask_cluster = PBSCluster(
-             cores=1,
-             memory=memory_size,
-             processes=1,
-             queue=queue,
-             walltime=walltime,
-             scheduler_options={"dashboard_address": dashboard_address},
-             log_directory=log_directory,
-             job_script_prologue=[f"cd {config_setup['BASE']}"]
-         )
-         dask_cluster.scale(num_workers)
-         dask_client = Client(dask_cluster)
-         return dask_client, dask_cluster
-     except Exception as e:
-         print(f"An error occurred: {e}")
-         return None, None
-
-
- def main_single(mut_id):
-     try:
-         res = oncosplice(mut_id, sai_threshold=0.5).dropna(axis=1)
-     except Exception as e:
-         print(f"An error occurred: {e}")
-         res = None
-     return res
-
-
- def process_and_save_tasks(tasks, dask_client, save_loc=None, num_workers=10, save_increment=20, file_index=0):
-     """
-     Process a list of tasks using Dask, saving the results incrementally.
-     Parameters:
-         tasks (list): List of tasks to be processed.
-         save_loc (str): Location to save results.
-         dask_client (Client): Dask client for task submission.
-         num_workers (int): Number of workers to use.
-         save_increment (int): Number of iterations after which to save results.
-         file_index (int): Starting index for output files.
-     Returns:
-         None
-     """
-     def save_results(results, index):
-         if results:
-             df = pd.concat(results)
-             df.to_csv(os.path.join(save_loc, f'results_{index}.csv'))
-             return []
-         return results
-
-     futures, all_results = [], []
-     for i, task in tqdm(enumerate(tasks), total=len(tasks)):
-         futures.append(dask_client.submit(main_single, task))
-         if (i + 1) % num_workers == 0:
-             wait(futures)
-             all_results.extend([f.result() for f in futures if f.status == 'finished' and f.result() is not None])
-             futures = []
-
-             if (i + 1) % (save_increment * num_workers) == 0:
-                 all_results = save_results(all_results, file_index)
-                 file_index += 1
-                 gc.collect()
-     wait(futures)
-     all_results.extend([f.result() for f in futures if f.status == 'finished' and f.result() is not None])
-     save_results(all_results, file_index)
-
-
- def restart_checkpoint(result_dir):
-     """
-     Reloads processed results from CSV files, extracting unique mutation IDs and the highest checkpoint.
-
-     Parameters:
-         result_dir (str): Directory containing result CSV files.
-
-     Returns:
-         list: List of unique mutation IDs processed.
-         int: The highest checkpoint value from the files.
-     """
-     result_path = Path(result_dir)
-     files = sorted(result_path.glob('*'), key=lambda x: int(x.stem.split('_')[-1]), reverse=True)
-
-     if not files:
-         return [], 0
-
-     try:
-         data = []
-         latest_file = files[0]
-         for file in files:
-             data.append(pd.read_csv(file))
-         processed_muts = pd.concat(data).mut_id.unique().tolist()
-         highest_checkpoint = int(latest_file.stem.split('_')[-1])
-         return processed_muts, highest_checkpoint
-
-     except Exception as e:
-         print(f"Error processing file {files}: {e}")
-         return [], 0
-
-
- if __name__ == '__main__':
-     parser = argparse.ArgumentParser(description='Run oncosplice with dask.')
-     parser.add_argument('--input_file', '-i', required=True, help='input text file')
-     parser.add_argument('--results_directory', '-r', required=False, help='result directory', default=config_setup['ONCOSPLICE'])
-     parser.add_argument('--num_workers', '-n', type=int, required=False, help='number of dask workers to recruit', default=10)
-     parser.add_argument('--worker_size', '-m', type=str, required=False, help='dask worker memory allocation', default="3GB")
-     args = parser.parse_args()
-
-     client, cluster = launch_dask_cluster(memory_size=args.worker_size, num_workers=args.num_workers)
-     muts = open(args.input_file, 'r').read().splitlines()
-     processed_mutations, last_count = restart_checkpoint(args.results_directory)
-     processed_mutations = sorted(list(set(processed_mutations)))
-     muts = [m for m in tqdm(muts) if not contains(processed_mutations, m)]
-     valid_genes = available_genes()
-     muts = [m for m in muts if contains(valid_genes, m.split(':')[0])]
-     print(f"Valid mutations: {len(muts)}")
-     process_and_save_tasks(tasks=muts,
-                            save_loc=args.results_directory,
-                            dask_client=client,
-                            file_index=last_count + 1,
-                            num_workers=args.num_workers)
-     print("Done.")
-
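For context: the deleted geney/pipelines/dask_utils.py wrapped a PBS-backed Dask cluster around the oncosplice pipeline, submitting mutation IDs in worker-sized batches and checkpointing results to numbered CSV files. The same submit/wait batching pattern can be sketched on a local cluster; everything below (the LocalCluster sizing and the toy work function) is illustrative and not part of geney:

from dask.distributed import Client, LocalCluster, wait

def work(task_id):
    # Stand-in for main_single(); any per-task computation that returns a result
    return task_id ** 2

if __name__ == "__main__":
    cluster = LocalCluster(n_workers=4, threads_per_worker=1)
    client = Client(cluster)

    tasks, batch, results = list(range(100)), [], []
    for i, task in enumerate(tasks):
        batch.append(client.submit(work, task))
        if (i + 1) % 4 == 0:  # drain one batch per pool of workers
            wait(batch)
            results.extend(f.result() for f in batch if f.status == "finished")
            batch = []
    wait(batch)
    results.extend(f.result() for f in batch if f.status == "finished")
    client.close()
    cluster.close()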
geney/splicing/__init__.py
@@ -1,2 +0,0 @@
- from .splicing_utils import *
- from geney import config_setup
geney/splicing/spliceai_utils.py
@@ -1,253 +0,0 @@
- from geney.utils import reverse_complement, find_files_by_gene_name, unload_json, dump_json, unload_pickle
- from geney.Fasta_segment import Fasta_segment
- from geney.mutations.variant_utils import generate_mut_variant
- from geney import config_setup
-
- '''
- SpliceAI util functions.
- '''
- import numpy as np
- import tensorflow as tf
- from keras.models import load_model
- from pkg_resources import resource_filename
- from spliceai.utils import one_hot_encode
-
- tf.config.threading.set_intra_op_parallelism_threads(1)
- tf.config.threading.set_inter_op_parallelism_threads(1)
-
- sai_paths = ('models/spliceai{}.h5'.format(x) for x in range(1, 6))
- sai_models = [load_model(resource_filename('spliceai', x)) for x in sai_paths]
-
- def sai_predict_probs(seq: str, models: list) -> list:
-     '''
-     Predicts the donor and acceptor junction probability of each
-     NT in seq using SpliceAI.
-
-     Let m := 2*sai_mrg_context + L be the input seq length. It is assumed
-     that the input seq has the following structure:
-
-         seq = |<sai_mrg_context NTs><L NTs><sai_mrg_context NTs>|
-
-     The returned probability matrix is of size 2xL, where
-     the first row is the acceptor probability and the second row
-     is the donor probability. These probabilities correspond to the
-     middle <L NTs> of the input seq.
-     '''
-     x = one_hot_encode(seq)[None, :]
-     y = np.mean([models[m].predict(x) for m in range(5)], axis=0)
-     return y[0,:,1:].T
-
-
- def get_actual_sai_seq(seq: str, sai_mrg_context: int=5000) -> str:
-     '''
-     This function assumes that the input seq has the following structure:
-
-         seq = |<sai_mrg_context NTs><L NTs><sai_mrg_context NTs>|.
-
-     Then, the function returns the sequence: |<L NTs>|
-     '''
-     return seq[sai_mrg_context:-sai_mrg_context]
-
-
- ############################################################################################
- ############################################################################################
- ############# BEGIN CUSTOM SAI USE CASES ###################################################
- ############################################################################################
- ############################################################################################
-
-
- def find_ss_changes(ref_dct, mut_dct, known_splice_sites, threshold=0.5):
-     '''
-     :param ref_dct: the spliceai probabilities for each nucleotide (by genomic position) as a dictionary for the reference sequence
-     :param mut_dct: the spliceai probabilities for each nucleotide (by genomic position) as a dictionary for the mutated sequence
-     :param known_splice_sites: the indices (by genomic position) that serve as known splice sites
-     :param threshold: the threshold for detection (difference between reference and mutated probabilities)
-     :return: two dictionaries; discovered_pos is a dictionary containing all the positions that meet the threshold for discovery
-              and deleted_pos containing all the positions that meet the threshold for missing and the condition for missing
-     '''
-
-     new_dict = {v: mut_dct.get(v, 0) - ref_dct.get(v, 0) for v in
-                 list(set(list(ref_dct.keys()) + list(mut_dct.keys())))}
-
-     discovered_pos = {k: {'delta': round(float(v), 3), 'absolute': round(float(mut_dct[k]), 3)} for k, v in
-                       new_dict.items() if (k not in known_splice_sites and v >= threshold) or (v > 0.45)}
-
-     deleted_pos = {k: {'delta': round(float(v), 3), 'absolute': round(float(mut_dct.get(k, 0)), 3)} for k, v in
-                    new_dict.items() if k in known_splice_sites and v <= -threshold}
-
-
-     return discovered_pos, deleted_pos
-
-
- def run_spliceai(mutations, gene_data, sai_mrg_context=5000, min_coverage=2500, sai_threshold=0.5):
-     positions = mutations.positions #[m.start for m in mutations]
-     seq_start_pos = min(positions) - sai_mrg_context - min_coverage
-     seq_end_pos = max(positions) + sai_mrg_context + min_coverage # + 1
-
-     # ref_seq, ref_indices = pull_fasta_seq_endpoints(mutations.chrom, seq_start_pos, seq_end_pos)
-     fasta_obj = Fasta_segment()
-     ref_seq, ref_indices = fasta_obj.read_segment_endpoints(config_setup['CHROM_SOURCE'] / f'chr{mutations.chrom}.fasta',
-                                                             seq_start_pos,
-                                                             seq_end_pos)
-
-
-     # gene_data = unload_pickle(
-     #     find_files_by_gene_name(gene_name=mutations.gene))
-     gene_start, gene_end, rev = gene_data.gene_start, gene_data.gene_end, gene_data.rev
-
-     mrna_acceptors = sorted(list(set([lst for lsts in
-                                       [mrna.get('acceptors', []) for mrna in gene_data.transcripts.values() if
-                                        mrna['transcript_biotype'] == 'protein_coding'] for lst in lsts])))
-     mrna_donors = sorted(list(set([lst for lsts in
-                                    [mrna.get('donors', []) for mrna in gene_data.transcripts.values() if
-                                     mrna['transcript_biotype'] == 'protein_coding'] for lst in lsts])))
-
-     visible_donors = np.intersect1d(mrna_donors, ref_indices)
-     visible_acceptors = np.intersect1d(mrna_acceptors, ref_indices)
-
-     start_pad = ref_indices.index(gene_start) if gene_start in ref_indices else 0
-     end_cutoff = ref_indices.index(gene_end) if gene_end in ref_indices else len(ref_indices) # - 1
-     end_pad = len(ref_indices) - end_cutoff
-     ref_seq = 'N' * start_pad + ref_seq[start_pad:end_cutoff] + 'N' * end_pad
-     ref_indices = [-1] * start_pad + ref_indices[start_pad:end_cutoff] + [-1] * end_pad
-     mut_seq, mut_indices = ref_seq, ref_indices
-
-     for mut in mutations:
-         mut_seq, mut_indices, _, _ = generate_mut_variant(seq=mut_seq, indices=mut_indices, mut=mut)
-
-     ref_indices = ref_indices[sai_mrg_context:-sai_mrg_context]
-     mut_indices = mut_indices[sai_mrg_context:-sai_mrg_context]
-
-     if rev:
-         ref_seq = reverse_complement(ref_seq)
-         mut_seq = reverse_complement(mut_seq)
-         ref_indices = ref_indices[::-1]
-         mut_indices = mut_indices[::-1]
-
-     ref_seq_probs_temp = sai_predict_probs(ref_seq, sai_models)
-     mut_seq_probs_temp = sai_predict_probs(mut_seq, sai_models)
-
-     ref_seq_acceptor_probs, ref_seq_donor_probs = ref_seq_probs_temp[0, :], ref_seq_probs_temp[1, :]
-     mut_seq_acceptor_probs, mut_seq_donor_probs = mut_seq_probs_temp[0, :], mut_seq_probs_temp[1, :]
-
-     assert len(ref_indices) == len(ref_seq_acceptor_probs), 'Reference pos not the same'
-     assert len(mut_indices) == len(mut_seq_acceptor_probs), 'Mut pos not the same'
-
-     iap, dap = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_acceptor_probs))},
-                                {p: v for p, v in list(zip(mut_indices, mut_seq_acceptor_probs))},
-                                visible_acceptors,
-                                threshold=sai_threshold)
-
-     assert len(ref_indices) == len(ref_seq_donor_probs), 'Reference pos not the same'
-     assert len(mut_indices) == len(mut_seq_donor_probs), 'Mut pos not the same'
-
-     idp, ddp = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_donor_probs))},
-                                {p: v for p, v in list(zip(mut_indices, mut_seq_donor_probs))},
-                                visible_donors,
-                                threshold=sai_threshold)
-
-     missplicing = {'missed_acceptors': dap, 'missed_donors': ddp, 'discovered_acceptors': iap, 'discovered_donors': idp}
-     missplicing = {outk: {float(k): v for k, v in outv.items()} for outk, outv in missplicing.items()}
-     return {outk: {int(k) if k.is_integer() else k: v for k, v in outv.items()} for outk, outv in missplicing.items()}
-
-
-
- class PredictSpliceAI:
-     def __init__(self, mutation, gene_data, threshold=0.5, force=False, sai_mrg_context=5000, min_coverage=2500):
-         self.modification = mutation
-         self.threshold = threshold
-
-         # if '|' in mutation.mut_id:
-         self.spliceai_db = config_setup['MISSPLICING_PATH'] / f'spliceai_epistatic'
-         # else:
-         # self.spliceai_db = config_setup['MISSPLICING_PATH'] / f'spliceai_individual'
-
-         self.missplicing = {}
-
-         if self.prediction_file_exists() and not force:
-             self.missplicing = self.load_sai_predictions()
-
-         else:
-             self.missplicing = run_spliceai(self.modification, gene_data=gene_data, sai_mrg_context=sai_mrg_context, min_coverage=min_coverage, sai_threshold=0.1)
-             self.save_sai_predictions()
-
-     def __repr__(self):
-         return f'Missplicing({self.modification.mut_id}) --> {self.missplicing}'
-
-     def __str__(self):
-         return self.aberrant_splicing
-     def __bool__(self):
-         for event, details in self.aberrant_splicing.items():
-             if details:
-                 return True
-         return False
-
-     def __eq__(self, alt_splicing):
-         flag, _ = check_splicing_difference(self.missplicing, alt_splicing, self.threshold)
-         return not flag
-
-     @property
-     def aberrant_splicing(self):
-         return self.apply_sai_threshold(self.missplicing, self.threshold)
-
-     @property
-     def prediction_file(self):
-         return self.spliceai_db / self.modification.gene / self.modification.file_identifier_json
-
-     def prediction_file_exists(self):
-         return self.prediction_file.exists()
-
-     def load_sai_predictions(self):
-         missplicing = unload_json(self.prediction_file)
-         missplicing = {outk: {float(k): v for k, v in outv.items()} for outk, outv in missplicing.items()}
-         missplicing = {outk: {int(k) if k.is_integer() or 'missed' in outk else k: v for k, v in outv.items()} for
-                        outk, outv in
-                        missplicing.items()}
-         return missplicing
-
-     def save_sai_predictions(self):
-         self.prediction_file.parent.mkdir(parents=True, exist_ok=True)
-         dump_json(self.prediction_file, self.missplicing)
-     def apply_sai_threshold(self, splicing_dict=None, threshold=None):
-         splicing_dict = self.missplicing if not splicing_dict else splicing_dict
-         threshold = self.threshold if not threshold else threshold
-         new_dict = {}
-         for event, details in splicing_dict.items():
-             for e, d in details.items():
-                 if abs(d['delta']) >= threshold:
-                     return splicing_dict
-             new_dict[event] = {} #{k: v for k, v in details.items() if abs(v['delta']) >= threshold}
-         return new_dict
-
-     def get_max_missplicing_delta(self):
-         max_delta = 0
-         for event, details in self.missplicing.items():
-             for e, d in details.items():
-                 if abs(d['delta']) > max_delta:
-                     max_delta = abs(d['delta'])
-         return max_delta
-
- def check_splicing_difference(missplicing1, missplicing2, threshold=None):
-     flag = False
-     true_differences = {}
-     for event in ['missed_acceptors', 'missed_donors']:
-         td = {}
-         dct1 = missplicing1[event]
-         dct2 = missplicing2[event]
-         for k in list(set(list(dct1.keys()) + list(dct2.keys()))):
-             diff = abs(dct1.get(k, {'delta': 0})['delta']) - abs(dct2.get(k, {'delta': 0})['delta'])
-             if abs(diff) >= threshold:
-                 flag = True
-                 td[k] = diff
-         true_differences[event] = td
-     for event in ['discovered_acceptors', 'discovered_donors']:
-         td = {}
-         dct1 = missplicing1[event]
-         dct2 = missplicing2[event]
-         for k in list(set(list(dct1.keys()) + list(dct2.keys()))):
-             diff = abs(dct1.get(k, {'delta': 0})['delta']) - abs(dct2.get(k, {'delta': 0})['delta'])
-             if abs(diff) >= threshold:
-                 flag = True
-                 td[k] = diff
-         true_differences[event] = td
-     return flag, true_differences
File without changes
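For context: in the deleted geney/splicing/spliceai_utils.py, find_ss_changes is the core comparison step. It subtracts the reference per-position splice-site probability from the mutant one, then reports gains at non-annotated positions (delta at or above the threshold, or above 0.45) and losses at annotated sites (delta at or below the negative threshold). A simplified toy illustration of that arithmetic and the resulting output shape, using made-up positions and probabilities rather than real SpliceAI output:

# Hypothetical per-position acceptor probabilities keyed by genomic coordinate.
ref = {1000: 0.02, 1050: 0.91, 1100: 0.03}
mut = {1000: 0.75, 1050: 0.10, 1100: 0.04}
known_sites = [1050]
threshold = 0.5

# Delta = mutant probability minus reference probability, rounded as in the package.
deltas = {pos: round(mut.get(pos, 0) - ref.get(pos, 0), 3) for pos in set(ref) | set(mut)}
discovered = {p: d for p, d in deltas.items() if p not in known_sites and d >= threshold}
deleted = {p: d for p, d in deltas.items() if p in known_sites and d <= -threshold}

print(discovered)  # {1000: 0.73}  -> a newly gained acceptor site
print(deleted)     # {1050: -0.81} -> an annotated acceptor that is lost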