geney 1.1.3__py2.py3-none-any.whl → 1.1.4__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of geney might be problematic. Click here for more details.
- geney/data_setup.py +7 -1
- geney/gtex_utils.py +68 -0
- geney/netchop.py +3 -2
- geney/performance_utils.py +138 -0
- geney/survival_utils.py +124 -0
- geney/tcga_utils.py +34 -25
- {geney-1.1.3.dist-info → geney-1.1.4.dist-info}/METADATA +1 -1
- {geney-1.1.3.dist-info → geney-1.1.4.dist-info}/RECORD +10 -7
- {geney-1.1.3.dist-info → geney-1.1.4.dist-info}/WHEEL +0 -0
- {geney-1.1.3.dist-info → geney-1.1.4.dist-info}/top_level.txt +0 -0
geney/data_setup.py
CHANGED
|
@@ -218,7 +218,8 @@ def main():
|
|
|
218
218
|
'MRNA_PATH': os.path.join(args.basepath, 'annotations'),
|
|
219
219
|
'MISSPLICING_PATH': os.path.join(args.basepath, 'missplicing'),
|
|
220
220
|
'ONCOSPLICE_PATH': os.path.join(args.basepath, 'oncosplice'),
|
|
221
|
-
'BASE': args.basepath
|
|
221
|
+
'BASE': args.basepath,
|
|
222
|
+
'NETCHOP': os.path.join(args.basepath, 'netchop')
|
|
222
223
|
}
|
|
223
224
|
dump_json(config_file, config_paths)
|
|
224
225
|
|
|
@@ -242,6 +243,11 @@ def main():
|
|
|
242
243
|
fasta_build_path.mkdir()
|
|
243
244
|
split_fasta(fasta_file, fasta_build_path)
|
|
244
245
|
|
|
246
|
+
clinvar_url = 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz'
|
|
247
|
+
clinvar_file = download_and_ungzip(clinvar_url, base_path)
|
|
248
|
+
# clinvar_build_path = base_path / f'accessory_data'
|
|
249
|
+
# clinvar_build_path.mkdir()
|
|
250
|
+
|
|
245
251
|
ensembl_url = 'https://ftp.ensembl.org/pub/release-111/gtf/homo_sapiens/Homo_sapiens.GRCh38.111.gtf.gz'
|
|
246
252
|
ensembl_file = download_and_ungzip(ensembl_url, base_path)
|
|
247
253
|
ensembl_annotation_path = base_path / f'annotations'
|
geney/gtex_utils.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from tqdm import tqdm
|
|
3
|
+
|
|
4
|
+
# Raise the pandas row-display limit for interactive inspection of the frames
# built below.
pd.options.display.max_rows = 999

# Inputs are opened by bare file name, i.e. resolved against the current
# working directory at import time.
GTEX_SAMPLE_ATTRIBUTES_FILE = 'GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt'
GTEX_TRANSCRIPT_TPM_FILE = 'GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct'

# Number of rows of the TPM table read per chunk.
TPM_CHUNK_ROWS = 1000

# Sample metadata: build a {SAMPID: SMTS} lookup.
# NOTE(review): SMTS is presumably the tissue label of each sample -- confirm
# against the GTEx sample-attribute dictionary.
metadata = pd.read_csv(GTEX_SAMPLE_ATTRIBUTES_FILE, delimiter='\t')
metadata_tissue_mapper = metadata[['SAMPID', 'SMTS']].drop_duplicates().set_index('SAMPID').to_dict()['SMTS']

# Never filled or read below; kept only so the module still exposes the name.
combined_df = pd.DataFrame()

# Per-chunk frames of mean values, one column per distinct mapped label.
tpm_mean = []
for chunk in tqdm(pd.read_csv(GTEX_TRANSCRIPT_TPM_FILE, header=2, chunksize=TPM_CHUNK_ROWS,
                              delimiter='\t')):
    # Index rows by (transcript_id, gene_id) and relabel every sample column
    # through the lookup, so that several columns end up sharing one label.
    chunk = chunk.set_index(['transcript_id', 'gene_id']).rename(columns=metadata_tissue_mapper)
    # Average the columns that share a label: transpose, group the rows by
    # label, take the mean, transpose back.
    tpm_mean.append(chunk.T.groupby(by=chunk.columns).mean().T)

# Stack the per-chunk results into a single frame (rows = transcripts).
tpm_mean = pd.concat(tpm_mean)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# Tissue label -> cancer project code(s). A value is either a single code or
# a list of codes when one tissue maps to several projects.
# NOTE(review): "LUNG" does not look like a standard TCGA project code (the
# lung projects are usually LUAD / LUSC) -- confirm it matches the project
# names used in the clinical data before relying on it.
cancer_projects = {
    "Adrenal Gland": "ACC",
    "Bladder": "BLCA",
    "Brain": ["GBM", "LGG"],  # Note: Brain maps to two projects
    "Breast": "BRCA",
    "Colon": "COAD",
    "Esophagus": "ESCA",
    "Kidney": ["KICH", "KIRC", "KIRP"],  # Note: Kidney maps to three projects
    "Liver": "LIHC",
    "Lung": "LUNG",
    "Ovary": "OV",
    "Pancreas": "PAAD",
    "Prostate": "PRAD",
    "Skin": "SKCM",
    "Stomach": "STAD",
    "Testis": "TGCT",
    "Uterus": "UCS",
}


def _invert_projects(mapping):
    """Return the project-code -> tissue inverse of a tissue -> project(s) mapping."""
    inverse = {}
    for tissue, projects in mapping.items():
        # A bare string is a single project; anything else is a list of them.
        codes = [projects] if isinstance(projects, str) else projects
        for code in codes:
            inverse[code] = tissue
    return inverse


# Cancer project code -> tissue label. Derived from ``cancer_projects`` so the
# two tables cannot drift apart; contents and key order match the previous
# hand-written table.
tissue_projects = _invert_projects(cancer_projects)
|
|
68
|
+
|
geney/netchop.py
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
import subprocess
|
|
3
3
|
import logging
|
|
4
4
|
import tempfile
|
|
5
|
-
|
|
5
|
+
from geney import config_setup
|
|
6
6
|
|
|
7
7
|
class NetChop(object):
|
|
8
8
|
"""
|
|
@@ -25,12 +25,13 @@ class NetChop(object):
|
|
|
25
25
|
The i'th list corresponds to the i'th sequence. Each list gives
|
|
26
26
|
the cleavage probability for each position in the sequence.
|
|
27
27
|
"""
|
|
28
|
-
with tempfile.NamedTemporaryFile(suffix=".fsa", mode="w") as input_fd:
|
|
28
|
+
with tempfile.NamedTemporaryFile(dir=config_setup['NETCHOP'], suffix=".fsa", mode="w") as input_fd:
|
|
29
29
|
for (i, sequence) in enumerate(sequences):
|
|
30
30
|
input_fd.write("> %d\n" % i)
|
|
31
31
|
input_fd.write(sequence)
|
|
32
32
|
input_fd.write("\n")
|
|
33
33
|
input_fd.flush()
|
|
34
|
+
|
|
34
35
|
try:
|
|
35
36
|
output = subprocess.check_output(["netChop", input_fd.name])
|
|
36
37
|
except subprocess.CalledProcessError as e:
|
|
geney/performance_utils.py
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
from sklearn.metrics import precision_score, recall_score, accuracy_score
|
|
4
|
+
from sklearn.metrics import roc_auc_score, roc_curve
|
|
5
|
+
import matplotlib.pyplot as plt
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# def plot_performance(true_values, predictions):
|
|
9
|
+
# clinsig_map = {'Benign': 0, 'Pathogenic': 1}
|
|
10
|
+
# true_values = [clinsig_map[t] for t in true_values]
|
|
11
|
+
# predictions = scale_predictions(predictions)
|
|
12
|
+
#
|
|
13
|
+
# fpr, tpr, thresholds_roc = roc_curve(true_values, predictions)
|
|
14
|
+
#
|
|
15
|
+
# # Calculate Precision-Recall curve
|
|
16
|
+
# precision, recall, thresholds_pr = precision_recall_curve(true_values, predictions)
|
|
17
|
+
#
|
|
18
|
+
# # Plotting ROC curve
|
|
19
|
+
# plt.figure(figsize=(20, 5))
|
|
20
|
+
#
|
|
21
|
+
# plt.subplot(1, 4, 1)
|
|
22
|
+
# plt.plot(fpr, tpr)
|
|
23
|
+
# plt.title('ROC Curve')
|
|
24
|
+
# plt.xlabel('False Positive Rate')
|
|
25
|
+
# plt.ylabel('True Positive Rate')
|
|
26
|
+
#
|
|
27
|
+
# # Plotting Precision-Recall curve
|
|
28
|
+
# plt.subplot(1, 4, 2)
|
|
29
|
+
# plt.plot(recall, precision)
|
|
30
|
+
# plt.title('Precision-Recall Curve')
|
|
31
|
+
# plt.xlabel('Recall')
|
|
32
|
+
# plt.ylabel('Precision')
|
|
33
|
+
#
|
|
34
|
+
# # Plotting Precision vs. Thresholds
|
|
35
|
+
# plt.subplot(1, 4, 3)
|
|
36
|
+
# plt.plot(thresholds_pr, precision[:-1]) # Precision and thresholds have off-by-one lengths
|
|
37
|
+
# plt.title('Precision vs. Threshold')
|
|
38
|
+
# plt.xlabel('Threshold')
|
|
39
|
+
# plt.ylabel('Precision')
|
|
40
|
+
#
|
|
41
|
+
# # Plotting Sample Percentage Captured vs. Thresholds
|
|
42
|
+
# plt.subplot(1, 4, 4)
|
|
43
|
+
# # Assuming 'tpr' or another appropriate metric represents the cumulative percentage
|
|
44
|
+
# plt.plot(thresholds_roc, tpr) # Update 'tpr' with the correct metric if necessary
|
|
45
|
+
# plt.title('Cumulative Percentage vs. Threshold')
|
|
46
|
+
# plt.xlabel('Threshold')
|
|
47
|
+
# plt.ylabel('Cumulative Percentage of Population')
|
|
48
|
+
#
|
|
49
|
+
# plt.tight_layout()
|
|
50
|
+
# plt.show()
|
|
51
|
+
#
|
|
52
|
+
#
|
|
53
|
+
#
|
|
54
|
+
# def plot_auc_curve(y_true, y_pred_proba):
|
|
55
|
+
# """
|
|
56
|
+
# Plots the AUC curve.
|
|
57
|
+
#
|
|
58
|
+
# Args:
|
|
59
|
+
# y_true (array-like): True labels (0 or 1).
|
|
60
|
+
# y_pred_proba (array-like): Predicted probabilities for positive class.
|
|
61
|
+
#
|
|
62
|
+
# Returns:
|
|
63
|
+
# None
|
|
64
|
+
# """
|
|
65
|
+
# fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
|
|
66
|
+
# auc_value = roc_auc_score(y_true, y_pred_proba)
|
|
67
|
+
#
|
|
68
|
+
# plt.figure(figsize=(8, 6))
|
|
69
|
+
# plt.plot(fpr, tpr, label=f"AUC = {auc_value:.2f}")
|
|
70
|
+
# plt.plot([0, 1], [0, 1], 'k--')
|
|
71
|
+
# plt.xlabel("False Positive Rate")
|
|
72
|
+
# plt.ylabel("True Positive Rate")
|
|
73
|
+
# plt.title("Receiver Operating Characteristic (ROC) Curve")
|
|
74
|
+
# plt.legend()
|
|
75
|
+
# plt.show()
|
|
76
|
+
# return auc_value
|
|
77
|
+
#
|
|
78
|
+
#
|
|
79
|
+
# def optimal_ppv(dataframe, feature_name, plot=False):
|
|
80
|
+
# """
|
|
81
|
+
# Calculates the optimal positive predictive value (PPV) for a given feature.
|
|
82
|
+
#
|
|
83
|
+
# Args:
|
|
84
|
+
# dataframe (pd.DataFrame): Input dataframe.
|
|
85
|
+
# feature_name (str): Name of the feature column.
|
|
86
|
+
#
|
|
87
|
+
# Returns:
|
|
88
|
+
# float: Optimal PPV.
|
|
89
|
+
# """
|
|
90
|
+
# # Assuming 'target' is the binary target column (0 or 1)
|
|
91
|
+
# threshold_values = pd.qcut(dataframe[feature_name], 100, duplicates='drop')
|
|
92
|
+
# ppv_values = []
|
|
93
|
+
#
|
|
94
|
+
# for threshold in threshold_values:
|
|
95
|
+
# predictions = (dataframe[feature_name] >= threshold).astype(int)
|
|
96
|
+
# ppv = precision_score(dataframe['target'], predictions)
|
|
97
|
+
# ppv_values.append(ppv)
|
|
98
|
+
#
|
|
99
|
+
# optimal_threshold = threshold_values[np.argmax(ppv_values)]
|
|
100
|
+
# optimal_ppv = max(ppv_values)
|
|
101
|
+
# if plot:
|
|
102
|
+
# plt.figure(figsize=(8, 6))
|
|
103
|
+
# plt.scatter(threshold_values, ppv_values)
|
|
104
|
+
# plt.xlabel("Threshold")
|
|
105
|
+
# plt.ylabel("Positive Predictive Value (PPV)")
|
|
106
|
+
# plt.title("Optimal Positive Predictive Value (PPV)")
|
|
107
|
+
# plt.show()
|
|
108
|
+
#
|
|
109
|
+
# return optimal_ppv, optimal_threshold
|
|
110
|
+
#
|
|
111
|
+
#
|
|
112
|
+
# def measure_prediction_quality(prediction_vector, quality_vector):
|
|
113
|
+
# """
|
|
114
|
+
# Measure the quality of the predictions using the quality_vector as the characteristic to check.
|
|
115
|
+
# """
|
|
116
|
+
# pass
|
|
117
|
+
#
|
|
118
|
+
#
|
|
119
|
+
#
|
|
120
|
+
# def create_ppv_vector(prediction_vector, true_value_vector):
|
|
121
|
+
# """
|
|
122
|
+
# Create a vector of positive predictive values (PPV) for the prediction_vector using the true_value_vector as the true values.
|
|
123
|
+
# """
|
|
124
|
+
# df = pd.DataFrame({'prediction': prediction_vector, 'true_value': true_value_vector})
|
|
125
|
+
# df.sort_values('prediction', ascending=True, inplace=True)
|
|
126
|
+
# df['bin'] = pd.qcut(df['prediction'], 100, labels=False, duplicates=True, retbins=True)
|
|
127
|
+
# for bin in df.bin.unique():
|
|
128
|
+
# temp_df = df[df.bin >= bin].
|
|
129
|
+
#
|
|
130
|
+
#
|
|
131
|
+
# def group_retention(predictions, predictor):
|
|
132
|
+
# # first i need to get the ratio of values that are retained at particular values
|
|
133
|
+
# predictions.sort_values(predictor, inplace=True)
|
|
134
|
+
# _, thresholds = pd.qcut(predictions[predictor], 100, duplicates='drop')
|
|
135
|
+
# tracker = []
|
|
136
|
+
# for th in thresholds:
|
|
137
|
+
#
|
|
138
|
+
#
|
geney/survival_utils.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
import matplotlib.pyplot as plt
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from scipy.integrate import trapz
|
|
6
|
+
from geney.utils import unload_pickle, unload_json, contains
|
|
7
|
+
from lifelines.exceptions import ConvergenceError
|
|
8
|
+
from lifelines import KaplanMeierFitter
|
|
9
|
+
from lifelines.statistics import logrank_test
|
|
10
|
+
from lifelines import CoxPHFitter
|
|
11
|
+
|
|
12
|
+
pd.set_option('display.max_columns', None)
|
|
13
|
+
pd.options.mode.chained_assignment = None
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def prepare_clinical_data(df=None):
    """Build the per-case survival table (duration, event, treatments).

    Args:
        df: Clinical DataFrame. It must contain ``days_to_death``, the four
            treatment columns, ``Proj_name`` and either ``patient_uuid`` or
            ``case_id``. When ``None``, the table is loaded with
            ``unload_pickle`` from a fixed path.

    Returns:
        A new DataFrame with the columns ``duration``, ``event``, ``case_id``,
        ``chemotherapy``, ``hormone_therapy``, ``immunotherapy``,
        ``targeted_molecular_therapy`` and ``Proj_name``, restricted to rows
        with a non-negative duration. The input frame is not modified.

    Raises:
        KeyError: If a required column is missing.
    """
    if df is None:
        # NOTE(review): machine-specific default location; callers on other
        # systems have to pass ``df`` explicitly.
        clinical_data_file = Path('/tamir2/yoramzar/Projects/Cancer_mut/Explore_data/reports/df_p_all.pkl')
        df = unload_pickle(clinical_data_file)

    # ``rename`` without ``inplace`` returns a new frame, so the caller's
    # object is left untouched and the function can be called repeatedly on
    # the same input.
    df = df.rename(columns={'patient_uuid': 'case_id'})

    cols = list(df.columns)
    cols_days_to_followup = ([col for col in cols if 'days_to_followup' in col]
                             + [col for col in cols if 'days_to_last_followup' in col])
    cols_days_to_know_alive = ([col for col in cols if 'days_to_know_alive' in col]
                               + [col for col in cols if 'days_to_last_known_alive' in col])
    cols_days_to_death = [col for col in cols if 'days_to_death' in col]
    cols_duration = cols_days_to_followup + cols_days_to_know_alive + cols_days_to_death

    # event = 1 when a days_to_death value is recorded, otherwise 0.
    event = df['days_to_death'].notna().astype(int)

    # duration = largest non-missing day count across all duration columns;
    # rows with no value at all get -1 so the filter below removes them.
    # The day count is divided by 365.
    duration = df[cols_duration].max(axis=1, skipna=True).fillna(-1) / 365

    out = df[['case_id', 'chemotherapy', 'hormone_therapy', 'immunotherapy',
              'targeted_molecular_therapy', 'Proj_name']].copy()
    out.insert(0, 'event', event)
    out.insert(0, 'duration', duration)
    return out[out['duration'] >= 0.0]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class SurvivalAnalysis:
    """Kaplan-Meier, log-rank and Cox helpers over a per-case clinical table.

    Attributes are set in ``__init__``:
        clindf: table returned by ``prepare_clinical_data``.
        df: copy of ``clindf`` with missing values replaced by 0 and an
            all-zero ``group`` column added.
        treatment_features: names of the treatment columns that
            ``generate_clinical_dataframe`` reduces to 0/1.
    """

    def __init__(self, clindf=None):
        """Prepare the clinical table; ``clindf`` is passed to ``prepare_clinical_data``."""
        self.clindf = prepare_clinical_data(clindf)
        self.treatment_features = ['chemotherapy', 'hormone_therapy', 'immunotherapy',
                                   'targeted_molecular_therapy']
        self.df = self.clindf.copy()
        self.df['group'] = 0
        self.df = self.df.fillna(0)

    def generate_clinical_dataframe(self, target_cases, control_cases=None, inplace=False,
                                    features_of_interest=None):
        """Return the duration/event table for the selected cases.

        Args:
            target_cases: case ids labelled ``group == 1`` in the result.
            control_cases: optional case ids labelled ``group == 0``. A case
                listed in both collections ends up as a control. When
                omitted, only target cases are kept.
            inplace: accepted for backward compatibility; not used.
            features_of_interest: extra columns to keep. ``group`` is only
                part of the result when it is listed here.

        Returns:
            A new DataFrame with ``duration``, ``event`` and every requested
            feature that takes more than one distinct value among the
            selected cases. Requested treatment columns are reduced to 0/1.
            ``self.df`` is not modified.
        """
        features = [] if features_of_interest is None else list(features_of_interest)

        df = self.df.copy()
        # Temporary coding: 2 = target, 1 = control, 0 = not selected.
        df.loc[df.case_id.isin(target_cases), 'group'] = 2
        if control_cases is not None:
            df.loc[df.case_id.isin(control_cases), 'group'] = 1

        # Drop unselected cases, then shift to the final 0 / 1 coding. The
        # explicit copy avoids arithmetic on a view of the filtered frame.
        df = df[df.group > 0].copy()
        df['group'] -= 1

        core_features = ['duration', 'event']
        df = df[core_features + features].copy()

        for col in self.treatment_features:
            if col in df:
                df.loc[df[col] > 0, col] = 1

        # A feature with a single distinct value carries no information.
        informative = [col for col in features if df[col].nunique() > 1]
        return df[core_features + informative]

    def kaplan_meier_analysis(self, df, control_label='CV', target_label='Epistasis', feature='group',
                              plot=False, time_cap=False):
        """Fit one Kaplan-Meier curve per value (0 and 1) of ``feature``.

        Args:
            df: table with ``duration``, ``event`` and the ``feature`` column.
            control_label: name used for the rows where ``feature == 0``.
            target_label: name used for the rows where ``feature == 1``.
            feature: column that splits the rows; only 0 and 1 are used.
            plot: draw both curves and the log-rank p-value when true.
            time_cap: when plotting, limit the x axis to the smaller of the
                two groups' maximum durations.

        Returns:
            A float ``pd.Series`` with, in order: the number of control rows
            (under ``control_label``), the number of target rows (under
            ``target_label``), ``p_value``, ``auc_target``, ``auc_delta``
            (target minus control) and ``auc_control``. The AUC values are
            trapezoidal areas under the fitted survival functions.
        """
        # Imported locally: ``trapezoid`` is the current name of the
        # trapezoidal rule in SciPy; the older ``trapz`` alias is no longer
        # provided by newer releases.
        from scipy.integrate import trapezoid

        cap_time = df.groupby(feature).duration.max().min()
        names = {0: control_label, 1: target_label}
        results = {}
        auc_vals = []
        ax = None
        for val in (0, 1):
            g = df[df[feature] == val]
            label = f"{names[val]} ({len(g)} cases)"
            results[names[val]] = len(g)

            kmf = KaplanMeierFitter()
            kmf.fit(g['duration'], g['event'], label=label)
            surv_func = kmf.survival_function_
            auc_vals.append(trapezoid(surv_func[label], surv_func.index))

            if plot:
                # The first curve creates the axes; the second is drawn on them.
                if ax is None:
                    ax = kmf.plot()
                else:
                    kmf.plot(ax=ax)

        p_value = self.log_rank(df[df[feature] == 1], df[df[feature] == 0])

        if plot:
            ax.text(0.5, 0.85, f'p-value: {p_value:.4f}', transform=ax.transAxes, fontsize=12,
                    horizontalalignment='center')
            plt.title('Kaplan-Meier Survival Curves')
            plt.xlabel('Time')
            plt.ylabel('Survival Probability')
            if time_cap:
                plt.xlim([0, cap_time])
            plt.show()

        results['p_value'] = p_value
        results['auc_target'] = auc_vals[-1]
        results['auc_delta'] = auc_vals[-1] - auc_vals[0]
        results['auc_control'] = auc_vals[0]
        # Explicit dtype: counts and statistics share one float Series.
        return pd.Series(results, dtype=float)

    def log_rank(self, group1, group2):
        """Return the log-rank test p-value comparing two duration/event tables."""
        return logrank_test(group1['duration'], group2['duration'],
                            event_observed_A=group1['event'],
                            event_observed_B=group2['event']).p_value

    def perform_cox_analysis(self, df, features_of_interest):
        """Fit a Cox model on the given features and return their p-values.

        Args:
            df: table with ``duration``, ``event`` and the listed features.
            features_of_interest: covariate column names.

        Returns:
            The ``p`` column of the fitted model's summary, or an empty float
            Series when the fit raises ``ConvergenceError``.
        """
        columns = list(features_of_interest) + ['duration', 'event']
        try:
            return CoxPHFitter().fit(df[columns], 'duration', 'event').summary.p
        except ConvergenceError:
            print("Convergence Error")
            return pd.Series(dtype=float)
|
geney/tcga_utils.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
import pandas as pd
|
|
3
3
|
import random
|
|
4
4
|
from pathlib import Path
|
|
5
|
+
|
|
5
6
|
class TCGACase:
|
|
6
7
|
def __init__(self, df):
|
|
7
8
|
# Here we get a dataframe of mutations within a gene
|
|
@@ -109,6 +110,7 @@ class TCGAGene:
|
|
|
109
110
|
|
|
110
111
|
else:
|
|
111
112
|
df = df[df.Variant_Type.isin(['SNP', 'INS', 'DEL'])]
|
|
113
|
+
df = df.astype({'Start_Position': int})
|
|
112
114
|
|
|
113
115
|
if include_filter is not None:
|
|
114
116
|
df = df[df.FILTER == include_filter]
|
|
@@ -138,36 +140,43 @@ class TCGAGene:
|
|
|
138
140
|
df = df[~df.FILTER.str.contains(filter)]
|
|
139
141
|
return df.case_id.unique().tolist()
|
|
140
142
|
|
|
141
|
-
def get_patient_muts(self, case_id=None):
|
|
143
|
+
def get_patient_muts(self, case_id=None, read_ratio=0, exclude_filters=None):
|
|
142
144
|
if case_id is None:
|
|
143
145
|
case_id = random.choice(self.affected_cases())
|
|
144
146
|
return self.df[self.df.case_id == case_id]
|
|
145
147
|
|
|
148
|
+
def get_patients_affected(self, mut_id, read_ratio=0, exclude_filters=None):
|
|
149
|
+
# returns all patients affected by ALL the mutations in mut_id
|
|
150
|
+
pass
|
|
151
|
+
|
|
152
|
+
def get_patients_unaffected(self, mut_id, must_contain_all=False, read_ratio=0, exclude_filters=None):
|
|
153
|
+
# returns all patients not affected by ALL the mutations in mut_id (patients containing individual mutations only are allowed) unless must_contain_all=True
|
|
154
|
+
pass
|
|
155
|
+
|
|
156
|
+
def split_patients(self, mut_id, strict=True):
|
|
157
|
+
# returns two lists: all patients affected by a mutation and all patients with none of the mutations (or the mutations but not together)
|
|
158
|
+
pass
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def arrange_patients_by_project(self, mut_id):
|
|
162
|
+
# returns all the patients affected by a mutation grouped by cancer project
|
|
163
|
+
pass
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def total_prevalence(self, mut_id):
|
|
167
|
+
pass
|
|
168
|
+
|
|
169
|
+
def project_prevalence(self, mut_id):
|
|
170
|
+
pass
|
|
171
|
+
|
|
172
|
+
def project_counts(self, mut_id):
|
|
173
|
+
pass
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
|
|
146
179
|
|
|
147
|
-
class TCGAMut:
|
|
148
|
-
def __init__(self, mut_id):
|
|
149
|
-
self.num_muts = mut_id.count('|') + 1
|
|
150
|
-
data = []
|
|
151
|
-
for mut in mut_id.split('|'):
|
|
152
|
-
data.append(mut.split(':'))
|
|
153
|
-
data = pd.DataFrame(data, columns=['Gene_name', 'Chromosome', 'Start_Position', 'Reference_Allele',
|
|
154
|
-
'Tumor_Seq_Allele2'])
|
|
155
|
-
data.Chromosome = data.apply(lambda row: f'chr{row.Chromosome}', axis=1)
|
|
156
|
-
data = data.astype({'Start_Position': int})
|
|
157
|
-
self.gene = data.Gene_name.unique().tolist()[0]
|
|
158
|
-
self.df = data
|
|
159
|
-
|
|
160
|
-
def find_affected_patients(self, read_ratio=0, exclude_filters=None):
|
|
161
|
-
gene = TCGAGene(self.gene, exclude_filters=exclude_filters).df
|
|
162
|
-
gene = gene[gene.ratio >= read_ratio]
|
|
163
|
-
return pd.merge(self.df, gene,
|
|
164
|
-
on=['Gene_name', 'Chromosome', 'Start_Position', 'Reference_Allele', 'Tumor_Seq_Allele2'])
|
|
165
|
-
|
|
166
|
-
def find_affected_patients_list(self, read_ratio=0, exclude_filters=None):
|
|
167
|
-
df = self.find_affected_patients(read_ratio=read_ratio, exclude_filters=exclude_filters)
|
|
168
|
-
case_count = df.case_id.value_counts()
|
|
169
|
-
case_count = case_count[case_count == self.num_muts]
|
|
170
|
-
return case_count.index.tolist()
|
|
171
180
|
|
|
172
181
|
|
|
173
182
|
|
|
{geney-1.1.3.dist-info → geney-1.1.4.dist-info}/RECORD
CHANGED
|
@@ -4,15 +4,18 @@ geney/__init__.py,sha256=r-Yvpo_Tc236DcsqsFyexT21iVoYCVl9zoJj5pFuWEE,407
|
|
|
4
4
|
geney/benchmark_clinvar.py,sha256=LLl77e95Qbg9Kd-m2yL8ilmzubSz9SKogeARwssT4Ks,5532
|
|
5
5
|
geney/compare_sets.py,sha256=TcgL57V7BUPxBoW9lv3xr8qK2Acmykn85Ev3avicQr8,2977
|
|
6
6
|
geney/config_setup.py,sha256=SePeooA4RWAtR_KAT1-W1hkD3MT5tH6YMyp80t_RNPQ,385
|
|
7
|
-
geney/data_setup.py,sha256=
|
|
7
|
+
geney/data_setup.py,sha256=DZeksRPr2ZT7bszMo33W0r3OwmqHokVXtZ4gx5Lu_Mo,10725
|
|
8
8
|
geney/gtex.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
|
|
9
|
-
geney/
|
|
9
|
+
geney/gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
|
|
10
|
+
geney/netchop.py,sha256=25oEkGp9NveEMX4owOTDPm6KU4LsALEcK_jk9TqTQRQ,2881
|
|
10
11
|
geney/oncosplice.py,sha256=Fyc_UtAhV3Pv0vk8V55rO_jnb2Dwj5sW98KVwP3PHwU,68964
|
|
11
12
|
geney/oncosplice_pipeline.py,sha256=hpGqFHOdn8i8tvvs1-t3-G9Ko18zInwoDXBJbbrfbC4,68036
|
|
13
|
+
geney/performance_utils.py,sha256=FQt7rA4r-Wuq3kceCxsSuMfj3wU1tMG8QnbL59aBohs,4700
|
|
12
14
|
geney/power_utils.py,sha256=WRpqMnqUv1xrAeTduAUhx6YpSEJQci7bC2od12JcVtE,7267
|
|
13
15
|
geney/survival.py,sha256=gNKZGcwxDZ00ixVBHf3ZdjbY_AHQOCU9kKpBC_dokbM,5572
|
|
16
|
+
geney/survival_utils.py,sha256=gNKZGcwxDZ00ixVBHf3ZdjbY_AHQOCU9kKpBC_dokbM,5572
|
|
14
17
|
geney/tcga_annotations.py,sha256=DjRl6Pk5VAOL1yhbt8SXD6FZhYbcYNu3FtXYMeveGB0,15016
|
|
15
|
-
geney/tcga_utils.py,sha256=
|
|
18
|
+
geney/tcga_utils.py,sha256=XrLI8RzmXhyabvL24sMrqQM3KNusmU1_kyKYdkv6lpo,15591
|
|
16
19
|
geney/utils.py,sha256=YOe22gA0Oew9_QEym7ivM9sb7t3wNeHTeiSDBmvOPso,1984
|
|
17
20
|
geney/analyzers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
18
21
|
geney/analyzers/benchmark_clinvar.py,sha256=ZAxvZ-Ue5T6au5mGbk8clfvbAYl13NIY7U92KzL0lXI,5531
|
|
@@ -40,7 +43,7 @@ geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFW
|
|
|
40
43
|
geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
|
|
41
44
|
geney/translation_termination/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
42
45
|
geney/translation_termination/tts_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
43
|
-
geney-1.1.
|
|
44
|
-
geney-1.1.
|
|
45
|
-
geney-1.1.
|
|
46
|
-
geney-1.1.
|
|
46
|
+
geney-1.1.4.dist-info/METADATA,sha256=3nh_1Zr0M9_nDw7i6UhARNMjeY7-Vs4C1gfYJTVhkqI,1130
|
|
47
|
+
geney-1.1.4.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
|
|
48
|
+
geney-1.1.4.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
|
|
49
|
+
geney-1.1.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|