pythonflex 0.3.4__py3-none-any.whl → 0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pythonflex/__init__.py +28 -4
- pythonflex/analysis.py +287 -579
- pythonflex/examples/basic_usage.py +38 -30
- pythonflex/examples/manuscript.py +37 -43
- pythonflex/examples/runtime/runtime_benchmark.py +218 -0
- pythonflex/examples/runtime/runtime_benchmark_10_runs_memmap.py +534 -0
- pythonflex/examples/runtime/runtime_benchmark_corum_njobs.py +245 -0
- pythonflex/examples/runtime/runtime_benchmark_gobp_njobs_chunks.py +319 -0
- pythonflex/examples/runtime/runtime_benchmark_gobp_optimization.py +417 -0
- pythonflex/examples/runtime/runtime_benchmark_repeated.py +347 -0
- pythonflex/old_functions.py +422 -0
- pythonflex/plotting.py +655 -242
- pythonflex/preprocessing.py +54 -216
- pythonflex/utils.py +36 -9
- {pythonflex-0.3.4.dist-info → pythonflex-0.4.dist-info}/METADATA +8 -6
- pythonflex-0.4.dist-info/RECORD +32 -0
- {pythonflex-0.3.4.dist-info → pythonflex-0.4.dist-info}/WHEEL +1 -1
- pythonflex-0.4.dist-info/licenses/LICENSE +7 -0
- pythonflex-0.3.4.dist-info/RECORD +0 -24
- {pythonflex-0.3.4.dist-info → pythonflex-0.4.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,422 @@
|
|
|
1
|
+
# Archived legacy implementations retained for historical reference only.
|
|
2
|
+
# This module is intentionally not imported by pythonFLEX and is not public API.
|
|
3
|
+
# Active implementations live in analysis.py.
|
|
4
|
+
|
|
5
|
+
### OLD FUNCTIONS
|
|
6
|
+
|
|
7
|
+
# new but without parallel
|
|
8
|
+
|
|
9
|
+
# def pra_percomplex(dataset_name, matrix, is_corr=False):
|
|
10
|
+
# log.started(f"*** Per-complex PRA started - {dataset_name} ***")
|
|
11
|
+
# config = dload("config")
|
|
12
|
+
# terms = dload("tmp", "terms")
|
|
13
|
+
# genes_present = dload("tmp", "genes_present_in_terms")
|
|
14
|
+
# sorting = dload("input", "sorting")
|
|
15
|
+
# sort_order = sorting.get(dataset_name, "high")
|
|
16
|
+
# if not is_corr:
|
|
17
|
+
# matrix = perform_corr(matrix, config.get("corr_function"))
|
|
18
|
+
# matrix = filter_matrix_by_genes(matrix, genes_present)
|
|
19
|
+
# log.info(f"Matrix shape: {matrix.shape}")
|
|
20
|
+
# df = binary(matrix)
|
|
21
|
+
# log.info(f"Pair-wise shape: {df.shape}")
|
|
22
|
+
# df = quick_sort(df, ascending=(sort_order == "low"))
|
|
23
|
+
# pairwise_df = df.copy()
|
|
24
|
+
# pairwise_df['gene1'] = pairwise_df['gene1'].astype("category")
|
|
25
|
+
# pairwise_df['gene2'] = pairwise_df['gene2'].astype("category")
|
|
26
|
+
|
|
27
|
+
# # Precompute a mapping from each gene to the row indices in the pairwise DataFrame where it appears.
|
|
28
|
+
# gene_to_pair_indices = {}
|
|
29
|
+
# for i, (gene_a, gene_b) in enumerate(zip(pairwise_df["gene1"], pairwise_df["gene2"])):
|
|
30
|
+
# gene_to_pair_indices.setdefault(gene_a, []).append(i)
|
|
31
|
+
# gene_to_pair_indices.setdefault(gene_b, []).append(i)
|
|
32
|
+
# log.done
|
|
33
|
+
|
|
34
|
+
# # Build gold_pair_to_complex using sets for efficiency
|
|
35
|
+
# gold_pair_to_complex = defaultdict(set)
|
|
36
|
+
# for idx, row in terms.iterrows():
|
|
37
|
+
# genes = row.used_genes
|
|
38
|
+
# if len(genes) < 2:
|
|
39
|
+
# continue
|
|
40
|
+
# for i, g1 in enumerate(genes):
|
|
41
|
+
# for g2 in genes[i + 1:]:
|
|
42
|
+
# pair = tuple(sorted((g1, g2)))
|
|
43
|
+
# gold_pair_to_complex[pair].add(idx)
|
|
44
|
+
|
|
45
|
+
# # Precompute complex_ids as semicolon-separated strings in pairwise_df
|
|
46
|
+
# pairs = [tuple(sorted((g1, g2))) for g1, g2 in zip(pairwise_df["gene1"], pairwise_df["gene2"])]
|
|
47
|
+
# pairwise_df['complex_ids'] = [';'.join(map(str, sorted(gold_pair_to_complex.get(pair, set())))) for pair in pairs]
|
|
48
|
+
|
|
49
|
+
# # Initialize AUC scores
|
|
50
|
+
# auc_scores = {}
|
|
51
|
+
# # Loop over each gene complex
|
|
52
|
+
# for idx, row in tqdm(terms.iterrows()):
|
|
53
|
+
# gene_set = set(row.used_genes)
|
|
54
|
+
# if config["min_genes_per_complex_analysis"] > len(gene_set):
|
|
55
|
+
# continue
|
|
56
|
+
# # Collect all row indices in the pairwise data where either gene belongs to the complex.
|
|
57
|
+
# candidate_indices = bitarray(len(pairwise_df))
|
|
58
|
+
# for gene in gene_set:
|
|
59
|
+
# if gene in gene_to_pair_indices:
|
|
60
|
+
# candidate_indices[gene_to_pair_indices[gene]] = True
|
|
61
|
+
|
|
62
|
+
# if not candidate_indices.any():
|
|
63
|
+
# continue
|
|
64
|
+
|
|
65
|
+
# # Select only the relevant pairwise comparisons.
|
|
66
|
+
# selected_rows = np.unpackbits(candidate_indices).view(bool)[:len(pairwise_df)]
|
|
67
|
+
# sub_df = pairwise_df.iloc[selected_rows]
|
|
68
|
+
|
|
69
|
+
# # Get current complex ID (assuming idx is the ID; adjust if row['ID'] is different)
|
|
70
|
+
# complex_id = str(idx) # Or str(row['ID']) if available
|
|
71
|
+
|
|
72
|
+
# # Create true_label: 1 if complex_id in complex_ids (vectorized with str.contains)
|
|
73
|
+
# #true_label = sub_df['complex_ids'].str.contains(complex_id, regex=False).astype(int)
|
|
74
|
+
|
|
75
|
+
# # Inside the loop, for each complex:
|
|
76
|
+
# # Inside the loop:
|
|
77
|
+
# complex_id = str(idx)
|
|
78
|
+
# # Use (?:^|;) and (?:;|$) to avoid capturing groups
|
|
79
|
+
# pattern = r'(?:^|;)' + re.escape(complex_id) + r'(?:;|$)'
|
|
80
|
+
# true_label = sub_df['complex_ids'].str.contains(pattern, regex=True).astype(int)
|
|
81
|
+
# # Filter to keep verified negatives (complex_ids == "") or positives for this complex (true_label == 1)
|
|
82
|
+
# complex_mask = (sub_df['complex_ids'] == "") | (true_label == 1)
|
|
83
|
+
|
|
84
|
+
# # Use the masked true labels for AUPRC (avoids SettingWithCopyWarning)
|
|
85
|
+
# predictions = true_label[complex_mask]
|
|
86
|
+
|
|
87
|
+
# if predictions.sum() == 0:
|
|
88
|
+
# continue
|
|
89
|
+
# # Compute cumulative true positives and derive precision and recall.
|
|
90
|
+
# true_positive_cumsum = predictions.cumsum()
|
|
91
|
+
# precision = true_positive_cumsum / (np.arange(len(predictions)) + 1)
|
|
92
|
+
# recall = true_positive_cumsum / true_positive_cumsum.iloc[-1]
|
|
93
|
+
|
|
94
|
+
# if len(recall) < 2 or recall.iloc[-1] == 0:
|
|
95
|
+
# continue
|
|
96
|
+
# auc_scores[idx] = metrics.auc(recall, precision)
|
|
97
|
+
|
|
98
|
+
# # Add the computed AUC scores to the terms DataFrame.
|
|
99
|
+
# terms["auc_score"] = pd.Series(auc_scores)
|
|
100
|
+
# terms.drop(columns=["hash"], inplace=True)
|
|
101
|
+
# dsave(terms, "pra_percomplex", dataset_name)
|
|
102
|
+
# log.done(f"Per-complex PRA completed.")
|
|
103
|
+
# return terms
|
|
104
|
+
|
|
105
|
+
# it works quickly but only maps one complex to each pair
|
|
106
|
+
|
|
107
|
+
# def pra_percomplex_old_type_filtering(dataset_name, matrix, is_corr=False):
|
|
108
|
+
# log.started(f"*** Per-complex PRA started - {dataset_name} ***")
|
|
109
|
+
# config = dload("config")
|
|
110
|
+
# terms = dload("tmp", "terms")
|
|
111
|
+
# genes_present = dload("tmp", "genes_present_in_terms")
|
|
112
|
+
# sorting = dload("input", "sorting")
|
|
113
|
+
# sort_order = sorting.get(dataset_name, "high")
|
|
114
|
+
# if not is_corr:
|
|
115
|
+
# matrix = perform_corr(matrix, config.get("corr_function"))
|
|
116
|
+
# matrix = filter_matrix_by_genes(matrix, genes_present)
|
|
117
|
+
# log.info(f"Matrix shape: {matrix.shape}")
|
|
118
|
+
# df = binary(matrix)
|
|
119
|
+
# log.info(f"Pair-wise shape: {df.shape}")
|
|
120
|
+
# df = quick_sort(df, ascending=(sort_order == "low"))
|
|
121
|
+
# pairwise_df = df.copy()
|
|
122
|
+
# pairwise_df['gene1'] = pairwise_df['gene1'].astype("category")
|
|
123
|
+
# pairwise_df['gene2'] = pairwise_df['gene2'].astype("category")
|
|
124
|
+
# # Precompute a mapping from each gene to the row indices in the pairwise DataFrame where it appears.
|
|
125
|
+
# gene_to_pair_indices = {}
|
|
126
|
+
# for i, (gene_a, gene_b) in enumerate(zip(pairwise_df["gene1"], pairwise_df["gene2"])):
|
|
127
|
+
# gene_to_pair_indices.setdefault(gene_a, []).append(i)
|
|
128
|
+
# gene_to_pair_indices.setdefault(gene_b, []).append(i)
|
|
129
|
+
# # Initialize AUC scores (one for each complex) with NaNs.
|
|
130
|
+
# #auc_scores = np.full(len(terms), np.nan)
|
|
131
|
+
# auc_scores = {}
|
|
132
|
+
# # Loop over each gene complex
|
|
133
|
+
# for idx, row in tqdm(terms.iterrows()):
|
|
134
|
+
# gene_set = set(row.used_genes)
|
|
135
|
+
|
|
136
|
+
# if config["min_genes_per_complex_analysis"] > len(gene_set):
|
|
137
|
+
# continue
|
|
138
|
+
# # Collect all row indices in the pairwise data where either gene belongs to the complex.
|
|
139
|
+
# candidate_indices = bitarray(len(pairwise_df))
|
|
140
|
+
# for gene in gene_set:
|
|
141
|
+
# if gene in gene_to_pair_indices:
|
|
142
|
+
# candidate_indices[gene_to_pair_indices[gene]] = True
|
|
143
|
+
# if not candidate_indices.any():
|
|
144
|
+
# continue
|
|
145
|
+
# # Select only the relevant pairwise comparisons.
|
|
146
|
+
# selected_rows = np.unpackbits(candidate_indices).view(bool)[:len(pairwise_df)]
|
|
147
|
+
# sub_df = pairwise_df.iloc[selected_rows]
|
|
148
|
+
# # A prediction is 1 if both genes in the pair are in the complex; otherwise 0.
|
|
149
|
+
# predictions = (sub_df["gene1"].isin(gene_set) & sub_df["gene2"].isin(gene_set)).astype(int)
|
|
150
|
+
# if predictions.sum() == 0:
|
|
151
|
+
# continue
|
|
152
|
+
# # Compute cumulative true positives and derive precision and recall.
|
|
153
|
+
# true_positive_cumsum = predictions.cumsum()
|
|
154
|
+
# precision = true_positive_cumsum / (np.arange(len(predictions)) + 1)
|
|
155
|
+
# recall = true_positive_cumsum / true_positive_cumsum.iloc[-1]
|
|
156
|
+
|
|
157
|
+
# if len(recall) < 2 or recall.iloc[-1] == 0:
|
|
158
|
+
# continue
|
|
159
|
+
# auc_scores[idx] = metrics.auc(recall, precision)
|
|
160
|
+
# # Add the computed AUC scores to the terms DataFrame.
|
|
161
|
+
# terms["auc_score"] = pd.Series(auc_scores)
|
|
162
|
+
# terms.drop(columns=["hash"], inplace=True)
|
|
163
|
+
# dsave(terms, "pra_percomplex", dataset_name)
|
|
164
|
+
# log.done(f"Per-complex PRA completed.")
|
|
165
|
+
# return terms
|
|
166
|
+
|
|
167
|
+
# OLD
|
|
168
|
+
# def pra_percomplex(dataset_name, matrix, is_corr=False):
|
|
169
|
+
# log.started(f"*** Per-complex PRA started for {dataset_name} ***")
|
|
170
|
+
# config = dload("config")
|
|
171
|
+
# terms = dload("tmp", "terms")
|
|
172
|
+
# genes_present = dload("tmp", "genes_present_in_terms")
|
|
173
|
+
# sorting = dload("input", "sorting")
|
|
174
|
+
# sort_order = sorting.get(dataset_name, "high")
|
|
175
|
+
|
|
176
|
+
# if not is_corr:
|
|
177
|
+
# matrix = perform_corr(matrix, "numpy")
|
|
178
|
+
# matrix = filter_matrix_by_genes(matrix, genes_present)
|
|
179
|
+
# log.info(f"Matrix shape: {matrix.shape}")
|
|
180
|
+
# df = binary(matrix)
|
|
181
|
+
# log.info(f"Pair-wise shape: {df.shape}")
|
|
182
|
+
# df = quick_sort(df, ascending=(sort_order == "low"))
|
|
183
|
+
# # Precompute gene → row indices
|
|
184
|
+
# gene_to_rows = {}
|
|
185
|
+
# for i, (g1, g2) in enumerate(zip(df["gene1"], df["gene2"])):
|
|
186
|
+
# gene_to_rows.setdefault(g1, []).append(i)
|
|
187
|
+
# gene_to_rows.setdefault(g2, []).append(i)
|
|
188
|
+
# aucs = np.full(len(terms), np.nan)
|
|
189
|
+
# N = len(df)
|
|
190
|
+
# for idx, row in tqdm(terms.iterrows()):
|
|
191
|
+
# genes = set(row.used_genes)
|
|
192
|
+
# if len(genes) < config["min_complex_size_for_percomplex"]: # Skip small complexes
|
|
193
|
+
# continue
|
|
194
|
+
# # Get all row indices where either gene is in the complex
|
|
195
|
+
# candidate_idxs = set()
|
|
196
|
+
# for g in genes:
|
|
197
|
+
# candidate_idxs.update(gene_to_rows.get(g, []))
|
|
198
|
+
# candidate_idxs = sorted(candidate_idxs)
|
|
199
|
+
# if not candidate_idxs:
|
|
200
|
+
# continue
|
|
201
|
+
# # Use only relevant rows for prediction
|
|
202
|
+
# sub = df.loc[candidate_idxs]
|
|
203
|
+
# preds = (sub["gene1"].isin(genes) & sub["gene2"].isin(genes)).astype(int)
|
|
204
|
+
# if preds.sum() == 0:
|
|
205
|
+
# continue
|
|
206
|
+
# tp = preds.cumsum()
|
|
207
|
+
# prec = tp / (np.arange(len(preds)) + 1)
|
|
208
|
+
# recall = tp / tp.iloc[-1]
|
|
209
|
+
# if len(recall) < 2 or recall.iloc[-1] == 0:
|
|
210
|
+
# continue
|
|
211
|
+
# aucs[idx] = metrics.auc(recall, prec)
|
|
212
|
+
# terms["auc_score"] = aucs
|
|
213
|
+
# terms.drop(columns=["list", "set", "hash"], inplace=True)
|
|
214
|
+
# dsave(terms, "pra_percomplex", dataset_name)
|
|
215
|
+
# log.done(f"Per-complex PRA completed.")
|
|
216
|
+
# return terms
|
|
217
|
+
|
|
218
|
+
# without the greedy logic
|
|
219
|
+
# def complex_contributions(name):
|
|
220
|
+
# log.info(f"Computing complex contributions for dataset: {name}")
|
|
221
|
+
|
|
222
|
+
# pra = dload("pra", name)
|
|
223
|
+
# terms = dload("tmp", "terms")
|
|
224
|
+
# d = pra.query('prediction == 1').drop(columns=['gene1', 'gene2'])
|
|
225
|
+
# results = {}
|
|
226
|
+
# thresholds = [round(i, 2) for i in np.arange(1, 0.0001, -0.025)]
|
|
227
|
+
# for cid in terms.ID.to_list():
|
|
228
|
+
# arr = []
|
|
229
|
+
# for threshold in thresholds:
|
|
230
|
+
# r = d[d.complex_id == cid].query('precision >= @threshold')
|
|
231
|
+
# arr.append(r.shape[0])
|
|
232
|
+
# results[cid] = arr
|
|
233
|
+
|
|
234
|
+
# r = pd.DataFrame(results, index=thresholds).T
|
|
235
|
+
# t = terms[['ID', 'Name']].set_index('ID')
|
|
236
|
+
# r['Name'] = r.index.map(t.Name)
|
|
237
|
+
# r = r[list(reversed(list(r.columns)))]
|
|
238
|
+
# r = r.reset_index(drop=True)
|
|
239
|
+
# dsave(r, "complex_contributions", name)
|
|
240
|
+
# log.info(f"Complex contributions computation completed for dataset: {name}")
|
|
241
|
+
# return r
|
|
242
|
+
|
|
243
|
+
# # new
|
|
244
|
+
# def complex_contributions(name):
|
|
245
|
+
# log.info(f"Computing complex contributions using R-style greedy logic for dataset: {name}")
|
|
246
|
+
# pra = dload("pra", name)
|
|
247
|
+
# terms = dload("common", "terms")
|
|
248
|
+
|
|
249
|
+
# # Ensure pra is sorted by score descending
|
|
250
|
+
# pra = pra.sort_values(by='score', ascending=False).reset_index(drop=True)
|
|
251
|
+
|
|
252
|
+
# # Compute cumulative TP and precision if not present
|
|
253
|
+
# pra['cumTP'] = pra['prediction'].cumsum()
|
|
254
|
+
# pra['rank'] = pra.index + 1
|
|
255
|
+
# pra['precision'] = pra['cumTP'] / pra['rank']
|
|
256
|
+
|
|
257
|
+
# # R-style precision thresholds
|
|
258
|
+
# prec_min = pra['precision'].min()
|
|
259
|
+
# prec_max = pra['precision'].max()
|
|
260
|
+
# precision_cutoffs = [round(prec_min, 3)]
|
|
261
|
+
# cutoffs_range = np.arange(0.1, prec_max + 0.001, 0.025)
|
|
262
|
+
# precision_cutoffs += [round(t, 3) for t in cutoffs_range if t > prec_min]
|
|
263
|
+
# thresholds = sorted(set(precision_cutoffs)) # Ensure unique and sorted
|
|
264
|
+
|
|
265
|
+
# results = {}
|
|
266
|
+
# for t in thresholds:
|
|
267
|
+
# if pra['precision'].max() < t:
|
|
268
|
+
# continue
|
|
269
|
+
# cand = pra[pra['precision'] >= t]
|
|
270
|
+
# if cand.empty:
|
|
271
|
+
# continue
|
|
272
|
+
# k = cand.index.max() # rightmost index where precision >= t
|
|
273
|
+
# tp_target = pra.loc[k, 'cumTP']
|
|
274
|
+
# # Find the smallest m where cumTP[m] >= tp_target
|
|
275
|
+
# ind = pra[pra['cumTP'] >= tp_target].index.min()
|
|
276
|
+
# if pd.isna(ind):
|
|
277
|
+
# continue
|
|
278
|
+
# # Select top (ind+1) rows
|
|
279
|
+
# tmp = pra.iloc[0:ind + 1].copy()
|
|
280
|
+
# # Filter for predicted positives (true == 1)
|
|
281
|
+
# tmp = tmp[tmp['prediction'] == 1]
|
|
282
|
+
# tmp = tmp[tmp["complex_id"].notnull()]
|
|
283
|
+
# tmp["ID"] = tmp["complex_id"].apply(lambda ids: ";".join(str(int(i)) for i in ids if pd.notnull(i)))
|
|
284
|
+
# # Now greedy logic
|
|
285
|
+
# final_contrib = {}
|
|
286
|
+
# while not tmp.empty:
|
|
287
|
+
# all_ids = tmp["ID"].str.split(";").explode()
|
|
288
|
+
# contrib = all_ids.value_counts()
|
|
289
|
+
# if contrib.empty:
|
|
290
|
+
# break
|
|
291
|
+
# top_id = contrib.idxmax()
|
|
292
|
+
# final_contrib[top_id] = contrib[top_id]
|
|
293
|
+
# tmp = tmp[~tmp["ID"].str.contains(rf"\b{top_id}\b", regex=True)]
|
|
294
|
+
# for cid, count in final_contrib.items():
|
|
295
|
+
# if cid not in results:
|
|
296
|
+
# results[cid] = [0] * len(thresholds)
|
|
297
|
+
# results[cid][thresholds.index(t)] = count
|
|
298
|
+
|
|
299
|
+
# # Add back gold standard complexes with 0 contribution
|
|
300
|
+
# gold_ids = set(terms.index.astype(str))
|
|
301
|
+
# all_ids = set(results.keys())
|
|
302
|
+
# missing_ids = gold_ids - all_ids
|
|
303
|
+
# for cid in missing_ids:
|
|
304
|
+
# results[cid] = [0] * len(thresholds)
|
|
305
|
+
|
|
306
|
+
# # Build result DataFrame
|
|
307
|
+
# r = pd.DataFrame(results, index=thresholds).T
|
|
308
|
+
# r['Name'] = r.index.astype(int).map(terms['Name'])
|
|
309
|
+
# r = r[['Name'] + [c for c in r.columns if c != 'Name']] # Name as first col
|
|
310
|
+
# r = r[(r.drop(columns="Name").sum(axis=1) > 0)]
|
|
311
|
+
# # Move ID to first column, keep Name second, then precision columns in order
|
|
312
|
+
# dsave(r, "complex_contributions", name)
|
|
313
|
+
# log.info(f"Greedy R-style complex contribution completed for dataset: {name}")
|
|
314
|
+
# return r
|
|
315
|
+
|
|
316
|
+
# def pra(dataset_name, matrix, is_corr=False):
|
|
317
|
+
# log.info(f"******************** {dataset_name} ********************")
|
|
318
|
+
# log.started(f"** Global Precision-Recall Analysis - {dataset_name} **")
|
|
319
|
+
# config = dload("config")
|
|
320
|
+
|
|
321
|
+
# terms_data = dload("tmp", "terms")
|
|
322
|
+
# if terms_data is None or not isinstance(terms_data, pd.DataFrame):
|
|
323
|
+
# raise ValueError("Expected 'terms' to be a DataFrame, but got None or invalid type.")
|
|
324
|
+
# terms = terms_data
|
|
325
|
+
# genes_present = dload("tmp", "genes_present_in_terms")
|
|
326
|
+
# sorting = dload("input", "sorting")
|
|
327
|
+
# sort_order = sorting.get(dataset_name, "high")
|
|
328
|
+
|
|
329
|
+
# if not is_corr:
|
|
330
|
+
# matrix = perform_corr(matrix, config.get("corr_function"))
|
|
331
|
+
|
|
332
|
+
# matrix = filter_matrix_by_genes(matrix, genes_present)
|
|
333
|
+
|
|
334
|
+
# log.info(f"Matrix shape: {matrix.shape}")
|
|
335
|
+
# df = binary(matrix)
|
|
336
|
+
# log.info(f"Pair-wise shape: {df.shape}")
|
|
337
|
+
# df = quick_sort(df, ascending=(sort_order == "low"))
|
|
338
|
+
|
|
339
|
+
# gold_pair_to_complex = defaultdict(list)
|
|
340
|
+
# for idx, row in terms.iterrows():
|
|
341
|
+
# genes = row.used_genes
|
|
342
|
+
# if len(genes) < 2:
|
|
343
|
+
# continue
|
|
344
|
+
# for i, g1 in enumerate(genes):
|
|
345
|
+
# for g2 in genes[i + 1:]:
|
|
346
|
+
# pair = tuple(sorted((g1, g2)))
|
|
347
|
+
# gold_pair_to_complex[pair].append(idx)
|
|
348
|
+
|
|
349
|
+
# # Label predictions and complex IDs
|
|
350
|
+
# complex_ids = []
|
|
351
|
+
# predictions = []
|
|
352
|
+
# for g1, g2 in zip(df["gene1"], df["gene2"]):
|
|
353
|
+
# pair = tuple(sorted((g1, g2)))
|
|
354
|
+
# ids = gold_pair_to_complex.get(pair, [])
|
|
355
|
+
# if ids:
|
|
356
|
+
# predictions.append(1)
|
|
357
|
+
# complex_ids.append(ids)
|
|
358
|
+
# else:
|
|
359
|
+
# predictions.append(0)
|
|
360
|
+
# complex_ids.append([])
|
|
361
|
+
|
|
362
|
+
# df["prediction"] = predictions
|
|
363
|
+
# df["complex_id"] = complex_ids
|
|
364
|
+
|
|
365
|
+
# if df["prediction"].sum() == 0:
|
|
366
|
+
# log.info("No true positives found in dataset.")
|
|
367
|
+
# pr_auc = np.nan
|
|
368
|
+
# else:
|
|
369
|
+
# tp = df["prediction"].cumsum()
|
|
370
|
+
# df["tp"] = tp
|
|
371
|
+
# precision = tp / (np.arange(len(df)) + 1)
|
|
372
|
+
# recall = tp / tp.iloc[-1]
|
|
373
|
+
# pr_auc = metrics.auc(recall, precision)
|
|
374
|
+
# df["precision"] = precision
|
|
375
|
+
# df["recall"] = recall
|
|
376
|
+
|
|
377
|
+
# log.info(f"PR-AUC: {pr_auc:.4f}, Number of true positives: {df['prediction'].sum()}")
|
|
378
|
+
# dsave(df, "pra", dataset_name)
|
|
379
|
+
# dsave(pr_auc, "pr_auc", dataset_name)
|
|
380
|
+
# log.done(f"Global PRA completed for {dataset_name}")
|
|
381
|
+
# return df, pr_auc
|
|
382
|
+
|
|
383
|
+
# def compute_pra(df):
|
|
384
|
+
# log.info("Calculating precision-recall and AUC score.")
|
|
385
|
+
# if df.empty:
|
|
386
|
+
# log.warning("Empty DataFrame encountered in compute_pra. Returning empty DataFrame.")
|
|
387
|
+
# return df
|
|
388
|
+
# df["tp"] = df["prediction"].cumsum()
|
|
389
|
+
# df.reset_index(drop=True, inplace=True)
|
|
390
|
+
# df["precision"] = df["tp"] / (df.index + 1)
|
|
391
|
+
# df["recall"] = df["tp"] / df["tp"].iloc[-1]
|
|
392
|
+
# log.info("DONE: Calculating precision-recall AUC score.")
|
|
393
|
+
# return df
|
|
394
|
+
|
|
395
|
+
# def pra(dataset_name, matrix, is_corr=False):
|
|
396
|
+
# log.info(f"PRA computation started for {dataset_name}.")
|
|
397
|
+
# genes_present_in_terms = dload("tmp", "genes_present_in_terms")
|
|
398
|
+
# #terms_hash_table = dload("tmp", "terms_hash_table")
|
|
399
|
+
# sorting_prefs = dload("input", "sorting")
|
|
400
|
+
# sort_order = sorting_prefs.get(dataset_name, "high")
|
|
401
|
+
# if not is_corr: matrix = perform_corr(matrix, "numpy")
|
|
402
|
+
# matrix = filter_matrix_by_genes(matrix, genes_present_in_terms)
|
|
403
|
+
# stack = binary(matrix)
|
|
404
|
+
|
|
405
|
+
# log.info("Checking gene pairs against the gold standard.")
|
|
406
|
+
# gene_pairs = list(zip(stack["gene1"], stack["gene2"]))
|
|
407
|
+
# hashed_pairs = [hash(pair) for pair in gene_pairs]
|
|
408
|
+
# stack["complex_id"] = [terms_hash_table.get(h, 0) for h in hashed_pairs]
|
|
409
|
+
# stack["prediction"] = [1 if h in terms_hash_table else 0 for h in hashed_pairs]
|
|
410
|
+
|
|
411
|
+
# annotated = stack.copy()
|
|
412
|
+
# if sort_order == "low":
|
|
413
|
+
# ann_sorted = quick_sort(annotated, ascending=True)
|
|
414
|
+
# else:
|
|
415
|
+
# ann_sorted = quick_sort(annotated)
|
|
416
|
+
|
|
417
|
+
# pra = compute_pra(ann_sorted)
|
|
418
|
+
# pr_auc = metrics.auc(pra.recall, pra.precision)
|
|
419
|
+
# dsave(pra, "pra", dataset_name)
|
|
420
|
+
# dsave(pr_auc, "pr_auc", dataset_name)
|
|
421
|
+
# log.info(f"PRA computation completed for {dataset_name} (Sorting: {sort_order}).")
|
|
422
|
+
# return pra, pr_auc
|