pythonflex 0.3.4__py3-none-any.whl → 0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,422 @@
1
+ # Archived legacy implementations retained for historical reference only.
2
+ # This module is intentionally not imported by pythonFLEX and is not public API.
3
+ # Active implementations live in analysis.py.
4
+
5
+ ### OLD FUNCTIONS
6
+
7
+ # Newer version, but without parallelization
8
+
9
+ # def pra_percomplex(dataset_name, matrix, is_corr=False):
10
+ # log.started(f"*** Per-complex PRA started - {dataset_name} ***")
11
+ # config = dload("config")
12
+ # terms = dload("tmp", "terms")
13
+ # genes_present = dload("tmp", "genes_present_in_terms")
14
+ # sorting = dload("input", "sorting")
15
+ # sort_order = sorting.get(dataset_name, "high")
16
+ # if not is_corr:
17
+ # matrix = perform_corr(matrix, config.get("corr_function"))
18
+ # matrix = filter_matrix_by_genes(matrix, genes_present)
19
+ # log.info(f"Matrix shape: {matrix.shape}")
20
+ # df = binary(matrix)
21
+ # log.info(f"Pair-wise shape: {df.shape}")
22
+ # df = quick_sort(df, ascending=(sort_order == "low"))
23
+ # pairwise_df = df.copy()
24
+ # pairwise_df['gene1'] = pairwise_df['gene1'].astype("category")
25
+ # pairwise_df['gene2'] = pairwise_df['gene2'].astype("category")
26
+
27
+ # # Precompute a mapping from each gene to the row indices in the pairwise DataFrame where it appears.
28
+ # gene_to_pair_indices = {}
29
+ # for i, (gene_a, gene_b) in enumerate(zip(pairwise_df["gene1"], pairwise_df["gene2"])):
30
+ # gene_to_pair_indices.setdefault(gene_a, []).append(i)
31
+ # gene_to_pair_indices.setdefault(gene_b, []).append(i)
32
+ # log.done
33
+
34
+ # # Build gold_pair_to_complex using sets for efficiency
35
+ # gold_pair_to_complex = defaultdict(set)
36
+ # for idx, row in terms.iterrows():
37
+ # genes = row.used_genes
38
+ # if len(genes) < 2:
39
+ # continue
40
+ # for i, g1 in enumerate(genes):
41
+ # for g2 in genes[i + 1:]:
42
+ # pair = tuple(sorted((g1, g2)))
43
+ # gold_pair_to_complex[pair].add(idx)
44
+
45
+ # # Precompute complex_ids as semicolon-separated strings in pairwise_df
46
+ # pairs = [tuple(sorted((g1, g2))) for g1, g2 in zip(pairwise_df["gene1"], pairwise_df["gene2"])]
47
+ # pairwise_df['complex_ids'] = [';'.join(map(str, sorted(gold_pair_to_complex.get(pair, set())))) for pair in pairs]
48
+
49
+ # # Initialize AUC scores
50
+ # auc_scores = {}
51
+ # # Loop over each gene complex
52
+ # for idx, row in tqdm(terms.iterrows()):
53
+ # gene_set = set(row.used_genes)
54
+ # if config["min_genes_per_complex_analysis"] > len(gene_set):
55
+ # continue
56
+ # # Collect all row indices in the pairwise data where either gene belongs to the complex.
57
+ # candidate_indices = bitarray(len(pairwise_df))
58
+ # for gene in gene_set:
59
+ # if gene in gene_to_pair_indices:
60
+ # candidate_indices[gene_to_pair_indices[gene]] = True
61
+
62
+ # if not candidate_indices.any():
63
+ # continue
64
+
65
+ # # Select only the relevant pairwise comparisons.
66
+ # selected_rows = np.unpackbits(candidate_indices).view(bool)[:len(pairwise_df)]
67
+ # sub_df = pairwise_df.iloc[selected_rows]
68
+
69
+ # # Get current complex ID (assuming idx is the ID; adjust if row['ID'] is different)
70
+ # complex_id = str(idx) # Or str(row['ID']) if available
71
+
72
+ # # Create true_label: 1 if complex_id in complex_ids (vectorized with str.contains)
73
+ # #true_label = sub_df['complex_ids'].str.contains(complex_id, regex=False).astype(int)
74
+
75
+ # # Inside the loop, for each complex:
76
+ # # Inside the loop:
77
+ # complex_id = str(idx)
78
+ # # Use (?:^|;) and (?:;|$) to avoid capturing groups
79
+ # pattern = r'(?:^|;)' + re.escape(complex_id) + r'(?:;|$)'
80
+ # true_label = sub_df['complex_ids'].str.contains(pattern, regex=True).astype(int)
81
+ # # Filter to keep verified negatives (complex_ids == "") or positives for this complex (true_label == 1)
82
+ # complex_mask = (sub_df['complex_ids'] == "") | (true_label == 1)
83
+
84
+ # # Use the masked true labels for AUPRC (avoids SettingWithCopyWarning)
85
+ # predictions = true_label[complex_mask]
86
+
87
+ # if predictions.sum() == 0:
88
+ # continue
89
+ # # Compute cumulative true positives and derive precision and recall.
90
+ # true_positive_cumsum = predictions.cumsum()
91
+ # precision = true_positive_cumsum / (np.arange(len(predictions)) + 1)
92
+ # recall = true_positive_cumsum / true_positive_cumsum.iloc[-1]
93
+
94
+ # if len(recall) < 2 or recall.iloc[-1] == 0:
95
+ # continue
96
+ # auc_scores[idx] = metrics.auc(recall, precision)
97
+
98
+ # # Add the computed AUC scores to the terms DataFrame.
99
+ # terms["auc_score"] = pd.Series(auc_scores)
100
+ # terms.drop(columns=["hash"], inplace=True)
101
+ # dsave(terms, "pra_percomplex", dataset_name)
102
+ # log.done(f"Per-complex PRA completed.")
103
+ # return terms
104
+
105
+ # This version runs quickly, but it maps only one complex to each pair
106
+
107
+ # def pra_percomplex_old_type_filtering(dataset_name, matrix, is_corr=False):
108
+ # log.started(f"*** Per-complex PRA started - {dataset_name} ***")
109
+ # config = dload("config")
110
+ # terms = dload("tmp", "terms")
111
+ # genes_present = dload("tmp", "genes_present_in_terms")
112
+ # sorting = dload("input", "sorting")
113
+ # sort_order = sorting.get(dataset_name, "high")
114
+ # if not is_corr:
115
+ # matrix = perform_corr(matrix, config.get("corr_function"))
116
+ # matrix = filter_matrix_by_genes(matrix, genes_present)
117
+ # log.info(f"Matrix shape: {matrix.shape}")
118
+ # df = binary(matrix)
119
+ # log.info(f"Pair-wise shape: {df.shape}")
120
+ # df = quick_sort(df, ascending=(sort_order == "low"))
121
+ # pairwise_df = df.copy()
122
+ # pairwise_df['gene1'] = pairwise_df['gene1'].astype("category")
123
+ # pairwise_df['gene2'] = pairwise_df['gene2'].astype("category")
124
+ # # Precompute a mapping from each gene to the row indices in the pairwise DataFrame where it appears.
125
+ # gene_to_pair_indices = {}
126
+ # for i, (gene_a, gene_b) in enumerate(zip(pairwise_df["gene1"], pairwise_df["gene2"])):
127
+ # gene_to_pair_indices.setdefault(gene_a, []).append(i)
128
+ # gene_to_pair_indices.setdefault(gene_b, []).append(i)
129
+ # # Initialize AUC scores (one for each complex) with NaNs.
130
+ # #auc_scores = np.full(len(terms), np.nan)
131
+ # auc_scores = {}
132
+ # # Loop over each gene complex
133
+ # for idx, row in tqdm(terms.iterrows()):
134
+ # gene_set = set(row.used_genes)
135
+
136
+ # if config["min_genes_per_complex_analysis"] > len(gene_set):
137
+ # continue
138
+ # # Collect all row indices in the pairwise data where either gene belongs to the complex.
139
+ # candidate_indices = bitarray(len(pairwise_df))
140
+ # for gene in gene_set:
141
+ # if gene in gene_to_pair_indices:
142
+ # candidate_indices[gene_to_pair_indices[gene]] = True
143
+ # if not candidate_indices.any():
144
+ # continue
145
+ # # Select only the relevant pairwise comparisons.
146
+ # selected_rows = np.unpackbits(candidate_indices).view(bool)[:len(pairwise_df)]
147
+ # sub_df = pairwise_df.iloc[selected_rows]
148
+ # # A prediction is 1 if both genes in the pair are in the complex; otherwise 0.
149
+ # predictions = (sub_df["gene1"].isin(gene_set) & sub_df["gene2"].isin(gene_set)).astype(int)
150
+ # if predictions.sum() == 0:
151
+ # continue
152
+ # # Compute cumulative true positives and derive precision and recall.
153
+ # true_positive_cumsum = predictions.cumsum()
154
+ # precision = true_positive_cumsum / (np.arange(len(predictions)) + 1)
155
+ # recall = true_positive_cumsum / true_positive_cumsum.iloc[-1]
156
+
157
+ # if len(recall) < 2 or recall.iloc[-1] == 0:
158
+ # continue
159
+ # auc_scores[idx] = metrics.auc(recall, precision)
160
+ # # Add the computed AUC scores to the terms DataFrame.
161
+ # terms["auc_score"] = pd.Series(auc_scores)
162
+ # terms.drop(columns=["hash"], inplace=True)
163
+ # dsave(terms, "pra_percomplex", dataset_name)
164
+ # log.done(f"Per-complex PRA completed.")
165
+ # return terms
166
+
167
+ # OLD
168
+ # def pra_percomplex(dataset_name, matrix, is_corr=False):
169
+ # log.started(f"*** Per-complex PRA started for {dataset_name} ***")
170
+ # config = dload("config")
171
+ # terms = dload("tmp", "terms")
172
+ # genes_present = dload("tmp", "genes_present_in_terms")
173
+ # sorting = dload("input", "sorting")
174
+ # sort_order = sorting.get(dataset_name, "high")
175
+
176
+ # if not is_corr:
177
+ # matrix = perform_corr(matrix, "numpy")
178
+ # matrix = filter_matrix_by_genes(matrix, genes_present)
179
+ # log.info(f"Matrix shape: {matrix.shape}")
180
+ # df = binary(matrix)
181
+ # log.info(f"Pair-wise shape: {df.shape}")
182
+ # df = quick_sort(df, ascending=(sort_order == "low"))
183
+ # # Precompute gene → row indices
184
+ # gene_to_rows = {}
185
+ # for i, (g1, g2) in enumerate(zip(df["gene1"], df["gene2"])):
186
+ # gene_to_rows.setdefault(g1, []).append(i)
187
+ # gene_to_rows.setdefault(g2, []).append(i)
188
+ # aucs = np.full(len(terms), np.nan)
189
+ # N = len(df)
190
+ # for idx, row in tqdm(terms.iterrows()):
191
+ # genes = set(row.used_genes)
192
+ # if len(genes) < config["min_complex_size_for_percomplex"]: # Skip small complexes
193
+ # continue
194
+ # # Get all row indices where either gene is in the complex
195
+ # candidate_idxs = set()
196
+ # for g in genes:
197
+ # candidate_idxs.update(gene_to_rows.get(g, []))
198
+ # candidate_idxs = sorted(candidate_idxs)
199
+ # if not candidate_idxs:
200
+ # continue
201
+ # # Use only relevant rows for prediction
202
+ # sub = df.loc[candidate_idxs]
203
+ # preds = (sub["gene1"].isin(genes) & sub["gene2"].isin(genes)).astype(int)
204
+ # if preds.sum() == 0:
205
+ # continue
206
+ # tp = preds.cumsum()
207
+ # prec = tp / (np.arange(len(preds)) + 1)
208
+ # recall = tp / tp.iloc[-1]
209
+ # if len(recall) < 2 or recall.iloc[-1] == 0:
210
+ # continue
211
+ # aucs[idx] = metrics.auc(recall, prec)
212
+ # terms["auc_score"] = aucs
213
+ # terms.drop(columns=["list", "set", "hash"], inplace=True)
214
+ # dsave(terms, "pra_percomplex", dataset_name)
215
+ # log.done(f"Per-complex PRA completed.")
216
+ # return terms
217
+
218
+ # Version without the greedy selection logic
219
+ # def complex_contributions(name):
220
+ # log.info(f"Computing complex contributions for dataset: {name}")
221
+
222
+ # pra = dload("pra", name)
223
+ # terms = dload("tmp", "terms")
224
+ # d = pra.query('prediction == 1').drop(columns=['gene1', 'gene2'])
225
+ # results = {}
226
+ # thresholds = [round(i, 2) for i in np.arange(1, 0.0001, -0.025)]
227
+ # for cid in terms.ID.to_list():
228
+ # arr = []
229
+ # for threshold in thresholds:
230
+ # r = d[d.complex_id == cid].query('precision >= @threshold')
231
+ # arr.append(r.shape[0])
232
+ # results[cid] = arr
233
+
234
+ # r = pd.DataFrame(results, index=thresholds).T
235
+ # t = terms[['ID', 'Name']].set_index('ID')
236
+ # r['Name'] = r.index.map(t.Name)
237
+ # r = r[list(reversed(list(r.columns)))]
238
+ # r = r.reset_index(drop=True)
239
+ # dsave(r, "complex_contributions", name)
240
+ # log.info(f"Complex contributions computation completed for dataset: {name}")
241
+ # return r
242
+
243
+ # # new
244
+ # def complex_contributions(name):
245
+ # log.info(f"Computing complex contributions using R-style greedy logic for dataset: {name}")
246
+ # pra = dload("pra", name)
247
+ # terms = dload("common", "terms")
248
+
249
+ # # Ensure pra is sorted by score descending
250
+ # pra = pra.sort_values(by='score', ascending=False).reset_index(drop=True)
251
+
252
+ # # Compute cumulative TP, rank, and precision (recomputed unconditionally)
253
+ # pra['cumTP'] = pra['prediction'].cumsum()
254
+ # pra['rank'] = pra.index + 1
255
+ # pra['precision'] = pra['cumTP'] / pra['rank']
256
+
257
+ # # R-style precision thresholds
258
+ # prec_min = pra['precision'].min()
259
+ # prec_max = pra['precision'].max()
260
+ # precision_cutoffs = [round(prec_min, 3)]
261
+ # cutoffs_range = np.arange(0.1, prec_max + 0.001, 0.025)
262
+ # precision_cutoffs += [round(t, 3) for t in cutoffs_range if t > prec_min]
263
+ # thresholds = sorted(set(precision_cutoffs)) # Ensure unique and sorted
264
+
265
+ # results = {}
266
+ # for t in thresholds:
267
+ # if pra['precision'].max() < t:
268
+ # continue
269
+ # cand = pra[pra['precision'] >= t]
270
+ # if cand.empty:
271
+ # continue
272
+ # k = cand.index.max() # rightmost index where precision >= t
273
+ # tp_target = pra.loc[k, 'cumTP']
274
+ # # Find the smallest m where cumTP[m] >= tp_target
275
+ # ind = pra[pra['cumTP'] >= tp_target].index.min()
276
+ # if pd.isna(ind):
277
+ # continue
278
+ # # Select top (ind+1) rows
279
+ # tmp = pra.iloc[0:ind + 1].copy()
280
+ # # Keep only the rows flagged as positives (prediction == 1)
281
+ # tmp = tmp[tmp['prediction'] == 1]
282
+ # tmp = tmp[tmp["complex_id"].notnull()]
283
+ # tmp["ID"] = tmp["complex_id"].apply(lambda ids: ";".join(str(int(i)) for i in ids if pd.notnull(i)))
284
+ # # Now greedy logic
285
+ # final_contrib = {}
286
+ # while not tmp.empty:
287
+ # all_ids = tmp["ID"].str.split(";").explode()
288
+ # contrib = all_ids.value_counts()
289
+ # if contrib.empty:
290
+ # break
291
+ # top_id = contrib.idxmax()
292
+ # final_contrib[top_id] = contrib[top_id]
293
+ # tmp = tmp[~tmp["ID"].str.contains(rf"\b{top_id}\b", regex=True)]
294
+ # for cid, count in final_contrib.items():
295
+ # if cid not in results:
296
+ # results[cid] = [0] * len(thresholds)
297
+ # results[cid][thresholds.index(t)] = count
298
+
299
+ # # Add back gold standard complexes with 0 contribution
300
+ # gold_ids = set(terms.index.astype(str))
301
+ # all_ids = set(results.keys())
302
+ # missing_ids = gold_ids - all_ids
303
+ # for cid in missing_ids:
304
+ # results[cid] = [0] * len(thresholds)
305
+
306
+ # # Build result DataFrame
307
+ # r = pd.DataFrame(results, index=thresholds).T
308
+ # r['Name'] = r.index.astype(int).map(terms['Name'])
309
+ # r = r[['Name'] + [c for c in r.columns if c != 'Name']] # Name as first col
310
+ # r = r[(r.drop(columns="Name").sum(axis=1) > 0)]
311
+ # # TODO: move ID to the first column (it appears to be left as the index here), keep Name second, then the precision columns in order
312
+ # dsave(r, "complex_contributions", name)
313
+ # log.info(f"Greedy R-style complex contribution completed for dataset: {name}")
314
+ # return r
315
+
316
+ # def pra(dataset_name, matrix, is_corr=False):
317
+ # log.info(f"******************** {dataset_name} ********************")
318
+ # log.started(f"** Global Precision-Recall Analysis - {dataset_name} **")
319
+ # config = dload("config")
320
+
321
+ # terms_data = dload("tmp", "terms")
322
+ # if terms_data is None or not isinstance(terms_data, pd.DataFrame):
323
+ # raise ValueError("Expected 'terms' to be a DataFrame, but got None or invalid type.")
324
+ # terms = terms_data
325
+ # genes_present = dload("tmp", "genes_present_in_terms")
326
+ # sorting = dload("input", "sorting")
327
+ # sort_order = sorting.get(dataset_name, "high")
328
+
329
+ # if not is_corr:
330
+ # matrix = perform_corr(matrix, config.get("corr_function"))
331
+
332
+ # matrix = filter_matrix_by_genes(matrix, genes_present)
333
+
334
+ # log.info(f"Matrix shape: {matrix.shape}")
335
+ # df = binary(matrix)
336
+ # log.info(f"Pair-wise shape: {df.shape}")
337
+ # df = quick_sort(df, ascending=(sort_order == "low"))
338
+
339
+ # gold_pair_to_complex = defaultdict(list)
340
+ # for idx, row in terms.iterrows():
341
+ # genes = row.used_genes
342
+ # if len(genes) < 2:
343
+ # continue
344
+ # for i, g1 in enumerate(genes):
345
+ # for g2 in genes[i + 1:]:
346
+ # pair = tuple(sorted((g1, g2)))
347
+ # gold_pair_to_complex[pair].append(idx)
348
+
349
+ # # Label predictions and complex IDs
350
+ # complex_ids = []
351
+ # predictions = []
352
+ # for g1, g2 in zip(df["gene1"], df["gene2"]):
353
+ # pair = tuple(sorted((g1, g2)))
354
+ # ids = gold_pair_to_complex.get(pair, [])
355
+ # if ids:
356
+ # predictions.append(1)
357
+ # complex_ids.append(ids)
358
+ # else:
359
+ # predictions.append(0)
360
+ # complex_ids.append([])
361
+
362
+ # df["prediction"] = predictions
363
+ # df["complex_id"] = complex_ids
364
+
365
+ # if df["prediction"].sum() == 0:
366
+ # log.info("No true positives found in dataset.")
367
+ # pr_auc = np.nan
368
+ # else:
369
+ # tp = df["prediction"].cumsum()
370
+ # df["tp"] = tp
371
+ # precision = tp / (np.arange(len(df)) + 1)
372
+ # recall = tp / tp.iloc[-1]
373
+ # pr_auc = metrics.auc(recall, precision)
374
+ # df["precision"] = precision
375
+ # df["recall"] = recall
376
+
377
+ # log.info(f"PR-AUC: {pr_auc:.4f}, Number of true positives: {df['prediction'].sum()}")
378
+ # dsave(df, "pra", dataset_name)
379
+ # dsave(pr_auc, "pr_auc", dataset_name)
380
+ # log.done(f"Global PRA completed for {dataset_name}")
381
+ # return df, pr_auc
382
+
383
+ # def compute_pra(df):
384
+ # log.info("Calculating precision-recall and AUC score.")
385
+ # if df.empty:
386
+ # log.warning("Empty DataFrame encountered in compute_pra. Returning empty DataFrame.")
387
+ # return df
388
+ # df["tp"] = df["prediction"].cumsum()
389
+ # df.reset_index(drop=True, inplace=True)
390
+ # df["precision"] = df["tp"] / (df.index + 1)
391
+ # df["recall"] = df["tp"] / df["tp"].iloc[-1]
392
+ # log.info("DONE: Calculating precision-recall AUC score.")
393
+ # return df
394
+
395
+ # def pra(dataset_name, matrix, is_corr=False):
396
+ # log.info(f"PRA computation started for {dataset_name}.")
397
+ # genes_present_in_terms = dload("tmp", "genes_present_in_terms")
398
+ # #terms_hash_table = dload("tmp", "terms_hash_table")
399
+ # sorting_prefs = dload("input", "sorting")
400
+ # sort_order = sorting_prefs.get(dataset_name, "high")
401
+ # if not is_corr: matrix = perform_corr(matrix, "numpy")
402
+ # matrix = filter_matrix_by_genes(matrix, genes_present_in_terms)
403
+ # stack = binary(matrix)
404
+
405
+ # log.info("Checking gene pairs against the gold standard.")
406
+ # gene_pairs = list(zip(stack["gene1"], stack["gene2"]))
407
+ # hashed_pairs = [hash(pair) for pair in gene_pairs]
408
+ # stack["complex_id"] = [terms_hash_table.get(h, 0) for h in hashed_pairs]
409
+ # stack["prediction"] = [1 if h in terms_hash_table else 0 for h in hashed_pairs]
410
+
411
+ # annotated = stack.copy()
412
+ # if sort_order == "low":
413
+ # ann_sorted = quick_sort(annotated, ascending=True)
414
+ # else:
415
+ # ann_sorted = quick_sort(annotated)
416
+
417
+ # pra = compute_pra(ann_sorted)
418
+ # pr_auc = metrics.auc(pra.recall, pra.precision)
419
+ # dsave(pra, "pra", dataset_name)
420
+ # dsave(pr_auc, "pr_auc", dataset_name)
421
+ # log.info(f"PRA computation completed for {dataset_name} (Sorting: {sort_order}).")
422
+ # return pra, pr_auc