pythonflex 0.3.4__py3-none-any.whl → 0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,347 @@
1
+ """Run five no-mPR runtime benchmarks for each bundled gold standard.
2
+
3
+ Run from any directory with:
4
+ python path/to/src/pythonflex/examples/runtime_benchmark_repeated.py
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import os
10
+ import sys
11
+ import argparse
12
+ from datetime import datetime
13
+ from pathlib import Path
14
+ from time import perf_counter
15
+ from typing import Any, Callable
16
+
17
+ import pandas as pd
18
+
19
+
20
+ PROJECT_ROOT = Path(__file__).resolve().parents[3]
21
+ SRC_ROOT = PROJECT_ROOT / "src"
22
+ if str(SRC_ROOT) not in sys.path:
23
+ sys.path.insert(0, str(SRC_ROOT))
24
+
25
+ # Plot generation is benchmarked and saved without opening interactive windows.
26
+ os.environ.setdefault("MPLBACKEND", "Agg")
27
+
28
+ import pythonflex as flex # noqa: E402
29
+
30
+
31
# Location of the gene-effect CSV used as the benchmark input.  The fallback
# is the original machine-specific path; set the PYTHONFLEX_GENE_EFFECT_PATH
# environment variable to point at a local copy instead of editing this file.
# An unset or empty variable falls back to the built-in default.
GENE_EFFECT_PATH = Path(
    os.environ.get("PYTHONFLEX_GENE_EFFECT_PATH")
    or "C:/Users/yd/Desktop/projects/_datasets/depmap/25Q2/gene_effect.csv"
)
34
# Number of repetitions per gold standard; also the upper bound accepted for
# --end-repetition.
RUN_COUNT = 5
# Gold standards that may be selected with --gold-standards.
GOLD_STANDARDS = ("CORUM", "PATHWAY", "GOBP")
# Fresh output folder under <project root>/output.  The timestamp is taken
# once, when this module is loaded.
DEFAULT_BENCHMARK_ROOT = PROJECT_ROOT.joinpath(
    "output",
    f"runtime_benchmark_{RUN_COUNT}_runs_no_mpr_{datetime.now():%Y%m%d_%H%M%S}",
)
# Active output root; main() rebinds it when --output-root is supplied.
BENCHMARK_ROOT = DEFAULT_BENCHMARK_ROOT
42
+
43
+
44
def build_config(gold_standard: str, output_folder: Path) -> dict[str, Any]:
    """Assemble the configuration dictionary for one benchmark run.

    Args:
        gold_standard: Value stored under the ``"gold_standard"`` key.
        output_folder: Destination folder; stored as a string under the
            ``"output_folder"`` key.

    Returns:
        A freshly built dictionary. Every setting other than the two
        arguments is a fixed constant.
    """
    plot_options = {"save_plot": True, "show_plot": False, "output_type": "png"}
    preprocess_options = {"fill_na": True, "normalize": False}
    log_options = {"visible_levels": ["DONE", "INFO", "WARNING"]}

    config: dict[str, Any] = {
        "min_genes_in_complex": 2,
        "min_genes_per_complex_analysis": 3,
    }
    # The only two entries that vary between runs.
    config["output_folder"] = str(output_folder)
    config["gold_standard"] = gold_standard
    config.update(
        {
            "color_map": "RdYlBu",
            "jaccard": True,
            "analysis_genes": "common",
            "plotting": plot_options,
            "preprocessing": preprocess_options,
            "corr_function": "numpy",
            "logging": log_options,
        }
    )
    return config
67
+
68
+
69
def timed_call(
    timings: list[dict[str, Any]],
    gold_standard: str,
    repetition: int,
    step: str,
    operation: Callable[[], Any],
) -> Any:
    """Run ``operation`` once and record how long it took.

    Args:
        timings: List that receives one new row describing this call.
        gold_standard: Stored in the row's ``"gold_standard"`` field.
        repetition: Stored in the row's ``"repetition"`` field.
        step: Stored in the row's ``"step"`` field.
        operation: Zero-argument callable to execute and time.

    Returns:
        Whatever ``operation`` returns.

    If ``operation`` raises, the exception propagates and no row is added.
    """
    started_at = perf_counter()
    outcome = operation()
    elapsed = perf_counter() - started_at

    row = {
        "gold_standard": gold_standard,
        "repetition": repetition,
        "step": step,
        "seconds": elapsed,
    }
    timings.append(row)
    return outcome
87
+
88
+
89
def run_repetition(
    gold_standard: str,
    repetition: int,
) -> list[dict[str, Any]]:
    """Execute one full no-mPR workflow and time every step.

    The steps are timed individually, in this order: initialize, reading the
    gene-effect CSV, loading datasets, loading the gold standard, ``pra``,
    ``pra_percomplex``, ``complex_contributions`` and the six plotting calls.
    A final ``"total_runtime"`` row covers the whole sequence.

    Args:
        gold_standard: Gold standard name passed to ``build_config``.
        repetition: Repetition number; also used to name the run folder.

    Returns:
        The timing rows of this repetition. The same rows are written to
        ``benchmark_results.csv`` inside the run folder.
    """
    run_dir = BENCHMARK_ROOT / gold_standard / f"run_{repetition:02d}"
    records: list[dict[str, Any]] = []

    def measure(step: str, operation: Callable[[], Any]) -> Any:
        # Every step shares the same bookkeeping arguments.
        return timed_call(records, gold_standard, repetition, step, operation)

    began = perf_counter()

    measure(
        "initialize",
        lambda: flex.initialize(build_config(gold_standard, run_dir)),
    )
    screens = measure(
        "read_gene_effect",
        lambda: pd.read_csv(GENE_EFFECT_PATH, index_col=0),
    )
    dataset_spec = {
        "All screens": {"path": screens, "sort": "high", "color": "#000000"},
    }
    loaded, _ = measure("load_datasets", lambda: flex.load_datasets(dataset_spec))
    measure("load_gold_standard", flex.load_gold_standard)

    # Only the first loaded dataset is analysed.
    name, dataset = next(iter(loaded.items()))
    measure("pra", lambda: flex.pra(name, dataset, is_corr=False))
    measure(
        "pra_percomplex",
        lambda: flex.pra_percomplex(name, dataset, is_corr=False),
    )
    measure("complex_contributions", lambda: flex.complex_contributions(name))

    measure("plot_precision_recall_curve", flex.plot_precision_recall_curve)
    measure("plot_auc_scores", flex.plot_auc_scores)
    measure("plot_significant_complexes", flex.plot_significant_complexes)
    measure(
        "plot_percomplex_scatter",
        lambda: flex.plot_percomplex_scatter(n_top=20),
    )
    measure("plot_percomplex_scatter_bysize", flex.plot_percomplex_scatter_bysize)
    measure("plot_complex_contributions", flex.plot_complex_contributions)

    total_row = {
        "gold_standard": gold_standard,
        "repetition": repetition,
        "step": "total_runtime",
        "seconds": perf_counter() - began,
    }
    records.append(total_row)

    run_dir.mkdir(parents=True, exist_ok=True)
    per_run_csv = run_dir / "benchmark_results.csv"
    pd.DataFrame(records).to_csv(per_run_csv, index=False)
    return records
213
+
214
+
215
def write_reports(timings: list[dict[str, Any]]) -> None:
    """Write the aggregate CSV reports into ``BENCHMARK_ROOT``.

    Three files are produced:

    * ``benchmark_results_all_runs.csv`` - every timing row as given.
    * ``benchmark_summary_mean_std.csv`` - count, mean, std, min and max of
      ``seconds`` per (gold standard, step) pair.
    * ``benchmark_total_by_repetition.csv`` - the ``total_runtime`` seconds
      summed per repetition; written only when such rows exist.

    Args:
        timings: Timing rows with ``gold_standard``, ``repetition``, ``step``
            and ``seconds`` fields.
    """
    frame = pd.DataFrame(timings)
    frame.to_csv(BENCHMARK_ROOT / "benchmark_results_all_runs.csv", index=False)

    seconds_by_step = frame.groupby(["gold_standard", "step"], as_index=False)[
        "seconds"
    ]
    stats = seconds_by_step.agg(
        repetitions="count",
        mean_seconds="mean",
        std_seconds="std",
        min_seconds="min",
        max_seconds="max",
    )
    stats.to_csv(BENCHMARK_ROOT / "benchmark_summary_mean_std.csv", index=False)

    totals = frame[frame["step"] == "total_runtime"]
    if totals.empty:
        return

    summed = totals.groupby("repetition", as_index=False)["seconds"].sum()
    summed = summed.rename(columns={"seconds": "all_gold_standards_seconds"})
    summed.to_csv(BENCHMARK_ROOT / "benchmark_total_by_repetition.csv", index=False)
242
+
243
+
244
def parse_gold_standards(value: str) -> list[str]:
    """Parse a comma-separated gold-standard selection.

    Items are stripped of surrounding whitespace and upper-cased; blank
    items are ignored and repeated names are collapsed to their first
    occurrence.

    Args:
        value: Comma-separated names, e.g. ``"corum, GOBP"``.

    Returns:
        The selected names, upper-cased, in first-seen order.

    Raises:
        ValueError: If a name is not in ``GOLD_STANDARDS``, or if nothing is
            selected (an empty selection would otherwise make the benchmark
            a silent no-op).
    """
    requested = [item.strip().upper() for item in value.split(",") if item.strip()]
    unknown = [item for item in requested if item not in GOLD_STANDARDS]
    if unknown:
        raise ValueError(
            f"Invalid gold standard(s): {unknown}. Choose from {GOLD_STANDARDS}."
        )
    if not requested:
        raise ValueError(f"No gold standard selected. Choose from {GOLD_STANDARDS}.")
    # dict.fromkeys drops repeats while preserving the order of first use.
    return list(dict.fromkeys(requested))
252
+
253
+
254
def load_existing_timings() -> list[dict[str, Any]]:
    """Return timing rows already stored under ``BENCHMARK_ROOT``.

    Returns:
        The rows of ``benchmark_results_all_runs.csv`` as a list of
        dictionaries, or an empty list when that file does not exist.
    """
    previous_results = BENCHMARK_ROOT / "benchmark_results_all_runs.csv"
    if previous_results.exists():
        return pd.read_csv(previous_results).to_dict("records")
    return []
259
+
260
+
261
def has_completed_run(
    timings: list[dict[str, Any]],
    gold_standard: str,
    repetition: int,
) -> bool:
    """Tell whether a repetition already has a ``total_runtime`` row.

    Args:
        timings: Timing rows to search.
        gold_standard: Gold standard name to match.
        repetition: Repetition number to match; each row's value is
            converted with ``int`` before comparing.

    Returns:
        ``True`` if some row matches the gold standard and repetition and
        has ``step == "total_runtime"``; ``False`` otherwise.
    """
    for row in timings:
        if row.get("gold_standard") != gold_standard:
            continue
        # Rows without a repetition fall back to -1.
        if int(row.get("repetition", -1)) != repetition:
            continue
        if row.get("step") == "total_runtime":
            return True
    return False
272
+
273
+
274
def parse_args() -> argparse.Namespace:
    """Parse the benchmark's command-line options.

    Returns:
        A namespace with ``output_root`` (``Path`` or ``None``),
        ``gold_standards`` (comma-separated string), ``start_repetition``
        and ``end_repetition`` (integers).
    """
    summary = (
        "Run repeated no-mPR pythonFLEX runtime benchmarks. "
        "Use --output-root to resume or add runs to an existing output folder."
    )
    cli = argparse.ArgumentParser(description=summary)

    # Flag name paired with the keyword arguments for add_argument.
    options: tuple[tuple[str, dict[str, Any]], ...] = (
        (
            "--output-root",
            {
                "type": Path,
                "default": None,
                "help": (
                    "Existing or new benchmark root. "
                    "Defaults to a new timestamped folder."
                ),
            },
        ),
        (
            "--gold-standards",
            {
                "default": ",".join(GOLD_STANDARDS),
                "help": "Comma-separated subset of CORUM, PATHWAY, GOBP.",
            },
        ),
        (
            "--start-repetition",
            {
                "type": int,
                "default": 1,
                "help": "First repetition number to run.",
            },
        ),
        (
            "--end-repetition",
            {
                "type": int,
                "default": RUN_COUNT,
                "help": "Last repetition number to run.",
            },
        ),
    )
    for flag, settings in options:
        cli.add_argument(flag, **settings)

    return cli.parse_args()
305
+
306
+
307
def main() -> None:
    """Run the selected benchmark repetitions and write the reports.

    Command-line arguments are parsed and validated first, so ``--help`` and
    argument errors work even on machines where the input dataset is
    missing. Repetitions that already have a ``total_runtime`` row in the
    output root are skipped, which lets an interrupted benchmark be resumed
    with ``--output-root``.

    Raises:
        ValueError: If the gold-standard selection or the repetition range
            is invalid.
        FileNotFoundError: If ``GENE_EFFECT_PATH`` does not exist.
        FileExistsError: If the default (timestamped) output folder already
            exists.
    """
    global BENCHMARK_ROOT

    args = parse_args()
    selected_standards = parse_gold_standards(args.gold_standards)
    if args.start_repetition < 1 or args.end_repetition < args.start_repetition:
        raise ValueError("Invalid repetition range.")
    if args.end_repetition > RUN_COUNT:
        raise ValueError(f"end-repetition cannot exceed RUN_COUNT={RUN_COUNT}.")

    if not GENE_EFFECT_PATH.exists():
        raise FileNotFoundError(f"Input dataset was not found: {GENE_EFFECT_PATH}")

    if args.output_root is not None:
        BENCHMARK_ROOT = args.output_root.resolve()

    # An explicitly chosen root may already exist (resume); the default
    # timestamped folder is expected to be new.
    BENCHMARK_ROOT.mkdir(parents=True, exist_ok=args.output_root is not None)
    all_timings = load_existing_timings()
    print(f"Benchmark output folder: {BENCHMARK_ROOT}")
    print("mPR preparation is excluded from this repeated benchmark.")

    for gold_standard in selected_standards:
        for repetition in range(args.start_repetition, args.end_repetition + 1):
            if has_completed_run(all_timings, gold_standard, repetition):
                print(
                    f"Skipping completed no-mPR benchmark: {gold_standard} "
                    f"repetition {repetition}/{RUN_COUNT}"
                )
                continue
            print(
                f"Running no-mPR benchmark: {gold_standard} "
                f"repetition {repetition}/{RUN_COUNT}"
            )
            all_timings.extend(run_repetition(gold_standard, repetition))
            # Rewrite the reports after every run so that an interrupted
            # benchmark leaves resumable results on disk.
            write_reports(all_timings)

    summary_path = BENCHMARK_ROOT / "benchmark_summary_mean_std.csv"
    if summary_path.exists():
        print(f"Benchmark summary saved to: {summary_path}")
    else:
        # Nothing ran and no earlier results were present.
        print("No timings were recorded, so no benchmark summary was written.")


if __name__ == "__main__":
    main()