pythonflex 0.3.4__py3-none-any.whl → 0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,417 @@
1
+ """Benchmark GOBP runtime optimization strategies.
2
+
3
+ Run from any directory with:
4
+ python path/to/src/pythonflex/examples/runtime_benchmark_gobp_optimization.py
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import gc
10
+ import os
11
+ import sys
12
+ from datetime import datetime
13
+ from pathlib import Path
14
+ from time import perf_counter
15
+ from typing import Any, Callable
16
+
17
+ import pandas as pd
18
+
19
+
20
# Resolve the checkout root from this file's location (the fourth ancestor
# directory, i.e. the folder that contains ``src/`` when the file lives in
# ``src/pythonflex/examples``) and put the in-tree ``src`` folder first on the
# import path so the ``pythonflex`` import below resolves to the working copy.
PROJECT_ROOT = Path(__file__).resolve().parents[3]
SRC_ROOT = PROJECT_ROOT.joinpath("src")
if not any(entry == str(SRC_ROOT) for entry in sys.path):
    sys.path.insert(0, str(SRC_ROOT))

# Only supply a matplotlib backend when the caller has not chosen one already.
if "MPLBACKEND" not in os.environ:
    os.environ["MPLBACKEND"] = "Agg"
26
+
27
+ import pythonflex as flex # noqa: E402
28
+ from pythonflex import analysis as flex_analysis # noqa: E402
29
+
30
+
31
# Printed by main() at start-up; presumably the commit the benchmark was
# recorded against -- TODO confirm.
CHECKPOINT_COMMIT = "33b8ae8"

# Input matrix read with ``pd.read_csv(..., index_col=0)``.  The default is the
# original author's local path; set the PYTHONFLEX_GENE_EFFECT_PATH environment
# variable to point the benchmark at a copy of the file on another machine
# without editing this script.  An empty variable falls back to the default.
GENE_EFFECT_PATH = Path(
    os.environ.get("PYTHONFLEX_GENE_EFFECT_PATH")
    or "C:/Users/yd/Desktop/projects/_datasets/depmap/25Q2/gene_effect.csv"
)

# Values forwarded into the pythonflex configuration built by build_config().
GOLD_STANDARD = "GOBP"
CORR_FUNCTION = "numpy_without_mask"
MAX_NBYTES = "100M"

# (n_jobs, chunk_size) pairs benchmarked for every strategy, in this order.
COMBINATIONS = ((8, 200), (8, 400), (8, 100))
# (n_jobs, chunk_size) pair retried by main() when an n_jobs=8 run fails.
FALLBACK_COMBINATION = (4, 200)

# Strategy names understood by run_strategy_step().
STRATEGIES = (
    "baseline_memmap",
    "no_memmap_threading",
    "worker_globals_threading",
    "shared_pairwise_reuse",
)

# Per-invocation output directory; the timestamp is taken once at import time.
BENCHMARK_ROOT = (
    PROJECT_ROOT
    / "output"
    / f"runtime_benchmark_gobp_optimization_{datetime.now():%Y%m%d_%H%M%S}"
)
51
+
52
+
53
def build_config(output_folder: Path, n_jobs: int, chunk_size: int) -> dict[str, Any]:
    """Build the configuration dictionary for one benchmark run.

    Args:
        output_folder: Directory for this run; stored as a string.
        n_jobs: Value stored under ``per_complex.n_jobs``.
        chunk_size: Value stored under ``per_complex.chunk_size``.

    Returns:
        A fresh nested dictionary.  Everything except the three arguments is
        fixed or taken from the module-level constants ``GOLD_STANDARD``,
        ``CORR_FUNCTION`` and ``MAX_NBYTES``.
    """
    plotting_options = {
        "save_plot": True,
        "show_plot": False,
        "output_type": "png",
    }
    preprocessing_options = {
        "fill_na": True,
        "normalize": False,
    }
    per_complex_options = {
        "n_jobs": n_jobs,
        "chunk_size": chunk_size,
        "max_nbytes": MAX_NBYTES,
    }
    logging_options = {
        "visible_levels": ["DONE", "INFO", "WARNING", "ERROR"],
    }

    config: dict[str, Any] = {}
    config["min_genes_in_complex"] = 2
    config["min_genes_per_complex_analysis"] = 3
    config["output_folder"] = str(output_folder)
    config["gold_standard"] = GOLD_STANDARD
    config["color_map"] = "RdYlBu"
    config["jaccard"] = True
    config["analysis_genes"] = "common"
    config["plotting"] = plotting_options
    config["preprocessing"] = preprocessing_options
    config["corr_function"] = CORR_FUNCTION
    config["per_complex"] = per_complex_options
    config["logging"] = logging_options
    return config
81
+
82
+
83
def timed_call(
    timings: list[dict[str, Any]],
    strategy: str,
    n_jobs: int,
    chunk_size: int,
    fallback: bool,
    step: str,
    operation: Callable[[], Any],
    *,
    is_corr: bool | None = None,
) -> Any:
    """Run ``operation`` and append one timing row for it to ``timings``.

    Args:
        timings: List that receives the new row (mutated in place).
        strategy: Strategy label copied into the row.
        n_jobs: Worker count copied into the row.
        chunk_size: Chunk size copied into the row.
        fallback: Whether the row belongs to a fallback run.
        step: Name of the step being measured.
        operation: Zero-argument callable to execute and time.
        is_corr: Optional flag copied into the row unchanged.

    Returns:
        Whatever ``operation`` returns.

    If ``operation`` raises, the exception propagates and no row is appended;
    only successful calls are recorded here, always with status ``"ok"``.
    """
    started_at = perf_counter()
    outcome = operation()
    elapsed = perf_counter() - started_at

    row: dict[str, Any] = dict(
        gold_standard=GOLD_STANDARD,
        strategy=strategy,
        n_jobs=n_jobs,
        chunk_size=chunk_size,
        fallback=fallback,
        step=step,
        seconds=elapsed,
        corr_function=CORR_FUNCTION,
        is_corr=is_corr,
        status="ok",
        error="",
    )
    timings.append(row)
    return outcome
112
+
113
+
114
def run_strategy_step(
    timings: list[dict[str, Any]],
    strategy: str,
    n_jobs: int,
    chunk_size: int,
    fallback: bool,
    name: str,
    corr: pd.DataFrame,
) -> None:
    """Time the global PRA step and the per-complex PRA step for one strategy.

    Two rows are appended to ``timings`` through ``timed_call``: first
    ``"pra_is_corr_true"``, then ``"pra_percomplex_is_corr_true"``, both
    recorded with ``is_corr=True``.  Which pythonflex entry points are called
    depends on ``strategy``.

    Args:
        timings: List that receives the timing rows.
        strategy: One of the names listed in ``STRATEGIES``.
        n_jobs: Worker count forwarded to the per-complex call.
        chunk_size: Chunk size forwarded to the per-complex call.
        fallback: Whether this is a fallback run (recorded only).
        name: Dataset name passed to the pythonflex calls.
        corr: Correlation frame passed to the pythonflex calls.

    Raises:
        ValueError: If ``strategy`` is not a known strategy name; nothing is
            timed in that case.
    """
    recognised = (
        "baseline_memmap",
        "no_memmap_threading",
        "worker_globals_threading",
        "shared_pairwise_reuse",
    )
    if strategy not in recognised:
        raise ValueError(f"Unknown strategy: {strategy}")

    def record(step: str, operation: Any) -> Any:
        # Every step in this function shares the same bookkeeping arguments.
        return timed_call(
            timings,
            strategy,
            n_jobs,
            chunk_size,
            fallback,
            step,
            operation,
            is_corr=True,
        )

    if strategy == "shared_pairwise_reuse":
        # The pairwise data prepared during the first step is stashed here so
        # the second step can reuse it instead of preparing it again.
        shared: dict[str, Any] = {}

        def global_pra() -> pd.DataFrame:
            prepared = flex_analysis._prepare_pairwise_for_analysis(
                name,
                corr,
                is_corr=True,
                build_gene_index=True,
            )
            terms, pairwise_df, gene_to_pair_indices = prepared
            shared["terms"] = terms
            shared["pairwise_df"] = pairwise_df
            shared["gene_to_pair_indices"] = gene_to_pair_indices
            return flex_analysis._save_global_pra_from_pairwise(name, pairwise_df)

        def per_complex_pra() -> Any:
            return flex_analysis._pra_percomplex_from_pairwise(
                name,
                shared["terms"],
                shared["pairwise_df"],
                shared["gene_to_pair_indices"],
                chunk_size=chunk_size,
                n_jobs=n_jobs,
                strategy="shared_pairwise_reuse",
            )

    else:

        def global_pra() -> Any:
            return flex.pra(name, corr, is_corr=True)

        if strategy == "baseline_memmap":

            def per_complex_pra() -> Any:
                return flex.pra_percomplex(
                    name,
                    corr,
                    is_corr=True,
                    chunk_size=chunk_size,
                    n_jobs=n_jobs,
                )

        else:

            def per_complex_pra() -> Any:
                return flex_analysis._pra_percomplex_benchmark_strategy(
                    name,
                    corr,
                    is_corr=True,
                    chunk_size=chunk_size,
                    n_jobs=n_jobs,
                    strategy=strategy,
                )

    record("pra_is_corr_true", global_pra)
    record("pra_percomplex_is_corr_true", per_complex_pra)
223
+
224
+
225
def run_combination(
    strategy: str,
    n_jobs: int,
    chunk_size: int,
    *,
    fallback: bool = False,
) -> list[dict[str, Any]]:
    """Run the whole timed workflow once for one strategy/parameter pair.

    The steps are: initialize pythonflex, read the gene-effect CSV, load the
    datasets and the gold standard, compute correlations, run the two
    strategy-specific PRA steps, and compute complex contributions.  Each step
    adds a row via ``timed_call``.

    Any exception is caught, reported on stderr and recorded as a ``"failed"``
    row, so the caller can carry on with the next combination.  A
    ``"total_runtime"`` row is always appended last, and the rows are written
    to ``benchmark_results.csv`` inside the run's output folder.

    Args:
        strategy: Strategy name handed to ``run_strategy_step``.
        n_jobs: Worker count for the run.
        chunk_size: Chunk size for the run.
        fallback: Marks the run as a fallback; also prefixes the folder name.

    Returns:
        The list of timing rows collected for this run.
    """
    run_label = f"n_jobs_{n_jobs:02d}_chunk_{chunk_size}"
    suffix = f"fallback_{run_label}" if fallback else run_label
    output_folder = BENCHMARK_ROOT / strategy / suffix
    timings: list[dict[str, Any]] = []
    workflow_start = perf_counter()

    def record(step: str, operation: Any) -> Any:
        # Shorthand for timed_call with this run's fixed bookkeeping values.
        return timed_call(timings, strategy, n_jobs, chunk_size, fallback, step, operation)

    def bookkeeping_row(step: str, status: str, error: str) -> dict[str, Any]:
        # Row measured from the start of the workflow rather than one step.
        return {
            "gold_standard": GOLD_STANDARD,
            "strategy": strategy,
            "n_jobs": n_jobs,
            "chunk_size": chunk_size,
            "fallback": fallback,
            "step": step,
            "seconds": perf_counter() - workflow_start,
            "corr_function": CORR_FUNCTION,
            "is_corr": None,
            "status": status,
            "error": error,
        }

    try:
        record(
            "initialize",
            lambda: flex.initialize(build_config(output_folder, n_jobs, chunk_size)),
        )
        gene_effect = record(
            "read_gene_effect",
            lambda: pd.read_csv(GENE_EFFECT_PATH, index_col=0),
        )
        inputs = {
            "All screens": {
                "path": gene_effect,
                "sort": "high",
                "color": "#000000",
            },
        }
        data, _ = record("load_datasets", lambda: flex.load_datasets(inputs))
        record("load_gold_standard", flex.load_gold_standard)

        # Only one dataset is registered above, so take the first entry.
        name, dataset = next(iter(data.items()))
        corr = record(
            "perform_corr",
            lambda: flex.perform_corr(dataset, CORR_FUNCTION),
        )
        run_strategy_step(timings, strategy, n_jobs, chunk_size, fallback, name, corr)
        record("complex_contributions", lambda: flex.complex_contributions(name))
    except Exception as exc:
        print(
            f"Run failed for strategy={strategy}, n_jobs={n_jobs}, "
            f"chunk_size={chunk_size}: {exc!r}",
            file=sys.stderr,
        )
        timings.append(bookkeeping_row("failed", "failed", repr(exc)))
    finally:
        any_failed = any(entry["status"] == "failed" for entry in timings)
        timings.append(
            bookkeeping_row("total_runtime", "failed" if any_failed else "ok", "")
        )
        output_folder.mkdir(parents=True, exist_ok=True)
        results_path = output_folder / "benchmark_results.csv"
        pd.DataFrame(timings).to_csv(results_path, index=False)
        gc.collect()

    return timings
346
+
347
+
348
def write_reports(timings: list[dict[str, Any]]) -> None:
    """Write the raw and the summarised benchmark tables under BENCHMARK_ROOT.

    ``benchmark_gobp_optimization_comparison.csv`` always receives one line per
    timing row.  ``benchmark_gobp_optimization_summary.csv`` is written only
    when at least one row has status ``"ok"``; it holds one line per run
    configuration with one column of seconds per step.

    Args:
        timings: Timing rows as produced by ``timed_call`` and
            ``run_combination``.  May be empty.
    """
    # Create the folder on demand so the function also works when called
    # before (or without) main() having prepared the output directory.
    BENCHMARK_ROOT.mkdir(parents=True, exist_ok=True)

    raw = pd.DataFrame(timings)
    raw.to_csv(BENCHMARK_ROOT / "benchmark_gobp_optimization_comparison.csv", index=False)

    # An empty ``timings`` list produces a frame without any columns, so the
    # status filter below would raise KeyError; there is nothing to summarise.
    if "status" not in raw.columns:
        return

    ok = raw[raw["status"] == "ok"].copy()
    if ok.empty:
        return

    # One line per run configuration, one column per step.  "first" keeps the
    # earliest row should the same configuration/step pair appear twice.
    summary = ok.pivot_table(
        index=[
            "gold_standard",
            "strategy",
            "n_jobs",
            "chunk_size",
            "fallback",
            "corr_function",
        ],
        columns="step",
        values="seconds",
        aggfunc="first",
    ).reset_index()
    summary.columns.name = None
    summary.to_csv(BENCHMARK_ROOT / "benchmark_gobp_optimization_summary.csv", index=False)
371
+
372
+
373
def main() -> None:
    """Benchmark every strategy over every (n_jobs, chunk_size) combination.

    After each run the accumulated rows are flushed with ``write_reports`` so
    partial results survive an interruption.  When an ``n_jobs == 8`` run
    fails, ``FALLBACK_COMBINATION`` is run for that strategy.  The fallback is
    executed at most once per strategy: its parameters never change, so a
    repeat would overwrite the same output folder and add duplicate rows.

    Raises:
        FileNotFoundError: If ``GENE_EFFECT_PATH`` does not exist.
        FileExistsError: If ``BENCHMARK_ROOT`` already exists (raised by
            ``Path.mkdir`` with ``exist_ok=False``).
    """
    if not GENE_EFFECT_PATH.exists():
        raise FileNotFoundError(f"Input dataset was not found: {GENE_EFFECT_PATH}")

    BENCHMARK_ROOT.mkdir(parents=True, exist_ok=False)
    all_timings: list[dict[str, Any]] = []
    print(f"Checkpoint commit: {CHECKPOINT_COMMIT}")
    print(f"Benchmark output folder: {BENCHMARK_ROOT}")
    print("Plot calls, mPR preparation, and plot_mpr_summary are excluded.")

    for strategy in STRATEGIES:
        fallback_done = False
        for n_jobs, chunk_size in COMBINATIONS:
            print(
                f"Running GOBP optimization benchmark: strategy={strategy}, "
                f"n_jobs={n_jobs}, chunk_size={chunk_size}"
            )
            timings = run_combination(strategy, n_jobs, chunk_size)
            all_timings.extend(timings)
            write_reports(all_timings)

            failed = any(row["status"] == "failed" for row in timings)
            if not failed or n_jobs != 8 or fallback_done:
                continue

            # Remember the attempt even if it fails: re-running the identical
            # fallback configuration would not produce new information.
            fallback_done = True
            fallback_n_jobs, fallback_chunk_size = FALLBACK_COMBINATION
            print(
                f"Running fallback for strategy={strategy}: "
                f"n_jobs={fallback_n_jobs}, chunk_size={fallback_chunk_size}"
            )
            all_timings.extend(
                run_combination(
                    strategy,
                    fallback_n_jobs,
                    fallback_chunk_size,
                    fallback=True,
                )
            )
            write_reports(all_timings)

    # write_reports only creates the summary when at least one row succeeded,
    # and BENCHMARK_ROOT is new for this invocation, so existence is reliable.
    summary_path = BENCHMARK_ROOT / "benchmark_gobp_optimization_summary.csv"
    if summary_path.exists():
        print(
            "Benchmark summary saved to: "
            f"{summary_path}"
        )
    else:
        print(
            "No successful benchmark rows were recorded; "
            f"summary was not written to: {summary_path}",
            file=sys.stderr,
        )


if __name__ == "__main__":
    main()