pythonflex 0.3.4__py3-none-any.whl → 0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,534 @@
1
+ """Run 10 core runtime benchmarks for CORUM, PATHWAY, and GOBP.
2
+
3
+ This benchmark uses one in-memory correlation matrix per run and the package's
4
+ memmap + process parallelism path for per-complex PRA.
5
+
6
+ Run from the project root with:
7
+ python src/pythonflex/examples/runtime_benchmark_10_runs_memmap.py
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import argparse
13
+ import os
14
+ import sys
15
+ import traceback
16
+ from datetime import datetime
17
+ from pathlib import Path
18
+ from time import perf_counter
19
+ from typing import Any, Callable
20
+
21
+ import pandas as pd
22
+
23
+
24
# Repository root, derived from this file's location. With the script at
# src/pythonflex/examples/<this file>, parents[3] is the directory that
# contains ``src``.
PROJECT_ROOT = Path(__file__).resolve().parents[3]
SRC_ROOT = PROJECT_ROOT / "src"
# Put the in-repo ``src`` directory first on the import path so the
# ``pythonflex`` import below resolves to the working tree.
if str(SRC_ROOT) not in sys.path:
    sys.path.insert(0, str(SRC_ROOT))

# Select the Agg Matplotlib backend unless the caller already chose one.
# NOTE(review): presumably so plots can be saved without a display - confirm.
os.environ.setdefault("MPLBACKEND", "Agg")
30
+
31
+ import pythonflex as flex # noqa: E402
32
+
33
+
34
# Default input file. NOTE(review): this is a machine-specific absolute path;
# on any other machine pass --gene-effect-path explicitly.
DEFAULT_GENE_EFFECT_PATH = Path(
    "C:/Users/yd/Desktop/projects/_datasets/depmap/25Q2/gene_effect.csv"
)
# Number of repetitions per gold standard when --runs is not given.
DEFAULT_RUN_COUNT = 10
# Gold standards accepted by --gold-standards, in default execution order.
GOLD_STANDARDS = ("CORUM", "PATHWAY", "GOBP")
# Defaults for the per-complex settings placed in the config and recorded in
# every timing row.
DEFAULT_N_JOBS = 8
DEFAULT_CHUNK_SIZE = 400
DEFAULT_MAX_NBYTES = "100M"
DEFAULT_CORR_FUNCTION = "numpy_without_mask"
# Timestamped default output folder. The name is built from the defaults above
# so it cannot drift from them; the timestamp is taken once, at import time.
DEFAULT_BENCHMARK_ROOT = (
    PROJECT_ROOT
    / "output"
    / (
        f"runtime_benchmark_{DEFAULT_RUN_COUNT}_runs_memmap"
        f"_njobs{DEFAULT_N_JOBS}_chunk{DEFAULT_CHUNK_SIZE}"
        f"_{datetime.now():%Y%m%d_%H%M%S}"
    )
)
48
+
49
+
50
def build_config(
    gold_standard: str,
    output_folder: Path,
    corr_function: str,
    n_jobs: int,
    chunk_size: int,
    max_nbytes: str,
) -> dict[str, Any]:
    """Assemble the configuration dictionary for one benchmark run.

    Args:
        gold_standard: Value stored under ``"gold_standard"``.
        output_folder: Run output directory; stored as a string.
        corr_function: Value stored under ``"corr_function"``.
        n_jobs: Stored under ``"per_complex" -> "n_jobs"``.
        chunk_size: Stored under ``"per_complex" -> "chunk_size"``.
        max_nbytes: Stored under ``"per_complex" -> "max_nbytes"``.

    Returns:
        A new nested dictionary; all other entries are fixed benchmark
        settings.
    """
    plotting_section = {
        "save_plot": True,
        "show_plot": False,
        "output_type": "png",
    }
    preprocessing_section = {
        "fill_na": True,
        "normalize": False,
    }
    per_complex_section = {
        "n_jobs": n_jobs,
        "chunk_size": chunk_size,
        "max_nbytes": max_nbytes,
    }
    logging_section = {
        "visible_levels": ["DONE", "INFO", "WARNING", "ERROR"],
    }

    config: dict[str, Any] = {
        "min_genes_in_complex": 2,
        "min_genes_per_complex_analysis": 3,
        "output_folder": str(output_folder),
        "gold_standard": gold_standard,
        "color_map": "RdYlBu",
        "jaccard": True,
        "analysis_genes": "common",
        "plotting": plotting_section,
        "preprocessing": preprocessing_section,
        "corr_function": corr_function,
        "per_complex": per_complex_section,
        "logging": logging_section,
    }
    return config
85
+
86
+
87
def base_row(
    gold_standard: str,
    repetition: int,
    step: str,
    n_jobs: int,
    chunk_size: int,
    max_nbytes: str,
    corr_function: str,
    is_corr: bool | None,
) -> dict[str, Any]:
    """Create a timing record pre-filled with the run's identifying settings.

    The record starts with ``seconds`` unset (``None``), ``status`` set to
    ``"success"`` and an empty ``error``; callers overwrite those fields once
    the measured step has finished.

    Returns:
        A new dictionary with one entry per results-CSV column.
    """
    return dict(
        gold_standard=gold_standard,
        repetition=repetition,
        step=step,
        seconds=None,
        n_jobs=n_jobs,
        chunk_size=chunk_size,
        max_nbytes=max_nbytes,
        corr_function=corr_function,
        is_corr=is_corr,
        status="success",
        error="",
    )
110
+
111
+
112
def timed_call(
    timings: list[dict[str, Any]],
    gold_standard: str,
    repetition: int,
    step: str,
    operation: Callable[[], Any],
    n_jobs: int,
    chunk_size: int,
    max_nbytes: str,
    corr_function: str,
    is_corr: bool | None = None,
) -> Any:
    """Run ``operation`` once and append its wall-clock timing to ``timings``.

    Args:
        timings: List that receives exactly one new record per call.
        gold_standard, repetition, step: Identify the measured step.
        operation: Zero-argument callable to execute and time.
        n_jobs, chunk_size, max_nbytes, corr_function, is_corr: Settings
            copied verbatim into the record.

    Returns:
        Whatever ``operation`` returns.

    Raises:
        Exception: Any exception raised by ``operation`` is re-raised after a
            record with status ``"failed"`` and the one-line error text has
            been appended.
    """
    record = base_row(
        gold_standard,
        repetition,
        step,
        n_jobs,
        chunk_size,
        max_nbytes,
        corr_function,
        is_corr,
    )
    started = perf_counter()
    try:
        outcome = operation()
    except Exception as exc:
        # Stop the clock first so formatting the error is not counted.
        elapsed = perf_counter() - started
        message_lines = traceback.format_exception_only(type(exc), exc)
        record.update(
            seconds=elapsed,
            status="failed",
            error="".join(message_lines).strip(),
        )
        timings.append(record)
        raise
    else:
        record["seconds"] = perf_counter() - started
        timings.append(record)
        return outcome
148
+
149
+
150
def add_total_runtime(
    timings: list[dict[str, Any]],
    gold_standard: str,
    repetition: int,
    seconds: float,
    n_jobs: int,
    chunk_size: int,
    max_nbytes: str,
    corr_function: str,
    status: str,
    error: str = "",
) -> None:
    """Append the ``"total_runtime"`` record for one repetition to ``timings``.

    Args:
        timings: List that receives the new record.
        gold_standard, repetition: Identify the repetition.
        seconds: Elapsed time to store for the whole repetition.
        n_jobs, chunk_size, max_nbytes, corr_function: Settings copied
            verbatim into the record.
        status: Status stored in the record.
        error: Error text stored in the record; empty by default.
    """
    total_record = base_row(
        gold_standard=gold_standard,
        repetition=repetition,
        step="total_runtime",
        n_jobs=n_jobs,
        chunk_size=chunk_size,
        max_nbytes=max_nbytes,
        corr_function=corr_function,
        is_corr=None,
    )
    total_record.update(seconds=seconds, status=status, error=error)
    timings.append(total_record)
176
+
177
+
178
def run_repetition(
    gold_standard: str,
    repetition: int,
    benchmark_root: Path,
    gene_effect_path: Path,
    corr_function: str,
    n_jobs: int,
    chunk_size: int,
    max_nbytes: str,
) -> list[dict[str, Any]]:
    """Run one timed repetition of the benchmark workflow.

    Each workflow step is executed through ``timed_call`` so that it gets its
    own timing record, and a final ``"total_runtime"`` record covers the whole
    repetition. The records are also written to ``benchmark_results.csv``
    inside the repetition's output folder.

    Args:
        gold_standard: Gold standard name placed in the config and records.
        repetition: Repetition number; used in the output folder name.
        benchmark_root: Root directory under which
            ``<gold_standard>/run_<NN>`` is created.
        gene_effect_path: CSV file read with its first column as the index.
        corr_function: Correlation function name passed to
            ``flex.perform_corr`` and stored in the config.
        n_jobs: Worker count passed to ``flex.pra_percomplex``.
        chunk_size: Chunk size passed to ``flex.pra_percomplex``.
        max_nbytes: Only stored in the config and the records here.

    Returns:
        The timing records of this repetition, ending with the
        ``"total_runtime"`` record.

    Raises:
        BaseException: Non-``Exception`` interruptions (for example
            ``KeyboardInterrupt``) are re-raised after the repetition has been
            recorded with status ``"interrupted"``. Ordinary exceptions are
            not raised: they are recorded as ``"failed"`` and the full
            traceback is written to ``benchmark_error.txt``.
    """
    output_folder = benchmark_root / gold_standard / f"run_{repetition:02d}"
    output_folder.mkdir(parents=True, exist_ok=True)
    timings: list[dict[str, Any]] = []

    def timed(
        step: str,
        operation: Callable[[], Any],
        is_corr: bool | None = None,
    ) -> Any:
        """Time ``operation`` as ``step`` with this repetition's settings."""
        return timed_call(
            timings,
            gold_standard,
            repetition,
            step,
            operation,
            n_jobs,
            chunk_size,
            max_nbytes,
            corr_function,
            is_corr=is_corr,
        )

    workflow_start = perf_counter()
    status = "success"
    error = ""

    try:
        timed(
            "initialize",
            lambda: flex.initialize(
                build_config(
                    gold_standard,
                    output_folder,
                    corr_function,
                    n_jobs,
                    chunk_size,
                    max_nbytes,
                )
            ),
        )
        gene_effect = timed(
            "read_gene_effect",
            lambda: pd.read_csv(gene_effect_path, index_col=0),
        )
        # The already-loaded DataFrame is handed over under the "path" key.
        inputs = {
            "All screens": {
                "path": gene_effect,
                "sort": "high",
                "color": "#000000",
            },
        }
        datasets, _ = timed("load_datasets", lambda: flex.load_datasets(inputs))
        # Called without arguments; presumably it uses the config given to
        # flex.initialize above - TODO confirm.
        timed("load_gold_standard", flex.load_gold_standard)

        # Only one dataset was supplied, so benchmark the first entry.
        name, dataset = next(iter(datasets.items()))
        corr = timed(
            "perform_corr",
            lambda: flex.perform_corr(dataset, corr_function),
            is_corr=False,
        )
        timed(
            "pra_is_corr_true",
            lambda: flex.pra(name, corr, is_corr=True),
            is_corr=True,
        )
        timed(
            "pra_percomplex_is_corr_true",
            lambda: flex.pra_percomplex(
                name,
                corr,
                is_corr=True,
                n_jobs=n_jobs,
                chunk_size=chunk_size,
            ),
            is_corr=True,
        )
        timed(
            "complex_contributions",
            lambda: flex.complex_contributions(name),
        )
    except Exception as exc:
        # A failing step ends this repetition but must not stop the benchmark.
        status = "failed"
        error = "".join(traceback.format_exception_only(type(exc), exc)).strip()
        (output_folder / "benchmark_error.txt").write_text(
            traceback.format_exc(),
            encoding="utf-8",
        )
    except BaseException as exc:
        # KeyboardInterrupt / SystemExit: do not report the aborted repetition
        # as successful, then let the interruption propagate.
        status = "interrupted"
        error = "".join(traceback.format_exception_only(type(exc), exc)).strip()
        raise
    finally:
        add_total_runtime(
            timings,
            gold_standard,
            repetition,
            perf_counter() - workflow_start,
            n_jobs,
            chunk_size,
            max_nbytes,
            corr_function,
            status,
            error,
        )
        # Persist whatever was measured, even for failed or aborted runs.
        pd.DataFrame(timings).to_csv(
            output_folder / "benchmark_results.csv",
            index=False,
        )
    return timings
336
+
337
+
338
def write_reports(timings: list[dict[str, Any]], benchmark_root: Path) -> None:
    """Write the raw, summary, pivot and total-runtime CSV reports.

    Four files are (re)written inside ``benchmark_root``:

    * ``benchmark_results_all_runs.csv`` - every timing record;
    * ``benchmark_summary_mean_std.csv`` - count/mean/std/min/max of
      ``seconds`` per gold standard and step, over successful records only;
    * ``benchmark_pivot_by_run.csv`` - one row per gold standard and
      repetition, one column per step;
    * ``benchmark_total_by_repetition.csv`` - the ``"total_runtime"`` records.

    Args:
        timings: Timing records. An empty list is accepted and produces
            header-only reports instead of failing on missing columns.
        benchmark_root: Output directory; created if it does not exist.
    """
    benchmark_root.mkdir(parents=True, exist_ok=True)

    if timings:
        raw = pd.DataFrame(timings)
    else:
        # Without records the frame would have no columns at all and the
        # column lookups below would raise KeyError.
        raw = pd.DataFrame(
            columns=[
                "gold_standard",
                "repetition",
                "step",
                "seconds",
                "n_jobs",
                "chunk_size",
                "max_nbytes",
                "corr_function",
                "is_corr",
                "status",
                "error",
            ]
        )
    raw.to_csv(benchmark_root / "benchmark_results_all_runs.csv", index=False)

    success = raw[raw["status"] == "success"]
    if success.empty:
        summary = pd.DataFrame(
            columns=[
                "gold_standard",
                "step",
                "count",
                "mean",
                "std",
                "min",
                "max",
            ]
        )
    else:
        summary = (
            success.groupby(["gold_standard", "step"])["seconds"]
            .agg(count="count", mean="mean", std="std", min="min", max="max")
            .reset_index()
        )
    summary.to_csv(benchmark_root / "benchmark_summary_mean_std.csv", index=False)

    if raw.empty:
        pivot = pd.DataFrame(columns=["gold_standard", "repetition"])
    else:
        pivot = raw.pivot_table(
            index=["gold_standard", "repetition"],
            columns="step",
            values="seconds",
            aggfunc="first",
        ).reset_index()
        # Drop the "step" label so it is not carried as the columns' name.
        pivot.columns.name = None
    pivot.to_csv(benchmark_root / "benchmark_pivot_by_run.csv", index=False)

    total_rows = raw[raw["step"] == "total_runtime"].copy()
    total_rows.to_csv(
        benchmark_root / "benchmark_total_by_repetition.csv",
        index=False,
    )
377
+
378
+
379
def parse_gold_standards(value: str) -> list[str]:
    """Parse a comma-separated gold-standard selection.

    Entries are stripped and upper-cased, blank entries are ignored, and
    repeated names are collapsed (first occurrence wins) so the same
    repetition folder is never benchmarked twice in one invocation.

    Args:
        value: Comma-separated names, e.g. ``"corum, GOBP"``.

    Returns:
        The selected names in the order given, without duplicates.

    Raises:
        ValueError: If no name is given or a name is not in
            ``GOLD_STANDARDS``.
    """
    # dict.fromkeys removes duplicates while keeping insertion order.
    selected = list(
        dict.fromkeys(
            item.strip().upper() for item in value.split(",") if item.strip()
        )
    )
    if not selected:
        raise ValueError(
            f"No gold standard selected. Choose from {GOLD_STANDARDS}."
        )
    invalid = [item for item in selected if item not in GOLD_STANDARDS]
    if invalid:
        raise ValueError(
            f"Invalid gold standard(s): {invalid}. Choose from {GOLD_STANDARDS}."
        )
    return selected
387
+
388
+
389
def load_existing_timings(benchmark_root: Path) -> list[dict[str, Any]]:
    """Load timing records written by an earlier run, if there are any.

    Args:
        benchmark_root: Directory that may contain
            ``benchmark_results_all_runs.csv``.

    Returns:
        One dictionary per CSV row, or an empty list when the file does not
        exist.
    """
    previous_csv = benchmark_root / "benchmark_results_all_runs.csv"
    if previous_csv.exists():
        previous = pd.read_csv(previous_csv)
        return previous.to_dict("records")
    return []
394
+
395
+
396
def completed_runs(timings: list[dict[str, Any]]) -> set[tuple[str, int]]:
    """Collect the (gold standard, repetition) pairs that have a total row.

    A pair counts as completed as soon as a ``"total_runtime"`` record exists
    for it; the record's status is not inspected.

    Args:
        timings: Timing records, possibly empty.

    Returns:
        The set of ``(gold_standard, repetition)`` pairs found.
    """
    if not timings:
        return set()

    frame = pd.DataFrame(timings)
    is_total = frame["step"] == "total_runtime"
    totals = frame.loc[is_total]
    standards = totals["gold_standard"].astype(str)
    repetitions = totals["repetition"].astype(int)
    return {(standard, rep) for standard, rep in zip(standards, repetitions)}
407
+
408
+
409
def parse_args() -> argparse.Namespace:
    """Define the benchmark's command-line options and parse ``sys.argv``.

    Returns:
        The parsed options; every option falls back to the corresponding
        module-level default, except ``--end-repetition`` which defaults to
        ``None`` and ``--skip-existing`` which defaults to ``False``.
    """
    cli = argparse.ArgumentParser(
        description="Run repeated core runtime benchmarks with memmap per-complex PRA."
    )
    option = cli.add_argument

    # Input and output locations.
    option(
        "--output-root",
        type=Path,
        default=DEFAULT_BENCHMARK_ROOT,
        help="Benchmark output root. Defaults to a timestamped output folder.",
    )
    option(
        "--gene-effect-path",
        type=Path,
        default=DEFAULT_GENE_EFFECT_PATH,
        help="Path to DepMap gene_effect.csv.",
    )

    # Which runs to execute.
    option(
        "--gold-standards",
        default=",".join(GOLD_STANDARDS),
        help="Comma-separated subset of CORUM,PATHWAY,GOBP.",
    )
    option(
        "--runs",
        type=int,
        default=DEFAULT_RUN_COUNT,
        help="Number of repetitions per gold standard.",
    )
    option(
        "--start-repetition",
        type=int,
        default=1,
        help="First repetition to run, inclusive.",
    )
    option(
        "--end-repetition",
        type=int,
        default=None,
        help="Last repetition to run, inclusive. Defaults to --runs.",
    )
    option(
        "--skip-existing",
        action="store_true",
        help="Skip gold-standard/repetition pairs already present in the raw CSV.",
    )

    # Settings forwarded to the benchmarked workflow.
    option(
        "--corr-function",
        default=DEFAULT_CORR_FUNCTION,
        help="Correlation implementation to use.",
    )
    option(
        "--n-jobs",
        type=int,
        default=DEFAULT_N_JOBS,
        help="Per-complex process worker count.",
    )
    option(
        "--chunk-size",
        type=int,
        default=DEFAULT_CHUNK_SIZE,
        help="Number of terms per per-complex joblib task.",
    )
    option(
        "--max-nbytes",
        default=DEFAULT_MAX_NBYTES,
        help="Joblib max_nbytes setting from config.",
    )
    return cli.parse_args()
476
+
477
+
478
def main() -> None:
    """Run the selected benchmark repetitions and keep the reports current.

    Parses and validates the command-line options, loads timing records
    already present in the output root, runs every requested
    gold-standard/repetition pair (skipping recorded pairs when
    ``--skip-existing`` is given) and rewrites the aggregate reports after
    each repetition so partial results survive an aborted benchmark.

    Raises:
        ValueError: If the repetition options are inconsistent or a gold
            standard name is invalid.
        FileNotFoundError: If the gene-effect CSV does not exist.
    """
    args = parse_args()
    selected_gold_standards = parse_gold_standards(args.gold_standards)
    # Test for None explicitly: with ``or`` an explicit ``--end-repetition 0``
    # would silently be replaced by --runs instead of being rejected below.
    end_repetition = (
        args.runs if args.end_repetition is None else args.end_repetition
    )

    if args.runs <= 0:
        raise ValueError("--runs must be greater than 0.")
    if args.start_repetition <= 0:
        raise ValueError("--start-repetition must be greater than 0.")
    if end_repetition < args.start_repetition:
        raise ValueError("--end-repetition must be >= --start-repetition.")
    if end_repetition > args.runs:
        raise ValueError("--end-repetition cannot be greater than --runs.")
    if not args.gene_effect_path.exists():
        raise FileNotFoundError(f"Gene-effect CSV not found: {args.gene_effect_path}")

    benchmark_root = args.output_root.resolve()
    benchmark_root.mkdir(parents=True, exist_ok=True)

    all_timings = load_existing_timings(benchmark_root)
    completed = completed_runs(all_timings) if args.skip_existing else set()

    print(f"Benchmark output: {benchmark_root}")
    print(
        "Settings: "
        f"corr_function={args.corr_function}, "
        f"n_jobs={args.n_jobs}, chunk_size={args.chunk_size}, "
        f"max_nbytes={args.max_nbytes}"
    )

    for gold_standard in selected_gold_standards:
        for repetition in range(args.start_repetition, end_repetition + 1):
            key = (gold_standard, repetition)
            if key in completed:
                print(f"Skipping existing {gold_standard} run {repetition:02d}")
                continue

            print(f"Running {gold_standard} run {repetition:02d}")
            run_timings = run_repetition(
                gold_standard=gold_standard,
                repetition=repetition,
                benchmark_root=benchmark_root,
                gene_effect_path=args.gene_effect_path,
                corr_function=args.corr_function,
                n_jobs=args.n_jobs,
                chunk_size=args.chunk_size,
                max_nbytes=args.max_nbytes,
            )
            # A re-run replaces earlier records of the same pair; keeping both
            # would duplicate rows in the summary and leave stale values in
            # the pivot, which keeps the first value per cell.
            all_timings = [
                row
                for row in all_timings
                if (str(row["gold_standard"]), int(row["repetition"])) != key
            ]
            all_timings.extend(run_timings)
            # Rewrite the reports after every repetition so an aborted
            # benchmark still leaves usable aggregate files behind.
            write_reports(all_timings, benchmark_root)

    # Also refresh the reports when every requested run was skipped.
    write_reports(all_timings, benchmark_root)
    print("Benchmark complete.")
531
+
532
+
533
# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()