pythonflex 0.3.4__py3-none-any.whl → 0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,245 @@
1
+ """Compare CORUM per-complex runtime with different n_jobs values.
2
+
3
+ Run from any directory with:
4
+ python path/to/src/pythonflex/examples/runtime_benchmark_corum_njobs.py
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import gc
10
+ import os
11
+ import sys
12
+ from datetime import datetime
13
+ from pathlib import Path
14
+ from time import perf_counter
15
+ from typing import Any, Callable
16
+
17
+ import pandas as pd
18
+
19
+
20
+ PROJECT_ROOT = Path(__file__).resolve().parents[3]
21
+ SRC_ROOT = PROJECT_ROOT / "src"
22
+ if str(SRC_ROOT) not in sys.path:
23
+ sys.path.insert(0, str(SRC_ROOT))
24
+
25
+ os.environ.setdefault("MPLBACKEND", "Agg")
26
+
27
+ import pythonflex as flex # noqa: E402
28
+
29
+
30
# Gene-effect matrix loaded with ``pd.read_csv`` in ``run_n_jobs``.  The
# default is the original author's local Windows path; set the
# PYTHONFLEX_GENE_EFFECT_PATH environment variable to point the benchmark at
# a copy of the dataset on another machine without editing this file.
GENE_EFFECT_PATH = Path(
    os.environ.get(
        "PYTHONFLEX_GENE_EFFECT_PATH",
        "C:/Users/yd/Desktop/projects/_datasets/depmap/25Q2/gene_effect.csv",
    )
)
# Correlation backend name, passed both to the config ("corr_function") and
# to ``flex.perform_corr``.
CORR_FUNCTION = "numpy_without_mask"
# Chunk size forwarded to the ``per_complex`` config section and to
# ``flex.pra_percomplex``; held constant while ``n_jobs`` varies.
CHUNK_SIZE = 200
# Forwarded verbatim as ``per_complex.max_nbytes``.
MAX_NBYTES = "100M"
# Worker counts benchmarked by ``main``, in this order.
N_JOBS_VALUES = (2, 4, 8)
# One timestamped folder per script invocation (timestamp taken at import).
BENCHMARK_ROOT = (
    PROJECT_ROOT
    / "output"
    / f"runtime_benchmark_corum_corr_reuse_njobs_{datetime.now():%Y%m%d_%H%M%S}"
)
42
+
43
+
44
def build_config(output_folder: Path, n_jobs: int) -> dict[str, Any]:
    """Assemble the pythonflex configuration for one CORUM benchmark run.

    Args:
        output_folder: Directory stored (as a string) under ``output_folder``.
        n_jobs: Worker count stored under ``per_complex.n_jobs``.

    Returns:
        A new configuration dictionary.  Apart from the two arguments, all
        values are fixed literals or the module constants ``CORR_FUNCTION``,
        ``CHUNK_SIZE`` and ``MAX_NBYTES``.
    """
    plotting_options = {
        "save_plot": True,
        "show_plot": False,
        "output_type": "png",
    }
    preprocessing_options = {
        "fill_na": True,
        "normalize": False,
    }
    per_complex_options = {
        "n_jobs": n_jobs,
        "chunk_size": CHUNK_SIZE,
        "max_nbytes": MAX_NBYTES,
    }
    logging_options = {
        "visible_levels": ["DONE", "INFO", "WARNING"],
    }

    config: dict[str, Any] = {
        "min_genes_in_complex": 2,
        "min_genes_per_complex_analysis": 3,
        "output_folder": str(output_folder),
        "gold_standard": "CORUM",
        "color_map": "RdYlBu",
        "jaccard": True,
        "analysis_genes": "common",
        "plotting": plotting_options,
        "preprocessing": preprocessing_options,
        "corr_function": CORR_FUNCTION,
        "per_complex": per_complex_options,
        "logging": logging_options,
    }
    return config
72
+
73
+
74
def timed_call(
    timings: list[dict[str, Any]],
    n_jobs: int,
    step: str,
    operation: Callable[[], Any],
    *,
    is_corr: bool | None = None,
) -> Any:
    """Run ``operation`` once and append its wall-clock duration to ``timings``.

    Args:
        timings: List that receives one record for this call (mutated in place).
        n_jobs: Worker count of the current run, stored in the record.
        step: Label identifying the measured step.
        operation: Zero-argument callable to execute and time.
        is_corr: Optional flag copied into the record's ``is_corr`` field.

    Returns:
        Whatever ``operation`` returns.  If ``operation`` raises, the
        exception propagates and no record is appended.
    """
    began = perf_counter()
    outcome = operation()
    elapsed = perf_counter() - began

    record = {
        "n_jobs": n_jobs,
        "step": step,
        "seconds": elapsed,
        "chunk_size": CHUNK_SIZE,
        "corr_function": CORR_FUNCTION,
        "is_corr": is_corr,
    }
    timings.append(record)
    return outcome
95
+
96
+
97
def run_n_jobs(n_jobs: int) -> list[dict[str, Any]]:
    """Execute the full CORUM workflow once and time every step.

    The steps are run in a fixed order through ``timed_call``: initialise
    pythonflex, read the gene-effect CSV, load datasets and the gold
    standard, compute the correlation once, reuse it for ``pra`` and
    ``pra_percomplex`` (both with ``is_corr=True``), compute complex
    contributions, and finally produce the plots.

    Args:
        n_jobs: Worker count placed in the config and passed to
            ``flex.pra_percomplex``.

    Returns:
        The per-step timing records followed by a ``total_runtime`` record.
        The same records are written to ``benchmark_results.csv`` inside the
        run's output folder.
    """
    run_dir = BENCHMARK_ROOT / f"n_jobs_{n_jobs:02d}"
    timings: list[dict[str, Any]] = []
    started = perf_counter()

    def measure(step: str, operation: Callable[[], Any], **extra: Any) -> Any:
        # Thin wrapper so each step only states its label and callable.
        return timed_call(timings, n_jobs, step, operation, **extra)

    measure("initialize", lambda: flex.initialize(build_config(run_dir, n_jobs)))
    gene_effect = measure(
        "read_gene_effect",
        lambda: pd.read_csv(GENE_EFFECT_PATH, index_col=0),
    )
    inputs = {
        "All screens": {"path": gene_effect, "sort": "high", "color": "#000000"},
    }
    data, _ = measure("load_datasets", lambda: flex.load_datasets(inputs))
    measure("load_gold_standard", flex.load_gold_standard)

    # Only the first (and only) loaded dataset is benchmarked.
    name, dataset = next(iter(data.items()))
    corr = measure("perform_corr", lambda: flex.perform_corr(dataset, CORR_FUNCTION))
    measure(
        "pra_is_corr_true",
        lambda: flex.pra(name, corr, is_corr=True),
        is_corr=True,
    )
    measure(
        "pra_percomplex_is_corr_true",
        lambda: flex.pra_percomplex(
            name,
            corr,
            is_corr=True,
            chunk_size=CHUNK_SIZE,
            n_jobs=n_jobs,
        ),
        is_corr=True,
    )
    measure("complex_contributions", lambda: flex.complex_contributions(name))

    plot_steps = (
        ("plot_precision_recall_curve", flex.plot_precision_recall_curve),
        ("plot_auc_scores", flex.plot_auc_scores),
        ("plot_significant_complexes", flex.plot_significant_complexes),
        ("plot_percomplex_scatter", lambda: flex.plot_percomplex_scatter(n_top=20)),
        ("plot_percomplex_scatter_bysize", flex.plot_percomplex_scatter_bysize),
        ("plot_complex_contributions", flex.plot_complex_contributions),
    )
    for label, plot in plot_steps:
        measure(label, plot)

    total_record = {
        "n_jobs": n_jobs,
        "step": "total_runtime",
        "seconds": perf_counter() - started,
        "chunk_size": CHUNK_SIZE,
        "corr_function": CORR_FUNCTION,
        "is_corr": None,
    }
    timings.append(total_record)

    run_dir.mkdir(parents=True, exist_ok=True)
    results_file = run_dir / "benchmark_results.csv"
    pd.DataFrame(timings).to_csv(results_file, index=False)
    return timings
208
+
209
+
210
def write_reports(timings: list[dict[str, Any]]) -> None:
    """Write the raw and the pivoted timing tables under ``BENCHMARK_ROOT``.

    Args:
        timings: Timing records accumulated so far across all runs.

    The raw records go to ``benchmark_corum_njobs_comparison.csv``.  The
    summary file has one row per (n_jobs, chunk_size, corr_function) and one
    column per step, holding that step's ``seconds`` value.
    """
    frame = pd.DataFrame(timings)
    comparison_file = BENCHMARK_ROOT / "benchmark_corum_njobs_comparison.csv"
    frame.to_csv(comparison_file, index=False)

    wide = pd.pivot_table(
        frame,
        values="seconds",
        index=["n_jobs", "chunk_size", "corr_function"],
        columns="step",
        aggfunc="first",
    )
    wide = wide.reset_index()
    # Drop the leftover "step" label on the column axis.
    wide.columns.name = None

    summary_file = BENCHMARK_ROOT / "benchmark_corum_njobs_summary.csv"
    wide.to_csv(summary_file, index=False)
221
+
222
+
223
def main() -> None:
    """Benchmark the CORUM workflow for every value in ``N_JOBS_VALUES``.

    Reports are rewritten after each run, so a later crash still leaves the
    results of the completed runs on disk.

    Raises:
        FileNotFoundError: If ``GENE_EFFECT_PATH`` does not exist.
    """
    dataset_available = GENE_EFFECT_PATH.exists()
    if not dataset_available:
        raise FileNotFoundError(f"Input dataset was not found: {GENE_EFFECT_PATH}")

    # exist_ok=False: an already existing benchmark folder is an error.
    BENCHMARK_ROOT.mkdir(parents=True, exist_ok=False)
    collected: list[dict[str, Any]] = []
    print(f"Benchmark output folder: {BENCHMARK_ROOT}")
    print("mPR preparation and plot_mpr_summary are excluded.")

    for n_jobs in N_JOBS_VALUES:
        print(f"Running CORUM corr-reuse benchmark with n_jobs={n_jobs}")
        run_timings = run_n_jobs(n_jobs)
        collected += run_timings
        write_reports(collected)
        gc.collect()

    summary_file = BENCHMARK_ROOT / "benchmark_corum_njobs_summary.csv"
    print(f"Benchmark summary saved to: {summary_file}")


if __name__ == "__main__":
    main()
@@ -0,0 +1,319 @@
1
+ """Compare GOBP per-complex runtime across n_jobs and chunk sizes.
2
+
3
+ Run from any directory with:
4
+ python path/to/src/pythonflex/examples/runtime_benchmark_gobp_njobs_chunks.py
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import gc
10
+ import os
11
+ import sys
12
+ from datetime import datetime
13
+ from pathlib import Path
14
+ from time import perf_counter
15
+ from typing import Any, Callable
16
+
17
+ import pandas as pd
18
+
19
+
20
+ PROJECT_ROOT = Path(__file__).resolve().parents[3]
21
+ SRC_ROOT = PROJECT_ROOT / "src"
22
+ if str(SRC_ROOT) not in sys.path:
23
+ sys.path.insert(0, str(SRC_ROOT))
24
+
25
+ os.environ.setdefault("MPLBACKEND", "Agg")
26
+
27
+ import pythonflex as flex # noqa: E402
28
+
29
+
30
# Gene-effect matrix loaded with ``pd.read_csv`` in ``run_combination``.  The
# default is the original author's local Windows path; set the
# PYTHONFLEX_GENE_EFFECT_PATH environment variable to point the benchmark at
# a copy of the dataset on another machine without editing this file.
GENE_EFFECT_PATH = Path(
    os.environ.get(
        "PYTHONFLEX_GENE_EFFECT_PATH",
        "C:/Users/yd/Desktop/projects/_datasets/depmap/25Q2/gene_effect.csv",
    )
)
# Gold standard name placed in the config and copied into every timing row.
GOLD_STANDARD = "GOBP"
# Correlation backend name, passed both to the config ("corr_function") and
# to ``flex.perform_corr``.
CORR_FUNCTION = "numpy_without_mask"
# Forwarded verbatim as ``per_complex.max_nbytes``.
MAX_NBYTES = "100M"
# Stage 1 varies n_jobs at a fixed chunk size; Stage 2 then varies the chunk
# size using the n_jobs value selected from Stage 1 (see ``main``).
STAGE_1_N_JOBS = (2, 4, 8, 16)
STAGE_1_CHUNK_SIZE = 200
STAGE_2_CHUNK_SIZES = (100, 400)
# One timestamped folder per script invocation (timestamp taken at import).
BENCHMARK_ROOT = (
    PROJECT_ROOT
    / "output"
    / f"runtime_benchmark_gobp_corr_reuse_njobs_chunks_{datetime.now():%Y%m%d_%H%M%S}"
)
44
+
45
+
46
def build_config(output_folder: Path, n_jobs: int, chunk_size: int) -> dict[str, Any]:
    """Assemble the pythonflex configuration for one GOBP benchmark run.

    Args:
        output_folder: Directory stored (as a string) under ``output_folder``.
        n_jobs: Worker count stored under ``per_complex.n_jobs``.
        chunk_size: Value stored under ``per_complex.chunk_size``.

    Returns:
        A new configuration dictionary.  Apart from the arguments, all values
        are fixed literals or the module constants ``GOLD_STANDARD``,
        ``CORR_FUNCTION`` and ``MAX_NBYTES``.
    """
    plotting_options = {
        "save_plot": True,
        "show_plot": False,
        "output_type": "png",
    }
    preprocessing_options = {
        "fill_na": True,
        "normalize": False,
    }
    per_complex_options = {
        "n_jobs": n_jobs,
        "chunk_size": chunk_size,
        "max_nbytes": MAX_NBYTES,
    }
    logging_options = {
        "visible_levels": ["DONE", "INFO", "WARNING", "ERROR"],
    }

    config: dict[str, Any] = {
        "min_genes_in_complex": 2,
        "min_genes_per_complex_analysis": 3,
        "output_folder": str(output_folder),
        "gold_standard": GOLD_STANDARD,
        "color_map": "RdYlBu",
        "jaccard": True,
        "analysis_genes": "common",
        "plotting": plotting_options,
        "preprocessing": preprocessing_options,
        "corr_function": CORR_FUNCTION,
        "per_complex": per_complex_options,
        "logging": logging_options,
    }
    return config
74
+
75
+
76
def timed_call(
    timings: list[dict[str, Any]],
    stage: str,
    n_jobs: int,
    chunk_size: int,
    step: str,
    operation: Callable[[], Any],
    *,
    is_corr: bool | None = None,
) -> Any:
    """Run ``operation`` once and append an "ok" timing row to ``timings``.

    Args:
        timings: List that receives one row for this call (mutated in place).
        stage: Benchmark stage label stored in the row.
        n_jobs: Worker count of the current run, stored in the row.
        chunk_size: Chunk size of the current run, stored in the row.
        step: Label identifying the measured step.
        operation: Zero-argument callable to execute and time.
        is_corr: Optional flag copied into the row's ``is_corr`` field.

    Returns:
        Whatever ``operation`` returns.  If ``operation`` raises, the
        exception propagates and no row is appended.
    """
    began = perf_counter()
    outcome = operation()
    elapsed = perf_counter() - began

    row = {
        "gold_standard": GOLD_STANDARD,
        "stage": stage,
        "n_jobs": n_jobs,
        "chunk_size": chunk_size,
        "step": step,
        "seconds": elapsed,
        "corr_function": CORR_FUNCTION,
        "is_corr": is_corr,
        "status": "ok",
        "error": "",
    }
    timings.append(row)
    return outcome
103
+
104
+
105
def run_combination(stage: str, n_jobs: int, chunk_size: int) -> list[dict[str, Any]]:
    """Run the GOBP workflow for one (n_jobs, chunk_size) pair and time it.

    The steps run in a fixed order through ``timed_call``.  Any ``Exception``
    raised by a step is reported on stderr and recorded as a ``"failed"`` row
    instead of aborting the benchmark; the remaining steps of this run are
    skipped.  A ``total_runtime`` row is always appended and the rows are
    always written to ``benchmark_results.csv`` in the run's output folder.

    Args:
        stage: Benchmark stage label stored in every row.
        n_jobs: Worker count placed in the config and passed to
            ``flex.pra_percomplex``.
        chunk_size: Chunk size placed in the config and passed to
            ``flex.pra_percomplex``.

    Returns:
        The timing rows recorded for this run.
    """
    run_dir = BENCHMARK_ROOT / f"n_jobs_{n_jobs:02d}_chunk_{chunk_size}"
    timings: list[dict[str, Any]] = []
    started = perf_counter()

    def measure(step: str, operation: Callable[[], Any], **extra: Any) -> Any:
        # Thin wrapper so each step only states its label and callable.
        return timed_call(timings, stage, n_jobs, chunk_size, step, operation, **extra)

    def whole_run_row(step: str, status: str, error: str) -> dict[str, Any]:
        # Row whose duration is measured from the start of the whole run.
        return {
            "gold_standard": GOLD_STANDARD,
            "stage": stage,
            "n_jobs": n_jobs,
            "chunk_size": chunk_size,
            "step": step,
            "seconds": perf_counter() - started,
            "corr_function": CORR_FUNCTION,
            "is_corr": None,
            "status": status,
            "error": error,
        }

    try:
        measure(
            "initialize",
            lambda: flex.initialize(build_config(run_dir, n_jobs, chunk_size)),
        )
        gene_effect = measure(
            "read_gene_effect",
            lambda: pd.read_csv(GENE_EFFECT_PATH, index_col=0),
        )
        inputs = {
            "All screens": {"path": gene_effect, "sort": "high", "color": "#000000"},
        }
        data, _ = measure("load_datasets", lambda: flex.load_datasets(inputs))
        measure("load_gold_standard", flex.load_gold_standard)

        # Only the first (and only) loaded dataset is benchmarked.
        name, dataset = next(iter(data.items()))
        corr = measure(
            "perform_corr",
            lambda: flex.perform_corr(dataset, CORR_FUNCTION),
        )
        measure(
            "pra_is_corr_true",
            lambda: flex.pra(name, corr, is_corr=True),
            is_corr=True,
        )
        measure(
            "pra_percomplex_is_corr_true",
            lambda: flex.pra_percomplex(
                name,
                corr,
                is_corr=True,
                chunk_size=chunk_size,
                n_jobs=n_jobs,
            ),
            is_corr=True,
        )
        measure("complex_contributions", lambda: flex.complex_contributions(name))
    except Exception as exc:
        print(
            f"Run failed for n_jobs={n_jobs}, chunk_size={chunk_size}: {exc!r}",
            file=sys.stderr,
        )
        timings.append(whole_run_row("failed", "failed", repr(exc)))
    finally:
        any_failed = any(entry["status"] == "failed" for entry in timings)
        overall_status = "failed" if any_failed else "ok"
        timings.append(whole_run_row("total_runtime", overall_status, ""))

        run_dir.mkdir(parents=True, exist_ok=True)
        results_file = run_dir / "benchmark_results.csv"
        pd.DataFrame(timings).to_csv(results_file, index=False)
        gc.collect()

    return timings
231
+
232
+
233
def write_reports(timings: list[dict[str, Any]]) -> None:
    """Write the raw and the pivoted timing tables under ``BENCHMARK_ROOT``.

    Args:
        timings: Timing rows accumulated so far across all runs.

    All rows go to ``benchmark_gobp_njobs_chunks_comparison.csv``.  The
    summary file is built only from rows whose ``status`` is ``"ok"`` and is
    skipped entirely when there are none; it has one row per configuration
    and one column per step, holding that step's ``seconds`` value.
    """
    frame = pd.DataFrame(timings)
    comparison_file = BENCHMARK_ROOT / "benchmark_gobp_njobs_chunks_comparison.csv"
    frame.to_csv(comparison_file, index=False)

    successful = frame.loc[frame["status"] == "ok"].copy()
    if successful.empty:
        return

    wide = pd.pivot_table(
        successful,
        values="seconds",
        index=["gold_standard", "stage", "n_jobs", "chunk_size", "corr_function"],
        columns="step",
        aggfunc="first",
    )
    wide = wide.reset_index()
    # Drop the leftover "step" label on the column axis.
    wide.columns.name = None

    summary_file = BENCHMARK_ROOT / "benchmark_gobp_njobs_chunks_summary.csv"
    wide.to_csv(summary_file, index=False)
255
+
256
+
257
def select_stage_1_winner(timings: list[dict[str, Any]]) -> int:
    """Pick the Stage 1 ``n_jobs`` value with the fastest per-complex step.

    Only rows with ``stage == "stage_1"`` and ``chunk_size ==
    STAGE_1_CHUNK_SIZE`` are considered.  Candidates are ranked by their
    ``pra_percomplex_is_corr_true`` timing, with ``total_runtime`` as the
    tie-breaker when that column is available.

    A configuration whose run recorded any ``"failed"`` row is excluded, even
    when its per-complex step finished before the failure; otherwise Stage 2
    could be driven by an ``n_jobs`` value that is known not to complete.

    Args:
        timings: Timing rows collected so far (as produced by
            ``run_combination``).

    Returns:
        The winning ``n_jobs`` value.

    Raises:
        RuntimeError: If ``timings`` is empty, no Stage 1 run completed
            successfully, or no per-complex timing was recorded.
    """
    raw = pd.DataFrame(timings)
    if raw.empty:
        # An empty frame has no columns; fail with the intended error rather
        # than a KeyError on the column lookups below.
        raise RuntimeError("No successful Stage 1 timings were recorded.")

    in_stage_1 = (raw["stage"] == "stage_1") & (
        raw["chunk_size"] == STAGE_1_CHUNK_SIZE
    )
    # n_jobs values whose Stage 1 run hit an error at any point.
    failed_n_jobs = raw.loc[in_stage_1 & (raw["status"] == "failed"), "n_jobs"].unique()
    ranking_steps = ["pra_percomplex_is_corr_true", "total_runtime"]
    stage_1 = raw[
        in_stage_1
        & (raw["status"] == "ok")
        & ~raw["n_jobs"].isin(failed_n_jobs)
        & raw["step"].isin(ranking_steps)
    ]
    if stage_1.empty:
        raise RuntimeError("No successful Stage 1 timings were recorded.")

    pivot = stage_1.pivot_table(
        index=["n_jobs"],
        columns="step",
        values="seconds",
        aggfunc="first",
    ).reset_index()
    if "pra_percomplex_is_corr_true" not in pivot.columns:
        raise RuntimeError("Stage 1 completed without per-complex timing rows.")

    sort_columns = ["pra_percomplex_is_corr_true"]
    if "total_runtime" in pivot.columns:
        sort_columns.append("total_runtime")
    winner = pivot.sort_values(sort_columns, ascending=True).iloc[0]
    return int(winner["n_jobs"])
282
+
283
+
284
def main() -> None:
    """Run the two-stage GOBP benchmark.

    Stage 1 runs every value in ``STAGE_1_N_JOBS`` at ``STAGE_1_CHUNK_SIZE``.
    Stage 2 then runs every value in ``STAGE_2_CHUNK_SIZES`` with the
    ``n_jobs`` value chosen by ``select_stage_1_winner``.  Reports are
    rewritten after each run.

    Raises:
        FileNotFoundError: If ``GENE_EFFECT_PATH`` does not exist.
    """
    dataset_available = GENE_EFFECT_PATH.exists()
    if not dataset_available:
        raise FileNotFoundError(f"Input dataset was not found: {GENE_EFFECT_PATH}")

    # exist_ok=False: an already existing benchmark folder is an error.
    BENCHMARK_ROOT.mkdir(parents=True, exist_ok=False)
    collected: list[dict[str, Any]] = []
    print(f"Benchmark output folder: {BENCHMARK_ROOT}")
    print("Plot calls, mPR preparation, and plot_mpr_summary are excluded.")

    for n_jobs in STAGE_1_N_JOBS:
        print(
            f"Running GOBP Stage 1 benchmark: n_jobs={n_jobs}, "
            f"chunk_size={STAGE_1_CHUNK_SIZE}"
        )
        stage_1_rows = run_combination("stage_1", n_jobs, STAGE_1_CHUNK_SIZE)
        collected += stage_1_rows
        write_reports(collected)

    winner_n_jobs = select_stage_1_winner(collected)
    print(f"Stage 1 winner by per-complex runtime: n_jobs={winner_n_jobs}")

    for chunk_size in STAGE_2_CHUNK_SIZES:
        print(
            f"Running GOBP Stage 2 benchmark: n_jobs={winner_n_jobs}, "
            f"chunk_size={chunk_size}"
        )
        stage_2_rows = run_combination("stage_2", winner_n_jobs, chunk_size)
        collected += stage_2_rows
        write_reports(collected)

    summary_file = BENCHMARK_ROOT / "benchmark_gobp_njobs_chunks_summary.csv"
    print(f"Benchmark summary saved to: {summary_file}")


if __name__ == "__main__":
    main()