pythonflex 0.3.3__py3-none-any.whl → 0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pythonflex/__init__.py +28 -4
- pythonflex/analysis.py +287 -578
- pythonflex/examples/basic_usage.py +40 -32
- pythonflex/examples/manuscript.py +37 -42
- pythonflex/examples/runtime/runtime_benchmark.py +218 -0
- pythonflex/examples/runtime/runtime_benchmark_10_runs_memmap.py +534 -0
- pythonflex/examples/runtime/runtime_benchmark_corum_njobs.py +245 -0
- pythonflex/examples/runtime/runtime_benchmark_gobp_njobs_chunks.py +319 -0
- pythonflex/examples/runtime/runtime_benchmark_gobp_optimization.py +417 -0
- pythonflex/examples/runtime/runtime_benchmark_repeated.py +347 -0
- pythonflex/old_functions.py +422 -0
- pythonflex/plotting.py +655 -242
- pythonflex/preprocessing.py +62 -60
- pythonflex/utils.py +36 -9
- {pythonflex-0.3.3.dist-info → pythonflex-0.4.dist-info}/METADATA +9 -4
- pythonflex-0.4.dist-info/RECORD +32 -0
- {pythonflex-0.3.3.dist-info → pythonflex-0.4.dist-info}/WHEEL +1 -1
- pythonflex-0.4.dist-info/licenses/LICENSE +7 -0
- pythonflex-0.3.3.dist-info/RECORD +0 -24
- {pythonflex-0.3.3.dist-info → pythonflex-0.4.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
"""Compare CORUM per-complex runtime with different n_jobs values.
|
|
2
|
+
|
|
3
|
+
Run from any directory with:
|
|
4
|
+
    python path/to/src/pythonflex/examples/runtime/runtime_benchmark_corum_njobs.py
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import gc
|
|
10
|
+
import os
|
|
11
|
+
import sys
|
|
12
|
+
from datetime import datetime
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from time import perf_counter
|
|
15
|
+
from typing import Any, Callable
|
|
16
|
+
|
|
17
|
+
import pandas as pd
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Locate the repository root so the in-tree package can be imported when this
# script is executed directly.  The script has lived at different depths below
# ``src/pythonflex`` (``examples/`` and ``examples/runtime/``), so a fixed
# ancestor index is fragile: from ``examples/runtime/`` the fourth ancestor is
# ``src`` itself, not the project root.  Search upwards for the directory that
# actually contains ``src/pythonflex`` and only fall back to the historical
# fourth ancestor (clamped for very shallow locations) when none is found.
_SCRIPT_PATH = Path(__file__).resolve()
_ANCESTORS = _SCRIPT_PATH.parents
PROJECT_ROOT = next(
    (
        ancestor
        for ancestor in _ANCESTORS
        if (ancestor / "src" / "pythonflex").is_dir()
    ),
    _ANCESTORS[min(3, len(_ANCESTORS) - 1)],
)
SRC_ROOT = PROJECT_ROOT / "src"
if str(SRC_ROOT) not in sys.path:
    # Prepend so the checked-out sources win over any installed copy.
    sys.path.insert(0, str(SRC_ROOT))

# Only sets the Matplotlib backend when the caller has not already chosen one.
os.environ.setdefault("MPLBACKEND", "Agg")
|
|
26
|
+
|
|
27
|
+
import pythonflex as flex # noqa: E402
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# Input dataset read by ``run_n_jobs``.  The default is a developer-local
# path; set the ``PYTHONFLEX_GENE_EFFECT_PATH`` environment variable to run the
# benchmark on another machine without editing this file.
GENE_EFFECT_PATH = Path(
    os.environ.get(
        "PYTHONFLEX_GENE_EFFECT_PATH",
        "C:/Users/yd/Desktop/projects/_datasets/depmap/25Q2/gene_effect.csv",
    )
)
# Correlation backend name handed to ``flex.perform_corr`` and the config.
CORR_FUNCTION = "numpy_without_mask"
# Forwarded in the ``per_complex`` config section and to ``flex.pra_percomplex``.
CHUNK_SIZE = 200
# Forwarded in the ``per_complex`` config section.
MAX_NBYTES = "100M"
# Worker counts benchmarked, in order, by ``main``.
N_JOBS_VALUES = (2, 4, 8)
# Output folder for this invocation; the timestamp is taken at import time.
BENCHMARK_ROOT = (
    PROJECT_ROOT
    / "output"
    / f"runtime_benchmark_corum_corr_reuse_njobs_{datetime.now():%Y%m%d_%H%M%S}"
)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def build_config(output_folder: Path, n_jobs: int) -> dict[str, Any]:
    """Assemble the configuration dict handed to ``flex.initialize``.

    Args:
        output_folder: Folder recorded (as a string) under ``"output_folder"``.
        n_jobs: Worker count recorded under ``per_complex["n_jobs"]``.

    Returns:
        A new dict; the module constants ``CORR_FUNCTION``, ``CHUNK_SIZE`` and
        ``MAX_NBYTES`` supply the remaining benchmark-specific values.
    """
    plotting_section = {
        "save_plot": True,
        "show_plot": False,
        "output_type": "png",
    }
    preprocessing_section = {"fill_na": True, "normalize": False}
    per_complex_section = {
        "n_jobs": n_jobs,
        "chunk_size": CHUNK_SIZE,
        "max_nbytes": MAX_NBYTES,
    }
    logging_section = {"visible_levels": ["DONE", "INFO", "WARNING"]}

    # Key order mirrors the original literal.
    return {
        "min_genes_in_complex": 2,
        "min_genes_per_complex_analysis": 3,
        "output_folder": str(output_folder),
        "gold_standard": "CORUM",
        "color_map": "RdYlBu",
        "jaccard": True,
        "analysis_genes": "common",
        "plotting": plotting_section,
        "preprocessing": preprocessing_section,
        "corr_function": CORR_FUNCTION,
        "per_complex": per_complex_section,
        "logging": logging_section,
    }
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def timed_call(
    timings: list[dict[str, Any]],
    n_jobs: int,
    step: str,
    operation: Callable[[], Any],
    *,
    is_corr: bool | None = None,
) -> Any:
    """Invoke ``operation`` and append one timing row to ``timings``.

    Args:
        timings: List that receives the new row (mutated in place).
        n_jobs: Worker count stored in the row.
        step: Label stored in the row's ``"step"`` field.
        operation: Zero-argument callable to time.
        is_corr: Optional flag stored verbatim in the row.

    Returns:
        Whatever ``operation`` returned.  If ``operation`` raises, no row is
        appended and the exception propagates.
    """
    began = perf_counter()
    outcome = operation()
    elapsed = perf_counter() - began

    row = {
        "n_jobs": n_jobs,
        "step": step,
        "seconds": elapsed,
        "chunk_size": CHUNK_SIZE,
        "corr_function": CORR_FUNCTION,
        "is_corr": is_corr,
    }
    timings.append(row)
    return outcome
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def run_n_jobs(n_jobs: int) -> list[dict[str, Any]]:
    """Run the CORUM workflow once with ``n_jobs`` workers, timing every step.

    Each pythonflex call goes through ``timed_call`` so it contributes one row.
    A final ``"total_runtime"`` row covers the whole sequence.  The rows are
    also written to ``benchmark_results.csv`` inside this run's own folder.

    Args:
        n_jobs: Worker count placed in the config and passed to
            ``flex.pra_percomplex``.

    Returns:
        The list of timing rows for this run.
    """
    run_folder = BENCHMARK_ROOT / f"n_jobs_{n_jobs:02d}"
    rows: list[dict[str, Any]] = []
    began = perf_counter()

    def record(label, operation, *, is_corr=None):
        # Thin wrapper so each step only has to state its label and callable.
        return timed_call(rows, n_jobs, label, operation, is_corr=is_corr)

    # --- setup and data loading -------------------------------------------
    record("initialize", lambda: flex.initialize(build_config(run_folder, n_jobs)))
    gene_effect = record(
        "read_gene_effect",
        lambda: pd.read_csv(GENE_EFFECT_PATH, index_col=0),
    )
    inputs = {
        "All screens": {"path": gene_effect, "sort": "high", "color": "#000000"},
    }
    data, _ = record("load_datasets", lambda: flex.load_datasets(inputs))
    record("load_gold_standard", flex.load_gold_standard)

    # --- analysis: the correlation matrix is computed once and reused -------
    name, dataset = next(iter(data.items()))
    corr = record("perform_corr", lambda: flex.perform_corr(dataset, CORR_FUNCTION))
    record(
        "pra_is_corr_true",
        lambda: flex.pra(name, corr, is_corr=True),
        is_corr=True,
    )
    record(
        "pra_percomplex_is_corr_true",
        lambda: flex.pra_percomplex(
            name,
            corr,
            is_corr=True,
            chunk_size=CHUNK_SIZE,
            n_jobs=n_jobs,
        ),
        is_corr=True,
    )
    record("complex_contributions", lambda: flex.complex_contributions(name))

    # --- plotting -----------------------------------------------------------
    record("plot_precision_recall_curve", flex.plot_precision_recall_curve)
    record("plot_auc_scores", flex.plot_auc_scores)
    record("plot_significant_complexes", flex.plot_significant_complexes)
    record("plot_percomplex_scatter", lambda: flex.plot_percomplex_scatter(n_top=20))
    record("plot_percomplex_scatter_bysize", flex.plot_percomplex_scatter_bysize)
    record("plot_complex_contributions", flex.plot_complex_contributions)

    rows.append(
        {
            "n_jobs": n_jobs,
            "step": "total_runtime",
            "seconds": perf_counter() - began,
            "chunk_size": CHUNK_SIZE,
            "corr_function": CORR_FUNCTION,
            "is_corr": None,
        }
    )

    run_folder.mkdir(parents=True, exist_ok=True)
    results_csv = run_folder / "benchmark_results.csv"
    pd.DataFrame(rows).to_csv(results_csv, index=False)
    return rows
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def write_reports(timings: list[dict[str, Any]]) -> None:
    """Write the raw timing rows and a per-run summary under ``BENCHMARK_ROOT``.

    The raw CSV has one row per recorded step.  The summary CSV has one row per
    (``n_jobs``, ``chunk_size``, ``corr_function``) combination and one column
    per step name holding that step's seconds.

    Args:
        timings: Timing rows accumulated so far.
    """
    frame = pd.DataFrame(timings)
    frame.to_csv(BENCHMARK_ROOT / "benchmark_corum_njobs_comparison.csv", index=False)

    wide = pd.pivot_table(
        frame,
        index=["n_jobs", "chunk_size", "corr_function"],
        columns="step",
        values="seconds",
        aggfunc="first",
    )
    wide = wide.reset_index()
    # Drop the leftover "step" axis label so it is not carried on the columns.
    wide.columns.name = None
    wide.to_csv(BENCHMARK_ROOT / "benchmark_corum_njobs_summary.csv", index=False)
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def main() -> None:
    """Benchmark every value in ``N_JOBS_VALUES`` and write the reports.

    Raises:
        FileNotFoundError: if ``GENE_EFFECT_PATH`` does not exist.
        FileExistsError: if ``BENCHMARK_ROOT`` already exists
            (``exist_ok=False``).
    """
    if not GENE_EFFECT_PATH.exists():
        raise FileNotFoundError(f"Input dataset was not found: {GENE_EFFECT_PATH}")

    BENCHMARK_ROOT.mkdir(parents=True, exist_ok=False)
    collected: list[dict[str, Any]] = []
    print(f"Benchmark output folder: {BENCHMARK_ROOT}")
    print("mPR preparation and plot_mpr_summary are excluded.")

    # NOTE(review): the reviewed rendering carried no indentation; the report
    # refresh and gc.collect() are taken to be part of the loop body - confirm.
    for workers in N_JOBS_VALUES:
        print(f"Running CORUM corr-reuse benchmark with n_jobs={workers}")
        collected += run_n_jobs(workers)
        write_reports(collected)
        gc.collect()

    summary_csv = BENCHMARK_ROOT / 'benchmark_corum_njobs_summary.csv'
    print("Benchmark summary saved to: " f"{summary_csv}")


# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,319 @@
|
|
|
1
|
+
"""Compare GOBP per-complex runtime across n_jobs and chunk sizes.
|
|
2
|
+
|
|
3
|
+
Run from any directory with:
|
|
4
|
+
    python path/to/src/pythonflex/examples/runtime/runtime_benchmark_gobp_njobs_chunks.py
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import gc
|
|
10
|
+
import os
|
|
11
|
+
import sys
|
|
12
|
+
from datetime import datetime
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from time import perf_counter
|
|
15
|
+
from typing import Any, Callable
|
|
16
|
+
|
|
17
|
+
import pandas as pd
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Locate the repository root so the in-tree package can be imported when this
# script is executed directly.  The script has lived at different depths below
# ``src/pythonflex`` (``examples/`` and ``examples/runtime/``), so a fixed
# ancestor index is fragile: from ``examples/runtime/`` the fourth ancestor is
# ``src`` itself, not the project root.  Search upwards for the directory that
# actually contains ``src/pythonflex`` and only fall back to the historical
# fourth ancestor (clamped for very shallow locations) when none is found.
_SCRIPT_PATH = Path(__file__).resolve()
_ANCESTORS = _SCRIPT_PATH.parents
PROJECT_ROOT = next(
    (
        ancestor
        for ancestor in _ANCESTORS
        if (ancestor / "src" / "pythonflex").is_dir()
    ),
    _ANCESTORS[min(3, len(_ANCESTORS) - 1)],
)
SRC_ROOT = PROJECT_ROOT / "src"
if str(SRC_ROOT) not in sys.path:
    # Prepend so the checked-out sources win over any installed copy.
    sys.path.insert(0, str(SRC_ROOT))

# Only sets the Matplotlib backend when the caller has not already chosen one.
os.environ.setdefault("MPLBACKEND", "Agg")
|
|
26
|
+
|
|
27
|
+
import pythonflex as flex # noqa: E402
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# Input dataset read by ``run_combination``.  The default is a developer-local
# path; set the ``PYTHONFLEX_GENE_EFFECT_PATH`` environment variable to run the
# benchmark on another machine without editing this file.
GENE_EFFECT_PATH = Path(
    os.environ.get(
        "PYTHONFLEX_GENE_EFFECT_PATH",
        "C:/Users/yd/Desktop/projects/_datasets/depmap/25Q2/gene_effect.csv",
    )
)
# Gold standard named in the config and stamped on every timing row.
GOLD_STANDARD = "GOBP"
# Correlation backend name handed to ``flex.perform_corr`` and the config.
CORR_FUNCTION = "numpy_without_mask"
# Forwarded in the ``per_complex`` config section.
MAX_NBYTES = "100M"
# Stage 1: worker counts compared at a fixed chunk size.
STAGE_1_N_JOBS = (2, 4, 8, 16)
STAGE_1_CHUNK_SIZE = 200
# Stage 2: chunk sizes tried with the Stage 1 winner's worker count.
STAGE_2_CHUNK_SIZES = (100, 400)
# Output folder for this invocation; the timestamp is taken at import time.
BENCHMARK_ROOT = (
    PROJECT_ROOT
    / "output"
    / f"runtime_benchmark_gobp_corr_reuse_njobs_chunks_{datetime.now():%Y%m%d_%H%M%S}"
)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def build_config(output_folder: Path, n_jobs: int, chunk_size: int) -> dict[str, Any]:
    """Assemble the configuration dict handed to ``flex.initialize``.

    Args:
        output_folder: Folder recorded (as a string) under ``"output_folder"``.
        n_jobs: Worker count recorded under ``per_complex["n_jobs"]``.
        chunk_size: Chunk size recorded under ``per_complex["chunk_size"]``.

    Returns:
        A new dict; the module constants ``GOLD_STANDARD``, ``CORR_FUNCTION``
        and ``MAX_NBYTES`` supply the remaining benchmark-specific values.
    """
    plotting_section = {
        "save_plot": True,
        "show_plot": False,
        "output_type": "png",
    }
    preprocessing_section = {"fill_na": True, "normalize": False}
    per_complex_section = {
        "n_jobs": n_jobs,
        "chunk_size": chunk_size,
        "max_nbytes": MAX_NBYTES,
    }
    logging_section = {"visible_levels": ["DONE", "INFO", "WARNING", "ERROR"]}

    # Key order mirrors the original literal.
    return {
        "min_genes_in_complex": 2,
        "min_genes_per_complex_analysis": 3,
        "output_folder": str(output_folder),
        "gold_standard": GOLD_STANDARD,
        "color_map": "RdYlBu",
        "jaccard": True,
        "analysis_genes": "common",
        "plotting": plotting_section,
        "preprocessing": preprocessing_section,
        "corr_function": CORR_FUNCTION,
        "per_complex": per_complex_section,
        "logging": logging_section,
    }
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def timed_call(
    timings: list[dict[str, Any]],
    stage: str,
    n_jobs: int,
    chunk_size: int,
    step: str,
    operation: Callable[[], Any],
    *,
    is_corr: bool | None = None,
) -> Any:
    """Invoke ``operation`` and append one successful timing row to ``timings``.

    Args:
        timings: List that receives the new row (mutated in place).
        stage: Benchmark stage label stored in the row.
        n_jobs: Worker count stored in the row.
        chunk_size: Chunk size stored in the row.
        step: Label stored in the row's ``"step"`` field.
        operation: Zero-argument callable to time.
        is_corr: Optional flag stored verbatim in the row.

    Returns:
        Whatever ``operation`` returned.  If ``operation`` raises, no row is
        appended and the exception propagates; rows written here always carry
        ``status == "ok"`` and an empty ``error``.
    """
    began = perf_counter()
    outcome = operation()
    elapsed = perf_counter() - began

    row = {
        "gold_standard": GOLD_STANDARD,
        "stage": stage,
        "n_jobs": n_jobs,
        "chunk_size": chunk_size,
        "step": step,
        "seconds": elapsed,
        "corr_function": CORR_FUNCTION,
        "is_corr": is_corr,
        "status": "ok",
        "error": "",
    }
    timings.append(row)
    return outcome
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def run_combination(stage: str, n_jobs: int, chunk_size: int) -> list[dict[str, Any]]:
    """Run the GOBP workflow once for one (``n_jobs``, ``chunk_size``) pair.

    Every pythonflex call is timed through ``timed_call``.  Any ``Exception``
    raised by a step is reported on stderr and recorded as a ``"failed"`` row
    instead of propagating.  A ``"total_runtime"`` row is always appended and
    the rows are written to ``benchmark_results.csv`` in this run's folder.

    Args:
        stage: Stage label stamped on every row.
        n_jobs: Worker count for the config and ``flex.pra_percomplex``.
        chunk_size: Chunk size for the config and ``flex.pra_percomplex``.

    Returns:
        The list of timing rows for this run.
    """
    run_folder = BENCHMARK_ROOT / f"n_jobs_{n_jobs:02d}_chunk_{chunk_size}"
    rows: list[dict[str, Any]] = []
    began = perf_counter()

    def record(label, operation, *, is_corr=None):
        # Thin wrapper so each step only has to state its label and callable.
        return timed_call(
            rows, stage, n_jobs, chunk_size, label, operation, is_corr=is_corr
        )

    def whole_run_row(label, status, error):
        # Row describing the run as a whole, timed from the start of the run.
        return {
            "gold_standard": GOLD_STANDARD,
            "stage": stage,
            "n_jobs": n_jobs,
            "chunk_size": chunk_size,
            "step": label,
            "seconds": perf_counter() - began,
            "corr_function": CORR_FUNCTION,
            "is_corr": None,
            "status": status,
            "error": error,
        }

    try:
        record(
            "initialize",
            lambda: flex.initialize(build_config(run_folder, n_jobs, chunk_size)),
        )
        gene_effect = record(
            "read_gene_effect",
            lambda: pd.read_csv(GENE_EFFECT_PATH, index_col=0),
        )
        inputs = {
            "All screens": {"path": gene_effect, "sort": "high", "color": "#000000"},
        }
        data, _ = record("load_datasets", lambda: flex.load_datasets(inputs))
        record("load_gold_standard", flex.load_gold_standard)

        # The correlation matrix is computed once and reused by both PRA steps.
        name, dataset = next(iter(data.items()))
        corr = record(
            "perform_corr",
            lambda: flex.perform_corr(dataset, CORR_FUNCTION),
        )
        record(
            "pra_is_corr_true",
            lambda: flex.pra(name, corr, is_corr=True),
            is_corr=True,
        )
        record(
            "pra_percomplex_is_corr_true",
            lambda: flex.pra_percomplex(
                name,
                corr,
                is_corr=True,
                chunk_size=chunk_size,
                n_jobs=n_jobs,
            ),
            is_corr=True,
        )
        record("complex_contributions", lambda: flex.complex_contributions(name))
    except Exception as exc:
        print(
            f"Run failed for n_jobs={n_jobs}, chunk_size={chunk_size}: {exc!r}",
            file=sys.stderr,
        )
        rows.append(whole_run_row("failed", "failed", repr(exc)))
    finally:
        # The total row is marked failed as soon as any earlier row failed.
        run_failed = any(row["status"] == "failed" for row in rows)
        rows.append(
            whole_run_row("total_runtime", "failed" if run_failed else "ok", "")
        )
        run_folder.mkdir(parents=True, exist_ok=True)
        pd.DataFrame(rows).to_csv(run_folder / "benchmark_results.csv", index=False)
        gc.collect()

    return rows
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def write_reports(timings: list[dict[str, Any]]) -> None:
    """Write the raw timing rows and a summary of the "ok" rows.

    The raw CSV always gets every row.  The summary CSV is only written when at
    least one row has ``status == "ok"``; it has one row per (gold standard,
    stage, ``n_jobs``, ``chunk_size``, correlation function) combination and
    one column per step name holding that step's seconds.

    Args:
        timings: Timing rows accumulated so far.
    """
    frame = pd.DataFrame(timings)
    comparison_csv = BENCHMARK_ROOT / "benchmark_gobp_njobs_chunks_comparison.csv"
    frame.to_csv(comparison_csv, index=False)

    succeeded = frame.loc[frame["status"] == "ok"].copy()
    if succeeded.empty:
        return

    wide = pd.pivot_table(
        succeeded,
        index=["gold_standard", "stage", "n_jobs", "chunk_size", "corr_function"],
        columns="step",
        values="seconds",
        aggfunc="first",
    )
    wide = wide.reset_index()
    # Drop the leftover "step" axis label so it is not carried on the columns.
    wide.columns.name = None
    summary_csv = BENCHMARK_ROOT / "benchmark_gobp_njobs_chunks_summary.csv"
    wide.to_csv(summary_csv, index=False)
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def select_stage_1_winner(timings: list[dict[str, Any]]) -> int:
    """Return the Stage 1 ``n_jobs`` value with the fastest per-complex step.

    Only rows with ``stage == "stage_1"`` and ``chunk_size ==
    STAGE_1_CHUNK_SIZE`` are considered.  A run is a candidate only if it
    finished cleanly: every ``n_jobs`` value that has a ``status == "failed"``
    row is discarded.  Without that, a run whose per-complex step succeeded but
    which raised in a later step would still have an "ok" per-complex row and
    could be chosen as the winner.  Candidates are ranked by
    ``pra_percomplex_is_corr_true`` seconds, with ``total_runtime`` as the
    tie-breaker when present.

    Args:
        timings: Timing rows produced by the Stage 1 runs.

    Returns:
        The winning ``n_jobs`` value.

    Raises:
        RuntimeError: if no run completed successfully, or the successful runs
            have no per-complex timing rows.
    """
    per_complex_step = "pra_percomplex_is_corr_true"
    total_step = "total_runtime"

    # An empty list would build a frame without columns and fail with KeyError.
    if not timings:
        raise RuntimeError("No successful Stage 1 timings were recorded.")

    raw = pd.DataFrame(timings)
    in_stage_1 = (raw["stage"] == "stage_1") & (
        raw["chunk_size"] == STAGE_1_CHUNK_SIZE
    )

    # Worker counts whose run recorded any failure are not eligible to win.
    failed_n_jobs = raw.loc[in_stage_1 & (raw["status"] == "failed"), "n_jobs"].unique()

    stage_1 = raw[
        in_stage_1
        & (raw["status"] == "ok")
        & raw["step"].isin([per_complex_step, total_step])
        & ~raw["n_jobs"].isin(failed_n_jobs)
    ]
    if stage_1.empty:
        raise RuntimeError("No successful Stage 1 timings were recorded.")

    pivot = stage_1.pivot_table(
        index=["n_jobs"],
        columns="step",
        values="seconds",
        aggfunc="first",
    ).reset_index()
    if per_complex_step not in pivot:
        raise RuntimeError("Stage 1 completed without per-complex timing rows.")

    sort_columns = [per_complex_step]
    if total_step in pivot:
        sort_columns.append(total_step)
    winner = pivot.sort_values(sort_columns, ascending=True).iloc[0]
    return int(winner["n_jobs"])
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def main() -> None:
    """Run the two-stage GOBP benchmark and write the reports.

    Stage 1 tries every value in ``STAGE_1_N_JOBS`` at ``STAGE_1_CHUNK_SIZE``.
    Stage 2 keeps the worker count picked by ``select_stage_1_winner`` and
    tries every value in ``STAGE_2_CHUNK_SIZES``.

    Raises:
        FileNotFoundError: if ``GENE_EFFECT_PATH`` does not exist.
        FileExistsError: if ``BENCHMARK_ROOT`` already exists
            (``exist_ok=False``).
    """
    if not GENE_EFFECT_PATH.exists():
        raise FileNotFoundError(f"Input dataset was not found: {GENE_EFFECT_PATH}")

    BENCHMARK_ROOT.mkdir(parents=True, exist_ok=False)
    collected: list[dict[str, Any]] = []
    print(f"Benchmark output folder: {BENCHMARK_ROOT}")
    print("Plot calls, mPR preparation, and plot_mpr_summary are excluded.")

    # NOTE(review): the reviewed rendering carried no indentation; the report
    # refresh is taken to be part of each loop body - confirm.
    for workers in STAGE_1_N_JOBS:
        print(
            f"Running GOBP Stage 1 benchmark: n_jobs={workers}, "
            f"chunk_size={STAGE_1_CHUNK_SIZE}"
        )
        collected += run_combination("stage_1", workers, STAGE_1_CHUNK_SIZE)
        write_reports(collected)

    winner_n_jobs = select_stage_1_winner(collected)
    print(f"Stage 1 winner by per-complex runtime: n_jobs={winner_n_jobs}")

    for chunk in STAGE_2_CHUNK_SIZES:
        print(
            f"Running GOBP Stage 2 benchmark: n_jobs={winner_n_jobs}, "
            f"chunk_size={chunk}"
        )
        collected += run_combination("stage_2", winner_n_jobs, chunk)
        write_reports(collected)

    summary_csv = BENCHMARK_ROOT / 'benchmark_gobp_njobs_chunks_summary.csv'
    print("Benchmark summary saved to: " f"{summary_csv}")


# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    main()
|