pythonflex 0.3.3__py3-none-any.whl → 0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pythonflex/__init__.py +28 -4
- pythonflex/analysis.py +287 -578
- pythonflex/examples/basic_usage.py +40 -32
- pythonflex/examples/manuscript.py +37 -42
- pythonflex/examples/runtime/runtime_benchmark.py +218 -0
- pythonflex/examples/runtime/runtime_benchmark_10_runs_memmap.py +534 -0
- pythonflex/examples/runtime/runtime_benchmark_corum_njobs.py +245 -0
- pythonflex/examples/runtime/runtime_benchmark_gobp_njobs_chunks.py +319 -0
- pythonflex/examples/runtime/runtime_benchmark_gobp_optimization.py +417 -0
- pythonflex/examples/runtime/runtime_benchmark_repeated.py +347 -0
- pythonflex/old_functions.py +422 -0
- pythonflex/plotting.py +655 -242
- pythonflex/preprocessing.py +62 -60
- pythonflex/utils.py +36 -9
- {pythonflex-0.3.3.dist-info → pythonflex-0.4.dist-info}/METADATA +9 -4
- pythonflex-0.4.dist-info/RECORD +32 -0
- {pythonflex-0.3.3.dist-info → pythonflex-0.4.dist-info}/WHEEL +1 -1
- pythonflex-0.4.dist-info/licenses/LICENSE +7 -0
- pythonflex-0.3.3.dist-info/RECORD +0 -24
- {pythonflex-0.3.3.dist-info → pythonflex-0.4.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,347 @@
|
|
|
1
|
+
"""Run five no-mPR runtime benchmarks for each bundled gold standard.
|
|
2
|
+
|
|
3
|
+
Run from any directory with:
|
|
4
|
+
python path/to/src/pythonflex/examples/runtime/runtime_benchmark_repeated.py
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
import sys
|
|
11
|
+
import argparse
|
|
12
|
+
from datetime import datetime
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from time import perf_counter
|
|
15
|
+
from typing import Any, Callable
|
|
16
|
+
|
|
17
|
+
import pandas as pd
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Locate the directory that directly contains the ``pythonflex`` package by
# walking up from this script instead of relying on a fixed nesting depth.
# This file lives in ``pythonflex/examples/runtime/``, so ``parents[3]`` alone
# resolves to ``src`` (not the project root) in a ``src``-layout checkout,
# which would make SRC_ROOT point at a non-existent ``src/src`` directory.
_SCRIPT_PATH = Path(__file__).resolve()
_PACKAGE_PARENT = next(
    (
        ancestor
        for ancestor in _SCRIPT_PATH.parents
        if (ancestor / "pythonflex" / "__init__.py").is_file()
    ),
    None,
)
if _PACKAGE_PARENT is not None and _PACKAGE_PARENT.name == "src":
    PROJECT_ROOT = _PACKAGE_PARENT.parent
else:
    # No ``src`` layout detected (e.g. an installed wheel): keep the historical
    # depth-based guess, clamped so a shallow path cannot raise IndexError.
    _ancestors = _SCRIPT_PATH.parents
    PROJECT_ROOT = _ancestors[min(3, len(_ancestors) - 1)]
SRC_ROOT = PROJECT_ROOT / "src"
# Make the in-tree package importable ahead of any installed copy.
if str(SRC_ROOT) not in sys.path:
    sys.path.insert(0, str(SRC_ROOT))

# Plot generation is benchmarked and saved without opening interactive windows.
os.environ.setdefault("MPLBACKEND", "Agg")
|
|
27
|
+
|
|
28
|
+
import pythonflex as flex # noqa: E402
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# Input dataset read at the start of every repetition. The default is a
# machine-specific location; set the PYTHONFLEX_GENE_EFFECT_PATH environment
# variable to point at another copy without editing this file. An unset or
# empty variable falls back to the default.
GENE_EFFECT_PATH = Path(
    os.environ.get("PYTHONFLEX_GENE_EFFECT_PATH")
    or "C:/Users/yd/Desktop/projects/_datasets/depmap/25Q2/gene_effect.csv"
)
# Number of repetitions per gold standard; also the upper bound accepted for
# --end-repetition.
RUN_COUNT = 5
# Gold standards that may be selected with --gold-standards.
GOLD_STANDARDS = ("CORUM", "PATHWAY", "GOBP")
# Fresh, timestamped output folder used when --output-root is not given. The
# timestamp is taken once, when this module is loaded.
DEFAULT_BENCHMARK_ROOT = (
    PROJECT_ROOT
    / "output"
    / f"runtime_benchmark_{RUN_COUNT}_runs_no_mpr_{datetime.now():%Y%m%d_%H%M%S}"
)
# Active output root; main() rebinds it when --output-root is supplied.
BENCHMARK_ROOT = DEFAULT_BENCHMARK_ROOT
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def build_config(gold_standard: str, output_folder: Path) -> dict[str, Any]:
    """Assemble the configuration dictionary for one benchmark run.

    Args:
        gold_standard: Value stored under the ``"gold_standard"`` key.
        output_folder: Folder for this run; stored as a string under
            ``"output_folder"``.

    Returns:
        A new dictionary on every call. All settings other than the two
        arguments are fixed constants.
    """
    # Plots are written to disk as PNG files and never shown interactively.
    plotting_options = {
        "save_plot": True,
        "show_plot": False,
        "output_type": "png",
    }
    preprocessing_options = {
        "fill_na": True,
        "normalize": False,
    }
    logging_options = {
        "visible_levels": ["DONE", "INFO", "WARNING"],
    }

    config: dict[str, Any] = {}
    config["min_genes_in_complex"] = 2
    config["min_genes_per_complex_analysis"] = 3
    config["output_folder"] = str(output_folder)
    config["gold_standard"] = gold_standard
    config["color_map"] = "RdYlBu"
    config["jaccard"] = True
    config["analysis_genes"] = "common"
    config["plotting"] = plotting_options
    config["preprocessing"] = preprocessing_options
    config["corr_function"] = "numpy"
    config["logging"] = logging_options
    return config
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def timed_call(
    timings: list[dict[str, Any]],
    gold_standard: str,
    repetition: int,
    step: str,
    operation: Callable[[], Any],
) -> Any:
    """Run ``operation`` and append its wall-clock duration to ``timings``.

    Args:
        timings: List that receives one new record; mutated in place.
        gold_standard: Stored in the record under ``"gold_standard"``.
        repetition: Stored in the record under ``"repetition"``.
        step: Stored in the record under ``"step"``.
        operation: Zero-argument callable to execute and time.

    Returns:
        Whatever ``operation`` returns.

    If ``operation`` raises, the exception propagates and no record is added.
    """
    started_at = perf_counter()
    outcome = operation()
    elapsed = perf_counter() - started_at

    record = {
        "gold_standard": gold_standard,
        "repetition": repetition,
        "step": step,
        "seconds": elapsed,
    }
    timings.append(record)
    return outcome
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def run_repetition(
    gold_standard: str,
    repetition: int,
) -> list[dict[str, Any]]:
    """Execute one full benchmark pass and return its per-step timings.

    Every workflow step is routed through ``timed_call`` so that each one
    contributes a timing record. A final ``"total_runtime"`` record covers the
    whole pass, and all records are written to ``benchmark_results.csv``
    inside ``BENCHMARK_ROOT / gold_standard / run_NN``.

    Args:
        gold_standard: Gold standard name passed to ``build_config``.
        repetition: Repetition number; used in the folder name (zero-padded to
            two digits) and in every timing record.

    Returns:
        The timing records of this pass, ending with ``"total_runtime"``.
    """
    output_folder = BENCHMARK_ROOT / gold_standard / f"run_{repetition:02d}"
    timings: list[dict[str, Any]] = []

    def timed(step: str, operation: Callable[[], Any]) -> Any:
        # Bind the arguments shared by every step of this pass.
        return timed_call(timings, gold_standard, repetition, step, operation)

    workflow_start = perf_counter()

    # --- setup and data loading -------------------------------------------
    timed(
        "initialize",
        lambda: flex.initialize(build_config(gold_standard, output_folder)),
    )
    gene_effect = timed(
        "read_gene_effect",
        lambda: pd.read_csv(GENE_EFFECT_PATH, index_col=0),
    )
    # The already-loaded DataFrame is passed under the "path" key.
    inputs = {
        "All screens": {
            "path": gene_effect,
            "sort": "high",
            "color": "#000000",
        },
    }
    data, _ = timed("load_datasets", lambda: flex.load_datasets(inputs))
    timed("load_gold_standard", flex.load_gold_standard)

    # --- analysis ----------------------------------------------------------
    # Only one dataset was supplied, so the first entry is the one to analyse.
    name, dataset = next(iter(data.items()))
    timed("pra", lambda: flex.pra(name, dataset, is_corr=False))
    timed(
        "pra_percomplex",
        lambda: flex.pra_percomplex(name, dataset, is_corr=False),
    )
    timed("complex_contributions", lambda: flex.complex_contributions(name))

    # --- plotting ----------------------------------------------------------
    timed("plot_precision_recall_curve", flex.plot_precision_recall_curve)
    timed("plot_auc_scores", flex.plot_auc_scores)
    timed("plot_significant_complexes", flex.plot_significant_complexes)
    timed(
        "plot_percomplex_scatter",
        lambda: flex.plot_percomplex_scatter(n_top=20),
    )
    timed("plot_percomplex_scatter_bysize", flex.plot_percomplex_scatter_bysize)
    timed("plot_complex_contributions", flex.plot_complex_contributions)

    # The total is taken before the CSV is written, so report I/O is excluded.
    total_seconds = perf_counter() - workflow_start
    timings.append(
        {
            "gold_standard": gold_standard,
            "repetition": repetition,
            "step": "total_runtime",
            "seconds": total_seconds,
        }
    )

    output_folder.mkdir(parents=True, exist_ok=True)
    results_path = output_folder / "benchmark_results.csv"
    pd.DataFrame(timings).to_csv(results_path, index=False)
    return timings
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def write_reports(timings: list[dict[str, Any]]) -> None:
    """Write the aggregate CSV reports for all timings collected so far.

    Files written under ``BENCHMARK_ROOT``:

    * ``benchmark_results_all_runs.csv`` - every timing record, unmodified.
    * ``benchmark_summary_mean_std.csv`` - count, mean, std, min and max of
      ``seconds`` for each (gold_standard, step) pair.
    * ``benchmark_total_by_repetition.csv`` - the ``"total_runtime"`` seconds
      summed per repetition; only written when such rows exist.

    Args:
        timings: Timing records with ``gold_standard``, ``repetition``,
            ``step`` and ``seconds`` keys.
    """
    frame = pd.DataFrame(timings)
    frame.to_csv(BENCHMARK_ROOT / "benchmark_results_all_runs.csv", index=False)

    aggregations = {
        "repetitions": "count",
        "mean_seconds": "mean",
        "std_seconds": "std",
        "min_seconds": "min",
        "max_seconds": "max",
    }
    seconds_by_step = frame.groupby(["gold_standard", "step"], as_index=False)[
        "seconds"
    ]
    summary = seconds_by_step.agg(**aggregations)
    summary.to_csv(BENCHMARK_ROOT / "benchmark_summary_mean_std.csv", index=False)

    totals = frame[frame["step"] == "total_runtime"]
    if totals.empty:
        return

    summed = totals.groupby("repetition", as_index=False)["seconds"].sum()
    per_repetition_total = summed.rename(
        columns={"seconds": "all_gold_standards_seconds"}
    )
    per_repetition_total.to_csv(
        BENCHMARK_ROOT / "benchmark_total_by_repetition.csv",
        index=False,
    )
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def parse_gold_standards(value: str) -> list[str]:
    """Parse a comma-separated gold standard selection.

    Items are stripped of surrounding whitespace and upper-cased; blank items
    are ignored. Repeated names are collapsed to their first occurrence so the
    same gold standard is not scheduled twice.

    Args:
        value: Comma-separated names, e.g. ``"corum, GOBP"``.

    Returns:
        The selected names, upper-cased, in first-seen order.

    Raises:
        ValueError: If any name is not in ``GOLD_STANDARDS``, or if the
            selection is empty (previously this silently returned ``[]``,
            which made the benchmark run nothing).
    """
    requested = [item.strip().upper() for item in value.split(",") if item.strip()]
    invalid = [item for item in requested if item not in GOLD_STANDARDS]
    if invalid:
        raise ValueError(
            f"Invalid gold standard(s): {invalid}. Choose from {GOLD_STANDARDS}."
        )
    if not requested:
        raise ValueError(
            f"No gold standard selected. Choose from {GOLD_STANDARDS}."
        )
    # dict.fromkeys keeps insertion order, giving an order-preserving de-dupe.
    return list(dict.fromkeys(requested))
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def load_existing_timings() -> list[dict[str, Any]]:
    """Load timing records saved by an earlier run, if there are any.

    Returns:
        One dict per row of ``benchmark_results_all_runs.csv`` under
        ``BENCHMARK_ROOT``, or an empty list when that file does not exist.
    """
    raw_path = BENCHMARK_ROOT / "benchmark_results_all_runs.csv"
    if raw_path.exists():
        previous = pd.read_csv(raw_path)
        return previous.to_dict("records")
    return []
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def has_completed_run(
    timings: list[dict[str, Any]],
    gold_standard: str,
    repetition: int,
) -> bool:
    """Report whether a ``"total_runtime"`` record exists for the given run.

    Args:
        timings: Timing records to search.
        gold_standard: Gold standard name the record must carry.
        repetition: Repetition number the record must carry; the stored value
            is converted with ``int`` before comparing, and a missing value is
            treated as ``-1``.

    Returns:
        ``True`` if a record matches on all three fields, else ``False``.
    """
    for row in timings:
        # Checks run in this order so ``int`` is only applied to rows that
        # already match the gold standard.
        if row.get("gold_standard") != gold_standard:
            continue
        if int(row.get("repetition", -1)) != repetition:
            continue
        if row.get("step") == "total_runtime":
            return True
    return False
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the benchmark script.

    Options:
        --output-root: Benchmark root folder as a ``Path``; ``None`` when
            omitted.
        --gold-standards: Comma-separated names; defaults to every entry of
            ``GOLD_STANDARDS``.
        --start-repetition: First repetition number (default 1).
        --end-repetition: Last repetition number (default ``RUN_COUNT``).

    Returns:
        The parsed namespace. Range and name validation is left to the caller.
    """
    description = (
        "Run repeated no-mPR pythonFLEX runtime benchmarks. "
        "Use --output-root to resume or add runs to an existing output folder."
    )
    cli = argparse.ArgumentParser(description=description)

    # Declared as data so each option reads as one row; order is preserved so
    # the generated --help text is unchanged.
    option_specs = (
        (
            "--output-root",
            {
                "type": Path,
                "default": None,
                "help": (
                    "Existing or new benchmark root. "
                    "Defaults to a new timestamped folder."
                ),
            },
        ),
        (
            "--gold-standards",
            {
                "default": ",".join(GOLD_STANDARDS),
                "help": "Comma-separated subset of CORUM, PATHWAY, GOBP.",
            },
        ),
        (
            "--start-repetition",
            {
                "type": int,
                "default": 1,
                "help": "First repetition number to run.",
            },
        ),
        (
            "--end-repetition",
            {
                "type": int,
                "default": RUN_COUNT,
                "help": "Last repetition number to run.",
            },
        ),
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)

    return cli.parse_args()
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def main() -> None:
    """Run the selected benchmark repetitions and write the reports.

    Command-line arguments are parsed and validated before the input dataset
    is checked, so ``--help`` and argument errors work on machines where the
    dataset is not available. Runs that already have a ``"total_runtime"``
    record in the output root are skipped, which lets an interrupted benchmark
    be resumed with ``--output-root``. Reports are rewritten after every
    completed repetition.

    Raises:
        ValueError: If the gold standard selection or the repetition range is
            invalid, or ``--end-repetition`` exceeds ``RUN_COUNT``.
        FileNotFoundError: If ``GENE_EFFECT_PATH`` does not exist.
        FileExistsError: If no ``--output-root`` was given and the default
            timestamped folder already exists.
    """
    global BENCHMARK_ROOT

    args = parse_args()
    selected_standards = parse_gold_standards(args.gold_standards)
    if args.start_repetition < 1 or args.end_repetition < args.start_repetition:
        raise ValueError("Invalid repetition range.")
    if args.end_repetition > RUN_COUNT:
        raise ValueError(f"end-repetition cannot exceed RUN_COUNT={RUN_COUNT}.")

    if not GENE_EFFECT_PATH.exists():
        raise FileNotFoundError(f"Input dataset was not found: {GENE_EFFECT_PATH}")

    if args.output_root is not None:
        BENCHMARK_ROOT = args.output_root.resolve()

    # A user-supplied root may already exist (resume); the default timestamped
    # folder is expected to be new, so an existing one is treated as an error.
    BENCHMARK_ROOT.mkdir(parents=True, exist_ok=args.output_root is not None)
    all_timings = load_existing_timings()
    print(f"Benchmark output folder: {BENCHMARK_ROOT}")
    print("mPR preparation is excluded from this repeated benchmark.")

    for gold_standard in selected_standards:
        for repetition in range(args.start_repetition, args.end_repetition + 1):
            if has_completed_run(all_timings, gold_standard, repetition):
                print(
                    f"Skipping completed no-mPR benchmark: {gold_standard} "
                    f"repetition {repetition}/{RUN_COUNT}"
                )
                continue
            print(
                f"Running no-mPR benchmark: {gold_standard} "
                f"repetition {repetition}/{RUN_COUNT}"
            )
            all_timings.extend(run_repetition(gold_standard, repetition))
            # Rewrite the reports after each run so progress survives a crash.
            write_reports(all_timings)

    summary_path = BENCHMARK_ROOT / "benchmark_summary_mean_std.csv"
    if summary_path.exists():
        print(f"Benchmark summary saved to: {summary_path}")
    else:
        # Nothing was run and no earlier summary exists; do not claim a file.
        print("No benchmark runs were executed; no summary was written.")
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
# Start the benchmark only when this file is executed as a script, so that
# importing the module does not trigger a run.
if __name__ == "__main__":
    main()
|