PyPI - pythonflex - Versions diffs - 0.3.4__py3-none-any.whl → 0.4__py3-none-any.whl - Mend

pythonflex 0.3.4py3-none-any.whl → 0.4py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

pythonflex/__init__.py +28 -4
pythonflex/analysis.py +287 -579
pythonflex/examples/basic_usage.py +38 -30
pythonflex/examples/manuscript.py +37 -43
pythonflex/examples/runtime/runtime_benchmark.py +218 -0
pythonflex/examples/runtime/runtime_benchmark_10_runs_memmap.py +534 -0
pythonflex/examples/runtime/runtime_benchmark_corum_njobs.py +245 -0
pythonflex/examples/runtime/runtime_benchmark_gobp_njobs_chunks.py +319 -0
pythonflex/examples/runtime/runtime_benchmark_gobp_optimization.py +417 -0
pythonflex/examples/runtime/runtime_benchmark_repeated.py +347 -0
pythonflex/old_functions.py +422 -0
pythonflex/plotting.py +655 -242
pythonflex/preprocessing.py +54 -216
pythonflex/utils.py +36 -9
{pythonflex-0.3.4.dist-info → pythonflex-0.4.dist-info}/METADATA +8 -6
pythonflex-0.4.dist-info/RECORD +32 -0
{pythonflex-0.3.4.dist-info → pythonflex-0.4.dist-info}/WHEEL +1 -1
pythonflex-0.4.dist-info/licenses/LICENSE +7 -0
pythonflex-0.3.4.dist-info/RECORD +0 -24
{pythonflex-0.3.4.dist-info → pythonflex-0.4.dist-info}/entry_points.txt +0 -0

pythonflex/examples/runtime/runtime_benchmark_gobp_optimization.py ADDED Viewed

@@ -0,0 +1,417 @@
+"""Benchmark GOBP runtime optimization strategies.
+Run from any directory with:
+    python path/to/src/pythonflex/examples/runtime/runtime_benchmark_gobp_optimization.py
+"""
+from __future__ import annotations
+import gc
+import os
+import sys
+from datetime import datetime
+from pathlib import Path
+from time import perf_counter
+from typing import Any, Callable
+import pandas as pd
+PROJECT_ROOT = Path(__file__).resolve().parents[3]
+SRC_ROOT = PROJECT_ROOT / "src"
+if str(SRC_ROOT) not in sys.path:
+    sys.path.insert(0, str(SRC_ROOT))
+# Set MPLBACKEND (matplotlib's backend-selection variable) before pythonflex is
+# imported; setdefault leaves any value already present in the environment
+# untouched.  Presumably pythonflex imports matplotlib on import -- confirm.
+os.environ.setdefault("MPLBACKEND", "Agg")
+import pythonflex as flex  # noqa: E402
+from pythonflex import analysis as flex_analysis  # noqa: E402
+# Identifier printed at the start of main(); presumably the git commit the
+# benchmark is meant to be compared against -- confirm.
+CHECKPOINT_COMMIT = "33b8ae8"
+# Input CSV read with pandas in run_combination(); main() raises
+# FileNotFoundError when it does not exist.
+# NOTE(review): absolute, machine-specific path -- must be edited per machine.
+GENE_EFFECT_PATH = Path(
+    "C:/Users/yd/Desktop/projects/_datasets/depmap/25Q2/gene_effect.csv"
+)
+# Stored as config["gold_standard"] and in every timing row.
+GOLD_STANDARD = "GOBP"
+# Stored as config["corr_function"], passed to flex.perform_corr, and recorded
+# in every timing row.
+CORR_FUNCTION = "numpy_without_mask"
+# Stored as config["per_complex"]["max_nbytes"].
+MAX_NBYTES = "100M"
+# (n_jobs, chunk_size) pairs that main() runs for every strategy.
+COMBINATIONS = ((8, 200), (8, 400), (8, 100))
+# (n_jobs, chunk_size) that main() re-runs with after a failed n_jobs == 8 run.
+FALLBACK_COMBINATION = (4, 200)
+# Strategy names iterated by main() and dispatched in run_strategy_step().
+STRATEGIES = (
+    "baseline_memmap",
+    "no_memmap_threading",
+    "worker_globals_threading",
+    "shared_pairwise_reuse",
+)
+# Output directory for this run; the timestamp is taken once, when the module
+# is imported.  main() creates it with exist_ok=False.
+BENCHMARK_ROOT = (
+    PROJECT_ROOT
+    / "output"
+    / f"runtime_benchmark_gobp_optimization_{datetime.now():%Y%m%d_%H%M%S}"
+)
+def build_config(output_folder: Path, n_jobs: int, chunk_size: int) -> dict[str, Any]:
+    return {
+        "min_genes_in_complex": 2,
+        "min_genes_per_complex_analysis": 3,
+        "output_folder": str(output_folder),
+        "gold_standard": GOLD_STANDARD,
+        "color_map": "RdYlBu",
+        "jaccard": True,
+        "analysis_genes": "common",
+        "plotting": {
+            "save_plot": True,
+            "show_plot": False,
+            "output_type": "png",
+        },
+        "preprocessing": {
+            "fill_na": True,
+            "normalize": False,
+        },
+        "corr_function": CORR_FUNCTION,
+        "per_complex": {
+            "n_jobs": n_jobs,
+            "chunk_size": chunk_size,
+            "max_nbytes": MAX_NBYTES,
+        },
+        "logging": {
+            "visible_levels": ["DONE", "INFO", "WARNING", "ERROR"],
+        },
+    }
+def timed_call(
+    timings: list[dict[str, Any]],
+    strategy: str,
+    n_jobs: int,
+    chunk_size: int,
+    fallback: bool,
+    step: str,
+    operation: Callable[[], Any],
+    *,
+    is_corr: bool | None = None,
+) -> Any:
+    start = perf_counter()
+    result = operation()
+    timings.append(
+        {
+            "gold_standard": GOLD_STANDARD,
+            "strategy": strategy,
+            "n_jobs": n_jobs,
+            "chunk_size": chunk_size,
+            "fallback": fallback,
+            "step": step,
+            "seconds": perf_counter() - start,
+            "corr_function": CORR_FUNCTION,
+            "is_corr": is_corr,
+            "status": "ok",
+            "error": "",
+        }
+    )
+    return result
+def run_strategy_step(
+    timings: list[dict[str, Any]],
+    strategy: str,
+    n_jobs: int,
+    chunk_size: int,
+    fallback: bool,
+    name: str,
+    corr: pd.DataFrame,
+) -> None:
+    if strategy == "baseline_memmap":
+        timed_call(
+            timings,
+            strategy,
+            n_jobs,
+            chunk_size,
+            fallback,
+            "pra_is_corr_true",
+            lambda: flex.pra(name, corr, is_corr=True),
+            is_corr=True,
+        )
+        timed_call(
+            timings,
+            strategy,
+            n_jobs,
+            chunk_size,
+            fallback,
+            "pra_percomplex_is_corr_true",
+            lambda: flex.pra_percomplex(
+                name,
+                corr,
+                is_corr=True,
+                chunk_size=chunk_size,
+                n_jobs=n_jobs,
+            ),
+            is_corr=True,
+        )
+    elif strategy in {"no_memmap_threading", "worker_globals_threading"}:
+        timed_call(
+            timings,
+            strategy,
+            n_jobs,
+            chunk_size,
+            fallback,
+            "pra_is_corr_true",
+            lambda: flex.pra(name, corr, is_corr=True),
+            is_corr=True,
+        )
+        timed_call(
+            timings,
+            strategy,
+            n_jobs,
+            chunk_size,
+            fallback,
+            "pra_percomplex_is_corr_true",
+            lambda: flex_analysis._pra_percomplex_benchmark_strategy(
+                name,
+                corr,
+                is_corr=True,
+                chunk_size=chunk_size,
+                n_jobs=n_jobs,
+                strategy=strategy,
+            ),
+            is_corr=True,
+        )
+    elif strategy == "shared_pairwise_reuse":
+        prepared: dict[str, Any] = {}
+        def run_shared_pra() -> pd.DataFrame:
+            terms, pairwise_df, gene_to_pair_indices = flex_analysis._prepare_pairwise_for_analysis(
+                name,
+                corr,
+                is_corr=True,
+                build_gene_index=True,
+            )
+            prepared["terms"] = terms
+            prepared["pairwise_df"] = pairwise_df
+            prepared["gene_to_pair_indices"] = gene_to_pair_indices
+            return flex_analysis._save_global_pra_from_pairwise(name, pairwise_df)
+        timed_call(
+            timings,
+            strategy,
+            n_jobs,
+            chunk_size,
+            fallback,
+            "pra_is_corr_true",
+            run_shared_pra,
+            is_corr=True,
+        )
+        timed_call(
+            timings,
+            strategy,
+            n_jobs,
+            chunk_size,
+            fallback,
+            "pra_percomplex_is_corr_true",
+            lambda: flex_analysis._pra_percomplex_from_pairwise(
+                name,
+                prepared["terms"],
+                prepared["pairwise_df"],
+                prepared["gene_to_pair_indices"],
+                chunk_size=chunk_size,
+                n_jobs=n_jobs,
+                strategy="shared_pairwise_reuse",
+            ),
+            is_corr=True,
+        )
+    else:
+        raise ValueError(f"Unknown strategy: {strategy}")
+def run_combination(
+    strategy: str,
+    n_jobs: int,
+    chunk_size: int,
+    *,
+    fallback: bool = False,
+) -> list[dict[str, Any]]:
+    suffix = f"n_jobs_{n_jobs:02d}_chunk_{chunk_size}"
+    if fallback:
+        suffix = f"fallback_{suffix}"
+    output_folder = BENCHMARK_ROOT / strategy / suffix
+    timings: list[dict[str, Any]] = []
+    workflow_start = perf_counter()
+    try:
+        timed_call(
+            timings,
+            strategy,
+            n_jobs,
+            chunk_size,
+            fallback,
+            "initialize",
+            lambda: flex.initialize(build_config(output_folder, n_jobs, chunk_size)),
+        )
+        gene_effect = timed_call(
+            timings,
+            strategy,
+            n_jobs,
+            chunk_size,
+            fallback,
+            "read_gene_effect",
+            lambda: pd.read_csv(GENE_EFFECT_PATH, index_col=0),
+        )
+        inputs = {
+            "All screens": {
+                "path": gene_effect,
+                "sort": "high",
+                "color": "#000000",
+            },
+        }
+        data, _ = timed_call(
+            timings,
+            strategy,
+            n_jobs,
+            chunk_size,
+            fallback,
+            "load_datasets",
+            lambda: flex.load_datasets(inputs),
+        )
+        timed_call(
+            timings,
+            strategy,
+            n_jobs,
+            chunk_size,
+            fallback,
+            "load_gold_standard",
+            flex.load_gold_standard,
+        )
+        name, dataset = next(iter(data.items()))
+        corr = timed_call(
+            timings,
+            strategy,
+            n_jobs,
+            chunk_size,
+            fallback,
+            "perform_corr",
+            lambda: flex.perform_corr(dataset, CORR_FUNCTION),
+        )
+        run_strategy_step(timings, strategy, n_jobs, chunk_size, fallback, name, corr)
+        timed_call(
+            timings,
+            strategy,
+            n_jobs,
+            chunk_size,
+            fallback,
+            "complex_contributions",
+            lambda: flex.complex_contributions(name),
+        )
+    except Exception as exc:
+        print(
+            f"Run failed for strategy={strategy}, n_jobs={n_jobs}, "
+            f"chunk_size={chunk_size}: {exc!r}",
+            file=sys.stderr,
+        )
+        timings.append(
+            {
+                "gold_standard": GOLD_STANDARD,
+                "strategy": strategy,
+                "n_jobs": n_jobs,
+                "chunk_size": chunk_size,
+                "fallback": fallback,
+                "step": "failed",
+                "seconds": perf_counter() - workflow_start,
+                "corr_function": CORR_FUNCTION,
+                "is_corr": None,
+                "status": "failed",
+                "error": repr(exc),
+            }
+        )
+    finally:
+        timings.append(
+            {
+                "gold_standard": GOLD_STANDARD,
+                "strategy": strategy,
+                "n_jobs": n_jobs,
+                "chunk_size": chunk_size,
+                "fallback": fallback,
+                "step": "total_runtime",
+                "seconds": perf_counter() - workflow_start,
+                "corr_function": CORR_FUNCTION,
+                "is_corr": None,
+                "status": "ok" if not any(t["status"] == "failed" for t in timings) else "failed",
+                "error": "",
+            }
+        )
+        output_folder.mkdir(parents=True, exist_ok=True)
+        pd.DataFrame(timings).to_csv(output_folder / "benchmark_results.csv", index=False)
+        gc.collect()
+    return timings
+def write_reports(timings: list[dict[str, Any]]) -> None:
+    raw = pd.DataFrame(timings)
+    raw.to_csv(BENCHMARK_ROOT / "benchmark_gobp_optimization_comparison.csv", index=False)
+    ok = raw[raw["status"] == "ok"].copy()
+    if ok.empty:
+        return
+    summary = ok.pivot_table(
+        index=[
+            "gold_standard",
+            "strategy",
+            "n_jobs",
+            "chunk_size",
+            "fallback",
+            "corr_function",
+        ],
+        columns="step",
+        values="seconds",
+        aggfunc="first",
+    ).reset_index()
+    summary.columns.name = None
+    summary.to_csv(BENCHMARK_ROOT / "benchmark_gobp_optimization_summary.csv", index=False)
+def main() -> None:
+    if not GENE_EFFECT_PATH.exists():
+        raise FileNotFoundError(f"Input dataset was not found: {GENE_EFFECT_PATH}")
+    BENCHMARK_ROOT.mkdir(parents=True, exist_ok=False)
+    all_timings: list[dict[str, Any]] = []
+    print(f"Checkpoint commit: {CHECKPOINT_COMMIT}")
+    print(f"Benchmark output folder: {BENCHMARK_ROOT}")
+    print("Plot calls, mPR preparation, and plot_mpr_summary are excluded.")
+    for strategy in STRATEGIES:
+        for n_jobs, chunk_size in COMBINATIONS:
+            print(
+                f"Running GOBP optimization benchmark: strategy={strategy}, "
+                f"n_jobs={n_jobs}, chunk_size={chunk_size}"
+            )
+            timings = run_combination(strategy, n_jobs, chunk_size)
+            all_timings.extend(timings)
+            write_reports(all_timings)
+            failed = any(row["status"] == "failed" for row in timings)
+            if failed and n_jobs == 8:
+                fallback_n_jobs, fallback_chunk_size = FALLBACK_COMBINATION
+                print(
+                    f"Running fallback for strategy={strategy}: "
+                    f"n_jobs={fallback_n_jobs}, chunk_size={fallback_chunk_size}"
+                )
+                all_timings.extend(
+                    run_combination(
+                        strategy,
+                        fallback_n_jobs,
+                        fallback_chunk_size,
+                        fallback=True,
+                    )
+                )
+                write_reports(all_timings)
+    print(
+        "Benchmark summary saved to: "
+        f"{BENCHMARK_ROOT / 'benchmark_gobp_optimization_summary.csv'}"
+    )
+if __name__ == "__main__":
+    main()

pythonflex 0.3.4__py3-none-any.whl → 0.4__py3-none-any.whl

pythonflex 0.3.4py3-none-any.whl → 0.4py3-none-any.whl