PyPI - pythonflex - Versions diffs - 0.3.4__py3-none-any.whl → 0.4__py3-none-any.whl - Mend

pythonflex 0.3.4__py3-none-any.whl → 0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

pythonflex/__init__.py +28 -4
pythonflex/analysis.py +287 -579
pythonflex/examples/basic_usage.py +38 -30
pythonflex/examples/manuscript.py +37 -43
pythonflex/examples/runtime/runtime_benchmark.py +218 -0
pythonflex/examples/runtime/runtime_benchmark_10_runs_memmap.py +534 -0
pythonflex/examples/runtime/runtime_benchmark_corum_njobs.py +245 -0
pythonflex/examples/runtime/runtime_benchmark_gobp_njobs_chunks.py +319 -0
pythonflex/examples/runtime/runtime_benchmark_gobp_optimization.py +417 -0
pythonflex/examples/runtime/runtime_benchmark_repeated.py +347 -0
pythonflex/old_functions.py +422 -0
pythonflex/plotting.py +655 -242
pythonflex/preprocessing.py +54 -216
pythonflex/utils.py +36 -9
{pythonflex-0.3.4.dist-info → pythonflex-0.4.dist-info}/METADATA +8 -6
pythonflex-0.4.dist-info/RECORD +32 -0
{pythonflex-0.3.4.dist-info → pythonflex-0.4.dist-info}/WHEEL +1 -1
pythonflex-0.4.dist-info/licenses/LICENSE +7 -0
pythonflex-0.3.4.dist-info/RECORD +0 -24
{pythonflex-0.3.4.dist-info → pythonflex-0.4.dist-info}/entry_points.txt +0 -0

pythonflex/examples/runtime/runtime_benchmark_10_runs_memmap.py ADDED Viewed

@@ -0,0 +1,534 @@
+"""Run 10 core runtime benchmarks for CORUM, PATHWAY, and GOBP.
+This benchmark uses one in-memory correlation matrix per run and the package's
+memmap + process parallelism path for per-complex PRA.
+Run from the project root with:
+    python src/pythonflex/examples/runtime/runtime_benchmark_10_runs_memmap.py
+"""
+from __future__ import annotations
+import argparse
+import os
+import sys
+import traceback
+from datetime import datetime
+from pathlib import Path
+from time import perf_counter
+from typing import Any, Callable
+import pandas as pd
+# Locate the project root by walking up from this file until a directory that
+# contains ``src/pythonflex`` is found. This script sits in
+# ``examples/runtime/``, so a fixed ``parents[3]`` lands on ``src`` (not the
+# project root) in a source checkout; the fixed index is kept only as the
+# fallback for layouts where no such directory exists.
+_THIS_FILE = Path(__file__).resolve()
+PROJECT_ROOT = next(
+    (
+        candidate
+        for candidate in _THIS_FILE.parents
+        if (candidate / "src" / "pythonflex").is_dir()
+    ),
+    _THIS_FILE.parents[3],
+)
+SRC_ROOT = PROJECT_ROOT / "src"
+# Put the source tree first on sys.path so the pythonflex import that follows
+# can resolve to the checkout.
+if str(SRC_ROOT) not in sys.path:
+    sys.path.insert(0, str(SRC_ROOT))
+# Default matplotlib to the non-interactive Agg backend unless the caller has
+# already chosen one.
+os.environ.setdefault("MPLBACKEND", "Agg")
+import pythonflex as flex  # noqa: E402
+# Default input CSV; used when --gene-effect-path is not given.
+# NOTE(review): this is an absolute path on one developer's machine, so other
+# users will normally have to pass --gene-effect-path explicitly.
+DEFAULT_GENE_EFFECT_PATH = Path(
+    "C:/Users/yd/Desktop/projects/_datasets/depmap/25Q2/gene_effect.csv"
+)
+# Default number of repetitions per gold standard (--runs).
+DEFAULT_RUN_COUNT = 10
+# Gold-standard names accepted by --gold-standards.
+GOLD_STANDARDS = ("CORUM", "PATHWAY", "GOBP")
+# Defaults for the per-complex settings (--n-jobs, --chunk-size, --max-nbytes).
+DEFAULT_N_JOBS = 8
+DEFAULT_CHUNK_SIZE = 400
+DEFAULT_MAX_NBYTES = "100M"
+# Default value for --corr-function.
+DEFAULT_CORR_FUNCTION = "numpy_without_mask"
+# Default output folder, timestamped once at import time.
+# NOTE(review): the folder name hard-codes "njobs8_chunk400"; it does not follow
+# DEFAULT_N_JOBS / DEFAULT_CHUNK_SIZE or the values passed on the command line.
+DEFAULT_BENCHMARK_ROOT = (
+    PROJECT_ROOT
+    / "output"
+    / f"runtime_benchmark_10_runs_memmap_njobs8_chunk400_{datetime.now():%Y%m%d_%H%M%S}"
+)
+def build_config(
+    gold_standard: str,
+    output_folder: Path,
+    corr_function: str,
+    n_jobs: int,
+    chunk_size: int,
+    max_nbytes: str,
+) -> dict[str, Any]:
+    """Assemble the configuration dictionary for one benchmark run.
+    The arguments fill the run-specific entries (gold standard, output folder,
+    correlation function and the ``per_complex`` settings); every other entry
+    is a fixed benchmark value. ``output_folder`` is stored as a string.
+    """
+    plotting_options = dict(save_plot=True, show_plot=False, output_type="png")
+    preprocessing_options = dict(fill_na=True, normalize=False)
+    per_complex_options = dict(
+        n_jobs=n_jobs,
+        chunk_size=chunk_size,
+        max_nbytes=max_nbytes,
+    )
+    logging_options = dict(visible_levels=["DONE", "INFO", "WARNING", "ERROR"])
+    config: dict[str, Any] = {}
+    config["min_genes_in_complex"] = 2
+    config["min_genes_per_complex_analysis"] = 3
+    config["output_folder"] = str(output_folder)
+    config["gold_standard"] = gold_standard
+    config["color_map"] = "RdYlBu"
+    config["jaccard"] = True
+    config["analysis_genes"] = "common"
+    config["plotting"] = plotting_options
+    config["preprocessing"] = preprocessing_options
+    config["corr_function"] = corr_function
+    config["per_complex"] = per_complex_options
+    config["logging"] = logging_options
+    return config
+def base_row(
+    gold_standard: str,
+    repetition: int,
+    step: str,
+    n_jobs: int,
+    chunk_size: int,
+    max_nbytes: str,
+    corr_function: str,
+    is_corr: bool | None,
+) -> dict[str, Any]:
+    """Create a timing record pre-filled with the run settings.
+    The record starts with ``seconds`` set to ``None``, ``status`` set to
+    ``"success"`` and an empty ``error`` string.
+    """
+    record: dict[str, Any] = dict(
+        gold_standard=gold_standard,
+        repetition=repetition,
+        step=step,
+        seconds=None,
+        n_jobs=n_jobs,
+        chunk_size=chunk_size,
+        max_nbytes=max_nbytes,
+        corr_function=corr_function,
+        is_corr=is_corr,
+    )
+    record["status"] = "success"
+    record["error"] = ""
+    return record
+def timed_call(
+    timings: list[dict[str, Any]],
+    gold_standard: str,
+    repetition: int,
+    step: str,
+    operation: Callable[[], Any],
+    n_jobs: int,
+    chunk_size: int,
+    max_nbytes: str,
+    corr_function: str,
+    is_corr: bool | None = None,
+) -> Any:
+    """Run ``operation``, append its timing record to ``timings``, return its result.
+    If ``operation`` raises an ``Exception``, the record is appended with
+    status ``"failed"``, the elapsed time and the formatted exception message,
+    and the exception is re-raised.
+    """
+    record = base_row(
+        gold_standard=gold_standard,
+        repetition=repetition,
+        step=step,
+        n_jobs=n_jobs,
+        chunk_size=chunk_size,
+        max_nbytes=max_nbytes,
+        corr_function=corr_function,
+        is_corr=is_corr,
+    )
+    began = perf_counter()
+    try:
+        outcome = operation()
+    except Exception as exc:
+        elapsed = perf_counter() - began
+        message = "".join(traceback.format_exception_only(type(exc), exc)).strip()
+        record.update(seconds=elapsed, status="failed", error=message)
+        timings.append(record)
+        raise
+    else:
+        record["seconds"] = perf_counter() - began
+        timings.append(record)
+        return outcome
+def add_total_runtime(
+    timings: list[dict[str, Any]],
+    gold_standard: str,
+    repetition: int,
+    seconds: float,
+    n_jobs: int,
+    chunk_size: int,
+    max_nbytes: str,
+    corr_function: str,
+    status: str,
+    error: str = "",
+) -> None:
+    """Append a ``total_runtime`` record for one repetition to ``timings``.
+    The record carries the given ``seconds``, ``status`` and ``error``;
+    ``is_corr`` is always ``None`` for this summary row.
+    """
+    record = base_row(
+        gold_standard=gold_standard,
+        repetition=repetition,
+        step="total_runtime",
+        n_jobs=n_jobs,
+        chunk_size=chunk_size,
+        max_nbytes=max_nbytes,
+        corr_function=corr_function,
+        is_corr=None,
+    )
+    record.update(seconds=seconds, status=status, error=error)
+    timings.append(record)
+def run_repetition(
+    gold_standard: str,
+    repetition: int,
+    benchmark_root: Path,
+    gene_effect_path: Path,
+    corr_function: str,
+    n_jobs: int,
+    chunk_size: int,
+    max_nbytes: str,
+) -> list[dict[str, Any]]:
+    """Run one timed benchmark repetition and return its timing records.
+    Each workflow step is timed separately. Any ``Exception`` stops the
+    remaining steps, is written to ``benchmark_error.txt`` in the run folder
+    and is not re-raised. A ``total_runtime`` record is always appended and all
+    records are saved to ``benchmark_results.csv`` in the run folder.
+    """
+    run_dir = benchmark_root / gold_standard / f"run_{repetition:02d}"
+    run_dir.mkdir(parents=True, exist_ok=True)
+    records: list[dict[str, Any]] = []
+    def record_step(
+        step: str,
+        operation: Callable[[], Any],
+        is_corr: bool | None = None,
+    ) -> Any:
+        # Forward the settings shared by every step of this repetition.
+        return timed_call(
+            records,
+            gold_standard,
+            repetition,
+            step,
+            operation,
+            n_jobs,
+            chunk_size,
+            max_nbytes,
+            corr_function,
+            is_corr,
+        )
+    def make_config() -> dict[str, Any]:
+        # Called inside the timed "initialize" step, so building it is timed too.
+        return build_config(
+            gold_standard,
+            run_dir,
+            corr_function,
+            n_jobs,
+            chunk_size,
+            max_nbytes,
+        )
+    started = perf_counter()
+    outcome = "success"
+    message = ""
+    try:
+        record_step("initialize", lambda: flex.initialize(make_config()))
+        gene_effect = record_step(
+            "read_gene_effect",
+            lambda: pd.read_csv(gene_effect_path, index_col=0),
+        )
+        inputs = {
+            "All screens": {
+                "path": gene_effect,
+                "sort": "high",
+                "color": "#000000",
+            },
+        }
+        datasets, _ = record_step(
+            "load_datasets",
+            lambda: flex.load_datasets(inputs),
+        )
+        record_step("load_gold_standard", flex.load_gold_standard)
+        # Only the first loaded dataset is benchmarked.
+        name, dataset = next(iter(datasets.items()))
+        corr = record_step(
+            "perform_corr",
+            lambda: flex.perform_corr(dataset, corr_function),
+            is_corr=False,
+        )
+        record_step(
+            "pra_is_corr_true",
+            lambda: flex.pra(name, corr, is_corr=True),
+            is_corr=True,
+        )
+        record_step(
+            "pra_percomplex_is_corr_true",
+            lambda: flex.pra_percomplex(
+                name,
+                corr,
+                is_corr=True,
+                n_jobs=n_jobs,
+                chunk_size=chunk_size,
+            ),
+            is_corr=True,
+        )
+        record_step(
+            "complex_contributions",
+            lambda: flex.complex_contributions(name),
+        )
+    except Exception as exc:
+        outcome = "failed"
+        message = "".join(traceback.format_exception_only(type(exc), exc)).strip()
+        error_file = run_dir / "benchmark_error.txt"
+        error_file.write_text(traceback.format_exc(), encoding="utf-8")
+    finally:
+        add_total_runtime(
+            records,
+            gold_standard,
+            repetition,
+            perf_counter() - started,
+            n_jobs,
+            chunk_size,
+            max_nbytes,
+            corr_function,
+            outcome,
+            message,
+        )
+        results_file = run_dir / "benchmark_results.csv"
+        pd.DataFrame(records).to_csv(results_file, index=False)
+    return records
+def write_reports(timings: list[dict[str, Any]], benchmark_root: Path) -> None:
+    """Write the raw, summary, pivot and total-runtime CSV reports.
+    Files written into ``benchmark_root`` (created if missing):
+    ``benchmark_results_all_runs.csv`` (every record),
+    ``benchmark_summary_mean_std.csv`` (count/mean/std/min/max of ``seconds``
+    per gold standard and step, successful records only),
+    ``benchmark_pivot_by_run.csv`` (one row per gold standard and repetition,
+    one column per step) and ``benchmark_total_by_repetition.csv`` (only the
+    ``total_runtime`` records).
+    An empty ``timings`` list produces header-only files instead of raising.
+    """
+    benchmark_root.mkdir(parents=True, exist_ok=True)
+    raw_columns = [
+        "gold_standard",
+        "repetition",
+        "step",
+        "seconds",
+        "n_jobs",
+        "chunk_size",
+        "max_nbytes",
+        "corr_function",
+        "is_corr",
+        "status",
+        "error",
+    ]
+    summary_columns = [
+        "gold_standard",
+        "step",
+        "count",
+        "mean",
+        "std",
+        "min",
+        "max",
+    ]
+    # pd.DataFrame([]) has no columns, so the "status"/"step" lookups below
+    # would raise KeyError; give the empty frame the record columns instead.
+    if timings:
+        raw = pd.DataFrame(timings)
+    else:
+        raw = pd.DataFrame(columns=raw_columns)
+    raw.to_csv(benchmark_root / "benchmark_results_all_runs.csv", index=False)
+    success = raw[raw["status"] == "success"]
+    if success.empty:
+        summary = pd.DataFrame(columns=summary_columns)
+    else:
+        summary = (
+            success.groupby(["gold_standard", "step"], as_index=False)["seconds"]
+            .agg(count="count", mean="mean", std="std", min="min", max="max")
+        )
+    summary.to_csv(benchmark_root / "benchmark_summary_mean_std.csv", index=False)
+    if raw.empty:
+        # Nothing to pivot; keep the two index columns as the header.
+        pivot = pd.DataFrame(columns=["gold_standard", "repetition"])
+    else:
+        pivot = raw.pivot_table(
+            index=["gold_standard", "repetition"],
+            columns="step",
+            values="seconds",
+            aggfunc="first",
+        ).reset_index()
+        pivot.columns.name = None
+    pivot.to_csv(benchmark_root / "benchmark_pivot_by_run.csv", index=False)
+    total_rows = raw[raw["step"] == "total_runtime"].copy()
+    total_rows.to_csv(
+        benchmark_root / "benchmark_total_by_repetition.csv",
+        index=False,
+    )
+def parse_gold_standards(value: str) -> list[str]:
+    """Parse a comma-separated gold-standard selection.
+    Names are stripped and upper-cased, blank entries are ignored and
+    duplicates are dropped while keeping first-seen order, so the same
+    benchmark is not run (and its output folder overwritten) twice.
+    Raises:
+        ValueError: if a name is not in ``GOLD_STANDARDS`` or nothing is selected.
+    """
+    requested = [item.strip().upper() for item in value.split(",") if item.strip()]
+    invalid = [item for item in requested if item not in GOLD_STANDARDS]
+    if invalid:
+        raise ValueError(
+            f"Invalid gold standard(s): {invalid}. Choose from {GOLD_STANDARDS}."
+        )
+    # dict.fromkeys de-duplicates while preserving insertion order.
+    selected = list(dict.fromkeys(requested))
+    if not selected:
+        raise ValueError(
+            f"No gold standard selected. Choose from {GOLD_STANDARDS}."
+        )
+    return selected
+def load_existing_timings(benchmark_root: Path) -> list[dict[str, Any]]:
+    """Load timing records already saved under ``benchmark_root``.
+    Returns the rows of ``benchmark_results_all_runs.csv`` as dictionaries, or
+    an empty list when that file does not exist.
+    """
+    csv_path = benchmark_root.joinpath("benchmark_results_all_runs.csv")
+    if csv_path.exists():
+        frame = pd.read_csv(csv_path)
+        return frame.to_dict("records")
+    return []
+def completed_runs(timings: list[dict[str, Any]]) -> set[tuple[str, int]]:
+    """Return the ``(gold_standard, repetition)`` pairs that have a total row.
+    A pair counts as soon as it has a ``total_runtime`` record, whatever that
+    record's status is.
+    """
+    if not timings:
+        return set()
+    frame = pd.DataFrame(timings)
+    totals = frame.loc[frame["step"] == "total_runtime"]
+    names = totals["gold_standard"].astype(str)
+    repetitions = totals["repetition"].astype(int)
+    return {(name, rep) for name, rep in zip(names, repetitions)}
+def parse_args() -> argparse.Namespace:
+    """Parse the benchmark's command-line options.
+    Options are declared in a table and registered in the listed order.
+    """
+    option_specs: list[tuple[str, dict[str, Any]]] = [
+        (
+            "--output-root",
+            {
+                "type": Path,
+                "default": DEFAULT_BENCHMARK_ROOT,
+                "help": "Benchmark output root. Defaults to a timestamped output folder.",
+            },
+        ),
+        (
+            "--gene-effect-path",
+            {
+                "type": Path,
+                "default": DEFAULT_GENE_EFFECT_PATH,
+                "help": "Path to DepMap gene_effect.csv.",
+            },
+        ),
+        (
+            "--gold-standards",
+            {
+                "default": ",".join(GOLD_STANDARDS),
+                "help": "Comma-separated subset of CORUM,PATHWAY,GOBP.",
+            },
+        ),
+        (
+            "--runs",
+            {
+                "type": int,
+                "default": DEFAULT_RUN_COUNT,
+                "help": "Number of repetitions per gold standard.",
+            },
+        ),
+        (
+            "--start-repetition",
+            {
+                "type": int,
+                "default": 1,
+                "help": "First repetition to run, inclusive.",
+            },
+        ),
+        (
+            "--end-repetition",
+            {
+                "type": int,
+                "default": None,
+                "help": "Last repetition to run, inclusive. Defaults to --runs.",
+            },
+        ),
+        (
+            "--skip-existing",
+            {
+                "action": "store_true",
+                "help": "Skip gold-standard/repetition pairs already present in the raw CSV.",
+            },
+        ),
+        (
+            "--corr-function",
+            {
+                "default": DEFAULT_CORR_FUNCTION,
+                "help": "Correlation implementation to use.",
+            },
+        ),
+        (
+            "--n-jobs",
+            {
+                "type": int,
+                "default": DEFAULT_N_JOBS,
+                "help": "Per-complex process worker count.",
+            },
+        ),
+        (
+            "--chunk-size",
+            {
+                "type": int,
+                "default": DEFAULT_CHUNK_SIZE,
+                "help": "Number of terms per per-complex joblib task.",
+            },
+        ),
+        (
+            "--max-nbytes",
+            {
+                "default": DEFAULT_MAX_NBYTES,
+                "help": "Joblib max_nbytes setting from config.",
+            },
+        ),
+    ]
+    cli = argparse.ArgumentParser(
+        description="Run repeated core runtime benchmarks with memmap per-complex PRA."
+    )
+    for flag, options in option_specs:
+        cli.add_argument(flag, **options)
+    return cli.parse_args()
+def main() -> None:
+    """Validate the CLI options, run the selected repetitions, write reports.
+    Existing records under the output root are loaded first. With
+    ``--skip-existing``, pairs that already have a ``total_runtime`` record are
+    skipped; otherwise a re-run pair replaces its earlier records. Reports are
+    rewritten after every repetition so partial progress survives interruption.
+    Raises:
+        ValueError: for an invalid gold standard or repetition range.
+        FileNotFoundError: if the gene-effect CSV does not exist.
+    """
+    args = parse_args()
+    selected_gold_standards = parse_gold_standards(args.gold_standards)
+    if args.runs <= 0:
+        raise ValueError("--runs must be greater than 0.")
+    # Test against None rather than truthiness so an explicit
+    # "--end-repetition 0" is rejected below instead of silently becoming --runs.
+    end_repetition = (
+        args.runs if args.end_repetition is None else args.end_repetition
+    )
+    if args.start_repetition <= 0:
+        raise ValueError("--start-repetition must be greater than 0.")
+    if end_repetition < args.start_repetition:
+        raise ValueError("--end-repetition must be >= --start-repetition.")
+    if end_repetition > args.runs:
+        raise ValueError("--end-repetition cannot be greater than --runs.")
+    if not args.gene_effect_path.exists():
+        raise FileNotFoundError(f"Gene-effect CSV not found: {args.gene_effect_path}")
+    benchmark_root = args.output_root.resolve()
+    benchmark_root.mkdir(parents=True, exist_ok=True)
+    all_timings = load_existing_timings(benchmark_root)
+    completed = completed_runs(all_timings) if args.skip_existing else set()
+    print(f"Benchmark output: {benchmark_root}")
+    print(
+        "Settings: "
+        f"corr_function={args.corr_function}, "
+        f"n_jobs={args.n_jobs}, chunk_size={args.chunk_size}, "
+        f"max_nbytes={args.max_nbytes}"
+    )
+    for gold_standard in selected_gold_standards:
+        for repetition in range(args.start_repetition, end_repetition + 1):
+            key = (gold_standard, repetition)
+            if key in completed:
+                print(f"Skipping existing {gold_standard} run {repetition:02d}")
+                continue
+            print(f"Running {gold_standard} run {repetition:02d}")
+            run_timings = run_repetition(
+                gold_standard=gold_standard,
+                repetition=repetition,
+                benchmark_root=benchmark_root,
+                gene_effect_path=args.gene_effect_path,
+                corr_function=args.corr_function,
+                n_jobs=args.n_jobs,
+                chunk_size=args.chunk_size,
+                max_nbytes=args.max_nbytes,
+            )
+            # Drop records loaded from an earlier run of this same pair so the
+            # reports do not double-count it or keep stale values.
+            all_timings = [
+                row
+                for row in all_timings
+                if (str(row["gold_standard"]), int(row["repetition"])) != key
+            ]
+            all_timings.extend(run_timings)
+            write_reports(all_timings, benchmark_root)
+    # Final write covers the case where every repetition was skipped.
+    write_reports(all_timings, benchmark_root)
+    print("Benchmark complete.")
+# Run the benchmark only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()

pythonflex 0.3.4__py3-none-any.whl → 0.4__py3-none-any.whl

pythonflex 0.3.4py3-none-any.whl → 0.4py3-none-any.whl