PyPI - pythonflex - Versions diffs - 0.3.4__py3-none-any.whl → 0.4__py3-none-any.whl - Mend

pythonflex 0.3.4-py3-none-any.whl → 0.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

pythonflex/__init__.py +28 -4
pythonflex/analysis.py +287 -579
pythonflex/examples/basic_usage.py +38 -30
pythonflex/examples/manuscript.py +37 -43
pythonflex/examples/runtime/runtime_benchmark.py +218 -0
pythonflex/examples/runtime/runtime_benchmark_10_runs_memmap.py +534 -0
pythonflex/examples/runtime/runtime_benchmark_corum_njobs.py +245 -0
pythonflex/examples/runtime/runtime_benchmark_gobp_njobs_chunks.py +319 -0
pythonflex/examples/runtime/runtime_benchmark_gobp_optimization.py +417 -0
pythonflex/examples/runtime/runtime_benchmark_repeated.py +347 -0
pythonflex/old_functions.py +422 -0
pythonflex/plotting.py +655 -242
pythonflex/preprocessing.py +54 -216
pythonflex/utils.py +36 -9
{pythonflex-0.3.4.dist-info → pythonflex-0.4.dist-info}/METADATA +8 -6
pythonflex-0.4.dist-info/RECORD +32 -0
{pythonflex-0.3.4.dist-info → pythonflex-0.4.dist-info}/WHEEL +1 -1
pythonflex-0.4.dist-info/licenses/LICENSE +7 -0
pythonflex-0.3.4.dist-info/RECORD +0 -24
{pythonflex-0.3.4.dist-info → pythonflex-0.4.dist-info}/entry_points.txt +0 -0

pythonflex/examples/basic_usage.py CHANGED Viewed

@@ -7,33 +7,32 @@ import pythonflex as flex
 inputs = {
     "Melanoma (63 Screens)": {
-        "path": flex.get_example_data_path("melanoma_cell_lines_500_genes.csv"),
+        "path": flex.get_example_data_path("melanoma_cell_lines_500_genes.csv"),
         "sort": "high",
-        "color": "#FF0000"
+        "color": "#4E79A7",
     },
     "Liver (24 Screens)": {
-        "path": flex.get_example_data_path("liver_cell_lines_500_genes.csv"),
+        "path": flex.get_example_data_path("liver_cell_lines_500_genes.csv"),
         "sort": "high",
-        "color": "#FFDD00"
+        "color": "#F28E2B",
     },
     "Neuroblastoma (37 Screens)": {
-        "path": flex.get_example_data_path("neuroblastoma_cell_lines_500_genes.csv"),
+        "path": flex.get_example_data_path("neuroblastoma_cell_lines_500_genes.csv"),
         "sort": "high",
-        "color": "#FFDDDD"
+        "color": "#59A14F",
     },
 }
 default_config = {
-    "min_genes_in_complex": 0,
-    "min_genes_per_complex_analysis": 3,
-    "output_folder": "CORUM",
-    "gold_standard": "CORUM",
-    "color_map": "BuGn",
+    "min_genes_in_complex": 2,
+    "min_genes_per_complex_analysis": 2,
+    "output_folder": "output_test",
+    "gold_standard": "GOBP",
+    "color_map": "RdYlBu",
     "jaccard": True,
-    "jaccard_threshold": 1,
-    "use_common_genes": False,  # Set to False for individual dataset-gold standard intersections
+    "analysis_genes": "shared",  # or "dataset_specific" (genes present per dataset)
     "plotting": {
         "save_plot": True,
         "output_type": "png",
@@ -42,9 +41,14 @@ default_config = {
         "fill_na": True,
         "normalize": False,
     },
-    "corr_function": "numpy",
+    "corr_function": "numpy_without_mask",
+    "per_complex": {
+        "n_jobs": 8,
+        "chunk_size": 400,
+        "max_nbytes": "100M",
+    },
     "logging": {
-        "visible_levels": ["DONE"]
+        "visible_levels": ["DONE", "INFO", "WARNING"]
         # "PROGRESS", "STARTED", ,"INFO","WARNING"
     }
 }
@@ -58,30 +62,34 @@ terms, genes_in_terms = flex.load_gold_standard()
 # Run analysis
 for name, dataset in data.items():
-    pra = flex.pra(name, dataset, is_corr=False)
-    fpc = flex.pra_percomplex(name, dataset, is_corr=False)
+    # Calculate correlation once and reuse it for global and per-complex PRA.
+    corr = flex.perform_corr(dataset, default_config["corr_function"])
+    pra = flex.pra(name, corr, is_corr=True)
+    fpc = flex.pra_percomplex(name, corr, is_corr=True)
     cc = flex.complex_contributions(name)
-    flex.mpr_prepare(name)
+    # Optional mPR analysis. This can be slow on large datasets.
+    # flex.mpr_prepare(name)
 #%%
 # Generate plots
-# flex.plot_precision_recall_curve()
-# flex.plot_auc_scores()
-# flex.plot_significant_complexes()
-# flex.plot_percomplex_scatter(n_top=20)
-# flex.plot_percomplex_scatter_bysize()
-# flex.plot_complex_contributions()
-# flex.plot_mpr_tp_multi(show_filters="all")
-# flex.plot_mpr_complexes_multi(show_filters="all")
-# flex.plot_mpr_complexes_auc_scores("all")
+flex.plot_precision_recall_curve()
+flex.plot_auc_scores()
+flex.plot_significant_complexes()
+#%%
+flex.plot_percomplex_scatter(n_top=10)
+flex.plot_percomplex_scatter_bysize(n_top=10)
+#flex.plot_complex_contributions()
+# Optional mPR summary plot. Requires flex.mpr_prepare(name) above.
+# flex.plot_mpr_summary(variants="unfiltered")
 #%%
 # Save results to CSV
 # flex.save_results_to_csv()
+# How many CPU cores are available?
+import multiprocessing
+print(f"Number of CPU cores available: {multiprocessing.cpu_count()}")

pythonflex/examples/manuscript.py CHANGED Viewed

@@ -12,54 +12,43 @@ skin = pd.read_csv('C:/Users/yd/Desktop/projects/_datasets/depmap/25Q2/subset/sk
 soft = pd.read_csv('C:/Users/yd/Desktop/projects/_datasets/depmap/25Q2/subset/soft_tissue_cell_lines.csv', index_col=0)
 cholesky = pd.read_csv('C:/Users/yd/Desktop/projects/_datasets/depmap/25Q2/25Q2_chronos_whitened_Cholesky.csv', index_col=0).T
-# inputs = {
-#     "All Screens": {
-#         "path": gene_effect,
-#         "sort": "high",
-#         "color": "#000000"
-#     },
-#     "Skin": {
-#         "path": skin,
-#         "sort": "high",
-#         "color": "#FF0000"
-#     },
-#     "Soft Tissue": {
-#         "path": soft,
-#         "sort": "high",
-#         "color": "#FFFF00"
-#     },
-# }
 inputs = {
-    "DM All Screens": {
+    "depmap": {
         "path": gene_effect,
         "sort": "high",
         "color": "#000000"
     },
-    "DM Cholesky Whitening": {
-        "path": cholesky,
+    # "cholesky": {
+    #     "path": cholesky,
+    #     "sort": "high",
+    #     "color": "#000CF4"
+    # },
+    "Skin": {
+        "path": skin,
         "sort": "high",
-        "color": "#FF0000"
+        "color": "#2E7D32"
+    },
+    "Soft Tissue": {
+        "path": soft,
+        "sort": "high",
+        "color": "#7A8B00"
     },
 }
 default_config = {
     "min_genes_in_complex": 2,
-    "min_genes_per_complex_analysis": 3,
-    "output_folder": "CORUM_DMvsCholesky",
+    "min_genes_per_complex_analysis": 2,
+    "output_folder": "for_paper_output_01062026_CORUM_dm_skin_soft",
     "gold_standard": "CORUM",
-    "color_map": "BuGn",
-    "jaccard": False,
-        "jaccard_threshold": 1.0,
-    "use_common_genes": False,  # Set to False for individual dataset-gold standard intersections
+    "color_map": "RdYlBu",
+    "jaccard": True,
+    "analysis_genes": "shared",  # or "dataset_specific" (genes present per dataset)
     "plotting": {
         "save_plot": True,
         "output_type": "pdf",
@@ -69,8 +58,13 @@ default_config = {
         "normalize": False,
     },
     "corr_function": "numpy",
+    "per_complex": {
+        "n_jobs": 8,
+        "chunk_size": 400,
+        "max_nbytes": "100M",
+    },
     "logging": {
-        "visible_levels": ["DONE"]
+        "visible_levels": ["DONE", "INFO", "WARNING"]
         # "PROGRESS", "STARTED", ,"INFO","WARNING"
     }
 }
@@ -84,12 +78,14 @@ terms, genes_in_terms = flex.load_gold_standard()
 # Run analysis
 for name, dataset in data.items():
-    pra = flex.pra(name, dataset, is_corr=False)
-    fpc = flex.pra_percomplex(name, dataset, is_corr=False)
+    # Calculate correlation once and reuse it for global and per-complex PRA.
+    corr = flex.perform_corr(dataset, default_config["corr_function"])
+    pra = flex.pra(name, corr, is_corr=True)
+    fpc = flex.pra_percomplex(name, corr, is_corr=True)
     cc = flex.complex_contributions(name)
-    flex.mpr_prepare(name)
+    # Optional mPR analysis. This can be slow on large datasets.
+    flex.mpr_prepare(name)
 #%%
@@ -100,13 +96,11 @@ flex.plot_significant_complexes()
 flex.plot_percomplex_scatter(n_top=20)
 flex.plot_percomplex_scatter_bysize()
 flex.plot_complex_contributions()
-##
-#%%
-flex.plot_mpr_tp_multi(show_filters="all")
-flex.plot_mpr_complexes_multi(show_filters="all")
+# Optional mPR summary plot. Requires flex.mpr_prepare(name) above.
+flex.plot_mpr_summary(variants="unfiltered")
 # Save results to CSV
 flex.save_results_to_csv()
-# %%
 # %%

pythonflex/examples/runtime/runtime_benchmark.py ADDED Viewed

@@ -0,0 +1,218 @@
+"""Benchmark the manuscript workflow against each bundled gold standard.
+Run from any directory with:
+    python path/to/src/pythonflex/examples/runtime/runtime_benchmark.py
+"""
+from __future__ import annotations
+import os
+import sys
+from pathlib import Path
+from time import perf_counter
+from typing import Any, Callable
+import pandas as pd
+# Directory three levels above the folder that contains this file.
+# NOTE(review): this script is packaged as
+# pythonflex/examples/runtime/runtime_benchmark.py, so parents[3] is the
+# directory that holds the ``pythonflex`` package itself (e.g. ``src``), not
+# that directory's parent. SRC_ROOT would then be ``<that dir>/src`` — confirm
+# whether parents[4] was intended after the file moved into ``runtime/``.
+PROJECT_ROOT = Path(__file__).resolve().parents[3]
+SRC_ROOT = PROJECT_ROOT / "src"
+# Put SRC_ROOT at the front of sys.path (once) before pythonflex is imported
+# below; presumably so the in-repo sources win over an installed copy.
+if str(SRC_ROOT) not in sys.path:
+    sys.path.insert(0, str(SRC_ROOT))
+# Plot generation is benchmarked and saved without opening interactive windows.
+# setdefault keeps any MPLBACKEND value already present in the environment.
+os.environ.setdefault("MPLBACKEND", "Agg")
+import pythonflex as flex  # noqa: E402
+# Machine-specific absolute path to the input CSV; main() raises
+# FileNotFoundError when it does not exist.
+GENE_EFFECT_PATH = Path(
+    "C:/Users/yd/Desktop/projects/_datasets/depmap/25Q2/gene_effect.csv"
+)
+# Root folder for benchmark output: one sub-folder per gold standard plus the
+# combined results CSV written by main().
+BENCHMARK_ROOT = PROJECT_ROOT / "output" / "runtime_benchmark"
+# Gold standards benchmarked by main(), in this order; each value is used as
+# the "gold_standard" config entry and as the output sub-folder name.
+GOLD_STANDARDS = ("CORUM", "PATHWAY", "GOBP")
+def build_config(gold_standard: str, output_folder: Path) -> dict[str, Any]:
+    """Build the configuration dictionary for one benchmark run.
+
+    Args:
+        gold_standard: Value stored under the "gold_standard" key.
+        output_folder: Output location; stored as a string under
+            "output_folder".
+
+    Returns:
+        A new dictionary on every call. Apart from the two arguments, all
+        values are fixed literals. ``run_gold_standard`` passes the result to
+        ``flex.initialize``.
+    """
+    return {
+        "min_genes_in_complex": 2,
+        "min_genes_per_complex_analysis": 3,
+        "output_folder": str(output_folder),
+        "gold_standard": gold_standard,
+        "color_map": "RdYlBu",
+        "jaccard": True,
+        # NOTE(review): the bundled example scripts set this key to "shared"
+        # (documenting "dataset_specific" as the alternative) — confirm that
+        # "common" is an accepted value.
+        "analysis_genes": "common",
+        "plotting": {
+            # Plots are written to disk but not displayed.
+            "save_plot": True,
+            "show_plot": False,
+            "output_type": "png",
+        },
+        "preprocessing": {
+            "fill_na": True,
+            "normalize": False,
+        },
+        "corr_function": "numpy",
+        "logging": {
+            "visible_levels": ["DONE", "INFO", "WARNING"],
+        },
+    }
+def timed_call(
+    timings: list[dict[str, Any]],
+    gold_standard: str,
+    step: str,
+    operation: Callable[[], Any],
+) -> Any:
+    """Run ``operation`` and record how long it took.
+
+    Args:
+        timings: List that receives one new record; mutated in place.
+        gold_standard: Label stored in the record's "gold_standard" field.
+        step: Label stored in the record's "step" field.
+        operation: Zero-argument callable to execute and time.
+
+    Returns:
+        Whatever ``operation`` returns.
+
+    Any exception raised by ``operation`` propagates unchanged, and no record
+    is appended for that step.
+    """
+    start = perf_counter()
+    result = operation()
+    timings.append(
+        {
+            "gold_standard": gold_standard,
+            "step": step,
+            # Elapsed perf_counter() time; evaluated while the record is built,
+            # i.e. immediately after ``operation`` returned.
+            "seconds": perf_counter() - start,
+        }
+    )
+    return result
+def run_gold_standard(gold_standard: str) -> list[dict[str, Any]]:
+    """Run the full workflow for one gold standard and time every step.
+
+    Each step goes through ``timed_call``, so one record per step is collected.
+    The steps run in this order: initialize, read the gene-effect CSV, load
+    datasets, load the gold standard, ``pra``, ``pra_percomplex``,
+    ``complex_contributions``, ``mpr_prepare``, then six plotting calls.
+    A final "total_runtime" record covers the whole sequence.
+
+    Args:
+        gold_standard: Gold-standard name; used in the configuration, as the
+            output sub-folder name under ``BENCHMARK_ROOT`` and as the label
+            of every timing record.
+
+    Returns:
+        The list of timing records, each a dict with the keys
+        "gold_standard", "step" and "seconds".
+
+    Side effects:
+        Writes ``benchmark_results.csv`` into the gold standard's output
+        folder, in addition to whatever the ``flex`` calls write themselves.
+    """
+    output_folder = BENCHMARK_ROOT / gold_standard
+    timings: list[dict[str, Any]] = []
+    # Started before the first step, so "total_runtime" also includes the
+    # untimed work between steps.
+    workflow_start = perf_counter()
+    timed_call(
+        timings,
+        gold_standard,
+        "initialize",
+        lambda: flex.initialize(build_config(gold_standard, output_folder)),
+    )
+    gene_effect = timed_call(
+        timings,
+        gold_standard,
+        "read_gene_effect",
+        lambda: pd.read_csv(GENE_EFFECT_PATH, index_col=0),
+    )
+    # Single input dataset; the already-loaded DataFrame is supplied under the
+    # "path" key.
+    inputs = {
+        "All screens": {
+            "path": gene_effect,
+            "sort": "high",
+            "color": "#000000",
+        },
+    }
+    # load_datasets returns two values; only the first is used here.
+    data, _ = timed_call(
+        timings,
+        gold_standard,
+        "load_datasets",
+        lambda: flex.load_datasets(inputs),
+    )
+    timed_call(
+        timings,
+        gold_standard,
+        "load_gold_standard",
+        flex.load_gold_standard,
+    )
+    # Take the first entry of ``data`` (only one dataset was supplied above).
+    name, dataset = next(iter(data.items()))
+    # NOTE(review): both PRA steps receive the dataset with is_corr=False,
+    # whereas the bundled example scripts compute the correlation once with
+    # flex.perform_corr and pass is_corr=True — presumably each call here
+    # recomputes it, which would be part of what is measured; confirm intent.
+    timed_call(
+        timings,
+        gold_standard,
+        "pra",
+        lambda: flex.pra(name, dataset, is_corr=False),
+    )
+    timed_call(
+        timings,
+        gold_standard,
+        "pra_percomplex",
+        lambda: flex.pra_percomplex(name, dataset, is_corr=False),
+    )
+    timed_call(
+        timings,
+        gold_standard,
+        "complex_contributions",
+        lambda: flex.complex_contributions(name),
+    )
+    timed_call(
+        timings,
+        gold_standard,
+        "mpr_prepare",
+        lambda: flex.mpr_prepare(name),
+    )
+    # Plotting steps.
+    timed_call(
+        timings,
+        gold_standard,
+        "plot_precision_recall_curve",
+        flex.plot_precision_recall_curve,
+    )
+    timed_call(timings, gold_standard, "plot_auc_scores", flex.plot_auc_scores)
+    timed_call(
+        timings,
+        gold_standard,
+        "plot_significant_complexes",
+        flex.plot_significant_complexes,
+    )
+    timed_call(
+        timings,
+        gold_standard,
+        "plot_percomplex_scatter",
+        lambda: flex.plot_percomplex_scatter(n_top=20),
+    )
+    timed_call(
+        timings,
+        gold_standard,
+        "plot_percomplex_scatter_bysize",
+        flex.plot_percomplex_scatter_bysize,
+    )
+    timed_call(
+        timings,
+        gold_standard,
+        "plot_complex_contributions",
+        flex.plot_complex_contributions,
+    )
+    # Recorded before the results CSV is written, so the write below is not
+    # part of the reported total.
+    timings.append(
+        {
+            "gold_standard": gold_standard,
+            "step": "total_runtime",
+            "seconds": perf_counter() - workflow_start,
+        }
+    )
+    # Ensure the folder exists before writing, whether or not an earlier step
+    # already created it.
+    output_folder.mkdir(parents=True, exist_ok=True)
+    pd.DataFrame(timings).to_csv(
+        output_folder / "benchmark_results.csv",
+        index=False,
+    )
+    return timings
+def main() -> None:
+    """Benchmark every gold standard and write the combined timing table.
+
+    Runs ``run_gold_standard`` for each entry of ``GOLD_STANDARDS``, appends a
+    "grand_total_runtime" record, saves all records to
+    ``benchmark_results_all_gold_standards.csv`` under ``BENCHMARK_ROOT`` and
+    prints the output path and the grand total.
+
+    Raises:
+        FileNotFoundError: If ``GENE_EFFECT_PATH`` does not exist; checked
+            before any benchmark work starts.
+    """
+    if not GENE_EFFECT_PATH.exists():
+        raise FileNotFoundError(f"Input dataset was not found: {GENE_EFFECT_PATH}")
+    all_timings: list[dict[str, Any]] = []
+    for gold_standard in GOLD_STANDARDS:
+        print(f"Running runtime benchmark: {gold_standard}")
+        all_timings.extend(run_gold_standard(gold_standard))
+    # Sum only the per-gold-standard "total_runtime" records so individual
+    # steps are not counted twice.
+    total_seconds = sum(
+        timing["seconds"]
+        for timing in all_timings
+        if timing["step"] == "total_runtime"
+    )
+    all_timings.append(
+        {
+            "gold_standard": "ALL",
+            "step": "grand_total_runtime",
+            "seconds": total_seconds,
+        }
+    )
+    BENCHMARK_ROOT.mkdir(parents=True, exist_ok=True)
+    combined_path = BENCHMARK_ROOT / "benchmark_results_all_gold_standards.csv"
+    pd.DataFrame(all_timings).to_csv(combined_path, index=False)
+    print(f"Benchmark results saved to: {combined_path}")
+    print(f"Grand total workflow runtime: {total_seconds:.3f} seconds")
+# Run the benchmark only when this file is executed as a script.
+if __name__ == "__main__":
+    main()

pythonflex 0.3.4__py3-none-any.whl → 0.4__py3-none-any.whl

pythonflex 0.3.4-py3-none-any.whl → 0.4-py3-none-any.whl