pythonflex 0.3.4__py3-none-any.whl → 0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,33 +7,32 @@ import pythonflex as flex
7
7
 
8
8
  inputs = {
9
9
  "Melanoma (63 Screens)": {
10
- "path": flex.get_example_data_path("melanoma_cell_lines_500_genes.csv"),
10
+ "path": flex.get_example_data_path("melanoma_cell_lines_500_genes.csv"),
11
11
  "sort": "high",
12
- "color": "#FF0000"
12
+ "color": "#4E79A7",
13
13
  },
14
14
  "Liver (24 Screens)": {
15
- "path": flex.get_example_data_path("liver_cell_lines_500_genes.csv"),
15
+ "path": flex.get_example_data_path("liver_cell_lines_500_genes.csv"),
16
16
  "sort": "high",
17
- "color": "#FFDD00"
17
+ "color": "#F28E2B",
18
18
  },
19
19
  "Neuroblastoma (37 Screens)": {
20
- "path": flex.get_example_data_path("neuroblastoma_cell_lines_500_genes.csv"),
20
+ "path": flex.get_example_data_path("neuroblastoma_cell_lines_500_genes.csv"),
21
21
  "sort": "high",
22
- "color": "#FFDDDD"
22
+ "color": "#59A14F",
23
23
  },
24
24
  }
25
25
 
26
26
 
27
27
 
28
28
  default_config = {
29
- "min_genes_in_complex": 0,
30
- "min_genes_per_complex_analysis": 3,
31
- "output_folder": "CORUM",
32
- "gold_standard": "CORUM",
33
- "color_map": "BuGn",
29
+ "min_genes_in_complex": 2,
30
+ "min_genes_per_complex_analysis": 2,
31
+ "output_folder": "output_test",
32
+ "gold_standard": "GOBP",
33
+ "color_map": "RdYlBu",
34
34
  "jaccard": True,
35
- "jaccard_threshold": 1,
36
- "use_common_genes": False, # Set to False for individual dataset-gold standard intersections
35
+ "analysis_genes": "shared", # or "dataset_specific" (genes present per dataset)
37
36
  "plotting": {
38
37
  "save_plot": True,
39
38
  "output_type": "png",
@@ -42,9 +41,14 @@ default_config = {
42
41
  "fill_na": True,
43
42
  "normalize": False,
44
43
  },
45
- "corr_function": "numpy",
44
+ "corr_function": "numpy_without_mask",
45
+ "per_complex": {
46
+ "n_jobs": 8,
47
+ "chunk_size": 400,
48
+ "max_nbytes": "100M",
49
+ },
46
50
  "logging": {
47
- "visible_levels": ["DONE"]
51
+ "visible_levels": ["DONE", "INFO", "WARNING"]
48
52
  # "PROGRESS", "STARTED", "INFO", "WARNING"
49
53
  }
50
54
  }
@@ -58,30 +62,34 @@ terms, genes_in_terms = flex.load_gold_standard()
58
62
 
59
63
  # Run analysis
60
64
  for name, dataset in data.items():
61
- pra = flex.pra(name, dataset, is_corr=False)
62
- fpc = flex.pra_percomplex(name, dataset, is_corr=False)
65
+ # Calculate correlation once and reuse it for global and per-complex PRA.
66
+ corr = flex.perform_corr(dataset, default_config["corr_function"])
67
+ pra = flex.pra(name, corr, is_corr=True)
68
+ fpc = flex.pra_percomplex(name, corr, is_corr=True)
63
69
  cc = flex.complex_contributions(name)
64
- flex.mpr_prepare(name)
65
-
66
-
67
70
 
71
+ # Optional mPR analysis. This can be slow on large datasets.
72
+ # flex.mpr_prepare(name)
73
+
68
74
 
69
75
 
70
76
  #%%
71
77
  # Generate plots
72
- # flex.plot_precision_recall_curve()
73
- # flex.plot_auc_scores()
74
- # flex.plot_significant_complexes()
75
- # flex.plot_percomplex_scatter(n_top=20)
76
- # flex.plot_percomplex_scatter_bysize()
77
- # flex.plot_complex_contributions()
78
- # flex.plot_mpr_tp_multi(show_filters="all")
79
- # flex.plot_mpr_complexes_multi(show_filters="all")
80
- # flex.plot_mpr_complexes_auc_scores("all")
81
-
78
+ flex.plot_precision_recall_curve()
79
+ flex.plot_auc_scores()
80
+ flex.plot_significant_complexes()
81
+ #%%
82
+ flex.plot_percomplex_scatter(n_top=10)
83
+ flex.plot_percomplex_scatter_bysize(n_top=10)
84
+ #flex.plot_complex_contributions()
82
85
 
86
+ # Optional mPR summary plot. Requires flex.mpr_prepare(name) above.
87
+ # flex.plot_mpr_summary(variants="unfiltered")
83
88
 
84
89
  #%%
85
90
  # Save results to CSV
86
91
  # flex.save_results_to_csv()
87
92
 
93
+ # How many CPU cores are available?
94
+ import multiprocessing
95
+ print(f"Number of CPU cores available: {multiprocessing.cpu_count()}")
@@ -12,54 +12,43 @@ skin = pd.read_csv('C:/Users/yd/Desktop/projects/_datasets/depmap/25Q2/subset/sk
12
12
 
13
13
  soft = pd.read_csv('C:/Users/yd/Desktop/projects/_datasets/depmap/25Q2/subset/soft_tissue_cell_lines.csv', index_col=0)
14
14
 
15
-
16
15
  cholesky = pd.read_csv('C:/Users/yd/Desktop/projects/_datasets/depmap/25Q2/25Q2_chronos_whitened_Cholesky.csv', index_col=0).T
17
16
 
18
- # inputs = {
19
- # "All Screens": {
20
- # "path": gene_effect,
21
- # "sort": "high",
22
- # "color": "#000000"
23
- # },
24
- # "Skin": {
25
- # "path": skin,
26
- # "sort": "high",
27
- # "color": "#FF0000"
28
- # },
29
- # "Soft Tissue": {
30
- # "path": soft,
31
- # "sort": "high",
32
- # "color": "#FFFF00"
33
- # },
34
- # }
35
-
36
-
37
17
  inputs = {
38
- "DM All Screens": {
18
+ "depmap": {
39
19
  "path": gene_effect,
40
20
  "sort": "high",
41
21
  "color": "#000000"
42
22
  },
43
- "DM Cholesky Whitening": {
44
- "path": cholesky,
23
+ # "cholesky": {
24
+ # "path": cholesky,
25
+ # "sort": "high",
26
+ # "color": "#000CF4"
27
+ # },
28
+ "Skin": {
29
+ "path": skin,
45
30
  "sort": "high",
46
- "color": "#FF0000"
31
+ "color": "#2E7D32"
32
+ },
33
+ "Soft Tissue": {
34
+ "path": soft,
35
+ "sort": "high",
36
+ "color": "#7A8B00"
47
37
  },
48
-
49
38
  }
50
39
 
51
40
 
52
41
 
53
42
 
43
+
54
44
  default_config = {
55
45
  "min_genes_in_complex": 2,
56
- "min_genes_per_complex_analysis": 3,
57
- "output_folder": "CORUM_DMvsCholesky",
46
+ "min_genes_per_complex_analysis": 2,
47
+ "output_folder": "for_paper_output_01062026_CORUM_dm_skin_soft",
58
48
  "gold_standard": "CORUM",
59
- "color_map": "BuGn",
60
- "jaccard": False,
61
- "jaccard_threshold": 1.0,
62
- "use_common_genes": False, # Set to False for individual dataset-gold standard intersections
49
+ "color_map": "RdYlBu",
50
+ "jaccard": True,
51
+ "analysis_genes": "shared", # or "dataset_specific" (genes present per dataset)
63
52
  "plotting": {
64
53
  "save_plot": True,
65
54
  "output_type": "pdf",
@@ -69,8 +58,13 @@ default_config = {
69
58
  "normalize": False,
70
59
  },
71
60
  "corr_function": "numpy",
61
+ "per_complex": {
62
+ "n_jobs": 8,
63
+ "chunk_size": 400,
64
+ "max_nbytes": "100M",
65
+ },
72
66
  "logging": {
73
- "visible_levels": ["DONE"]
67
+ "visible_levels": ["DONE", "INFO", "WARNING"]
74
68
  # "PROGRESS", "STARTED", "INFO", "WARNING"
75
69
  }
76
70
  }
@@ -84,12 +78,14 @@ terms, genes_in_terms = flex.load_gold_standard()
84
78
 
85
79
  # Run analysis
86
80
  for name, dataset in data.items():
87
- pra = flex.pra(name, dataset, is_corr=False)
88
- fpc = flex.pra_percomplex(name, dataset, is_corr=False)
81
+ # Calculate correlation once and reuse it for global and per-complex PRA.
82
+ corr = flex.perform_corr(dataset, default_config["corr_function"])
83
+ pra = flex.pra(name, corr, is_corr=True)
84
+ fpc = flex.pra_percomplex(name, corr, is_corr=True)
89
85
  cc = flex.complex_contributions(name)
90
- flex.mpr_prepare(name)
91
-
92
-
86
+ # Optional mPR analysis. This can be slow on large datasets.
87
+ flex.mpr_prepare(name)
88
+
93
89
 
94
90
 
95
91
  #%%
@@ -100,13 +96,11 @@ flex.plot_significant_complexes()
100
96
  flex.plot_percomplex_scatter(n_top=20)
101
97
  flex.plot_percomplex_scatter_bysize()
102
98
  flex.plot_complex_contributions()
103
- ##
104
- #%%
105
- flex.plot_mpr_tp_multi(show_filters="all")
106
- flex.plot_mpr_complexes_multi(show_filters="all")
107
-
99
+ # Optional mPR summary plot. Requires flex.mpr_prepare(name) above.
100
+ flex.plot_mpr_summary(variants="unfiltered")
108
101
  # Save results to CSV
109
102
  flex.save_results_to_csv()
110
103
 
111
- # %%
104
+
105
+
112
106
  # %%
@@ -0,0 +1,218 @@
1
+ """Benchmark the manuscript workflow against each bundled gold standard.
2
+
3
+ Run from any directory with:
4
+ python path/to/src/pythonflex/examples/runtime_benchmark.py
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import os
10
+ import sys
11
+ from pathlib import Path
12
+ from time import perf_counter
13
+ from typing import Any, Callable
14
+
15
+ import pandas as pd
16
+
17
+
18
+ PROJECT_ROOT = Path(__file__).resolve().parents[3]
19
+ SRC_ROOT = PROJECT_ROOT / "src"
20
+ if str(SRC_ROOT) not in sys.path:
21
+ sys.path.insert(0, str(SRC_ROOT))
22
+
23
+ # Plot generation is benchmarked and saved without opening interactive windows.
24
+ os.environ.setdefault("MPLBACKEND", "Agg")
25
+
26
+ import pythonflex as flex # noqa: E402
27
+
28
+
29
+ GENE_EFFECT_PATH = Path(
30
+ "C:/Users/yd/Desktop/projects/_datasets/depmap/25Q2/gene_effect.csv"
31
+ )
32
+ BENCHMARK_ROOT = PROJECT_ROOT / "output" / "runtime_benchmark"
33
+ GOLD_STANDARDS = ("CORUM", "PATHWAY", "GOBP")
34
+
35
+
36
+ def build_config(gold_standard: str, output_folder: Path) -> dict[str, Any]:
37
+ return {
38
+ "min_genes_in_complex": 2,
39
+ "min_genes_per_complex_analysis": 3,
40
+ "output_folder": str(output_folder),
41
+ "gold_standard": gold_standard,
42
+ "color_map": "RdYlBu",
43
+ "jaccard": True,
44
+ "analysis_genes": "common",
45
+ "plotting": {
46
+ "save_plot": True,
47
+ "show_plot": False,
48
+ "output_type": "png",
49
+ },
50
+ "preprocessing": {
51
+ "fill_na": True,
52
+ "normalize": False,
53
+ },
54
+ "corr_function": "numpy",
55
+ "logging": {
56
+ "visible_levels": ["DONE", "INFO", "WARNING"],
57
+ },
58
+ }
59
+
60
+
61
+ def timed_call(
62
+ timings: list[dict[str, Any]],
63
+ gold_standard: str,
64
+ step: str,
65
+ operation: Callable[[], Any],
66
+ ) -> Any:
67
+ start = perf_counter()
68
+ result = operation()
69
+ timings.append(
70
+ {
71
+ "gold_standard": gold_standard,
72
+ "step": step,
73
+ "seconds": perf_counter() - start,
74
+ }
75
+ )
76
+ return result
77
+
78
+
79
+ def run_gold_standard(gold_standard: str) -> list[dict[str, Any]]:
80
+ output_folder = BENCHMARK_ROOT / gold_standard
81
+ timings: list[dict[str, Any]] = []
82
+ workflow_start = perf_counter()
83
+
84
+ timed_call(
85
+ timings,
86
+ gold_standard,
87
+ "initialize",
88
+ lambda: flex.initialize(build_config(gold_standard, output_folder)),
89
+ )
90
+ gene_effect = timed_call(
91
+ timings,
92
+ gold_standard,
93
+ "read_gene_effect",
94
+ lambda: pd.read_csv(GENE_EFFECT_PATH, index_col=0),
95
+ )
96
+ inputs = {
97
+ "All screens": {
98
+ "path": gene_effect,
99
+ "sort": "high",
100
+ "color": "#000000",
101
+ },
102
+ }
103
+ data, _ = timed_call(
104
+ timings,
105
+ gold_standard,
106
+ "load_datasets",
107
+ lambda: flex.load_datasets(inputs),
108
+ )
109
+ timed_call(
110
+ timings,
111
+ gold_standard,
112
+ "load_gold_standard",
113
+ flex.load_gold_standard,
114
+ )
115
+
116
+ name, dataset = next(iter(data.items()))
117
+ timed_call(
118
+ timings,
119
+ gold_standard,
120
+ "pra",
121
+ lambda: flex.pra(name, dataset, is_corr=False),
122
+ )
123
+ timed_call(
124
+ timings,
125
+ gold_standard,
126
+ "pra_percomplex",
127
+ lambda: flex.pra_percomplex(name, dataset, is_corr=False),
128
+ )
129
+ timed_call(
130
+ timings,
131
+ gold_standard,
132
+ "complex_contributions",
133
+ lambda: flex.complex_contributions(name),
134
+ )
135
+ timed_call(
136
+ timings,
137
+ gold_standard,
138
+ "mpr_prepare",
139
+ lambda: flex.mpr_prepare(name),
140
+ )
141
+
142
+ timed_call(
143
+ timings,
144
+ gold_standard,
145
+ "plot_precision_recall_curve",
146
+ flex.plot_precision_recall_curve,
147
+ )
148
+ timed_call(timings, gold_standard, "plot_auc_scores", flex.plot_auc_scores)
149
+ timed_call(
150
+ timings,
151
+ gold_standard,
152
+ "plot_significant_complexes",
153
+ flex.plot_significant_complexes,
154
+ )
155
+ timed_call(
156
+ timings,
157
+ gold_standard,
158
+ "plot_percomplex_scatter",
159
+ lambda: flex.plot_percomplex_scatter(n_top=20),
160
+ )
161
+ timed_call(
162
+ timings,
163
+ gold_standard,
164
+ "plot_percomplex_scatter_bysize",
165
+ flex.plot_percomplex_scatter_bysize,
166
+ )
167
+ timed_call(
168
+ timings,
169
+ gold_standard,
170
+ "plot_complex_contributions",
171
+ flex.plot_complex_contributions,
172
+ )
173
+
174
+ timings.append(
175
+ {
176
+ "gold_standard": gold_standard,
177
+ "step": "total_runtime",
178
+ "seconds": perf_counter() - workflow_start,
179
+ }
180
+ )
181
+ output_folder.mkdir(parents=True, exist_ok=True)
182
+ pd.DataFrame(timings).to_csv(
183
+ output_folder / "benchmark_results.csv",
184
+ index=False,
185
+ )
186
+ return timings
187
+
188
+
189
+ def main() -> None:
190
+ if not GENE_EFFECT_PATH.exists():
191
+ raise FileNotFoundError(f"Input dataset was not found: {GENE_EFFECT_PATH}")
192
+
193
+ all_timings: list[dict[str, Any]] = []
194
+ for gold_standard in GOLD_STANDARDS:
195
+ print(f"Running runtime benchmark: {gold_standard}")
196
+ all_timings.extend(run_gold_standard(gold_standard))
197
+
198
+ total_seconds = sum(
199
+ timing["seconds"]
200
+ for timing in all_timings
201
+ if timing["step"] == "total_runtime"
202
+ )
203
+ all_timings.append(
204
+ {
205
+ "gold_standard": "ALL",
206
+ "step": "grand_total_runtime",
207
+ "seconds": total_seconds,
208
+ }
209
+ )
210
+ BENCHMARK_ROOT.mkdir(parents=True, exist_ok=True)
211
+ combined_path = BENCHMARK_ROOT / "benchmark_results_all_gold_standards.csv"
212
+ pd.DataFrame(all_timings).to_csv(combined_path, index=False)
213
+ print(f"Benchmark results saved to: {combined_path}")
214
+ print(f"Grand total workflow runtime: {total_seconds:.3f} seconds")
215
+
216
+
217
+ if __name__ == "__main__":
218
+ main()