pythonflex 0.3.3__py3-none-any.whl → 0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pythonflex/__init__.py +28 -4
- pythonflex/analysis.py +287 -578
- pythonflex/examples/basic_usage.py +40 -32
- pythonflex/examples/manuscript.py +37 -42
- pythonflex/examples/runtime/runtime_benchmark.py +218 -0
- pythonflex/examples/runtime/runtime_benchmark_10_runs_memmap.py +534 -0
- pythonflex/examples/runtime/runtime_benchmark_corum_njobs.py +245 -0
- pythonflex/examples/runtime/runtime_benchmark_gobp_njobs_chunks.py +319 -0
- pythonflex/examples/runtime/runtime_benchmark_gobp_optimization.py +417 -0
- pythonflex/examples/runtime/runtime_benchmark_repeated.py +347 -0
- pythonflex/old_functions.py +422 -0
- pythonflex/plotting.py +655 -242
- pythonflex/preprocessing.py +62 -60
- pythonflex/utils.py +36 -9
- {pythonflex-0.3.3.dist-info → pythonflex-0.4.dist-info}/METADATA +9 -4
- pythonflex-0.4.dist-info/RECORD +32 -0
- {pythonflex-0.3.3.dist-info → pythonflex-0.4.dist-info}/WHEEL +1 -1
- pythonflex-0.4.dist-info/licenses/LICENSE +7 -0
- pythonflex-0.3.3.dist-info/RECORD +0 -24
- {pythonflex-0.3.3.dist-info → pythonflex-0.4.dist-info}/entry_points.txt +0 -0
|
@@ -7,32 +7,32 @@ import pythonflex as flex
|
|
|
7
7
|
|
|
8
8
|
inputs = {
|
|
9
9
|
"Melanoma (63 Screens)": {
|
|
10
|
-
"path": flex.get_example_data_path("melanoma_cell_lines_500_genes.csv"),
|
|
10
|
+
"path": flex.get_example_data_path("melanoma_cell_lines_500_genes.csv"),
|
|
11
11
|
"sort": "high",
|
|
12
|
-
"color": "#
|
|
12
|
+
"color": "#4E79A7",
|
|
13
13
|
},
|
|
14
14
|
"Liver (24 Screens)": {
|
|
15
|
-
"path": flex.get_example_data_path("liver_cell_lines_500_genes.csv"),
|
|
15
|
+
"path": flex.get_example_data_path("liver_cell_lines_500_genes.csv"),
|
|
16
16
|
"sort": "high",
|
|
17
|
-
"color": "#
|
|
17
|
+
"color": "#F28E2B",
|
|
18
18
|
},
|
|
19
19
|
"Neuroblastoma (37 Screens)": {
|
|
20
|
-
"path": flex.get_example_data_path("neuroblastoma_cell_lines_500_genes.csv"),
|
|
20
|
+
"path": flex.get_example_data_path("neuroblastoma_cell_lines_500_genes.csv"),
|
|
21
21
|
"sort": "high",
|
|
22
|
-
"color": "#
|
|
22
|
+
"color": "#59A14F",
|
|
23
23
|
},
|
|
24
24
|
}
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
|
|
28
28
|
default_config = {
|
|
29
|
-
"min_genes_in_complex":
|
|
30
|
-
"min_genes_per_complex_analysis":
|
|
31
|
-
"output_folder": "
|
|
32
|
-
"gold_standard": "
|
|
33
|
-
"color_map": "
|
|
34
|
-
"jaccard":
|
|
35
|
-
"
|
|
29
|
+
"min_genes_in_complex": 2,
|
|
30
|
+
"min_genes_per_complex_analysis": 2,
|
|
31
|
+
"output_folder": "output_test",
|
|
32
|
+
"gold_standard": "GOBP",
|
|
33
|
+
"color_map": "RdYlBu",
|
|
34
|
+
"jaccard": True,
|
|
35
|
+
"analysis_genes": "shared", # or "dataset_specific" (genes present per dataset)
|
|
36
36
|
"plotting": {
|
|
37
37
|
"save_plot": True,
|
|
38
38
|
"output_type": "png",
|
|
@@ -41,9 +41,14 @@ default_config = {
|
|
|
41
41
|
"fill_na": True,
|
|
42
42
|
"normalize": False,
|
|
43
43
|
},
|
|
44
|
-
"corr_function": "
|
|
44
|
+
"corr_function": "numpy_without_mask",
|
|
45
|
+
"per_complex": {
|
|
46
|
+
"n_jobs": 8,
|
|
47
|
+
"chunk_size": 400,
|
|
48
|
+
"max_nbytes": "100M",
|
|
49
|
+
},
|
|
45
50
|
"logging": {
|
|
46
|
-
"visible_levels": ["DONE"]
|
|
51
|
+
"visible_levels": ["DONE", "INFO", "WARNING"]
|
|
47
52
|
# "PROGRESS", "STARTED", ,"INFO","WARNING"
|
|
48
53
|
}
|
|
49
54
|
}
|
|
@@ -57,31 +62,34 @@ terms, genes_in_terms = flex.load_gold_standard()
|
|
|
57
62
|
|
|
58
63
|
# Run analysis
|
|
59
64
|
for name, dataset in data.items():
|
|
60
|
-
|
|
61
|
-
|
|
65
|
+
# Calculate correlation once and reuse it for global and per-complex PRA.
|
|
66
|
+
corr = flex.perform_corr(dataset, default_config["corr_function"])
|
|
67
|
+
pra = flex.pra(name, corr, is_corr=True)
|
|
68
|
+
fpc = flex.pra_percomplex(name, corr, is_corr=True)
|
|
62
69
|
cc = flex.complex_contributions(name)
|
|
63
|
-
flex.mpr_prepare(name)
|
|
64
|
-
|
|
65
70
|
|
|
71
|
+
# Optional mPR analysis. This can be slow on large datasets.
|
|
72
|
+
# flex.mpr_prepare(name)
|
|
73
|
+
|
|
66
74
|
|
|
67
75
|
|
|
68
76
|
#%%
|
|
69
77
|
# Generate plots
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
# flex.plot_percomplex_scatter(n_top=20)
|
|
74
|
-
# flex.plot_percomplex_scatter_bysize()
|
|
75
|
-
# flex.plot_complex_contributions()
|
|
78
|
+
flex.plot_precision_recall_curve()
|
|
79
|
+
flex.plot_auc_scores()
|
|
80
|
+
flex.plot_significant_complexes()
|
|
76
81
|
#%%
|
|
77
|
-
|
|
78
|
-
flex.
|
|
82
|
+
flex.plot_percomplex_scatter(n_top=10)
|
|
83
|
+
flex.plot_percomplex_scatter_bysize(n_top=10)
|
|
84
|
+
#flex.plot_complex_contributions()
|
|
85
|
+
|
|
86
|
+
# Optional mPR summary plot. Requires flex.mpr_prepare(name) above.
|
|
87
|
+
# flex.plot_mpr_summary(variants="unfiltered")
|
|
79
88
|
|
|
80
89
|
#%%
|
|
81
90
|
# Save results to CSV
|
|
82
|
-
flex.save_results_to_csv()
|
|
83
|
-
|
|
91
|
+
# flex.save_results_to_csv()
|
|
84
92
|
|
|
85
|
-
#
|
|
86
|
-
|
|
87
|
-
|
|
93
|
+
# How many CPU cores are available on this machine?
|
|
94
|
+
import multiprocessing
|
|
95
|
+
print(f"Number of CPU cores available: {multiprocessing.cpu_count()}")
|
|
@@ -12,53 +12,43 @@ skin = pd.read_csv('C:/Users/yd/Desktop/projects/_datasets/depmap/25Q2/subset/sk
|
|
|
12
12
|
|
|
13
13
|
soft = pd.read_csv('C:/Users/yd/Desktop/projects/_datasets/depmap/25Q2/subset/soft_tissue_cell_lines.csv', index_col=0)
|
|
14
14
|
|
|
15
|
-
|
|
16
15
|
cholesky = pd.read_csv('C:/Users/yd/Desktop/projects/_datasets/depmap/25Q2/25Q2_chronos_whitened_Cholesky.csv', index_col=0).T
|
|
17
16
|
|
|
18
|
-
# inputs = {
|
|
19
|
-
# "All Screens": {
|
|
20
|
-
# "path": gene_effect,
|
|
21
|
-
# "sort": "high",
|
|
22
|
-
# "color": "#000000"
|
|
23
|
-
# },
|
|
24
|
-
# "Skin": {
|
|
25
|
-
# "path": skin,
|
|
26
|
-
# "sort": "high",
|
|
27
|
-
# "color": "#FF0000"
|
|
28
|
-
# },
|
|
29
|
-
# "Soft Tissue": {
|
|
30
|
-
# "path": soft,
|
|
31
|
-
# "sort": "high",
|
|
32
|
-
# "color": "#FFFF00"
|
|
33
|
-
# },
|
|
34
|
-
# }
|
|
35
|
-
|
|
36
|
-
|
|
37
17
|
inputs = {
|
|
38
|
-
"
|
|
18
|
+
"depmap": {
|
|
39
19
|
"path": gene_effect,
|
|
40
20
|
"sort": "high",
|
|
41
21
|
"color": "#000000"
|
|
42
22
|
},
|
|
43
|
-
|
|
44
|
-
|
|
23
|
+
# "cholesky": {
|
|
24
|
+
# "path": cholesky,
|
|
25
|
+
# "sort": "high",
|
|
26
|
+
# "color": "#000CF4"
|
|
27
|
+
# },
|
|
28
|
+
"Skin": {
|
|
29
|
+
"path": skin,
|
|
45
30
|
"sort": "high",
|
|
46
|
-
"color": "#
|
|
31
|
+
"color": "#2E7D32"
|
|
32
|
+
},
|
|
33
|
+
"Soft Tissue": {
|
|
34
|
+
"path": soft,
|
|
35
|
+
"sort": "high",
|
|
36
|
+
"color": "#7A8B00"
|
|
47
37
|
},
|
|
48
|
-
|
|
49
38
|
}
|
|
50
39
|
|
|
51
40
|
|
|
52
41
|
|
|
53
42
|
|
|
43
|
+
|
|
54
44
|
default_config = {
|
|
55
45
|
"min_genes_in_complex": 2,
|
|
56
|
-
"min_genes_per_complex_analysis":
|
|
57
|
-
"output_folder": "
|
|
46
|
+
"min_genes_per_complex_analysis": 2,
|
|
47
|
+
"output_folder": "for_paper_output_01062026_CORUM_dm_skin_soft",
|
|
58
48
|
"gold_standard": "CORUM",
|
|
59
|
-
"color_map": "
|
|
60
|
-
"jaccard":
|
|
61
|
-
"
|
|
49
|
+
"color_map": "RdYlBu",
|
|
50
|
+
"jaccard": True,
|
|
51
|
+
"analysis_genes": "shared", # or "dataset_specific" (genes present per dataset)
|
|
62
52
|
"plotting": {
|
|
63
53
|
"save_plot": True,
|
|
64
54
|
"output_type": "pdf",
|
|
@@ -68,8 +58,13 @@ default_config = {
|
|
|
68
58
|
"normalize": False,
|
|
69
59
|
},
|
|
70
60
|
"corr_function": "numpy",
|
|
61
|
+
"per_complex": {
|
|
62
|
+
"n_jobs": 8,
|
|
63
|
+
"chunk_size": 400,
|
|
64
|
+
"max_nbytes": "100M",
|
|
65
|
+
},
|
|
71
66
|
"logging": {
|
|
72
|
-
"visible_levels": ["DONE"]
|
|
67
|
+
"visible_levels": ["DONE", "INFO", "WARNING"]
|
|
73
68
|
# "PROGRESS", "STARTED", ,"INFO","WARNING"
|
|
74
69
|
}
|
|
75
70
|
}
|
|
@@ -83,12 +78,14 @@ terms, genes_in_terms = flex.load_gold_standard()
|
|
|
83
78
|
|
|
84
79
|
# Run analysis
|
|
85
80
|
for name, dataset in data.items():
|
|
86
|
-
|
|
87
|
-
|
|
81
|
+
# Calculate correlation once and reuse it for global and per-complex PRA.
|
|
82
|
+
corr = flex.perform_corr(dataset, default_config["corr_function"])
|
|
83
|
+
pra = flex.pra(name, corr, is_corr=True)
|
|
84
|
+
fpc = flex.pra_percomplex(name, corr, is_corr=True)
|
|
88
85
|
cc = flex.complex_contributions(name)
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
86
|
+
# Optional mPR analysis. This can be slow on large datasets.
|
|
87
|
+
flex.mpr_prepare(name)
|
|
88
|
+
|
|
92
89
|
|
|
93
90
|
|
|
94
91
|
#%%
|
|
@@ -99,13 +96,11 @@ flex.plot_significant_complexes()
|
|
|
99
96
|
flex.plot_percomplex_scatter(n_top=20)
|
|
100
97
|
flex.plot_percomplex_scatter_bysize()
|
|
101
98
|
flex.plot_complex_contributions()
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
flex.plot_mpr_tp_multi(show_filters="all")
|
|
105
|
-
flex.plot_mpr_complexes_multi(show_filters="all")
|
|
106
|
-
|
|
99
|
+
# Optional mPR summary plot. Requires flex.mpr_prepare(name) above.
|
|
100
|
+
flex.plot_mpr_summary(variants="unfiltered")
|
|
107
101
|
# Save results to CSV
|
|
108
102
|
flex.save_results_to_csv()
|
|
109
103
|
|
|
110
|
-
|
|
104
|
+
|
|
105
|
+
|
|
111
106
|
# %%
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
"""Benchmark the manuscript workflow against each bundled gold standard.
|
|
2
|
+
|
|
3
|
+
Run from any directory with:
|
|
4
|
+
python path/to/src/pythonflex/examples/runtime/runtime_benchmark.py
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
import sys
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from time import perf_counter
|
|
13
|
+
from typing import Any, Callable
|
|
14
|
+
|
|
15
|
+
import pandas as pd
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _find_project_root(script_path: Path) -> Path:
    """Return the directory that contains the ``src`` folder of the checkout.

    The nearest ancestor directory named ``src`` is located and its parent is
    returned, so the result does not depend on how deeply this script is
    nested below the package directory.  When no ancestor is called ``src``
    the previous fixed-depth lookup (``parents[3]``) is used, clamped so that
    a shallow path cannot raise ``IndexError``.

    Args:
        script_path: Absolute path of the running script.

    Returns:
        The inferred project root directory.
    """
    ancestors = script_path.parents
    for ancestor in ancestors:
        if ancestor.name == "src":
            return ancestor.parent
    if not ancestors:
        return script_path
    return ancestors[min(3, len(ancestors) - 1)]


PROJECT_ROOT = _find_project_root(Path(__file__).resolve())
SRC_ROOT = PROJECT_ROOT / "src"
# Put the source tree first on sys.path so the import below can resolve
# against it when the script is run from a checkout.
if str(SRC_ROOT) not in sys.path:
    sys.path.insert(0, str(SRC_ROOT))
|
|
22
|
+
|
|
23
|
+
# Plot generation is benchmarked and saved without opening interactive windows.
|
|
24
|
+
os.environ.setdefault("MPLBACKEND", "Agg")
|
|
25
|
+
|
|
26
|
+
import pythonflex as flex # noqa: E402
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# Gene-effect matrix used as the benchmark input.  The default is the
# original hard-coded location; set PYTHONFLEX_GENE_EFFECT_PATH to point the
# benchmark at a copy stored elsewhere without editing this file.
GENE_EFFECT_PATH = Path(
    os.environ.get(
        "PYTHONFLEX_GENE_EFFECT_PATH",
        "C:/Users/yd/Desktop/projects/_datasets/depmap/25Q2/gene_effect.csv",
    )
)
# Each gold standard writes into its own sub-folder below this directory.
BENCHMARK_ROOT = PROJECT_ROOT / "output" / "runtime_benchmark"
# Gold standards benchmarked, in execution order.
GOLD_STANDARDS = ("CORUM", "PATHWAY", "GOBP")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def build_config(gold_standard: str, output_folder: Path) -> dict[str, Any]:
    """Build the configuration dictionary for one benchmark run.

    Args:
        gold_standard: Name of the gold standard to evaluate against; stored
            under the ``"gold_standard"`` key.
        output_folder: Directory for this run's outputs; stored as a string
            under the ``"output_folder"`` key.

    Returns:
        A freshly built configuration dictionary (nested dictionaries are new
        objects on every call), suitable for passing to ``flex.initialize``.
    """
    return {
        "min_genes_in_complex": 2,
        "min_genes_per_complex_analysis": 3,
        "output_folder": str(output_folder),
        "gold_standard": gold_standard,
        "color_map": "RdYlBu",
        "jaccard": True,
        # The example configurations document the accepted values as
        # "shared" or "dataset_specific"; use the documented spelling.
        "analysis_genes": "shared",
        "plotting": {
            "save_plot": True,
            # Figures are written to disk only; no interactive windows.
            "show_plot": False,
            "output_type": "png",
        },
        "preprocessing": {
            "fill_na": True,
            "normalize": False,
        },
        "corr_function": "numpy",
        "logging": {
            "visible_levels": ["DONE", "INFO", "WARNING"],
        },
    }
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def timed_call(
    timings: list[dict[str, Any]],
    gold_standard: str,
    step: str,
    operation: Callable[[], Any],
) -> Any:
    """Invoke *operation*, log how long it took, and hand back its result.

    Args:
        timings: List that receives one new record per successful call.
        gold_standard: Label stored in the record's ``"gold_standard"`` field.
        step: Label stored in the record's ``"step"`` field.
        operation: Zero-argument callable to execute and time.

    Returns:
        Whatever *operation* returns.  If *operation* raises, the exception
        propagates and no record is appended.
    """
    began = perf_counter()
    outcome = operation()
    elapsed = perf_counter() - began
    record = {
        "gold_standard": gold_standard,
        "step": step,
        "seconds": elapsed,
    }
    timings.append(record)
    return outcome
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def run_gold_standard(gold_standard: str) -> list[dict[str, Any]]:
    """Run the whole workflow once for *gold_standard*, timing every step.

    The steps run in a fixed order: initialisation, reading the gene-effect
    CSV, dataset loading, gold-standard loading, the analysis calls, and
    finally the plotting calls.  Each step is recorded through ``timed_call``;
    a closing ``"total_runtime"`` record covers the whole function up to that
    point.  The records are also written to ``benchmark_results.csv`` inside
    ``BENCHMARK_ROOT / gold_standard``.

    Args:
        gold_standard: Name of the gold standard to benchmark.

    Returns:
        The list of timing records, one dictionary per step plus the total.
    """
    results_dir = BENCHMARK_ROOT / gold_standard
    timings: list[dict[str, Any]] = []
    started_at = perf_counter()

    def clock(step: str, operation: Callable[[], Any]) -> Any:
        # Thin wrapper so each step below only names itself and its callable.
        return timed_call(timings, gold_standard, step, operation)

    # --- setup -------------------------------------------------------------
    clock(
        "initialize",
        lambda: flex.initialize(build_config(gold_standard, results_dir)),
    )
    gene_effect = clock(
        "read_gene_effect",
        lambda: pd.read_csv(GENE_EFFECT_PATH, index_col=0),
    )
    inputs = {
        "All screens": {"path": gene_effect, "sort": "high", "color": "#000000"},
    }
    data, _ = clock("load_datasets", lambda: flex.load_datasets(inputs))
    clock("load_gold_standard", flex.load_gold_standard)

    # --- analysis (only the first loaded dataset is benchmarked) ------------
    name, dataset = next(iter(data.items()))
    analysis_steps = {
        "pra": lambda: flex.pra(name, dataset, is_corr=False),
        "pra_percomplex": lambda: flex.pra_percomplex(name, dataset, is_corr=False),
        "complex_contributions": lambda: flex.complex_contributions(name),
        "mpr_prepare": lambda: flex.mpr_prepare(name),
    }
    for step, operation in analysis_steps.items():
        clock(step, operation)

    # --- plotting ------------------------------------------------------------
    # Argument-free plot steps are labelled with the name of the flex function
    # they call, so the function is looked up by its step name right before use.
    for step in (
        "plot_precision_recall_curve",
        "plot_auc_scores",
        "plot_significant_complexes",
    ):
        clock(step, getattr(flex, step))
    clock("plot_percomplex_scatter", lambda: flex.plot_percomplex_scatter(n_top=20))
    for step in ("plot_percomplex_scatter_bysize", "plot_complex_contributions"):
        clock(step, getattr(flex, step))

    # --- bookkeeping -----------------------------------------------------------
    timings.append(
        {
            "gold_standard": gold_standard,
            "step": "total_runtime",
            "seconds": perf_counter() - started_at,
        }
    )
    results_dir.mkdir(parents=True, exist_ok=True)
    csv_path = results_dir / "benchmark_results.csv"
    pd.DataFrame(timings).to_csv(csv_path, index=False)
    return timings
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def main() -> None:
    """Benchmark every configured gold standard and save a combined report.

    Calls ``run_gold_standard`` for each entry of ``GOLD_STANDARDS``, sums
    their ``"total_runtime"`` records into a ``"grand_total_runtime"`` record,
    and writes all records to ``benchmark_results_all_gold_standards.csv``
    under ``BENCHMARK_ROOT``.

    Raises:
        FileNotFoundError: If ``GENE_EFFECT_PATH`` does not exist.
    """
    if not GENE_EFFECT_PATH.exists():
        raise FileNotFoundError(f"Input dataset was not found: {GENE_EFFECT_PATH}")

    all_timings: list[dict[str, Any]] = []
    for gold_standard in GOLD_STANDARDS:
        print(f"Running runtime benchmark: {gold_standard}")
        all_timings += run_gold_standard(gold_standard)

    # Only the per-gold-standard totals count towards the grand total;
    # adding the individual steps as well would double-count them.
    per_standard_totals = [
        row["seconds"] for row in all_timings if row["step"] == "total_runtime"
    ]
    total_seconds = sum(per_standard_totals)
    grand_total_row = {
        "gold_standard": "ALL",
        "step": "grand_total_runtime",
        "seconds": total_seconds,
    }
    all_timings.append(grand_total_row)

    BENCHMARK_ROOT.mkdir(parents=True, exist_ok=True)
    combined_path = BENCHMARK_ROOT / "benchmark_results_all_gold_standards.csv"
    report = pd.DataFrame(all_timings)
    report.to_csv(combined_path, index=False)

    print(f"Benchmark results saved to: {combined_path}")
    print(f"Grand total workflow runtime: {total_seconds:.3f} seconds")
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
if __name__ == "__main__":
|
|
218
|
+
main()
|