pythonflex 0.3__tar.gz → 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pythonflex-0.3 → pythonflex-0.3.2}/PKG-INFO +7 -1
- {pythonflex-0.3 → pythonflex-0.3.2}/pyproject.toml +10 -2
- {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/__init__.py +2 -2
- {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/examples/basic_usage.py +24 -15
- {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/plotting.py +225 -22
- {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/preprocessing.py +28 -21
- pythonflex-0.3/src/pythonflex/examples/comparison.py +0 -78
- pythonflex-0.3/src/pythonflex/examples/dataset_filtering.py +0 -42
- pythonflex-0.3/src/pythonflex/examples/diag.py +0 -106
- pythonflex-0.3/src/pythonflex/examples/test.py +0 -104
- {pythonflex-0.3 → pythonflex-0.3.2}/.gitignore +0 -0
- {pythonflex-0.3 → pythonflex-0.3.2}/.python-version +0 -0
- {pythonflex-0.3 → pythonflex-0.3.2}/README.md +0 -0
- {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/analysis.py +0 -0
- {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/data/__init__.py +0 -0
- {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/data/dataset/__init__.py +0 -0
- {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/data/dataset/liver_cell_lines_500_genes.csv +0 -0
- {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/data/dataset/melanoma_cell_lines_500_genes.csv +0 -0
- {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/data/dataset/neuroblastoma_cell_lines_500_genes.csv +0 -0
- {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/data/gold_standard/CORUM.parquet +0 -0
- {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/data/gold_standard/GOBP.parquet +0 -0
- {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/data/gold_standard/PATHWAY.parquet +0 -0
- {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/data/gold_standard/__init__.py +0 -0
- {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/data/gold_standard/corum.csv +0 -0
- {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/data/gold_standard/gobp.csv +0 -0
- {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/data/gold_standard/pathway.csv +0 -0
- {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/logging_config.py +0 -0
- {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/utils.py +0 -0
- {pythonflex-0.3 → pythonflex-0.3.2}/todo.txt +0 -0
- {pythonflex-0.3 → pythonflex-0.3.2}/uv.lock +0 -0
|
@@ -1,8 +1,14 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pythonflex
|
|
3
|
-
Version: 0.3
|
|
3
|
+
Version: 0.3.2
|
|
4
4
|
Summary: pythonFLEX is a benchmarking toolkit for evaluating CRISPR screen results against biological gold standards. The toolkit computes gene-level and complex-level performance metrics, helping researchers systematically assess the biological relevance and resolution of their CRISPR screening data.
|
|
5
5
|
Author-email: Yasir Demirtaş <tyasird@hotmail.com>
|
|
6
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
7
|
+
Classifier: Operating System :: OS Independent
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
6
12
|
Requires-Python: >=3.9
|
|
7
13
|
Requires-Dist: adjusttext
|
|
8
14
|
Requires-Dist: art
|
|
@@ -1,13 +1,20 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "pythonflex"
|
|
3
|
-
version = "0.3"
|
|
3
|
+
version = "0.3.2"
|
|
4
4
|
description = "pythonFLEX is a benchmarking toolkit for evaluating CRISPR screen results against biological gold standards. The toolkit computes gene-level and complex-level performance metrics, helping researchers systematically assess the biological relevance and resolution of their CRISPR screening data."
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
authors = [
|
|
7
7
|
{ name = "Yasir Demirtaş", email = "tyasird@hotmail.com" }
|
|
8
8
|
]
|
|
9
9
|
requires-python = ">=3.9"
|
|
10
|
-
|
|
10
|
+
classifiers = [
|
|
11
|
+
"Programming Language :: Python :: 3",
|
|
12
|
+
"Programming Language :: Python :: 3.9",
|
|
13
|
+
"Programming Language :: Python :: 3.10",
|
|
14
|
+
"Programming Language :: Python :: 3.11",
|
|
15
|
+
"License :: OSI Approved :: MIT License",
|
|
16
|
+
"Operating System :: OS Independent",
|
|
17
|
+
]
|
|
11
18
|
|
|
12
19
|
# Exclude the input folder
|
|
13
20
|
exclude = ["src/pythonflex/input/*", "src/pythonflex/output/*", "src/pythonflex/examples/output/*",
|
|
@@ -67,3 +74,4 @@ pythonflex = { workspace = true }
|
|
|
67
74
|
dev = [
|
|
68
75
|
"pythonflex",
|
|
69
76
|
]
|
|
77
|
+
|
|
@@ -3,7 +3,7 @@ from .utils import dsave, dload
|
|
|
3
3
|
from .preprocessing import get_example_data_path, load_datasets, get_common_genes, filter_matrix_by_genes, load_gold_standard, filter_duplicate_terms
|
|
4
4
|
from .analysis import initialize, pra, pra_percomplex, fast_corr, perform_corr, is_symmetric, binary, has_mirror_of_first_pair, convert_full_to_half_matrix, drop_mirror_pairs, quick_sort, complex_contributions, save_results_to_csv, update_matploblib_config, mpr_prepare
|
|
5
5
|
from .plotting import (
|
|
6
|
-
adjust_text_positions, plot_precision_recall_curve, plot_percomplex_scatter,
|
|
6
|
+
adjust_text_positions, plot_precision_recall_curve, plot_aggregated_pra, plot_iqr_pra, plot_all_runs_pra, plot_percomplex_scatter,
|
|
7
7
|
plot_percomplex_scatter_bysize, plot_complex_contributions, plot_significant_complexes, plot_auc_scores,
|
|
8
8
|
plot_mpr_tp, plot_mpr_complexes, plot_mpr_tp_multi, plot_mpr_complexes_multi
|
|
9
9
|
)
|
|
@@ -13,7 +13,7 @@ __all__ = [ "log", "get_example_data_path", "fast_corr",
|
|
|
13
13
|
"filter_matrix_by_genes", "load_gold_standard", "filter_duplicate_terms", "pra", "pra_percomplex",
|
|
14
14
|
"perform_corr", "is_symmetric", "binary", "has_mirror_of_first_pair", "convert_full_to_half_matrix",
|
|
15
15
|
"drop_mirror_pairs", "quick_sort", "complex_contributions", "adjust_text_positions", "plot_precision_recall_curve",
|
|
16
|
-
"plot_percomplex_scatter", "plot_percomplex_scatter_bysize", "plot_complex_contributions",
|
|
16
|
+
"plot_aggregated_pra", "plot_iqr_pra", "plot_all_runs_pra", "plot_percomplex_scatter", "plot_percomplex_scatter_bysize", "plot_complex_contributions",
|
|
17
17
|
"plot_significant_complexes", "plot_auc_scores", "save_results_to_csv", "update_matploblib_config",
|
|
18
18
|
"mpr_prepare", "plot_mpr_tp", "plot_mpr_complexes",
|
|
19
19
|
"plot_mpr_tp_multi", "plot_mpr_complexes_multi"
|
|
@@ -8,32 +8,34 @@ import pythonflex as flex
|
|
|
8
8
|
inputs = {
|
|
9
9
|
"Melanoma (63 Screens)": {
|
|
10
10
|
"path": flex.get_example_data_path("melanoma_cell_lines_500_genes.csv"),
|
|
11
|
-
"sort": "high"
|
|
11
|
+
"sort": "high",
|
|
12
|
+
"color": "#FF0000"
|
|
12
13
|
},
|
|
13
14
|
"Liver (24 Screens)": {
|
|
14
15
|
"path": flex.get_example_data_path("liver_cell_lines_500_genes.csv"),
|
|
15
|
-
"sort": "high"
|
|
16
|
+
"sort": "high",
|
|
17
|
+
"color": "#FFDD00"
|
|
16
18
|
},
|
|
17
19
|
"Neuroblastoma (37 Screens)": {
|
|
18
20
|
"path": flex.get_example_data_path("neuroblastoma_cell_lines_500_genes.csv"),
|
|
19
|
-
"sort": "high"
|
|
21
|
+
"sort": "high",
|
|
22
|
+
"color": "#FFDDDD"
|
|
20
23
|
},
|
|
21
24
|
}
|
|
22
25
|
|
|
23
26
|
|
|
24
27
|
|
|
25
|
-
#%%
|
|
26
28
|
default_config = {
|
|
27
29
|
"min_genes_in_complex": 0,
|
|
28
30
|
"min_genes_per_complex_analysis": 3,
|
|
29
|
-
"output_folder": "
|
|
31
|
+
"output_folder": "CORUM",
|
|
30
32
|
"gold_standard": "CORUM",
|
|
31
|
-
"color_map": "
|
|
32
|
-
"jaccard":
|
|
33
|
+
"color_map": "BuGn",
|
|
34
|
+
"jaccard": False,
|
|
33
35
|
"use_common_genes": False, # Set to False for individual dataset-gold standard intersections
|
|
34
36
|
"plotting": {
|
|
35
37
|
"save_plot": True,
|
|
36
|
-
"output_type": "
|
|
38
|
+
"output_type": "png",
|
|
37
39
|
},
|
|
38
40
|
"preprocessing": {
|
|
39
41
|
"fill_na": True,
|
|
@@ -41,7 +43,8 @@ default_config = {
|
|
|
41
43
|
},
|
|
42
44
|
"corr_function": "numpy",
|
|
43
45
|
"logging": {
|
|
44
|
-
"visible_levels": ["DONE"
|
|
46
|
+
"visible_levels": ["DONE"]
|
|
47
|
+
# "PROGRESS", "STARTED", ,"INFO","WARNING"
|
|
45
48
|
}
|
|
46
49
|
}
|
|
47
50
|
|
|
@@ -52,26 +55,32 @@ flex.initialize(default_config)
|
|
|
52
55
|
data, _ = flex.load_datasets(inputs)
|
|
53
56
|
terms, genes_in_terms = flex.load_gold_standard()
|
|
54
57
|
|
|
55
|
-
|
|
56
|
-
#%%
|
|
57
58
|
# Run analysis
|
|
58
59
|
for name, dataset in data.items():
|
|
59
60
|
pra = flex.pra(name, dataset, is_corr=False)
|
|
60
61
|
fpc = flex.pra_percomplex(name, dataset, is_corr=False)
|
|
61
62
|
cc = flex.complex_contributions(name)
|
|
62
|
-
|
|
63
|
+
flex.mpr_prepare(name)
|
|
64
|
+
|
|
65
|
+
|
|
63
66
|
|
|
64
67
|
|
|
65
68
|
#%%
|
|
66
69
|
# Generate plots
|
|
67
|
-
flex.plot_auc_scores()
|
|
68
70
|
flex.plot_precision_recall_curve()
|
|
71
|
+
flex.plot_auc_scores()
|
|
72
|
+
flex.plot_significant_complexes()
|
|
69
73
|
flex.plot_percomplex_scatter(n_top=20)
|
|
70
74
|
flex.plot_percomplex_scatter_bysize()
|
|
71
|
-
flex.plot_significant_complexes()
|
|
72
75
|
flex.plot_complex_contributions()
|
|
73
|
-
|
|
76
|
+
##
|
|
77
|
+
flex.plot_mpr_tp_multi()
|
|
78
|
+
flex.plot_mpr_complexes_multi()
|
|
74
79
|
|
|
75
80
|
#%%
|
|
76
81
|
# Save results to CSV
|
|
77
82
|
flex.save_results_to_csv()
|
|
83
|
+
|
|
84
|
+
# %%
|
|
85
|
+
flex.plot_mpr_complexes_multi(show_filters="no_mtRibo_ETCI")
|
|
86
|
+
# %%
|
|
@@ -58,7 +58,12 @@ def plot_precision_recall_curve(line_width=2.0, hide_minor_ticks=True):
|
|
|
58
58
|
log.warning(f"Color map '{cmap_name}' not found. Falling back to 'tab10'.")
|
|
59
59
|
cmap = get_cmap("tab10")
|
|
60
60
|
|
|
61
|
-
|
|
61
|
+
# Increase figure width to accommodate external legend without squashing axes
|
|
62
|
+
fig, ax = plt.subplots(figsize=(6, 4))
|
|
63
|
+
|
|
64
|
+
# Adjust layout to make room for legend on the right
|
|
65
|
+
plt.subplots_adjust(right=0.7)
|
|
66
|
+
|
|
62
67
|
ax.set_xscale("log")
|
|
63
68
|
|
|
64
69
|
# optionally hide minor ticks on the log axis
|
|
@@ -92,7 +97,7 @@ def plot_precision_recall_curve(line_width=2.0, hide_minor_ticks=True):
|
|
|
92
97
|
ax.set(title="",
|
|
93
98
|
xlabel="Number of True Positives (TP)",
|
|
94
99
|
ylabel="Precision")
|
|
95
|
-
ax.legend(loc="upper
|
|
100
|
+
ax.legend(loc="upper left", bbox_to_anchor=(1.05, 1), frameon=False)
|
|
96
101
|
ax.set_ylim(0, 1)
|
|
97
102
|
|
|
98
103
|
# Nature style: no grid, open top/right spines
|
|
@@ -109,6 +114,171 @@ def plot_precision_recall_curve(line_width=2.0, hide_minor_ticks=True):
|
|
|
109
114
|
plt.show()
|
|
110
115
|
plt.close(fig)
|
|
111
116
|
|
|
117
|
+
def plot_aggregated_pra(agg_df, line_width=2.0, hide_minor_ticks=True):
    """
    Plot an aggregated precision curve: mean line with min-max shading.

    Parameters
    ----------
    agg_df : DataFrame-like
        Indexed by the number of true positives (TP) and containing the
        precision columns 'mean', 'min' and 'max'. Rows whose index is
        <= 10 are dropped before plotting.
    line_width : float
        Line width of the mean curve.
    hide_minor_ticks : bool
        If True, minor ticks and their labels are suppressed on the
        log-scaled x axis.

    Raises
    ------
    KeyError
        If a required column is missing. The columns are read before the
        figure is created, so no figure is left open in that case.

    Side effects
    ------------
    When config["plotting"]["save_plot"] is truthy the figure is written to
    <output_folder>/aggregated_precision_recall_curve.<output_type>. The
    figure is shown unless config["plotting"]["show_plot"] is falsy
    (default True) and is always closed before returning.
    """
    config = dload("config")
    plot_config = config["plotting"]

    # Filter out very low TP counts (same threshold the other PR plots use)
    # and pull every required column up front, so bad input fails before a
    # figure has been allocated.
    agg_df = agg_df[agg_df.index > 10]
    tp = agg_df.index
    mean_prec = agg_df['mean']
    min_prec = agg_df['min']
    max_prec = agg_df['max']

    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        # Reserve the right 30% of the canvas for the external legend.
        fig.subplots_adjust(right=0.7)

        ax.set_xscale("log")

        # optionally hide minor ticks on the log axis
        if hide_minor_ticks:
            ax.xaxis.set_minor_locator(NullLocator())
            ax.xaxis.set_minor_formatter(NullFormatter())

        # Shaded band between the per-TP minimum and maximum precision
        ax.fill_between(tp, min_prec, max_prec, color='gray', alpha=0.3, label='Range (Min-Max)')

        # Mean line drawn on top of the band
        ax.plot(tp, mean_prec, c="black", label="Mean Precision", linewidth=line_width, alpha=0.9)

        ax.set(title="",
               xlabel="Number of True Positives (TP)",
               ylabel="Precision")
        # Anchor the legend just outside the right edge of the axes.
        ax.legend(loc="upper left", bbox_to_anchor=(1.05, 1), frameon=False)
        ax.set_ylim(0, 1)

        # Nature style: no grid, open top/right spines
        ax.grid(False)
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)

        if plot_config["save_plot"]:
            output_type = plot_config["output_type"]
            output_path = Path(config["output_folder"]) / f"aggregated_precision_recall_curve.{output_type}"
            fig.savefig(output_path, bbox_inches="tight", format=output_type)

        if plot_config.get("show_plot", True):
            plt.show()
    finally:
        # Always release the figure, including when plotting or saving raised.
        plt.close(fig)
|
|
171
|
+
|
|
172
|
+
def plot_iqr_pra(agg_df, line_width=2.0, hide_minor_ticks=True):
    """
    Plot an aggregated precision curve: mean line with IQR (25-75%) shading.

    Parameters
    ----------
    agg_df : DataFrame-like
        Indexed by the number of true positives (TP) and containing the
        precision columns 'mean', '25%' and '75%'. Rows whose index is
        <= 10 are dropped before plotting.
    line_width : float
        Line width of the mean curve.
    hide_minor_ticks : bool
        If True, minor ticks and their labels are suppressed on the
        log-scaled x axis.

    Raises
    ------
    KeyError
        If a required column is missing. The columns are read before the
        figure is created, so no figure is left open in that case.

    Side effects
    ------------
    When config["plotting"]["save_plot"] is truthy the figure is written to
    <output_folder>/aggregated_iqr_precision_recall_curve.<output_type>. The
    figure is shown unless config["plotting"]["show_plot"] is falsy
    (default True) and is always closed before returning.
    """
    config = dload("config")
    plot_config = config["plotting"]

    # Filter out very low TP counts and read every required column up front,
    # so bad input fails before a figure has been allocated.
    agg_df = agg_df[agg_df.index > 10]
    tp = agg_df.index
    mean_prec = agg_df['mean']
    q25_prec = agg_df['25%']
    q75_prec = agg_df['75%']

    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        # Reserve the right 30% of the canvas for the external legend.
        fig.subplots_adjust(right=0.7)

        ax.set_xscale("log")

        # optionally hide minor ticks on the log axis
        if hide_minor_ticks:
            ax.xaxis.set_minor_locator(NullLocator())
            ax.xaxis.set_minor_formatter(NullFormatter())

        # Shaded band between the 25th and 75th percentile precision
        ax.fill_between(tp, q25_prec, q75_prec, color='gray', alpha=0.3, label='IQR (25-75%)')

        # Mean line drawn on top of the band
        ax.plot(tp, mean_prec, c="black", label="Mean Precision", linewidth=line_width, alpha=0.9)

        ax.set(title="Precision-Recall (IQR)",
               xlabel="Number of True Positives (TP)",
               ylabel="Precision")
        # Anchor the legend just outside the right edge of the axes.
        ax.legend(loc="upper left", bbox_to_anchor=(1.05, 1), frameon=False)
        ax.set_ylim(0, 1)

        # Nature style: no grid, open top/right spines
        ax.grid(False)
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)

        if plot_config["save_plot"]:
            output_type = plot_config["output_type"]
            output_path = Path(config["output_folder"]) / f"aggregated_iqr_precision_recall_curve.{output_type}"
            fig.savefig(output_path, bbox_inches="tight", format=output_type)

        if plot_config.get("show_plot", True):
            plt.show()
    finally:
        # Always release the figure, including when plotting or saving raised.
        plt.close(fig)
|
|
226
|
+
|
|
227
|
+
def plot_all_runs_pra(pra_list, mean_df=None, line_width=2.0, hide_minor_ticks=True):
    """
    Plot every individual precision curve faintly, with an optional mean line.

    Parameters
    ----------
    pra_list : list
        Each item is either a DataFrame with 'tp' and 'precision' columns, or
        a Series / DataFrame indexed by TP. For an item without a 'tp' column
        the index is used as x; for an item without a 'precision' column its
        raw ``.values`` are plotted as y.
    mean_df : DataFrame-like, optional
        Indexed by TP with a 'mean' column; drawn as a black line when given.
    line_width : float
        Line width of the mean curve (individual runs use a fixed 0.5).
    hide_minor_ticks : bool
        If True, minor ticks and their labels are suppressed on the
        log-scaled x axis.

    Notes
    -----
    Points with TP <= 10 are dropped from every curve. When
    config["plotting"]["save_plot"] is truthy the figure is written to
    <output_folder>/aggregated_all_runs_precision_recall_curve.<output_type>.
    The figure is shown unless config["plotting"]["show_plot"] is falsy
    (default True) and is always closed before returning.
    """
    config = dload("config")
    plot_config = config["plotting"]

    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        # Reserve the right 30% of the canvas for the external legend.
        fig.subplots_adjust(right=0.7)

        ax.set_xscale("log")

        if hide_minor_ticks:
            ax.xaxis.set_minor_locator(NullLocator())
            ax.xaxis.set_minor_formatter(NullFormatter())

        # Plot individual lines
        for i, df in enumerate(pra_list):
            # A Series has no ``.columns`` attribute; probing it directly
            # would raise AttributeError for the Series input the docstring
            # allows, so fall back to an empty column set.
            columns = getattr(df, "columns", ())

            # Ensure we filter low TPs same as the other PR plots
            if 'tp' in columns:
                df_filtered = df[df['tp'] > 10]
                x = df_filtered['tp']
            else:
                df_filtered = df[df.index > 10]
                x = df_filtered.index

            y = df_filtered['precision'] if 'precision' in columns else df_filtered.values

            # Only add label for the first line to avoid cluttering legend
            lbl = "Individual Runs" if i == 0 else None
            ax.plot(x, y, c="gray", linewidth=0.5, alpha=0.3, label=lbl)

        # Plot mean line if provided
        if mean_df is not None:
            mean_df = mean_df[mean_df.index > 10]
            ax.plot(mean_df.index, mean_df['mean'], c="black", label="Mean Precision", linewidth=line_width, alpha=0.9)

        ax.set(title="Precision-Recall (All Runs)",
               xlabel="Number of True Positives (TP)",
               ylabel="Precision")
        # Anchor the legend just outside the right edge of the axes.
        ax.legend(loc="upper left", bbox_to_anchor=(1.05, 1), frameon=False)
        ax.set_ylim(0, 1)

        # Nature style: no grid, open top/right spines
        ax.grid(False)
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)

        if plot_config["save_plot"]:
            output_type = plot_config["output_type"]
            output_path = Path(config["output_folder"]) / f"aggregated_all_runs_precision_recall_curve.{output_type}"
            fig.savefig(output_path, bbox_inches="tight", format=output_type)

        if plot_config.get("show_plot", True):
            plt.show()
    finally:
        # Always release the figure, including when a malformed run raised.
        plt.close(fig)
|
|
281
|
+
|
|
112
282
|
def plot_percomplex_scatter(n_top=10, sig_color='#B71A2A', nonsig_color='#DBDDDD', label_color='black', border_color='black', border_width=1.0, show_text_background=True):
|
|
113
283
|
config = dload("config")
|
|
114
284
|
plot_config = config["plotting"]
|
|
@@ -1050,14 +1220,10 @@ def plot_auc_scores():
|
|
|
1050
1220
|
plt.close(fig)
|
|
1051
1221
|
return pra_dict
|
|
1052
1222
|
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
1223
|
# -----------------------------------------------------------------------------
|
|
1057
1224
|
# mPR plots (Fig. 1E and Fig. 1F)
|
|
1058
1225
|
# -----------------------------------------------------------------------------
|
|
1059
1226
|
|
|
1060
|
-
|
|
1061
1227
|
def plot_mpr_complexes(name, ax=None, save=True, outname=None):
|
|
1062
1228
|
"""
|
|
1063
1229
|
Fig. 1F-style module-level PR:
|
|
@@ -1208,7 +1374,6 @@ def plot_mpr_tp(name, ax=None, save=True, outname=None):
|
|
|
1208
1374
|
|
|
1209
1375
|
return ax
|
|
1210
1376
|
|
|
1211
|
-
|
|
1212
1377
|
"""
|
|
1213
1378
|
Multi-dataset mPR plotting functions.
|
|
1214
1379
|
|
|
@@ -1229,7 +1394,6 @@ from pathlib import Path
|
|
|
1229
1394
|
from .utils import dload
|
|
1230
1395
|
from .logging_config import log
|
|
1231
1396
|
|
|
1232
|
-
|
|
1233
1397
|
# Default color palette (colorblind-friendly)
|
|
1234
1398
|
DEFAULT_COLORS = [
|
|
1235
1399
|
"#4E79A7", # blue
|
|
@@ -1252,6 +1416,21 @@ FILTER_STYLES = {
|
|
|
1252
1416
|
}
|
|
1253
1417
|
|
|
1254
1418
|
|
|
1419
|
+
def _normalize_show_filters(show_filters):
    """Return ``show_filters`` as an ordered tuple of filter keys.

    * ``None`` selects every key of ``FILTER_STYLES``, in its own order.
    * A single string (e.g. "no_mtRibo_ETCI") is wrapped in a 1-tuple; a
      string is itself iterable, so it must not be expanded into characters.
    * Any other iterable is converted with ``tuple()``.
    * A non-iterable value is wrapped in a 1-tuple.
    """
    if show_filters is None:
        return tuple(FILTER_STYLES.keys())

    if not isinstance(show_filters, str):
        try:
            return tuple(show_filters)
        except TypeError:
            # Not iterable: fall through and treat it as one filter key.
            pass

    return (show_filters,)
|
|
1433
|
+
|
|
1255
1434
|
def plot_mpr_tp_multi(
|
|
1256
1435
|
dataset_names=None,
|
|
1257
1436
|
colors=None,
|
|
@@ -1292,6 +1471,8 @@ def plot_mpr_tp_multi(
|
|
|
1292
1471
|
config = dload("config")
|
|
1293
1472
|
plot_config = config["plotting"]
|
|
1294
1473
|
input_colors = dload("input", "colors")
|
|
1474
|
+
|
|
1475
|
+
show_filters = _normalize_show_filters(show_filters)
|
|
1295
1476
|
|
|
1296
1477
|
# Sanitize color keys
|
|
1297
1478
|
if input_colors:
|
|
@@ -1335,7 +1516,10 @@ def plot_mpr_tp_multi(
|
|
|
1335
1516
|
colors = final_colors
|
|
1336
1517
|
|
|
1337
1518
|
if ax is None:
|
|
1338
|
-
|
|
1519
|
+
# Increase width slightly
|
|
1520
|
+
fig, ax = plt.subplots(figsize=(6, 4))
|
|
1521
|
+
# Reserve space for legend on right
|
|
1522
|
+
plt.subplots_adjust(right=0.7)
|
|
1339
1523
|
else:
|
|
1340
1524
|
fig = ax.figure
|
|
1341
1525
|
|
|
@@ -1413,14 +1597,21 @@ def plot_mpr_tp_multi(
|
|
|
1413
1597
|
|
|
1414
1598
|
# Save
|
|
1415
1599
|
if save:
|
|
1600
|
+
output_type = plot_config.get("output_type", "pdf")
|
|
1416
1601
|
if outname is None:
|
|
1417
|
-
outname = "mpr_tp_multi.
|
|
1602
|
+
outname = f"mpr_tp_multi.{output_type}"
|
|
1603
|
+
|
|
1604
|
+
# Check if outname is just a filename or a full path
|
|
1605
|
+
outpath = Path(outname)
|
|
1606
|
+
if len(outpath.parts) == 1:
|
|
1607
|
+
# Just a filename, prepend configured output folder
|
|
1608
|
+
outpath = Path(config["output_folder"]) / outname
|
|
1609
|
+
|
|
1418
1610
|
fig.tight_layout()
|
|
1419
|
-
fig.savefig(
|
|
1611
|
+
fig.savefig(outpath, bbox_inches="tight", format=output_type)
|
|
1420
1612
|
|
|
1421
1613
|
return ax
|
|
1422
1614
|
|
|
1423
|
-
|
|
1424
1615
|
def plot_mpr_complexes_multi(
|
|
1425
1616
|
dataset_names=None,
|
|
1426
1617
|
colors=None,
|
|
@@ -1461,6 +1652,8 @@ def plot_mpr_complexes_multi(
|
|
|
1461
1652
|
config = dload("config")
|
|
1462
1653
|
plot_config = config["plotting"]
|
|
1463
1654
|
input_colors = dload("input", "colors")
|
|
1655
|
+
|
|
1656
|
+
show_filters = _normalize_show_filters(show_filters)
|
|
1464
1657
|
|
|
1465
1658
|
# Sanitize color keys
|
|
1466
1659
|
if input_colors:
|
|
@@ -1504,7 +1697,10 @@ def plot_mpr_complexes_multi(
|
|
|
1504
1697
|
colors = final_colors
|
|
1505
1698
|
|
|
1506
1699
|
if ax is None:
|
|
1507
|
-
|
|
1700
|
+
# Increase width slightly
|
|
1701
|
+
fig, ax = plt.subplots(figsize=(6, 4))
|
|
1702
|
+
# Reserve space for legend on right
|
|
1703
|
+
plt.subplots_adjust(right=0.7)
|
|
1508
1704
|
else:
|
|
1509
1705
|
fig = ax.figure
|
|
1510
1706
|
|
|
@@ -1564,18 +1760,26 @@ def plot_mpr_complexes_multi(
|
|
|
1564
1760
|
|
|
1565
1761
|
# Save
|
|
1566
1762
|
if save:
|
|
1763
|
+
output_type = plot_config.get("output_type", "pdf")
|
|
1567
1764
|
if outname is None:
|
|
1568
|
-
outname = "mpr_complexes_multi.
|
|
1765
|
+
outname = f"mpr_complexes_multi.{output_type}"
|
|
1766
|
+
|
|
1767
|
+
# Check if outname is just a filename or a full path
|
|
1768
|
+
outpath = Path(outname)
|
|
1769
|
+
if len(outpath.parts) == 1:
|
|
1770
|
+
# Just a filename, prepend configured output folder
|
|
1771
|
+
outpath = Path(config["output_folder"]) / outname
|
|
1772
|
+
|
|
1569
1773
|
fig.tight_layout()
|
|
1570
|
-
fig.savefig(
|
|
1774
|
+
fig.savefig(outpath, bbox_inches="tight", format=output_type)
|
|
1571
1775
|
|
|
1572
1776
|
return ax
|
|
1573
1777
|
|
|
1574
|
-
|
|
1575
1778
|
def _add_vertical_legend(ax, dataset_names, colors, show_filters, linewidth):
|
|
1576
1779
|
"""
|
|
1577
1780
|
Add vertically stacked legends: Dataset on top, Filter below.
|
|
1578
1781
|
"""
|
|
1782
|
+
show_filters = _normalize_show_filters(show_filters)
|
|
1579
1783
|
# Legend 1: Datasets (colors) - solid lines
|
|
1580
1784
|
dataset_handles = []
|
|
1581
1785
|
for i, name in enumerate(dataset_names):
|
|
@@ -1602,12 +1806,12 @@ def _add_vertical_legend(ax, dataset_names, colors, show_filters, linewidth):
|
|
|
1602
1806
|
legend1 = ax.legend(
|
|
1603
1807
|
dataset_handles,
|
|
1604
1808
|
dataset_names,
|
|
1605
|
-
loc="upper
|
|
1809
|
+
loc="upper left",
|
|
1606
1810
|
frameon=False,
|
|
1607
1811
|
title="Dataset",
|
|
1608
1812
|
fontsize=7,
|
|
1609
1813
|
title_fontsize=8,
|
|
1610
|
-
bbox_to_anchor=(1.
|
|
1814
|
+
bbox_to_anchor=(1.05, 1.0)
|
|
1611
1815
|
)
|
|
1612
1816
|
ax.add_artist(legend1)
|
|
1613
1817
|
|
|
@@ -1615,17 +1819,17 @@ def _add_vertical_legend(ax, dataset_names, colors, show_filters, linewidth):
|
|
|
1615
1819
|
legend2 = ax.legend(
|
|
1616
1820
|
filter_handles,
|
|
1617
1821
|
filter_labels,
|
|
1618
|
-
loc="upper
|
|
1822
|
+
loc="upper left",
|
|
1619
1823
|
frameon=False,
|
|
1620
1824
|
fontsize=7,
|
|
1621
|
-
bbox_to_anchor=(1.
|
|
1825
|
+
bbox_to_anchor=(1.05, 1.0 - len(dataset_names) * 0.06 - 0.1)
|
|
1622
1826
|
)
|
|
1623
1827
|
|
|
1624
|
-
|
|
1625
1828
|
def _add_dual_legend(ax, dataset_names, colors, show_filters, linewidth):
|
|
1626
1829
|
"""
|
|
1627
1830
|
Add two legends: one for datasets (colors), one for filters (line styles).
|
|
1628
1831
|
"""
|
|
1832
|
+
show_filters = _normalize_show_filters(show_filters)
|
|
1629
1833
|
# Legend 1: Datasets (colors) - solid lines
|
|
1630
1834
|
dataset_handles = []
|
|
1631
1835
|
for i, name in enumerate(dataset_names):
|
|
@@ -1671,7 +1875,6 @@ def _add_dual_legend(ax, dataset_names, colors, show_filters, linewidth):
|
|
|
1671
1875
|
title_fontsize=8,
|
|
1672
1876
|
)
|
|
1673
1877
|
|
|
1674
|
-
|
|
1675
1878
|
# ============================================================================
|
|
1676
1879
|
# Single dataset functions are now obsolete
|
|
1677
1880
|
# ============================================================================
|
|
@@ -13,28 +13,36 @@ from pathlib import Path
|
|
|
13
13
|
|
|
14
14
|
|
|
15
15
|
def return_package_dir():
    """
    Return the directory that holds the ``pythonflex`` package, as a string.

    Resolution order:

    1. Editable install - the distribution's ``direct_url.json`` has
       ``dir_info.editable`` set: the project root is decoded from the
       recorded ``file://`` URL and ``<root>/src/pythonflex`` is returned
       (src layout assumed). If that directory does not exist, the
       directory containing this module is returned instead.
    2. Regular install: ``str(files('pythonflex'))``.
    3. Any failure while resolving (for example the distribution metadata
       is not found): the directory containing this module.

    Returns
    -------
    str
        Path of the package directory.
    """
    # Function-scope imports keep this fix self-contained in the block.
    from urllib.parse import urlparse
    from urllib.request import url2pathname

    # src/pythonflex/preprocessing.py -> the package dir is this file's parent
    fallback_dir = str(Path(__file__).parent)

    try:
        # Get the distribution
        dist = distribution('pythonflex')

        # Check for direct_url.json
        try:
            direct_url_text = dist.read_text('direct_url.json')
        except FileNotFoundError:
            direct_url_text = None

        direct_url = json.loads(direct_url_text) if direct_url_text else {}

        if direct_url.get('dir_info', {}).get('editable'):
            # Editable install detected. Decode the file URL properly:
            # stripping a literal 'file:///' prefix loses the leading '/' on
            # POSIX (yielding a relative path) and leaves percent-escapes
            # such as '%20' in place. url2pathname handles both, and also
            # converts '/C:/...' style paths on Windows.
            project_root = url2pathname(urlparse(direct_url['url']).path)
            # Assuming src layout: project_root/src/pythonflex
            package_dir = os.path.join(project_root, 'src', 'pythonflex')
            if not os.path.isdir(package_dir):
                # Layout assumption did not hold; use the module location.
                package_dir = fallback_dir
        else:
            # No direct_url, or not editable: regular installed package
            package_dir = str(files('pythonflex'))

    except Exception:  # PackageNotFoundError or other issues
        # Deliberate best-effort: fall back to the directory of this file.
        package_dir = fallback_dir

    return package_dir
|
|
40
48
|
|
|
@@ -190,7 +198,6 @@ def load_gold_standard():
|
|
|
190
198
|
"PATHWAY": "gold_standard/PATHWAY.parquet"
|
|
191
199
|
}
|
|
192
200
|
|
|
193
|
-
|
|
194
201
|
if gold_standard_source in gold_standard_files:
|
|
195
202
|
# Load predefined gold standard from package resources
|
|
196
203
|
filename = gold_standard_files[gold_standard_source]
|
|
@@ -1,78 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Basic usage example of the pythonFLEX package.
|
|
3
|
-
Demonstrates initialization, data loading, analysis, and plotting.
|
|
4
|
-
"""
|
|
5
|
-
#%%
|
|
6
|
-
import pythonflex as flex
|
|
7
|
-
import pandas as pd
|
|
8
|
-
|
|
9
|
-
depmap = pd.read_csv('../../../../_datasets/depmap/25Q2/gene_effect.csv', index_col=0)
|
|
10
|
-
white = pd.read_csv('../../../../_datasets/depmap/25Q2/25Q2_chronos_whitened_PCA.csv', index_col=0).T
|
|
11
|
-
|
|
12
|
-
inputs = {
|
|
13
|
-
"25Q2": {
|
|
14
|
-
"path": depmap,
|
|
15
|
-
"sort": "high",
|
|
16
|
-
"color": "#fff000" # Black
|
|
17
|
-
},
|
|
18
|
-
|
|
19
|
-
"25Q2 white": {
|
|
20
|
-
"path": white,
|
|
21
|
-
"sort": "high",
|
|
22
|
-
"color": "#ff0000" # Orange
|
|
23
|
-
},
|
|
24
|
-
}
|
|
25
|
-
|
|
26
|
-
default_config = {
|
|
27
|
-
"min_genes_in_complex": 0,
|
|
28
|
-
"min_genes_per_complex_analysis": 3,
|
|
29
|
-
"output_folder": "CORUM_25Q2_comparison2",
|
|
30
|
-
"gold_standard": "CORUM",
|
|
31
|
-
"color_map": "BuGn",
|
|
32
|
-
"jaccard": False,
|
|
33
|
-
"use_common_genes": False, # Set to False for individual dataset-gold standard intersections
|
|
34
|
-
"plotting": {
|
|
35
|
-
"save_plot": True,
|
|
36
|
-
"output_type": "png",
|
|
37
|
-
},
|
|
38
|
-
"preprocessing": {
|
|
39
|
-
"fill_na": True,
|
|
40
|
-
"normalize": False,
|
|
41
|
-
},
|
|
42
|
-
"corr_function": "numpy",
|
|
43
|
-
"logging": {
|
|
44
|
-
"visible_levels": ["DONE"] # "PROGRESS", "STARTED", ,"INFO","WARNING"
|
|
45
|
-
}
|
|
46
|
-
}
|
|
47
|
-
|
|
48
|
-
# Initialize logger, config, and output folder
|
|
49
|
-
flex.initialize(default_config)
|
|
50
|
-
|
|
51
|
-
# Load datasets and gold standard terms
|
|
52
|
-
data, _ = flex.load_datasets(inputs)
|
|
53
|
-
terms, genes_in_terms = flex.load_gold_standard()
|
|
54
|
-
|
|
55
|
-
# Run analysis
|
|
56
|
-
for name, dataset in data.items():
|
|
57
|
-
pra = flex.pra(name, dataset, is_corr=False)
|
|
58
|
-
fpc = flex.pra_percomplex(name, dataset, is_corr=False)
|
|
59
|
-
flex.mpr_prepare(name) # Add this line
|
|
60
|
-
cc = flex.complex_contributions(name)
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
#%%
|
|
66
|
-
# Generate plots
|
|
67
|
-
flex.plot_precision_recall_curve()
|
|
68
|
-
flex.plot_auc_scores()
|
|
69
|
-
flex.plot_significant_complexes()
|
|
70
|
-
flex.plot_percomplex_scatter(n_top=20)
|
|
71
|
-
flex.plot_percomplex_scatter_bysize()
|
|
72
|
-
flex.plot_complex_contributions()
|
|
73
|
-
flex.plot_mpr_tp_multi()
|
|
74
|
-
flex.plot_mpr_complexes_multi()
|
|
75
|
-
# Save results to CSV
|
|
76
|
-
# flex.save_results_to_csv()
|
|
77
|
-
|
|
78
|
-
# %%
|
|
@@ -1,42 +0,0 @@
|
|
|
1
|
-
|
|
2
|
-
# %%
# Build disease-specific subsets of the DepMap 24Q4 CRISPR gene-effect table.
import pandas as pd

df = pd.read_csv(
    "../../../../datasets/depmap/24Q4/CRISPRGeneEffect.csv",
    index_col=0,
)
model = pd.read_csv("../../../../datasets/depmap/24Q4/Model.csv", index_col=0)

# Keep only the text before the first " (" in every column label, then
# transpose so the former columns become the row index.
df.columns = df.columns.str.split(" \\(").str[0]
df = df.T

# %%
# Collect the unique Model.csv index values (model IDs) for each group.
# Melanoma and neuroblastoma are selected by OncotreePrimaryDisease,
# liver by OncotreeLineage.
is_melanoma = model["OncotreePrimaryDisease"] == "Melanoma"
is_liver = model["OncotreeLineage"] == "Liver"
is_neuroblastoma = model["OncotreePrimaryDisease"] == "Neuroblastoma"

melanoma = model.loc[is_melanoma].index.unique().values
liver = model.loc[is_liver].index.unique().values
neuroblastoma = model.loc[is_neuroblastoma].index.unique().values

# %%
# Keep only the columns of df whose label is one of the selected model IDs.
mel_df = df.loc[:, df.columns.isin(melanoma)]
liver_df = df.loc[:, df.columns.isin(liver)]
neuro_df = df.loc[:, df.columns.isin(neuroblastoma)]

# %%
# Write each subset, and the full transposed table, to the working directory.
for frame, out_name in (
    (mel_df, "melanoma.csv"),
    (liver_df, "liver.csv"),
    (neuro_df, "neuroblastoma.csv"),
    (df, "depmap_geneeffect_all_cellines.csv"),
):
    frame.to_csv(out_name)

# %%
# Same label clean-up and transpose for the 19Q2 Achilles gene-effect table.
# This rebinds `df`, replacing the 24Q4 table loaded above.
# NOTE(review): this cell reads from `_datasets` while the first cell reads
# from `datasets` - confirm both directories are intended.
import pandas as pd

df = pd.read_csv(
    '../../../../_datasets/depmap/19Q2/Achilles_gene_effect.csv',
    index_col=0,
)
df.columns = df.columns.str.split(" \\(").str[0]
df = df.T

# %%
df.to_csv("../../../../_datasets/depmap/19Q2/gene_effect.csv")

# %%
|
|
@@ -1,106 +0,0 @@
|
|
|
1
|
-
#%%
|
|
2
|
-
# Run this in Jupyter to test the two approaches
|
|
3
|
-
|
|
4
|
-
import numpy as np
|
|
5
|
-
import pandas as pd
|
|
6
|
-
from pythonflex.utils import dload
|
|
7
|
-
|
|
8
|
-
# Name passed to dload for both the "pra" and the "mpr" lookups.
dataset_name = "[CORUM] 19Q2"

pra = dload("pra", dataset_name)
mpr = dload("mpr", dataset_name)

# IDs stored under the "no_mtRibo_ETCI" entry of the mpr filters.
filter_ids = set(mpr["filters"]["no_mtRibo_ETCI"])
print(f"Filter IDs: {filter_ids}")

# The complex-ID column is looked up under its singular name first and
# falls back to the plural name.
if "complex_id" in pra.columns:
    cid_col = "complex_id"
else:
    cid_col = "complex_ids"

# Order rows from highest to lowest score and renumber them from 0.
ranked = pra.sort_values("score", ascending=False)
pra_sorted = ranked.reset_index(drop=True)
|
|
20
|
-
|
|
21
|
-
def has_filter_id(cids, filter_ids):
    """Return True if any complex ID in ``cids`` is a member of ``filter_ids``.

    Args:
        cids: The complex ID(s) attached to one row. May be a list, tuple,
            set, frozenset or NumPy array of IDs (arrays of any
            dimensionality are flattened), or a single numeric ID.
            Null entries (None / NaN) are ignored. Any other type,
            including None, strings and booleans, is treated as
            "no IDs".
        filter_ids: Container of integer IDs supporting ``in``.

    Returns:
        bool: True when at least one non-null ID, converted with ``int``,
        is found in ``filter_ids``; otherwise False.

    Raises:
        ValueError, TypeError: If a non-null entry that is examined
            cannot be converted with ``int``.
    """
    if isinstance(cids, np.ndarray):
        # ravel() also copes with 0-d and multi-dimensional arrays.
        candidates = cids.ravel()
    elif isinstance(cids, (list, tuple, set, frozenset)):
        candidates = cids
    elif (
        isinstance(cids, (int, float, np.integer, np.floating))
        and not isinstance(cids, bool)
    ):
        # A row may carry one scalar ID instead of a collection of IDs.
        candidates = (cids,)
    else:
        return False
    return any(pd.notnull(x) and int(x) in filter_ids for x in candidates)
|
|
28
|
-
|
|
29
|
-
def _print_precision_at_tp(tp_cum, prec, targets=(10, 50, 100, 500, 1000)):
    """Print precision and rank at the first row reaching each TP target.

    Args:
        tp_cum: pandas Series of cumulative true-positive counts, in rank
            order.
        prec: pandas Series of precision values aligned with ``tp_cum``.
        targets: TP counts to report. A target larger than the maximum of
            ``tp_cum`` is skipped.
    """
    print("\nPrecision at key TP counts:")
    for target_tp in targets:
        if target_tp <= tp_cum.max():
            # Position of the first row whose cumulative TP count reaches
            # the target.
            idx = np.where(tp_cum >= target_tp)[0][0]
            print(f" TP={target_tp}: precision={prec.iloc[idx]:.3f} (at rank {idx+1})")


# Mark which pairs should be filtered
pra_sorted["should_filter"] = pra_sorted[cid_col].apply(lambda x: has_filter_id(x, filter_ids))

print(f"\nTotal pairs: {len(pra_sorted)}")
print(f"Pairs to filter: {pra_sorted['should_filter'].sum()}")
print(f"TPs to filter: {(pra_sorted['should_filter'] & (pra_sorted['prediction']==1)).sum()}")

# APPROACH 1: Mark as negative (described by the author as what the Python
# implementation does). All rows are kept; a filtered row whose prediction
# is 1 is counted as 0 instead.
print("\n" + "=" * 70)
print("APPROACH 1: Mark filtered TPs as negatives (keep rows)")
print("=" * 70)

df1 = pra_sorted.copy()
df1["true_filtered"] = df1["prediction"].copy()
df1.loc[df1["should_filter"] & (df1["prediction"]==1), "true_filtered"] = 0

# Precision at rank k = cumulative TPs within the top k rows / k.
tp_cum_1 = df1["true_filtered"].cumsum()
prec_1 = tp_cum_1 / (np.arange(len(df1)) + 1)

# Show precision at key TP counts
_print_precision_at_tp(tp_cum_1, prec_1)

# APPROACH 2: Remove rows entirely (described by the author as what R does
# with replace=FALSE).
# NOTE(review): the summary printed at the end says the R code uses
# Approach 3 - confirm which statement is correct.
print("\n" + "=" * 70)
print("APPROACH 2: Remove filtered rows entirely")
print("=" * 70)

df2 = pra_sorted[~pra_sorted["should_filter"]].copy().reset_index(drop=True)

tp_cum_2 = df2["prediction"].cumsum()
prec_2 = tp_cum_2 / (np.arange(len(df2)) + 1)

print(f"\nRows remaining after removal: {len(df2)}")
print(f"TPs remaining: {df2['prediction'].sum()}")

_print_precision_at_tp(tp_cum_2, prec_2)

# APPROACH 3: Only remove filtered POSITIVE pairs, keep negatives
print("\n" + "=" * 70)
print("APPROACH 3: Remove only filtered TPs (keep filtered negatives)")
print("=" * 70)

# Drop rows that are both flagged for filtering and have prediction == 1;
# flagged rows with any other prediction value stay in the ranking.
remove_mask = pra_sorted["should_filter"] & (pra_sorted["prediction"] == 1)
df3 = pra_sorted[~remove_mask].copy().reset_index(drop=True)

tp_cum_3 = df3["prediction"].cumsum()
prec_3 = tp_cum_3 / (np.arange(len(df3)) + 1)

print(f"\nRows remaining: {len(df3)}")
print(f"TPs remaining: {df3['prediction'].sum()}")

_print_precision_at_tp(tp_cum_3, prec_3)

print("\n" + "=" * 70)
print("COMPARISON")
print("=" * 70)
print("""
Approach 1 (mark as negative): Filtered TPs become FPs, lowering precision
Approach 2 (remove all filtered): Both TPs and negatives removed
Approach 3 (remove only TPs): Only filtered TPs removed, negatives kept

The R code uses Approach 3 (remove positive pairs that contain the filter ID).
""")
|
|
106
|
-
# %%
|
|
@@ -1,104 +0,0 @@
|
|
|
1
|
-
#%%
|
|
2
|
-
import pythonflex as flex
|
|
3
|
-
import os
|
|
4
|
-
|
|
5
|
-
# Folder containing the subset CSV files listed below.
DATA_DIR = "C:/Users/yd/Desktop/projects/_datasets/depmap/25Q2/subset/"

# Subset files to analyse; commented-out entries are currently disabled.
cell_line_files = [
    "soft_tissue_cell_lines.csv",
    "skin_cell_lines.csv",
    # "lung_cell_lines.csv",
    # "head_and_neck_cell_lines.csv",
    # "esophagus_stomach_cell_lines.csv",
]


def _dataset_key(filename):
    """Return the short dataset name for a CSV filename.

    The file extension is dropped first, then a trailing "_cell_lines" if
    present, so "skin_cell_lines.csv" becomes "skin". A name without that
    suffix still loses its extension ("lung.csv" becomes "lung").
    """
    stem = os.path.splitext(filename)[0]
    suffix = "_cell_lines"
    return stem[: -len(suffix)] if stem.endswith(suffix) else stem


# Map each short dataset name to its CSV path and sort setting.
inputs = {}
for filename in cell_line_files:
    inputs[_dataset_key(filename)] = {
        "path": os.path.join(DATA_DIR, filename),
        "sort": "high",
    }

# The full gene-effect table is added under its own key.
inputs['depmap'] = {
    "path": "C:/Users/yd/Desktop/projects/_datasets/depmap/25Q2/gene_effect.csv",
    "sort": "high",
}

# Print the resulting inputs dictionary
print("Configured inputs:")
for key, value in inputs.items():
    print(f" {key}: {value['path']}")
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
# Settings passed to flex.initialize(); nested sections are built first.
plotting_options = {
    "save_plot": True,
    "output_type": "pdf",
}
preprocessing_options = {
    "fill_na": True,
    "normalize": False,
}
logging_options = {
    # Other level names noted by the author: "PROGRESS", "INFO", "WARNING"
    "visible_levels": ["DONE", "STARTED"],
}

default_config = {
    "min_genes_in_complex": 2,
    "min_genes_per_complex_analysis": 2,
    "output_folder": "25q2_min_genes_2",
    "gold_standard": "CORUM",
    "color_map": "RdYlBu",
    "jaccard": True,
    "plotting": plotting_options,
    "preprocessing": preprocessing_options,
    "corr_function": "numpy",
    "logging": logging_options,
}

# Initialize logger, config, and output folder
flex.initialize(default_config)

# Load datasets and gold standard terms
data, _ = flex.load_datasets(inputs)
terms, genes_in_terms = flex.load_gold_standard()

#%%
# Run the analysis steps once per loaded dataset.
for name, dataset in data.items():
    pra = flex.pra(name, dataset, is_corr=False)
    fpc = flex.pra_percomplex(name, dataset, is_corr=False)
    cc = flex.complex_contributions(name)

#%%
# Generate plots
flex.plot_auc_scores()
flex.plot_precision_recall_curve()
flex.plot_percomplex_scatter()
flex.plot_percomplex_scatter_bysize()
flex.plot_significant_complexes()
flex.plot_complex_contributions()

#%%
# Save results to CSV
flex.save_results_to_csv()
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
#%%
|
|
103
|
-
|
|
104
|
-
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/data/dataset/liver_cell_lines_500_genes.csv
RENAMED
|
File without changes
|
{pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/data/dataset/melanoma_cell_lines_500_genes.csv
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|