pythonflex 0.3__tar.gz → 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. {pythonflex-0.3 → pythonflex-0.3.2}/PKG-INFO +7 -1
  2. {pythonflex-0.3 → pythonflex-0.3.2}/pyproject.toml +10 -2
  3. {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/__init__.py +2 -2
  4. {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/examples/basic_usage.py +24 -15
  5. {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/plotting.py +225 -22
  6. {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/preprocessing.py +28 -21
  7. pythonflex-0.3/src/pythonflex/examples/comparison.py +0 -78
  8. pythonflex-0.3/src/pythonflex/examples/dataset_filtering.py +0 -42
  9. pythonflex-0.3/src/pythonflex/examples/diag.py +0 -106
  10. pythonflex-0.3/src/pythonflex/examples/test.py +0 -104
  11. {pythonflex-0.3 → pythonflex-0.3.2}/.gitignore +0 -0
  12. {pythonflex-0.3 → pythonflex-0.3.2}/.python-version +0 -0
  13. {pythonflex-0.3 → pythonflex-0.3.2}/README.md +0 -0
  14. {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/analysis.py +0 -0
  15. {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/data/__init__.py +0 -0
  16. {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/data/dataset/__init__.py +0 -0
  17. {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/data/dataset/liver_cell_lines_500_genes.csv +0 -0
  18. {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/data/dataset/melanoma_cell_lines_500_genes.csv +0 -0
  19. {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/data/dataset/neuroblastoma_cell_lines_500_genes.csv +0 -0
  20. {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/data/gold_standard/CORUM.parquet +0 -0
  21. {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/data/gold_standard/GOBP.parquet +0 -0
  22. {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/data/gold_standard/PATHWAY.parquet +0 -0
  23. {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/data/gold_standard/__init__.py +0 -0
  24. {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/data/gold_standard/corum.csv +0 -0
  25. {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/data/gold_standard/gobp.csv +0 -0
  26. {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/data/gold_standard/pathway.csv +0 -0
  27. {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/logging_config.py +0 -0
  28. {pythonflex-0.3 → pythonflex-0.3.2}/src/pythonflex/utils.py +0 -0
  29. {pythonflex-0.3 → pythonflex-0.3.2}/todo.txt +0 -0
  30. {pythonflex-0.3 → pythonflex-0.3.2}/uv.lock +0 -0
@@ -1,8 +1,14 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pythonflex
3
- Version: 0.3
3
+ Version: 0.3.2
4
4
  Summary: pythonFLEX is a benchmarking toolkit for evaluating CRISPR screen results against biological gold standards. The toolkit computes gene-level and complex-level performance metrics, helping researchers systematically assess the biological relevance and resolution of their CRISPR screening data.
5
5
  Author-email: Yasir Demirtaş <tyasird@hotmail.com>
6
+ Classifier: License :: OSI Approved :: MIT License
7
+ Classifier: Operating System :: OS Independent
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.9
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Programming Language :: Python :: 3.11
6
12
  Requires-Python: >=3.9
7
13
  Requires-Dist: adjusttext
8
14
  Requires-Dist: art
@@ -1,13 +1,20 @@
1
1
  [project]
2
2
  name = "pythonflex"
3
- version = "0.3"
3
+ version = "0.3.2"
4
4
  description = "pythonFLEX is a benchmarking toolkit for evaluating CRISPR screen results against biological gold standards. The toolkit computes gene-level and complex-level performance metrics, helping researchers systematically assess the biological relevance and resolution of their CRISPR screening data."
5
5
  readme = "README.md"
6
6
  authors = [
7
7
  { name = "Yasir Demirtaş", email = "tyasird@hotmail.com" }
8
8
  ]
9
9
  requires-python = ">=3.9"
10
-
10
+ classifiers = [
11
+ "Programming Language :: Python :: 3",
12
+ "Programming Language :: Python :: 3.9",
13
+ "Programming Language :: Python :: 3.10",
14
+ "Programming Language :: Python :: 3.11",
15
+ "License :: OSI Approved :: MIT License",
16
+ "Operating System :: OS Independent",
17
+ ]
11
18
 
12
19
  # Exclude the input folder
13
20
  exclude = ["src/pythonflex/input/*", "src/pythonflex/output/*", "src/pythonflex/examples/output/*",
@@ -67,3 +74,4 @@ pythonflex = { workspace = true }
67
74
  dev = [
68
75
  "pythonflex",
69
76
  ]
77
+
@@ -3,7 +3,7 @@ from .utils import dsave, dload
3
3
  from .preprocessing import get_example_data_path, load_datasets, get_common_genes, filter_matrix_by_genes, load_gold_standard, filter_duplicate_terms
4
4
  from .analysis import initialize, pra, pra_percomplex, fast_corr, perform_corr, is_symmetric, binary, has_mirror_of_first_pair, convert_full_to_half_matrix, drop_mirror_pairs, quick_sort, complex_contributions, save_results_to_csv, update_matploblib_config, mpr_prepare
5
5
  from .plotting import (
6
- adjust_text_positions, plot_precision_recall_curve, plot_percomplex_scatter,
6
+ adjust_text_positions, plot_precision_recall_curve, plot_aggregated_pra, plot_iqr_pra, plot_all_runs_pra, plot_percomplex_scatter,
7
7
  plot_percomplex_scatter_bysize, plot_complex_contributions, plot_significant_complexes, plot_auc_scores,
8
8
  plot_mpr_tp, plot_mpr_complexes, plot_mpr_tp_multi, plot_mpr_complexes_multi
9
9
  )
@@ -13,7 +13,7 @@ __all__ = [ "log", "get_example_data_path", "fast_corr",
13
13
  "filter_matrix_by_genes", "load_gold_standard", "filter_duplicate_terms", "pra", "pra_percomplex",
14
14
  "perform_corr", "is_symmetric", "binary", "has_mirror_of_first_pair", "convert_full_to_half_matrix",
15
15
  "drop_mirror_pairs", "quick_sort", "complex_contributions", "adjust_text_positions", "plot_precision_recall_curve",
16
- "plot_percomplex_scatter", "plot_percomplex_scatter_bysize", "plot_complex_contributions",
16
+ "plot_aggregated_pra", "plot_iqr_pra", "plot_all_runs_pra", "plot_percomplex_scatter", "plot_percomplex_scatter_bysize", "plot_complex_contributions",
17
17
  "plot_significant_complexes", "plot_auc_scores", "save_results_to_csv", "update_matploblib_config",
18
18
  "mpr_prepare", "plot_mpr_tp", "plot_mpr_complexes",
19
19
  "plot_mpr_tp_multi", "plot_mpr_complexes_multi"
@@ -8,32 +8,34 @@ import pythonflex as flex
8
8
  inputs = {
9
9
  "Melanoma (63 Screens)": {
10
10
  "path": flex.get_example_data_path("melanoma_cell_lines_500_genes.csv"),
11
- "sort": "high"
11
+ "sort": "high",
12
+ "color": "#FF0000"
12
13
  },
13
14
  "Liver (24 Screens)": {
14
15
  "path": flex.get_example_data_path("liver_cell_lines_500_genes.csv"),
15
- "sort": "high"
16
+ "sort": "high",
17
+ "color": "#FFDD00"
16
18
  },
17
19
  "Neuroblastoma (37 Screens)": {
18
20
  "path": flex.get_example_data_path("neuroblastoma_cell_lines_500_genes.csv"),
19
- "sort": "high"
21
+ "sort": "high",
22
+ "color": "#FFDDDD"
20
23
  },
21
24
  }
22
25
 
23
26
 
24
27
 
25
- #%%
26
28
  default_config = {
27
29
  "min_genes_in_complex": 0,
28
30
  "min_genes_per_complex_analysis": 3,
29
- "output_folder": "output",
31
+ "output_folder": "CORUM",
30
32
  "gold_standard": "CORUM",
31
- "color_map": "RdYlBu",
32
- "jaccard": True,
33
+ "color_map": "BuGn",
34
+ "jaccard": False,
33
35
  "use_common_genes": False, # Set to False for individual dataset-gold standard intersections
34
36
  "plotting": {
35
37
  "save_plot": True,
36
- "output_type": "pdf",
38
+ "output_type": "png",
37
39
  },
38
40
  "preprocessing": {
39
41
  "fill_na": True,
@@ -41,7 +43,8 @@ default_config = {
41
43
  },
42
44
  "corr_function": "numpy",
43
45
  "logging": {
44
- "visible_levels": ["DONE","STARTED"] # "PROGRESS", "STARTED", ,"INFO","WARNING"
46
+ "visible_levels": ["DONE"]
47
+ # "PROGRESS", "STARTED", ,"INFO","WARNING"
45
48
  }
46
49
  }
47
50
 
@@ -52,26 +55,32 @@ flex.initialize(default_config)
52
55
  data, _ = flex.load_datasets(inputs)
53
56
  terms, genes_in_terms = flex.load_gold_standard()
54
57
 
55
-
56
- #%%
57
58
  # Run analysis
58
59
  for name, dataset in data.items():
59
60
  pra = flex.pra(name, dataset, is_corr=False)
60
61
  fpc = flex.pra_percomplex(name, dataset, is_corr=False)
61
62
  cc = flex.complex_contributions(name)
62
-
63
+ flex.mpr_prepare(name)
64
+
65
+
63
66
 
64
67
 
65
68
  #%%
66
69
  # Generate plots
67
- flex.plot_auc_scores()
68
70
  flex.plot_precision_recall_curve()
71
+ flex.plot_auc_scores()
72
+ flex.plot_significant_complexes()
69
73
  flex.plot_percomplex_scatter(n_top=20)
70
74
  flex.plot_percomplex_scatter_bysize()
71
- flex.plot_significant_complexes()
72
75
  flex.plot_complex_contributions()
73
-
76
+ ##
77
+ flex.plot_mpr_tp_multi()
78
+ flex.plot_mpr_complexes_multi()
74
79
 
75
80
  #%%
76
81
  # Save results to CSV
77
82
  flex.save_results_to_csv()
83
+
84
+ # %%
85
+ flex.plot_mpr_complexes_multi(show_filters="no_mtRibo_ETCI")
86
+ # %%
@@ -58,7 +58,12 @@ def plot_precision_recall_curve(line_width=2.0, hide_minor_ticks=True):
58
58
  log.warning(f"Color map '{cmap_name}' not found. Falling back to 'tab10'.")
59
59
  cmap = get_cmap("tab10")
60
60
 
61
- fig, ax = plt.subplots()
61
+ # Increase figure width to accommodate external legend without squashing axes
62
+ fig, ax = plt.subplots(figsize=(6, 4))
63
+
64
+ # Adjust layout to make room for legend on the right
65
+ plt.subplots_adjust(right=0.7)
66
+
62
67
  ax.set_xscale("log")
63
68
 
64
69
  # optionally hide minor ticks on the log axis
@@ -92,7 +97,7 @@ def plot_precision_recall_curve(line_width=2.0, hide_minor_ticks=True):
92
97
  ax.set(title="",
93
98
  xlabel="Number of True Positives (TP)",
94
99
  ylabel="Precision")
95
- ax.legend(loc="upper right", frameon=False)
100
+ ax.legend(loc="upper left", bbox_to_anchor=(1.05, 1), frameon=False)
96
101
  ax.set_ylim(0, 1)
97
102
 
98
103
  # Nature style: no grid, open top/right spines
@@ -109,6 +114,171 @@ def plot_precision_recall_curve(line_width=2.0, hide_minor_ticks=True):
109
114
  plt.show()
110
115
  plt.close(fig)
111
116
 
117
+ def plot_aggregated_pra(agg_df, line_width=2.0, hide_minor_ticks=True):
118
+ """
119
+ Plots an aggregated Precision-Recall curve with mean line and min-max shading.
120
+ agg_df should be indexed by 'tp' and contain 'mean', 'min', 'max' columns for precision.
121
+ """
122
+ config = dload("config")
123
+ plot_config = config["plotting"]
124
+
125
+ # Increase figure width to accommodate external legend without squashing axes
126
+ fig, ax = plt.subplots(figsize=(6, 4))
127
+
128
+ # Adjust layout to make room for legend on the right
129
+ plt.subplots_adjust(right=0.7)
130
+
131
+ ax.set_xscale("log")
132
+
133
+ # optionally hide minor ticks on the log axis
134
+ if hide_minor_ticks:
135
+ ax.xaxis.set_minor_locator(NullLocator())
136
+ ax.xaxis.set_minor_formatter(NullFormatter())
137
+
138
+ # Filter out very low TP counts if necessary, similar to plot_precision_recall_curve
139
+ agg_df = agg_df[agg_df.index > 10]
140
+
141
+ tp = agg_df.index
142
+ mean_prec = agg_df['mean']
143
+ min_prec = agg_df['min']
144
+ max_prec = agg_df['max']
145
+
146
+ # Plot shading
147
+ ax.fill_between(tp, min_prec, max_prec, color='gray', alpha=0.3, label='Range (Min-Max)')
148
+
149
+ # Plot mean line
150
+ ax.plot(tp, mean_prec, c="black", label="Mean Precision", linewidth=line_width, alpha=0.9)
151
+
152
+ ax.set(title="",
153
+ xlabel="Number of True Positives (TP)",
154
+ ylabel="Precision")
155
+ ax.legend(loc="upper left", bbox_to_anchor=(1.05, 1), frameon=False)
156
+ ax.set_ylim(0, 1)
157
+
158
+ # Nature style: no grid, open top/right spines
159
+ ax.grid(False)
160
+ ax.spines['top'].set_visible(False)
161
+ ax.spines['right'].set_visible(False)
162
+
163
+ if plot_config["save_plot"]:
164
+ output_type = plot_config["output_type"]
165
+ output_path = Path(config["output_folder"]) / f"aggregated_precision_recall_curve.{output_type}"
166
+ fig.savefig(output_path, bbox_inches="tight", format=output_type)
167
+
168
+ if plot_config.get("show_plot", True):
169
+ plt.show()
170
+ plt.close(fig)
171
+
172
+ def plot_iqr_pra(agg_df, line_width=2.0, hide_minor_ticks=True):
173
+ """
174
+ Plots an aggregated Precision-Recall curve with mean line and IQR (25-75%) shading.
175
+ agg_df should be indexed by 'tp' and contain 'mean', '25%', '75%' columns for precision.
176
+ """
177
+ config = dload("config")
178
+ plot_config = config["plotting"]
179
+
180
+ # Increase figure width to accommodate external legend without squashing axes
181
+ fig, ax = plt.subplots(figsize=(6, 4))
182
+
183
+ # Adjust layout to make room for legend on the right
184
+ plt.subplots_adjust(right=0.7)
185
+
186
+ ax.set_xscale("log")
187
+
188
+ # optionally hide minor ticks on the log axis
189
+ if hide_minor_ticks:
190
+ ax.xaxis.set_minor_locator(NullLocator())
191
+ ax.xaxis.set_minor_formatter(NullFormatter())
192
+
193
+ # Filter out very low TP counts
194
+ agg_df = agg_df[agg_df.index > 10]
195
+
196
+ tp = agg_df.index
197
+ mean_prec = agg_df['mean']
198
+ q25_prec = agg_df['25%']
199
+ q75_prec = agg_df['75%']
200
+
201
+ # Plot shading
202
+ ax.fill_between(tp, q25_prec, q75_prec, color='gray', alpha=0.3, label='IQR (25-75%)')
203
+
204
+ # Plot mean line
205
+ ax.plot(tp, mean_prec, c="black", label="Mean Precision", linewidth=line_width, alpha=0.9)
206
+
207
+ ax.set(title="Precision-Recall (IQR)",
208
+ xlabel="Number of True Positives (TP)",
209
+ ylabel="Precision")
210
+ ax.legend(loc="upper left", bbox_to_anchor=(1.05, 1), frameon=False)
211
+ ax.set_ylim(0, 1)
212
+
213
+ # Nature style
214
+ ax.grid(False)
215
+ ax.spines['top'].set_visible(False)
216
+ ax.spines['right'].set_visible(False)
217
+
218
+ if plot_config["save_plot"]:
219
+ output_type = plot_config["output_type"]
220
+ output_path = Path(config["output_folder"]) / f"aggregated_iqr_precision_recall_curve.{output_type}"
221
+ fig.savefig(output_path, bbox_inches="tight", format=output_type)
222
+
223
+ if plot_config.get("show_plot", True):
224
+ plt.show()
225
+ plt.close(fig)
226
+
227
+ def plot_all_runs_pra(pra_list, mean_df=None, line_width=2.0, hide_minor_ticks=True):
228
+ """
229
+ Plots all individual Precision-Recall curves faintly, with an optional mean line.
230
+ pra_list: list of dataframes (each with 'tp' and 'precision' columns) OR list of Series (if index is tp)
231
+ mean_df: optional dataframe with 'mean' column indexed by tp
232
+ """
233
+ config = dload("config")
234
+ plot_config = config["plotting"]
235
+
236
+ fig, ax = plt.subplots(figsize=(6, 4))
237
+ plt.subplots_adjust(right=0.7)
238
+
239
+ ax.set_xscale("log")
240
+
241
+ if hide_minor_ticks:
242
+ ax.xaxis.set_minor_locator(NullLocator())
243
+ ax.xaxis.set_minor_formatter(NullFormatter())
244
+
245
+ # Plot individual lines
246
+ for i, df in enumerate(pra_list):
247
+ # Ensure we filter low TPs same as others
248
+ df_filtered = df[df['tp'] > 10] if 'tp' in df.columns else df[df.index > 10]
249
+
250
+ x = df_filtered['tp'] if 'tp' in df_filtered.columns else df_filtered.index
251
+ y = df_filtered['precision'] if 'precision' in df_filtered.columns else df_filtered.values
252
+
253
+ # Only add label for the first line to avoid cluttering legend
254
+ lbl = "Individual Runs" if i == 0 else None
255
+ ax.plot(x, y, c="gray", linewidth=0.5, alpha=0.3, label=lbl)
256
+
257
+ # Plot mean line if provided
258
+ if mean_df is not None:
259
+ mean_df = mean_df[mean_df.index > 10]
260
+ ax.plot(mean_df.index, mean_df['mean'], c="black", label="Mean Precision", linewidth=line_width, alpha=0.9)
261
+
262
+ ax.set(title="Precision-Recall (All Runs)",
263
+ xlabel="Number of True Positives (TP)",
264
+ ylabel="Precision")
265
+ ax.legend(loc="upper left", bbox_to_anchor=(1.05, 1), frameon=False)
266
+ ax.set_ylim(0, 1)
267
+
268
+ # Nature style
269
+ ax.grid(False)
270
+ ax.spines['top'].set_visible(False)
271
+ ax.spines['right'].set_visible(False)
272
+
273
+ if plot_config["save_plot"]:
274
+ output_type = plot_config["output_type"]
275
+ output_path = Path(config["output_folder"]) / f"aggregated_all_runs_precision_recall_curve.{output_type}"
276
+ fig.savefig(output_path, bbox_inches="tight", format=output_type)
277
+
278
+ if plot_config.get("show_plot", True):
279
+ plt.show()
280
+ plt.close(fig)
281
+
112
282
  def plot_percomplex_scatter(n_top=10, sig_color='#B71A2A', nonsig_color='#DBDDDD', label_color='black', border_color='black', border_width=1.0, show_text_background=True):
113
283
  config = dload("config")
114
284
  plot_config = config["plotting"]
@@ -1050,14 +1220,10 @@ def plot_auc_scores():
1050
1220
  plt.close(fig)
1051
1221
  return pra_dict
1052
1222
 
1053
-
1054
-
1055
-
1056
1223
  # -----------------------------------------------------------------------------
1057
1224
  # mPR plots (Fig. 1E and Fig. 1F)
1058
1225
  # -----------------------------------------------------------------------------
1059
1226
 
1060
-
1061
1227
  def plot_mpr_complexes(name, ax=None, save=True, outname=None):
1062
1228
  """
1063
1229
  Fig. 1F-style module-level PR:
@@ -1208,7 +1374,6 @@ def plot_mpr_tp(name, ax=None, save=True, outname=None):
1208
1374
 
1209
1375
  return ax
1210
1376
 
1211
-
1212
1377
  """
1213
1378
  Multi-dataset mPR plotting functions.
1214
1379
 
@@ -1229,7 +1394,6 @@ from pathlib import Path
1229
1394
  from .utils import dload
1230
1395
  from .logging_config import log
1231
1396
 
1232
-
1233
1397
  # Default color palette (colorblind-friendly)
1234
1398
  DEFAULT_COLORS = [
1235
1399
  "#4E79A7", # blue
@@ -1252,6 +1416,21 @@ FILTER_STYLES = {
1252
1416
  }
1253
1417
 
1254
1418
 
1419
+ def _normalize_show_filters(show_filters):
1420
+ """Normalize show_filters to an ordered tuple of filter keys.
1421
+
1422
+ Common footgun: passing a single string (e.g. "no_mtRibo_ETCI") is iterable,
1423
+ which would otherwise be treated as a sequence of characters.
1424
+ """
1425
+ if show_filters is None:
1426
+ return tuple(FILTER_STYLES.keys())
1427
+ if isinstance(show_filters, str):
1428
+ return (show_filters,)
1429
+ try:
1430
+ return tuple(show_filters)
1431
+ except TypeError:
1432
+ return (show_filters,)
1433
+
1255
1434
  def plot_mpr_tp_multi(
1256
1435
  dataset_names=None,
1257
1436
  colors=None,
@@ -1292,6 +1471,8 @@ def plot_mpr_tp_multi(
1292
1471
  config = dload("config")
1293
1472
  plot_config = config["plotting"]
1294
1473
  input_colors = dload("input", "colors")
1474
+
1475
+ show_filters = _normalize_show_filters(show_filters)
1295
1476
 
1296
1477
  # Sanitize color keys
1297
1478
  if input_colors:
@@ -1335,7 +1516,10 @@ def plot_mpr_tp_multi(
1335
1516
  colors = final_colors
1336
1517
 
1337
1518
  if ax is None:
1338
- fig, ax = plt.subplots(figsize=(5, 4))
1519
+ # Increase width slightly
1520
+ fig, ax = plt.subplots(figsize=(6, 4))
1521
+ # Reserve space for legend on right
1522
+ plt.subplots_adjust(right=0.7)
1339
1523
  else:
1340
1524
  fig = ax.figure
1341
1525
 
@@ -1413,14 +1597,21 @@ def plot_mpr_tp_multi(
1413
1597
 
1414
1598
  # Save
1415
1599
  if save:
1600
+ output_type = plot_config.get("output_type", "pdf")
1416
1601
  if outname is None:
1417
- outname = "mpr_tp_multi.pdf"
1602
+ outname = f"mpr_tp_multi.{output_type}"
1603
+
1604
+ # Check if outname is just a filename or a full path
1605
+ outpath = Path(outname)
1606
+ if len(outpath.parts) == 1:
1607
+ # Just a filename, prepend configured output folder
1608
+ outpath = Path(config["output_folder"]) / outname
1609
+
1418
1610
  fig.tight_layout()
1419
- fig.savefig(outname, bbox_inches="tight")
1611
+ fig.savefig(outpath, bbox_inches="tight", format=output_type)
1420
1612
 
1421
1613
  return ax
1422
1614
 
1423
-
1424
1615
  def plot_mpr_complexes_multi(
1425
1616
  dataset_names=None,
1426
1617
  colors=None,
@@ -1461,6 +1652,8 @@ def plot_mpr_complexes_multi(
1461
1652
  config = dload("config")
1462
1653
  plot_config = config["plotting"]
1463
1654
  input_colors = dload("input", "colors")
1655
+
1656
+ show_filters = _normalize_show_filters(show_filters)
1464
1657
 
1465
1658
  # Sanitize color keys
1466
1659
  if input_colors:
@@ -1504,7 +1697,10 @@ def plot_mpr_complexes_multi(
1504
1697
  colors = final_colors
1505
1698
 
1506
1699
  if ax is None:
1507
- fig, ax = plt.subplots(figsize=(5, 4))
1700
+ # Increase width slightly
1701
+ fig, ax = plt.subplots(figsize=(6, 4))
1702
+ # Reserve space for legend on right
1703
+ plt.subplots_adjust(right=0.7)
1508
1704
  else:
1509
1705
  fig = ax.figure
1510
1706
 
@@ -1564,18 +1760,26 @@ def plot_mpr_complexes_multi(
1564
1760
 
1565
1761
  # Save
1566
1762
  if save:
1763
+ output_type = plot_config.get("output_type", "pdf")
1567
1764
  if outname is None:
1568
- outname = "mpr_complexes_multi.pdf"
1765
+ outname = f"mpr_complexes_multi.{output_type}"
1766
+
1767
+ # Check if outname is just a filename or a full path
1768
+ outpath = Path(outname)
1769
+ if len(outpath.parts) == 1:
1770
+ # Just a filename, prepend configured output folder
1771
+ outpath = Path(config["output_folder"]) / outname
1772
+
1569
1773
  fig.tight_layout()
1570
- fig.savefig(outname, bbox_inches="tight")
1774
+ fig.savefig(outpath, bbox_inches="tight", format=output_type)
1571
1775
 
1572
1776
  return ax
1573
1777
 
1574
-
1575
1778
  def _add_vertical_legend(ax, dataset_names, colors, show_filters, linewidth):
1576
1779
  """
1577
1780
  Add vertically stacked legends: Dataset on top, Filter below.
1578
1781
  """
1782
+ show_filters = _normalize_show_filters(show_filters)
1579
1783
  # Legend 1: Datasets (colors) - solid lines
1580
1784
  dataset_handles = []
1581
1785
  for i, name in enumerate(dataset_names):
@@ -1602,12 +1806,12 @@ def _add_vertical_legend(ax, dataset_names, colors, show_filters, linewidth):
1602
1806
  legend1 = ax.legend(
1603
1807
  dataset_handles,
1604
1808
  dataset_names,
1605
- loc="upper right",
1809
+ loc="upper left",
1606
1810
  frameon=False,
1607
1811
  title="Dataset",
1608
1812
  fontsize=7,
1609
1813
  title_fontsize=8,
1610
- bbox_to_anchor=(1.0, 1.0)
1814
+ bbox_to_anchor=(1.05, 1.0)
1611
1815
  )
1612
1816
  ax.add_artist(legend1)
1613
1817
 
@@ -1615,17 +1819,17 @@ def _add_vertical_legend(ax, dataset_names, colors, show_filters, linewidth):
1615
1819
  legend2 = ax.legend(
1616
1820
  filter_handles,
1617
1821
  filter_labels,
1618
- loc="upper right",
1822
+ loc="upper left",
1619
1823
  frameon=False,
1620
1824
  fontsize=7,
1621
- bbox_to_anchor=(1.0, 1.0 - len(dataset_names) * 0.04 - 0.08)
1825
+ bbox_to_anchor=(1.05, 1.0 - len(dataset_names) * 0.06 - 0.1)
1622
1826
  )
1623
1827
 
1624
-
1625
1828
  def _add_dual_legend(ax, dataset_names, colors, show_filters, linewidth):
1626
1829
  """
1627
1830
  Add two legends: one for datasets (colors), one for filters (line styles).
1628
1831
  """
1832
+ show_filters = _normalize_show_filters(show_filters)
1629
1833
  # Legend 1: Datasets (colors) - solid lines
1630
1834
  dataset_handles = []
1631
1835
  for i, name in enumerate(dataset_names):
@@ -1671,7 +1875,6 @@ def _add_dual_legend(ax, dataset_names, colors, show_filters, linewidth):
1671
1875
  title_fontsize=8,
1672
1876
  )
1673
1877
 
1674
-
1675
1878
  # ============================================================================
1676
1879
  # Single dataset functions are now obsolete
1677
1880
  # ============================================================================
@@ -13,28 +13,36 @@ from pathlib import Path
13
13
 
14
14
 
15
15
  def return_package_dir():
16
-
17
- # Get the distribution
18
- dist = distribution('pythonflex')
19
-
20
- # Check for direct_url.json
21
- direct_url_text = dist.read_text('direct_url.json')
22
-
23
- if direct_url_text:
24
- direct_url = json.loads(direct_url_text)
25
- if direct_url.get('dir_info', {}).get('editable'):
26
- # Editable install detected
27
- project_url = direct_url['url']
28
- # Remove 'file:///' prefix and handle Windows paths
29
- project_root = project_url.removeprefix('file:///').replace('/', os.sep)
30
- # Assuming src layout: project_root/src/pythonflex
31
- package_dir = os.path.join(project_root, 'src', 'pythonflex')
16
+ try:
17
+ # Get the distribution
18
+ dist = distribution('pythonflex')
19
+
20
+ # Check for direct_url.json
21
+ try:
22
+ direct_url_text = dist.read_text('direct_url.json')
23
+ except FileNotFoundError:
24
+ direct_url_text = None
25
+
26
+ if direct_url_text:
27
+ direct_url = json.loads(direct_url_text)
28
+ if direct_url.get('dir_info', {}).get('editable'):
29
+ # Editable install detected
30
+ project_url = direct_url['url']
31
+ # Remove 'file:///' prefix and handle Windows paths
32
+ project_root = project_url.removeprefix('file:///').replace('/', os.sep)
33
+ # Assuming src layout: project_root/src/pythonflex
34
+ package_dir = os.path.join(project_root, 'src', 'pythonflex')
35
+ else:
36
+ # Non-editable
37
+ package_dir = str(files('pythonflex'))
32
38
  else:
33
- # Non-editable
39
+ # No direct_url, assume non-editable
34
40
  package_dir = str(files('pythonflex'))
35
- else:
36
- # No direct_url, assume non-editable
37
- package_dir = str(files('pythonflex'))
41
+
42
+ except Exception: # PackageNotFoundError or other issues
43
+ # Fallback to local directory relative to this file
44
+ # precise location: src/pythonflex/preprocessing.py -> package dir is parent
45
+ package_dir = str(Path(__file__).parent)
38
46
 
39
47
  return package_dir
40
48
 
@@ -190,7 +198,6 @@ def load_gold_standard():
190
198
  "PATHWAY": "gold_standard/PATHWAY.parquet"
191
199
  }
192
200
 
193
-
194
201
  if gold_standard_source in gold_standard_files:
195
202
  # Load predefined gold standard from package resources
196
203
  filename = gold_standard_files[gold_standard_source]
@@ -1,78 +0,0 @@
1
- """
2
- Basic usage example of the pythonFLEX package.
3
- Demonstrates initialization, data loading, analysis, and plotting.
4
- """
5
- #%%
6
- import pythonflex as flex
7
- import pandas as pd
8
-
9
- depmap = pd.read_csv('../../../../_datasets/depmap/25Q2/gene_effect.csv', index_col=0)
10
- white = pd.read_csv('../../../../_datasets/depmap/25Q2/25Q2_chronos_whitened_PCA.csv', index_col=0).T
11
-
12
- inputs = {
13
- "25Q2": {
14
- "path": depmap,
15
- "sort": "high",
16
- "color": "#fff000" # Black
17
- },
18
-
19
- "25Q2 white": {
20
- "path": white,
21
- "sort": "high",
22
- "color": "#ff0000" # Orange
23
- },
24
- }
25
-
26
- default_config = {
27
- "min_genes_in_complex": 0,
28
- "min_genes_per_complex_analysis": 3,
29
- "output_folder": "CORUM_25Q2_comparison2",
30
- "gold_standard": "CORUM",
31
- "color_map": "BuGn",
32
- "jaccard": False,
33
- "use_common_genes": False, # Set to False for individual dataset-gold standard intersections
34
- "plotting": {
35
- "save_plot": True,
36
- "output_type": "png",
37
- },
38
- "preprocessing": {
39
- "fill_na": True,
40
- "normalize": False,
41
- },
42
- "corr_function": "numpy",
43
- "logging": {
44
- "visible_levels": ["DONE"] # "PROGRESS", "STARTED", ,"INFO","WARNING"
45
- }
46
- }
47
-
48
- # Initialize logger, config, and output folder
49
- flex.initialize(default_config)
50
-
51
- # Load datasets and gold standard terms
52
- data, _ = flex.load_datasets(inputs)
53
- terms, genes_in_terms = flex.load_gold_standard()
54
-
55
- # Run analysis
56
- for name, dataset in data.items():
57
- pra = flex.pra(name, dataset, is_corr=False)
58
- fpc = flex.pra_percomplex(name, dataset, is_corr=False)
59
- flex.mpr_prepare(name) # Add this line
60
- cc = flex.complex_contributions(name)
61
-
62
-
63
-
64
-
65
- #%%
66
- # Generate plots
67
- flex.plot_precision_recall_curve()
68
- flex.plot_auc_scores()
69
- flex.plot_significant_complexes()
70
- flex.plot_percomplex_scatter(n_top=20)
71
- flex.plot_percomplex_scatter_bysize()
72
- flex.plot_complex_contributions()
73
- flex.plot_mpr_tp_multi()
74
- flex.plot_mpr_complexes_multi()
75
- # Save results to CSV
76
- # flex.save_results_to_csv()
77
-
78
- # %%
@@ -1,42 +0,0 @@
1
-
2
- # %%
3
- import pandas as pd
4
-
5
- df = pd.read_csv("../../../../datasets/depmap/24Q4/CRISPRGeneEffect.csv",index_col=0)
6
- model = pd.read_csv("../../../../datasets/depmap/24Q4/Model.csv",index_col=0)
7
-
8
- df.columns = df.columns.str.split(" \\(").str[0]
9
- df = df.T
10
-
11
- #%%
12
-
13
- # %%
14
- # get ModelID of selected disease for example OncotreePrimaryDisease==Melanoma
15
- melanoma = model[model.OncotreePrimaryDisease=="Melanoma"].index.unique().values
16
- liver = model[model.OncotreeLineage=="Liver"].index.unique().values
17
- neuroblastoma = model[model.OncotreePrimaryDisease=="Neuroblastoma"].index.unique().values
18
-
19
- # %%
20
- # mel.index is model ids, filter that ids in the columns of df
21
- mel_df = df.loc[:,df.columns.isin(melanoma)]
22
- liver_df = df.loc[:,df.columns.isin(liver)]
23
- neuro_df = df.loc[:,df.columns.isin(neuroblastoma)]
24
-
25
-
26
- # %%
27
- mel_df.to_csv("melanoma.csv")
28
- liver_df.to_csv("liver.csv")
29
- neuro_df.to_csv("neuroblastoma.csv")
30
- df.to_csv("depmap_geneeffect_all_cellines.csv")
31
-
32
-
33
- # %%
34
- import pandas as pd
35
- df = pd.read_csv('../../../../_datasets/depmap/19Q2/Achilles_gene_effect.csv', index_col=0)
36
- df.columns = df.columns.str.split(" \\(").str[0]
37
- df = df.T
38
-
39
- # %%
40
- df.to_csv("../../../../_datasets/depmap/19Q2/gene_effect.csv")
41
-
42
- # %%
@@ -1,106 +0,0 @@
1
- #%%
2
- # Run this in Jupyter to test the two approaches
3
-
4
- import numpy as np
5
- import pandas as pd
6
- from pythonflex.utils import dload
7
-
8
- dataset_name = "[CORUM] 19Q2"
9
-
10
- pra = dload("pra", dataset_name)
11
- mpr = dload("mpr", dataset_name)
12
-
13
- filter_ids = set(mpr["filters"]["no_mtRibo_ETCI"])
14
- print(f"Filter IDs: {filter_ids}")
15
-
16
- cid_col = "complex_id" if "complex_id" in pra.columns else "complex_ids"
17
-
18
- # Sort by score descending
19
- pra_sorted = pra.sort_values("score", ascending=False).reset_index(drop=True)
20
-
21
- def has_filter_id(cids, filter_ids):
22
- """Check if any complex ID is in filter_ids"""
23
- if isinstance(cids, (np.ndarray, list)):
24
- ids = [int(x) for x in cids if pd.notnull(x)]
25
- else:
26
- return False
27
- return any(c in filter_ids for c in ids)
28
-
29
- # Mark which pairs should be filtered
30
- pra_sorted["should_filter"] = pra_sorted[cid_col].apply(lambda x: has_filter_id(x, filter_ids))
31
-
32
- print(f"\nTotal pairs: {len(pra_sorted)}")
33
- print(f"Pairs to filter: {pra_sorted['should_filter'].sum()}")
34
- print(f"TPs to filter: {(pra_sorted['should_filter'] & (pra_sorted['prediction']==1)).sum()}")
35
-
36
- # APPROACH 1: Mark as negative (what your Python does)
37
- # Keep all rows, but filtered TPs become FPs
38
- print("\n" + "=" * 70)
39
- print("APPROACH 1: Mark filtered TPs as negatives (keep rows)")
40
- print("=" * 70)
41
-
42
- df1 = pra_sorted.copy()
43
- df1["true_filtered"] = df1["prediction"].copy()
44
- df1.loc[df1["should_filter"] & (df1["prediction"]==1), "true_filtered"] = 0
45
-
46
- tp_cum_1 = df1["true_filtered"].cumsum()
47
- prec_1 = tp_cum_1 / (np.arange(len(df1)) + 1)
48
-
49
- # Show precision at key TP counts
50
- print("\nPrecision at key TP counts:")
51
- for target_tp in [10, 50, 100, 500, 1000]:
52
- if target_tp <= tp_cum_1.max():
53
- idx = np.where(tp_cum_1 >= target_tp)[0][0]
54
- print(f" TP={target_tp}: precision={prec_1.iloc[idx]:.3f} (at rank {idx+1})")
55
-
56
- # APPROACH 2: Remove rows entirely (what R does with replace=FALSE)
57
- print("\n" + "=" * 70)
58
- print("APPROACH 2: Remove filtered rows entirely")
59
- print("=" * 70)
60
-
61
- df2 = pra_sorted[~pra_sorted["should_filter"]].copy().reset_index(drop=True)
62
-
63
- tp_cum_2 = df2["prediction"].cumsum()
64
- prec_2 = tp_cum_2 / (np.arange(len(df2)) + 1)
65
-
66
- print(f"\nRows remaining after removal: {len(df2)}")
67
- print(f"TPs remaining: {df2['prediction'].sum()}")
68
-
69
- print("\nPrecision at key TP counts:")
70
- for target_tp in [10, 50, 100, 500, 1000]:
71
- if target_tp <= tp_cum_2.max():
72
- idx = np.where(tp_cum_2 >= target_tp)[0][0]
73
- print(f" TP={target_tp}: precision={prec_2.iloc[idx]:.3f} (at rank {idx+1})")
74
-
75
- # APPROACH 3: Only remove filtered POSITIVE pairs, keep negatives
76
- print("\n" + "=" * 70)
77
- print("APPROACH 3: Remove only filtered TPs (keep filtered negatives)")
78
- print("=" * 70)
79
-
80
- # This removes TP rows that contain filter IDs, but keeps negative rows
81
- remove_mask = pra_sorted["should_filter"] & (pra_sorted["prediction"] == 1)
82
- df3 = pra_sorted[~remove_mask].copy().reset_index(drop=True)
83
-
84
- tp_cum_3 = df3["prediction"].cumsum()
85
- prec_3 = tp_cum_3 / (np.arange(len(df3)) + 1)
86
-
87
- print(f"\nRows remaining: {len(df3)}")
88
- print(f"TPs remaining: {df3['prediction'].sum()}")
89
-
90
- print("\nPrecision at key TP counts:")
91
- for target_tp in [10, 50, 100, 500, 1000]:
92
- if target_tp <= tp_cum_3.max():
93
- idx = np.where(tp_cum_3 >= target_tp)[0][0]
94
- print(f" TP={target_tp}: precision={prec_3.iloc[idx]:.3f} (at rank {idx+1})")
95
-
96
- print("\n" + "=" * 70)
97
- print("COMPARISON")
98
- print("=" * 70)
99
- print("""
100
- Approach 1 (mark as negative): Filtered TPs become FPs, lowering precision
101
- Approach 2 (remove all filtered): Both TPs and negatives removed
102
- Approach 3 (remove only TPs): Only filtered TPs removed, negatives kept
103
-
104
- The R code uses Approach 3 (remove positive pairs that contain the filter ID).
105
- """)
106
- # %%
@@ -1,104 +0,0 @@
1
- #%%
2
- import pythonflex as flex
3
- import os
4
-
5
- # # Define specific cell line types you're interested in
6
- DATA_DIR = "C:/Users/yd/Desktop/projects/_datasets/depmap/25Q2/subset/"
7
-
8
- # Specific cell lines of interest with "_cell_lines" suffix removed
9
- cell_line_files = [
10
- "soft_tissue_cell_lines.csv",
11
- "skin_cell_lines.csv",
12
- # "lung_cell_lines.csv",
13
- # "head_and_neck_cell_lines.csv",
14
- # "esophagus_stomach_cell_lines.csv",
15
- ]
16
-
17
- inputs = {}
18
-
19
- # Create inputs dict with shortened names (removing "_cell_lines" suffix)
20
- for filename in cell_line_files:
21
- # Remove .csv extension and _cell_lines suffix
22
- key = filename.replace("_cell_lines.csv", "")
23
- full_path = os.path.join(DATA_DIR, filename)
24
-
25
- inputs[key] = {
26
- "path": full_path,
27
- "sort": "high"
28
- }
29
-
30
- inputs['depmap'] = {
31
- "path": "C:/Users/yd/Desktop/projects/_datasets/depmap/25Q2/gene_effect.csv",
32
- "sort": "high"
33
- }
34
-
35
- # Print the resulting inputs dictionary
36
- print("Configured inputs:")
37
- for key, value in inputs.items():
38
- print(f" {key}: {value['path']}")
39
-
40
-
41
-
42
- default_config = {
43
- "min_genes_in_complex": 2,
44
- "min_genes_per_complex_analysis": 2,
45
- "output_folder": "25q2_min_genes_2",
46
- "gold_standard": "CORUM",
47
- "color_map": "RdYlBu",
48
- "jaccard": True,
49
- "plotting": {
50
- "save_plot": True,
51
- "output_type": "pdf",
52
- },
53
- "preprocessing": {
54
- "fill_na": True,
55
- "normalize": False,
56
- },
57
- "corr_function": "numpy",
58
- "logging": {
59
- "visible_levels": ["DONE","STARTED"] # "PROGRESS", "STARTED", ,"INFO","WARNING"
60
- }
61
- }
62
-
63
- # Initialize logger, config, and output folder
64
- flex.initialize(default_config)
65
-
66
- # Load datasets and gold standard terms
67
- data, _ = flex.load_datasets(inputs)
68
- terms, genes_in_terms = flex.load_gold_standard()
69
-
70
-
71
- #%%
72
- # Run analysis
73
- for name, dataset in data.items():
74
- pra = flex.pra(name, dataset, is_corr=False)
75
- fpc = flex.pra_percomplex(name, dataset, is_corr=False)
76
- cc = flex.complex_contributions(name)
77
-
78
-
79
-
80
- #%%
81
- # Generate plots
82
- flex.plot_auc_scores()
83
- flex.plot_precision_recall_curve()
84
- flex.plot_percomplex_scatter()
85
- flex.plot_percomplex_scatter_bysize()
86
- flex.plot_significant_complexes()
87
- flex.plot_complex_contributions()
88
-
89
-
90
- #%%
91
- # Save results to CSV
92
- flex.save_results_to_csv()
93
-
94
-
95
-
96
-
97
-
98
-
99
-
100
-
101
-
102
- #%%
103
-
104
-
File without changes
File without changes
File without changes
File without changes
File without changes